diff --git a/crates/typst-library/src/loading/html.rs b/crates/typst-library/src/loading/html.rs
new file mode 100644
index 000000000..002536375
--- /dev/null
+++ b/crates/typst-library/src/loading/html.rs
@@ -0,0 +1,99 @@
+use ecow::eco_format;
+use ego_tree::NodeRef;
+use scraper::Node;
+use typst_syntax::Spanned;
+
+use crate::diag::{At, FileError, SourceDiagnostic, SourceResult};
+use crate::engine::Engine;
+use crate::foundations::{dict, func, Array, Dict, IntoValue, Value};
+use crate::loading::{DataSource, Load};
+
+/// Reads structured data from an HTML file.
+///
+/// The HTML file is parsed into an array of dictionaries and strings. It is compatible with
+/// the XML format, parsed by the [`xml`]($xml) function.
+#[func(title = "HTML")]
+pub fn html_decode(
+ engine: &mut Engine,
+ /// A [path]($syntax/#paths) to an HTML file or raw HTML bytes.
+ source: Spanned,
+) -> SourceResult {
+ let data = source.load(engine.world)?;
+ let text = data.as_str().map_err(FileError::from).at(source.span)?;
+ let document = scraper::Html::parse_document(text);
+
+ if !document.errors.is_empty() {
+ let errors = document.errors.iter();
+ return Err(errors
+ .map(|msg| {
+ SourceDiagnostic::error(
+ source.span,
+ eco_format!("failed to parse HTML ({msg})"),
+ )
+ })
+ .collect());
+ }
+
+ Ok(convert_html(document.tree.root()))
+}
+
+/// Convert an HTML node to a Typst value.
+fn convert_html(node_ref: NodeRef) -> Value {
+ // `prefix` and `name` are part of the tag name. For example,
+ // in the following HTML, `html5` is the prefix and `div` is the name:
+ // ```
+ //
+ // ```
+ let (prefix, name, attrs) = match node_ref.value() {
+ Node::Text(text) => return (*text).into_value(),
+ Node::Document => return Value::Array(convert_html_children(node_ref)),
+ // todo: the namespace is ignored
+ Node::Element(element) => {
+ (element.name.prefix.as_ref(), &*element.name.local, Some(element.attrs()))
+ }
+ Node::Fragment => (None, "fragment", None),
+ // todo: doc type and processing instruction are ignored
+ // https://en.wikipedia.org/wiki/Processing_Instruction
+ Node::Doctype(..) | Node::ProcessingInstruction(..) => return Value::None,
+ Node::Comment(comment) => {
+ // Werid but compatible with current `xml`.
+ return Value::Dict(dict! {
+ "tag" => "",
+ "attrs" => dict! {},
+ "children" => [(*comment).into_value()].into_iter().collect::(),
+ });
+ }
+ };
+
+ let children = convert_html_children(node_ref);
+
+ let attrs: Dict = attrs
+ .into_iter()
+ .flatten()
+ .map(|(name, value)| (name.into(), value.into_value()))
+ .collect();
+
+ let mut converted = dict! {
+ "tag" => name.into_value(),
+ "attrs" => attrs,
+ "children" => children,
+ };
+
+ // In most cases, the prefix is not set, so we only add it if it exists.
+ if let Some(prefix) = prefix {
+ converted.insert("prefix".into(), (*prefix).into_value());
+ }
+
+ Value::Dict(converted)
+}
+
+/// Convert children an HTML node to a Typst value.
+fn convert_html_children(node_ref: NodeRef) -> Array {
+ node_ref
+ .children()
+ .filter(|v| {
+ !matches!(v.value(), Node::Doctype(..) | Node::ProcessingInstruction(..))
+ })
+ .map(convert_html)
+ .collect()
+}
diff --git a/crates/typst-library/src/loading/mod.rs b/crates/typst-library/src/loading/mod.rs
index c57e02888..b53decb9c 100644
--- a/crates/typst-library/src/loading/mod.rs
+++ b/crates/typst-library/src/loading/mod.rs
@@ -4,6 +4,8 @@
mod cbor_;
#[path = "csv.rs"]
mod csv_;
+#[path = "html.rs"]
+mod html_;
#[path = "json.rs"]
mod json_;
#[path = "read.rs"]
@@ -21,6 +23,7 @@ use typst_syntax::Spanned;
pub use self::cbor_::*;
pub use self::csv_::*;
+pub use self::html_::*;
pub use self::json_::*;
pub use self::read_::*;
pub use self::toml_::*;
@@ -37,6 +40,7 @@ pub(super) fn define(global: &mut Scope) {
global.start_category(crate::Category::DataLoading);
global.define_func::();
global.define_func::();
+ global.define_func::();
global.define_func::();
global.define_func::();
global.define_func::();