From 1130d6b74651e67134cd97c3888dd8ce33a4eb1a Mon Sep 17 00:00:00 2001
From: Myriad-Dreamin
Date: Wed, 28 May 2025 22:00:11 +0800
Subject: [PATCH] Core Impl

---
 crates/typst-library/src/loading/html.rs | 99 ++++++++++++++++++++++++
 crates/typst-library/src/loading/mod.rs  |  4 +
 2 files changed, 103 insertions(+)
 create mode 100644 crates/typst-library/src/loading/html.rs

diff --git a/crates/typst-library/src/loading/html.rs b/crates/typst-library/src/loading/html.rs
new file mode 100644
index 000000000..002536375
--- /dev/null
+++ b/crates/typst-library/src/loading/html.rs
@@ -0,0 +1,99 @@
+use ecow::eco_format;
+use ego_tree::NodeRef;
+use scraper::Node;
+use typst_syntax::Spanned;
+
+use crate::diag::{At, FileError, SourceDiagnostic, SourceResult};
+use crate::engine::Engine;
+use crate::foundations::{dict, func, Array, Dict, IntoValue, Value};
+use crate::loading::{DataSource, Load};
+
+/// Reads structured data from an HTML file.
+///
+/// The HTML file is parsed into an array of dictionaries and strings. It is compatible with
+/// the XML format, parsed by the [`xml`]($xml) function.
+#[func(title = "HTML")]
+pub fn html_decode(
+    engine: &mut Engine,
+    /// A [path]($syntax/#paths) to an HTML file or raw HTML bytes.
+    source: Spanned<DataSource>,
+) -> SourceResult<Value> {
+    let data = source.load(engine.world)?;
+    let text = data.as_str().map_err(FileError::from).at(source.span)?;
+    let document = scraper::Html::parse_document(text);
+
+    if !document.errors.is_empty() {
+        let errors = document.errors.iter();
+        return Err(errors
+            .map(|msg| {
+                SourceDiagnostic::error(
+                    source.span,
+                    eco_format!("failed to parse HTML ({msg})"),
+                )
+            })
+            .collect());
+    }
+
+    Ok(convert_html(document.tree.root()))
+}
+
+/// Convert an HTML node to a Typst value.
+fn convert_html(node_ref: NodeRef<Node>) -> Value {
+    // `prefix` and `name` are part of the tag name. For example,
+    // in the following HTML, `html5` is the prefix and `div` is the name:
+    // ```
+    // <html5:div></html5:div>
+    // ```
+    let (prefix, name, attrs) = match node_ref.value() {
+        Node::Text(text) => return (*text).into_value(),
+        Node::Document => return Value::Array(convert_html_children(node_ref)),
+        // todo: the namespace is ignored
+        Node::Element(element) => {
+            (element.name.prefix.as_ref(), &*element.name.local, Some(element.attrs()))
+        }
+        Node::Fragment => (None, "fragment", None),
+        // todo: doc type and processing instruction are ignored
+        // https://en.wikipedia.org/wiki/Processing_Instruction
+        Node::Doctype(..) | Node::ProcessingInstruction(..) => return Value::None,
+        Node::Comment(comment) => {
+            // Weird but compatible with current `xml`.
+            return Value::Dict(dict! {
+                "tag" => "",
+                "attrs" => dict! {},
+                "children" => [(*comment).into_value()].into_iter().collect::<Array>(),
+            });
+        }
+    };
+
+    let children = convert_html_children(node_ref);
+
+    let attrs: Dict = attrs
+        .into_iter()
+        .flatten()
+        .map(|(name, value)| (name.into(), value.into_value()))
+        .collect();
+
+    let mut converted = dict! {
+        "tag" => name.into_value(),
+        "attrs" => attrs,
+        "children" => children,
+    };
+
+    // In most cases, the prefix is not set, so we only add it if it exists.
+    if let Some(prefix) = prefix {
+        converted.insert("prefix".into(), (*prefix).into_value());
+    }
+
+    Value::Dict(converted)
+}
+
+/// Convert the children of an HTML node to an array of Typst values.
+fn convert_html_children(node_ref: NodeRef<Node>) -> Array {
+    node_ref
+        .children()
+        .filter(|v| {
+            !matches!(v.value(), Node::Doctype(..) | Node::ProcessingInstruction(..))
+        })
+        .map(convert_html)
+        .collect()
+}
diff --git a/crates/typst-library/src/loading/mod.rs b/crates/typst-library/src/loading/mod.rs
index c57e02888..b53decb9c 100644
--- a/crates/typst-library/src/loading/mod.rs
+++ b/crates/typst-library/src/loading/mod.rs
@@ -4,6 +4,8 @@
 mod cbor_;
 #[path = "csv.rs"]
 mod csv_;
+#[path = "html.rs"]
+mod html_;
 #[path = "json.rs"]
 mod json_;
 #[path = "read.rs"]
@@ -21,6 +23,7 @@ use typst_syntax::Spanned;
 
 pub use self::cbor_::*;
 pub use self::csv_::*;
+pub use self::html_::*;
 pub use self::json_::*;
 pub use self::read_::*;
 pub use self::toml_::*;
@@ -37,6 +40,7 @@ pub(super) fn define(global: &mut Scope) {
     global.start_category(crate::Category::DataLoading);
     global.define_func::<cbor>();
     global.define_func::<csv>();
+    global.define_func::<html_decode>();
     global.define_func::<json>();
     global.define_func::<read>();
     global.define_func::<toml>();