mirror of
https://github.com/typst/typst
synced 2025-07-27 14:27:56 +08:00
Core Impl
This commit is contained in:
parent
1b44fea9d8
commit
1130d6b746
99
crates/typst-library/src/loading/html.rs
Normal file
99
crates/typst-library/src/loading/html.rs
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
use ecow::eco_format;
|
||||||
|
use ego_tree::NodeRef;
|
||||||
|
use scraper::Node;
|
||||||
|
use typst_syntax::Spanned;
|
||||||
|
|
||||||
|
use crate::diag::{At, FileError, SourceDiagnostic, SourceResult};
|
||||||
|
use crate::engine::Engine;
|
||||||
|
use crate::foundations::{dict, func, Array, Dict, IntoValue, Value};
|
||||||
|
use crate::loading::{DataSource, Load};
|
||||||
|
|
||||||
|
/// Reads structured data from an HTML file.
|
||||||
|
///
|
||||||
|
/// The HTML file is parsed into an array of dictionaries and strings. It is compatible with
|
||||||
|
/// the XML format, parsed by the [`xml`]($xml) function.
|
||||||
|
#[func(title = "HTML")]
|
||||||
|
pub fn html_decode(
|
||||||
|
engine: &mut Engine,
|
||||||
|
/// A [path]($syntax/#paths) to an HTML file or raw HTML bytes.
|
||||||
|
source: Spanned<DataSource>,
|
||||||
|
) -> SourceResult<Value> {
|
||||||
|
let data = source.load(engine.world)?;
|
||||||
|
let text = data.as_str().map_err(FileError::from).at(source.span)?;
|
||||||
|
let document = scraper::Html::parse_document(text);
|
||||||
|
|
||||||
|
if !document.errors.is_empty() {
|
||||||
|
let errors = document.errors.iter();
|
||||||
|
return Err(errors
|
||||||
|
.map(|msg| {
|
||||||
|
SourceDiagnostic::error(
|
||||||
|
source.span,
|
||||||
|
eco_format!("failed to parse HTML ({msg})"),
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.collect());
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(convert_html(document.tree.root()))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert an HTML node to a Typst value.
|
||||||
|
fn convert_html(node_ref: NodeRef<Node>) -> Value {
|
||||||
|
// `prefix` and `name` are part of the tag name. For example,
|
||||||
|
// in the following HTML, `html5` is the prefix and `div` is the name:
|
||||||
|
// ```
|
||||||
|
// <html5:div class="example" />
|
||||||
|
// ```
|
||||||
|
let (prefix, name, attrs) = match node_ref.value() {
|
||||||
|
Node::Text(text) => return (*text).into_value(),
|
||||||
|
Node::Document => return Value::Array(convert_html_children(node_ref)),
|
||||||
|
// todo: the namespace is ignored
|
||||||
|
Node::Element(element) => {
|
||||||
|
(element.name.prefix.as_ref(), &*element.name.local, Some(element.attrs()))
|
||||||
|
}
|
||||||
|
Node::Fragment => (None, "fragment", None),
|
||||||
|
// todo: doc type and processing instruction are ignored
|
||||||
|
// https://en.wikipedia.org/wiki/Processing_Instruction
|
||||||
|
Node::Doctype(..) | Node::ProcessingInstruction(..) => return Value::None,
|
||||||
|
Node::Comment(comment) => {
|
||||||
|
// Werid but compatible with current `xml`.
|
||||||
|
return Value::Dict(dict! {
|
||||||
|
"tag" => "",
|
||||||
|
"attrs" => dict! {},
|
||||||
|
"children" => [(*comment).into_value()].into_iter().collect::<Array>(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let children = convert_html_children(node_ref);
|
||||||
|
|
||||||
|
let attrs: Dict = attrs
|
||||||
|
.into_iter()
|
||||||
|
.flatten()
|
||||||
|
.map(|(name, value)| (name.into(), value.into_value()))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let mut converted = dict! {
|
||||||
|
"tag" => name.into_value(),
|
||||||
|
"attrs" => attrs,
|
||||||
|
"children" => children,
|
||||||
|
};
|
||||||
|
|
||||||
|
// In most cases, the prefix is not set, so we only add it if it exists.
|
||||||
|
if let Some(prefix) = prefix {
|
||||||
|
converted.insert("prefix".into(), (*prefix).into_value());
|
||||||
|
}
|
||||||
|
|
||||||
|
Value::Dict(converted)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert children an HTML node to a Typst value.
|
||||||
|
fn convert_html_children(node_ref: NodeRef<Node>) -> Array {
|
||||||
|
node_ref
|
||||||
|
.children()
|
||||||
|
.filter(|v| {
|
||||||
|
!matches!(v.value(), Node::Doctype(..) | Node::ProcessingInstruction(..))
|
||||||
|
})
|
||||||
|
.map(convert_html)
|
||||||
|
.collect()
|
||||||
|
}
|
@ -4,6 +4,8 @@
|
|||||||
mod cbor_;
|
mod cbor_;
|
||||||
#[path = "csv.rs"]
|
#[path = "csv.rs"]
|
||||||
mod csv_;
|
mod csv_;
|
||||||
|
#[path = "html.rs"]
|
||||||
|
mod html_;
|
||||||
#[path = "json.rs"]
|
#[path = "json.rs"]
|
||||||
mod json_;
|
mod json_;
|
||||||
#[path = "read.rs"]
|
#[path = "read.rs"]
|
||||||
@ -21,6 +23,7 @@ use typst_syntax::Spanned;
|
|||||||
|
|
||||||
pub use self::cbor_::*;
|
pub use self::cbor_::*;
|
||||||
pub use self::csv_::*;
|
pub use self::csv_::*;
|
||||||
|
pub use self::html_::*;
|
||||||
pub use self::json_::*;
|
pub use self::json_::*;
|
||||||
pub use self::read_::*;
|
pub use self::read_::*;
|
||||||
pub use self::toml_::*;
|
pub use self::toml_::*;
|
||||||
@ -37,6 +40,7 @@ pub(super) fn define(global: &mut Scope) {
|
|||||||
global.start_category(crate::Category::DataLoading);
|
global.start_category(crate::Category::DataLoading);
|
||||||
global.define_func::<read>();
|
global.define_func::<read>();
|
||||||
global.define_func::<csv>();
|
global.define_func::<csv>();
|
||||||
|
global.define_func::<html_decode>();
|
||||||
global.define_func::<json>();
|
global.define_func::<json>();
|
||||||
global.define_func::<toml>();
|
global.define_func::<toml>();
|
||||||
global.define_func::<yaml>();
|
global.define_func::<yaml>();
|
||||||
|
Loading…
x
Reference in New Issue
Block a user