Compare commits

...

7 Commits

Author SHA1 Message Date
Myriad-Dreamin
fc9fbbb526
Merge eaf63ca80cdd13b6ef801262ab1b47c82dc0fd4a into 78355421ad73fdcbe93b4acca890b439c4b6f98d 2025-07-22 14:35:52 +02:00
Laurenz
78355421ad
Add pdf extension to image autocompletions (#6643) 2025-07-22 12:07:29 +00:00
Myriad-Dreamin
eaf63ca80c Remove comment 2025-06-03 02:06:53 +08:00
Myriad-Dreamin
e4c316f2cc Add tests 2025-05-28 22:19:59 +08:00
Myriad-Dreamin
1130d6b746 Core Impl 2025-05-28 22:19:56 +08:00
Myriad-Dreamin
1b44fea9d8 Add egotree 2025-05-28 22:00:33 +08:00
Myriad-Dreamin
8311997274 Add scraper 2025-05-28 22:00:33 +08:00
7 changed files with 370 additions and 1 deletions

217
Cargo.lock generated
View File

@ -545,6 +545,29 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929"
[[package]]
name = "cssparser"
version = "0.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7c66d1cd8ed61bf80b38432613a7a2f09401ab8d0501110655f8b341484a3e3"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa",
"phf",
"smallvec",
]
[[package]]
name = "cssparser-macros"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
dependencies = [
"quote",
"syn",
]
[[package]] [[package]]
name = "csv" name = "csv"
version = "1.3.1" version = "1.3.1"
@ -592,6 +615,17 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "derive_more"
version = "0.99.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6edb4b64a43d977b8e99788fe3a04d483834fba1215a7e02caa415b626497f7f"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "dirs" name = "dirs"
version = "6.0.0" version = "6.0.0"
@ -630,6 +664,21 @@ version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2" checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2"
[[package]]
name = "dtoa"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6add3b8cff394282be81f3fc1a0605db594ed69890078ca6e2cab1c408bcf04"
[[package]]
name = "dtoa-short"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
dependencies = [
"dtoa",
]
[[package]] [[package]]
name = "ecow" name = "ecow"
version = "0.2.3" version = "0.2.3"
@ -639,6 +688,12 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "ego-tree"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8"
[[package]] [[package]]
name = "either" name = "either"
version = "1.13.0" version = "1.13.0"
@ -861,6 +916,16 @@ version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
[[package]]
name = "futf"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]] [[package]]
name = "fxhash" name = "fxhash"
version = "0.2.1" version = "0.2.1"
@ -1033,6 +1098,18 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "html5ever"
version = "0.29.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b7410cae13cbc75623c98ac4cbfd1f0bedddf3227afc24f370cf0f50a44a11c"
dependencies = [
"log",
"mac",
"markup5ever",
"match_token",
]
[[package]] [[package]]
name = "httpdate" name = "httpdate"
version = "1.0.3" version = "1.0.3"
@ -1599,6 +1676,37 @@ dependencies = [
"pkg-config", "pkg-config",
] ]
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "markup5ever"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7a7213d12e1864c0f002f52c2923d4556935a43dec5e71355c2760e0f6e7a18"
dependencies = [
"log",
"phf",
"phf_codegen",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "match_token"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88a9689d8d44bf9964484516275f5cd4c9b59457a6940c1d5d0ecbb94510a36b"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.7.4" version = "2.7.4"
@ -1671,6 +1779,12 @@ dependencies = [
"tempfile", "tempfile",
] ]
[[package]]
name = "new_debug_unreachable"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
[[package]] [[package]]
name = "nom" name = "nom"
version = "7.1.3" version = "7.1.3"
@ -1928,6 +2042,16 @@ dependencies = [
"phf_shared", "phf_shared",
] ]
[[package]]
name = "phf_codegen"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]] [[package]]
name = "phf_generator" name = "phf_generator"
version = "0.11.3" version = "0.11.3"
@ -2040,6 +2164,12 @@ dependencies = [
"zerocopy", "zerocopy",
] ]
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]] [[package]]
name = "proc-macro2" name = "proc-macro2"
version = "1.0.93" version = "1.0.93"
@ -2341,6 +2471,21 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scraper"
version = "0.23.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "527e65d9d888567588db4c12da1087598d0f6f8b346cc2c5abc91f05fc2dffe2"
dependencies = [
"cssparser",
"ego-tree",
"getopts",
"html5ever",
"precomputed-hash",
"selectors",
"tendril",
]
[[package]] [[package]]
name = "security-framework" name = "security-framework"
version = "2.11.1" version = "2.11.1"
@ -2364,6 +2509,25 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "selectors"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd568a4c9bb598e291a08244a5c1f5a8a6650bee243b5b0f8dbb3d9cc1d87fe8"
dependencies = [
"bitflags 2.8.0",
"cssparser",
"derive_more",
"fxhash",
"log",
"new_debug_unreachable",
"phf",
"phf_codegen",
"precomputed-hash",
"servo_arc",
"smallvec",
]
[[package]] [[package]]
name = "self-replace" name = "self-replace"
version = "1.5.0" version = "1.5.0"
@ -2447,6 +2611,15 @@ dependencies = [
"unsafe-libyaml", "unsafe-libyaml",
] ]
[[package]]
name = "servo_arc"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae65c4249478a2647db249fb43e23cec56a2c8974a427e7bd8cb5a1d0964921a"
dependencies = [
"stable_deref_trait",
]
[[package]] [[package]]
name = "shell-escape" name = "shell-escape"
version = "0.1.5" version = "0.1.5"
@ -2558,6 +2731,31 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "string_cache"
version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f"
dependencies = [
"new_debug_unreachable",
"parking_lot",
"phf_shared",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0"
dependencies = [
"phf_generator",
"phf_shared",
"proc-macro2",
"quote",
]
[[package]] [[package]]
name = "strsim" name = "strsim"
version = "0.11.1" version = "0.11.1"
@ -2680,6 +2878,17 @@ dependencies = [
"windows-sys 0.59.0", "windows-sys 0.59.0",
] ]
[[package]]
name = "tendril"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
dependencies = [
"futf",
"mac",
"utf-8",
]
[[package]] [[package]]
name = "termcolor" name = "termcolor"
version = "1.4.1" version = "1.4.1"
@ -3128,6 +3337,7 @@ dependencies = [
"comemo", "comemo",
"csv", "csv",
"ecow", "ecow",
"ego-tree",
"flate2", "flate2",
"fontdb", "fontdb",
"glidesort", "glidesort",
@ -3152,6 +3362,7 @@ dependencies = [
"roxmltree", "roxmltree",
"rust_decimal", "rust_decimal",
"rustybuzz", "rustybuzz",
"scraper",
"serde", "serde",
"serde_json", "serde_json",
"serde_yaml 0.9.34+deprecated", "serde_yaml 0.9.34+deprecated",
@ -3488,6 +3699,12 @@ dependencies = [
"xmlwriter", "xmlwriter",
] ]
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]] [[package]]
name = "utf16_iter" name = "utf16_iter"
version = "1.0.5" version = "1.0.5"

View File

@ -54,6 +54,7 @@ csv = "1"
ctrlc = "3.4.1" ctrlc = "3.4.1"
dirs = "6" dirs = "6"
ecow = { version = "0.2", features = ["serde"] } ecow = { version = "0.2", features = ["serde"] }
ego-tree = "0.10"
env_proxy = "0.4" env_proxy = "0.4"
fastrand = "2.3" fastrand = "2.3"
flate2 = "1" flate2 = "1"
@ -105,6 +106,7 @@ roxmltree = "0.20"
rust_decimal = { version = "1.36.0", default-features = false, features = ["maths"] } rust_decimal = { version = "1.36.0", default-features = false, features = ["maths"] }
rustybuzz = "0.20" rustybuzz = "0.20"
same-file = "1" same-file = "1"
scraper = "0.23.1"
self-replace = "1.3.7" self-replace = "1.3.7"
semver = "1" semver = "1"
serde = { version = "1.0.184", features = ["derive"] } serde = { version = "1.0.184", features = ["derive"] }

View File

@ -834,7 +834,7 @@ fn param_value_completions<'a>(
fn path_completion(func: &Func, param: &ParamInfo) -> Option<&'static [&'static str]> { fn path_completion(func: &Func, param: &ParamInfo) -> Option<&'static [&'static str]> {
Some(match (func.name(), param.name) { Some(match (func.name(), param.name) {
(Some("image"), "source") => { (Some("image"), "source") => {
&["png", "jpg", "jpeg", "gif", "svg", "svgz", "webp"] &["png", "jpg", "jpeg", "gif", "svg", "svgz", "webp", "pdf"]
} }
(Some("csv"), "source") => &["csv"], (Some("csv"), "source") => &["csv"],
(Some("plugin"), "source") => &["wasm"], (Some("plugin"), "source") => &["wasm"],

View File

@ -27,6 +27,7 @@ codex = { workspace = true }
comemo = { workspace = true } comemo = { workspace = true }
csv = { workspace = true } csv = { workspace = true }
ecow = { workspace = true } ecow = { workspace = true }
ego-tree = { workspace = true }
flate2 = { workspace = true } flate2 = { workspace = true }
fontdb = { workspace = true } fontdb = { workspace = true }
glidesort = { workspace = true } glidesort = { workspace = true }
@ -51,6 +52,7 @@ regex-syntax = { workspace = true }
roxmltree = { workspace = true } roxmltree = { workspace = true }
rust_decimal = { workspace = true } rust_decimal = { workspace = true }
rustybuzz = { workspace = true } rustybuzz = { workspace = true }
scraper = { workspace = true }
serde = { workspace = true } serde = { workspace = true }
serde_json = { workspace = true } serde_json = { workspace = true }
serde_yaml = { workspace = true } serde_yaml = { workspace = true }

View File

@ -0,0 +1,98 @@
use ecow::eco_format;
use ego_tree::NodeRef;
use scraper::Node;
use typst_syntax::Spanned;
use crate::diag::{At, FileError, SourceDiagnostic, SourceResult};
use crate::engine::Engine;
use crate::foundations::{dict, func, Array, Dict, IntoValue, Value};
use crate::loading::{DataSource, Load};
/// Reads structured data from an HTML file.
///
/// The HTML file is parsed into an array of dictionaries and strings. It is compatible with
/// the XML format, parsed by the [`xml`]($xml) function.
#[func(title = "HTML")]
pub fn html_decode(
engine: &mut Engine,
/// A [path]($syntax/#paths) to an HTML file or raw HTML bytes.
source: Spanned<DataSource>,
) -> SourceResult<Value> {
let data = source.load(engine.world)?;
let text = data.as_str().map_err(FileError::from).at(source.span)?;
let document = scraper::Html::parse_document(text);
if !document.errors.is_empty() {
let errors = document.errors.iter();
return Err(errors
.map(|msg| {
SourceDiagnostic::error(
source.span,
eco_format!("failed to parse HTML ({msg})"),
)
})
.collect());
}
Ok(convert_html(document.tree.root()))
}
/// Convert an HTML node to a Typst value.
fn convert_html(node_ref: NodeRef<Node>) -> Value {
// `prefix` and `name` are part of the tag name. For example,
// in the following HTML, `html5` is the prefix and `div` is the name:
// ```
// <html5:div class="example" />
// ```
let (prefix, name, attrs) = match node_ref.value() {
Node::Text(text) => return (*text).into_value(),
Node::Document => return Value::Array(convert_html_children(node_ref)),
// todo: the namespace is ignored
Node::Element(element) => {
(element.name.prefix.as_ref(), &*element.name.local, Some(element.attrs()))
}
Node::Fragment => (None, "fragment", None),
// todo: doc type and processing instruction are ignored
// https://en.wikipedia.org/wiki/Processing_Instruction
Node::Doctype(..) | Node::ProcessingInstruction(..) => return Value::None,
Node::Comment(comment) => {
return Value::Dict(dict! {
"tag" => "",
"attrs" => dict! {},
"children" => [(*comment).into_value()].into_iter().collect::<Array>(),
});
}
};
let children = convert_html_children(node_ref);
let attrs: Dict = attrs
.into_iter()
.flatten()
.map(|(name, value)| (name.into(), value.into_value()))
.collect();
let mut converted = dict! {
"tag" => name.into_value(),
"attrs" => attrs,
"children" => children,
};
// In most cases, the prefix is not set, so we only add it if it exists.
if let Some(prefix) = prefix {
converted.insert("prefix".into(), (*prefix).into_value());
}
Value::Dict(converted)
}
/// Convert children an HTML node to a Typst value.
fn convert_html_children(node_ref: NodeRef<Node>) -> Array {
node_ref
.children()
.filter(|v| {
!matches!(v.value(), Node::Doctype(..) | Node::ProcessingInstruction(..))
})
.map(convert_html)
.collect()
}

View File

@ -4,6 +4,8 @@
mod cbor_; mod cbor_;
#[path = "csv.rs"] #[path = "csv.rs"]
mod csv_; mod csv_;
#[path = "html.rs"]
mod html_;
#[path = "json.rs"] #[path = "json.rs"]
mod json_; mod json_;
#[path = "read.rs"] #[path = "read.rs"]
@ -21,6 +23,7 @@ use typst_syntax::{FileId, Spanned};
pub use self::cbor_::*; pub use self::cbor_::*;
pub use self::csv_::*; pub use self::csv_::*;
pub use self::html_::*;
pub use self::json_::*; pub use self::json_::*;
pub use self::read_::*; pub use self::read_::*;
pub use self::toml_::*; pub use self::toml_::*;
@ -37,6 +40,7 @@ pub(super) fn define(global: &mut Scope) {
global.start_category(crate::Category::DataLoading); global.start_category(crate::Category::DataLoading);
global.define_func::<read>(); global.define_func::<read>();
global.define_func::<csv>(); global.define_func::<csv>();
global.define_func::<html_decode>();
global.define_func::<json>(); global.define_func::<json>();
global.define_func::<toml>(); global.define_func::<toml>();
global.define_func::<yaml>(); global.define_func::<yaml>();

View File

@ -0,0 +1,46 @@
--- html ---
// Test reading HTML data.
#let data = html-decode("/assets/text/example.html")
#test(data, ((
tag: "html",
attrs: (:),
children: (
(
tag: "head",
attrs: (:),
children: (
"\n ",
(
tag: "meta",
attrs: (charset: "UTF-8"),
children: (),
),
"\n ",
(
tag: "title",
attrs: (:),
children: ("Example document",),
),
"\n ",
),
),
"\n ",
(
tag: "body",
attrs: (:),
children: (
"\n ",
(
tag: "h1",
attrs: (:),
children: ("Hello, world!",),
),
"\n \n\n",
),
),
),
),))
--- html-invalid ---
// Error: 14-38 failed to parse HTML (Unexpected token)
#html-decode("/assets/text/hello.txt")