From bf8ef2a4a5ffa9c30fce9fc254ffcf982634e4c6 Mon Sep 17 00:00:00 2001 From: Laurenz Date: Mon, 23 Jun 2025 15:54:52 +0200 Subject: [PATCH] Properly handle raw text elements --- crates/typst-html/src/encode.rs | 110 +++++++++++++++++- ...capable-raw-text-contains-closing-tag.html | 8 ++ tests/ref/html/html-script.html | 21 ++++ tests/ref/html/html-style.html | 14 +++ tests/suite/html/syntax.typ | 51 ++++++++ 5 files changed, 202 insertions(+), 2 deletions(-) create mode 100644 tests/ref/html/html-escapable-raw-text-contains-closing-tag.html create mode 100644 tests/ref/html/html-script.html create mode 100644 tests/ref/html/html-style.html diff --git a/crates/typst-html/src/encode.rs b/crates/typst-html/src/encode.rs index 758bf0b91..adcb6e032 100644 --- a/crates/typst-html/src/encode.rs +++ b/crates/typst-html/src/encode.rs @@ -2,7 +2,9 @@ use std::fmt::Write; use typst_library::diag::{bail, At, SourceResult, StrResult}; use typst_library::foundations::Repr; -use typst_library::html::{charsets, tag, HtmlDocument, HtmlElement, HtmlNode, HtmlTag}; +use typst_library::html::{ + attr, charsets, tag, HtmlDocument, HtmlElement, HtmlNode, HtmlTag, +}; use typst_library::layout::Frame; use typst_syntax::Span; @@ -95,7 +97,9 @@ fn write_element(w: &mut Writer, element: &HtmlElement) -> SourceResult<()> { return Ok(()); } - if !element.children.is_empty() { + if tag::is_raw(element.tag) { + write_raw(w, element)?; + } else if !element.children.is_empty() { write_children(w, element)?; } @@ -157,6 +161,108 @@ fn starts_with_newline(element: &HtmlElement) -> bool { false } +/// Encodes the contents of a raw text element. 
+fn write_raw(w: &mut Writer, element: &HtmlElement) -> SourceResult<()> { + let text = collect_raw_text(element)?; + + if let Some(closing) = find_closing_tag(&text, element.tag) { + bail!( + element.span, + "HTML raw text element cannot contain its own closing tag"; + hint: "the sequence `{closing}` appears in the raw text", + ) + } + + let mode = if w.pretty { RawMode::of(element, &text) } else { RawMode::Keep }; + match mode { + RawMode::Keep => { + w.buf.push_str(&text); + } + RawMode::Wrap => { + w.buf.push('\n'); + w.buf.push_str(&text); + write_indent(w); + } + RawMode::Indent => { + w.level += 1; + for line in text.lines() { + write_indent(w); + w.buf.push_str(line); + } + w.level -= 1; + write_indent(w); + } + } + + Ok(()) +} + +/// Collects the textual contents of a raw text element. +fn collect_raw_text(element: &HtmlElement) -> SourceResult { + let mut output = String::new(); + for c in &element.children { + match c { + HtmlNode::Tag(_) => continue, + HtmlNode::Text(text, _) => output.push_str(text), + HtmlNode::Element(_) | HtmlNode::Frame(_) => { + let span = match c { + HtmlNode::Element(child) => child.span, + _ => element.span, + }; + bail!(span, "HTML raw text element cannot have non-text children") + } + }; + } + Ok(output) +} + +/// Finds a closing sequence for the given tag in the text, if it exists. +/// +/// See HTML spec § 13.1.2.6. +fn find_closing_tag(text: &str, tag: HtmlTag) -> Option<&str> { + let s = tag.resolve(); + let len = s.len(); + text.match_indices("= len + && rest[..len].eq_ignore_ascii_case(&s) + && rest[len..].starts_with(['\t', '\n', '\u{c}', '\r', ' ', '>', '/']); + disallowed.then(|| &text[i..i + 2 + len]) + }) +} + +/// How to format the contents of a raw text element. +enum RawMode { + /// Just don't touch it. + Keep, + /// Newline after the opening and newline + indent before the closing tag. + Wrap, + /// Newlines after opening and before closing tag and each line indented.
+ Indent, +} + +impl RawMode { + fn of(element: &HtmlElement, text: &str) -> Self { + match element.tag { + tag::script + if !element.attrs.0.iter().any(|(attr, value)| { + *attr == attr::r#type && value != "text/javascript" + }) => + { + // Template literals can be multi-line, so indent may change + // the semantics of the JavaScript. + if text.contains('`') { + Self::Wrap + } else { + Self::Indent + } + } + tag::style => Self::Indent, + _ => Self::Keep, + } + } +} + /// Whether we are allowed to add an extra newline at the start and end of the /// element's contents. /// diff --git a/tests/ref/html/html-escapable-raw-text-contains-closing-tag.html b/tests/ref/html/html-escapable-raw-text-contains-closing-tag.html new file mode 100644 index 000000000..9e0b96433 --- /dev/null +++ b/tests/ref/html/html-escapable-raw-text-contains-closing-tag.html @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/tests/ref/html/html-script.html b/tests/ref/html/html-script.html new file mode 100644 index 000000000..81b74765a --- /dev/null +++ b/tests/ref/html/html-script.html @@ -0,0 +1,21 @@ + + + + + + + + + + + + diff --git a/tests/ref/html/html-style.html b/tests/ref/html/html-style.html new file mode 100644 index 000000000..c8d558bce --- /dev/null +++ b/tests/ref/html/html-style.html @@ -0,0 +1,14 @@ + + + + + + + + + + diff --git a/tests/suite/html/syntax.typ b/tests/suite/html/syntax.typ index fb5caf3bd..eb1c86994 100644 --- a/tests/suite/html/syntax.typ +++ b/tests/suite/html/syntax.typ @@ -10,3 +10,54 @@ #html.pre("hello") #html.pre("\nhello") #html.pre("\n\nhello") + +--- html-script html --- +// This should be pretty and indented. +#html.script( + ```js + const x = 1 + const y = 2 + console.log(x < y, Math.max(1, 2)) + ```.text, +) + +// This should have extra newlines, but no indent because of the multiline +// string literal. +#html.script("console.log(`Hello\nWorld`)") + +// This should be untouched. 
+#html.script( + type: "text/python", + ```py + x = 1 + y = 2 + print(x < y, max(x, y)) + ```.text, +) + +--- html-style html --- +// This should be pretty and indented. +#html.style( + ```css + body { + text: red; + } + ```.text, +) + +--- html-raw-text-contains-elem html --- +// Error: 14-32 HTML raw text element cannot have non-text children +#html.script(html.strong[Hello]) + +--- html-raw-text-contains-frame html --- +// Error: 2-29 HTML raw text element cannot have non-text children +#html.script(html.frame[Ok]) + +--- html-raw-text-contains-closing-tag html --- +// Error: 2-32 HTML raw text element cannot contain its own closing tag +// Hint: 2-32 the sequence `")