Encoding fixes for HTML raw text elements (#6720)

This commit is contained in:
Laurenz 2025-08-07 19:27:59 +02:00 committed by GitHub
parent df9a9caee0
commit bcc71ddb9b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 55 additions and 14 deletions

View File

@ -1,5 +1,6 @@
use std::fmt::Write;
use ecow::{EcoString, eco_format};
use typst_library::diag::{At, SourceResult, StrResult, bail};
use typst_library::foundations::Repr;
use typst_library::introspection::Introspector;
@ -107,8 +108,15 @@ fn write_element(w: &mut Writer, element: &HtmlElement) -> SourceResult<()> {
return Ok(());
}
// See HTML spec § 13.1.2.5.
if matches!(element.tag, tag::pre | tag::textarea) && starts_with_newline(element) {
w.buf.push('\n');
}
if tag::is_raw(element.tag) {
write_raw(w, element)?;
} else if tag::is_escapable_raw(element.tag) {
write_escapable_raw(w, element)?;
} else if !element.children.is_empty() {
write_children(w, element)?;
}
@ -122,11 +130,6 @@ fn write_element(w: &mut Writer, element: &HtmlElement) -> SourceResult<()> {
/// Encodes the children of an element.
fn write_children(w: &mut Writer, element: &HtmlElement) -> SourceResult<()> {
// See HTML spec § 13.1.2.5.
if matches!(element.tag, tag::pre | tag::textarea) && starts_with_newline(element) {
w.buf.push('\n');
}
let pretty = w.pretty;
let pretty_inside = allows_pretty_inside(element.tag)
&& element.children.iter().any(|node| match node {
@ -208,20 +211,40 @@ fn write_raw(w: &mut Writer, element: &HtmlElement) -> SourceResult<()> {
Ok(())
}
/// Writes the textual contents of an escapable raw text element.
///
/// Every text piece yielded by `walk_raw_text` is handed to `write_text`
/// together with its span. An error from either function is propagated to
/// the caller.
fn write_escapable_raw(w: &mut Writer, element: &HtmlElement) -> SourceResult<()> {
    walk_raw_text(element, |chunk, at| {
        // Forward each piece and its span to `write_text`.
        write_text(w, chunk, at)
    })
}
/// Collects the textual contents of a raw text element into one string.
///
/// Each text piece yielded by `walk_raw_text` is validated before it is
/// appended: the first character for which `charsets::is_w3c_text_char`
/// returns `false` aborts the collection.
///
/// # Errors
///
/// Returns the "cannot be encoded in HTML" error built by `unencodable`,
/// attached to the span of the offending piece, or whatever error
/// `walk_raw_text` itself reports.
fn collect_raw_text(element: &HtmlElement) -> SourceResult<String> {
    let mut text = String::new();
    walk_raw_text(element, |piece, span| {
        // Reject the whole piece as soon as one character is not allowed.
        if let Some(c) = piece.chars().find(|&c| !charsets::is_w3c_text_char(c)) {
            return Err(unencodable(c)).at(span);
        }
        text.push_str(piece);
        Ok(())
    })?;
    Ok(text)
}
/// Iterates over the textual contents of a raw text element.
fn walk_raw_text(
element: &HtmlElement,
mut f: impl FnMut(&str, Span) -> SourceResult<()>,
) -> SourceResult<()> {
for c in &element.children {
match c {
HtmlNode::Tag(_) => continue,
HtmlNode::Text(text, _) => output.push_str(text),
HtmlNode::Text(text, span) => f(text, *span)?,
HtmlNode::Element(HtmlElement { span, .. })
| HtmlNode::Frame(HtmlFrame { span, .. }) => {
bail!(*span, "HTML raw text element cannot have non-text children")
}
};
}
}
Ok(output)
Ok(())
}
/// Finds a closing sequence for the given tag in the text, if it exists.
@ -302,11 +325,17 @@ fn write_escape(w: &mut Writer, c: char) -> StrResult<()> {
c if charsets::is_w3c_text_char(c) && c != '\r' => {
write!(w.buf, "&#x{:x};", c as u32).unwrap()
}
_ => bail!("the character `{}` cannot be encoded in HTML", c.repr()),
_ => return Err(unencodable(c)),
}
Ok(())
}
/// The error message for a character that cannot be encoded.
#[cold]
fn unencodable(c: char) -> EcoString {
eco_format!("the character `{}` cannot be encoded in HTML", c.repr())
}
/// Encode a laid out frame into the writer.
fn write_frame(w: &mut Writer, frame: &HtmlFrame) {
let svg = typst_svg::svg_html_frame(

View File

@ -1,7 +1,3 @@
--- html-non-char html ---
// Error: 1-9 the character `"\u{fdd0}"` cannot be encoded in HTML
\u{fdd0}
--- html-void-element-with-children html ---
// Error: 2-27 HTML void elements must not have children
#html.elem("img", [Hello])
@ -61,6 +57,22 @@
// Hint: 2-32 the sequence `</SCRiPT` appears in the raw text
#html.script("hello </SCRiPT ")
--- html-escapable-raw-text-contains-elem html ---
// Error: 16-34 HTML raw text element cannot have non-text children
#html.textarea(html.strong[Hello])
--- html-escapable-raw-text-contains-closing-tag html ---
// This is okay because we escape it.
#html.textarea("hello </textarea>")
--- html-non-char html ---
// Error: 1-9 the character `"\u{fdd0}"` cannot be encoded in HTML
\u{fdd0}
--- html-raw-text-non-char html ---
// Error: 24-32 the character `"\u{fdd0}"` cannot be encoded in HTML
#html.script[const x = \u{fdd0}]
--- html-escapable-raw-text-non-char html ---
// Error: 23-31 the character `"\u{fdd0}"` cannot be encoded in HTML
#html.textarea[Typing \u{fdd0}]