Properly handle raw text elements

This commit is contained in:
Laurenz 2025-06-23 15:54:52 +02:00
parent c2e2fd99f6
commit bf8ef2a4a5
5 changed files with 202 additions and 2 deletions

View File

@ -2,7 +2,9 @@ use std::fmt::Write;
use typst_library::diag::{bail, At, SourceResult, StrResult}; use typst_library::diag::{bail, At, SourceResult, StrResult};
use typst_library::foundations::Repr; use typst_library::foundations::Repr;
use typst_library::html::{charsets, tag, HtmlDocument, HtmlElement, HtmlNode, HtmlTag}; use typst_library::html::{
attr, charsets, tag, HtmlDocument, HtmlElement, HtmlNode, HtmlTag,
};
use typst_library::layout::Frame; use typst_library::layout::Frame;
use typst_syntax::Span; use typst_syntax::Span;
@ -95,7 +97,9 @@ fn write_element(w: &mut Writer, element: &HtmlElement) -> SourceResult<()> {
return Ok(()); return Ok(());
} }
if !element.children.is_empty() { if tag::is_raw(element.tag) {
write_raw(w, element)?;
} else if !element.children.is_empty() {
write_children(w, element)?; write_children(w, element)?;
} }
@ -157,6 +161,108 @@ fn starts_with_newline(element: &HtmlElement) -> bool {
false false
} }
/// Encodes the contents of a raw text element.
fn write_raw(w: &mut Writer, element: &HtmlElement) -> SourceResult<()> {
let text = collect_raw_text(element)?;
if let Some(closing) = find_closing_tag(&text, element.tag) {
bail!(
element.span,
"HTML raw text element cannot contain its own closing tag";
hint: "the sequence `{closing}` appears in the raw text",
)
}
let mode = if w.pretty { RawMode::of(element, &text) } else { RawMode::Keep };
match mode {
RawMode::Keep => {
w.buf.push_str(&text);
}
RawMode::Wrap => {
w.buf.push('\n');
w.buf.push_str(&text);
write_indent(w);
}
RawMode::Indent => {
w.level += 1;
for line in text.lines() {
write_indent(w);
w.buf.push_str(line);
}
w.level -= 1;
write_indent(w);
}
}
Ok(())
}
/// Collects the textual contents of a raw text element.
fn collect_raw_text(element: &HtmlElement) -> SourceResult<String> {
let mut output = String::new();
for c in &element.children {
match c {
HtmlNode::Tag(_) => continue,
HtmlNode::Text(text, _) => output.push_str(text),
HtmlNode::Element(_) | HtmlNode::Frame(_) => {
let span = match c {
HtmlNode::Element(child) => child.span,
_ => element.span,
};
bail!(span, "HTML raw text element cannot have non-text children")
}
};
}
Ok(output)
}
/// Finds a closing sequence for the given tag in the text, if it exists.
///
/// See HTML spec § 13.1.2.6.
fn find_closing_tag(text: &str, tag: HtmlTag) -> Option<&str> {
let s = tag.resolve();
let len = s.len();
text.match_indices("</").find_map(|(i, _)| {
let rest = &text[i + 2..];
let disallowed = rest.len() >= len
&& rest[..len].eq_ignore_ascii_case(&s)
&& rest[len..].starts_with(['\t', '\n', '\u{c}', '\r', ' ', '>', '/']);
disallowed.then(|| &text[i..i + 2 + len])
})
}
/// How to format the contents of a raw text element.
enum RawMode {
/// Just don't touch it.
Keep,
/// Newline after the opening and newline + indent before the closing tag.
Wrap,
/// Newlines after opening and before closing tag and each line indented.
Indent,
}
impl RawMode {
fn of(element: &HtmlElement, text: &str) -> Self {
match element.tag {
tag::script
if !element.attrs.0.iter().any(|(attr, value)| {
*attr == attr::r#type && value != "text/javascript"
}) =>
{
// Template literals can be multi-line, so indent may change
// the semantics of the JavaScript.
if text.contains('`') {
Self::Wrap
} else {
Self::Indent
}
}
tag::style => Self::Indent,
_ => Self::Keep,
}
}
}
/// Whether we are allowed to add an extra newline at the start and end of the /// Whether we are allowed to add an extra newline at the start and end of the
/// element's contents. /// element's contents.
/// ///

View File

@ -0,0 +1,8 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>
<body><textarea>hello &lt;/textarea></textarea></body>
</html>

View File

@ -0,0 +1,21 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>
<body>
<script>
const x = 1
const y = 2
console.log(x < y, Math.max(1, 2))
</script>
<script>
console.log(`Hello
World`)
</script>
<script type="text/python">x = 1
y = 2
print(x < y, max(x, y))</script>
</body>
</html>

View File

@ -0,0 +1,14 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>
<body>
<style>
body {
text: red;
}
</style>
</body>
</html>

View File

@ -10,3 +10,54 @@
#html.pre("hello") #html.pre("hello")
#html.pre("\nhello") #html.pre("\nhello")
#html.pre("\n\nhello") #html.pre("\n\nhello")
--- html-script html ---
// This should be pretty and indented.
#html.script(
```js
const x = 1
const y = 2
console.log(x < y, Math.max(1, 2))
```.text,
)
// This should have extra newlines, but no indent because of the multiline
// string literal.
#html.script("console.log(`Hello\nWorld`)")
// This should be untouched.
#html.script(
type: "text/python",
```py
x = 1
y = 2
print(x < y, max(x, y))
```.text,
)
--- html-style html ---
// This should be pretty and indented.
#html.style(
```css
body {
text: red;
}
```.text,
)
--- html-raw-text-contains-elem html ---
// Error: 14-32 HTML raw text element cannot have non-text children
#html.script(html.strong[Hello])
--- html-raw-text-contains-frame html ---
// Error: 2-29 HTML raw text element cannot have non-text children
#html.script(html.frame[Ok])
--- html-raw-text-contains-closing-tag html ---
// Error: 2-32 HTML raw text element cannot contain its own closing tag
// Hint: 2-32 the sequence `</SCRiPT` appears in the raw text
#html.script("hello </SCRiPT ")
--- html-escapable-raw-text-contains-closing-tag html ---
// This is okay because we escape it.
#html.textarea("hello </textarea>")