Tweak HTML pretty printing (#5745)

This commit is contained in:
Laurenz 2025-01-24 12:15:09 +01:00 committed by GitHub
parent cd044825fc
commit 467968af07
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 135 additions and 120 deletions

View File

@ -2,7 +2,7 @@ use std::fmt::Write;
use typst_library::diag::{bail, At, SourceResult, StrResult};
use typst_library::foundations::Repr;
use typst_library::html::{charsets, tag, HtmlDocument, HtmlElement, HtmlNode};
use typst_library::html::{charsets, tag, HtmlDocument, HtmlElement, HtmlNode, HtmlTag};
use typst_library::layout::Frame;
use typst_syntax::Span;
@ -20,10 +20,11 @@ pub fn html(document: &HtmlDocument) -> SourceResult<String> {
/// Serialization state: the output produced so far together with the
/// pretty-printing settings currently in effect.
#[derive(Default)]
struct Writer {
/// The output buffer.
buf: String,
/// current indentation level
/// The current indentation level.
level: usize,
/// pretty printing enabled?
/// Whether pretty printing is enabled.
pretty: bool,
}
@ -88,26 +89,32 @@ fn write_element(w: &mut Writer, element: &HtmlElement) -> SourceResult<()> {
let pretty = w.pretty;
if !element.children.is_empty() {
w.pretty &= is_pretty(element);
let pretty_inside = allows_pretty_inside(element.tag)
&& element.children.iter().any(|node| match node {
HtmlNode::Element(child) => wants_pretty_around(child.tag),
_ => false,
});
w.pretty &= pretty_inside;
let mut indent = w.pretty;
w.level += 1;
for c in &element.children {
let pretty_child = match c {
let pretty_around = match c {
HtmlNode::Tag(_) => continue,
HtmlNode::Element(element) => is_pretty(element),
HtmlNode::Element(child) => w.pretty && wants_pretty_around(child.tag),
HtmlNode::Text(..) | HtmlNode::Frame(_) => false,
};
if core::mem::take(&mut indent) || pretty_child {
if core::mem::take(&mut indent) || pretty_around {
write_indent(w);
}
write_node(w, c)?;
indent = pretty_child;
indent = pretty_around;
}
w.level -= 1;
write_indent(w)
write_indent(w);
}
w.pretty = pretty;
@ -118,12 +125,27 @@ fn write_element(w: &mut Writer, element: &HtmlElement) -> SourceResult<()> {
Ok(())
}
/// Whether the element should be pretty-printed.
fn is_pretty(element: &HtmlElement) -> bool {
matches!(
element.tag,
tag::meta | tag::table | tag::thead | tag::tbody | tag::tfoot | tag::tr
) || tag::is_block_by_default(element.tag)
/// Whether a newline may be inserted right after the element's opening tag and
/// right before its closing tag.
///
/// This holds for elements that are blocks by default (with the exception of
/// `<pre>`), for elements that are tabular by default, and for `<li>`.
///
/// Technically, users can change CSS `display` properties such that the extra
/// whitespace ends up affecting the visual output. As an example,
/// <https://www.w3.org/TR/css-text-3/#example-af2745cd> demonstrates how CSS
/// rules can make `<p>` sensitive to whitespace. For this reason, we should
/// also respect the `style` tag in the future.
fn allows_pretty_inside(tag: HtmlTag) -> bool {
    // List items and table-like elements always qualify.
    if tag == tag::li || tag::is_tabular_by_default(tag) {
        return true;
    }

    // Otherwise, only block-level elements other than `<pre>` do.
    tag != tag::pre && tag::is_block_by_default(tag)
}
/// Whether the element should be preceded and followed by a newline, provided
/// that its parent allows it.
///
/// Unlike `allows_pretty_inside`, which is purely spec-driven, the selection
/// made here is more subjective and a matter of preference.
fn wants_pretty_around(tag: HtmlTag) -> bool {
    tag == tag::pre || tag::is_metadata(tag) || allows_pretty_inside(tag)
}
/// Escape a character.

View File

@ -475,17 +475,55 @@ pub mod tag {
wbr
}
/// Whether this is a void tag, i.e. a tag whose associated element may not
/// have children.
pub fn is_void(tag: HtmlTag) -> bool {
matches!(
tag,
self::area
| self::base
| self::br
| self::col
| self::embed
| self::hr
| self::img
| self::input
| self::link
| self::meta
| self::param
| self::source
| self::track
| self::wbr
)
}
/// Whether this is a tag containing raw text.
pub fn is_raw(tag: HtmlTag) -> bool {
matches!(tag, self::script | self::style)
}
/// Whether this is a tag containing escapable raw text.
pub fn is_escapable_raw(tag: HtmlTag) -> bool {
matches!(tag, self::textarea | self::title)
}
/// Whether an element with this tag is considered metadata.
///
/// NOTE(review): the list presumably mirrors the "metadata content" category
/// of the HTML specification; confirm against the spec before extending it.
pub fn is_metadata(tag: HtmlTag) -> bool {
matches!(
tag,
self::base
| self::link
| self::meta
| self::noscript
| self::script
| self::style
| self::template
| self::title
)
}
/// Whether nodes with the tag have the CSS property `display: block` by
/// default.
///
/// If this is true, then pretty-printing can insert spaces around such
/// nodes and around the contents of such nodes.
///
/// However, when users change the properties of such tags via CSS, the
/// insertion of whitespace may actually impact the visual output; for
/// example, <https://www.w3.org/TR/css-text-3/#example-af2745cd> shows how
/// adding CSS rules to `<p>` can make it sensitive to whitespace. In such
/// cases, users should disable pretty-printing.
pub fn is_block_by_default(tag: HtmlTag) -> bool {
matches!(
tag,
@ -572,37 +610,23 @@ pub mod tag {
)
}
/// Whether this is a void tag whose associated element may not have a
/// children.
pub fn is_void(tag: HtmlTag) -> bool {
/// Whether nodes with the tag have the CSS property `display: table(-.*)?`
/// by default.
pub fn is_tabular_by_default(tag: HtmlTag) -> bool {
matches!(
tag,
self::area
| self::base
| self::br
self::table
| self::thead
| self::tbody
| self::tfoot
| self::tr
| self::th
| self::td
| self::caption
| self::col
| self::embed
| self::hr
| self::img
| self::input
| self::link
| self::meta
| self::param
| self::source
| self::track
| self::wbr
| self::colgroup
)
}
/// Whether this is a tag containing raw text.
pub fn is_raw(tag: HtmlTag) -> bool {
matches!(tag, self::script | self::style)
}
/// Whether this is a tag containing escapable raw text.
pub fn is_escapable_raw(tag: HtmlTag) -> bool {
matches!(tag, self::textarea | self::title)
}
}
/// Predefined constants for HTML attributes.

View File

@ -8,26 +8,36 @@
<table>
<thead>
<tr>
<th>The</th><th>first</th><th>and</th>
<th>The</th>
<th>first</th>
<th>and</th>
</tr>
<tr>
<th>the</th><th>second</th><th>row</th>
<th>the</th>
<th>second</th>
<th>row</th>
</tr>
</thead>
<tbody>
<tr>
<td>Foo</td><td rowspan="2">Baz</td><td>Bar</td>
<td>Foo</td>
<td rowspan="2">Baz</td>
<td>Bar</td>
</tr>
<tr>
<td>1</td><td>2</td>
<td>1</td>
<td>2</td>
</tr>
<tr>
<td colspan="2">3</td><td>4</td>
<td colspan="2">3</td>
<td>4</td>
</tr>
</tbody>
<tfoot>
<tr>
<td>The</td><td>last</td><td>row</td>
<td>The</td>
<td>last</td>
<td>row</td>
</tr>
</tfoot>
</table>

View File

@ -5,11 +5,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>
<body>
<p>
Paragraph
</p>
<div>
Div
</div>
<p>Paragraph</p>
<div>Div</div>
</body>
</html>

View File

@ -5,8 +5,6 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>
<body>
<p>
Text <span style="display: inline-block;">Span</span>.
</p>
<p>Text <span style="display: inline-block;">Span</span>.</p>
</body>
</html>

View File

@ -6,7 +6,8 @@
</head>
<body>
<ol start="3">
<li>Skipping</li><li>Ahead</li>
<li>Skipping</li>
<li>Ahead</li>
</ol>
</body>
</html>

View File

@ -5,26 +5,12 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>
<body>
<h2>
Level 1
</h2>
<h3>
Level 2
</h3>
<h4>
Level 3
</h4>
<h5>
Level 4
</h5>
<h6>
Level 5
</h6>
<div role="heading" aria-level="7">
Level 6
</div>
<div role="heading" aria-level="8">
Level 7
</div>
<h2>Level 1</h2>
<h3>Level 2</h3>
<h4>Level 3</h4>
<h5>Level 4</h5>
<h6>Level 5</h6>
<div role="heading" aria-level="7">Level 6</div>
<div role="heading" aria-level="8">Level 7</div>
</body>
</html>

View File

@ -5,17 +5,9 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>
<body>
<p>
<a href="https://example.com/">https://example.com/</a>
</p>
<p>
<a href="https://typst.org/">Some text text text</a>
</p>
<p>
This link appears <a href="https://google.com/">in the middle of</a> a paragraph.
</p>
<p>
Contact <a href="mailto:hi@typst.app">hi@typst.app</a> or call <a href="tel:123">123</a> for more information.
</p>
<p><a href="https://example.com/">https://example.com/</a></p>
<p><a href="https://typst.org/">Some text text text</a></p>
<p>This link appears <a href="https://google.com/">in the middle of</a> a paragraph.</p>
<p>Contact <a href="mailto:hi@typst.app">hi@typst.app</a> or call <a href="tel:123">123</a> for more information.</p>
</body>
</html>

View File

@ -5,11 +5,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>
<body>
<blockquote cite="https://typst.app/home">
Compose papers faster
</blockquote>
<p>
<a href="https://typst.app/home">typst.com</a>
</p>
<blockquote cite="https://typst.app/home"> Compose papers faster </blockquote>
<p><a href="https://typst.app/home">typst.com</a></p>
</body>
</html>

View File

@ -5,8 +5,6 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>
<body>
<p>
When you said that “he surely meant that she intended to say “I'm sorry””, I was quite confused.
</p>
<p>When you said that “he surely meant that she intended to say “I'm sorry””, I was quite confused.</p>
</body>
</html>

View File

@ -5,17 +5,9 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>
<body>
<blockquote>
… ἔοικα γοῦν τούτου γε σμικρῷ τινι αὐτῷ τούτῳ σοφώτερος εἶναι, ὅτι ἃ μὴ οἶδα οὐδὲ οἴομαι εἰδέναι.
</blockquote>
<p>
— Plato
</p>
<blockquote>
… I seem, then, in just this little thing to be wiser than this man at any rate, that what I do not know I do not think I know either.
</blockquote>
<p>
— from the Henry Cary literal translation of 1897
</p>
<blockquote> … ἔοικα γοῦν τούτου γε σμικρῷ τινι αὐτῷ τούτῳ σοφώτερος εἶναι, ὅτι ἃ μὴ οἶδα οὐδὲ οἴομαι εἰδέναι. </blockquote>
<p>— Plato</p>
<blockquote> … I seem, then, in just this little thing to be wiser than this man at any rate, that what I do not know I do not think I know either. </blockquote>
<p>— from the Henry Cary literal translation of 1897</p>
</body>
</html>