HTML DOM types

This commit is contained in:
Laurenz 2024-12-02 13:51:25 +01:00
parent 497446944c
commit 0ef97c104a
5 changed files with 645 additions and 1 deletions

View File

@ -646,7 +646,7 @@ impl Repr for str {
'\0' => r.push_str(r"\u{0}"),
'\'' => r.push('\''),
'"' => r.push_str(r#"\""#),
_ => c.escape_debug().for_each(|c| r.push(c)),
_ => r.extend(c.escape_debug()),
}
}
r.push('"');
@ -654,6 +654,12 @@ impl Repr for str {
}
}
impl Repr for char {
fn repr(&self) -> EcoString {
EcoString::from(*self).repr()
}
}
impl Add for Str {
type Output = Self;

View File

@ -0,0 +1,572 @@
use std::fmt::{self, Debug, Display, Formatter};
use ecow::{EcoString, EcoVec};
use typst_syntax::Span;
use typst_utils::{PicoStr, ResolvedPicoStr};
use crate::diag::{bail, HintedStrResult, StrResult};
use crate::foundations::{cast, Dict, Repr, Str};
use crate::introspection::{Introspector, Tag};
use crate::layout::Frame;
use crate::model::DocumentInfo;
/// An HTML document.
#[derive(Debug, Clone)]
pub struct HtmlDocument {
/// The document's root HTML element.
pub root: HtmlElement,
/// Details about the document.
pub info: DocumentInfo,
/// Provides the ability to execute queries on the document.
pub introspector: Introspector,
}
/// A child of an HTML element.
#[derive(Debug, Clone, Hash)]
pub enum HtmlNode {
/// An introspectable element that produced something within this node.
Tag(Tag),
/// Plain text.
Text(EcoString, Span),
/// Another element.
Element(HtmlElement),
/// A frame that will be displayed as an embedded SVG.
Frame(Frame),
}
impl HtmlNode {
/// Create a plain text node.
pub fn text(text: impl Into<EcoString>, span: Span) -> Self {
Self::Text(text.into(), span)
}
}
impl From<HtmlElement> for HtmlNode {
fn from(element: HtmlElement) -> Self {
Self::Element(element)
}
}
/// An HTML element.
#[derive(Debug, Clone, Hash)]
pub struct HtmlElement {
/// The HTML tag.
pub tag: HtmlTag,
/// The element's attributes.
pub attrs: HtmlAttrs,
/// The element's children.
pub children: Vec<HtmlNode>,
/// The span from which the element originated, if any.
pub span: Span,
}
impl HtmlElement {
/// Create a new, blank element without attributes or children.
pub fn new(tag: HtmlTag) -> Self {
Self {
tag,
attrs: HtmlAttrs::default(),
children: vec![],
span: Span::detached(),
}
}
/// Attach children to the element.
///
/// Note: This overwrites potential previous children.
pub fn with_children(mut self, children: Vec<HtmlNode>) -> Self {
self.children = children;
self
}
/// Add an atribute to the element.
pub fn with_attr(mut self, key: HtmlAttr, value: impl Into<EcoString>) -> Self {
self.attrs.push(key, value);
self
}
/// Attach a span to the element.
pub fn spanned(mut self, span: Span) -> Self {
self.span = span;
self
}
}
/// The tag of an HTML element.
#[derive(Copy, Clone, Eq, PartialEq, Hash)]
pub struct HtmlTag(PicoStr);
impl HtmlTag {
/// Intern an HTML tag string at runtime.
pub fn intern(string: &str) -> StrResult<Self> {
if string.is_empty() {
bail!("tag name must not be empty");
}
if let Some(c) = string.chars().find(|&c| !charsets::is_valid_in_tag_name(c)) {
bail!("the character {} is not valid in a tag name", c.repr());
}
Ok(Self(PicoStr::intern(string)))
}
/// Creates a compile-time constant `HtmlTag`.
///
/// Should only be used in const contexts because it can panic.
#[track_caller]
pub const fn constant(string: &'static str) -> Self {
if string.is_empty() {
panic!("tag name must not be empty");
}
let bytes = string.as_bytes();
let mut i = 0;
while i < bytes.len() {
if !bytes[i].is_ascii_alphanumeric() {
panic!("constant tag name must be ASCII alphanumeric");
}
i += 1;
}
Self(PicoStr::constant(string))
}
/// Resolves the tag to a string.
pub fn resolve(self) -> ResolvedPicoStr {
self.0.resolve()
}
/// Turns the tag into its inner interned string.
pub const fn into_inner(self) -> PicoStr {
self.0
}
}
impl Debug for HtmlTag {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
Display::fmt(self, f)
}
}
impl Display for HtmlTag {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
write!(f, "<{}>", self.resolve())
}
}
cast! {
HtmlTag,
self => self.0.resolve().as_str().into_value(),
v: Str => Self::intern(&v)?,
}
/// Attributes of an HTML element.
#[derive(Debug, Default, Clone, Eq, PartialEq, Hash)]
pub struct HtmlAttrs(pub EcoVec<(HtmlAttr, EcoString)>);
impl HtmlAttrs {
/// Add an attribute.
pub fn push(&mut self, attr: HtmlAttr, value: impl Into<EcoString>) {
self.0.push((attr, value.into()));
}
}
cast! {
HtmlAttrs,
self => self.0
.into_iter()
.map(|(key, value)| (key.resolve().as_str().into(), value.into_value()))
.collect::<Dict>()
.into_value(),
values: Dict => Self(values
.into_iter()
.map(|(k, v)| {
let attr = HtmlAttr::intern(&k)?;
let value = v.cast::<EcoString>()?;
Ok((attr, value))
})
.collect::<HintedStrResult<_>>()?),
}
/// An attribute of an HTML.
#[derive(Copy, Clone, Eq, PartialEq, Hash)]
pub struct HtmlAttr(PicoStr);
impl HtmlAttr {
/// Intern an HTML attribute string at runtime.
pub fn intern(string: &str) -> StrResult<Self> {
if string.is_empty() {
bail!("attribute name must not be empty");
}
if let Some(c) =
string.chars().find(|&c| !charsets::is_valid_in_attribute_name(c))
{
bail!("the character {} is not valid in an attribute name", c.repr());
}
Ok(Self(PicoStr::intern(string)))
}
/// Creates a compile-time constant `HtmlAttr`.
///
/// Should only be used in const contexts because it can panic.
#[track_caller]
pub const fn constant(string: &'static str) -> Self {
if string.is_empty() {
panic!("attribute name must not be empty");
}
let bytes = string.as_bytes();
let mut i = 0;
while i < bytes.len() {
if !bytes[i].is_ascii_alphanumeric() {
panic!("constant attribute name must be ASCII alphanumeric");
}
i += 1;
}
Self(PicoStr::constant(string))
}
/// Resolves the attribute to a string.
pub fn resolve(self) -> ResolvedPicoStr {
self.0.resolve()
}
/// Turns the attribute into its inner interned string.
pub const fn into_inner(self) -> PicoStr {
self.0
}
}
impl Debug for HtmlAttr {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
Display::fmt(self, f)
}
}
impl Display for HtmlAttr {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.resolve())
}
}
cast! {
HtmlAttr,
self => self.0.resolve().as_str().into_value(),
v: Str => Self::intern(&v)?,
}
/// Defines syntactical properties of HTML tags, attributes, and text.
pub mod charsets {
/// Check whether a character is in a tag name.
pub const fn is_valid_in_tag_name(c: char) -> bool {
c.is_ascii_alphanumeric()
}
/// Check whether a character is valid in an attribute name.
pub const fn is_valid_in_attribute_name(c: char) -> bool {
match c {
// These are forbidden.
'\0' | ' ' | '"' | '\'' | '>' | '/' | '=' => false,
c if is_whatwg_control_char(c) => false,
c if is_whatwg_non_char(c) => false,
// _Everything_ else is allowed, including U+2029 paragraph
// separator. Go wild.
_ => true,
}
}
/// Check whether a character can be an used in an attribute value without
/// escaping.
///
/// See <https://html.spec.whatwg.org/multipage/syntax.html#attributes-2>
pub const fn is_valid_in_attribute_value(c: char) -> bool {
match c {
// Ampersands are sometimes legal (i.e. when they are not _ambiguous
// ampersands_) but it is not worth the trouble to check for that.
'&' => false,
// Quotation marks are not allowed in double-quote-delimited attribute
// values.
'"' => false,
// All other text characters are allowed.
c => is_w3c_text_char(c),
}
}
/// Check whether a character can be an used in normal text without
/// escaping.
pub const fn is_valid_in_normal_element_text(c: char) -> bool {
match c {
// Ampersands are sometimes legal (i.e. when they are not _ambiguous
// ampersands_) but it is not worth the trouble to check for that.
'&' => false,
// Less-than signs are not allowed in text.
'<' => false,
// All other text characters are allowed.
c => is_w3c_text_char(c),
}
}
/// Check if something is valid text in HTML.
pub const fn is_w3c_text_char(c: char) -> bool {
match c {
// Non-characters are obviously not text characters.
c if is_whatwg_non_char(c) => false,
// Control characters are disallowed, except for whitespace.
c if is_whatwg_control_char(c) => c.is_ascii_whitespace(),
// Everything else is allowed.
_ => true,
}
}
const fn is_whatwg_non_char(c: char) -> bool {
match c {
'\u{fdd0}'..='\u{fdef}' => true,
// Non-characters matching xxFFFE or xxFFFF up to x10FFFF (inclusive).
c if c as u32 & 0xfffe == 0xfffe && c as u32 <= 0x10ffff => true,
_ => false,
}
}
const fn is_whatwg_control_char(c: char) -> bool {
match c {
// C0 control characters.
'\u{00}'..='\u{1f}' => true,
// Other control characters.
'\u{7f}'..='\u{9f}' => true,
_ => false,
}
}
}
/// Predefined constants for HTML tags.
pub mod tag {
use super::HtmlTag;
macro_rules! tags {
($($tag:ident)*) => {
$(#[allow(non_upper_case_globals)]
pub const $tag: HtmlTag = HtmlTag::constant(
stringify!($tag)
);)*
}
}
tags! {
a
abbr
address
area
article
aside
audio
b
base
bdi
bdo
blockquote
body
br
button
canvas
caption
cite
code
col
colgroup
data
datalist
dd
del
details
dfn
dialog
div
dl
dt
em
embed
fieldset
figcaption
figure
footer
form
h1
h2
h3
h4
h5
h6
head
header
hgroup
hr
html
i
iframe
img
input
ins
kbd
label
legend
li
link
main
map
mark
menu
meta
meter
nav
noscript
object
ol
optgroup
option
output
p
param
picture
pre
progress
q
rp
rt
ruby
s
samp
script
search
section
select
slot
small
source
span
strong
style
sub
summary
sup
table
tbody
td
template
textarea
tfoot
th
thead
time
title
tr
track
u
ul
var
video
wbr
}
/// Whether the element is inline-level as opposed to being block-level.
///
/// Not sure whether this distinction really makes sense. But we somehow
/// need to decide what to put into automatic paragraphs. A `<strong>`
/// should merged into a paragraph created by realization, but a `<div>`
/// shouldn't.
///
/// <https://www.w3.org/TR/html401/struct/global.html#block-inline>
/// <https://developer.mozilla.org/en-US/docs/Glossary/Inline-level_content>
/// <https://github.com/orgs/mdn/discussions/353>
pub fn is_inline(tag: HtmlTag) -> bool {
matches!(
tag,
self::abbr
| self::a
| self::bdi
| self::b
| self::br
| self::bdo
| self::code
| self::cite
| self::dfn
| self::data
| self::i
| self::em
| self::mark
| self::kbd
| self::rp
| self::q
| self::ruby
| self::rt
| self::samp
| self::s
| self::span
| self::small
| self::sub
| self::strong
| self::time
| self::sup
| self::var
| self::u
)
}
/// Whether this is a void tag whose associated element may not have a
/// children.
pub fn is_void(tag: HtmlTag) -> bool {
matches!(
tag,
self::area
| self::base
| self::br
| self::col
| self::embed
| self::hr
| self::img
| self::input
| self::link
| self::meta
| self::param
| self::source
| self::track
| self::wbr
)
}
/// Whether this is a tag containing raw text.
pub fn is_raw(tag: HtmlTag) -> bool {
matches!(tag, self::script | self::style)
}
/// Whether this is a tag containing escapable raw text.
pub fn is_escapable_raw(tag: HtmlTag) -> bool {
matches!(tag, self::textarea | self::title)
}
}
/// Predefined constants for HTML attributes.
///
/// Note: These are very incomplete.
pub mod attr {
use super::HtmlAttr;
macro_rules! attrs {
($($attr:ident)*) => {
$(#[allow(non_upper_case_globals)]
pub const $attr: HtmlAttr = HtmlAttr::constant(
stringify!($attr)
);)*
}
}
attrs! {
charset
content
href
name
value
}
}

View File

@ -0,0 +1,59 @@
//! HTML output.
mod dom;
pub use self::dom::*;
use ecow::EcoString;
use crate::foundations::{category, elem, Category, Content, Module, Scope};
/// HTML output.
#[category]
pub static HTML: Category;
/// Create a module with all HTML definitions.
pub fn module() -> Module {
let mut html = Scope::deduplicating();
html.category(HTML);
html.define_elem::<HtmlElem>();
html.define_elem::<FrameElem>();
Module::new("html", html)
}
/// A HTML element that can contain Typst content.
#[elem(name = "elem")]
pub struct HtmlElem {
/// The element's tag.
#[required]
pub tag: HtmlTag,
/// The element's attributes.
#[borrowed]
pub attrs: HtmlAttrs,
/// The contents of the HTML element.
#[positional]
#[borrowed]
pub body: Option<Content>,
}
impl HtmlElem {
/// Add an atribute to the element.
pub fn with_attr(mut self, attr: HtmlAttr, value: impl Into<EcoString>) -> Self {
self.attrs.get_or_insert_with(Default::default).push(attr, value);
self
}
}
/// An element that forces its contents to be laid out.
///
/// Integrates content that requires layout (e.g. a plot) into HTML output
/// by turning it into an inline SVG.
#[elem]
pub struct FrameElem {
/// The contents that shall be laid out.
#[positional]
#[required]
pub body: Content,
}

View File

@ -15,6 +15,7 @@ extern crate self as typst_library;
pub mod diag;
pub mod engine;
pub mod foundations;
pub mod html;
pub mod introspection;
pub mod layout;
pub mod loading;
@ -248,6 +249,10 @@ fn global(math: Module, inputs: Dict, features: &Features) -> Module {
self::introspection::define(&mut global);
self::loading::define(&mut global);
self::symbols::define(&mut global);
global.reset_category();
if features.is_enabled(Feature::Html) {
global.define_module(self::html::module());
}
prelude(&mut global);
Module::new("global", global)
}

View File

@ -216,6 +216,8 @@ mod exceptions {
pub const LIST: &[&str] = &[
"cjk-latin-spacing",
"discretionary-ligatures",
"h5",
"h6",
"historical-ligatures",
"number-clearance",
"number-margin",