From 8c861d2d274c04b99ef64c886bfc888e182b4c62 Mon Sep 17 00:00:00 2001 From: Tobias Schmitz Date: Thu, 22 May 2025 12:03:10 +0200 Subject: [PATCH] feat: [WIP] write tags skip-checks:true --- crates/typst-layout/src/pages/run.rs | 2 + crates/typst-pdf/src/convert.rs | 67 ++++++------ crates/typst-pdf/src/lib.rs | 1 + crates/typst-pdf/src/tags.rs | 149 +++++++++++++++++++++++++++ 4 files changed, 182 insertions(+), 37 deletions(-) create mode 100644 crates/typst-pdf/src/tags.rs diff --git a/crates/typst-layout/src/pages/run.rs b/crates/typst-layout/src/pages/run.rs index 6d2d29da5..e9e4e1105 100644 --- a/crates/typst-layout/src/pages/run.rs +++ b/crates/typst-layout/src/pages/run.rs @@ -185,6 +185,8 @@ fn layout_page_run_impl( )?; // Layouts a single marginal. + // TODO: add some sort of tag that indicates the marginals and use it to + // mark them as artifacts for PDF/UA. let mut layout_marginal = |content: &Option, area, align| { let Some(content) = content else { return Ok(None) }; let aligned = content.clone().styled(AlignElem::set_alignment(align)); diff --git a/crates/typst-pdf/src/convert.rs b/crates/typst-pdf/src/convert.rs index a925813c2..952767c6e 100644 --- a/crates/typst-pdf/src/convert.rs +++ b/crates/typst-pdf/src/convert.rs @@ -10,11 +10,11 @@ use krilla::error::KrillaError; use krilla::geom::PathBuilder; use krilla::page::{PageLabel, PageSettings}; use krilla::surface::Surface; -use krilla::tagging::{Node, SpanTag, Tag, TagGroup, TagTree}; +use krilla::tagging::{ArtifactType, ContentTag, Node}; use krilla::{Document, SerializeSettings}; use krilla_svg::render_svg_glyph; use typst_library::diag::{bail, error, SourceDiagnostic, SourceResult}; -use typst_library::foundations::{NativeElement, StyleChain}; +use typst_library::foundations::NativeElement; use typst_library::introspection::{self, Location}; use typst_library::layout::{ Abs, Frame, FrameItem, GroupItem, PagedDocument, Size, Transform, @@ -31,6 +31,7 @@ use crate::metadata::build_metadata; use 
crate::outline::build_outline; use crate::page::PageLabelExt; use crate::shape::handle_shape; +use crate::tags::{handle_close_tag, handle_open_tag, Tags}; use crate::text::handle_text; use crate::util::{convert_path, display_font, AbsExt, TransformExt}; use crate::PdfOptions; @@ -49,6 +50,8 @@ pub fn convert( xmp_metadata: true, cmyk_profile: None, configuration: config, + // TODO: Should we just set this to false? If set to `false` this will + // automatically be enabled if the `UA1` validator is used. enable_tagging: true, render_svg_glyph_fn: render_svg_glyph, }; @@ -70,12 +73,7 @@ pub fn convert( document.set_outline(build_outline(&gc)); document.set_metadata(build_metadata(&gc)); - - let mut tag_tree = TagTree::new(); - for tag in gc.tags.drain(..) { - tag_tree.push(tag); - } - document.set_tag_tree(tag_tree); + document.set_tag_tree(gc.tags.take_tree()); finish(document, gc, options.standards.config) } @@ -115,6 +113,19 @@ fn convert_pages(gc: &mut GlobalContext, document: &mut Document) -> SourceResul let mut surface = page.surface(); let mut fc = FrameContext::new(typst_page.frame.size()); + // Marked-content may not cross page boundaries: reopen tag + // that was closed at the end of the last page. + if let Some((_, _, nodes)) = gc.tags.stack.last_mut() { + let tag = if gc.tags.in_artifact { + ContentTag::Artifact(ArtifactType::Other) + } else { + ContentTag::Other + }; + // TODO: somehow avoid empty marked-content sequences + let id = surface.start_tagged(tag); + nodes.push(Node::Leaf(id)); + } + handle_frame( &mut fc, &typst_page.frame, @@ -123,6 +134,11 @@ fn convert_pages(gc: &mut GlobalContext, document: &mut Document) -> SourceResul gc, )?; + // Marked-content may not cross page boundaries: close open tag. + if !gc.tags.stack.is_empty() { + surface.end_tagged(); + } + surface.finish(); for annotation in fc.annotations { @@ -235,8 +251,8 @@ pub(crate) struct GlobalContext<'a> { /// The languages used throughout the document. 
pub(crate) languages: BTreeMap, pub(crate) page_index_converter: PageIndexConverter, - pub(crate) tag_stack: Vec, - pub(crate) tags: Vec, + /// Tagged PDF context. + pub(crate) tags: Tags, } impl<'a> GlobalContext<'a> { @@ -256,8 +272,8 @@ impl<'a> GlobalContext<'a> { image_spans: HashSet::new(), languages: BTreeMap::new(), page_index_converter, - tag_stack: Vec::new(), - tags: Vec::new(), + + tags: Tags::new(), } } } @@ -294,33 +310,10 @@ pub(crate) fn handle_frame( } FrameItem::Link(d, s) => handle_link(fc, gc, d, *s), FrameItem::Tag(introspection::Tag::Start(elem)) => { - let Some(heading) = elem.to_packed::() else { continue }; - let Some(loc) = heading.location() else { continue }; - - let level = heading.resolve_level(StyleChain::default()); - let name = heading.body.plain_text().to_string(); - let heading_id = surface - .start_tagged(krilla::tagging::ContentTag::Span(SpanTag::empty())); - let tag = match level.get() { - 1 => Tag::H1(Some(name)), - 2 => Tag::H2(Some(name)), - 3 => Tag::H3(Some(name)), - 4 => Tag::H4(Some(name)), - 5 => Tag::H5(Some(name)), - _ => Tag::H6(Some(name)), - }; - let mut tag_group = TagGroup::new(tag); - tag_group.push(Node::Leaf(heading_id)); - gc.tags.push(Node::Group(tag_group)); - - gc.tag_stack.push(loc); + handle_open_tag(gc, surface, elem) } FrameItem::Tag(introspection::Tag::End(loc, _)) => { - // FIXME: support or split up content tags that span multiple pages - if gc.tag_stack.last() == Some(loc) { - surface.end_tagged(); - gc.tag_stack.pop(); - } + handle_close_tag(gc, surface, loc); } } diff --git a/crates/typst-pdf/src/lib.rs b/crates/typst-pdf/src/lib.rs index 4e0b74308..9f3065a0c 100644 --- a/crates/typst-pdf/src/lib.rs +++ b/crates/typst-pdf/src/lib.rs @@ -9,6 +9,7 @@ mod outline; mod page; mod paint; mod shape; +mod tags; mod text; mod util; diff --git a/crates/typst-pdf/src/tags.rs b/crates/typst-pdf/src/tags.rs new file mode 100644 index 000000000..70792dfe8 --- /dev/null +++ b/crates/typst-pdf/src/tags.rs @@ 
-0,0 +1,149 @@ +use krilla::surface::Surface; +use krilla::tagging::{ContentTag, Node, Tag, TagGroup, TagTree}; +use typst_library::foundations::{Content, StyleChain}; +use typst_library::introspection::Location; +use typst_library::model::{HeadingElem, OutlineElem, OutlineEntry}; + +use crate::convert::GlobalContext; + +pub(crate) struct Tags { + /// The intermediary stack of nested tag groups. + pub(crate) stack: Vec<(Location, Tag, Vec<Node>)>, + pub(crate) in_artifact: bool, + + /// The output. + pub(crate) tree: TagTree, +} + +impl Tags { + pub(crate) fn new() -> Self { + Self { + stack: Vec::new(), + in_artifact: false, + tree: TagTree::new(), + } + } + + pub(crate) fn take_tree(&mut self) -> TagTree { + std::mem::take(&mut self.tree) + } + + pub(crate) fn context_supports(&self, tag: &Tag) -> bool { + let Some((_, parent, _)) = self.stack.last() else { return true }; + + use Tag::*; + + match parent { + Part => true, + Article => !matches!(tag, Article), + Section => true, + BlockQuote => todo!(), + Caption => todo!(), + TOC => matches!(tag, TOC | TOCI), + // TODO: NonStruct is also allowed, but is (currently?) 
not supported by krilla + TOCI => matches!(tag, TOC | Lbl | Reference | P), + Index => todo!(), + P => todo!(), + H1(_) => todo!(), + H2(_) => todo!(), + H3(_) => todo!(), + H4(_) => todo!(), + H5(_) => todo!(), + H6(_) => todo!(), + L(_list_numbering) => todo!(), + LI => todo!(), + Lbl => todo!(), + LBody => todo!(), + Table => todo!(), + TR => todo!(), + TH(_table_header_scope) => todo!(), + TD => todo!(), + THead => todo!(), + TBody => todo!(), + TFoot => todo!(), + InlineQuote => todo!(), + Note => todo!(), + Reference => todo!(), + BibEntry => todo!(), + Code => todo!(), + Link => todo!(), + Annot => todo!(), + Figure(_) => todo!(), + Formula(_) => todo!(), + Datetime => todo!(), + Terms => todo!(), + Title => todo!(), + } + } +} + +pub(crate) fn handle_open_tag( + gc: &mut GlobalContext, + surface: &mut Surface, + elem: &Content, +) { + if gc.tags.in_artifact { + return; + } + + let Some(loc) = elem.location() else { return }; + + let tag = if let Some(heading) = elem.to_packed::<HeadingElem>() { + let level = heading.resolve_level(StyleChain::default()); + let name = heading.body.plain_text().to_string(); + match level.get() { + 1 => Tag::H1(Some(name)), + 2 => Tag::H2(Some(name)), + 3 => Tag::H3(Some(name)), + 4 => Tag::H4(Some(name)), + 5 => Tag::H5(Some(name)), + // TODO: when targeting PDF 2.0 headings `> 6` are supported + _ => Tag::H6(Some(name)), + } + } else if let Some(_) = elem.to_packed::<OutlineElem>() { + Tag::TOC + } else if let Some(_outline_entry) = elem.to_packed::<OutlineEntry>() { + Tag::TOCI + } else { + return; + }; + + if !gc.tags.context_supports(&tag) { + // TODO: error or warning? + } + + // close previous marked-content and open a nested tag. 
+ if !gc.tags.stack.is_empty() { + surface.end_tagged(); + } + let content_id = surface.start_tagged(krilla::tagging::ContentTag::Other); + + gc.tags.stack.push((loc, tag, vec![Node::Leaf(content_id)])); +} + +pub(crate) fn handle_close_tag( + gc: &mut GlobalContext, + surface: &mut Surface, + loc: &Location, +) { + let Some((_, tag, nodes)) = gc.tags.stack.pop_if(|(l, ..)| l == loc) else { + return; + }; + // TODO: construct group directly from nodes + let mut tag_group = TagGroup::new(tag); + for node in nodes { + tag_group.push(node); + } + + surface.end_tagged(); + + if let Some((_, _, parent_nodes)) = gc.tags.stack.last_mut() { + parent_nodes.push(Node::Group(tag_group)); + + // TODO: somehow avoid empty marked-content sequences + let id = surface.start_tagged(ContentTag::Other); + parent_nodes.push(Node::Leaf(id)); + } else { + gc.tags.tree.push(Node::Group(tag_group)); + } +}