feat: [WIP] write tags

skip-checks:true
This commit is contained in:
Tobias Schmitz 2025-05-22 12:03:10 +02:00
parent 7ac8f48afa
commit 8c861d2d27
No known key found for this signature in database
4 changed files with 182 additions and 37 deletions

View File

@ -185,6 +185,8 @@ fn layout_page_run_impl(
)?;
// Layouts a single marginal.
// TODO: add some sort of tag that indicates the marginals and use it to
// mark them as artifacts for PDF/UA.
let mut layout_marginal = |content: &Option<Content>, area, align| {
let Some(content) = content else { return Ok(None) };
let aligned = content.clone().styled(AlignElem::set_alignment(align));

View File

@ -10,11 +10,11 @@ use krilla::error::KrillaError;
use krilla::geom::PathBuilder;
use krilla::page::{PageLabel, PageSettings};
use krilla::surface::Surface;
use krilla::tagging::{Node, SpanTag, Tag, TagGroup, TagTree};
use krilla::tagging::{ArtifactType, ContentTag, Node};
use krilla::{Document, SerializeSettings};
use krilla_svg::render_svg_glyph;
use typst_library::diag::{bail, error, SourceDiagnostic, SourceResult};
use typst_library::foundations::{NativeElement, StyleChain};
use typst_library::foundations::NativeElement;
use typst_library::introspection::{self, Location};
use typst_library::layout::{
Abs, Frame, FrameItem, GroupItem, PagedDocument, Size, Transform,
@ -31,6 +31,7 @@ use crate::metadata::build_metadata;
use crate::outline::build_outline;
use crate::page::PageLabelExt;
use crate::shape::handle_shape;
use crate::tags::{handle_close_tag, handle_open_tag, Tags};
use crate::text::handle_text;
use crate::util::{convert_path, display_font, AbsExt, TransformExt};
use crate::PdfOptions;
@ -49,6 +50,8 @@ pub fn convert(
xmp_metadata: true,
cmyk_profile: None,
configuration: config,
// TODO: Should we just set this to false? If set to `false` this will
// automatically be enabled if the `UA1` validator is used.
enable_tagging: true,
render_svg_glyph_fn: render_svg_glyph,
};
@ -70,12 +73,7 @@ pub fn convert(
document.set_outline(build_outline(&gc));
document.set_metadata(build_metadata(&gc));
let mut tag_tree = TagTree::new();
for tag in gc.tags.drain(..) {
tag_tree.push(tag);
}
document.set_tag_tree(tag_tree);
document.set_tag_tree(gc.tags.take_tree());
finish(document, gc, options.standards.config)
}
@ -115,6 +113,19 @@ fn convert_pages(gc: &mut GlobalContext, document: &mut Document) -> SourceResul
let mut surface = page.surface();
let mut fc = FrameContext::new(typst_page.frame.size());
// Marked-content may not cross page boundaries: reopen tag
// that was closed at the end of the last page.
if let Some((_, _, nodes)) = gc.tags.stack.last_mut() {
let tag = if gc.tags.in_artifact {
ContentTag::Artifact(ArtifactType::Other)
} else {
ContentTag::Other
};
// TODO: somehow avoid empty marked-content sequences
let id = surface.start_tagged(tag);
nodes.push(Node::Leaf(id));
}
handle_frame(
&mut fc,
&typst_page.frame,
@ -123,6 +134,11 @@ fn convert_pages(gc: &mut GlobalContext, document: &mut Document) -> SourceResul
gc,
)?;
// Marked-content may not cross page boundaries: close open tag.
if !gc.tags.stack.is_empty() {
surface.end_tagged();
}
surface.finish();
for annotation in fc.annotations {
@ -235,8 +251,8 @@ pub(crate) struct GlobalContext<'a> {
/// The languages used throughout the document.
pub(crate) languages: BTreeMap<Lang, usize>,
pub(crate) page_index_converter: PageIndexConverter,
pub(crate) tag_stack: Vec<Location>,
pub(crate) tags: Vec<Node>,
/// Tagged PDF context.
pub(crate) tags: Tags,
}
impl<'a> GlobalContext<'a> {
@ -256,8 +272,8 @@ impl<'a> GlobalContext<'a> {
image_spans: HashSet::new(),
languages: BTreeMap::new(),
page_index_converter,
tag_stack: Vec::new(),
tags: Vec::new(),
tags: Tags::new(),
}
}
}
@ -294,33 +310,10 @@ pub(crate) fn handle_frame(
}
FrameItem::Link(d, s) => handle_link(fc, gc, d, *s),
FrameItem::Tag(introspection::Tag::Start(elem)) => {
let Some(heading) = elem.to_packed::<HeadingElem>() else { continue };
let Some(loc) = heading.location() else { continue };
let level = heading.resolve_level(StyleChain::default());
let name = heading.body.plain_text().to_string();
let heading_id = surface
.start_tagged(krilla::tagging::ContentTag::Span(SpanTag::empty()));
let tag = match level.get() {
1 => Tag::H1(Some(name)),
2 => Tag::H2(Some(name)),
3 => Tag::H3(Some(name)),
4 => Tag::H4(Some(name)),
5 => Tag::H5(Some(name)),
_ => Tag::H6(Some(name)),
};
let mut tag_group = TagGroup::new(tag);
tag_group.push(Node::Leaf(heading_id));
gc.tags.push(Node::Group(tag_group));
gc.tag_stack.push(loc);
handle_open_tag(gc, surface, elem)
}
FrameItem::Tag(introspection::Tag::End(loc, _)) => {
// FIXME: support or split up content tags that span multiple pages
if gc.tag_stack.last() == Some(loc) {
surface.end_tagged();
gc.tag_stack.pop();
}
handle_close_tag(gc, surface, loc);
}
}

View File

@ -9,6 +9,7 @@ mod outline;
mod page;
mod paint;
mod shape;
mod tags;
mod text;
mod util;

View File

@ -0,0 +1,149 @@
use krilla::surface::Surface;
use krilla::tagging::{ContentTag, Node, Tag, TagGroup, TagTree};
use typst_library::foundations::{Content, StyleChain};
use typst_library::introspection::Location;
use typst_library::model::{HeadingElem, OutlineElem, OutlineEntry};
use crate::convert::GlobalContext;
/// State used while building the tag tree of a tagged PDF.
pub(crate) struct Tags {
/// The intermediary stack of nested tag groups.
///
/// Each entry holds the location of the element that opened the group, the
/// tag the group will be created with, and the child nodes collected for it
/// so far. `handle_open_tag` pushes entries; `handle_close_tag` pops the top
/// entry when its location matches the closing tag.
pub(crate) stack: Vec<(Location, Tag, Vec<Node>)>,
/// Whether content is currently being emitted as an artifact. While this is
/// set, `handle_open_tag` returns early without opening a tag group.
// NOTE(review): nothing in this file sets this flag to `true` yet.
pub(crate) in_artifact: bool,
/// The output.
///
/// A closed group is appended here when no parent group is left on `stack`.
pub(crate) tree: TagTree,
}
impl Tags {
    /// Creates an empty tagging context: no open groups, not inside an
    /// artifact, and an empty output tree.
    pub(crate) fn new() -> Self {
        Self {
            stack: Vec::new(),
            in_artifact: false,
            tree: TagTree::new(),
        }
    }

    /// Moves the tag tree built so far out of the context, leaving a default
    /// tree in its place.
    pub(crate) fn take_tree(&mut self) -> TagTree {
        std::mem::take(&mut self.tree)
    }

    /// Returns whether `tag` may be nested directly inside the innermost
    /// open tag group.
    ///
    /// Returns `true` when no group is open. Parents whose nesting rules are
    /// not implemented yet are treated permissively (`true`) instead of
    /// panicking, so that exporting a document never aborts just because an
    /// element is opened inside such a parent.
    pub(crate) fn context_supports(&self, tag: &Tag) -> bool {
        let Some((_, parent, _)) = self.stack.last() else { return true };

        use Tag::*;
        match parent {
            Part | Section => true,
            Article => !matches!(tag, Article),
            TOC => matches!(tag, TOC | TOCI),
            // TODO: NonStruct is also allowed, but (currently?) not supported
            // by krilla.
            TOCI => matches!(tag, TOC | Lbl | Reference | P),
            // TODO: implement the nesting rules for these parents. Until
            // then, accept every child rather than panic at export time.
            BlockQuote
            | Caption
            | Index
            | P
            | H1(_)
            | H2(_)
            | H3(_)
            | H4(_)
            | H5(_)
            | H6(_)
            | L(_)
            | LI
            | Lbl
            | LBody
            | Table
            | TR
            | TH(_)
            | TD
            | THead
            | TBody
            | TFoot
            | InlineQuote
            | Note
            | Reference
            | BibEntry
            | Code
            | Link
            | Annot
            | Figure(_)
            | Formula(_)
            | Datetime
            | Terms
            | Title => true,
        }
    }
}
pub(crate) fn handle_open_tag(
gc: &mut GlobalContext,
surface: &mut Surface,
elem: &Content,
) {
if gc.tags.in_artifact {
return;
}
let Some(loc) = elem.location() else { return };
let tag = if let Some(heading) = elem.to_packed::<HeadingElem>() {
let level = heading.resolve_level(StyleChain::default());
let name = heading.body.plain_text().to_string();
match level.get() {
1 => Tag::H1(Some(name)),
2 => Tag::H2(Some(name)),
3 => Tag::H3(Some(name)),
4 => Tag::H4(Some(name)),
5 => Tag::H5(Some(name)),
// TODO: when targeting PDF 2.0 headings `> 6` are supported
_ => Tag::H6(Some(name)),
}
} else if let Some(_) = elem.to_packed::<OutlineElem>() {
Tag::TOC
} else if let Some(_outline_entry) = elem.to_packed::<OutlineEntry>() {
Tag::TOCI
} else {
return;
};
if !gc.tags.context_supports(&tag) {
// TODO: error or warning?
}
// close previous marked-content and open a nested tag.
if !gc.tags.stack.is_empty() {
surface.end_tagged();
}
let content_id = surface.start_tagged(krilla::tagging::ContentTag::Other);
gc.tags.stack.push((loc, tag, vec![Node::Leaf(content_id)]));
}
/// Handles an introspection end tag by closing the innermost open tag group,
/// provided that group was opened at `loc`; otherwise nothing happens.
///
/// The collected child nodes are wrapped in a `TagGroup` and the current
/// marked-content sequence is ended. When a parent group remains open, the
/// finished group is appended to it and a new marked-content sequence is
/// started for the parent's following content. Without a parent, the group is
/// appended to the output tree.
pub(crate) fn handle_close_tag(
    gc: &mut GlobalContext,
    surface: &mut Surface,
    loc: &Location,
) {
    let Some((_, tag, children)) =
        gc.tags.stack.pop_if(|(open_loc, ..)| open_loc == loc)
    else {
        return;
    };

    // TODO: construct group directly from nodes
    let mut group = TagGroup::new(tag);
    for child in children {
        group.push(child);
    }

    surface.end_tagged();

    match gc.tags.stack.last_mut() {
        Some((_, _, siblings)) => {
            siblings.push(Node::Group(group));
            // TODO: somehow avoid empty marked-content sequences
            let id = surface.start_tagged(ContentTag::Other);
            siblings.push(Node::Leaf(id));
        }
        None => {
            gc.tags.tree.push(Node::Group(group));
        }
    }
}