Embed files associated with the document as a whole (#5221)

Co-authored-by: Laurenz <laurmaedje@gmail.com>
This commit is contained in:
Niklas Eicker 2025-01-08 10:38:34 +01:00 committed by GitHub
parent 265df6c29f
commit 0a374d2380
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 411 additions and 34 deletions

View File

@ -473,6 +473,9 @@ pub enum PdfStandard {
/// PDF/A-2b.
#[value(name = "a-2b")]
A_2b,
/// PDF/A-3b.
#[value(name = "a-3b")]
A_3b,
}
display_possible_values!(PdfStandard);

View File

@ -136,6 +136,7 @@ impl CompileConfig {
.map(|standard| match standard {
PdfStandard::V_1_7 => typst_pdf::PdfStandard::V_1_7,
PdfStandard::A_2b => typst_pdf::PdfStandard::A_2b,
PdfStandard::A_3b => typst_pdf::PdfStandard::A_3b,
})
.collect::<Vec<_>>();
PdfStandards::new(&list)?

View File

@ -21,6 +21,7 @@ pub mod layout;
pub mod loading;
pub mod math;
pub mod model;
pub mod pdf;
pub mod routines;
pub mod symbols;
pub mod text;
@ -249,6 +250,7 @@ fn global(math: Module, inputs: Dict, features: &Features) -> Module {
self::introspection::define(&mut global);
self::loading::define(&mut global);
self::symbols::define(&mut global);
self::pdf::define(&mut global);
global.reset_category();
if features.is_enabled(Feature::Html) {
global.define_module(self::html::module());

View File

@ -0,0 +1,131 @@
use ecow::EcoString;
use typst_syntax::{Span, Spanned};
use crate::diag::{At, SourceResult, StrResult};
use crate::engine::Engine;
use crate::foundations::{
elem, func, scope, Cast, Content, NativeElement, Packed, Show, StyleChain,
};
use crate::introspection::Locatable;
use crate::loading::Readable;
use crate::World;
/// A file that will be embedded into the output PDF.
///
/// This can be used to distribute additional files that are related to the PDF
/// within it. PDF readers will display the files in a file listing.
///
/// Some international standards use this mechanism to embed machine-readable
/// data (e.g., ZUGFeRD/Factur-X for invoices) that mirrors the visual content
/// of the PDF.
///
/// # Example
/// ```typ
/// #pdf.embed(
/// "experiment.csv",
/// relationship: "supplement",
/// mime-type: "text/csv",
/// description: "Raw Oxygen readings from the Arctic experiment",
/// )
/// ```
///
/// # Notes
/// - This element is ignored if exporting to a format other than PDF.
/// - File embeddings are not currently supported for PDF/A-2, even if the
/// embedded file conforms to PDF/A-1 or PDF/A-2.
// Implementation notes:
// - The `Show` impl yields empty content, so the element draws nothing.
// - The PDF exporter collects these elements with an introspector query;
//   `Locatable` is presumably what makes them queryable — TODO confirm.
#[elem(scope, Show, Locatable)]
pub struct EmbedElem {
/// Path to a file to be embedded.
///
/// For more details, see the [Paths section]($syntax/#paths).
#[required]
// The parse block resolves the path relative to the argument's span and reads
// the file eagerly, reporting failures at that span. The bindings `id` and
// `data` it creates are reused by the `#[parse]` expressions of the
// `resolved_path` and `data` fields below, so the field order matters.
#[parse(
let Spanned { v: path, span } =
args.expect::<Spanned<EcoString>>("path to the file to be embedded")?;
let id = span.resolve_path(&path).at(span)?;
let data = engine.world.file(id).at(span)?;
path
)]
#[borrowed]
pub path: EcoString,
/// The resolved project-relative path.
// Built from the rootless virtual path of `id` with backslashes replaced by
// forward slashes. The PDF exporter uses this string as the embedded file's
// name and rejects two embeddings that share it.
#[internal]
#[required]
#[parse(id.vpath().as_rootless_path().to_string_lossy().replace("\\", "/").into())]
pub resolved_path: EcoString,
/// The raw file data.
// Wraps the bytes loaded by the `path` parse block above.
#[internal]
#[required]
#[parse(Readable::Bytes(data))]
pub data: Readable,
/// The relationship of the embedded file to the document.
///
/// Ignored if export doesn't target PDF/A-3.
pub relationship: Option<EmbeddedFileRelationship>,
/// The MIME type of the embedded file.
// The PDF exporter reports an error if this is unset while a PDF/A standard
// is active.
#[borrowed]
pub mime_type: Option<EcoString>,
/// A description for the embedded file.
#[borrowed]
pub description: Option<EcoString>,
}
#[scope]
impl EmbedElem {
    /// Decode a file embedding from bytes or a string.
    #[func(title = "Embed Data")]
    fn decode(
        /// The call span of this function.
        span: Span,
        /// The path that will be written into the PDF. Typst will not read from
        /// this path since the data is provided in the following argument.
        path: EcoString,
        /// The data to embed as a file.
        data: Readable,
        /// The relationship of the embedded file to the document.
        #[named]
        relationship: Option<Option<EmbeddedFileRelationship>>,
        /// The MIME type of the embedded file.
        #[named]
        mime_type: Option<Option<EcoString>>,
        /// A description for the embedded file.
        #[named]
        description: Option<Option<EcoString>>,
    ) -> StrResult<Content> {
        // Nothing is read from disk here, so the user-supplied path doubles
        // as the resolved path under which the file is embedded.
        let resolved_path = path.clone();
        let mut embedding = EmbedElem::new(path, resolved_path, data);

        // The outer `Option` tells whether the named argument was passed at
        // all; a field is only set explicitly when it was.
        if let Some(value) = relationship {
            embedding.push_relationship(value);
        }
        if let Some(value) = mime_type {
            embedding.push_mime_type(value);
        }
        if let Some(value) = description {
            embedding.push_description(value);
        }

        let content = embedding.pack();
        Ok(content.spanned(span))
    }
}
/// An embedded file has no visual representation: showing it produces empty
/// content.
impl Show for Packed<EmbedElem> {
    fn show(&self, _engine: &mut Engine, _styles: StyleChain) -> SourceResult<Content> {
        Ok(Content::empty())
    }
}
// The PDF exporter maps each variant to the `AssociationKind` of the same
// name when a PDF/A standard is active; an unset relationship is written as
// `AssociationKind::Unspecified`.
/// The relationship of an embedded file with the document.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Cast)]
pub enum EmbeddedFileRelationship {
/// The PDF document was created from the source file.
Source,
/// The file was used to derive a visual presentation in the PDF.
Data,
/// An alternative representation of the document.
Alternative,
/// Additional resources for the document.
Supplement,
}

View File

@ -0,0 +1,24 @@
//! PDF-specific functionality.
mod embed;
pub use self::embed::*;
use crate::foundations::{category, Category, Module, Scope};
/// PDF-specific functionality.
#[category]
pub static PDF: Category;
/// Hook up the `pdf` module.
///
/// Registers the module returned by [`module`] in the given global scope.
// The category is switched to `PDF` before the module is defined; presumably
// definitions inherit the scope's active category until the caller resets
// it — TODO confirm against `Scope::category`.
pub(super) fn define(global: &mut Scope) {
global.category(PDF);
global.define_module(module());
}
/// Build the `pdf` module with all PDF-specific definitions.
///
/// Currently this only contains the [`EmbedElem`] element.
pub fn module() -> Module {
    let mut definitions = Scope::deduplicating();
    definitions.define_elem::<EmbedElem>();
    Module::new("pdf", definitions)
}

View File

@ -12,7 +12,7 @@ use typst_syntax::Span;
use xmp_writer::{DateTime, LangId, RenditionClass, XmpWriter};
use crate::page::PdfPageLabel;
use crate::{hash_base64, outline, TextStrExt, Timezone, WithEverything};
use crate::{hash_base64, outline, TextStrExt, Timestamp, Timezone, WithEverything};
/// Write the document catalog.
pub fn write_catalog(
@ -86,23 +86,10 @@ pub fn write_catalog(
info.keywords(TextStr::trimmed(&joined));
xmp.pdf_keywords(&joined);
}
// (1) If the `document.date` is set to specific `datetime` or `none`, use it.
// (2) If the `document.date` is set to `auto` or not set, try to use the
// date from the options.
// (3) Otherwise, we don't write date metadata.
let (date, tz) = match (ctx.document.info.date, ctx.options.timestamp) {
(Smart::Custom(date), _) => (date, None),
(Smart::Auto, Some(timestamp)) => {
(Some(timestamp.datetime), Some(timestamp.timezone))
}
_ => (None, None),
};
if let Some(date) = date {
if let Some(pdf_date) = pdf_date(date, tz) {
info.creation_date(pdf_date);
info.modified_date(pdf_date);
}
let (date, tz) = document_date(ctx.document.info.date, ctx.options.timestamp);
if let Some(pdf_date) = date.and_then(|date| pdf_date(date, tz)) {
info.creation_date(pdf_date);
info.modified_date(pdf_date);
}
info.finish();
@ -154,7 +141,7 @@ pub fn write_catalog(
}
// Assert dominance.
if ctx.options.standards.pdfa {
if let Some((part, conformance)) = ctx.options.standards.pdfa_part {
let mut extension_schemas = xmp.extension_schemas();
extension_schemas
.xmp_media_management()
@ -162,8 +149,8 @@ pub fn write_catalog(
.describe_instance_id();
extension_schemas.pdf().properties().describe_all();
extension_schemas.finish();
xmp.pdfa_part(2);
xmp.pdfa_conformance("B");
xmp.pdfa_part(part);
xmp.pdfa_conformance(conformance);
}
let xmp_buf = xmp.finish(None);
@ -182,13 +169,35 @@ pub fn write_catalog(
catalog.viewer_preferences().direction(dir);
catalog.metadata(meta_ref);
// Write the named destination tree if there are any entries.
if !ctx.references.named_destinations.dests.is_empty() {
let has_dests = !ctx.references.named_destinations.dests.is_empty();
let has_embeddings = !ctx.references.embedded_files.is_empty();
// Write the `/Names` dictionary.
if has_dests || has_embeddings {
// Write the named destination tree if there are any entries.
let mut name_dict = catalog.names();
let mut dests_name_tree = name_dict.destinations();
let mut names = dests_name_tree.names();
for &(name, dest_ref, ..) in &ctx.references.named_destinations.dests {
names.insert(Str(name.resolve().as_bytes()), dest_ref);
if has_dests {
let mut dests_name_tree = name_dict.destinations();
let mut names = dests_name_tree.names();
for &(name, dest_ref, ..) in &ctx.references.named_destinations.dests {
names.insert(Str(name.resolve().as_bytes()), dest_ref);
}
}
if has_embeddings {
let mut embedded_files = name_dict.embedded_files();
let mut names = embedded_files.names();
for (name, file_ref) in &ctx.references.embedded_files {
names.insert(Str(name.as_bytes()), *file_ref);
}
}
}
if has_embeddings && ctx.options.standards.pdfa {
// PDF 2.0, but ISO 19005-3 (PDF/A-3) Annex E allows it for PDF/A-3.
let mut associated_files = catalog.insert(Name(b"AF")).array().typed();
for (_, file_ref) in ctx.references.embedded_files {
associated_files.item(file_ref).finish();
}
}
@ -289,8 +298,27 @@ pub(crate) fn write_page_labels(
result
}
/// Resolve the date (and optionally timezone) to write into the PDF.
///
/// - An explicit `document.date` (a specific `datetime` or `none`) always
///   wins and carries no timezone.
/// - If `document.date` is `auto` or unset, the timestamp from the export
///   options is used when one is available.
/// - In every other case no date metadata is produced.
pub fn document_date(
    document_date: Smart<Option<Datetime>>,
    timestamp: Option<Timestamp>,
) -> (Option<Datetime>, Option<Timezone>) {
    // An explicit setting takes precedence over the export timestamp.
    if let Smart::Custom(explicit) = document_date {
        return (explicit, None);
    }

    // Otherwise fall back to the export timestamp, if any.
    match timestamp {
        Some(stamp) => (Some(stamp.datetime), Some(stamp.timezone)),
        None => (None, None),
    }
}
/// Converts a datetime to a pdf-writer date.
fn pdf_date(datetime: Datetime, tz: Option<Timezone>) -> Option<pdf_writer::Date> {
pub fn pdf_date(datetime: Datetime, tz: Option<Timezone>) -> Option<pdf_writer::Date> {
let year = datetime.year().filter(|&y| y >= 0)? as u16;
let mut pdf_date = pdf_writer::Date::new(year);

View File

@ -0,0 +1,122 @@
use std::collections::BTreeMap;
use ecow::EcoString;
use pdf_writer::types::AssociationKind;
use pdf_writer::{Filter, Finish, Name, Ref, Str, TextStr};
use typst_library::diag::{bail, SourceResult};
use typst_library::foundations::{NativeElement, Packed, StyleChain};
use typst_library::pdf::{EmbedElem, EmbeddedFileRelationship};
use crate::catalog::{document_date, pdf_date};
use crate::{deflate, NameExt, PdfChunk, StrExt, WithGlobalRefs};
/// Collect every [`EmbedElem`] in the document and write its embedded file
/// stream together with its file specification into a fresh chunk.
///
/// The returned map associates each embedding's resolved path with the
/// reference of its file specification, so that the catalog can later list
/// them in its `/Names` dictionary.
///
/// Fails if embeddings are not permitted by the selected standards, if a
/// path exceeds the PDF/A string limit, if two embeddings share a path, or
/// if writing an individual file fails.
pub fn write_embedded_files(
    ctx: &WithGlobalRefs,
) -> SourceResult<(PdfChunk, BTreeMap<EcoString, Ref>)> {
    let mut chunk = PdfChunk::new();
    let mut refs_by_path = BTreeMap::default();

    let selector = EmbedElem::elem().select();
    let matches = ctx.document.introspector.query(&selector);
    let embeddings_allowed = ctx.options.standards.embedded_files;

    for content in &matches {
        if !embeddings_allowed {
            // PDF/A-2 requires embedded files to be PDF/A-1 or PDF/A-2,
            // which we don't currently check.
            bail!(
                content.span(),
                "file embeddings are not currently supported for PDF/A-2";
                hint: "PDF/A-3 supports arbitrary embedded files"
            );
        }

        let embed = content.to_packed::<EmbedElem>().unwrap();
        let name = &embed.resolved_path;
        if name.len() > Str::PDFA_LIMIT {
            bail!(embed.span(), "embedded file path is too long");
        }

        // The resolved path is the key in the name tree, so it has to be
        // unique across all embeddings.
        let file_spec_ref = embed_file(ctx, &mut chunk, embed)?;
        let previous = refs_by_path.insert(name.clone(), file_spec_ref);
        if previous.is_some() {
            bail!(
                content.span(),
                "duplicate embedded file for path `{}`", name;
                hint: "embedded file paths must be unique",
            );
        }
    }

    Ok((chunk, refs_by_path))
}
/// Write the embedded file stream and its file specification.
fn embed_file(
ctx: &WithGlobalRefs,
chunk: &mut PdfChunk,
embed: &Packed<EmbedElem>,
) -> SourceResult<Ref> {
let embedded_file_stream_ref = chunk.alloc.bump();
let file_spec_dict_ref = chunk.alloc.bump();
let data = embed.data().as_slice();
let compressed = deflate(data);
let mut embedded_file = chunk.embedded_file(embedded_file_stream_ref, &compressed);
embedded_file.filter(Filter::FlateDecode);
if let Some(mime_type) = embed.mime_type(StyleChain::default()) {
if mime_type.len() > Name::PDFA_LIMIT {
bail!(embed.span(), "embedded file MIME type is too long");
}
embedded_file.subtype(Name(mime_type.as_bytes()));
} else if ctx.options.standards.pdfa {
bail!(embed.span(), "embedded files must have a MIME type in PDF/A-3");
}
let mut params = embedded_file.params();
params.size(data.len() as i32);
let (date, tz) = document_date(ctx.document.info.date, ctx.options.timestamp);
if let Some(pdf_date) = date.and_then(|date| pdf_date(date, tz)) {
params.modification_date(pdf_date);
} else if ctx.options.standards.pdfa {
bail!(
embed.span(),
"the document must have a date when embedding files in PDF/A-3";
hint: "`set document(date: none)` must not be used in this case"
);
}
params.finish();
embedded_file.finish();
let mut file_spec = chunk.file_spec(file_spec_dict_ref);
file_spec.path(Str(embed.resolved_path.as_bytes()));
file_spec.unic_file(TextStr(&embed.resolved_path));
file_spec
.insert(Name(b"EF"))
.dict()
.pair(Name(b"F"), embedded_file_stream_ref)
.pair(Name(b"UF"), embedded_file_stream_ref);
if ctx.options.standards.pdfa {
// PDF 2.0, but ISO 19005-3 (PDF/A-3) Annex E allows it for PDF/A-3.
file_spec.association_kind(match embed.relationship(StyleChain::default()) {
Some(EmbeddedFileRelationship::Source) => AssociationKind::Source,
Some(EmbeddedFileRelationship::Data) => AssociationKind::Data,
Some(EmbeddedFileRelationship::Alternative) => AssociationKind::Alternative,
Some(EmbeddedFileRelationship::Supplement) => AssociationKind::Supplement,
None => AssociationKind::Unspecified,
});
}
if let Some(description) = embed.description(StyleChain::default()) {
if description.len() > Str::PDFA_LIMIT {
bail!(embed.span(), "embedded file description is too long");
}
file_spec.description(TextStr(description));
}
Ok(file_spec_dict_ref)
}

View File

@ -4,6 +4,7 @@ mod catalog;
mod color;
mod color_font;
mod content;
mod embed;
mod extg;
mod font;
mod gradient;
@ -14,12 +15,13 @@ mod page;
mod resources;
mod tiling;
use std::collections::HashMap;
use std::collections::{BTreeMap, HashMap};
use std::fmt::{self, Debug, Formatter};
use std::hash::Hash;
use std::ops::{Deref, DerefMut};
use base64::Engine;
use ecow::EcoString;
use pdf_writer::{Chunk, Name, Pdf, Ref, Str, TextStr};
use serde::{Deserialize, Serialize};
use typst_library::diag::{bail, SourceResult, StrResult};
@ -33,6 +35,7 @@ use typst_utils::Deferred;
use crate::catalog::write_catalog;
use crate::color::{alloc_color_functions_refs, ColorFunctionRefs};
use crate::color_font::{write_color_fonts, ColorFontSlice};
use crate::embed::write_embedded_files;
use crate::extg::{write_graphic_states, ExtGState};
use crate::font::write_fonts;
use crate::gradient::{write_gradients, PdfGradient};
@ -67,6 +70,7 @@ pub fn pdf(document: &PagedDocument, options: &PdfOptions) -> SourceResult<Vec<u
gradients: builder.run(write_gradients)?,
tilings: builder.run(write_tilings)?,
ext_gs: builder.run(write_graphic_states)?,
embedded_files: builder.run(write_embedded_files)?,
})
})?
.phase(|builder| builder.run(write_page_tree))?
@ -147,16 +151,34 @@ pub enum Timezone {
/// Encapsulates a list of compatible PDF standards.
#[derive(Clone)]
pub struct PdfStandards {
/// For now, we simplify to just PDF/A, since we only support PDF/A-2b. But
/// it can be more fine-grained in the future.
/// For now, we simplify to just PDF/A. But it can be more fine-grained in
/// the future.
pub(crate) pdfa: bool,
/// Whether the standard allows for embedding any kind of file into the PDF.
/// We disallow this for PDF/A-2, since it only allows embedding
/// PDF/A-1 and PDF/A-2 documents.
pub(crate) embedded_files: bool,
/// Part of the PDF/A standard.
pub(crate) pdfa_part: Option<(i32, &'static str)>,
}
impl PdfStandards {
/// Validates a list of PDF standards for compatibility and returns their
/// encapsulated representation.
pub fn new(list: &[PdfStandard]) -> StrResult<Self> {
Ok(Self { pdfa: list.contains(&PdfStandard::A_2b) })
let a2b = list.contains(&PdfStandard::A_2b);
let a3b = list.contains(&PdfStandard::A_3b);
if a2b && a3b {
bail!("PDF cannot conform to A-2B and A-3B at the same time")
}
let pdfa = a2b || a3b;
Ok(Self {
pdfa,
embedded_files: !a2b,
pdfa_part: pdfa.then_some((if a2b { 2 } else { 3 }, "B")),
})
}
}
@ -166,10 +188,9 @@ impl Debug for PdfStandards {
}
}
#[allow(clippy::derivable_impls)]
impl Default for PdfStandards {
fn default() -> Self {
Self { pdfa: false }
Self { pdfa: false, embedded_files: true, pdfa_part: None }
}
}
@ -186,6 +207,9 @@ pub enum PdfStandard {
/// PDF/A-2b.
#[serde(rename = "a-2b")]
A_2b,
/// PDF/A-3b.
#[serde(rename = "a-3b")]
A_3b,
}
/// A struct to build a PDF following a fixed succession of phases.
@ -316,6 +340,8 @@ struct References {
tilings: HashMap<PdfTiling, Ref>,
/// The IDs of written external graphics states.
ext_gs: HashMap<ExtGState, Ref>,
/// The names and references for embedded files.
embedded_files: BTreeMap<EcoString, Ref>,
}
/// At this point, the references have been assigned to all resources. The page
@ -481,6 +507,14 @@ impl<T: Eq + Hash, R: Renumber> Renumber for HashMap<T, R> {
}
}
/// Renumbering a map renumbers each of its values; the keys stay untouched.
impl<T: Ord, R: Renumber> Renumber for BTreeMap<T, R> {
    fn renumber(&mut self, offset: i32) {
        self.values_mut().for_each(|value| value.renumber(offset));
    }
}
impl<R: Renumber> Renumber for Option<R> {
fn renumber(&mut self, offset: i32) {
if let Some(r) = self {

View File

@ -25,6 +25,7 @@ use typst::layout::{Abs, Margin, PageElem, PagedDocument, LAYOUT};
use typst::loading::DATA_LOADING;
use typst::math::MATH;
use typst::model::MODEL;
use typst::pdf::PDF;
use typst::symbols::SYMBOLS;
use typst::text::{Font, FontBook, TEXT};
use typst::utils::LazyHash;
@ -163,6 +164,7 @@ fn reference_pages(resolver: &dyn Resolver) -> PageModel {
category_page(resolver, VISUALIZE),
category_page(resolver, INTROSPECTION),
category_page(resolver, DATA_LOADING),
category_page(resolver, PDF),
];
page
}

30
tests/suite/pdf/embed.typ Normal file
View File

@ -0,0 +1,30 @@
// Test file embeddings. The tests here so far are unsatisfactory because we
// have no PDF testing infrastructure. That should be improved in the future.
--- pdf-embed ---
#pdf.embed("/assets/text/hello.txt")
#pdf.embed(
"/assets/data/details.toml",
relationship: "supplement",
mime-type: "application/toml",
description: "Information about a secret project",
)
--- pdf-embed-invalid-relationship ---
#pdf.embed(
"/assets/text/hello.txt",
// Error: 17-23 expected "source", "data", "alternative", "supplement", or none
relationship: "test",
mime-type: "text/plain",
description: "A test file",
)
--- pdf-embed-decode ---
#pdf.embed.decode("hello.txt", read("/assets/text/hello.txt"))
#pdf.embed.decode(
"a_file_name.txt",
read("/assets/text/hello.txt"),
relationship: "supplement",
mime-type: "text/plain",
description: "A description",
)