Improve subsetting (#4373)

Co-authored-by: Laurenz <laurmaedje@gmail.com>
This commit is contained in:
Laurenz Stampfl 2024-06-16 11:38:33 +02:00 committed by GitHub
parent 34550220ae
commit feedfe80cb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 79 additions and 68 deletions

8
Cargo.lock generated
View File

@ -2255,15 +2255,13 @@ dependencies = [
[[package]] [[package]]
name = "subsetter" name = "subsetter"
version = "0.1.1" version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "git+https://github.com/typst/subsetter?rev=4e0058b#4e0058b4b9a0948a5f79894111948d95e59ba350"
checksum = "09eab8a83bff89ba2200bd4c59be45c7c787f988431b936099a5a266c957f2f9"
[[package]] [[package]]
name = "svg2pdf" name = "svg2pdf"
version = "0.11.0" version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "git+https://github.com/typst/svg2pdf?rev=39f8ad3#39f8ad3b35e14cfcabf3d5d916899f7ac78790f7"
checksum = "e31565956eb1dc398c0d9776ee1d1bac4e34759af63dcbe0520df32313a5b53b"
dependencies = [ dependencies = [
"fontdb", "fontdb",
"image 0.25.1", "image 0.25.1",

View File

@ -98,8 +98,8 @@ shell-escape = "0.1.5"
siphasher = "1" siphasher = "1"
smallvec = { version = "1.11.1", features = ["union", "const_generics", "const_new"] } smallvec = { version = "1.11.1", features = ["union", "const_generics", "const_new"] }
stacker = "0.1.15" stacker = "0.1.15"
subsetter = "0.1.1" subsetter = { git = "https://github.com/typst/subsetter", rev = "4e0058b" }
svg2pdf = "0.11.0" svg2pdf = { git = "https://github.com/typst/svg2pdf", rev = "39f8ad3" }
syn = { version = "2", features = ["full", "extra-traits"] } syn = { version = "2", features = ["full", "extra-traits"] }
syntect = { version = "5", default-features = false, features = ["parsing", "regex-fancy", "plist-load", "yaml-load"] } syntect = { version = "5", default-features = false, features = ["parsing", "regex-fancy", "plist-load", "yaml-load"] }
tar = "0.4" tar = "0.4"

View File

@ -115,7 +115,7 @@ pub fn write_color_fonts(
pdf_font.finish(); pdf_font.finish();
// Encode a CMAP to make it possible to search or copy glyphs. // Encode a CMAP to make it possible to search or copy glyphs.
let glyph_set = resources.glyph_sets.get(&font_slice.font).unwrap(); let glyph_set = resources.color_glyph_sets.get(&font_slice.font).unwrap();
let mut cmap = UnicodeCmap::new(CMAP_NAME, SYSTEM_INFO); let mut cmap = UnicodeCmap::new(CMAP_NAME, SYSTEM_INFO);
for (index, glyph) in subset.iter().enumerate() { for (index, glyph) in subset.iter().enumerate() {
let Some(text) = glyph_set.get(&glyph.gid) else { let Some(text) = glyph_set.get(&glyph.gid) else {

View File

@ -476,6 +476,12 @@ fn write_normal_text(ctx: &mut Builder, pos: Point, text: TextItemView) {
let mut adjustment = Em::zero(); let mut adjustment = Em::zero();
let mut encoded = vec![]; let mut encoded = vec![];
let glyph_remapper = ctx
.resources
.glyph_remappers
.entry(text.item.font.clone())
.or_default();
// Write the glyphs with kerning adjustments. // Write the glyphs with kerning adjustments.
for glyph in text.glyphs() { for glyph in text.glyphs() {
adjustment += glyph.x_offset; adjustment += glyph.x_offset;
@ -490,7 +496,26 @@ fn write_normal_text(ctx: &mut Builder, pos: Point, text: TextItemView) {
adjustment = Em::zero(); adjustment = Em::zero();
} }
let cid = crate::font::glyph_cid(&text.item.font, glyph.id); // In PDF, we use CIDs to index the glyphs in a font, not GIDs. What a
// CID actually refers to depends on the type of font we are embedding:
//
// - For TrueType fonts, the CIDs are defined by an external mapping.
// - For SID-keyed CFF fonts, the CID is the same as the GID in the font.
// - For CID-keyed CFF fonts, the CID refers to the CID in the font.
//
// (See the PDF specification for more details on this.)
//
// However, in our case:
// - We use the identity-mapping for TrueType fonts.
// - SID-keyed fonts will get converted into CID-keyed fonts by the
// subsetter.
// - CID-keyed fonts will be rewritten in a way so that the mapping
// between CID and GID is always the identity mapping, regardless of
// the mapping before.
//
// Because of this, we can always use the remapped GID as the CID,
// regardless of which type of font we are actually embedding.
let cid = glyph_remapper.remap(glyph.id);
encoded.push((cid >> 8) as u8); encoded.push((cid >> 8) as u8);
encoded.push((cid & 0xff) as u8); encoded.push((cid & 0xff) as u8);
@ -523,7 +548,11 @@ fn write_color_glyphs(ctx: &mut Builder, pos: Point, text: TextItemView) {
// displays regular glyphs and not color glyphs. // displays regular glyphs and not color glyphs.
ctx.state.font = None; ctx.state.font = None;
let glyph_set = ctx.resources.glyph_sets.entry(text.item.font.clone()).or_default(); let glyph_set = ctx
.resources
.color_glyph_sets
.entry(text.item.font.clone())
.or_default();
for glyph in text.glyphs() { for glyph in text.glyphs() {
// Retrieve the Type3 font reference and the glyph index in the font. // Retrieve the Type3 font reference and the glyph index in the font.

View File

@ -8,6 +8,7 @@ use pdf_writer::{
writers::FontDescriptor, writers::FontDescriptor,
Chunk, Filter, Finish, Name, Rect, Ref, Str, Chunk, Filter, Finish, Name, Rect, Ref, Str,
}; };
use subsetter::GlyphRemapper;
use ttf_parser::{name_id, GlyphId, Tag}; use ttf_parser::{name_id, GlyphId, Tag};
use typst::text::Font; use typst::text::Font;
use typst::utils::SliceExt; use typst::utils::SliceExt;
@ -43,6 +44,7 @@ pub fn write_fonts(context: &WithGlobalRefs) -> (PdfChunk, HashMap<Font, Ref>) {
out.insert(font.clone(), type0_ref); out.insert(font.clone(), type0_ref);
let glyph_set = resources.glyph_sets.get(font).unwrap(); let glyph_set = resources.glyph_sets.get(font).unwrap();
let glyph_remapper = resources.glyph_remappers.get(font).unwrap();
let ttf = font.ttf(); let ttf = font.ttf();
// Do we have a TrueType or CFF font? // Do we have a TrueType or CFF font?
@ -87,16 +89,15 @@ pub fn write_fonts(context: &WithGlobalRefs) -> (PdfChunk, HashMap<Font, Ref>) {
} }
// Extract the widths of all glyphs. // Extract the widths of all glyphs.
let mut widths = vec![]; // `remapped_gids` returns an iterator over the old GIDs in their new sorted
for gid in std::iter::once(0).chain(glyph_set.keys().copied()) { // order, so we can append the widths as is.
let width = ttf.glyph_hor_advance(GlyphId(gid)).unwrap_or(0); let widths = glyph_remapper
let units = font.to_em(width).to_font_units(); .remapped_gids()
let cid = glyph_cid(font, gid); .map(|gid| {
if usize::from(cid) >= widths.len() { let width = ttf.glyph_hor_advance(GlyphId(gid)).unwrap_or(0);
widths.resize(usize::from(cid) + 1, 0.0); font.to_em(width).to_font_units()
widths[usize::from(cid)] = units; })
} .collect::<Vec<_>>();
}
// Write all non-zero glyph widths. // Write all non-zero glyph widths.
let mut first = 0; let mut first = 0;
@ -115,19 +116,15 @@ pub fn write_fonts(context: &WithGlobalRefs) -> (PdfChunk, HashMap<Font, Ref>) {
// Write the /ToUnicode character map, which maps glyph ids back to // Write the /ToUnicode character map, which maps glyph ids back to
// unicode codepoints to enable copying out of the PDF. // unicode codepoints to enable copying out of the PDF.
let cmap = create_cmap(font, glyph_set); let cmap = create_cmap(glyph_set, glyph_remapper);
chunk.cmap(cmap_ref, &cmap.finish()); chunk.cmap(cmap_ref, &cmap.finish());
// Subset and write the font's bytes. let subset = subset_font(font, glyph_remapper);
let glyphs: Vec<_> = glyph_set.keys().copied().collect(); let mut stream = chunk.stream(data_ref, &subset);
let data = subset_font(font, &glyphs);
let mut stream = chunk.stream(data_ref, &data);
stream.filter(Filter::FlateDecode); stream.filter(Filter::FlateDecode);
if is_cff { if is_cff {
stream.pair(Name(b"Subtype"), Name(b"CIDFontType0C")); stream.pair(Name(b"Subtype"), Name(b"CIDFontType0C"));
} }
stream.finish(); stream.finish();
let mut font_descriptor = let mut font_descriptor =
@ -194,15 +191,18 @@ pub fn write_font_descriptor<'a>(
/// Subset a font to the given glyphs. /// Subset a font to the given glyphs.
/// ///
/// - For a font with TrueType outlines, this returns the whole OpenType font. /// - For a font with TrueType outlines, this produces the whole OpenType font.
/// - For a font with CFF outlines, this returns just the CFF font program. /// - For a font with CFF outlines, this produces just the CFF font program.
///
/// In both cases, this returns the already compressed data.
#[comemo::memoize] #[comemo::memoize]
#[typst_macros::time(name = "subset font")] #[typst_macros::time(name = "subset font")]
fn subset_font(font: &Font, glyphs: &[u16]) -> Arc<Vec<u8>> { fn subset_font(font: &Font, glyph_remapper: &GlyphRemapper) -> Arc<Vec<u8>> {
let data = font.data(); let data = font.data();
let profile = subsetter::Profile::pdf(glyphs); // TODO: Fail export instead of unwrapping once export diagnostics exist.
let subsetted = subsetter::subset(data, font.index(), profile); let subsetted = subsetter::subset(data, font.index(), glyph_remapper).unwrap();
let mut data = subsetted.as_deref().unwrap_or(data);
let mut data = subsetted.as_ref();
// Extract the standalone CFF font program if applicable. // Extract the standalone CFF font program if applicable.
let raw = ttf_parser::RawFace::parse(data, 0).unwrap(); let raw = ttf_parser::RawFace::parse(data, 0).unwrap();
@ -259,46 +259,19 @@ pub fn improve_glyph_sets(glyph_sets: &mut HashMap<Font, BTreeMap<u16, EcoString
} }
/// Create a /ToUnicode CMap. /// Create a /ToUnicode CMap.
fn create_cmap(font: &Font, glyph_set: &BTreeMap<u16, EcoString>) -> UnicodeCmap { fn create_cmap(
glyph_set: &BTreeMap<u16, EcoString>,
glyph_remapper: &GlyphRemapper,
) -> UnicodeCmap {
// Produce a reverse mapping from glyphs' CIDs to unicode strings. // Produce a reverse mapping from glyphs' CIDs to unicode strings.
let mut cmap = UnicodeCmap::new(CMAP_NAME, SYSTEM_INFO); let mut cmap = UnicodeCmap::new(CMAP_NAME, SYSTEM_INFO);
for (&g, text) in glyph_set.iter() { for (&g, text) in glyph_set.iter() {
// See comment in `write_normal_text` for why we can choose the CID this way.
let cid = glyph_remapper.get(g).unwrap();
if !text.is_empty() { if !text.is_empty() {
cmap.pair_with_multiple(glyph_cid(font, g), text.chars()); cmap.pair_with_multiple(cid, text.chars());
} }
} }
cmap cmap
} }
/// Get the CID for a glyph id.
///
/// When writing text into a PDF, we have to specify CIDs (character ids) not
/// GIDs (glyph IDs).
///
/// Most of the time, the mapping between these two is an identity mapping. In
/// particular, for TrueType fonts, the mapping is an identity mapping because
/// of this line above:
/// ```ignore
/// cid.cid_to_gid_map_predefined(Name(b"Identity"));
/// ```
///
/// However, CID-keyed CFF fonts may have a non-identity mapping defined in
/// their charset. For those, we must map the glyph IDs in a `TextItem` to CIDs.
/// The font defines the map through its charset. The charset usually maps
/// glyphs to SIDs (string ids) specifying the glyph's name. Not for CID-keyed
/// fonts though! For these, the SIDs are CIDs in disguise. Relevant quote from
/// the CFF spec:
///
/// > The charset data, although in the same format as non-CIDFonts, will
/// > represent CIDs rather than SIDs, [...]
///
/// This function performs the mapping from glyph ID to CID. It also works for
/// non CID-keyed fonts. Then, it will simply return the glyph ID.
pub(super) fn glyph_cid(font: &Font, glyph_id: u16) -> u16 {
font.ttf()
.tables()
.cff
.and_then(|cff| cff.glyph_cid(ttf_parser::GlyphId(glyph_id)))
.unwrap_or(glyph_id)
}

View File

@ -183,7 +183,8 @@ fn encode_alpha(raster: &RasterImage) -> (Vec<u8>, Filter) {
/// Encode an SVG into a chunk of PDF objects. /// Encode an SVG into a chunk of PDF objects.
#[typst_macros::time(name = "encode svg")] #[typst_macros::time(name = "encode svg")]
fn encode_svg(svg: &SvgImage) -> (Chunk, Ref) { fn encode_svg(svg: &SvgImage) -> (Chunk, Ref) {
svg2pdf::to_chunk(svg.tree(), svg2pdf::ConversionOptions::default()) // TODO: Don't unwrap once we have export diagnostics.
svg2pdf::to_chunk(svg.tree(), svg2pdf::ConversionOptions::default()).unwrap()
} }
/// A pre-encoded image. /// A pre-encoded image.

View File

@ -53,6 +53,7 @@ pub fn traverse_pages(
} }
improve_glyph_sets(&mut resources.glyph_sets); improve_glyph_sets(&mut resources.glyph_sets);
improve_glyph_sets(&mut resources.color_glyph_sets);
(PdfChunk::new(), (pages, resources)) (PdfChunk::new(), (pages, resources))
} }

View File

@ -11,6 +11,7 @@ use std::hash::Hash;
use ecow::{eco_format, EcoString}; use ecow::{eco_format, EcoString};
use pdf_writer::{Dict, Finish, Name, Ref}; use pdf_writer::{Dict, Finish, Name, Ref};
use subsetter::GlyphRemapper;
use typst::text::Lang; use typst::text::Lang;
use typst::{text::Font, utils::Deferred, visualize::Image}; use typst::{text::Font, utils::Deferred, visualize::Image};
@ -82,6 +83,10 @@ pub struct Resources<R = Ref> {
/// PDF's /ToUnicode map for glyphs that don't have an entry in the font's /// PDF's /ToUnicode map for glyphs that don't have an entry in the font's
/// cmap. This is important for copy-paste and searching. /// cmap. This is important for copy-paste and searching.
pub glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>, pub glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
/// Same as `glyph_sets`, but for color fonts.
pub color_glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
/// Stores the glyph remapper for each font for the subsetter.
pub glyph_remappers: HashMap<Font, GlyphRemapper>,
} }
impl<R: Renumber> Renumber for Resources<R> { impl<R: Renumber> Renumber for Resources<R> {
@ -112,6 +117,8 @@ impl Default for Resources<()> {
color_fonts: None, color_fonts: None,
languages: BTreeMap::new(), languages: BTreeMap::new(),
glyph_sets: HashMap::new(), glyph_sets: HashMap::new(),
color_glyph_sets: HashMap::new(),
glyph_remappers: HashMap::new(),
} }
} }
} }
@ -138,6 +145,8 @@ impl Resources<()> {
.map(|(c, r)| Box::new(c.with_refs(r))), .map(|(c, r)| Box::new(c.with_refs(r))),
languages: self.languages, languages: self.languages,
glyph_sets: self.glyph_sets, glyph_sets: self.glyph_sets,
color_glyph_sets: self.color_glyph_sets,
glyph_remappers: self.glyph_remappers,
} }
} }
} }