From 9b001e21121ab7b5645aa36f0cdcb6ac57e03a2b Mon Sep 17 00:00:00 2001 From: "Y.D.X." <73375426+YDX-2147483647@users.noreply.github.com> Date: Sat, 20 Jul 2024 22:13:06 +0800 Subject: [PATCH] Use texts of the first occurrences for `/ToUnicode` CMap (#4585) --- Cargo.lock | 1 - Cargo.toml | 1 - crates/typst-pdf/Cargo.toml | 1 - crates/typst-pdf/src/font.rs | 33 ------------------------------- crates/typst-pdf/src/page.rs | 5 +---- crates/typst-pdf/src/resources.rs | 15 +++++++++----- 6 files changed, 11 insertions(+), 45 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a92c0d228..f238f4f56 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2790,7 +2790,6 @@ dependencies = [ "typst-assets", "typst-macros", "typst-timing", - "unicode-properties", "unscanny", "xmp-writer", ] diff --git a/Cargo.toml b/Cargo.toml index e26f058ea..098900620 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -114,7 +114,6 @@ typed-arena = "2" unicode-bidi = "0.3.13" unicode-ident = "1.0" unicode-math-class = "0.1" -unicode-properties = "0.1" unicode-script = "0.5" unicode-segmentation = "1" unscanny = "0.1" diff --git a/crates/typst-pdf/Cargo.toml b/crates/typst-pdf/Cargo.toml index d2dcd5f5c..a3a693f38 100644 --- a/crates/typst-pdf/Cargo.toml +++ b/crates/typst-pdf/Cargo.toml @@ -29,7 +29,6 @@ pdf-writer = { workspace = true } subsetter = { workspace = true } svg2pdf = { workspace = true } ttf-parser = { workspace = true } -unicode-properties = { workspace = true } unscanny = { workspace = true } xmp-writer = { workspace = true } diff --git a/crates/typst-pdf/src/font.rs b/crates/typst-pdf/src/font.rs index fd719799d..c88c2bfde 100644 --- a/crates/typst-pdf/src/font.rs +++ b/crates/typst-pdf/src/font.rs @@ -12,7 +12,6 @@ use subsetter::GlyphRemapper; use ttf_parser::{name_id, GlyphId, Tag}; use typst::text::Font; use typst::utils::SliceExt; -use unicode_properties::{GeneralCategory, UnicodeGeneralCategory}; use crate::{deflate, EmExt, PdfChunk, WithGlobalRefs}; @@ -226,38 +225,6 @@ pub(crate) fn subset_tag(glyphs: &T) -> EcoString { std::str::from_utf8(&letter).unwrap().into() } -/// For glyphs that have codepoints mapping to them in the font's cmap table, we -/// prefer them over pre-existing text mappings from the document. Only things -/// that don't have a corresponding codepoint (or only a private-use one) like -/// the "Th" in Linux Libertine get the text of their first occurrences in the -/// document instead. -/// -/// This function replaces as much copepoints from the document with ones from -/// the cmap table as possible. -pub fn improve_glyph_sets(glyph_sets: &mut HashMap>) { - for (font, glyph_set) in glyph_sets { - let ttf = font.ttf(); - - for subtable in ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) { - if !subtable.is_unicode() { - continue; - } - - subtable.codepoints(|n| { - let Some(c) = std::char::from_u32(n) else { return }; - if c.general_category() == GeneralCategory::PrivateUse { - return; - } - - let Some(GlyphId(g)) = ttf.glyph_index(c) else { return }; - if glyph_set.contains_key(&g) { - glyph_set.insert(g, c.into()); - } - }); - } - } -} - /// Create a compressed `/ToUnicode` CMap. #[comemo::memoize] #[typst_macros::time(name = "create cmap")] diff --git a/crates/typst-pdf/src/page.rs b/crates/typst-pdf/src/page.rs index b07490cc0..1001d8992 100644 --- a/crates/typst-pdf/src/page.rs +++ b/crates/typst-pdf/src/page.rs @@ -12,8 +12,8 @@ use typst::layout::{Abs, Page}; use typst::model::{Destination, Numbering}; use typst::text::Case; +use crate::Resources; use crate::{content, AbsExt, PdfChunk, WithDocument, WithRefs, WithResources}; -use crate::{font::improve_glyph_sets, Resources}; /// Construct page objects. #[typst_macros::time(name = "construct pages")] @@ -52,9 +52,6 @@ pub fn traverse_pages( } } - improve_glyph_sets(&mut resources.glyph_sets); - improve_glyph_sets(&mut resources.color_glyph_sets); - (PdfChunk::new(), (pages, resources)) } diff --git a/crates/typst-pdf/src/resources.rs b/crates/typst-pdf/src/resources.rs index a2cf56878..32b6612ff 100644 --- a/crates/typst-pdf/src/resources.rs +++ b/crates/typst-pdf/src/resources.rs @@ -77,11 +77,16 @@ pub struct Resources { pub languages: BTreeMap, /// For each font a mapping from used glyphs to their text representation. - /// May contain multiple chars in case of ligatures or similar things. The - /// same glyph can have a different text representation within one document, - /// then we just save the first one. The resulting strings are used for the - /// PDF's /ToUnicode map for glyphs that don't have an entry in the font's - /// cmap. This is important for copy-paste and searching. + /// This is used for the PDF's /ToUnicode map, and important for copy-paste + /// and searching. + /// + /// Note that the text representation may contain multiple chars in case of + /// ligatures or similar things, and it may have no entry in the font's cmap + /// (or only a private-use codepoint), like the “Th” in Linux Libertine. + /// + /// A glyph may have multiple entries in the font's cmap, and even the same + /// glyph can have a different text representation within one document. + /// But /ToUnicode does not support that, so we just save the first occurrence. pub glyph_sets: HashMap>, /// Same as `glyph_sets`, but for color fonts. pub color_glyph_sets: HashMap>,