Use texts of the first occurrences for /ToUnicode CMap (#4585)

2025-07-16 00:52:54 +08:00 · 2024-07-20 22:13:06 +08:00 · 2024-07-20 22:13:06 +08:00 · 9b001e2112
commit 9b001e2112
parent 46ef8e1dfa
6 changed files with 11 additions and 45 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -2790,7 +2790,6 @@ dependencies = [
 "typst-assets",
 "typst-macros",
 "typst-timing",
- "unicode-properties",
 "unscanny",
 "xmp-writer",
 ]
--- a/Cargo.toml
+++ b/Cargo.toml
@ -114,7 +114,6 @@ typed-arena = "2"
 unicode-bidi = "0.3.13"
 unicode-ident = "1.0"
 unicode-math-class = "0.1"
-unicode-properties = "0.1"
 unicode-script = "0.5"
 unicode-segmentation = "1"
 unscanny = "0.1"
--- a/crates/typst-pdf/Cargo.toml
+++ b/crates/typst-pdf/Cargo.toml
@ -29,7 +29,6 @@ pdf-writer = { workspace = true }
 subsetter = { workspace = true }
 svg2pdf = { workspace = true }
 ttf-parser = { workspace = true }
-unicode-properties = { workspace = true }
 unscanny = { workspace = true }
 xmp-writer = { workspace = true }

--- a/crates/typst-pdf/src/font.rs
+++ b/crates/typst-pdf/src/font.rs
@ -12,7 +12,6 @@ use subsetter::GlyphRemapper;
 use ttf_parser::{name_id, GlyphId, Tag};
 use typst::text::Font;
 use typst::utils::SliceExt;
-use unicode_properties::{GeneralCategory, UnicodeGeneralCategory};

 use crate::{deflate, EmExt, PdfChunk, WithGlobalRefs};

@ -226,38 +225,6 @@ pub(crate) fn subset_tag<T: Hash>(glyphs: &T) -> EcoString {
    std::str::from_utf8(&letter).unwrap().into()
 }

-/// For glyphs that have codepoints mapping to them in the font's cmap table, we
-/// prefer them over pre-existing text mappings from the document. Only things
-/// that don't have a corresponding codepoint (or only a private-use one) like
-/// the "Th" in Linux Libertine get the text of their first occurrences in the
-/// document instead.
-///
-/// This function replaces as much copepoints from the document with ones from
-/// the cmap table as possible.
-pub fn improve_glyph_sets(glyph_sets: &mut HashMap<Font, BTreeMap<u16, EcoString>>) {
-    for (font, glyph_set) in glyph_sets {
-        let ttf = font.ttf();
-
-        for subtable in ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) {
-            if !subtable.is_unicode() {
-                continue;
-            }
-
-            subtable.codepoints(|n| {
-                let Some(c) = std::char::from_u32(n) else { return };
-                if c.general_category() == GeneralCategory::PrivateUse {
-                    return;
-                }
-
-                let Some(GlyphId(g)) = ttf.glyph_index(c) else { return };
-                if glyph_set.contains_key(&g) {
-                    glyph_set.insert(g, c.into());
-                }
-            });
-        }
-    }
-}
-
 /// Create a compressed `/ToUnicode` CMap.
 #[comemo::memoize]
 #[typst_macros::time(name = "create cmap")]
--- a/crates/typst-pdf/src/page.rs
+++ b/crates/typst-pdf/src/page.rs
@ -12,8 +12,8 @@ use typst::layout::{Abs, Page};
 use typst::model::{Destination, Numbering};
 use typst::text::Case;

+use crate::Resources;
 use crate::{content, AbsExt, PdfChunk, WithDocument, WithRefs, WithResources};
-use crate::{font::improve_glyph_sets, Resources};

 /// Construct page objects.
 #[typst_macros::time(name = "construct pages")]
@ -52,9 +52,6 @@ pub fn traverse_pages(
        }
    }

-    improve_glyph_sets(&mut resources.glyph_sets);
-    improve_glyph_sets(&mut resources.color_glyph_sets);
-
    (PdfChunk::new(), (pages, resources))
 }

--- a/crates/typst-pdf/src/resources.rs
+++ b/crates/typst-pdf/src/resources.rs
@ -77,11 +77,16 @@ pub struct Resources<R = Ref> {
    pub languages: BTreeMap<Lang, usize>,

    /// For each font a mapping from used glyphs to their text representation.
-    /// May contain multiple chars in case of ligatures or similar things. The
-    /// same glyph can have a different text representation within one document,
-    /// then we just save the first one. The resulting strings are used for the
-    /// PDF's /ToUnicode map for glyphs that don't have an entry in the font's
-    /// cmap. This is important for copy-paste and searching.
+    /// This is used for the PDF's /ToUnicode map, and important for copy-paste
+    /// and searching.
+    ///
+    /// Note that the text representation may contain multiple chars in case of
+    /// ligatures or similar things, and the glyph may have no entry in the font's
+    /// cmap (or only a private-use codepoint), like the “Th” in Linux Libertine.
+    ///
+    /// A glyph may have multiple entries in the font's cmap, and the same glyph
+    /// can even have different text representations within one document. But
+    /// /ToUnicode allows only one per glyph, so we keep the first occurrence's text.
    pub glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
    /// Same as `glyph_sets`, but for color fonts.
    pub color_glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,