Use texts of the first occurrences for /ToUnicode CMap (#4585)

This commit is contained in:
Y.D.X. 2024-07-20 22:13:06 +08:00 committed by GitHub
parent 46ef8e1dfa
commit 9b001e2112
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 11 additions and 45 deletions

1
Cargo.lock generated
View File

@ -2790,7 +2790,6 @@ dependencies = [
"typst-assets",
"typst-macros",
"typst-timing",
"unicode-properties",
"unscanny",
"xmp-writer",
]

View File

@ -114,7 +114,6 @@ typed-arena = "2"
unicode-bidi = "0.3.13"
unicode-ident = "1.0"
unicode-math-class = "0.1"
unicode-properties = "0.1"
unicode-script = "0.5"
unicode-segmentation = "1"
unscanny = "0.1"

View File

@ -29,7 +29,6 @@ pdf-writer = { workspace = true }
subsetter = { workspace = true }
svg2pdf = { workspace = true }
ttf-parser = { workspace = true }
unicode-properties = { workspace = true }
unscanny = { workspace = true }
xmp-writer = { workspace = true }

View File

@ -12,7 +12,6 @@ use subsetter::GlyphRemapper;
use ttf_parser::{name_id, GlyphId, Tag};
use typst::text::Font;
use typst::utils::SliceExt;
use unicode_properties::{GeneralCategory, UnicodeGeneralCategory};
use crate::{deflate, EmExt, PdfChunk, WithGlobalRefs};
@ -226,38 +225,6 @@ pub(crate) fn subset_tag<T: Hash>(glyphs: &T) -> EcoString {
std::str::from_utf8(&letter).unwrap().into()
}
/// Overwrite document-derived glyph texts with codepoints from the font's cmap.
///
/// When a used glyph is reachable from a codepoint in one of the font's
/// Unicode cmap subtables, that codepoint is preferred over the text recorded
/// from the document. Glyphs without such a codepoint (or with only a
/// private-use one), like the "Th" in Linux Libertine, keep the text of their
/// first occurrence in the document.
///
/// Only glyphs that already have an entry in a glyph set are touched; no new
/// entries are added. If several codepoints resolve to the same glyph, the one
/// visited last is the one that remains.
pub fn improve_glyph_sets(glyph_sets: &mut HashMap<Font, BTreeMap<u16, EcoString>>) {
    for (font, glyph_set) in glyph_sets {
        let ttf = font.ttf();

        // Non-Unicode subtables are skipped entirely.
        let unicode_subtables = ttf
            .tables()
            .cmap
            .into_iter()
            .flat_map(|table| table.subtables)
            .filter(|subtable| subtable.is_unicode());

        for subtable in unicode_subtables {
            subtable.codepoints(|codepoint| {
                // Ignore values that are not valid Unicode scalar values.
                let Some(ch) = std::char::from_u32(codepoint) else { return };

                // Private-use codepoints are never used as replacement text.
                if ch.general_category() == GeneralCategory::PrivateUse {
                    return;
                }

                let Some(GlyphId(glyph)) = ttf.glyph_index(ch) else { return };

                // Replace the text only for glyphs the document actually uses.
                if let Some(text) = glyph_set.get_mut(&glyph) {
                    *text = ch.into();
                }
            });
        }
    }
}
/// Create a compressed `/ToUnicode` CMap.
#[comemo::memoize]
#[typst_macros::time(name = "create cmap")]

View File

@ -12,8 +12,8 @@ use typst::layout::{Abs, Page};
use typst::model::{Destination, Numbering};
use typst::text::Case;
use crate::Resources;
use crate::{content, AbsExt, PdfChunk, WithDocument, WithRefs, WithResources};
use crate::{font::improve_glyph_sets, Resources};
/// Construct page objects.
#[typst_macros::time(name = "construct pages")]
@ -52,9 +52,6 @@ pub fn traverse_pages(
}
}
improve_glyph_sets(&mut resources.glyph_sets);
improve_glyph_sets(&mut resources.color_glyph_sets);
(PdfChunk::new(), (pages, resources))
}

View File

@ -77,11 +77,16 @@ pub struct Resources<R = Ref> {
pub languages: BTreeMap<Lang, usize>,
/// For each font a mapping from used glyphs to their text representation.
/// May contain multiple chars in case of ligatures or similar things. The
/// same glyph can have a different text representation within one document,
/// then we just save the first one. The resulting strings are used for the
/// PDF's /ToUnicode map for glyphs that don't have an entry in the font's
/// cmap. This is important for copy-paste and searching.
/// This is used for the PDF's /ToUnicode map, and important for copy-paste
/// and searching.
///
/// Note that the text representation may contain multiple chars in case of
/// ligatures or similar things, and the glyph may have no entry in the font's
/// cmap (or only a private-use codepoint), like the “Th” in Linux Libertine.
///
/// A glyph may have multiple entries in the font's cmap, and even the same
/// glyph can have a different text representation within one document.
/// But /ToUnicode does not support that, so we just save the first occurrence.
pub glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
/// Same as `glyph_sets`, but for color fonts.
pub color_glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,