mirror of
https://github.com/typst/typst
synced 2025-05-14 04:56:26 +08:00
Use texts of the first occurrences for /ToUnicode
CMap (#4585)
This commit is contained in:
parent
46ef8e1dfa
commit
9b001e2112
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -2790,7 +2790,6 @@ dependencies = [
|
||||
"typst-assets",
|
||||
"typst-macros",
|
||||
"typst-timing",
|
||||
"unicode-properties",
|
||||
"unscanny",
|
||||
"xmp-writer",
|
||||
]
|
||||
|
@ -114,7 +114,6 @@ typed-arena = "2"
|
||||
unicode-bidi = "0.3.13"
|
||||
unicode-ident = "1.0"
|
||||
unicode-math-class = "0.1"
|
||||
unicode-properties = "0.1"
|
||||
unicode-script = "0.5"
|
||||
unicode-segmentation = "1"
|
||||
unscanny = "0.1"
|
||||
|
@ -29,7 +29,6 @@ pdf-writer = { workspace = true }
|
||||
subsetter = { workspace = true }
|
||||
svg2pdf = { workspace = true }
|
||||
ttf-parser = { workspace = true }
|
||||
unicode-properties = { workspace = true }
|
||||
unscanny = { workspace = true }
|
||||
xmp-writer = { workspace = true }
|
||||
|
||||
|
@ -12,7 +12,6 @@ use subsetter::GlyphRemapper;
|
||||
use ttf_parser::{name_id, GlyphId, Tag};
|
||||
use typst::text::Font;
|
||||
use typst::utils::SliceExt;
|
||||
use unicode_properties::{GeneralCategory, UnicodeGeneralCategory};
|
||||
|
||||
use crate::{deflate, EmExt, PdfChunk, WithGlobalRefs};
|
||||
|
||||
@ -226,38 +225,6 @@ pub(crate) fn subset_tag<T: Hash>(glyphs: &T) -> EcoString {
|
||||
std::str::from_utf8(&letter).unwrap().into()
|
||||
}
|
||||
|
||||
/// For glyphs that have codepoints mapping to them in the font's cmap table, we
|
||||
/// prefer them over pre-existing text mappings from the document. Only things
|
||||
/// that don't have a corresponding codepoint (or only a private-use one) like
|
||||
/// the "Th" in Linux Libertine get the text of their first occurrences in the
|
||||
/// document instead.
|
||||
///
|
||||
/// This function replaces as many codepoints from the document with ones from
|
||||
/// the cmap table as possible.
///
/// Entries are only overwritten, never added: a glyph that is not already a
/// key of its font's map is left alone. If several non-private-use codepoints
/// resolve to the same glyph, the one visited last is the one that remains.
|
||||
pub fn improve_glyph_sets(glyph_sets: &mut HashMap<Font, BTreeMap<u16, EcoString>>) {
|
||||
for (font, glyph_set) in glyph_sets {
|
||||
let ttf = font.ttf();
|
||||
|
||||
for subtable in ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) {
|
||||
// Skip subtables that are not flagged as Unicode.
if !subtable.is_unicode() {
|
||||
continue;
|
||||
}
|
||||
|
||||
subtable.codepoints(|n| {
|
||||
// Ignore values that are not valid Unicode scalar values.
let Some(c) = std::char::from_u32(n) else { return };
|
||||
// Private-use codepoints never replace the document's text.
if c.general_category() == GeneralCategory::PrivateUse {
|
||||
return;
|
||||
}
|
||||
|
||||
// NOTE(review): the lookup goes through `ttf.glyph_index`, not through
// `subtable` itself -- confirm both agree for the codepoint.
let Some(GlyphId(g)) = ttf.glyph_index(c) else { return };
|
||||
// Only overwrite glyphs the document already recorded; never add keys.
if glyph_set.contains_key(&g) {
|
||||
glyph_set.insert(g, c.into());
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a compressed `/ToUnicode` CMap.
|
||||
#[comemo::memoize]
|
||||
#[typst_macros::time(name = "create cmap")]
|
||||
|
@ -12,8 +12,8 @@ use typst::layout::{Abs, Page};
|
||||
use typst::model::{Destination, Numbering};
|
||||
use typst::text::Case;
|
||||
|
||||
use crate::Resources;
|
||||
use crate::{content, AbsExt, PdfChunk, WithDocument, WithRefs, WithResources};
|
||||
use crate::{font::improve_glyph_sets, Resources};
|
||||
|
||||
/// Construct page objects.
|
||||
#[typst_macros::time(name = "construct pages")]
|
||||
@ -52,9 +52,6 @@ pub fn traverse_pages(
|
||||
}
|
||||
}
|
||||
|
||||
improve_glyph_sets(&mut resources.glyph_sets);
|
||||
improve_glyph_sets(&mut resources.color_glyph_sets);
|
||||
|
||||
(PdfChunk::new(), (pages, resources))
|
||||
}
|
||||
|
||||
|
@ -77,11 +77,16 @@ pub struct Resources<R = Ref> {
|
||||
pub languages: BTreeMap<Lang, usize>,
|
||||
|
||||
/// For each font a mapping from used glyphs to their text representation.
|
||||
/// May contain multiple chars in case of ligatures or similar things. The
|
||||
/// same glyph can have a different text representation within one document,
|
||||
/// then we just save the first one. The resulting strings are used for the
|
||||
/// PDF's /ToUnicode map for glyphs that don't have an entry in the font's
|
||||
/// cmap. This is important for copy-paste and searching.
|
||||
/// This is used for the PDF's /ToUnicode map, and important for copy-paste
|
||||
/// and searching.
|
||||
///
|
||||
/// Note that the text representation may contain multiple chars in case of
|
||||
/// ligatures or similar things, and it may have no entry in the font's cmap
|
||||
/// (or only a private-use codepoint), like the “Th” in Linux Libertine.
|
||||
///
|
||||
/// A glyph may have multiple entries in the font's cmap, and even the same
|
||||
/// glyph can have a different text representation within one document.
|
||||
/// But /ToUnicode does not support that, so we just save the first occurrence.
|
||||
pub glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
|
||||
/// Same as `glyph_sets`, but for color fonts.
|
||||
pub color_glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
|
||||
|
Loading…
x
Reference in New Issue
Block a user