mirror of
https://github.com/typst/typst
synced 2025-05-14 04:56:26 +08:00
Use texts of the first occurrences for /ToUnicode
CMap (#4585)
This commit is contained in:
parent
46ef8e1dfa
commit
9b001e2112
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -2790,7 +2790,6 @@ dependencies = [
|
|||||||
"typst-assets",
|
"typst-assets",
|
||||||
"typst-macros",
|
"typst-macros",
|
||||||
"typst-timing",
|
"typst-timing",
|
||||||
"unicode-properties",
|
|
||||||
"unscanny",
|
"unscanny",
|
||||||
"xmp-writer",
|
"xmp-writer",
|
||||||
]
|
]
|
||||||
|
@ -114,7 +114,6 @@ typed-arena = "2"
|
|||||||
unicode-bidi = "0.3.13"
|
unicode-bidi = "0.3.13"
|
||||||
unicode-ident = "1.0"
|
unicode-ident = "1.0"
|
||||||
unicode-math-class = "0.1"
|
unicode-math-class = "0.1"
|
||||||
unicode-properties = "0.1"
|
|
||||||
unicode-script = "0.5"
|
unicode-script = "0.5"
|
||||||
unicode-segmentation = "1"
|
unicode-segmentation = "1"
|
||||||
unscanny = "0.1"
|
unscanny = "0.1"
|
||||||
|
@ -29,7 +29,6 @@ pdf-writer = { workspace = true }
|
|||||||
subsetter = { workspace = true }
|
subsetter = { workspace = true }
|
||||||
svg2pdf = { workspace = true }
|
svg2pdf = { workspace = true }
|
||||||
ttf-parser = { workspace = true }
|
ttf-parser = { workspace = true }
|
||||||
unicode-properties = { workspace = true }
|
|
||||||
unscanny = { workspace = true }
|
unscanny = { workspace = true }
|
||||||
xmp-writer = { workspace = true }
|
xmp-writer = { workspace = true }
|
||||||
|
|
||||||
|
@ -12,7 +12,6 @@ use subsetter::GlyphRemapper;
|
|||||||
use ttf_parser::{name_id, GlyphId, Tag};
|
use ttf_parser::{name_id, GlyphId, Tag};
|
||||||
use typst::text::Font;
|
use typst::text::Font;
|
||||||
use typst::utils::SliceExt;
|
use typst::utils::SliceExt;
|
||||||
use unicode_properties::{GeneralCategory, UnicodeGeneralCategory};
|
|
||||||
|
|
||||||
use crate::{deflate, EmExt, PdfChunk, WithGlobalRefs};
|
use crate::{deflate, EmExt, PdfChunk, WithGlobalRefs};
|
||||||
|
|
||||||
@ -226,38 +225,6 @@ pub(crate) fn subset_tag<T: Hash>(glyphs: &T) -> EcoString {
|
|||||||
std::str::from_utf8(&letter).unwrap().into()
|
std::str::from_utf8(&letter).unwrap().into()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// For glyphs that have codepoints mapping to them in the font's cmap table, we
|
|
||||||
/// prefer them over pre-existing text mappings from the document. Only things
|
|
||||||
/// that don't have a corresponding codepoint (or only a private-use one) like
|
|
||||||
/// the "Th" in Linux Libertine get the text of their first occurrences in the
|
|
||||||
/// document instead.
|
|
||||||
///
|
|
||||||
/// This function replaces as many codepoints from the document with ones from
|
|
||||||
/// the cmap table as possible.
|
|
||||||
pub fn improve_glyph_sets(glyph_sets: &mut HashMap<Font, BTreeMap<u16, EcoString>>) {
|
|
||||||
for (font, glyph_set) in glyph_sets {
|
|
||||||
let ttf = font.ttf();
|
|
||||||
|
|
||||||
for subtable in ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) {
|
|
||||||
if !subtable.is_unicode() {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
subtable.codepoints(|n| {
|
|
||||||
let Some(c) = std::char::from_u32(n) else { return };
|
|
||||||
if c.general_category() == GeneralCategory::PrivateUse {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
let Some(GlyphId(g)) = ttf.glyph_index(c) else { return };
|
|
||||||
if glyph_set.contains_key(&g) {
|
|
||||||
glyph_set.insert(g, c.into());
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a compressed `/ToUnicode` CMap.
|
/// Create a compressed `/ToUnicode` CMap.
|
||||||
#[comemo::memoize]
|
#[comemo::memoize]
|
||||||
#[typst_macros::time(name = "create cmap")]
|
#[typst_macros::time(name = "create cmap")]
|
||||||
|
@ -12,8 +12,8 @@ use typst::layout::{Abs, Page};
|
|||||||
use typst::model::{Destination, Numbering};
|
use typst::model::{Destination, Numbering};
|
||||||
use typst::text::Case;
|
use typst::text::Case;
|
||||||
|
|
||||||
|
use crate::Resources;
|
||||||
use crate::{content, AbsExt, PdfChunk, WithDocument, WithRefs, WithResources};
|
use crate::{content, AbsExt, PdfChunk, WithDocument, WithRefs, WithResources};
|
||||||
use crate::{font::improve_glyph_sets, Resources};
|
|
||||||
|
|
||||||
/// Construct page objects.
|
/// Construct page objects.
|
||||||
#[typst_macros::time(name = "construct pages")]
|
#[typst_macros::time(name = "construct pages")]
|
||||||
@ -52,9 +52,6 @@ pub fn traverse_pages(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
improve_glyph_sets(&mut resources.glyph_sets);
|
|
||||||
improve_glyph_sets(&mut resources.color_glyph_sets);
|
|
||||||
|
|
||||||
(PdfChunk::new(), (pages, resources))
|
(PdfChunk::new(), (pages, resources))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -77,11 +77,16 @@ pub struct Resources<R = Ref> {
|
|||||||
pub languages: BTreeMap<Lang, usize>,
|
pub languages: BTreeMap<Lang, usize>,
|
||||||
|
|
||||||
/// For each font a mapping from used glyphs to their text representation.
|
/// For each font a mapping from used glyphs to their text representation.
|
||||||
/// May contain multiple chars in case of ligatures or similar things. The
|
/// This is used for the PDF's /ToUnicode map, and important for copy-paste
|
||||||
/// same glyph can have a different text representation within one document,
|
/// and searching.
|
||||||
/// then we just save the first one. The resulting strings are used for the
|
///
|
||||||
/// PDF's /ToUnicode map for glyphs that don't have an entry in the font's
|
/// Note that the text representation may contain multiple chars in case of
|
||||||
/// cmap. This is important for copy-paste and searching.
|
/// ligatures or similar things, and it may have no entry in the font's cmap
|
||||||
|
/// (or only a private-use codepoint), like the “Th” in Linux Libertine.
|
||||||
|
///
|
||||||
|
/// A glyph may have multiple entries in the font's cmap, and even the same
|
||||||
|
/// glyph can have a different text representation within one document.
|
||||||
|
/// But /ToUnicode does not support that, so we just save the first occurrence.
|
||||||
pub glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
|
pub glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
|
||||||
/// Same as `glyph_sets`, but for color fonts.
|
/// Same as `glyph_sets`, but for color fonts.
|
||||||
pub color_glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
|
pub color_glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user