Make ligatures copyable and searchable

Fixes #479
Fixes #1040
This commit is contained in:
Laurenz 2023-05-03 10:33:18 +02:00
parent bcc014c4e1
commit ad347632ab
17 changed files with 229 additions and 187 deletions

12
Cargo.lock generated
View File

@ -116,6 +116,12 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "az"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b7e4c2464d97fe331d41de9d5db0def0a96f4d823b8b32a2efd503578988973"
[[package]] [[package]]
name = "base64" name = "base64"
version = "0.13.1" version = "0.13.1"
@ -1385,9 +1391,9 @@ checksum = "8835116a5c179084a830efb3adc117ab007512b535bc1a21c991d3b32a6b44dd"
[[package]] [[package]]
name = "pdf-writer" name = "pdf-writer"
version = "0.7.0" version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63f45f7c7538e67c58cb4977e4f97bbd75fbd3990d827d28d597ec746291f644" checksum = "30900f178ea696fc5d9637171f98aaa93d5aae54f0726726df68fc3e32810db6"
dependencies = [ dependencies = [
"bitflags 1.3.2", "bitflags 1.3.2",
"itoa", "itoa",
@ -2306,6 +2312,7 @@ dependencies = [
"tracing", "tracing",
"ttf-parser", "ttf-parser",
"typst-macros", "typst-macros",
"unicode-general-category",
"unicode-math-class", "unicode-math-class",
"unicode-segmentation", "unicode-segmentation",
"unicode-xid", "unicode-xid",
@ -2366,6 +2373,7 @@ dependencies = [
name = "typst-library" name = "typst-library"
version = "0.3.0" version = "0.3.0"
dependencies = [ dependencies = [
"az",
"chinese-number", "chinese-number",
"comemo", "comemo",
"csv", "csv",

View File

@ -33,7 +33,7 @@ indexmap = "1.9.3"
log = "0.4" log = "0.4"
miniz_oxide = "0.7" miniz_oxide = "0.7"
once_cell = "1" once_cell = "1"
pdf-writer = "0.7" pdf-writer = "0.7.1"
pixglyph = "0.1" pixglyph = "0.1"
regex = "1" regex = "1"
resvg = { version = "0.32", default-features = false } resvg = { version = "0.32", default-features = false }
@ -46,6 +46,7 @@ svg2pdf = { git = "https://github.com/typst/svg2pdf" }
tiny-skia = "0.9.0" tiny-skia = "0.9.0"
tracing = "0.1.37" tracing = "0.1.37"
ttf-parser = "0.18.1" ttf-parser = "0.18.1"
unicode-general-category = "0.6"
unicode-math-class = "0.1" unicode-math-class = "0.1"
unicode-segmentation = "1" unicode-segmentation = "1"
unicode-xid = "0.2" unicode-xid = "0.2"

Binary file not shown.

View File

@ -159,7 +159,7 @@ construct: |
data-loading: | data-loading: |
Data loading from external files. Data loading from external files.
These functions help you with embedding data from experiments and APIs in your These functions help you with embedding data from experiments in your
documents. documents.
utility: | utility: |

View File

@ -16,6 +16,7 @@ bench = false
[dependencies] [dependencies]
typst = { path = ".." } typst = { path = ".." }
az = "1.2"
chinese-number = { version = "0.7.2", default-features = false, features = ["number-to-chinese"] } chinese-number = { version = "0.7.2", default-features = false, features = ["number-to-chinese"] }
comemo = "0.2.2" comemo = "0.2.2"
csv = "1" csv = "1"

View File

@ -1139,8 +1139,7 @@ fn line<'a>(
// are no other items in the line. // are no other items in the line.
if hyphen || start + shaped.text.len() > range.end { if hyphen || start + shaped.text.len() > range.end {
if hyphen || start < range.end || before.is_empty() { if hyphen || start < range.end || before.is_empty() {
let shifted = start - base..range.end - base; let mut reshaped = shaped.reshape(vt, &p.spans, start..range.end);
let mut reshaped = shaped.reshape(vt, &p.spans, shifted);
if hyphen || shy { if hyphen || shy {
reshaped.push_hyphen(vt); reshaped.push_hyphen(vt);
} }
@ -1162,8 +1161,7 @@ fn line<'a>(
// Reshape if necessary. // Reshape if necessary.
if range.start + shaped.text.len() > end { if range.start + shaped.text.len() > end {
if range.start < end { if range.start < end {
let shifted = range.start - base..end - base; let reshaped = shaped.reshape(vt, &p.spans, range.start..end);
let reshaped = shaped.reshape(vt, &p.spans, shifted);
width += reshaped.width; width += reshaped.width;
first = Some(Item::Text(reshaped)); first = Some(Item::Text(reshaped));
} }

View File

@ -222,13 +222,13 @@ impl GlyphFragment {
size: self.font_size, size: self.font_size,
fill: self.fill, fill: self.fill,
lang: self.lang, lang: self.lang,
text: self.c.into(),
glyphs: vec![Glyph { glyphs: vec![Glyph {
id: self.id.0, id: self.id.0,
c: self.c,
x_advance: Em::from_length(self.width, self.font_size), x_advance: Em::from_length(self.width, self.font_size),
x_offset: Em::zero(), x_offset: Em::zero(),
span: self.span, range: 0..self.c.len_utf8() as u16,
offset: 0, span: (self.span, 0),
}], }],
}; };
let size = Size::new(self.width, self.ascent + self.descent); let size = Size::new(self.width, self.ascent + self.descent);

View File

@ -1,6 +1,7 @@
use std::ops::Range; use std::ops::Range;
use std::str::FromStr; use std::str::FromStr;
use az::SaturatingAs;
use rustybuzz::{Feature, Tag, UnicodeBuffer}; use rustybuzz::{Feature, Tag, UnicodeBuffer};
use typst::font::{Font, FontVariant}; use typst::font::{Font, FontVariant};
use typst::util::SliceExt; use typst::util::SliceExt;
@ -47,20 +48,18 @@ pub struct ShapedGlyph {
pub x_offset: Em, pub x_offset: Em,
/// The vertical offset of the glyph. /// The vertical offset of the glyph.
pub y_offset: Em, pub y_offset: Em,
/// The byte index in the source text where this glyph's cluster starts. A /// The byte range of this glyph's cluster in the full paragraph. A cluster
/// cluster is a sequence of one or multiple glyphs that cannot be /// is a sequence of one or multiple glyphs that cannot be separated and
/// separated and must always be treated as a union. /// must always be treated as a union.
pub cluster: usize, pub range: Range<usize>,
/// Whether splitting the shaping result before this glyph would yield the /// Whether splitting the shaping result before this glyph would yield the
/// same results as shaping the parts to both sides of `text_index` /// same results as shaping the parts to both sides of `text_index`
/// separately. /// separately.
pub safe_to_break: bool, pub safe_to_break: bool,
/// The first char in this glyph's cluster. /// The first char in this glyph's cluster.
pub c: char, pub c: char,
/// The source code location of the text. /// The source code location of the glyph and its byte offset within it.
pub span: Span, pub span: (Span, u16),
/// The offset within the spanned text.
pub offset: u16,
} }
#[derive(Debug, Clone, Default)] #[derive(Debug, Clone, Default)]
@ -181,6 +180,12 @@ impl<'a> ShapedText<'a> {
for ((font, y_offset), group) in for ((font, y_offset), group) in
self.glyphs.as_ref().group_by_key(|g| (g.font.clone(), g.y_offset)) self.glyphs.as_ref().group_by_key(|g| (g.font.clone(), g.y_offset))
{ {
let mut range = group[0].range.clone();
for glyph in group {
range.start = range.start.min(glyph.range.start);
range.end = range.end.max(glyph.range.end);
}
let pos = Point::new(offset, top + shift - y_offset.at(self.size)); let pos = Point::new(offset, top + shift - y_offset.at(self.size));
let glyphs = group let glyphs = group
.iter() .iter()
@ -195,8 +200,8 @@ impl<'a> ShapedText<'a> {
} else { } else {
glyph.stretchability().1 glyph.stretchability().1
}; };
let justification_left = adjustability_left * justification_ratio;
let justification_left = adjustability_left * justification_ratio;
let mut justification_right = let mut justification_right =
adjustability_right * justification_ratio; adjustability_right * justification_ratio;
if glyph.is_justifiable() { if glyph.is_justifiable() {
@ -206,15 +211,16 @@ impl<'a> ShapedText<'a> {
frame.size_mut().x += justification_left.at(self.size) frame.size_mut().x += justification_left.at(self.size)
+ justification_right.at(self.size); + justification_right.at(self.size);
Glyph { Glyph {
id: glyph.glyph_id, id: glyph.glyph_id,
x_advance: glyph.x_advance x_advance: glyph.x_advance
+ justification_left + justification_left
+ justification_right, + justification_right,
x_offset: glyph.x_offset + justification_left, x_offset: glyph.x_offset + justification_left,
c: glyph.c, range: (glyph.range.start - range.start).saturating_as()
..(glyph.range.end - range.start).saturating_as(),
span: glyph.span, span: glyph.span,
offset: glyph.offset,
} }
}) })
.collect(); .collect();
@ -224,6 +230,7 @@ impl<'a> ShapedText<'a> {
size: self.size, size: self.size,
lang, lang,
fill: fill.clone(), fill: fill.clone(),
text: self.text[range.start - self.base..range.end - self.base].into(),
glyphs, glyphs,
}; };
@ -318,16 +325,19 @@ impl<'a> ShapedText<'a> {
/// Reshape a range of the shaped text, reusing information from this /// Reshape a range of the shaped text, reusing information from this
/// shaping process if possible. /// shaping process if possible.
///
/// The text `range` is relative to the whole paragraph.
pub fn reshape( pub fn reshape(
&'a self, &'a self,
vt: &Vt, vt: &Vt,
spans: &SpanMapper, spans: &SpanMapper,
text_range: Range<usize>, text_range: Range<usize>,
) -> ShapedText<'a> { ) -> ShapedText<'a> {
let text = &self.text[text_range.start - self.base..text_range.end - self.base];
if let Some(glyphs) = self.slice_safe_to_break(text_range.clone()) { if let Some(glyphs) = self.slice_safe_to_break(text_range.clone()) {
Self { Self {
base: self.base + text_range.start, base: text_range.start,
text: &self.text[text_range], text,
dir: self.dir, dir: self.dir,
styles: self.styles, styles: self.styles,
size: self.size, size: self.size,
@ -336,14 +346,7 @@ impl<'a> ShapedText<'a> {
glyphs: Cow::Borrowed(glyphs), glyphs: Cow::Borrowed(glyphs),
} }
} else { } else {
shape( shape(vt, text_range.start, text, spans, self.styles, self.dir)
vt,
self.base + text_range.start,
&self.text[text_range],
spans,
self.styles,
self.dir,
)
} }
} }
@ -358,7 +361,11 @@ impl<'a> ShapedText<'a> {
let ttf = font.ttf(); let ttf = font.ttf();
let glyph_id = ttf.glyph_index('-')?; let glyph_id = ttf.glyph_index('-')?;
let x_advance = font.to_em(ttf.glyph_hor_advance(glyph_id)?); let x_advance = font.to_em(ttf.glyph_hor_advance(glyph_id)?);
let cluster = self.glyphs.last().map(|g| g.cluster).unwrap_or_default(); let range = self
.glyphs
.last()
.map(|g| g.range.end..g.range.end)
.unwrap_or_default();
self.width += x_advance.at(self.size); self.width += x_advance.at(self.size);
self.glyphs.to_mut().push(ShapedGlyph { self.glyphs.to_mut().push(ShapedGlyph {
font, font,
@ -366,11 +373,10 @@ impl<'a> ShapedText<'a> {
x_advance, x_advance,
x_offset: Em::zero(), x_offset: Em::zero(),
y_offset: Em::zero(), y_offset: Em::zero(),
cluster, range,
safe_to_break: true, safe_to_break: true,
c: '-', c: '-',
span: Span::detached(), span: (Span::detached(), 0),
offset: 0,
}); });
Some(()) Some(())
}); });
@ -396,9 +402,9 @@ impl<'a> ShapedText<'a> {
// Handle edge cases. // Handle edge cases.
let len = self.glyphs.len(); let len = self.glyphs.len();
if text_index == 0 { if text_index == self.base {
return Some(if ltr { 0 } else { len }); return Some(if ltr { 0 } else { len });
} else if text_index == self.text.len() { } else if text_index == self.base + self.text.len() {
return Some(if ltr { len } else { 0 }); return Some(if ltr { len } else { 0 });
} }
@ -406,7 +412,7 @@ impl<'a> ShapedText<'a> {
let mut idx = self let mut idx = self
.glyphs .glyphs
.binary_search_by(|g| { .binary_search_by(|g| {
let ordering = g.cluster.cmp(&text_index); let ordering = g.range.start.cmp(&text_index);
if ltr { if ltr {
ordering ordering
} else { } else {
@ -422,7 +428,7 @@ impl<'a> ShapedText<'a> {
// Search for the outermost glyph with the text index. // Search for the outermost glyph with the text index.
while let Some(next) = next(idx, 1) { while let Some(next) = next(idx, 1) {
if self.glyphs.get(next).map_or(true, |g| g.cluster != text_index) { if self.glyphs.get(next).map_or(true, |g| g.range.start != text_index) {
break; break;
} }
idx = next; idx = next;
@ -444,7 +450,6 @@ impl Debug for ShapedText<'_> {
/// Holds shaping results and metadata common to all shaped segments. /// Holds shaping results and metadata common to all shaped segments.
struct ShapingContext<'a> { struct ShapingContext<'a> {
vt: &'a Vt<'a>, vt: &'a Vt<'a>,
base: usize,
spans: &'a SpanMapper, spans: &'a SpanMapper,
glyphs: Vec<ShapedGlyph>, glyphs: Vec<ShapedGlyph>,
used: Vec<Font>, used: Vec<Font>,
@ -468,7 +473,6 @@ pub fn shape<'a>(
let size = TextElem::size_in(styles); let size = TextElem::size_in(styles);
let mut ctx = ShapingContext { let mut ctx = ShapingContext {
vt, vt,
base,
spans, spans,
size, size,
glyphs: vec![], glyphs: vec![],
@ -481,7 +485,7 @@ pub fn shape<'a>(
}; };
if !text.is_empty() { if !text.is_empty() {
shape_segment(&mut ctx, 0, text, families(styles)); shape_segment(&mut ctx, base, text, families(styles));
} }
track_and_space(&mut ctx); track_and_space(&mut ctx);
@ -552,6 +556,7 @@ fn shape_segment(
let buffer = rustybuzz::shape(font.rusty(), &ctx.tags, buffer); let buffer = rustybuzz::shape(font.rusty(), &ctx.tags, buffer);
let infos = buffer.glyph_infos(); let infos = buffer.glyph_infos();
let pos = buffer.glyph_positions(); let pos = buffer.glyph_positions();
let ltr = ctx.dir.is_positive();
// Collect the shaped glyphs, doing fallback and shaping parts again with // Collect the shaped glyphs, doing fallback and shaping parts again with
// the next font if necessary. // the next font if necessary.
@ -560,32 +565,36 @@ fn shape_segment(
let info = &infos[i]; let info = &infos[i];
let cluster = info.cluster as usize; let cluster = info.cluster as usize;
if info.glyph_id != 0 {
// Add the glyph to the shaped output. // Add the glyph to the shaped output.
// TODO: Don't ignore y_advance. if info.glyph_id != 0 {
let (span, offset) = ctx.spans.span_at(ctx.base + cluster); // Determine the text range of the glyph.
let start = base + cluster;
let end = base
+ if ltr { i.checked_add(1) } else { i.checked_sub(1) }
.and_then(|last| infos.get(last))
.map_or(text.len(), |info| info.cluster as usize);
ctx.glyphs.push(ShapedGlyph { ctx.glyphs.push(ShapedGlyph {
font: font.clone(), font: font.clone(),
glyph_id: info.glyph_id as u16, glyph_id: info.glyph_id as u16,
// TODO: Don't ignore y_advance.
x_advance: font.to_em(pos[i].x_advance), x_advance: font.to_em(pos[i].x_advance),
x_offset: font.to_em(pos[i].x_offset), x_offset: font.to_em(pos[i].x_offset),
y_offset: font.to_em(pos[i].y_offset), y_offset: font.to_em(pos[i].y_offset),
cluster: base + cluster, range: start..end,
safe_to_break: !info.unsafe_to_break(), safe_to_break: !info.unsafe_to_break(),
c: text[cluster..].chars().next().unwrap(), c: text[cluster..].chars().next().unwrap(),
span, span: ctx.spans.span_at(start),
offset,
}); });
} else { } else {
// Determine the source text range for the tofu sequence.
let range = {
// First, search for the end of the tofu sequence. // First, search for the end of the tofu sequence.
let k = i; let k = i;
while infos.get(i + 1).map_or(false, |info| info.glyph_id == 0) { while infos.get(i + 1).map_or(false, |info| info.glyph_id == 0) {
i += 1; i += 1;
} }
// Then, determine the start and end text index. // Then, determine the start and end text index for the tofu
// sequence.
// //
// Examples: // Examples:
// Everything is shown in visual order. Tofus are written as "_". // Everything is shown in visual order. Tofus are written as "_".
@ -603,25 +612,19 @@ fn shape_segment(
// Glyphs: E C _ _ A // Glyphs: E C _ _ A
// Clusters: 8 6 4 2 0 // Clusters: 8 6 4 2 0
// k=2 i=3 // k=2 i=3
let ltr = ctx.dir.is_positive(); let start = infos[if ltr { k } else { i }].cluster as usize;
let first = if ltr { k } else { i }; let end = if ltr { i.checked_add(1) } else { k.checked_sub(1) }
let start = infos[first].cluster as usize;
let last = if ltr { i.checked_add(1) } else { k.checked_sub(1) };
let end = last
.and_then(|last| infos.get(last)) .and_then(|last| infos.get(last))
.map_or(text.len(), |info| info.cluster as usize); .map_or(text.len(), |info| info.cluster as usize);
start..end
};
// Trim half-baked cluster. // Trim half-baked cluster.
let remove = base + range.start..base + range.end; let remove = base + start..base + end;
while ctx.glyphs.last().map_or(false, |g| remove.contains(&g.cluster)) { while ctx.glyphs.last().map_or(false, |g| remove.contains(&g.range.start)) {
ctx.glyphs.pop(); ctx.glyphs.pop();
} }
// Recursively shape the tofu sequence with the next family. // Recursively shape the tofu sequence with the next family.
shape_segment(ctx, base + range.start, &text[range], families.clone()); shape_segment(ctx, base + start, &text[start..end], families.clone());
} }
i += 1; i += 1;
@ -634,19 +637,18 @@ fn shape_segment(
fn shape_tofus(ctx: &mut ShapingContext, base: usize, text: &str, font: Font) { fn shape_tofus(ctx: &mut ShapingContext, base: usize, text: &str, font: Font) {
let x_advance = font.advance(0).unwrap_or_default(); let x_advance = font.advance(0).unwrap_or_default();
for (cluster, c) in text.char_indices() { for (cluster, c) in text.char_indices() {
let cluster = base + cluster; let start = base + cluster;
let (span, offset) = ctx.spans.span_at(ctx.base + cluster); let end = start + c.len_utf8();
ctx.glyphs.push(ShapedGlyph { ctx.glyphs.push(ShapedGlyph {
font: font.clone(), font: font.clone(),
glyph_id: 0, glyph_id: 0,
x_advance, x_advance,
x_offset: Em::zero(), x_offset: Em::zero(),
y_offset: Em::zero(), y_offset: Em::zero(),
cluster, range: start..end,
safe_to_break: true, safe_to_break: true,
c, c,
span, span: ctx.spans.span_at(start),
offset,
}); });
} }
} }
@ -668,7 +670,10 @@ fn track_and_space(ctx: &mut ShapingContext) {
glyph.x_advance = spacing.relative_to(glyph.x_advance); glyph.x_advance = spacing.relative_to(glyph.x_advance);
} }
if glyphs.peek().map_or(false, |next| glyph.cluster != next.cluster) { if glyphs
.peek()
.map_or(false, |next| glyph.range.start != next.range.start)
{
glyph.x_advance += tracking; glyph.x_advance += tracking;
} }
} }

View File

@ -1,7 +1,8 @@
//! Finished documents. //! Finished documents.
use std::fmt::{self, Debug, Formatter, Write}; use std::fmt::{self, Debug, Formatter};
use std::num::NonZeroUsize; use std::num::NonZeroUsize;
use std::ops::Range;
use std::str::FromStr; use std::str::FromStr;
use std::sync::Arc; use std::sync::Arc;
@ -114,23 +115,6 @@ impl Frame {
pub fn items(&self) -> std::slice::Iter<'_, (Point, FrameItem)> { pub fn items(&self) -> std::slice::Iter<'_, (Point, FrameItem)> {
self.items.iter() self.items.iter()
} }
/// Approximately recover the text inside of the frame and its children.
pub fn text(&self) -> EcoString {
let mut text = EcoString::new();
for (_, item) in self.items() {
match item {
FrameItem::Text(item) => {
for glyph in &item.glyphs {
text.push(glyph.c);
}
}
FrameItem::Group(group) => text.push_str(&group.frame.text()),
_ => {}
}
}
text
}
} }
/// Insert items and subframes. /// Insert items and subframes.
@ -476,6 +460,8 @@ pub struct TextItem {
pub fill: Paint, pub fill: Paint,
/// The natural language of the text. /// The natural language of the text.
pub lang: Lang, pub lang: Lang,
/// The item's plain text.
pub text: EcoString,
/// The glyphs. /// The glyphs.
pub glyphs: Vec<Glyph>, pub glyphs: Vec<Glyph>,
} }
@ -489,19 +475,14 @@ impl TextItem {
impl Debug for TextItem { impl Debug for TextItem {
fn fmt(&self, f: &mut Formatter) -> fmt::Result { fn fmt(&self, f: &mut Formatter) -> fmt::Result {
// This is only a rough approximation of the source text. f.write_str("Text(")?;
f.write_str("Text(\"")?; self.text.fmt(f)?;
for glyph in &self.glyphs { f.write_str(")")
for c in glyph.c.escape_debug() {
f.write_char(c)?;
}
}
f.write_str("\")")
} }
} }
/// A glyph in a run of shaped text. /// A glyph in a run of shaped text.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] #[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub struct Glyph { pub struct Glyph {
/// The glyph's index in the font. /// The glyph's index in the font.
pub id: u16, pub id: u16,
@ -509,12 +490,17 @@ pub struct Glyph {
pub x_advance: Em, pub x_advance: Em,
/// The horizontal offset of the glyph. /// The horizontal offset of the glyph.
pub x_offset: Em, pub x_offset: Em,
/// The first character of the glyph's cluster. /// The range of the glyph in its item's text.
pub c: char, pub range: Range<u16>,
/// The source code location of the text. /// The source code location of the text.
pub span: Span, pub span: (Span, u16),
/// The offset within the spanned text. }
pub offset: u16,
impl Glyph {
/// The range of the glyph in its item's text.
pub fn range(&self) -> Range<usize> {
usize::from(self.range.start)..usize::from(self.range.end)
}
} }
/// An identifier for a natural language. /// An identifier for a natural language.

View File

@ -1,13 +1,21 @@
use std::collections::BTreeMap; use std::collections::BTreeMap;
use ecow::eco_format; use ecow::{eco_format, EcoString};
use pdf_writer::types::{CidFontType, FontFlags, SystemInfo, UnicodeCmap}; use pdf_writer::types::{CidFontType, FontFlags, SystemInfo, UnicodeCmap};
use pdf_writer::{Filter, Finish, Name, Rect, Str}; use pdf_writer::{Filter, Finish, Name, Rect, Str};
use ttf_parser::{name_id, GlyphId, Tag}; use ttf_parser::{name_id, GlyphId, Tag};
use unicode_general_category::GeneralCategory;
use super::{deflate, EmExt, PdfContext, RefExt}; use super::{deflate, EmExt, PdfContext, RefExt};
use crate::util::SliceExt; use crate::util::SliceExt;
const CMAP_NAME: Name = Name(b"Custom");
const SYSTEM_INFO: SystemInfo = SystemInfo {
registry: Str(b"Adobe"),
ordering: Str(b"Identity"),
supplement: 0,
};
/// Embed all used fonts into the PDF. /// Embed all used fonts into the PDF.
#[tracing::instrument(skip_all)] #[tracing::instrument(skip_all)]
pub fn write_fonts(ctx: &mut PdfContext) { pub fn write_fonts(ctx: &mut PdfContext) {
@ -19,7 +27,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
let data_ref = ctx.alloc.bump(); let data_ref = ctx.alloc.bump();
ctx.font_refs.push(type0_ref); ctx.font_refs.push(type0_ref);
let glyphs = &ctx.glyph_sets[font]; let glyph_set = ctx.glyph_sets.get_mut(font).unwrap();
let metrics = font.metrics(); let metrics = font.metrics();
let ttf = font.ttf(); let ttf = font.ttf();
@ -29,12 +37,6 @@ pub fn write_fonts(ctx: &mut PdfContext) {
let base_font = eco_format!("ABCDEF+{}", postscript_name); let base_font = eco_format!("ABCDEF+{}", postscript_name);
let base_font = Name(base_font.as_bytes()); let base_font = Name(base_font.as_bytes());
let cmap_name = Name(b"Custom");
let system_info = SystemInfo {
registry: Str(b"Adobe"),
ordering: Str(b"Identity"),
supplement: 0,
};
// Write the base font object referencing the CID font. // Write the base font object referencing the CID font.
ctx.writer ctx.writer
@ -59,7 +61,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
let mut cid = ctx.writer.cid_font(cid_ref); let mut cid = ctx.writer.cid_font(cid_ref);
cid.subtype(subtype); cid.subtype(subtype);
cid.base_font(base_font); cid.base_font(base_font);
cid.system_info(system_info); cid.system_info(SYSTEM_INFO);
cid.font_descriptor(descriptor_ref); cid.font_descriptor(descriptor_ref);
cid.default_width(0.0); cid.default_width(0.0);
@ -70,7 +72,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
// Extract the widths of all glyphs. // Extract the widths of all glyphs.
let num_glyphs = ttf.number_of_glyphs(); let num_glyphs = ttf.number_of_glyphs();
let mut widths = vec![0.0; num_glyphs as usize]; let mut widths = vec![0.0; num_glyphs as usize];
for &g in glyphs { for &g in glyph_set.keys() {
let x = ttf.glyph_hor_advance(GlyphId(g)).unwrap_or(0); let x = ttf.glyph_hor_advance(GlyphId(g)).unwrap_or(0);
widths[g as usize] = font.to_em(x).to_font_units(); widths[g as usize] = font.to_em(x).to_font_units();
} }
@ -130,42 +132,15 @@ pub fn write_fonts(ctx: &mut PdfContext) {
font_descriptor.finish(); font_descriptor.finish();
// Compute a reverse mapping from glyphs to unicode.
let cmap = {
let mut mapping = BTreeMap::new();
for subtable in
ttf.tables().cmap.into_iter().flat_map(|table| table.subtables)
{
if subtable.is_unicode() {
subtable.codepoints(|n| {
if let Some(c) = std::char::from_u32(n) {
if let Some(GlyphId(g)) = ttf.glyph_index(c) {
if glyphs.contains(&g) {
mapping.insert(g, c);
}
}
}
});
}
}
let mut cmap = UnicodeCmap::new(cmap_name, system_info);
for (g, c) in mapping {
cmap.pair(g, c);
}
cmap
};
// Write the /ToUnicode character map, which maps glyph ids back to // Write the /ToUnicode character map, which maps glyph ids back to
// unicode codepoints to enable copying out of the PDF. // unicode codepoints to enable copying out of the PDF.
ctx.writer let cmap = create_cmap(ttf, glyph_set);
.cmap(cmap_ref, &deflate(&cmap.finish())) ctx.writer.cmap(cmap_ref, &cmap.finish());
.filter(Filter::FlateDecode);
// Subset and write the font's bytes. // Subset and write the font's bytes.
let data = font.data(); let data = font.data();
let subsetted = { let subsetted = {
let glyphs: Vec<_> = glyphs.iter().copied().collect(); let glyphs: Vec<_> = glyph_set.keys().copied().collect();
let profile = subsetter::Profile::pdf(&glyphs); let profile = subsetter::Profile::pdf(&glyphs);
subsetter::subset(data, font.index(), profile) subsetter::subset(data, font.index(), profile)
}; };
@ -183,3 +158,44 @@ pub fn write_fonts(ctx: &mut PdfContext) {
stream.finish(); stream.finish();
} }
} }
/// Build the /ToUnicode CMap for a font from its set of used glyphs.
///
/// `glyph_set` maps each used glyph id to a text representation. Entries for
/// glyphs that are reachable through a non-private-use codepoint in one of the
/// font's Unicode cmap subtables are overwritten in place with that codepoint.
/// Afterwards, every glyph with a non-empty text is written into the returned
/// CMap.
fn create_cmap(
    ttf: &ttf_parser::Face,
    glyph_set: &mut BTreeMap<u16, EcoString>,
) -> UnicodeCmap {
    // Codepoints from the font's own cmap table take precedence over the text
    // mappings collected from the document. Only glyphs without a
    // corresponding codepoint (or with just a private-use one), like the "Th"
    // in Linux Libertine, keep the text of their first occurrence in the
    // document.
    let unicode_subtables = ttf
        .tables()
        .cmap
        .into_iter()
        .flat_map(|table| table.subtables)
        .filter(|subtable| subtable.is_unicode());

    for subtable in unicode_subtables {
        subtable.codepoints(|n| {
            let Some(c) = std::char::from_u32(n) else { return };

            // Private-use codepoints carry no portable meaning, so they must
            // not replace the text taken from the document.
            let category = unicode_general_category::get_general_category(c);
            if category == GeneralCategory::PrivateUse {
                return;
            }

            let Some(GlyphId(g)) = ttf.glyph_index(c) else { return };

            // Only glyphs that are actually used get updated; nothing new is
            // added to the set here.
            if let Some(text) = glyph_set.get_mut(&g) {
                *text = c.into();
            }
        });
    }

    // Produce the reverse mapping from glyphs to unicode strings, leaving out
    // glyphs for which no text is known.
    let mut cmap = UnicodeCmap::new(CMAP_NAME, SYSTEM_INFO);
    let mapped = glyph_set.iter().filter(|(_, text)| !text.is_empty());
    for (&g, text) in mapped {
        cmap.pair_with_multiple(g, text.chars());
    }

    cmap
}

View File

@ -6,9 +6,10 @@ mod outline;
mod page; mod page;
use std::cmp::Eq; use std::cmp::Eq;
use std::collections::{HashMap, HashSet}; use std::collections::{BTreeMap, HashMap};
use std::hash::Hash; use std::hash::Hash;
use ecow::EcoString;
use pdf_writer::types::Direction; use pdf_writer::types::Direction;
use pdf_writer::{Finish, Name, PdfWriter, Ref, TextStr}; use pdf_writer::{Finish, Name, PdfWriter, Ref, TextStr};
use xmp_writer::{LangId, RenditionClass, XmpWriter}; use xmp_writer::{LangId, RenditionClass, XmpWriter};
@ -52,7 +53,13 @@ pub struct PdfContext<'a> {
page_refs: Vec<Ref>, page_refs: Vec<Ref>,
font_map: Remapper<Font>, font_map: Remapper<Font>,
image_map: Remapper<Image>, image_map: Remapper<Image>,
glyph_sets: HashMap<Font, HashSet<u16>>, /// For each font a mapping from used glyphs to their text representation.
/// May contain multiple chars in case of ligatures or similar things. If the
/// same glyph has different text representations within one document, we
/// just save the first one. The resulting strings are used for the PDF's
/// /ToUnicode map for glyphs that don't have an entry in the font's cmap.
/// This is important for copy-paste and searching.
glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
languages: HashMap<Lang, usize>, languages: HashMap<Lang, usize>,
} }

View File

@ -364,11 +364,12 @@ fn write_group(ctx: &mut PageContext, pos: Point, group: &GroupItem) {
/// Encode a text run into the content stream. /// Encode a text run into the content stream.
fn write_text(ctx: &mut PageContext, x: f32, y: f32, text: &TextItem) { fn write_text(ctx: &mut PageContext, x: f32, y: f32, text: &TextItem) {
*ctx.parent.languages.entry(text.lang).or_insert(0) += text.glyphs.len(); *ctx.parent.languages.entry(text.lang).or_insert(0) += text.glyphs.len();
ctx.parent
.glyph_sets let glyph_set = ctx.parent.glyph_sets.entry(text.font.clone()).or_default();
.entry(text.font.clone()) for g in &text.glyphs {
.or_default() let segment = &text.text[g.range()];
.extend(text.glyphs.iter().map(|g| g.id)); glyph_set.entry(g.id).or_insert_with(|| segment.into());
}
ctx.set_fill(&text.fill); ctx.set_fill(&text.fill);
ctx.set_font(&text.font, text.size); ctx.set_font(&text.font, text.size);

View File

@ -67,7 +67,8 @@ pub fn jump_from_click(
FrameItem::Text(text) => { FrameItem::Text(text) => {
for glyph in &text.glyphs { for glyph in &text.glyphs {
if glyph.span.is_detached() { let (span, span_offset) = glyph.span;
if span.is_detached() {
continue; continue;
} }
@ -77,13 +78,13 @@ pub fn jump_from_click(
Size::new(width, text.size), Size::new(width, text.size),
click, click,
) { ) {
let source = world.source(glyph.span.source()); let source = world.source(span.source());
let node = source.find(glyph.span)?; let node = source.find(span)?;
let pos = if node.kind() == SyntaxKind::Text { let pos = if node.kind() == SyntaxKind::Text {
let range = node.range(); let range = node.range();
let mut offset = range.start + usize::from(glyph.offset); let mut offset = range.start + usize::from(span_offset);
if (click.x - pos.x) > width / 2.0 { if (click.x - pos.x) > width / 2.0 {
offset += glyph.c.len_utf8(); offset += glyph.range().len();
} }
offset.min(range.end) offset.min(range.end)
} else { } else {
@ -150,7 +151,7 @@ fn find_in_frame(frame: &Frame, span: Span) -> Option<Point> {
if let FrameItem::Text(text) = item { if let FrameItem::Text(text) = item {
for glyph in &text.glyphs { for glyph in &text.glyphs {
if glyph.span == span { if glyph.span.0 == span {
return Some(pos); return Some(pos);
} }
pos.x += glyph.x_advance.at(text.size); pos.x += glyph.x_advance.at(text.size);

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 KiB

After

Width:  |  Height:  |  Size: 2.5 KiB

View File

@ -353,9 +353,18 @@ fn test(
pdf_path: Option<&Path>, pdf_path: Option<&Path>,
args: &Args, args: &Args,
) -> bool { ) -> bool {
let name = src_path.strip_prefix(TYP_DIR).unwrap_or(src_path); struct PanicGuard<'a>(&'a Path);
impl Drop for PanicGuard<'_> {
fn drop(&mut self) {
if std::thread::panicking() {
println!("Panicked in {}", self.0.display());
}
}
}
let name = src_path.strip_prefix(TYP_DIR).unwrap_or(src_path);
let text = fs::read_to_string(src_path).unwrap(); let text = fs::read_to_string(src_path).unwrap();
let _guard = PanicGuard(name);
let mut output = String::new(); let mut output = String::new();
let mut ok = true; let mut ok = true;
@ -401,6 +410,7 @@ fn test(
line, line,
&mut rng, &mut rng,
); );
ok &= part_ok; ok &= part_ok;
compare_ever |= compare_here; compare_ever |= compare_here;
frames.extend(part_frames); frames.extend(part_frames);

View File

@ -0,0 +1,8 @@
// Test copy-paste and search in PDF with ligatures
// and Arabic text. Must be tested manually!
---
The after fira 🏳️‍🌈!
#set text(lang: "ar", font: "Noto Sans Arabic")
مرحبًا