Make ligatures copyable and searchable

Fixes #479 Fixes #1040
2025-07-13 07:32:52 +08:00 · 2023-05-03 10:33:18 +02:00 · 2023-05-03 10:33:18 +02:00 · ad347632ab
commit ad347632ab
parent bcc014c4e1
17 changed files with 229 additions and 187 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -116,6 +116,12 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
 [[package]]
 name = "az"
 version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7b7e4c2464d97fe331d41de9d5db0def0a96f4d823b8b32a2efd503578988973"
 [[package]]
 name = "base64"
 version = "0.13.1"
@ -1385,9 +1391,9 @@ checksum = "8835116a5c179084a830efb3adc117ab007512b535bc1a21c991d3b32a6b44dd"
 [[package]]
 name = "pdf-writer"
-version = "0.7.0"
+version = "0.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "63f45f7c7538e67c58cb4977e4f97bbd75fbd3990d827d28d597ec746291f644"
+checksum = "30900f178ea696fc5d9637171f98aaa93d5aae54f0726726df68fc3e32810db6"
 dependencies = [
 "bitflags 1.3.2",
 "itoa",
@ -2306,6 +2312,7 @@ dependencies = [
 "tracing",
 "ttf-parser",
 "typst-macros",
 "unicode-general-category",
 "unicode-math-class",
 "unicode-segmentation",
 "unicode-xid",
@ -2366,6 +2373,7 @@ dependencies = [
 name = "typst-library"
 version = "0.3.0"
 dependencies = [
 "az",
 "chinese-number",
 "comemo",
 "csv",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -33,7 +33,7 @@ indexmap = "1.9.3"
 log = "0.4"
 miniz_oxide = "0.7"
 once_cell = "1"
-pdf-writer = "0.7"
+pdf-writer = "0.7.1"
 pixglyph = "0.1"
 regex = "1"
 resvg = { version = "0.32", default-features = false }
@ -46,6 +46,7 @@ svg2pdf = { git = "https://github.com/typst/svg2pdf" }
 tiny-skia = "0.9.0"
 tracing = "0.1.37"
 ttf-parser = "0.18.1"
 unicode-general-category = "0.6"
 unicode-math-class = "0.1"
 unicode-segmentation = "1"
 unicode-xid = "0.2"
--- a/assets/fonts/IBMPlexSansDevanagari-Regular.ttf
+++ b/assets/fonts/IBMPlexSansDevanagari-Regular.ttf
--- a/docs/src/reference/details.yml
+++ b/docs/src/reference/details.yml
@ -159,7 +159,7 @@ construct: |
 data-loading: |
  Data loading from external files.
-  These functions help you with embedding data from experiments and APIs in your
+  These functions help you with embedding data from experiments in your
  documents.
 utility: |
--- a/library/Cargo.toml
+++ b/library/Cargo.toml
@ -16,6 +16,7 @@ bench = false
 [dependencies]
 typst = { path = ".." }
 az = "1.2"
 chinese-number = { version = "0.7.2", default-features = false, features = ["number-to-chinese"] }
 comemo = "0.2.2"
 csv = "1"
--- a/library/src/layout/par.rs
+++ b/library/src/layout/par.rs
@ -1139,8 +1139,7 @@ fn line<'a>(
        // are no other items in the line.
        if hyphen || start + shaped.text.len() > range.end {
            if hyphen || start < range.end || before.is_empty() {
-                let shifted = start - base..range.end - base;
+                let mut reshaped = shaped.reshape(vt, &p.spans, start..range.end);
                let mut reshaped = shaped.reshape(vt, &p.spans, shifted);
                if hyphen || shy {
                    reshaped.push_hyphen(vt);
                }
@ -1162,8 +1161,7 @@ fn line<'a>(
        // Reshape if necessary.
        if range.start + shaped.text.len() > end {
            if range.start < end {
-                let shifted = range.start - base..end - base;
+                let reshaped = shaped.reshape(vt, &p.spans, range.start..end);
                let reshaped = shaped.reshape(vt, &p.spans, shifted);
                width += reshaped.width;
                first = Some(Item::Text(reshaped));
            }
--- a/library/src/math/fragment.rs
+++ b/library/src/math/fragment.rs
@ -222,13 +222,13 @@ impl GlyphFragment {
            size: self.font_size,
            fill: self.fill,
            lang: self.lang,
            text: self.c.into(),
            glyphs: vec![Glyph {
                id: self.id.0,
                c: self.c,
                x_advance: Em::from_length(self.width, self.font_size),
                x_offset: Em::zero(),
-                span: self.span,
+                range: 0..self.c.len_utf8() as u16,
-                offset: 0,
+                span: (self.span, 0),
            }],
        };
        let size = Size::new(self.width, self.ascent + self.descent);
--- a/library/src/text/shaping.rs
+++ b/library/src/text/shaping.rs
@ -1,6 +1,7 @@
 use std::ops::Range;
 use std::str::FromStr;
 use az::SaturatingAs;
 use rustybuzz::{Feature, Tag, UnicodeBuffer};
 use typst::font::{Font, FontVariant};
 use typst::util::SliceExt;
@ -47,20 +48,18 @@ pub struct ShapedGlyph {
    pub x_offset: Em,
    /// The vertical offset of the glyph.
    pub y_offset: Em,
-    /// The byte index in the source text where this glyph's cluster starts. A
+    /// The byte range of this glyph's cluster in the full paragraph. A cluster
-    /// cluster is a sequence of one or multiple glyphs that cannot be
+    /// is a sequence of one or multiple glyphs that cannot be separated and
-    /// separated and must always be treated as a union.
+    /// must always be treated as a union.
-    pub cluster: usize,
+    pub range: Range<usize>,
    /// Whether splitting the shaping result before this glyph would yield the
    /// same results as shaping the parts to both sides of `text_index`
    /// separately.
    pub safe_to_break: bool,
    /// The first char in this glyph's cluster.
    pub c: char,
-    /// The source code location of the text.
+    /// The source code location of the glyph and its byte offset within it.
-    pub span: Span,
+    pub span: (Span, u16),
    /// The offset within the spanned text.
    pub offset: u16,
 }
 #[derive(Debug, Clone, Default)]
@ -181,6 +180,12 @@ impl<'a> ShapedText<'a> {
        for ((font, y_offset), group) in
            self.glyphs.as_ref().group_by_key(|g| (g.font.clone(), g.y_offset))
        {
            let mut range = group[0].range.clone();
            for glyph in group {
                range.start = range.start.min(glyph.range.start);
                range.end = range.end.max(glyph.range.end);
            }
            let pos = Point::new(offset, top + shift - y_offset.at(self.size));
            let glyphs = group
                .iter()
@ -195,8 +200,8 @@ impl<'a> ShapedText<'a> {
                    } else {
                        glyph.stretchability().1
                    };
                    let justification_left = adjustability_left * justification_ratio;
                    let justification_left = adjustability_left * justification_ratio;
                    let mut justification_right =
                        adjustability_right * justification_ratio;
                    if glyph.is_justifiable() {
@ -206,15 +211,16 @@ impl<'a> ShapedText<'a> {
                    frame.size_mut().x += justification_left.at(self.size)
                        + justification_right.at(self.size);
                    Glyph {
                        id: glyph.glyph_id,
                        x_advance: glyph.x_advance
                            + justification_left
                            + justification_right,
                        x_offset: glyph.x_offset + justification_left,
-                        c: glyph.c,
+                        range: (glyph.range.start - range.start).saturating_as()
                            ..(glyph.range.end - range.start).saturating_as(),
                        span: glyph.span,
                        offset: glyph.offset,
                    }
                })
                .collect();
@ -224,6 +230,7 @@ impl<'a> ShapedText<'a> {
                size: self.size,
                lang,
                fill: fill.clone(),
                text: self.text[range.start - self.base..range.end - self.base].into(),
                glyphs,
            };
@ -318,16 +325,19 @@ impl<'a> ShapedText<'a> {
    /// Reshape a range of the shaped text, reusing information from this
    /// shaping process if possible.
    ///
    /// The text `range` is relative to the whole paragraph.
    pub fn reshape(
        &'a self,
        vt: &Vt,
        spans: &SpanMapper,
        text_range: Range<usize>,
    ) -> ShapedText<'a> {
        let text = &self.text[text_range.start - self.base..text_range.end - self.base];
        if let Some(glyphs) = self.slice_safe_to_break(text_range.clone()) {
            Self {
-                base: self.base + text_range.start,
+                base: text_range.start,
-                text: &self.text[text_range],
+                text,
                dir: self.dir,
                styles: self.styles,
                size: self.size,
@ -336,14 +346,7 @@ impl<'a> ShapedText<'a> {
                glyphs: Cow::Borrowed(glyphs),
            }
        } else {
-            shape(
+            shape(vt, text_range.start, text, spans, self.styles, self.dir)
                vt,
                self.base + text_range.start,
                &self.text[text_range],
                spans,
                self.styles,
                self.dir,
            )
        }
    }
@ -358,7 +361,11 @@ impl<'a> ShapedText<'a> {
            let ttf = font.ttf();
            let glyph_id = ttf.glyph_index('-')?;
            let x_advance = font.to_em(ttf.glyph_hor_advance(glyph_id)?);
-            let cluster = self.glyphs.last().map(|g| g.cluster).unwrap_or_default();
+            let range = self
                .glyphs
                .last()
                .map(|g| g.range.end..g.range.end)
                .unwrap_or_default();
            self.width += x_advance.at(self.size);
            self.glyphs.to_mut().push(ShapedGlyph {
                font,
@ -366,11 +373,10 @@ impl<'a> ShapedText<'a> {
                x_advance,
                x_offset: Em::zero(),
                y_offset: Em::zero(),
-                cluster,
+                range,
                safe_to_break: true,
                c: '-',
-                span: Span::detached(),
+                span: (Span::detached(), 0),
                offset: 0,
            });
            Some(())
        });
@ -396,9 +402,9 @@ impl<'a> ShapedText<'a> {
        // Handle edge cases.
        let len = self.glyphs.len();
-        if text_index == 0 {
+        if text_index == self.base {
            return Some(if ltr { 0 } else { len });
-        } else if text_index == self.text.len() {
+        } else if text_index == self.base + self.text.len() {
            return Some(if ltr { len } else { 0 });
        }
@ -406,7 +412,7 @@ impl<'a> ShapedText<'a> {
        let mut idx = self
            .glyphs
            .binary_search_by(|g| {
-                let ordering = g.cluster.cmp(&text_index);
+                let ordering = g.range.start.cmp(&text_index);
                if ltr {
                    ordering
                } else {
@ -422,7 +428,7 @@ impl<'a> ShapedText<'a> {
        // Search for the outermost glyph with the text index.
        while let Some(next) = next(idx, 1) {
-            if self.glyphs.get(next).map_or(true, |g| g.cluster != text_index) {
+            if self.glyphs.get(next).map_or(true, |g| g.range.start != text_index) {
                break;
            }
            idx = next;
@ -444,7 +450,6 @@ impl Debug for ShapedText<'_> {
 /// Holds shaping results and metadata common to all shaped segments.
 struct ShapingContext<'a> {
    vt: &'a Vt<'a>,
    base: usize,
    spans: &'a SpanMapper,
    glyphs: Vec<ShapedGlyph>,
    used: Vec<Font>,
@ -468,7 +473,6 @@ pub fn shape<'a>(
    let size = TextElem::size_in(styles);
    let mut ctx = ShapingContext {
        vt,
        base,
        spans,
        size,
        glyphs: vec![],
@ -481,7 +485,7 @@ pub fn shape<'a>(
    };
    if !text.is_empty() {
-        shape_segment(&mut ctx, 0, text, families(styles));
+        shape_segment(&mut ctx, base, text, families(styles));
    }
    track_and_space(&mut ctx);
@ -552,6 +556,7 @@ fn shape_segment(
    let buffer = rustybuzz::shape(font.rusty(), &ctx.tags, buffer);
    let infos = buffer.glyph_infos();
    let pos = buffer.glyph_positions();
    let ltr = ctx.dir.is_positive();
    // Collect the shaped glyphs, doing fallback and shaping parts again with
    // the next font if necessary.
@ -560,32 +565,36 @@ fn shape_segment(
        let info = &infos[i];
        let cluster = info.cluster as usize;
        if info.glyph_id != 0 {
        // Add the glyph to the shaped output.
-            // TODO: Don't ignore y_advance.
+        if info.glyph_id != 0 {
-            let (span, offset) = ctx.spans.span_at(ctx.base + cluster);
+            // Determine the text range of the glyph.
            let start = base + cluster;
            let end = base
                + if ltr { i.checked_add(1) } else { i.checked_sub(1) }
                    .and_then(|last| infos.get(last))
                    .map_or(text.len(), |info| info.cluster as usize);
            ctx.glyphs.push(ShapedGlyph {
                font: font.clone(),
                glyph_id: info.glyph_id as u16,
                // TODO: Don't ignore y_advance.
                x_advance: font.to_em(pos[i].x_advance),
                x_offset: font.to_em(pos[i].x_offset),
                y_offset: font.to_em(pos[i].y_offset),
-                cluster: base + cluster,
+                range: start..end,
                safe_to_break: !info.unsafe_to_break(),
                c: text[cluster..].chars().next().unwrap(),
-                span,
+                span: ctx.spans.span_at(start),
                offset,
            });
        } else {
            // Determine the source text range for the tofu sequence.
            let range = {
            // First, search for the end of the tofu sequence.
            let k = i;
            while infos.get(i + 1).map_or(false, |info| info.glyph_id == 0) {
                i += 1;
            }
-                // Then, determine the start and end text index.
+            // Then, determine the start and end text index for the tofu
            // sequence.
            //
            // Examples:
            // Everything is shown in visual order. Tofus are written as "_".
@ -603,25 +612,19 @@ fn shape_segment(
            // Glyphs:   E   C   _   _   A
            // Clusters: 8   6   4   2   0
            //                  k=2 i=3
-                let ltr = ctx.dir.is_positive();
+            let start = infos[if ltr { k } else { i }].cluster as usize;
-                let first = if ltr { k } else { i };
+            let end = if ltr { i.checked_add(1) } else { k.checked_sub(1) }
                let start = infos[first].cluster as usize;
                let last = if ltr { i.checked_add(1) } else { k.checked_sub(1) };
                let end = last
                .and_then(|last| infos.get(last))
                .map_or(text.len(), |info| info.cluster as usize);
                start..end
            };
            // Trim half-baked cluster.
-            let remove = base + range.start..base + range.end;
+            let remove = base + start..base + end;
-            while ctx.glyphs.last().map_or(false, |g| remove.contains(&g.cluster)) {
+            while ctx.glyphs.last().map_or(false, |g| remove.contains(&g.range.start)) {
                ctx.glyphs.pop();
            }
            // Recursively shape the tofu sequence with the next family.
-            shape_segment(ctx, base + range.start, &text[range], families.clone());
+            shape_segment(ctx, base + start, &text[start..end], families.clone());
        }
        i += 1;
@ -634,19 +637,18 @@ fn shape_segment(
 fn shape_tofus(ctx: &mut ShapingContext, base: usize, text: &str, font: Font) {
    let x_advance = font.advance(0).unwrap_or_default();
    for (cluster, c) in text.char_indices() {
-        let cluster = base + cluster;
+        let start = base + cluster;
-        let (span, offset) = ctx.spans.span_at(ctx.base + cluster);
+        let end = start + c.len_utf8();
        ctx.glyphs.push(ShapedGlyph {
            font: font.clone(),
            glyph_id: 0,
            x_advance,
            x_offset: Em::zero(),
            y_offset: Em::zero(),
-            cluster,
+            range: start..end,
            safe_to_break: true,
            c,
-            span,
+            span: ctx.spans.span_at(start),
            offset,
        });
    }
 }
@ -668,7 +670,10 @@ fn track_and_space(ctx: &mut ShapingContext) {
            glyph.x_advance = spacing.relative_to(glyph.x_advance);
        }
-        if glyphs.peek().map_or(false, |next| glyph.cluster != next.cluster) {
+        if glyphs
            .peek()
            .map_or(false, |next| glyph.range.start != next.range.start)
        {
            glyph.x_advance += tracking;
        }
    }
--- a/src/doc.rs
+++ b/src/doc.rs
@ -1,7 +1,8 @@
 //! Finished documents.
-use std::fmt::{self, Debug, Formatter, Write};
+use std::fmt::{self, Debug, Formatter};
 use std::num::NonZeroUsize;
 use std::ops::Range;
 use std::str::FromStr;
 use std::sync::Arc;
@ -114,23 +115,6 @@ impl Frame {
    pub fn items(&self) -> std::slice::Iter<'_, (Point, FrameItem)> {
        self.items.iter()
    }
    /// Approximately recover the text inside of the frame and its children.
    pub fn text(&self) -> EcoString {
        let mut text = EcoString::new();
        for (_, item) in self.items() {
            match item {
                FrameItem::Text(item) => {
                    for glyph in &item.glyphs {
                        text.push(glyph.c);
                    }
                }
                FrameItem::Group(group) => text.push_str(&group.frame.text()),
                _ => {}
            }
        }
        text
    }
 }
 /// Insert items and subframes.
@ -476,6 +460,8 @@ pub struct TextItem {
    pub fill: Paint,
    /// The natural language of the text.
    pub lang: Lang,
    /// The item's plain text.
    pub text: EcoString,
    /// The glyphs.
    pub glyphs: Vec<Glyph>,
 }
@ -489,19 +475,14 @@ impl TextItem {
 impl Debug for TextItem {
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
-        // This is only a rough approximation of the source text.
+        f.write_str("Text(")?;
-        f.write_str("Text(\"")?;
+        self.text.fmt(f)?;
-        for glyph in &self.glyphs {
+        f.write_str(")")
            for c in glyph.c.escape_debug() {
                f.write_char(c)?;
            }
        }
        f.write_str("\")")
    }
 }
 /// A glyph in a run of shaped text.
-#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
+#[derive(Debug, Clone, Eq, PartialEq, Hash)]
 pub struct Glyph {
    /// The glyph's index in the font.
    pub id: u16,
@ -509,12 +490,17 @@ pub struct Glyph {
    pub x_advance: Em,
    /// The horizontal offset of the glyph.
    pub x_offset: Em,
-    /// The first character of the glyph's cluster.
+    /// The range of the glyph in its item's text.
-    pub c: char,
+    pub range: Range<u16>,
    /// The source code location of the text.
-    pub span: Span,
+    pub span: (Span, u16),
-    /// The offset within the spanned text.
+}
-    pub offset: u16,
+
 impl Glyph {
    /// Returns the glyph's range within its item's text, with both bounds
    /// widened from `u16` to `usize`.
    pub fn range(&self) -> Range<usize> {
        let start = usize::from(self.range.start);
        let end = usize::from(self.range.end);
        start..end
    }
 }
 /// An identifier for a natural language.
--- a/src/export/pdf/font.rs
+++ b/src/export/pdf/font.rs
@ -1,13 +1,21 @@
 use std::collections::BTreeMap;
-use ecow::eco_format;
+use ecow::{eco_format, EcoString};
 use pdf_writer::types::{CidFontType, FontFlags, SystemInfo, UnicodeCmap};
 use pdf_writer::{Filter, Finish, Name, Rect, Str};
 use ttf_parser::{name_id, GlyphId, Tag};
 use unicode_general_category::GeneralCategory;
 use super::{deflate, EmExt, PdfContext, RefExt};
 use crate::util::SliceExt;
 const CMAP_NAME: Name = Name(b"Custom");
 const SYSTEM_INFO: SystemInfo = SystemInfo {
    registry: Str(b"Adobe"),
    ordering: Str(b"Identity"),
    supplement: 0,
 };
 /// Embed all used fonts into the PDF.
 #[tracing::instrument(skip_all)]
 pub fn write_fonts(ctx: &mut PdfContext) {
@ -19,7 +27,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
        let data_ref = ctx.alloc.bump();
        ctx.font_refs.push(type0_ref);
-        let glyphs = &ctx.glyph_sets[font];
+        let glyph_set = ctx.glyph_sets.get_mut(font).unwrap();
        let metrics = font.metrics();
        let ttf = font.ttf();
@ -29,12 +37,6 @@ pub fn write_fonts(ctx: &mut PdfContext) {
        let base_font = eco_format!("ABCDEF+{}", postscript_name);
        let base_font = Name(base_font.as_bytes());
        let cmap_name = Name(b"Custom");
        let system_info = SystemInfo {
            registry: Str(b"Adobe"),
            ordering: Str(b"Identity"),
            supplement: 0,
        };
        // Write the base font object referencing the CID font.
        ctx.writer
@ -59,7 +61,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
        let mut cid = ctx.writer.cid_font(cid_ref);
        cid.subtype(subtype);
        cid.base_font(base_font);
-        cid.system_info(system_info);
+        cid.system_info(SYSTEM_INFO);
        cid.font_descriptor(descriptor_ref);
        cid.default_width(0.0);
@ -70,7 +72,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
        // Extract the widths of all glyphs.
        let num_glyphs = ttf.number_of_glyphs();
        let mut widths = vec![0.0; num_glyphs as usize];
-        for &g in glyphs {
+        for &g in glyph_set.keys() {
            let x = ttf.glyph_hor_advance(GlyphId(g)).unwrap_or(0);
            widths[g as usize] = font.to_em(x).to_font_units();
        }
@ -130,42 +132,15 @@ pub fn write_fonts(ctx: &mut PdfContext) {
        font_descriptor.finish();
        // Compute a reverse mapping from glyphs to unicode.
        let cmap = {
            let mut mapping = BTreeMap::new();
            for subtable in
                ttf.tables().cmap.into_iter().flat_map(|table| table.subtables)
            {
                if subtable.is_unicode() {
                    subtable.codepoints(|n| {
                        if let Some(c) = std::char::from_u32(n) {
                            if let Some(GlyphId(g)) = ttf.glyph_index(c) {
                                if glyphs.contains(&g) {
                                    mapping.insert(g, c);
                                }
                            }
                        }
                    });
                }
            }
            let mut cmap = UnicodeCmap::new(cmap_name, system_info);
            for (g, c) in mapping {
                cmap.pair(g, c);
            }
            cmap
        };
        // Write the /ToUnicode character map, which maps glyph ids back to
        // unicode codepoints to enable copying out of the PDF.
-        ctx.writer
+        let cmap = create_cmap(ttf, glyph_set);
-            .cmap(cmap_ref, &deflate(&cmap.finish()))
+        ctx.writer.cmap(cmap_ref, &cmap.finish());
            .filter(Filter::FlateDecode);
        // Subset and write the font's bytes.
        let data = font.data();
        let subsetted = {
-            let glyphs: Vec<_> = glyphs.iter().copied().collect();
+            let glyphs: Vec<_> = glyph_set.keys().copied().collect();
            let profile = subsetter::Profile::pdf(&glyphs);
            subsetter::subset(data, font.index(), profile)
        };
@ -183,3 +158,44 @@ pub fn write_fonts(ctx: &mut PdfContext) {
        stream.finish();
    }
 }
 /// Build the /ToUnicode CMap for one font.
 ///
 /// `glyph_set` maps every used glyph id to a text representation taken from
 /// the document. Before the CMap is produced, entries whose glyph is reachable
 /// from a non-private-use codepoint in one of the font's unicode cmap
 /// subtables are overwritten in place with that codepoint. Glyphs whose text
 /// is empty are left out of the resulting CMap.
 fn create_cmap(
    ttf: &ttf_parser::Face,
    glyph_set: &mut BTreeMap<u16, EcoString>,
 ) -> UnicodeCmap {
    // A codepoint from the font's own cmap table takes precedence over the
    // text recorded from the document. Only glyphs without such a codepoint
    // (or with nothing but a private-use one), like the "Th" ligature in
    // Linux Libertine, keep the text of their first occurrence in the
    // document.
    let unicode_subtables = ttf
        .tables()
        .cmap
        .into_iter()
        .flat_map(|table| table.subtables)
        .filter(|subtable| subtable.is_unicode());

    for subtable in unicode_subtables {
        subtable.codepoints(|n| {
            let Some(c) = std::char::from_u32(n) else { return };

            // Private-use codepoints carry no portable meaning, so they never
            // replace the document text.
            let category = unicode_general_category::get_general_category(c);
            if category == GeneralCategory::PrivateUse {
                return;
            }

            let Some(GlyphId(g)) = ttf.glyph_index(c) else { return };

            // Only glyphs that are actually used get updated; if several
            // codepoints resolve to the same glyph, the one visited last wins.
            if let Some(text) = glyph_set.get_mut(&g) {
                *text = c.into();
            }
        });
    }

    // Emit the reverse mapping from glyph ids to unicode strings.
    let mut cmap = UnicodeCmap::new(CMAP_NAME, SYSTEM_INFO);
    let mapped = glyph_set.iter().filter(|(_, text)| !text.is_empty());
    for (&g, text) in mapped {
        cmap.pair_with_multiple(g, text.chars());
    }

    cmap
 }
--- a/src/export/pdf/mod.rs
+++ b/src/export/pdf/mod.rs
@ -6,9 +6,10 @@ mod outline;
 mod page;
 use std::cmp::Eq;
-use std::collections::{HashMap, HashSet};
+use std::collections::{BTreeMap, HashMap};
 use std::hash::Hash;
 use ecow::EcoString;
 use pdf_writer::types::Direction;
 use pdf_writer::{Finish, Name, PdfWriter, Ref, TextStr};
 use xmp_writer::{LangId, RenditionClass, XmpWriter};
@ -52,7 +53,13 @@ pub struct PdfContext<'a> {
    page_refs: Vec<Ref>,
    font_map: Remapper<Font>,
    image_map: Remapper<Image>,
-    glyph_sets: HashMap<Font, HashSet<u16>>,
+    /// For each font a mapping from used glyphs to their text representation.
    /// May contain multiple chars in case of ligatures or similar things. The
    /// same glyph can have a different text representation within one document,
    /// then we just save the first one. The resulting strings are used for the
    /// PDF's /ToUnicode map for glyphs that don't have an entry in the font's
    /// cmap. This is important for copy-paste and searching.
    glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
    languages: HashMap<Lang, usize>,
 }
--- a/src/export/pdf/page.rs
+++ b/src/export/pdf/page.rs
@ -364,11 +364,12 @@ fn write_group(ctx: &mut PageContext, pos: Point, group: &GroupItem) {
 /// Encode a text run into the content stream.
 fn write_text(ctx: &mut PageContext, x: f32, y: f32, text: &TextItem) {
    *ctx.parent.languages.entry(text.lang).or_insert(0) += text.glyphs.len();
-    ctx.parent
+
-        .glyph_sets
+    let glyph_set = ctx.parent.glyph_sets.entry(text.font.clone()).or_default();
-        .entry(text.font.clone())
+    for g in &text.glyphs {
-        .or_default()
+        let segment = &text.text[g.range()];
-        .extend(text.glyphs.iter().map(|g| g.id));
+        glyph_set.entry(g.id).or_insert_with(|| segment.into());
    }
    ctx.set_fill(&text.fill);
    ctx.set_font(&text.font, text.size);
--- a/src/ide/jump.rs
+++ b/src/ide/jump.rs
@ -67,7 +67,8 @@ pub fn jump_from_click(
            FrameItem::Text(text) => {
                for glyph in &text.glyphs {
-                    if glyph.span.is_detached() {
+                    let (span, span_offset) = glyph.span;
                    if span.is_detached() {
                        continue;
                    }
@ -77,13 +78,13 @@ pub fn jump_from_click(
                        Size::new(width, text.size),
                        click,
                    ) {
-                        let source = world.source(glyph.span.source());
+                        let source = world.source(span.source());
-                        let node = source.find(glyph.span)?;
+                        let node = source.find(span)?;
                        let pos = if node.kind() == SyntaxKind::Text {
                            let range = node.range();
-                            let mut offset = range.start + usize::from(glyph.offset);
+                            let mut offset = range.start + usize::from(span_offset);
                            if (click.x - pos.x) > width / 2.0 {
-                                offset += glyph.c.len_utf8();
+                                offset += glyph.range().len();
                            }
                            offset.min(range.end)
                        } else {
@ -150,7 +151,7 @@ fn find_in_frame(frame: &Frame, span: Span) -> Option<Point> {
        if let FrameItem::Text(text) = item {
            for glyph in &text.glyphs {
-                if glyph.span == span {
+                if glyph.span.0 == span {
                    return Some(pos);
                }
                pos.x += glyph.x_advance.at(text.size);
--- a/tests/ref/text/copy-paste.png
+++ b/tests/ref/text/copy-paste.png
--- a/tests/ref/text/shaping.png
+++ b/tests/ref/text/shaping.png
--- a/tests/src/tests.rs
+++ b/tests/src/tests.rs
@ -353,9 +353,18 @@ fn test(
    pdf_path: Option<&Path>,
    args: &Args,
 ) -> bool {
-    let name = src_path.strip_prefix(TYP_DIR).unwrap_or(src_path);
    // Guard holding the path of the test being run. When it is dropped while
    // the current thread is panicking, it prints that path so the failing
    // test file can be identified.
    struct PanicGuard<'a>(&'a Path);
    impl Drop for PanicGuard<'_> {
        fn drop(&mut self) {
            // Stay silent on a regular drop; only report during a panic.
            if std::thread::panicking() {
                println!("Panicked in {}", self.0.display());
            }
        }
    }
    let name = src_path.strip_prefix(TYP_DIR).unwrap_or(src_path);
    let text = fs::read_to_string(src_path).unwrap();
    let _guard = PanicGuard(name);
    let mut output = String::new();
    let mut ok = true;
@ -401,6 +410,7 @@ fn test(
                line,
                &mut rng,
            );
            ok &= part_ok;
            compare_ever |= compare_here;
            frames.extend(part_frames);
--- a/tests/typ/text/copy-paste.typ
+++ b/tests/typ/text/copy-paste.typ
@ -0,0 +1,8 @@
 // Test copy-paste and search in PDF with ligatures
 // and Arabic text. Must be tested manually!
 ---
 The after fira 🏳️‍🌈!
 #set text(lang: "ar", font: "Noto Sans Arabic")
 مرحبًا