Simplify linebreaking

Switches from an iterator to callback style, which significantly increases the clarity of the whole thing.
2025-07-12 07:02:53 +08:00 · 2023-10-28 22:02:22 +02:00 · 2023-10-28 22:02:22 +02:00 · 4c75adbb04
commit 4c75adbb04
parent 0f4f37cc09
3 changed files with 232 additions and 239 deletions
--- a/crates/typst-library/src/layout/par.rs
+++ b/crates/typst-library/src/layout/par.rs
@ -1,11 +1,3 @@
 use std::iter::Peekable;
 use icu_properties::{maps::CodePointMapData, LineBreak};
 use icu_provider::AsDeserializingBufferProvider;
 use icu_provider_adapters::fork::ForkByKeyProvider;
 use icu_provider_blob::BlobDataProvider;
 use icu_segmenter::{LineBreakIteratorUtf8, LineSegmenter};
 use once_cell::sync::Lazy;
 use typst::eval::Tracer;
 use typst::model::DelayedErrors;
 use unicode_bidi::{BidiInfo, Level as BidiLevel};
@ -16,8 +8,9 @@ use crate::layout::AlignElem;
 use crate::math::EquationElem;
 use crate::prelude::*;
 use crate::text::{
-    char_is_cjk_script, is_gb_style, shape, LinebreakElem, Quoter, Quotes, ShapedGlyph,
+    breakpoints, char_is_cjk_script, is_gb_style, shape, Breakpoint, LinebreakElem,
-    ShapedText, SmartquoteElem, SpaceElem, TextElem, BEGIN_PUNCT_PAT, END_PUNCT_PAT,
+    Quoter, Quotes, ShapedGlyph, ShapedText, SmartquoteElem, SpaceElem, TextElem,
    BEGIN_PUNCT_PAT, END_PUNCT_PAT,
 };
 /// Arranges text, spacing and inline-level elements into a paragraph.
@ -246,30 +239,32 @@ const OBJ_REPLACE: char = '\u{FFFC}'; // Object Replacement Character
 /// In many cases, we can directly reuse these results when constructing a line.
 /// Only when a line break falls onto a text index that is not safe-to-break per
 /// rustybuzz, we have to reshape that portion.
-struct Preparation<'a> {
+pub(crate) struct Preparation<'a> {
    /// Bidirectional text embedding levels for the paragraph.
-    bidi: BidiInfo<'a>,
+    pub bidi: BidiInfo<'a>,
    /// Text runs, spacing and layouted elements.
-    items: Vec<Item<'a>>,
+    pub items: Vec<Item<'a>>,
    /// The span mapper.
-    spans: SpanMapper,
+    pub spans: SpanMapper,
    /// The styles shared by all children.
-    styles: StyleChain<'a>,
+    pub styles: StyleChain<'a>,
    /// Whether to hyphenate if it's the same for all children.
-    hyphenate: Option<bool>,
+    pub hyphenate: Option<bool>,
    /// The text language if it's the same for all children.
-    lang: Option<Lang>,
+    pub lang: Option<Lang>,
    /// The paragraph's resolved horizontal alignment.
-    align: FixedAlign,
+    pub align: FixedAlign,
    /// Whether to justify the paragraph.
-    justify: bool,
+    pub justify: bool,
    /// The paragraph's hanging indent.
-    hang: Abs,
+    pub hang: Abs,
    /// The CJK-latin spacing.
    pub cjk_latin_spacing: bool,
 }
 impl<'a> Preparation<'a> {
    /// Find the item that contains the given `text_offset`.
-    fn find(&self, text_offset: usize) -> Option<&Item<'a>> {
+    pub fn find(&self, text_offset: usize) -> Option<&Item<'a>> {
        let mut cursor = 0;
        for item in &self.items {
            let end = cursor + item.len();
@ -284,7 +279,7 @@ impl<'a> Preparation<'a> {
    /// Return the items that intersect the given `text_range`.
    ///
    /// Returns the expanded range around the items and the items.
-    fn slice(&self, text_range: Range) -> (Range, &[Item<'a>]) {
+    pub fn slice(&self, text_range: Range) -> (Range, &[Item<'a>]) {
        let mut cursor = 0;
        let mut start = 0;
        let mut end = 0;
@ -342,7 +337,7 @@ impl Segment<'_> {
 /// A prepared item in a paragraph layout.
 #[derive(Debug)]
-enum Item<'a> {
+pub(crate) enum Item<'a> {
    /// A shaped text run with consistent style and direction.
    Text(ShapedText<'a>),
    /// Absolute spacing between other items.
@ -357,14 +352,14 @@ enum Item<'a> {
 impl<'a> Item<'a> {
    /// If this a text item, return it.
-    fn text(&self) -> Option<&ShapedText<'a>> {
+    pub fn text(&self) -> Option<&ShapedText<'a>> {
        match self {
            Self::Text(shaped) => Some(shaped),
            _ => None,
        }
    }
-    fn text_mut(&mut self) -> Option<&mut ShapedText<'a>> {
+    pub fn text_mut(&mut self) -> Option<&mut ShapedText<'a>> {
        match self {
            Self::Text(shaped) => Some(shaped),
            _ => None,
@ -372,7 +367,7 @@ impl<'a> Item<'a> {
    }
    /// The text length of the item.
-    fn len(&self) -> usize {
+    pub fn len(&self) -> usize {
        match self {
            Self::Text(shaped) => shaped.text.len(),
            Self::Absolute(_) | Self::Fractional(_, _) => SPACING_REPLACE.len_utf8(),
@ -382,7 +377,7 @@ impl<'a> Item<'a> {
    }
    /// The natural layouted width of the item.
-    fn width(&self) -> Abs {
+    pub fn width(&self) -> Abs {
        match self {
            Self::Text(shaped) => shaped.width,
            Self::Absolute(v) => *v,
@ -737,6 +732,7 @@ fn prepare<'a>(
        align: AlignElem::alignment_in(styles).resolve(styles).x,
        justify: ParElem::justify_in(styles),
        hang: ParElem::hanging_indent_in(styles),
        cjk_latin_spacing: TextElem::cjk_latin_spacing_in(styles).is_auto(),
    })
 }
@ -889,11 +885,10 @@ fn linebreak_simple<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<Line
    let mut lines = vec![];
    let mut start = 0;
    let mut last = None;
    let cjk_latin_spacing = TextElem::cjk_latin_spacing_in(p.styles).is_auto();
-    for (end, mandatory, hyphen) in breakpoints(p) {
+    breakpoints(p, |end, breakpoint| {
        // Compute the line and its size.
-        let mut attempt = line(vt, p, start..end, mandatory, hyphen, cjk_latin_spacing);
+        let mut attempt = line(vt, p, start..end, breakpoint);
        // If the line doesn't fit anymore, we push the last fitting attempt
        // into the stack and rebuild the line from the attempt's end. The
@ -902,21 +897,21 @@ fn linebreak_simple<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<Line
            if let Some((last_attempt, last_end)) = last.take() {
                lines.push(last_attempt);
                start = last_end;
-                attempt = line(vt, p, start..end, mandatory, hyphen, cjk_latin_spacing);
+                attempt = line(vt, p, start..end, breakpoint);
            }
        }
        // Finish the current line if there is a mandatory line break (i.e.
        // due to "\n") or if the line doesn't fit horizontally already
        // since then no shorter line will be possible.
-        if mandatory || !width.fits(attempt.width) {
+        if breakpoint == Breakpoint::Mandatory || !width.fits(attempt.width) {
            lines.push(attempt);
            start = end;
            last = None;
        } else {
            last = Some((attempt, end));
        }
-    }
+    });
    if let Some((line, _)) = last {
        lines.push(line);
@ -965,13 +960,12 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
    let mut table = vec![Entry {
        pred: 0,
        total: 0.0,
-        line: line(vt, p, 0..0, false, false, false),
+        line: line(vt, p, 0..0, Breakpoint::Mandatory),
    }];
    let em = TextElem::size_in(p.styles);
    let cjk_latin_spacing = TextElem::cjk_latin_spacing_in(p.styles).is_auto();
-    for (end, mandatory, hyphen) in breakpoints(p) {
+    breakpoints(p, |end, breakpoint| {
        let k = table.len();
        let eof = end == p.bidi.text.len();
        let mut best: Option<Entry> = None;
@ -981,7 +975,7 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
            // Layout the line.
            let start = pred.line.end;
-            let attempt = line(vt, p, start..end, mandatory, hyphen, cjk_latin_spacing);
+            let attempt = line(vt, p, start..end, breakpoint);
            // Determine how much the line's spaces would need to be stretched
            // to make it the desired width.
@ -1025,7 +1019,7 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
                    active += 1;
                }
                MAX_COST
-            } else if mandatory || eof {
+            } else if breakpoint == Breakpoint::Mandatory || eof {
                // This is a mandatory break and the line is not overfull, so
                // all breakpoints before this one become inactive since no line
                // can span above the mandatory break.
@ -1048,7 +1042,7 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
            }
            // Penalize hyphens.
-            if hyphen {
+            if breakpoint == Breakpoint::Hyphen {
                cost += HYPH_COST;
            }
@ -1073,7 +1067,7 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
        }
        table.push(best.unwrap());
-    }
+    });
    // Retrace the best path.
    let mut lines = vec![];
@ -1089,208 +1083,16 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
    lines
 }
 /// Generated by the following command:
 ///
 /// ```sh
 /// icu4x-datagen --locales full \
 ///               --format blob \
 ///               --keys-for-bin target/debug/typst \
 ///               --out crates/typst-library/assets/icudata.postcard  \
 ///               --overwrite
 /// ```
 ///
 /// Install icu_datagen with `cargo install icu_datagen`.
 static ICU_DATA: &[u8] = include_bytes!("../../assets/icudata.postcard");
 /// Generated by the following command:
 ///
 /// ```sh
 /// icu4x-datagen --locales zh ja \
 ///               --format blob \
 ///               --keys segmenter/line@1 \
 ///               --out crates/typst-library/assets/cj_linebreak_data.postcard \
 ///               --overwrite
 /// ```
 ///
 /// The used icu_datagen should be patched by
 /// https://github.com/peng1999/icu4x/commit/b9beb6cbf633d61fc3d7983e5baf7f4449fbfae5
 static CJ_LINEBREAK_DATA: &[u8] =
    include_bytes!("../../assets/cj_linebreak_data.postcard");
 /// The general line break segmenter.
 static SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
    let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
    LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap()
 });
 /// The line break segmenter for Chinese/Japanese text.
 static CJ_SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
    let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
    let cj_blob = BlobDataProvider::try_new_from_static_blob(CJ_LINEBREAK_DATA).unwrap();
    let cj_provider = ForkByKeyProvider::new(cj_blob, provider);
    LineSegmenter::try_new_lstm_with_buffer_provider(&cj_provider).unwrap()
 });
 /// The Unicode line break properties for each code point.
 static LINEBREAK_DATA: Lazy<CodePointMapData<LineBreak>> = Lazy::new(|| {
    let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
    let deser_provider = provider.as_deserializing();
    icu_properties::maps::load_line_break(&deser_provider).unwrap()
 });
 /// Determine all possible points in the text where lines can broken.
 ///
 /// Returns for each breakpoint the text index, whether the break is mandatory
 /// (after `\n`) and whether a hyphen is required (when breaking inside of a
 /// word).
 fn breakpoints<'a>(p: &'a Preparation<'a>) -> Breakpoints<'a> {
    let mut linebreaks = if matches!(p.lang, Some(Lang::CHINESE | Lang::JAPANESE)) {
        CJ_SEGMENTER.segment_str(p.bidi.text)
    } else {
        SEGMENTER.segment_str(p.bidi.text)
    };
    // The iterator always yields a breakpoint at index 0, we want to ignore it
    linebreaks.next();
    Breakpoints {
        p,
        linebreaks: linebreaks.peekable(),
        syllables: None,
        offset: 0,
        suffix: 0,
        end: 0,
        mandatory: false,
    }
 }
 /// An iterator over the line break opportunities in a text.
 struct Breakpoints<'a> {
    /// The paragraph's items.
    p: &'a Preparation<'a>,
    /// The inner iterator over the unicode line break opportunities.
    linebreaks: Peekable<LineBreakIteratorUtf8<'a, 'a>>,
    /// Iterator over syllables of the current word.
    syllables: Option<hypher::Syllables<'a>>,
    /// The current text offset.
    offset: usize,
    /// The trimmed end of the current word.
    suffix: usize,
    /// The untrimmed end of the current word.
    end: usize,
    /// Whether the break after the current word is mandatory.
    mandatory: bool,
 }
 impl Iterator for Breakpoints<'_> {
    type Item = (usize, bool, bool);
    fn next(&mut self) -> Option<Self::Item> {
        let lb = LINEBREAK_DATA.as_borrowed();
        // If we're currently in a hyphenated "word", process the next syllable.
        if let Some(syllable) = self.syllables.as_mut().and_then(Iterator::next) {
            self.offset += syllable.len();
            if self.offset == self.suffix {
                self.offset = self.end;
            }
            let hyphen = self.offset < self.end;
            if hyphen {
                // Filter out hyphenation opportunities where hyphenation was
                // actually disabled.
                if !self.hyphenate(self.offset) {
                    return self.next();
                }
                // Filter out forbidden hyphenation opportunities.
                if matches!(
                    syllable.chars().last().map(|c| lb.get(c)),
                    Some(LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ)
                ) {
                    return self.next();
                }
            }
            return Some((self.offset, self.mandatory && !hyphen, hyphen));
        }
        loop {
            // Get the next "word".
            self.end = self.linebreaks.next()?;
            self.mandatory = false;
            // Fix for: https://github.com/unicode-org/icu4x/issues/4146
            if let Some(c) = self.p.bidi.text[..self.end].chars().next_back() {
                if self.end == self.p.bidi.text.len() {
                    self.mandatory = true;
                    break;
                }
                self.mandatory = match lb.get(c) {
                    LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ => continue,
                    LineBreak::MandatoryBreak
                    | LineBreak::CarriageReturn
                    | LineBreak::LineFeed
                    | LineBreak::NextLine => true,
                    _ => false,
                };
            }
            break;
        }
        // Hyphenate the next word.
        if self.p.hyphenate != Some(false) {
            if let Some(lang) = self.lang(self.offset) {
                let word = &self.p.bidi.text[self.offset..self.end];
                let trimmed = word.trim_end_matches(|c: char| !c.is_alphabetic());
                if !trimmed.is_empty() {
                    self.suffix = self.offset + trimmed.len();
                    self.syllables = Some(hypher::hyphenate(trimmed, lang));
                    return self.next();
                }
            }
        }
        self.offset = self.end;
        Some((self.end, self.mandatory, false))
    }
 }
 impl Breakpoints<'_> {
    /// Whether hyphenation is enabled at the given offset.
    fn hyphenate(&self, offset: usize) -> bool {
        self.p
            .hyphenate
            .or_else(|| {
                let shaped = self.p.find(offset)?.text()?;
                Some(TextElem::hyphenate_in(shaped.styles))
            })
            .unwrap_or(false)
    }
    /// The text language at the given offset.
    fn lang(&self, offset: usize) -> Option<hypher::Lang> {
        let lang = self.p.lang.or_else(|| {
            let shaped = self.p.find(offset)?.text()?;
            Some(TextElem::lang_in(shaped.styles))
        })?;
        let bytes = lang.as_str().as_bytes().try_into().ok()?;
        hypher::Lang::from_iso(bytes)
    }
 }
 /// Create a line which spans the given range.
 fn line<'a>(
    vt: &Vt,
    p: &'a Preparation,
    mut range: Range,
-    mandatory: bool,
+    breakpoint: Breakpoint,
    hyphen: bool,
    cjk_latin_spacing: bool,
 ) -> Line<'a> {
    let end = range.end;
-    let mut justify = p.justify && end < p.bidi.text.len() && !mandatory;
+    let mut justify =
        p.justify && end < p.bidi.text.len() && breakpoint != Breakpoint::Mandatory;
    if range.is_empty() {
        return Line {
@ -1326,13 +1128,14 @@ fn line<'a>(
        // Deal with hyphens, dashes and justification.
        let shy = trimmed.ends_with('\u{ad}');
        let hyphen = breakpoint == Breakpoint::Hyphen;
        dash = hyphen || shy || trimmed.ends_with(['-', '–', '—']);
        justify |= text.ends_with('\u{2028}');
        // Deal with CJK punctuation at line ends.
        let gb_style = is_gb_style(shaped.lang, shaped.region);
        let maybe_adjust_last_glyph = trimmed.ends_with(END_PUNCT_PAT)
-            || (cjk_latin_spacing && trimmed.ends_with(char_is_cjk_script));
+            || (p.cjk_latin_spacing && trimmed.ends_with(char_is_cjk_script));
        // Usually, we don't want to shape an empty string because:
        // - We don't want the height of trimmed whitespace in a different
@ -1359,7 +1162,7 @@ fn line<'a>(
                        let punct = reshaped.glyphs.to_mut().last_mut().unwrap();
                        punct.shrink_right(shrink_amount);
                        reshaped.width -= shrink_amount.at(reshaped.size);
-                    } else if cjk_latin_spacing
+                    } else if p.cjk_latin_spacing
                        && last_glyph.is_cjk_script()
                        && (last_glyph.x_advance - last_glyph.x_offset) > Em::one()
                    {
@ -1385,7 +1188,7 @@ fn line<'a>(
    // Deal with CJK characters at line starts.
    let text = &p.bidi.text[range.start..end];
    let maybe_adjust_first_glyph = text.starts_with(BEGIN_PUNCT_PAT)
-        || (cjk_latin_spacing && text.starts_with(char_is_cjk_script));
+        || (p.cjk_latin_spacing && text.starts_with(char_is_cjk_script));
    // Reshape the start item if it's split in half.
    let mut first = None;
@ -1419,7 +1222,7 @@ fn line<'a>(
                    let amount_abs = shrink_amount.at(reshaped.size);
                    reshaped.width -= amount_abs;
                    width -= amount_abs;
-                } else if cjk_latin_spacing
+                } else if p.cjk_latin_spacing
                    && first_glyph.is_cjk_script()
                    && first_glyph.x_offset > Em::zero()
                {
--- a/crates/typst-library/src/text/linebreak.rs
+++ b/crates/typst-library/src/text/linebreak.rs
@ -0,0 +1,188 @@
 use icu_properties::{maps::CodePointMapData, LineBreak};
 use icu_provider::AsDeserializingBufferProvider;
 use icu_provider_adapters::fork::ForkByKeyProvider;
 use icu_provider_blob::BlobDataProvider;
 use icu_segmenter::LineSegmenter;
 use once_cell::sync::Lazy;
 use typst::doc::Lang;
 use super::TextElem;
 use crate::layout::Preparation;
 /// Generated by the following command:
 ///
 /// ```sh
 /// icu4x-datagen --locales full \
 ///               --format blob \
 ///               --keys-for-bin target/debug/typst \
 ///               --out crates/typst-library/assets/icudata.postcard  \
 ///               --overwrite
 /// ```
 ///
 /// Install icu_datagen with `cargo install icu_datagen`.
 static ICU_DATA: &[u8] = include_bytes!("../../assets/icudata.postcard");
 /// Generated by the following command:
 ///
 /// ```sh
 /// icu4x-datagen --locales zh ja \
 ///               --format blob \
 ///               --keys segmenter/line@1 \
 ///               --out crates/typst-library/assets/cj_linebreak_data.postcard \
 ///               --overwrite
 /// ```
 ///
 /// The used icu_datagen should be patched by
 /// https://github.com/peng1999/icu4x/commit/b9beb6cbf633d61fc3d7983e5baf7f4449fbfae5
 static CJ_LINEBREAK_DATA: &[u8] =
    include_bytes!("../../assets/cj_linebreak_data.postcard");
 /// The general line break segmenter.
 static SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
    let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
    LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap()
 });
 /// The line break segmenter for Chinese/Japanese text.
 static CJ_SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
    let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
    let cj_blob = BlobDataProvider::try_new_from_static_blob(CJ_LINEBREAK_DATA).unwrap();
    let cj_provider = ForkByKeyProvider::new(cj_blob, provider);
    LineSegmenter::try_new_lstm_with_buffer_provider(&cj_provider).unwrap()
 });
 /// The Unicode line break properties for each code point.
 static LINEBREAK_DATA: Lazy<CodePointMapData<LineBreak>> = Lazy::new(|| {
    let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
    let deser_provider = provider.as_deserializing();
    icu_properties::maps::load_line_break(&deser_provider).unwrap()
 });
 /// A line break opportunity.
 #[derive(Debug, Copy, Clone, Eq, PartialEq)]
 pub(crate) enum Breakpoint {
    /// Just a normal opportunity (e.g. after a space).
    Normal,
    /// A mandatory breakpoint (after '\n' or at the end of the text).
    Mandatory,
    /// An opportunity for hyphenating.
    Hyphen,
 }
 /// Calls `f` for all possible points in the text where lines can broken.
 ///
 /// Yields for each breakpoint the text index, whether the break is mandatory
 /// (after `\n`) and whether a hyphen is required (when breaking inside of a
 /// word).
 ///
 /// This is an internal instead of an external iterator because it makes the
 /// code much simpler and the consumers of this function don't need the
 /// composability and flexibility of external iteration anyway.
 pub(crate) fn breakpoints<'a>(
    p: &'a Preparation<'a>,
    mut f: impl FnMut(usize, Breakpoint),
 ) {
    let lb = LINEBREAK_DATA.as_borrowed();
    let segmenter = match p.lang {
        Some(Lang::CHINESE | Lang::JAPANESE) => &CJ_SEGMENTER,
        _ => &SEGMENTER,
    };
    let hyphenate = p.hyphenate != Some(false);
    let mut last = 0;
    // Walk over all UAX #14 linebreak opportunities.
    for point in segmenter.segment_str(p.bidi.text) {
        // Skip breakpoint if there is no char before it. icu4x generates one
        // at offset 0, but we don't want it.
        let Some(c) = p.bidi.text[..point].chars().next_back() else { continue };
        // Find out whether the last break was mandatory by checking against
        // rules LB4 and LB5, special-casing the end of text according to LB3.
        // See also: https://docs.rs/icu_segmenter/latest/icu_segmenter/struct.LineSegmenter.html
        let breakpoint = if point == p.bidi.text.len() {
            Breakpoint::Mandatory
        } else {
            match lb.get(c) {
                // Fix for: https://github.com/unicode-org/icu4x/issues/4146
                LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ => continue,
                LineBreak::MandatoryBreak
                | LineBreak::CarriageReturn
                | LineBreak::LineFeed
                | LineBreak::NextLine => Breakpoint::Mandatory,
                _ => Breakpoint::Normal,
            }
        };
        // Hyphenate between the last and current breakpoint.
        'hyphenate: {
            if !hyphenate {
                break 'hyphenate;
            }
            // Extract a hyphenatable "word".
            let word =
                &p.bidi.text[last..point].trim_end_matches(|c: char| !c.is_alphabetic());
            if word.is_empty() {
                break 'hyphenate;
            }
            let end = last + word.len();
            let mut offset = last;
            // Determine the language to hyphenate this word in.
            let Some(lang) = lang_at(p, last) else { break 'hyphenate };
            for syllable in hypher::hyphenate(word, lang) {
                // Don't hyphenate after the final syllable.
                offset += syllable.len();
                if offset == end {
                    continue;
                }
                // Filter out hyphenation opportunities where hyphenation was
                // actually disabled.
                if !hyphenate_at(p, offset) {
                    continue;
                }
                // Filter out forbidden hyphenation opportunities.
                if matches!(
                    syllable.chars().next_back().map(|c| lb.get(c)),
                    Some(LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ)
                ) {
                    continue;
                }
                // Call `f` for the word-internal hyphenation opportunity.
                f(offset, Breakpoint::Hyphen);
            }
        }
        // Call `f` for the UAX #14 break opportunity.
        f(point, breakpoint);
        last = point;
    }
 }
 /// Whether hyphenation is enabled at the given offset.
 fn hyphenate_at(p: &Preparation, offset: usize) -> bool {
    p.hyphenate
        .or_else(|| {
            let shaped = p.find(offset)?.text()?;
            Some(TextElem::hyphenate_in(shaped.styles))
        })
        .unwrap_or(false)
 }
 /// The text language at the given offset.
 fn lang_at(p: &Preparation, offset: usize) -> Option<hypher::Lang> {
    let lang = p.lang.or_else(|| {
        let shaped = p.find(offset)?.text()?;
        Some(TextElem::lang_in(shaped.styles))
    })?;
    let bytes = lang.as_str().as_bytes().try_into().ok()?;
    hypher::Lang::from_iso(bytes)
 }
--- a/crates/typst-library/src/text/mod.rs
+++ b/crates/typst-library/src/text/mod.rs
@ -1,6 +1,7 @@
 //! Text handling.
 mod deco;
 mod linebreak;
 mod misc;
 mod quote;
 mod quotes;
@ -9,6 +10,7 @@ mod shaping;
 mod shift;
 pub use self::deco::*;
 pub(crate) use self::linebreak::*;
 pub use self::misc::*;
 pub use self::quote::*;
 pub use self::quotes::*;