diff --git a/crates/typst-library/src/layout/par.rs b/crates/typst-library/src/layout/par.rs index 1cd749c83..f23652ca8 100644 --- a/crates/typst-library/src/layout/par.rs +++ b/crates/typst-library/src/layout/par.rs @@ -1,11 +1,3 @@ -use std::iter::Peekable; - -use icu_properties::{maps::CodePointMapData, LineBreak}; -use icu_provider::AsDeserializingBufferProvider; -use icu_provider_adapters::fork::ForkByKeyProvider; -use icu_provider_blob::BlobDataProvider; -use icu_segmenter::{LineBreakIteratorUtf8, LineSegmenter}; -use once_cell::sync::Lazy; use typst::eval::Tracer; use typst::model::DelayedErrors; use unicode_bidi::{BidiInfo, Level as BidiLevel}; @@ -16,8 +8,9 @@ use crate::layout::AlignElem; use crate::math::EquationElem; use crate::prelude::*; use crate::text::{ - char_is_cjk_script, is_gb_style, shape, LinebreakElem, Quoter, Quotes, ShapedGlyph, - ShapedText, SmartquoteElem, SpaceElem, TextElem, BEGIN_PUNCT_PAT, END_PUNCT_PAT, + breakpoints, char_is_cjk_script, is_gb_style, shape, Breakpoint, LinebreakElem, + Quoter, Quotes, ShapedGlyph, ShapedText, SmartquoteElem, SpaceElem, TextElem, + BEGIN_PUNCT_PAT, END_PUNCT_PAT, }; /// Arranges text, spacing and inline-level elements into a paragraph. @@ -246,30 +239,32 @@ const OBJ_REPLACE: char = '\u{FFFC}'; // Object Replacement Character /// In many cases, we can directly reuse these results when constructing a line. /// Only when a line break falls onto a text index that is not safe-to-break per /// rustybuzz, we have to reshape that portion. -struct Preparation<'a> { +pub(crate) struct Preparation<'a> { /// Bidirectional text embedding levels for the paragraph. - bidi: BidiInfo<'a>, + pub bidi: BidiInfo<'a>, /// Text runs, spacing and layouted elements. - items: Vec>, + pub items: Vec>, /// The span mapper. - spans: SpanMapper, + pub spans: SpanMapper, /// The styles shared by all children. - styles: StyleChain<'a>, + pub styles: StyleChain<'a>, /// Whether to hyphenate if it's the same for all children. 
- hyphenate: Option, + pub hyphenate: Option, /// The text language if it's the same for all children. - lang: Option, + pub lang: Option, /// The paragraph's resolved horizontal alignment. - align: FixedAlign, + pub align: FixedAlign, /// Whether to justify the paragraph. - justify: bool, + pub justify: bool, /// The paragraph's hanging indent. - hang: Abs, + pub hang: Abs, + /// The CJK-latin spacing. + pub cjk_latin_spacing: bool, } impl<'a> Preparation<'a> { /// Find the item that contains the given `text_offset`. - fn find(&self, text_offset: usize) -> Option<&Item<'a>> { + pub fn find(&self, text_offset: usize) -> Option<&Item<'a>> { let mut cursor = 0; for item in &self.items { let end = cursor + item.len(); @@ -284,7 +279,7 @@ impl<'a> Preparation<'a> { /// Return the items that intersect the given `text_range`. /// /// Returns the expanded range around the items and the items. - fn slice(&self, text_range: Range) -> (Range, &[Item<'a>]) { + pub fn slice(&self, text_range: Range) -> (Range, &[Item<'a>]) { let mut cursor = 0; let mut start = 0; let mut end = 0; @@ -342,7 +337,7 @@ impl Segment<'_> { /// A prepared item in a paragraph layout. #[derive(Debug)] -enum Item<'a> { +pub(crate) enum Item<'a> { /// A shaped text run with consistent style and direction. Text(ShapedText<'a>), /// Absolute spacing between other items. @@ -357,14 +352,14 @@ enum Item<'a> { impl<'a> Item<'a> { /// If this a text item, return it. - fn text(&self) -> Option<&ShapedText<'a>> { + pub fn text(&self) -> Option<&ShapedText<'a>> { match self { Self::Text(shaped) => Some(shaped), _ => None, } } - fn text_mut(&mut self) -> Option<&mut ShapedText<'a>> { + pub fn text_mut(&mut self) -> Option<&mut ShapedText<'a>> { match self { Self::Text(shaped) => Some(shaped), _ => None, @@ -372,7 +367,7 @@ impl<'a> Item<'a> { } /// The text length of the item. 
- fn len(&self) -> usize { + pub fn len(&self) -> usize { match self { Self::Text(shaped) => shaped.text.len(), Self::Absolute(_) | Self::Fractional(_, _) => SPACING_REPLACE.len_utf8(), @@ -382,7 +377,7 @@ impl<'a> Item<'a> { } /// The natural layouted width of the item. - fn width(&self) -> Abs { + pub fn width(&self) -> Abs { match self { Self::Text(shaped) => shaped.width, Self::Absolute(v) => *v, @@ -737,6 +732,7 @@ fn prepare<'a>( align: AlignElem::alignment_in(styles).resolve(styles).x, justify: ParElem::justify_in(styles), hang: ParElem::hanging_indent_in(styles), + cjk_latin_spacing: TextElem::cjk_latin_spacing_in(styles).is_auto(), }) } @@ -889,11 +885,10 @@ fn linebreak_simple<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec = None; @@ -981,7 +975,7 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec = Lazy::new(|| { - let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap(); - LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap() -}); - -/// The line break segmenter for Chinese/Japanese text. -static CJ_SEGMENTER: Lazy = Lazy::new(|| { - let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap(); - let cj_blob = BlobDataProvider::try_new_from_static_blob(CJ_LINEBREAK_DATA).unwrap(); - let cj_provider = ForkByKeyProvider::new(cj_blob, provider); - LineSegmenter::try_new_lstm_with_buffer_provider(&cj_provider).unwrap() -}); - -/// The Unicode line break properties for each code point. 
-static LINEBREAK_DATA: Lazy> = Lazy::new(|| { - let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap(); - let deser_provider = provider.as_deserializing(); - icu_properties::maps::load_line_break(&deser_provider).unwrap() -}); - -/// Determine all possible points in the text where lines can broken. -/// -/// Returns for each breakpoint the text index, whether the break is mandatory -/// (after `\n`) and whether a hyphen is required (when breaking inside of a -/// word). -fn breakpoints<'a>(p: &'a Preparation<'a>) -> Breakpoints<'a> { - let mut linebreaks = if matches!(p.lang, Some(Lang::CHINESE | Lang::JAPANESE)) { - CJ_SEGMENTER.segment_str(p.bidi.text) - } else { - SEGMENTER.segment_str(p.bidi.text) - }; - // The iterator always yields a breakpoint at index 0, we want to ignore it - linebreaks.next(); - Breakpoints { - p, - linebreaks: linebreaks.peekable(), - syllables: None, - offset: 0, - suffix: 0, - end: 0, - mandatory: false, - } -} - -/// An iterator over the line break opportunities in a text. -struct Breakpoints<'a> { - /// The paragraph's items. - p: &'a Preparation<'a>, - /// The inner iterator over the unicode line break opportunities. - linebreaks: Peekable>, - /// Iterator over syllables of the current word. - syllables: Option>, - /// The current text offset. - offset: usize, - /// The trimmed end of the current word. - suffix: usize, - /// The untrimmed end of the current word. - end: usize, - /// Whether the break after the current word is mandatory. - mandatory: bool, -} - -impl Iterator for Breakpoints<'_> { - type Item = (usize, bool, bool); - - fn next(&mut self) -> Option { - let lb = LINEBREAK_DATA.as_borrowed(); - - // If we're currently in a hyphenated "word", process the next syllable. 
- if let Some(syllable) = self.syllables.as_mut().and_then(Iterator::next) { - self.offset += syllable.len(); - if self.offset == self.suffix { - self.offset = self.end; - } - - let hyphen = self.offset < self.end; - if hyphen { - // Filter out hyphenation opportunities where hyphenation was - // actually disabled. - if !self.hyphenate(self.offset) { - return self.next(); - } - - // Filter out forbidden hyphenation opportunities. - if matches!( - syllable.chars().last().map(|c| lb.get(c)), - Some(LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ) - ) { - return self.next(); - } - } - - return Some((self.offset, self.mandatory && !hyphen, hyphen)); - } - - loop { - // Get the next "word". - self.end = self.linebreaks.next()?; - self.mandatory = false; - - // Fix for: https://github.com/unicode-org/icu4x/issues/4146 - if let Some(c) = self.p.bidi.text[..self.end].chars().next_back() { - if self.end == self.p.bidi.text.len() { - self.mandatory = true; - break; - } - - self.mandatory = match lb.get(c) { - LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ => continue, - LineBreak::MandatoryBreak - | LineBreak::CarriageReturn - | LineBreak::LineFeed - | LineBreak::NextLine => true, - _ => false, - }; - } - - break; - } - - // Hyphenate the next word. - if self.p.hyphenate != Some(false) { - if let Some(lang) = self.lang(self.offset) { - let word = &self.p.bidi.text[self.offset..self.end]; - let trimmed = word.trim_end_matches(|c: char| !c.is_alphabetic()); - if !trimmed.is_empty() { - self.suffix = self.offset + trimmed.len(); - self.syllables = Some(hypher::hyphenate(trimmed, lang)); - return self.next(); - } - } - } - - self.offset = self.end; - Some((self.end, self.mandatory, false)) - } -} - -impl Breakpoints<'_> { - /// Whether hyphenation is enabled at the given offset. 
- fn hyphenate(&self, offset: usize) -> bool { - self.p - .hyphenate - .or_else(|| { - let shaped = self.p.find(offset)?.text()?; - Some(TextElem::hyphenate_in(shaped.styles)) - }) - .unwrap_or(false) - } - - /// The text language at the given offset. - fn lang(&self, offset: usize) -> Option { - let lang = self.p.lang.or_else(|| { - let shaped = self.p.find(offset)?.text()?; - Some(TextElem::lang_in(shaped.styles)) - })?; - - let bytes = lang.as_str().as_bytes().try_into().ok()?; - hypher::Lang::from_iso(bytes) - } -} - /// Create a line which spans the given range. fn line<'a>( vt: &Vt, p: &'a Preparation, mut range: Range, - mandatory: bool, - hyphen: bool, - cjk_latin_spacing: bool, + breakpoint: Breakpoint, ) -> Line<'a> { let end = range.end; - let mut justify = p.justify && end < p.bidi.text.len() && !mandatory; + let mut justify = + p.justify && end < p.bidi.text.len() && breakpoint != Breakpoint::Mandatory; if range.is_empty() { return Line { @@ -1326,13 +1128,14 @@ fn line<'a>( // Deal with hyphens, dashes and justification. let shy = trimmed.ends_with('\u{ad}'); + let hyphen = breakpoint == Breakpoint::Hyphen; dash = hyphen || shy || trimmed.ends_with(['-', '–', '—']); justify |= text.ends_with('\u{2028}'); // Deal with CJK punctuation at line ends. 
let gb_style = is_gb_style(shaped.lang, shaped.region); let maybe_adjust_last_glyph = trimmed.ends_with(END_PUNCT_PAT) - || (cjk_latin_spacing && trimmed.ends_with(char_is_cjk_script)); + || (p.cjk_latin_spacing && trimmed.ends_with(char_is_cjk_script)); // Usually, we don't want to shape an empty string because: // - We don't want the height of trimmed whitespace in a different @@ -1359,7 +1162,7 @@ fn line<'a>( let punct = reshaped.glyphs.to_mut().last_mut().unwrap(); punct.shrink_right(shrink_amount); reshaped.width -= shrink_amount.at(reshaped.size); - } else if cjk_latin_spacing + } else if p.cjk_latin_spacing && last_glyph.is_cjk_script() && (last_glyph.x_advance - last_glyph.x_offset) > Em::one() { @@ -1385,7 +1188,7 @@ fn line<'a>( // Deal with CJK characters at line starts. let text = &p.bidi.text[range.start..end]; let maybe_adjust_first_glyph = text.starts_with(BEGIN_PUNCT_PAT) - || (cjk_latin_spacing && text.starts_with(char_is_cjk_script)); + || (p.cjk_latin_spacing && text.starts_with(char_is_cjk_script)); // Reshape the start item if it's split in half. 
let mut first = None; @@ -1419,7 +1222,7 @@ fn line<'a>( let amount_abs = shrink_amount.at(reshaped.size); reshaped.width -= amount_abs; width -= amount_abs; - } else if cjk_latin_spacing + } else if p.cjk_latin_spacing && first_glyph.is_cjk_script() && first_glyph.x_offset > Em::zero() { diff --git a/crates/typst-library/src/text/linebreak.rs b/crates/typst-library/src/text/linebreak.rs new file mode 100644 index 000000000..a026df5da --- /dev/null +++ b/crates/typst-library/src/text/linebreak.rs @@ -0,0 +1,188 @@ +use icu_properties::{maps::CodePointMapData, LineBreak}; +use icu_provider::AsDeserializingBufferProvider; +use icu_provider_adapters::fork::ForkByKeyProvider; +use icu_provider_blob::BlobDataProvider; +use icu_segmenter::LineSegmenter; +use once_cell::sync::Lazy; +use typst::doc::Lang; + +use super::TextElem; +use crate::layout::Preparation; + +/// Generated by the following command: +/// +/// ```sh +/// icu4x-datagen --locales full \ +/// --format blob \ +/// --keys-for-bin target/debug/typst \ +/// --out crates/typst-library/assets/icudata.postcard \ +/// --overwrite +/// ``` +/// +/// Install icu_datagen with `cargo install icu_datagen`. +static ICU_DATA: &[u8] = include_bytes!("../../assets/icudata.postcard"); + +/// Generated by the following command: +/// +/// ```sh +/// icu4x-datagen --locales zh ja \ +/// --format blob \ +/// --keys segmenter/line@1 \ +/// --out crates/typst-library/assets/cj_linebreak_data.postcard \ +/// --overwrite +/// ``` +/// +/// The used icu_datagen should be patched by +/// https://github.com/peng1999/icu4x/commit/b9beb6cbf633d61fc3d7983e5baf7f4449fbfae5 +static CJ_LINEBREAK_DATA: &[u8] = + include_bytes!("../../assets/cj_linebreak_data.postcard"); + +/// The general line break segmenter. 
+static SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
+    let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
+    LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap()
+});
+
+/// The line break segmenter for Chinese/Japanese text.
+static CJ_SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
+    let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
+    let cj_blob = BlobDataProvider::try_new_from_static_blob(CJ_LINEBREAK_DATA).unwrap();
+    let cj_provider = ForkByKeyProvider::new(cj_blob, provider);
+    LineSegmenter::try_new_lstm_with_buffer_provider(&cj_provider).unwrap()
+});
+
+/// The Unicode line break properties for each code point.
+static LINEBREAK_DATA: Lazy<CodePointMapData<LineBreak>> = Lazy::new(|| {
+    let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
+    let deser_provider = provider.as_deserializing();
+    icu_properties::maps::load_line_break(&deser_provider).unwrap()
+});
+
+/// A line break opportunity.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub(crate) enum Breakpoint {
+    /// Just a normal opportunity (e.g. after a space).
+    Normal,
+    /// A mandatory breakpoint (after '\n' or at the end of the text).
+    Mandatory,
+    /// An opportunity for hyphenating.
+    Hyphen,
+}
+
+/// Calls `f` for all possible points in the text where lines can be broken.
+///
+/// `f` receives, for each opportunity, the byte index into `p.bidi.text` and a
+/// [`Breakpoint`] telling whether the break is mandatory (e.g. after `\n` or
+/// at the end of the text) or needs a hyphen (when breaking inside of a word).
+///
+/// This is an internal instead of an external iterator because it makes the
+/// code much simpler and the consumers of this function don't need the
+/// composability and flexibility of external iteration anyway.
+pub(crate) fn breakpoints<'a>(
+    p: &'a Preparation<'a>,
+    mut f: impl FnMut(usize, Breakpoint),
+) {
+    let lb = LINEBREAK_DATA.as_borrowed();
+    let segmenter = match p.lang {
+        Some(Lang::CHINESE | Lang::JAPANESE) => &CJ_SEGMENTER,
+        _ => &SEGMENTER,
+    };
+
+    let hyphenate = p.hyphenate != Some(false);
+    let mut last = 0;
+
+    // Walk over all UAX #14 linebreak opportunities.
+    for point in segmenter.segment_str(p.bidi.text) {
+        // Skip breakpoint if there is no char before it. icu4x generates one
+        // at offset 0, but we don't want it.
+        let Some(c) = p.bidi.text[..point].chars().next_back() else { continue };
+
+        // Find out whether the last break was mandatory by checking against
+        // rules LB4 and LB5, special-casing the end of text according to LB3.
+        // See also: https://docs.rs/icu_segmenter/latest/icu_segmenter/struct.LineSegmenter.html
+        let breakpoint = if point == p.bidi.text.len() {
+            Breakpoint::Mandatory
+        } else {
+            match lb.get(c) {
+                // Fix for: https://github.com/unicode-org/icu4x/issues/4146
+                LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ => continue,
+                LineBreak::MandatoryBreak
+                | LineBreak::CarriageReturn
+                | LineBreak::LineFeed
+                | LineBreak::NextLine => Breakpoint::Mandatory,
+                _ => Breakpoint::Normal,
+            }
+        };
+
+        // Hyphenate between the last and current breakpoint.
+        'hyphenate: {
+            if !hyphenate {
+                break 'hyphenate;
+            }
+
+            // Extract a hyphenatable "word".
+            let word =
+                p.bidi.text[last..point].trim_end_matches(|c: char| !c.is_alphabetic());
+            if word.is_empty() {
+                break 'hyphenate;
+            }
+
+            let end = last + word.len();
+            let mut offset = last;
+
+            // Determine the language to hyphenate this word in.
+            let Some(lang) = lang_at(p, last) else { break 'hyphenate };
+
+            for syllable in hypher::hyphenate(word, lang) {
+                // Don't hyphenate after the final syllable.
+                offset += syllable.len();
+                if offset == end {
+                    continue;
+                }
+
+                // Filter out hyphenation opportunities where hyphenation was
+                // actually disabled.
+                if !hyphenate_at(p, offset) {
+                    continue;
+                }
+
+                // Filter out forbidden hyphenation opportunities.
+                if matches!(
+                    syllable.chars().next_back().map(|c| lb.get(c)),
+                    Some(LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ)
+                ) {
+                    continue;
+                }
+
+                // Call `f` for the word-internal hyphenation opportunity.
+                f(offset, Breakpoint::Hyphen);
+            }
+        }
+
+        // Call `f` for the UAX #14 break opportunity.
+        f(point, breakpoint);
+
+        last = point;
+    }
+}
+
+/// Whether hyphenation is enabled at the given offset (`false` if it can't be resolved).
+fn hyphenate_at(p: &Preparation, offset: usize) -> bool {
+    p.hyphenate
+        .or_else(|| {
+            let shaped = p.find(offset)?.text()?;
+            Some(TextElem::hyphenate_in(shaped.styles))
+        })
+        .unwrap_or(false)
+}
+
+/// The `hypher` language for the text at the given offset, if one can be resolved.
+fn lang_at(p: &Preparation, offset: usize) -> Option<hypher::Lang> {
+    let lang = p.lang.or_else(|| {
+        let shaped = p.find(offset)?.text()?;
+        Some(TextElem::lang_in(shaped.styles))
+    })?;
+
+    let bytes = lang.as_str().as_bytes().try_into().ok()?;
+    hypher::Lang::from_iso(bytes)
+}
diff --git a/crates/typst-library/src/text/mod.rs b/crates/typst-library/src/text/mod.rs index 4d2f5c63d..bfa775869 100644 --- a/crates/typst-library/src/text/mod.rs +++ b/crates/typst-library/src/text/mod.rs @@ -1,6 +1,7 @@ //! Text handling. mod deco; +mod linebreak; mod misc; mod quote; mod quotes; @@ -9,6 +10,7 @@ mod shaping; mod shift; pub use self::deco::*; +pub(crate) use self::linebreak::*; pub use self::misc::*; pub use self::quote::*; pub use self::quotes::*;