mirror of
https://github.com/typst/typst
synced 2025-05-15 01:25:28 +08:00
Simplify linebreaking
Switches from an iterator to callback style, which significantly increases the clarity of the whole thing.
This commit is contained in:
parent
0f4f37cc09
commit
4c75adbb04
@ -1,11 +1,3 @@
|
|||||||
use std::iter::Peekable;
|
|
||||||
|
|
||||||
use icu_properties::{maps::CodePointMapData, LineBreak};
|
|
||||||
use icu_provider::AsDeserializingBufferProvider;
|
|
||||||
use icu_provider_adapters::fork::ForkByKeyProvider;
|
|
||||||
use icu_provider_blob::BlobDataProvider;
|
|
||||||
use icu_segmenter::{LineBreakIteratorUtf8, LineSegmenter};
|
|
||||||
use once_cell::sync::Lazy;
|
|
||||||
use typst::eval::Tracer;
|
use typst::eval::Tracer;
|
||||||
use typst::model::DelayedErrors;
|
use typst::model::DelayedErrors;
|
||||||
use unicode_bidi::{BidiInfo, Level as BidiLevel};
|
use unicode_bidi::{BidiInfo, Level as BidiLevel};
|
||||||
@ -16,8 +8,9 @@ use crate::layout::AlignElem;
|
|||||||
use crate::math::EquationElem;
|
use crate::math::EquationElem;
|
||||||
use crate::prelude::*;
|
use crate::prelude::*;
|
||||||
use crate::text::{
|
use crate::text::{
|
||||||
char_is_cjk_script, is_gb_style, shape, LinebreakElem, Quoter, Quotes, ShapedGlyph,
|
breakpoints, char_is_cjk_script, is_gb_style, shape, Breakpoint, LinebreakElem,
|
||||||
ShapedText, SmartquoteElem, SpaceElem, TextElem, BEGIN_PUNCT_PAT, END_PUNCT_PAT,
|
Quoter, Quotes, ShapedGlyph, ShapedText, SmartquoteElem, SpaceElem, TextElem,
|
||||||
|
BEGIN_PUNCT_PAT, END_PUNCT_PAT,
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Arranges text, spacing and inline-level elements into a paragraph.
|
/// Arranges text, spacing and inline-level elements into a paragraph.
|
||||||
@ -246,30 +239,32 @@ const OBJ_REPLACE: char = '\u{FFFC}'; // Object Replacement Character
|
|||||||
/// In many cases, we can directly reuse these results when constructing a line.
|
/// In many cases, we can directly reuse these results when constructing a line.
|
||||||
/// Only when a line break falls onto a text index that is not safe-to-break per
|
/// Only when a line break falls onto a text index that is not safe-to-break per
|
||||||
/// rustybuzz, we have to reshape that portion.
|
/// rustybuzz, we have to reshape that portion.
|
||||||
struct Preparation<'a> {
|
pub(crate) struct Preparation<'a> {
|
||||||
/// Bidirectional text embedding levels for the paragraph.
|
/// Bidirectional text embedding levels for the paragraph.
|
||||||
bidi: BidiInfo<'a>,
|
pub bidi: BidiInfo<'a>,
|
||||||
/// Text runs, spacing and layouted elements.
|
/// Text runs, spacing and layouted elements.
|
||||||
items: Vec<Item<'a>>,
|
pub items: Vec<Item<'a>>,
|
||||||
/// The span mapper.
|
/// The span mapper.
|
||||||
spans: SpanMapper,
|
pub spans: SpanMapper,
|
||||||
/// The styles shared by all children.
|
/// The styles shared by all children.
|
||||||
styles: StyleChain<'a>,
|
pub styles: StyleChain<'a>,
|
||||||
/// Whether to hyphenate if it's the same for all children.
|
/// Whether to hyphenate if it's the same for all children.
|
||||||
hyphenate: Option<bool>,
|
pub hyphenate: Option<bool>,
|
||||||
/// The text language if it's the same for all children.
|
/// The text language if it's the same for all children.
|
||||||
lang: Option<Lang>,
|
pub lang: Option<Lang>,
|
||||||
/// The paragraph's resolved horizontal alignment.
|
/// The paragraph's resolved horizontal alignment.
|
||||||
align: FixedAlign,
|
pub align: FixedAlign,
|
||||||
/// Whether to justify the paragraph.
|
/// Whether to justify the paragraph.
|
||||||
justify: bool,
|
pub justify: bool,
|
||||||
/// The paragraph's hanging indent.
|
/// The paragraph's hanging indent.
|
||||||
hang: Abs,
|
pub hang: Abs,
|
||||||
|
/// Whether CJK-latin spacing is set to `auto`.
|
||||||
|
pub cjk_latin_spacing: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Preparation<'a> {
|
impl<'a> Preparation<'a> {
|
||||||
/// Find the item that contains the given `text_offset`.
|
/// Find the item that contains the given `text_offset`.
|
||||||
fn find(&self, text_offset: usize) -> Option<&Item<'a>> {
|
pub fn find(&self, text_offset: usize) -> Option<&Item<'a>> {
|
||||||
let mut cursor = 0;
|
let mut cursor = 0;
|
||||||
for item in &self.items {
|
for item in &self.items {
|
||||||
let end = cursor + item.len();
|
let end = cursor + item.len();
|
||||||
@ -284,7 +279,7 @@ impl<'a> Preparation<'a> {
|
|||||||
/// Return the items that intersect the given `text_range`.
|
/// Return the items that intersect the given `text_range`.
|
||||||
///
|
///
|
||||||
/// Returns the expanded range around the items and the items.
|
/// Returns the expanded range around the items and the items.
|
||||||
fn slice(&self, text_range: Range) -> (Range, &[Item<'a>]) {
|
pub fn slice(&self, text_range: Range) -> (Range, &[Item<'a>]) {
|
||||||
let mut cursor = 0;
|
let mut cursor = 0;
|
||||||
let mut start = 0;
|
let mut start = 0;
|
||||||
let mut end = 0;
|
let mut end = 0;
|
||||||
@ -342,7 +337,7 @@ impl Segment<'_> {
|
|||||||
|
|
||||||
/// A prepared item in a paragraph layout.
|
/// A prepared item in a paragraph layout.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
enum Item<'a> {
|
pub(crate) enum Item<'a> {
|
||||||
/// A shaped text run with consistent style and direction.
|
/// A shaped text run with consistent style and direction.
|
||||||
Text(ShapedText<'a>),
|
Text(ShapedText<'a>),
|
||||||
/// Absolute spacing between other items.
|
/// Absolute spacing between other items.
|
||||||
@ -357,14 +352,14 @@ enum Item<'a> {
|
|||||||
|
|
||||||
impl<'a> Item<'a> {
|
impl<'a> Item<'a> {
|
||||||
/// If this is a text item, return it.
|
/// If this is a text item, return it.
|
||||||
fn text(&self) -> Option<&ShapedText<'a>> {
|
pub fn text(&self) -> Option<&ShapedText<'a>> {
|
||||||
match self {
|
match self {
|
||||||
Self::Text(shaped) => Some(shaped),
|
Self::Text(shaped) => Some(shaped),
|
||||||
_ => None,
|
_ => None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn text_mut(&mut self) -> Option<&mut ShapedText<'a>> {
|
pub fn text_mut(&mut self) -> Option<&mut ShapedText<'a>> {
|
||||||
match self {
|
match self {
|
||||||
Self::Text(shaped) => Some(shaped),
|
Self::Text(shaped) => Some(shaped),
|
||||||
_ => None,
|
_ => None,
|
||||||
@ -372,7 +367,7 @@ impl<'a> Item<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// The text length of the item.
|
/// The text length of the item.
|
||||||
fn len(&self) -> usize {
|
pub fn len(&self) -> usize {
|
||||||
match self {
|
match self {
|
||||||
Self::Text(shaped) => shaped.text.len(),
|
Self::Text(shaped) => shaped.text.len(),
|
||||||
Self::Absolute(_) | Self::Fractional(_, _) => SPACING_REPLACE.len_utf8(),
|
Self::Absolute(_) | Self::Fractional(_, _) => SPACING_REPLACE.len_utf8(),
|
||||||
@ -382,7 +377,7 @@ impl<'a> Item<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// The natural layouted width of the item.
|
/// The natural layouted width of the item.
|
||||||
fn width(&self) -> Abs {
|
pub fn width(&self) -> Abs {
|
||||||
match self {
|
match self {
|
||||||
Self::Text(shaped) => shaped.width,
|
Self::Text(shaped) => shaped.width,
|
||||||
Self::Absolute(v) => *v,
|
Self::Absolute(v) => *v,
|
||||||
@ -737,6 +732,7 @@ fn prepare<'a>(
|
|||||||
align: AlignElem::alignment_in(styles).resolve(styles).x,
|
align: AlignElem::alignment_in(styles).resolve(styles).x,
|
||||||
justify: ParElem::justify_in(styles),
|
justify: ParElem::justify_in(styles),
|
||||||
hang: ParElem::hanging_indent_in(styles),
|
hang: ParElem::hanging_indent_in(styles),
|
||||||
|
cjk_latin_spacing: TextElem::cjk_latin_spacing_in(styles).is_auto(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -889,11 +885,10 @@ fn linebreak_simple<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<Line
|
|||||||
let mut lines = vec![];
|
let mut lines = vec![];
|
||||||
let mut start = 0;
|
let mut start = 0;
|
||||||
let mut last = None;
|
let mut last = None;
|
||||||
let cjk_latin_spacing = TextElem::cjk_latin_spacing_in(p.styles).is_auto();
|
|
||||||
|
|
||||||
for (end, mandatory, hyphen) in breakpoints(p) {
|
breakpoints(p, |end, breakpoint| {
|
||||||
// Compute the line and its size.
|
// Compute the line and its size.
|
||||||
let mut attempt = line(vt, p, start..end, mandatory, hyphen, cjk_latin_spacing);
|
let mut attempt = line(vt, p, start..end, breakpoint);
|
||||||
|
|
||||||
// If the line doesn't fit anymore, we push the last fitting attempt
|
// If the line doesn't fit anymore, we push the last fitting attempt
|
||||||
// into the stack and rebuild the line from the attempt's end. The
|
// into the stack and rebuild the line from the attempt's end. The
|
||||||
@ -902,21 +897,21 @@ fn linebreak_simple<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<Line
|
|||||||
if let Some((last_attempt, last_end)) = last.take() {
|
if let Some((last_attempt, last_end)) = last.take() {
|
||||||
lines.push(last_attempt);
|
lines.push(last_attempt);
|
||||||
start = last_end;
|
start = last_end;
|
||||||
attempt = line(vt, p, start..end, mandatory, hyphen, cjk_latin_spacing);
|
attempt = line(vt, p, start..end, breakpoint);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Finish the current line if there is a mandatory line break (i.e.
|
// Finish the current line if there is a mandatory line break (i.e.
|
||||||
// due to "\n") or if the line doesn't fit horizontally already
|
// due to "\n") or if the line doesn't fit horizontally already
|
||||||
// since then no shorter line will be possible.
|
// since then no shorter line will be possible.
|
||||||
if mandatory || !width.fits(attempt.width) {
|
if breakpoint == Breakpoint::Mandatory || !width.fits(attempt.width) {
|
||||||
lines.push(attempt);
|
lines.push(attempt);
|
||||||
start = end;
|
start = end;
|
||||||
last = None;
|
last = None;
|
||||||
} else {
|
} else {
|
||||||
last = Some((attempt, end));
|
last = Some((attempt, end));
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
if let Some((line, _)) = last {
|
if let Some((line, _)) = last {
|
||||||
lines.push(line);
|
lines.push(line);
|
||||||
@ -965,13 +960,12 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
|
|||||||
let mut table = vec![Entry {
|
let mut table = vec![Entry {
|
||||||
pred: 0,
|
pred: 0,
|
||||||
total: 0.0,
|
total: 0.0,
|
||||||
line: line(vt, p, 0..0, false, false, false),
|
line: line(vt, p, 0..0, Breakpoint::Mandatory),
|
||||||
}];
|
}];
|
||||||
|
|
||||||
let em = TextElem::size_in(p.styles);
|
let em = TextElem::size_in(p.styles);
|
||||||
let cjk_latin_spacing = TextElem::cjk_latin_spacing_in(p.styles).is_auto();
|
|
||||||
|
|
||||||
for (end, mandatory, hyphen) in breakpoints(p) {
|
breakpoints(p, |end, breakpoint| {
|
||||||
let k = table.len();
|
let k = table.len();
|
||||||
let eof = end == p.bidi.text.len();
|
let eof = end == p.bidi.text.len();
|
||||||
let mut best: Option<Entry> = None;
|
let mut best: Option<Entry> = None;
|
||||||
@ -981,7 +975,7 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
|
|||||||
// Layout the line.
|
// Layout the line.
|
||||||
let start = pred.line.end;
|
let start = pred.line.end;
|
||||||
|
|
||||||
let attempt = line(vt, p, start..end, mandatory, hyphen, cjk_latin_spacing);
|
let attempt = line(vt, p, start..end, breakpoint);
|
||||||
|
|
||||||
// Determine how much the line's spaces would need to be stretched
|
// Determine how much the line's spaces would need to be stretched
|
||||||
// to make it the desired width.
|
// to make it the desired width.
|
||||||
@ -1025,7 +1019,7 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
|
|||||||
active += 1;
|
active += 1;
|
||||||
}
|
}
|
||||||
MAX_COST
|
MAX_COST
|
||||||
} else if mandatory || eof {
|
} else if breakpoint == Breakpoint::Mandatory || eof {
|
||||||
// This is a mandatory break and the line is not overfull, so
|
// This is a mandatory break and the line is not overfull, so
|
||||||
// all breakpoints before this one become inactive since no line
|
// all breakpoints before this one become inactive since no line
|
||||||
// can span above the mandatory break.
|
// can span above the mandatory break.
|
||||||
@ -1048,7 +1042,7 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Penalize hyphens.
|
// Penalize hyphens.
|
||||||
if hyphen {
|
if breakpoint == Breakpoint::Hyphen {
|
||||||
cost += HYPH_COST;
|
cost += HYPH_COST;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1073,7 +1067,7 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
|
|||||||
}
|
}
|
||||||
|
|
||||||
table.push(best.unwrap());
|
table.push(best.unwrap());
|
||||||
}
|
});
|
||||||
|
|
||||||
// Retrace the best path.
|
// Retrace the best path.
|
||||||
let mut lines = vec![];
|
let mut lines = vec![];
|
||||||
@ -1089,208 +1083,16 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
|
|||||||
lines
|
lines
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Generated by the following command:
|
|
||||||
///
|
|
||||||
/// ```sh
|
|
||||||
/// icu4x-datagen --locales full \
|
|
||||||
/// --format blob \
|
|
||||||
/// --keys-for-bin target/debug/typst \
|
|
||||||
/// --out crates/typst-library/assets/icudata.postcard \
|
|
||||||
/// --overwrite
|
|
||||||
/// ```
|
|
||||||
///
|
|
||||||
/// Install icu_datagen with `cargo install icu_datagen`.
|
|
||||||
static ICU_DATA: &[u8] = include_bytes!("../../assets/icudata.postcard");
|
|
||||||
|
|
||||||
/// Generated by the following command:
|
|
||||||
///
|
|
||||||
/// ```sh
|
|
||||||
/// icu4x-datagen --locales zh ja \
|
|
||||||
/// --format blob \
|
|
||||||
/// --keys segmenter/line@1 \
|
|
||||||
/// --out crates/typst-library/assets/cj_linebreak_data.postcard \
|
|
||||||
/// --overwrite
|
|
||||||
/// ```
|
|
||||||
///
|
|
||||||
/// The used icu_datagen should be patched by
|
|
||||||
/// https://github.com/peng1999/icu4x/commit/b9beb6cbf633d61fc3d7983e5baf7f4449fbfae5
|
|
||||||
static CJ_LINEBREAK_DATA: &[u8] =
|
|
||||||
include_bytes!("../../assets/cj_linebreak_data.postcard");
|
|
||||||
|
|
||||||
/// The general line break segmenter.
|
|
||||||
static SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
|
|
||||||
let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
|
|
||||||
LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap()
|
|
||||||
});
|
|
||||||
|
|
||||||
/// The line break segmenter for Chinese/Japanese text.
|
|
||||||
static CJ_SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
|
|
||||||
let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
|
|
||||||
let cj_blob = BlobDataProvider::try_new_from_static_blob(CJ_LINEBREAK_DATA).unwrap();
|
|
||||||
let cj_provider = ForkByKeyProvider::new(cj_blob, provider);
|
|
||||||
LineSegmenter::try_new_lstm_with_buffer_provider(&cj_provider).unwrap()
|
|
||||||
});
|
|
||||||
|
|
||||||
/// The Unicode line break properties for each code point.
|
|
||||||
static LINEBREAK_DATA: Lazy<CodePointMapData<LineBreak>> = Lazy::new(|| {
|
|
||||||
let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
|
|
||||||
let deser_provider = provider.as_deserializing();
|
|
||||||
icu_properties::maps::load_line_break(&deser_provider).unwrap()
|
|
||||||
});
|
|
||||||
|
|
||||||
/// Determine all possible points in the text where lines can be broken.
|
|
||||||
///
|
|
||||||
/// Returns for each breakpoint the text index, whether the break is mandatory
|
|
||||||
/// (after `\n`) and whether a hyphen is required (when breaking inside of a
|
|
||||||
/// word).
|
|
||||||
fn breakpoints<'a>(p: &'a Preparation<'a>) -> Breakpoints<'a> {
|
|
||||||
let mut linebreaks = if matches!(p.lang, Some(Lang::CHINESE | Lang::JAPANESE)) {
|
|
||||||
CJ_SEGMENTER.segment_str(p.bidi.text)
|
|
||||||
} else {
|
|
||||||
SEGMENTER.segment_str(p.bidi.text)
|
|
||||||
};
|
|
||||||
// The iterator always yields a breakpoint at index 0, we want to ignore it
|
|
||||||
linebreaks.next();
|
|
||||||
Breakpoints {
|
|
||||||
p,
|
|
||||||
linebreaks: linebreaks.peekable(),
|
|
||||||
syllables: None,
|
|
||||||
offset: 0,
|
|
||||||
suffix: 0,
|
|
||||||
end: 0,
|
|
||||||
mandatory: false,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// An iterator over the line break opportunities in a text.
|
|
||||||
struct Breakpoints<'a> {
|
|
||||||
/// The paragraph's items.
|
|
||||||
p: &'a Preparation<'a>,
|
|
||||||
/// The inner iterator over the unicode line break opportunities.
|
|
||||||
linebreaks: Peekable<LineBreakIteratorUtf8<'a, 'a>>,
|
|
||||||
/// Iterator over syllables of the current word.
|
|
||||||
syllables: Option<hypher::Syllables<'a>>,
|
|
||||||
/// The current text offset.
|
|
||||||
offset: usize,
|
|
||||||
/// The trimmed end of the current word.
|
|
||||||
suffix: usize,
|
|
||||||
/// The untrimmed end of the current word.
|
|
||||||
end: usize,
|
|
||||||
/// Whether the break after the current word is mandatory.
|
|
||||||
mandatory: bool,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Iterator for Breakpoints<'_> {
|
|
||||||
type Item = (usize, bool, bool);
|
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
|
||||||
let lb = LINEBREAK_DATA.as_borrowed();
|
|
||||||
|
|
||||||
// If we're currently in a hyphenated "word", process the next syllable.
|
|
||||||
if let Some(syllable) = self.syllables.as_mut().and_then(Iterator::next) {
|
|
||||||
self.offset += syllable.len();
|
|
||||||
if self.offset == self.suffix {
|
|
||||||
self.offset = self.end;
|
|
||||||
}
|
|
||||||
|
|
||||||
let hyphen = self.offset < self.end;
|
|
||||||
if hyphen {
|
|
||||||
// Filter out hyphenation opportunities where hyphenation was
|
|
||||||
// actually disabled.
|
|
||||||
if !self.hyphenate(self.offset) {
|
|
||||||
return self.next();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Filter out forbidden hyphenation opportunities.
|
|
||||||
if matches!(
|
|
||||||
syllable.chars().last().map(|c| lb.get(c)),
|
|
||||||
Some(LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ)
|
|
||||||
) {
|
|
||||||
return self.next();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return Some((self.offset, self.mandatory && !hyphen, hyphen));
|
|
||||||
}
|
|
||||||
|
|
||||||
loop {
|
|
||||||
// Get the next "word".
|
|
||||||
self.end = self.linebreaks.next()?;
|
|
||||||
self.mandatory = false;
|
|
||||||
|
|
||||||
// Fix for: https://github.com/unicode-org/icu4x/issues/4146
|
|
||||||
if let Some(c) = self.p.bidi.text[..self.end].chars().next_back() {
|
|
||||||
if self.end == self.p.bidi.text.len() {
|
|
||||||
self.mandatory = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
self.mandatory = match lb.get(c) {
|
|
||||||
LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ => continue,
|
|
||||||
LineBreak::MandatoryBreak
|
|
||||||
| LineBreak::CarriageReturn
|
|
||||||
| LineBreak::LineFeed
|
|
||||||
| LineBreak::NextLine => true,
|
|
||||||
_ => false,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Hyphenate the next word.
|
|
||||||
if self.p.hyphenate != Some(false) {
|
|
||||||
if let Some(lang) = self.lang(self.offset) {
|
|
||||||
let word = &self.p.bidi.text[self.offset..self.end];
|
|
||||||
let trimmed = word.trim_end_matches(|c: char| !c.is_alphabetic());
|
|
||||||
if !trimmed.is_empty() {
|
|
||||||
self.suffix = self.offset + trimmed.len();
|
|
||||||
self.syllables = Some(hypher::hyphenate(trimmed, lang));
|
|
||||||
return self.next();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
self.offset = self.end;
|
|
||||||
Some((self.end, self.mandatory, false))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Breakpoints<'_> {
|
|
||||||
/// Whether hyphenation is enabled at the given offset.
|
|
||||||
fn hyphenate(&self, offset: usize) -> bool {
|
|
||||||
self.p
|
|
||||||
.hyphenate
|
|
||||||
.or_else(|| {
|
|
||||||
let shaped = self.p.find(offset)?.text()?;
|
|
||||||
Some(TextElem::hyphenate_in(shaped.styles))
|
|
||||||
})
|
|
||||||
.unwrap_or(false)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The text language at the given offset.
|
|
||||||
fn lang(&self, offset: usize) -> Option<hypher::Lang> {
|
|
||||||
let lang = self.p.lang.or_else(|| {
|
|
||||||
let shaped = self.p.find(offset)?.text()?;
|
|
||||||
Some(TextElem::lang_in(shaped.styles))
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let bytes = lang.as_str().as_bytes().try_into().ok()?;
|
|
||||||
hypher::Lang::from_iso(bytes)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a line which spans the given range.
|
/// Create a line which spans the given range.
|
||||||
fn line<'a>(
|
fn line<'a>(
|
||||||
vt: &Vt,
|
vt: &Vt,
|
||||||
p: &'a Preparation,
|
p: &'a Preparation,
|
||||||
mut range: Range,
|
mut range: Range,
|
||||||
mandatory: bool,
|
breakpoint: Breakpoint,
|
||||||
hyphen: bool,
|
|
||||||
cjk_latin_spacing: bool,
|
|
||||||
) -> Line<'a> {
|
) -> Line<'a> {
|
||||||
let end = range.end;
|
let end = range.end;
|
||||||
let mut justify = p.justify && end < p.bidi.text.len() && !mandatory;
|
let mut justify =
|
||||||
|
p.justify && end < p.bidi.text.len() && breakpoint != Breakpoint::Mandatory;
|
||||||
|
|
||||||
if range.is_empty() {
|
if range.is_empty() {
|
||||||
return Line {
|
return Line {
|
||||||
@ -1326,13 +1128,14 @@ fn line<'a>(
|
|||||||
|
|
||||||
// Deal with hyphens, dashes and justification.
|
// Deal with hyphens, dashes and justification.
|
||||||
let shy = trimmed.ends_with('\u{ad}');
|
let shy = trimmed.ends_with('\u{ad}');
|
||||||
|
let hyphen = breakpoint == Breakpoint::Hyphen;
|
||||||
dash = hyphen || shy || trimmed.ends_with(['-', '–', '—']);
|
dash = hyphen || shy || trimmed.ends_with(['-', '–', '—']);
|
||||||
justify |= text.ends_with('\u{2028}');
|
justify |= text.ends_with('\u{2028}');
|
||||||
|
|
||||||
// Deal with CJK punctuation at line ends.
|
// Deal with CJK punctuation at line ends.
|
||||||
let gb_style = is_gb_style(shaped.lang, shaped.region);
|
let gb_style = is_gb_style(shaped.lang, shaped.region);
|
||||||
let maybe_adjust_last_glyph = trimmed.ends_with(END_PUNCT_PAT)
|
let maybe_adjust_last_glyph = trimmed.ends_with(END_PUNCT_PAT)
|
||||||
|| (cjk_latin_spacing && trimmed.ends_with(char_is_cjk_script));
|
|| (p.cjk_latin_spacing && trimmed.ends_with(char_is_cjk_script));
|
||||||
|
|
||||||
// Usually, we don't want to shape an empty string because:
|
// Usually, we don't want to shape an empty string because:
|
||||||
// - We don't want the height of trimmed whitespace in a different
|
// - We don't want the height of trimmed whitespace in a different
|
||||||
@ -1359,7 +1162,7 @@ fn line<'a>(
|
|||||||
let punct = reshaped.glyphs.to_mut().last_mut().unwrap();
|
let punct = reshaped.glyphs.to_mut().last_mut().unwrap();
|
||||||
punct.shrink_right(shrink_amount);
|
punct.shrink_right(shrink_amount);
|
||||||
reshaped.width -= shrink_amount.at(reshaped.size);
|
reshaped.width -= shrink_amount.at(reshaped.size);
|
||||||
} else if cjk_latin_spacing
|
} else if p.cjk_latin_spacing
|
||||||
&& last_glyph.is_cjk_script()
|
&& last_glyph.is_cjk_script()
|
||||||
&& (last_glyph.x_advance - last_glyph.x_offset) > Em::one()
|
&& (last_glyph.x_advance - last_glyph.x_offset) > Em::one()
|
||||||
{
|
{
|
||||||
@ -1385,7 +1188,7 @@ fn line<'a>(
|
|||||||
// Deal with CJK characters at line starts.
|
// Deal with CJK characters at line starts.
|
||||||
let text = &p.bidi.text[range.start..end];
|
let text = &p.bidi.text[range.start..end];
|
||||||
let maybe_adjust_first_glyph = text.starts_with(BEGIN_PUNCT_PAT)
|
let maybe_adjust_first_glyph = text.starts_with(BEGIN_PUNCT_PAT)
|
||||||
|| (cjk_latin_spacing && text.starts_with(char_is_cjk_script));
|
|| (p.cjk_latin_spacing && text.starts_with(char_is_cjk_script));
|
||||||
|
|
||||||
// Reshape the start item if it's split in half.
|
// Reshape the start item if it's split in half.
|
||||||
let mut first = None;
|
let mut first = None;
|
||||||
@ -1419,7 +1222,7 @@ fn line<'a>(
|
|||||||
let amount_abs = shrink_amount.at(reshaped.size);
|
let amount_abs = shrink_amount.at(reshaped.size);
|
||||||
reshaped.width -= amount_abs;
|
reshaped.width -= amount_abs;
|
||||||
width -= amount_abs;
|
width -= amount_abs;
|
||||||
} else if cjk_latin_spacing
|
} else if p.cjk_latin_spacing
|
||||||
&& first_glyph.is_cjk_script()
|
&& first_glyph.is_cjk_script()
|
||||||
&& first_glyph.x_offset > Em::zero()
|
&& first_glyph.x_offset > Em::zero()
|
||||||
{
|
{
|
||||||
|
188
crates/typst-library/src/text/linebreak.rs
Normal file
188
crates/typst-library/src/text/linebreak.rs
Normal file
@ -0,0 +1,188 @@
|
|||||||
|
use icu_properties::{maps::CodePointMapData, LineBreak};
|
||||||
|
use icu_provider::AsDeserializingBufferProvider;
|
||||||
|
use icu_provider_adapters::fork::ForkByKeyProvider;
|
||||||
|
use icu_provider_blob::BlobDataProvider;
|
||||||
|
use icu_segmenter::LineSegmenter;
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
use typst::doc::Lang;
|
||||||
|
|
||||||
|
use super::TextElem;
|
||||||
|
use crate::layout::Preparation;
|
||||||
|
|
||||||
|
/// Generated by the following command:
|
||||||
|
///
|
||||||
|
/// ```sh
|
||||||
|
/// icu4x-datagen --locales full \
|
||||||
|
/// --format blob \
|
||||||
|
/// --keys-for-bin target/debug/typst \
|
||||||
|
/// --out crates/typst-library/assets/icudata.postcard \
|
||||||
|
/// --overwrite
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// Install icu_datagen with `cargo install icu_datagen`.
|
||||||
|
static ICU_DATA: &[u8] = include_bytes!("../../assets/icudata.postcard");
|
||||||
|
|
||||||
|
/// Generated by the following command:
|
||||||
|
///
|
||||||
|
/// ```sh
|
||||||
|
/// icu4x-datagen --locales zh ja \
|
||||||
|
/// --format blob \
|
||||||
|
/// --keys segmenter/line@1 \
|
||||||
|
/// --out crates/typst-library/assets/cj_linebreak_data.postcard \
|
||||||
|
/// --overwrite
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// The used icu_datagen should be patched by
|
||||||
|
/// https://github.com/peng1999/icu4x/commit/b9beb6cbf633d61fc3d7983e5baf7f4449fbfae5
|
||||||
|
static CJ_LINEBREAK_DATA: &[u8] =
|
||||||
|
include_bytes!("../../assets/cj_linebreak_data.postcard");
|
||||||
|
|
||||||
|
/// The general line break segmenter.
|
||||||
|
static SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
|
||||||
|
let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
|
||||||
|
LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap()
|
||||||
|
});
|
||||||
|
|
||||||
|
/// The line break segmenter for Chinese/Japanese text.
|
||||||
|
static CJ_SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
|
||||||
|
let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
|
||||||
|
let cj_blob = BlobDataProvider::try_new_from_static_blob(CJ_LINEBREAK_DATA).unwrap();
|
||||||
|
let cj_provider = ForkByKeyProvider::new(cj_blob, provider);
|
||||||
|
LineSegmenter::try_new_lstm_with_buffer_provider(&cj_provider).unwrap()
|
||||||
|
});
|
||||||
|
|
||||||
|
/// The Unicode line break properties for each code point.
|
||||||
|
static LINEBREAK_DATA: Lazy<CodePointMapData<LineBreak>> = Lazy::new(|| {
|
||||||
|
let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
|
||||||
|
let deser_provider = provider.as_deserializing();
|
||||||
|
icu_properties::maps::load_line_break(&deser_provider).unwrap()
|
||||||
|
});
|
||||||
|
|
||||||
|
/// A line break opportunity.
|
||||||
|
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
|
||||||
|
pub(crate) enum Breakpoint {
|
||||||
|
/// Just a normal opportunity (e.g. after a space).
|
||||||
|
Normal,
|
||||||
|
/// A mandatory breakpoint (after '\n' or at the end of the text).
|
||||||
|
Mandatory,
|
||||||
|
/// An opportunity for hyphenating.
|
||||||
|
Hyphen,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calls `f` for all possible points in the text where lines can be broken.
|
||||||
|
///
|
||||||
|
/// Passes to `f`, for each breakpoint, the text index and the kind of break:
|
||||||
|
/// normal, mandatory (after `\n` or at the end of the text), or a hyphenation
|
||||||
|
/// opportunity (when breaking inside of a word).
|
||||||
|
///
|
||||||
|
/// This is an internal instead of an external iterator because it makes the
|
||||||
|
/// code much simpler and the consumers of this function don't need the
|
||||||
|
/// composability and flexibility of external iteration anyway.
|
||||||
|
pub(crate) fn breakpoints<'a>(
|
||||||
|
p: &'a Preparation<'a>,
|
||||||
|
mut f: impl FnMut(usize, Breakpoint),
|
||||||
|
) {
|
||||||
|
let lb = LINEBREAK_DATA.as_borrowed();
|
||||||
|
let segmenter = match p.lang {
|
||||||
|
Some(Lang::CHINESE | Lang::JAPANESE) => &CJ_SEGMENTER,
|
||||||
|
_ => &SEGMENTER,
|
||||||
|
};
|
||||||
|
|
||||||
|
let hyphenate = p.hyphenate != Some(false);
|
||||||
|
let mut last = 0;
|
||||||
|
|
||||||
|
// Walk over all UAX #14 linebreak opportunities.
|
||||||
|
for point in segmenter.segment_str(p.bidi.text) {
|
||||||
|
// Skip breakpoint if there is no char before it. icu4x generates one
|
||||||
|
// at offset 0, but we don't want it.
|
||||||
|
let Some(c) = p.bidi.text[..point].chars().next_back() else { continue };
|
||||||
|
|
||||||
|
// Find out whether the last break was mandatory by checking against
|
||||||
|
// rules LB4 and LB5, special-casing the end of text according to LB3.
|
||||||
|
// See also: https://docs.rs/icu_segmenter/latest/icu_segmenter/struct.LineSegmenter.html
|
||||||
|
let breakpoint = if point == p.bidi.text.len() {
|
||||||
|
Breakpoint::Mandatory
|
||||||
|
} else {
|
||||||
|
match lb.get(c) {
|
||||||
|
// Fix for: https://github.com/unicode-org/icu4x/issues/4146
|
||||||
|
LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ => continue,
|
||||||
|
LineBreak::MandatoryBreak
|
||||||
|
| LineBreak::CarriageReturn
|
||||||
|
| LineBreak::LineFeed
|
||||||
|
| LineBreak::NextLine => Breakpoint::Mandatory,
|
||||||
|
_ => Breakpoint::Normal,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Hyphenate between the last and current breakpoint.
|
||||||
|
'hyphenate: {
|
||||||
|
if !hyphenate {
|
||||||
|
break 'hyphenate;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract a hyphenatable "word".
|
||||||
|
let word =
|
||||||
|
&p.bidi.text[last..point].trim_end_matches(|c: char| !c.is_alphabetic());
|
||||||
|
if word.is_empty() {
|
||||||
|
break 'hyphenate;
|
||||||
|
}
|
||||||
|
|
||||||
|
let end = last + word.len();
|
||||||
|
let mut offset = last;
|
||||||
|
|
||||||
|
// Determine the language to hyphenate this word in.
|
||||||
|
let Some(lang) = lang_at(p, last) else { break 'hyphenate };
|
||||||
|
|
||||||
|
for syllable in hypher::hyphenate(word, lang) {
|
||||||
|
// Don't hyphenate after the final syllable.
|
||||||
|
offset += syllable.len();
|
||||||
|
if offset == end {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Filter out hyphenation opportunities where hyphenation was
|
||||||
|
// actually disabled.
|
||||||
|
if !hyphenate_at(p, offset) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Filter out forbidden hyphenation opportunities.
|
||||||
|
if matches!(
|
||||||
|
syllable.chars().next_back().map(|c| lb.get(c)),
|
||||||
|
Some(LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ)
|
||||||
|
) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Call `f` for the word-internal hyphenation opportunity.
|
||||||
|
f(offset, Breakpoint::Hyphen);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Call `f` for the UAX #14 break opportunity.
|
||||||
|
f(point, breakpoint);
|
||||||
|
|
||||||
|
last = point;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whether hyphenation is enabled at the given offset.
|
||||||
|
fn hyphenate_at(p: &Preparation, offset: usize) -> bool {
|
||||||
|
p.hyphenate
|
||||||
|
.or_else(|| {
|
||||||
|
let shaped = p.find(offset)?.text()?;
|
||||||
|
Some(TextElem::hyphenate_in(shaped.styles))
|
||||||
|
})
|
||||||
|
.unwrap_or(false)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The text language at the given offset.
|
||||||
|
fn lang_at(p: &Preparation, offset: usize) -> Option<hypher::Lang> {
|
||||||
|
let lang = p.lang.or_else(|| {
|
||||||
|
let shaped = p.find(offset)?.text()?;
|
||||||
|
Some(TextElem::lang_in(shaped.styles))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let bytes = lang.as_str().as_bytes().try_into().ok()?;
|
||||||
|
hypher::Lang::from_iso(bytes)
|
||||||
|
}
|
@ -1,6 +1,7 @@
|
|||||||
//! Text handling.
|
//! Text handling.
|
||||||
|
|
||||||
mod deco;
|
mod deco;
|
||||||
|
mod linebreak;
|
||||||
mod misc;
|
mod misc;
|
||||||
mod quote;
|
mod quote;
|
||||||
mod quotes;
|
mod quotes;
|
||||||
@ -9,6 +10,7 @@ mod shaping;
|
|||||||
mod shift;
|
mod shift;
|
||||||
|
|
||||||
pub use self::deco::*;
|
pub use self::deco::*;
|
||||||
|
pub(crate) use self::linebreak::*;
|
||||||
pub use self::misc::*;
|
pub use self::misc::*;
|
||||||
pub use self::quote::*;
|
pub use self::quote::*;
|
||||||
pub use self::quotes::*;
|
pub use self::quotes::*;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user