mirror of
https://github.com/typst/typst
synced 2025-05-13 12:36:23 +08:00
979 lines
33 KiB
Rust
979 lines
33 KiB
Rust
use std::ops::{Add, Sub};
|
|
use std::sync::LazyLock;
|
|
|
|
use az::SaturatingAs;
|
|
use icu_properties::maps::{CodePointMapData, CodePointMapDataBorrowed};
|
|
use icu_properties::LineBreak;
|
|
use icu_provider::AsDeserializingBufferProvider;
|
|
use icu_provider_adapters::fork::ForkByKeyProvider;
|
|
use icu_provider_blob::BlobDataProvider;
|
|
use icu_segmenter::LineSegmenter;
|
|
use typst_library::engine::Engine;
|
|
use typst_library::layout::{Abs, Em};
|
|
use typst_library::model::Linebreaks;
|
|
use typst_library::text::{is_default_ignorable, Lang, TextElem};
|
|
use typst_syntax::link_prefix;
|
|
use unicode_segmentation::UnicodeSegmentation;
|
|
|
|
use super::*;
|
|
|
|
/// The cost of a line or inline layout.
|
|
type Cost = f64;
|
|
|
|
// Cost parameters.
|
|
//
|
|
// We choose higher costs than the Knuth-Plass paper (which would be 50) because
|
|
// it hyphenates way to eagerly in Typst otherwise. Could be related to the
|
|
// ratios coming out differently since Typst doesn't have the concept of glue,
|
|
// so things work a bit differently.
|
|
const DEFAULT_HYPH_COST: Cost = 135.0;
|
|
const DEFAULT_RUNT_COST: Cost = 100.0;
|
|
|
|
// Other parameters.
|
|
const MIN_RATIO: f64 = -1.0;
|
|
const MIN_APPROX_RATIO: f64 = -0.5;
|
|
const BOUND_EPS: f64 = 1e-3;
|
|
|
|
/// The ICU blob data.
|
|
fn blob() -> BlobDataProvider {
|
|
BlobDataProvider::try_new_from_static_blob(typst_assets::icu::ICU).unwrap()
|
|
}
|
|
|
|
/// The general line break segmenter.
|
|
static SEGMENTER: LazyLock<LineSegmenter> =
|
|
LazyLock::new(|| LineSegmenter::try_new_lstm_with_buffer_provider(&blob()).unwrap());
|
|
|
|
/// The line break segmenter for Chinese/Japanese text.
|
|
static CJ_SEGMENTER: LazyLock<LineSegmenter> = LazyLock::new(|| {
|
|
let cj_blob =
|
|
BlobDataProvider::try_new_from_static_blob(typst_assets::icu::ICU_CJ_SEGMENT)
|
|
.unwrap();
|
|
let cj_provider = ForkByKeyProvider::new(cj_blob, blob());
|
|
LineSegmenter::try_new_lstm_with_buffer_provider(&cj_provider).unwrap()
|
|
});
|
|
|
|
/// The Unicode line break properties for each code point.
|
|
static LINEBREAK_DATA: LazyLock<CodePointMapData<LineBreak>> = LazyLock::new(|| {
|
|
icu_properties::maps::load_line_break(&blob().as_deserializing()).unwrap()
|
|
});
|
|
|
|
/// A line break opportunity.
|
|
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
|
|
pub enum Breakpoint {
|
|
/// Just a normal opportunity (e.g. after a space).
|
|
Normal,
|
|
/// A mandatory breakpoint (after '\n' or at the end of the text).
|
|
Mandatory,
|
|
/// An opportunity for hyphenating and how many chars are before/after it
|
|
/// in the word.
|
|
Hyphen(u8, u8),
|
|
}
|
|
|
|
impl Breakpoint {
|
|
/// Trim a line before this breakpoint.
|
|
pub fn trim(self, line: &str) -> &str {
|
|
// Trim default ignorables.
|
|
let line = line.trim_end_matches(is_default_ignorable);
|
|
|
|
match self {
|
|
// Trim whitespace.
|
|
Self::Normal => line.trim_end_matches(char::is_whitespace),
|
|
|
|
// Trim linebreaks.
|
|
Self::Mandatory => {
|
|
let lb = LINEBREAK_DATA.as_borrowed();
|
|
line.trim_end_matches(|c| {
|
|
matches!(
|
|
lb.get(c),
|
|
LineBreak::MandatoryBreak
|
|
| LineBreak::CarriageReturn
|
|
| LineBreak::LineFeed
|
|
| LineBreak::NextLine
|
|
)
|
|
})
|
|
}
|
|
|
|
// Trim nothing further.
|
|
Self::Hyphen(..) => line,
|
|
}
|
|
}
|
|
|
|
/// Whether this is a hyphen breakpoint.
|
|
pub fn is_hyphen(self) -> bool {
|
|
matches!(self, Self::Hyphen(..))
|
|
}
|
|
}
|
|
|
|
/// Breaks the text into lines.
|
|
pub fn linebreak<'a>(
|
|
engine: &Engine,
|
|
p: &'a Preparation<'a>,
|
|
width: Abs,
|
|
) -> Vec<Line<'a>> {
|
|
match p.config.linebreaks {
|
|
Linebreaks::Simple => linebreak_simple(engine, p, width),
|
|
Linebreaks::Optimized => linebreak_optimized(engine, p, width),
|
|
}
|
|
}
|
|
|
|
/// Performs line breaking in simple first-fit style. This means that we build
|
|
/// lines greedily, always taking the longest possible line. This may lead to
|
|
/// very unbalanced line, but is fast and simple.
|
|
#[typst_macros::time]
|
|
fn linebreak_simple<'a>(
|
|
engine: &Engine,
|
|
p: &'a Preparation<'a>,
|
|
width: Abs,
|
|
) -> Vec<Line<'a>> {
|
|
let mut lines = Vec::with_capacity(16);
|
|
let mut start = 0;
|
|
let mut last = None;
|
|
|
|
breakpoints(p, |end, breakpoint| {
|
|
// Compute the line and its size.
|
|
let mut attempt = line(engine, p, start..end, breakpoint, lines.last());
|
|
|
|
// If the line doesn't fit anymore, we push the last fitting attempt
|
|
// into the stack and rebuild the line from the attempt's end. The
|
|
// resulting line cannot be broken up further.
|
|
if !width.fits(attempt.width) {
|
|
if let Some((last_attempt, last_end)) = last.take() {
|
|
lines.push(last_attempt);
|
|
start = last_end;
|
|
attempt = line(engine, p, start..end, breakpoint, lines.last());
|
|
}
|
|
}
|
|
|
|
// Finish the current line if there is a mandatory line break (i.e. due
|
|
// to "\n") or if the line doesn't fit horizontally already since then
|
|
// no shorter line will be possible.
|
|
if breakpoint == Breakpoint::Mandatory || !width.fits(attempt.width) {
|
|
lines.push(attempt);
|
|
start = end;
|
|
last = None;
|
|
} else {
|
|
last = Some((attempt, end));
|
|
}
|
|
});
|
|
|
|
if let Some((line, _)) = last {
|
|
lines.push(line);
|
|
}
|
|
|
|
lines
|
|
}
|
|
|
|
/// Performs line breaking in optimized Knuth-Plass style. Here, we use more
|
|
/// context to determine the line breaks than in the simple first-fit style. For
|
|
/// example, we might choose to cut a line short even though there is still a
|
|
/// bit of space to improve the fit of one of the following lines. The
|
|
/// Knuth-Plass algorithm is based on the idea of "cost". A line which has a
|
|
/// very tight or very loose fit has a higher cost than one that is just right.
|
|
/// Ending a line with a hyphen incurs extra cost and endings two successive
|
|
/// lines with hyphens even more.
|
|
///
|
|
/// To find the layout with the minimal total cost the algorithm uses dynamic
|
|
/// programming: For each possible breakpoint, it determines the optimal layout
|
|
/// _up to that point_. It walks over all possible start points for a line
|
|
/// ending at that point and finds the one for which the cost of the line plus
|
|
/// the cost of the optimal layout up to the start point (already computed and
|
|
/// stored in dynamic programming table) is minimal. The final result is simply
|
|
/// the layout determined for the last breakpoint at the end of text.
|
|
#[typst_macros::time]
|
|
fn linebreak_optimized<'a>(
|
|
engine: &Engine,
|
|
p: &'a Preparation<'a>,
|
|
width: Abs,
|
|
) -> Vec<Line<'a>> {
|
|
let metrics = CostMetrics::compute(p);
|
|
|
|
// Determines the exact costs of a likely good layout through Knuth-Plass
|
|
// with approximate metrics. We can use this cost as an upper bound to prune
|
|
// the search space in our proper optimization pass below.
|
|
let upper_bound = linebreak_optimized_approximate(engine, p, width, &metrics);
|
|
|
|
// Using the upper bound, perform exact optimized linebreaking.
|
|
linebreak_optimized_bounded(engine, p, width, &metrics, upper_bound)
|
|
}
|
|
|
|
/// Performs line breaking in optimized Knuth-Plass style, but with an upper
|
|
/// bound on the cost. This allows us to skip many parts of the search space.
|
|
#[typst_macros::time]
|
|
fn linebreak_optimized_bounded<'a>(
|
|
engine: &Engine,
|
|
p: &'a Preparation<'a>,
|
|
width: Abs,
|
|
metrics: &CostMetrics,
|
|
upper_bound: Cost,
|
|
) -> Vec<Line<'a>> {
|
|
/// An entry in the dynamic programming table for inline layout optimization.
|
|
struct Entry<'a> {
|
|
pred: usize,
|
|
total: Cost,
|
|
line: Line<'a>,
|
|
end: usize,
|
|
}
|
|
|
|
// Dynamic programming table.
|
|
let mut table = vec![Entry { pred: 0, total: 0.0, line: Line::empty(), end: 0 }];
|
|
|
|
let mut active = 0;
|
|
let mut prev_end = 0;
|
|
|
|
breakpoints(p, |end, breakpoint| {
|
|
// Find the optimal predecessor.
|
|
let mut best: Option<Entry> = None;
|
|
|
|
// A lower bound for the cost of all following line attempts.
|
|
let mut line_lower_bound = None;
|
|
|
|
for (pred_index, pred) in table.iter().enumerate().skip(active) {
|
|
let start = pred.end;
|
|
let unbreakable = prev_end == start;
|
|
|
|
// If the minimum cost we've established for the line is already
|
|
// too much, skip this attempt.
|
|
if line_lower_bound
|
|
.is_some_and(|lower| pred.total + lower > upper_bound + BOUND_EPS)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
// Build the line.
|
|
let attempt = line(engine, p, start..end, breakpoint, Some(&pred.line));
|
|
|
|
// Determine the cost of the line and its stretch ratio.
|
|
let (line_ratio, line_cost) = ratio_and_cost(
|
|
p,
|
|
metrics,
|
|
width,
|
|
&pred.line,
|
|
&attempt,
|
|
breakpoint,
|
|
unbreakable,
|
|
);
|
|
|
|
// If the line is overfull, we adjust the set of active candidate
|
|
// line starts. This is the case if
|
|
// - justification is on, but we'd need to shrink too much
|
|
// - justification is off and the line just doesn't fit
|
|
//
|
|
// If this is the earliest breakpoint in the active set
|
|
// (active == i), remove it from the active set. If there is an
|
|
// earlier one (active < i), then the logically shorter line was
|
|
// in fact longer (can happen with negative spacing) and we
|
|
// can't trim the active set just yet.
|
|
if line_ratio < metrics.min_ratio && active == pred_index {
|
|
active += 1;
|
|
}
|
|
|
|
// The total cost of this line and its chain of predecessors.
|
|
let total = pred.total + line_cost;
|
|
|
|
// If the line is already underfull (`line_ratio > 0`), any shorter
|
|
// slice of the line will be even more underfull. So it'll only get
|
|
// worse from here and further attempts would also have a cost
|
|
// exceeding `bound`. There is one exception: When the line has
|
|
// negative spacing, we can't know for sure, so we don't assign the
|
|
// lower bound in that case.
|
|
if line_ratio > 0.0
|
|
&& line_lower_bound.is_none()
|
|
&& !attempt.has_negative_width_items()
|
|
{
|
|
line_lower_bound = Some(line_cost);
|
|
}
|
|
|
|
// If the cost already exceeds the upper bound, we don't need to
|
|
// integrate this result into the table.
|
|
if total > upper_bound + BOUND_EPS {
|
|
continue;
|
|
}
|
|
|
|
// If this attempt is better than what we had before, take it!
|
|
if best.as_ref().is_none_or(|best| best.total >= total) {
|
|
best = Some(Entry { pred: pred_index, total, line: attempt, end });
|
|
}
|
|
}
|
|
|
|
// If this is a mandatory break, all breakpoints before this one become
|
|
// inactive since no line can span over the mandatory break.
|
|
if breakpoint == Breakpoint::Mandatory {
|
|
active = table.len();
|
|
}
|
|
|
|
table.extend(best);
|
|
prev_end = end;
|
|
});
|
|
|
|
// Retrace the best path.
|
|
let mut lines = Vec::with_capacity(16);
|
|
let mut idx = table.len() - 1;
|
|
|
|
// This should only happen if our bound was faulty. Which shouldn't happen!
|
|
if table[idx].end != p.text.len() {
|
|
#[cfg(debug_assertions)]
|
|
panic!("bounded inline layout is incomplete");
|
|
|
|
#[cfg(not(debug_assertions))]
|
|
return linebreak_optimized_bounded(engine, p, width, metrics, Cost::INFINITY);
|
|
}
|
|
|
|
while idx != 0 {
|
|
table.truncate(idx + 1);
|
|
let entry = table.pop().unwrap();
|
|
lines.push(entry.line);
|
|
idx = entry.pred;
|
|
}
|
|
|
|
lines.reverse();
|
|
lines
|
|
}
|
|
|
|
/// Runs the normal Knuth-Plass algorithm, but instead of building proper lines
|
|
/// (which is costly) to determine costs, it determines approximate costs using
|
|
/// cumulative arrays.
|
|
///
|
|
/// This results in a likely good inline layouts, for which we then compute
|
|
/// the exact cost. This cost is an upper bound for proper optimized
|
|
/// linebreaking. We can use it to heavily prune the search space.
|
|
#[typst_macros::time]
|
|
fn linebreak_optimized_approximate(
|
|
engine: &Engine,
|
|
p: &Preparation,
|
|
width: Abs,
|
|
metrics: &CostMetrics,
|
|
) -> Cost {
|
|
// Determine the cumulative estimation metrics.
|
|
let estimates = Estimates::compute(p);
|
|
|
|
/// An entry in the dynamic programming table for inline layout optimization.
|
|
struct Entry {
|
|
pred: usize,
|
|
total: Cost,
|
|
end: usize,
|
|
unbreakable: bool,
|
|
breakpoint: Breakpoint,
|
|
}
|
|
|
|
// Dynamic programming table.
|
|
let mut table = vec![Entry {
|
|
pred: 0,
|
|
total: 0.0,
|
|
end: 0,
|
|
unbreakable: false,
|
|
breakpoint: Breakpoint::Mandatory,
|
|
}];
|
|
|
|
let mut active = 0;
|
|
let mut prev_end = 0;
|
|
|
|
breakpoints(p, |end, breakpoint| {
|
|
// Find the optimal predecessor.
|
|
let mut best: Option<Entry> = None;
|
|
for (pred_index, pred) in table.iter().enumerate().skip(active) {
|
|
let start = pred.end;
|
|
let unbreakable = prev_end == start;
|
|
|
|
// Whether the line is justified. This is not 100% accurate w.r.t
|
|
// to line()'s behaviour, but good enough.
|
|
let justify = p.config.justify && breakpoint != Breakpoint::Mandatory;
|
|
|
|
// We don't really know whether the line naturally ends with a dash
|
|
// here, so we can miss that case, but it's ok, since all of this
|
|
// just an estimate.
|
|
let consecutive_dash = pred.breakpoint.is_hyphen() && breakpoint.is_hyphen();
|
|
|
|
// Estimate how much the line's spaces would need to be stretched to
|
|
// make it the desired width. We trim at the end to not take into
|
|
// account trailing spaces. This is, again, only an approximation of
|
|
// the real behaviour of `line`.
|
|
let trimmed_end = start + p.text[start..end].trim_end().len();
|
|
let line_ratio = raw_ratio(
|
|
p,
|
|
width,
|
|
estimates.widths.estimate(start..trimmed_end)
|
|
+ if breakpoint.is_hyphen() {
|
|
metrics.approx_hyphen_width
|
|
} else {
|
|
Abs::zero()
|
|
},
|
|
estimates.stretchability.estimate(start..trimmed_end),
|
|
estimates.shrinkability.estimate(start..trimmed_end),
|
|
estimates.justifiables.estimate(start..trimmed_end),
|
|
);
|
|
|
|
// Determine the line's cost.
|
|
let line_cost = raw_cost(
|
|
metrics,
|
|
breakpoint,
|
|
line_ratio,
|
|
justify,
|
|
unbreakable,
|
|
consecutive_dash,
|
|
true,
|
|
);
|
|
|
|
// Adjust the set of active breakpoints.
|
|
// See `linebreak_optimized` for details.
|
|
if line_ratio < metrics.min_ratio && active == pred_index {
|
|
active += 1;
|
|
}
|
|
|
|
// The total cost of this line and its chain of predecessors.
|
|
let total = pred.total + line_cost;
|
|
|
|
// If this attempt is better than what we had before, take it!
|
|
if best.as_ref().is_none_or(|best| best.total >= total) {
|
|
best = Some(Entry {
|
|
pred: pred_index,
|
|
total,
|
|
end,
|
|
unbreakable,
|
|
breakpoint,
|
|
});
|
|
}
|
|
}
|
|
|
|
// If this is a mandatory break, all breakpoints before this one become
|
|
// inactive.
|
|
if breakpoint == Breakpoint::Mandatory {
|
|
active = table.len();
|
|
}
|
|
|
|
table.extend(best);
|
|
prev_end = end;
|
|
});
|
|
|
|
// Retrace the best path.
|
|
let mut indices = Vec::with_capacity(16);
|
|
let mut idx = table.len() - 1;
|
|
while idx != 0 {
|
|
indices.push(idx);
|
|
idx = table[idx].pred;
|
|
}
|
|
|
|
let mut pred = Line::empty();
|
|
let mut start = 0;
|
|
let mut exact = 0.0;
|
|
|
|
// The cost that we optimized was only an approximate cost, so the layout we
|
|
// got here is only likely to be good, not guaranteed to be the best. We now
|
|
// computes its exact cost as that gives us a sound upper bound for the
|
|
// proper optimization pass.
|
|
for idx in indices.into_iter().rev() {
|
|
let Entry { end, breakpoint, unbreakable, .. } = table[idx];
|
|
|
|
let attempt = line(engine, p, start..end, breakpoint, Some(&pred));
|
|
let (ratio, line_cost) =
|
|
ratio_and_cost(p, metrics, width, &pred, &attempt, breakpoint, unbreakable);
|
|
|
|
// If approximation produces a valid layout without too much shrinking,
|
|
// exact layout is guaranteed to find the same layout. If, however, the
|
|
// line is overfull, we do not have this guarantee. Then, our bound
|
|
// becomes useless and actively harmful (it could be lower than what
|
|
// optimal layout produces). Thus, we immediately bail with an infinite
|
|
// bound in this case.
|
|
if ratio < metrics.min_ratio {
|
|
return Cost::INFINITY;
|
|
}
|
|
|
|
pred = attempt;
|
|
start = end;
|
|
exact += line_cost;
|
|
}
|
|
|
|
exact
|
|
}
|
|
|
|
/// Compute the stretch ratio and cost of a line.
|
|
#[allow(clippy::too_many_arguments)]
|
|
fn ratio_and_cost(
|
|
p: &Preparation,
|
|
metrics: &CostMetrics,
|
|
available_width: Abs,
|
|
pred: &Line,
|
|
attempt: &Line,
|
|
breakpoint: Breakpoint,
|
|
unbreakable: bool,
|
|
) -> (f64, Cost) {
|
|
let ratio = raw_ratio(
|
|
p,
|
|
available_width,
|
|
attempt.width,
|
|
attempt.stretchability(),
|
|
attempt.shrinkability(),
|
|
attempt.justifiables(),
|
|
);
|
|
|
|
let cost = raw_cost(
|
|
metrics,
|
|
breakpoint,
|
|
ratio,
|
|
attempt.justify,
|
|
unbreakable,
|
|
pred.dash.is_some() && attempt.dash.is_some(),
|
|
false,
|
|
);
|
|
|
|
(ratio, cost)
|
|
}
|
|
|
|
/// Determine the stretch ratio for a line given raw metrics.
|
|
///
|
|
/// - A ratio < min_ratio indicates an overfull line.
|
|
/// - A negative ratio indicates a line that needs shrinking.
|
|
/// - A ratio of zero indicates a perfect line.
|
|
/// - A positive ratio indicates a line that needs stretching.
|
|
fn raw_ratio(
|
|
p: &Preparation,
|
|
available_width: Abs,
|
|
line_width: Abs,
|
|
stretchability: Abs,
|
|
shrinkability: Abs,
|
|
justifiables: usize,
|
|
) -> f64 {
|
|
// Determine how much the line's spaces would need to be stretched
|
|
// to make it the desired width.
|
|
let mut delta = available_width - line_width;
|
|
|
|
// Avoid possible floating point errors in previous calculation.
|
|
if delta.approx_eq(Abs::zero()) {
|
|
delta = Abs::zero();
|
|
}
|
|
|
|
// Determine how much stretch or shrink is natural.
|
|
let adjustability = if delta >= Abs::zero() { stretchability } else { shrinkability };
|
|
|
|
// Observations:
|
|
// - `delta` is negative for a line that needs shrinking and positive for a
|
|
// line that needs stretching.
|
|
// - `adjustability` must be non-negative to make sense.
|
|
// - `ratio` inherits the sign of `delta`.
|
|
let mut ratio = delta / adjustability.max(Abs::zero());
|
|
|
|
// The most likely cause of a NaN result is that `delta` was zero. This
|
|
// often happens with monospace fonts and CJK texts. It means that the line
|
|
// already fits perfectly, so `ratio` should be zero then.
|
|
if ratio.is_nan() {
|
|
ratio = 0.0;
|
|
}
|
|
|
|
// If the ratio exceeds 1, we should stretch above the natural
|
|
// stretchability using justifiables.
|
|
if ratio > 1.0 {
|
|
// We should stretch the line above its stretchability. Now
|
|
// calculate the extra amount. Also, don't divide by zero.
|
|
let extra_stretch = (delta - adjustability) / justifiables.max(1) as f64;
|
|
// Normalize the amount by half the em size.
|
|
ratio = 1.0 + extra_stretch / (p.config.font_size / 2.0);
|
|
}
|
|
|
|
// The min value must be < MIN_RATIO, but how much smaller doesn't matter
|
|
// since overfull lines have hard-coded huge costs anyway.
|
|
//
|
|
// The max value is clamped to 10 since it doesn't really matter whether a
|
|
// line is stretched 10x or 20x.
|
|
ratio.clamp(MIN_RATIO - 1.0, 10.0)
|
|
}
|
|
|
|
/// Compute the cost of a line given raw metrics.
|
|
///
|
|
/// This mostly follows the formula in the Knuth-Plass paper, but there are some
|
|
/// adjustments.
|
|
fn raw_cost(
|
|
metrics: &CostMetrics,
|
|
breakpoint: Breakpoint,
|
|
ratio: f64,
|
|
justify: bool,
|
|
unbreakable: bool,
|
|
consecutive_dash: bool,
|
|
approx: bool,
|
|
) -> Cost {
|
|
// Determine the stretch/shrink cost of the line.
|
|
let badness = if ratio < metrics.min_ratio(approx) {
|
|
// Overfull line always has maximum cost.
|
|
1_000_000.0
|
|
} else if breakpoint != Breakpoint::Mandatory || justify || ratio < 0.0 {
|
|
// If the line shall be justified or needs shrinking, it has normal
|
|
// badness with cost 100|ratio|^3. We limit the ratio to 10 as to not
|
|
// get to close to our maximum cost.
|
|
100.0 * ratio.abs().powi(3)
|
|
} else {
|
|
// If the line shouldn't be justified and doesn't need shrink, we don't
|
|
// pay any cost.
|
|
0.0
|
|
};
|
|
|
|
// Compute penalties.
|
|
let mut penalty = 0.0;
|
|
|
|
// Penalize runts (lone words before a mandatory break / at the end).
|
|
if unbreakable && breakpoint == Breakpoint::Mandatory {
|
|
penalty += metrics.runt_cost;
|
|
}
|
|
|
|
// Penalize hyphenation.
|
|
if let Breakpoint::Hyphen(l, r) = breakpoint {
|
|
// We penalize hyphenations close to the edges of the word (< LIMIT
|
|
// chars) extra. For each step of distance from the limit, we add 15%
|
|
// to the cost.
|
|
const LIMIT: u8 = 5;
|
|
let steps = LIMIT.saturating_sub(l) + LIMIT.saturating_sub(r);
|
|
let extra = 0.15 * steps as f64;
|
|
penalty += (1.0 + extra) * metrics.hyph_cost;
|
|
}
|
|
|
|
// Penalize two consecutive dashes extra (not necessarily hyphens).
|
|
// Knuth-Plass does this separately after the squaring, with a higher cost,
|
|
// but I couldn't find any explanation as to why.
|
|
if consecutive_dash {
|
|
penalty += metrics.hyph_cost;
|
|
}
|
|
|
|
// From the Knuth-Plass Paper: $ (1 + beta_j + pi_j)^2 $.
|
|
//
|
|
// We add one to minimize the number of lines when everything else is more
|
|
// or less equal.
|
|
(1.0 + badness + penalty).powi(2)
|
|
}
|
|
|
|
/// Calls `f` for all possible points in the text where lines can broken.
|
|
///
|
|
/// Yields for each breakpoint the text index, whether the break is mandatory
|
|
/// (after `\n`) and whether a hyphen is required (when breaking inside of a
|
|
/// word).
|
|
///
|
|
/// This is an internal instead of an external iterator because it makes the
|
|
/// code much simpler and the consumers of this function don't need the
|
|
/// composability and flexibility of external iteration anyway.
|
|
fn breakpoints(p: &Preparation, mut f: impl FnMut(usize, Breakpoint)) {
|
|
let text = p.text;
|
|
|
|
// Single breakpoint at the end for empty text.
|
|
if text.is_empty() {
|
|
f(0, Breakpoint::Mandatory);
|
|
return;
|
|
}
|
|
|
|
let hyphenate = p.config.hyphenate != Some(false);
|
|
let lb = LINEBREAK_DATA.as_borrowed();
|
|
let segmenter = match p.config.lang {
|
|
Some(Lang::CHINESE | Lang::JAPANESE) => &CJ_SEGMENTER,
|
|
_ => &SEGMENTER,
|
|
};
|
|
|
|
let mut last = 0;
|
|
let mut iter = segmenter.segment_str(text).peekable();
|
|
|
|
loop {
|
|
// Special case for links. UAX #14 doesn't handle them well.
|
|
let (head, tail) = text.split_at(last);
|
|
if head.ends_with("://") || tail.starts_with("www.") {
|
|
let (link, _) = link_prefix(tail);
|
|
linebreak_link(link, |i| f(last + i, Breakpoint::Normal));
|
|
last += link.len();
|
|
while iter.peek().is_some_and(|&p| p < last) {
|
|
iter.next();
|
|
}
|
|
}
|
|
|
|
// Get the next UAX #14 linebreak opportunity.
|
|
let Some(point) = iter.next() else { break };
|
|
|
|
// Skip breakpoint if there is no char before it. icu4x generates one
|
|
// at offset 0, but we don't want it.
|
|
let Some(c) = text[..point].chars().next_back() else { continue };
|
|
|
|
// Find out whether the last break was mandatory by checking against
|
|
// rules LB4 and LB5, special-casing the end of text according to LB3.
|
|
// See also: https://docs.rs/icu_segmenter/latest/icu_segmenter/struct.LineSegmenter.html
|
|
let breakpoint = if point == text.len() {
|
|
Breakpoint::Mandatory
|
|
} else {
|
|
match lb.get(c) {
|
|
// Fix for: https://github.com/unicode-org/icu4x/issues/4146
|
|
LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ => continue,
|
|
LineBreak::MandatoryBreak
|
|
| LineBreak::CarriageReturn
|
|
| LineBreak::LineFeed
|
|
| LineBreak::NextLine => Breakpoint::Mandatory,
|
|
_ => Breakpoint::Normal,
|
|
}
|
|
};
|
|
|
|
// Hyphenate between the last and current breakpoint.
|
|
if hyphenate && last < point {
|
|
for segment in text[last..point].split_word_bounds() {
|
|
if !segment.is_empty() && segment.chars().all(char::is_alphabetic) {
|
|
hyphenations(p, &lb, last, segment, &mut f);
|
|
}
|
|
last += segment.len();
|
|
}
|
|
}
|
|
|
|
// Call `f` for the UAX #14 break opportunity.
|
|
f(point, breakpoint);
|
|
last = point;
|
|
}
|
|
}
|
|
|
|
/// Generate breakpoints for hyphenations within a word.
|
|
fn hyphenations(
|
|
p: &Preparation,
|
|
lb: &CodePointMapDataBorrowed<LineBreak>,
|
|
mut offset: usize,
|
|
word: &str,
|
|
mut f: impl FnMut(usize, Breakpoint),
|
|
) {
|
|
let Some(lang) = lang_at(p, offset) else { return };
|
|
let count = word.chars().count();
|
|
let end = offset + word.len();
|
|
|
|
let mut chars = 0;
|
|
for syllable in hypher::hyphenate(word, lang) {
|
|
offset += syllable.len();
|
|
chars += syllable.chars().count();
|
|
|
|
// Don't hyphenate after the final syllable.
|
|
if offset == end {
|
|
continue;
|
|
}
|
|
|
|
// Filter out hyphenation opportunities where hyphenation was actually
|
|
// disabled.
|
|
if !hyphenate_at(p, offset) {
|
|
continue;
|
|
}
|
|
|
|
// Filter out forbidden hyphenation opportunities.
|
|
if matches!(
|
|
syllable.chars().next_back().map(|c| lb.get(c)),
|
|
Some(LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ)
|
|
) {
|
|
continue;
|
|
}
|
|
|
|
// Determine the number of codepoints before and after the hyphenation.
|
|
let l = chars.saturating_as::<u8>();
|
|
let r = (count - chars).saturating_as::<u8>();
|
|
|
|
// Call `f` for the word-internal hyphenation opportunity.
|
|
f(offset, Breakpoint::Hyphen(l, r));
|
|
}
|
|
}
|
|
|
|
/// Produce linebreak opportunities for a link.
|
|
fn linebreak_link(link: &str, mut f: impl FnMut(usize)) {
|
|
#[derive(PartialEq)]
|
|
enum Class {
|
|
Alphabetic,
|
|
Digit,
|
|
Open,
|
|
Other,
|
|
}
|
|
|
|
impl Class {
|
|
fn of(c: char) -> Self {
|
|
if c.is_alphabetic() {
|
|
Class::Alphabetic
|
|
} else if c.is_numeric() {
|
|
Class::Digit
|
|
} else if matches!(c, '(' | '[') {
|
|
Class::Open
|
|
} else {
|
|
Class::Other
|
|
}
|
|
}
|
|
}
|
|
|
|
let mut offset = 0;
|
|
let mut prev = Class::Other;
|
|
|
|
for (end, c) in link.char_indices() {
|
|
let class = Class::of(c);
|
|
|
|
// Emit opportunities when going from
|
|
// - other -> other
|
|
// - alphabetic -> numeric
|
|
// - numeric -> alphabetic
|
|
// Never before/after opening delimiters.
|
|
if end > 0
|
|
&& prev != Class::Open
|
|
&& if class == Class::Other { prev == Class::Other } else { class != prev }
|
|
{
|
|
let piece = &link[offset..end];
|
|
if piece.len() < 16 {
|
|
// For bearably long segments, emit them as one.
|
|
offset = end;
|
|
f(offset);
|
|
} else {
|
|
// If it gets very long (e.g. a hash in the URL), just allow a
|
|
// break at every char.
|
|
for c in piece.chars() {
|
|
offset += c.len_utf8();
|
|
f(offset);
|
|
}
|
|
}
|
|
}
|
|
|
|
prev = class;
|
|
}
|
|
}
|
|
|
|
/// Whether hyphenation is enabled at the given offset.
|
|
fn hyphenate_at(p: &Preparation, offset: usize) -> bool {
|
|
p.config.hyphenate.unwrap_or_else(|| {
|
|
let (_, item) = p.get(offset);
|
|
match item.text() {
|
|
Some(text) => TextElem::hyphenate_in(text.styles).unwrap_or(p.config.justify),
|
|
None => false,
|
|
}
|
|
})
|
|
}
|
|
|
|
/// The text language at the given offset.
|
|
fn lang_at(p: &Preparation, offset: usize) -> Option<hypher::Lang> {
|
|
let lang = p.config.lang.or_else(|| {
|
|
let (_, item) = p.get(offset);
|
|
let styles = item.text()?.styles;
|
|
Some(TextElem::lang_in(styles))
|
|
})?;
|
|
|
|
let bytes = lang.as_str().as_bytes().try_into().ok()?;
|
|
hypher::Lang::from_iso(bytes)
|
|
}
|
|
|
|
/// Resolved metrics relevant for cost computation.
|
|
struct CostMetrics {
|
|
min_ratio: f64,
|
|
min_approx_ratio: f64,
|
|
approx_hyphen_width: Abs,
|
|
hyph_cost: Cost,
|
|
runt_cost: Cost,
|
|
}
|
|
|
|
impl CostMetrics {
|
|
/// Compute shared metrics for inline layout optimization.
|
|
fn compute(p: &Preparation) -> Self {
|
|
Self {
|
|
// When justifying, we may stretch spaces below their natural width.
|
|
min_ratio: if p.config.justify { MIN_RATIO } else { 0.0 },
|
|
min_approx_ratio: if p.config.justify { MIN_APPROX_RATIO } else { 0.0 },
|
|
// Approximate hyphen width for estimates.
|
|
approx_hyphen_width: Em::new(0.33).at(p.config.font_size),
|
|
// Costs.
|
|
hyph_cost: DEFAULT_HYPH_COST * p.config.costs.hyphenation().get(),
|
|
runt_cost: DEFAULT_RUNT_COST * p.config.costs.runt().get(),
|
|
}
|
|
}
|
|
|
|
/// The minimum line ratio we allow for shrinking. For approximate layout,
|
|
/// we allow less because otherwise we get an invalid layout fairly often,
|
|
/// which makes our bound useless.
|
|
fn min_ratio(&self, approx: bool) -> f64 {
|
|
if approx {
|
|
self.min_approx_ratio
|
|
} else {
|
|
self.min_ratio
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Estimated line metrics.
|
|
///
|
|
/// Allows to get a quick estimate of a metric for a line between two byte
|
|
/// positions.
|
|
struct Estimates {
|
|
widths: CumulativeVec<Abs>,
|
|
stretchability: CumulativeVec<Abs>,
|
|
shrinkability: CumulativeVec<Abs>,
|
|
justifiables: CumulativeVec<usize>,
|
|
}
|
|
|
|
impl Estimates {
|
|
/// Compute estimations for approximate Knuth-Plass layout.
|
|
fn compute(p: &Preparation) -> Self {
|
|
let cap = p.text.len();
|
|
|
|
let mut widths = CumulativeVec::with_capacity(cap);
|
|
let mut stretchability = CumulativeVec::with_capacity(cap);
|
|
let mut shrinkability = CumulativeVec::with_capacity(cap);
|
|
let mut justifiables = CumulativeVec::with_capacity(cap);
|
|
|
|
for (range, item) in p.items.iter() {
|
|
if let Item::Text(shaped) = item {
|
|
for g in shaped.glyphs.iter() {
|
|
let byte_len = g.range.len();
|
|
let stretch = g.stretchability().0 + g.stretchability().1;
|
|
let shrink = g.shrinkability().0 + g.shrinkability().1;
|
|
widths.push(byte_len, g.x_advance.at(shaped.size));
|
|
stretchability.push(byte_len, stretch.at(shaped.size));
|
|
shrinkability.push(byte_len, shrink.at(shaped.size));
|
|
justifiables.push(byte_len, g.is_justifiable() as usize);
|
|
}
|
|
} else {
|
|
widths.push(range.len(), item.natural_width());
|
|
}
|
|
|
|
widths.adjust(range.end);
|
|
stretchability.adjust(range.end);
|
|
shrinkability.adjust(range.end);
|
|
justifiables.adjust(range.end);
|
|
}
|
|
|
|
Self {
|
|
widths,
|
|
stretchability,
|
|
shrinkability,
|
|
justifiables,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// An accumulative array of a metric.
|
|
struct CumulativeVec<T> {
|
|
total: T,
|
|
summed: Vec<T>,
|
|
}
|
|
|
|
impl<T> CumulativeVec<T>
|
|
where
|
|
T: Default + Copy + Add<Output = T> + Sub<Output = T>,
|
|
{
|
|
/// Create a new instance with the given capacity.
|
|
fn with_capacity(capacity: usize) -> Self {
|
|
let total = T::default();
|
|
let mut summed = Vec::with_capacity(capacity);
|
|
summed.push(total);
|
|
Self { total, summed }
|
|
}
|
|
|
|
/// Adjust to cover the given byte length.
|
|
fn adjust(&mut self, len: usize) {
|
|
self.summed.resize(len, self.total);
|
|
}
|
|
|
|
/// Adds a new segment with the given byte length and metric.
|
|
fn push(&mut self, byte_len: usize, metric: T) {
|
|
self.total = self.total + metric;
|
|
for _ in 0..byte_len {
|
|
self.summed.push(self.total);
|
|
}
|
|
}
|
|
|
|
/// Estimates the metrics for the line spanned by the range.
|
|
#[track_caller]
|
|
fn estimate(&self, range: Range) -> T {
|
|
self.get(range.end) - self.get(range.start)
|
|
}
|
|
|
|
/// Get the metric at the given byte position.
|
|
#[track_caller]
|
|
fn get(&self, index: usize) -> T {
|
|
match index.checked_sub(1) {
|
|
None => T::default(),
|
|
Some(i) => self.summed[i],
|
|
}
|
|
}
|
|
}
|