Tune hyphenation (#4584)

This commit is contained in:
Laurenz 2024-07-19 13:47:51 +02:00 committed by GitHub
parent 4275447788
commit 3ef0991fbb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 74 additions and 68 deletions

View File

@ -133,7 +133,7 @@ pub fn line<'a>(
|| (p.justify && breakpoint != Breakpoint::Mandatory); || (p.justify && breakpoint != Breakpoint::Mandatory);
// Process dashes. // Process dashes.
let dash = if breakpoint == Breakpoint::Hyphen || full.ends_with(SHY) { let dash = if breakpoint.is_hyphen() || full.ends_with(SHY) {
Some(Dash::Soft) Some(Dash::Soft)
} else if full.ends_with(HYPHEN) { } else if full.ends_with(HYPHEN) {
Some(Dash::Hard) Some(Dash::Hard)

View File

@ -1,5 +1,6 @@
use std::ops::{Add, Sub}; use std::ops::{Add, Sub};
use az::SaturatingAs;
use icu_properties::maps::{CodePointMapData, CodePointMapDataBorrowed}; use icu_properties::maps::{CodePointMapData, CodePointMapDataBorrowed};
use icu_properties::sets::CodePointSetData; use icu_properties::sets::CodePointSetData;
use icu_properties::LineBreak; use icu_properties::LineBreak;
@ -21,10 +22,15 @@ use crate::text::{Lang, TextElem};
type Cost = f64; type Cost = f64;
// Cost parameters. // Cost parameters.
const DEFAULT_HYPH_COST: Cost = 0.5; //
const DEFAULT_RUNT_COST: Cost = 0.5; // We choose higher costs than the Knuth-Plass paper (which would be 50) because
const CONSECUTIVE_DASH_COST: Cost = 0.3; // it hyphenates way too eagerly in Typst otherwise. Could be related to the
const MAX_COST: Cost = 1_000_000.0; // ratios coming out differently since Typst doesn't have the concept of glue,
// so things work a bit differently.
const DEFAULT_HYPH_COST: Cost = 135.0;
const DEFAULT_RUNT_COST: Cost = 100.0;
// Other parameters.
const MIN_RATIO: f64 = -1.0; const MIN_RATIO: f64 = -1.0;
const MIN_APPROX_RATIO: f64 = -0.5; const MIN_APPROX_RATIO: f64 = -0.5;
const BOUND_EPS: f64 = 1e-3; const BOUND_EPS: f64 = 1e-3;
@ -65,8 +71,9 @@ pub enum Breakpoint {
Normal, Normal,
/// A mandatory breakpoint (after '\n' or at the end of the text). /// A mandatory breakpoint (after '\n' or at the end of the text).
Mandatory, Mandatory,
/// An opportunity for hyphenating. /// An opportunity for hyphenating and how many chars are before/after it
Hyphen, /// in the word.
Hyphen(u8, u8),
} }
impl Breakpoint { impl Breakpoint {
@ -95,9 +102,14 @@ impl Breakpoint {
} }
// Trim nothing further. // Trim nothing further.
Self::Hyphen => line, Self::Hyphen(..) => line,
} }
} }
/// Whether this is a hyphen breakpoint.
///
/// Returns `true` for any `Hyphen(..)` variant, regardless of the two
/// values it carries, and `false` for every other variant.
pub fn is_hyphen(self) -> bool {
matches!(self, Self::Hyphen(..))
}
} }
/// Breaks the paragraph into lines. /// Breaks the paragraph into lines.
@ -254,7 +266,6 @@ fn linebreak_optimized_bounded<'a>(
width, width,
&pred.line, &pred.line,
&attempt, &attempt,
end,
breakpoint, breakpoint,
unbreakable, unbreakable,
); );
@ -374,8 +385,6 @@ fn linebreak_optimized_approximate(
let mut prev_end = 0; let mut prev_end = 0;
breakpoints(p, |end, breakpoint| { breakpoints(p, |end, breakpoint| {
let at_end = end == p.text.len();
// Find the optimal predecessor. // Find the optimal predecessor.
let mut best: Option<Entry> = None; let mut best: Option<Entry> = None;
for (pred_index, pred) in table.iter().enumerate().skip(active) { for (pred_index, pred) in table.iter().enumerate().skip(active) {
@ -384,13 +393,12 @@ fn linebreak_optimized_approximate(
// Whether the line is justified. This is not 100% accurate w.r.t. // Whether the line is justified. This is not 100% accurate w.r.t.
// line()'s behaviour, but good enough. // line()'s behaviour, but good enough.
let justify = p.justify && !at_end && breakpoint != Breakpoint::Mandatory; let justify = p.justify && breakpoint != Breakpoint::Mandatory;
// We don't really know whether the line naturally ends with a dash // We don't really know whether the line naturally ends with a dash
// here, so we can miss that case, but it's ok, since all of this // here, so we can miss that case, but it's ok, since all of this
// is just an estimate. // is just an estimate.
let consecutive_dash = let consecutive_dash = pred.breakpoint.is_hyphen() && breakpoint.is_hyphen();
pred.breakpoint == Breakpoint::Hyphen && breakpoint == Breakpoint::Hyphen;
// Estimate how much the line's spaces would need to be stretched to // Estimate how much the line's spaces would need to be stretched to
// make it the desired width. We trim at the end to not take into // make it the desired width. We trim at the end to not take into
@ -401,7 +409,7 @@ fn linebreak_optimized_approximate(
p, p,
width, width,
estimates.widths.estimate(start..trimmed_end) estimates.widths.estimate(start..trimmed_end)
+ if breakpoint == Breakpoint::Hyphen { + if breakpoint.is_hyphen() {
metrics.approx_hyphen_width metrics.approx_hyphen_width
} else { } else {
Abs::zero() Abs::zero()
@ -416,7 +424,6 @@ fn linebreak_optimized_approximate(
metrics, metrics,
breakpoint, breakpoint,
line_ratio, line_ratio,
at_end,
justify, justify,
unbreakable, unbreakable,
consecutive_dash, consecutive_dash,
@ -474,17 +481,8 @@ fn linebreak_optimized_approximate(
let Entry { end, breakpoint, unbreakable, .. } = table[idx]; let Entry { end, breakpoint, unbreakable, .. } = table[idx];
let attempt = line(engine, p, start..end, breakpoint, Some(&pred)); let attempt = line(engine, p, start..end, breakpoint, Some(&pred));
let (_, line_cost) =
let (_, line_cost) = ratio_and_cost( ratio_and_cost(p, metrics, width, &pred, &attempt, breakpoint, unbreakable);
p,
metrics,
width,
&pred,
&attempt,
end,
breakpoint,
unbreakable,
);
pred = attempt; pred = attempt;
start = end; start = end;
@ -502,7 +500,6 @@ fn ratio_and_cost(
available_width: Abs, available_width: Abs,
pred: &Line, pred: &Line,
attempt: &Line, attempt: &Line,
end: usize,
breakpoint: Breakpoint, breakpoint: Breakpoint,
unbreakable: bool, unbreakable: bool,
) -> (f64, Cost) { ) -> (f64, Cost) {
@ -519,7 +516,6 @@ fn ratio_and_cost(
metrics, metrics,
breakpoint, breakpoint,
ratio, ratio,
end == p.text.len(),
attempt.justify, attempt.justify,
unbreakable, unbreakable,
pred.dash.is_some() && attempt.dash.is_some(), pred.dash.is_some() && attempt.dash.is_some(),
@ -569,57 +565,64 @@ fn raw_ratio(
} }
/// Compute the cost of a line given raw metrics. /// Compute the cost of a line given raw metrics.
#[allow(clippy::too_many_arguments)] ///
/// This mostly follows the formula in the Knuth-Plass paper, but there are some
/// adjustments.
fn raw_cost( fn raw_cost(
metrics: &CostMetrics, metrics: &CostMetrics,
breakpoint: Breakpoint, breakpoint: Breakpoint,
ratio: f64, ratio: f64,
at_end: bool,
justify: bool, justify: bool,
unbreakable: bool, unbreakable: bool,
consecutive_dash: bool, consecutive_dash: bool,
approx: bool, approx: bool,
) -> Cost { ) -> Cost {
// Determine the cost of the line. // Determine the stretch/shrink cost of the line.
let mut cost = if ratio < metrics.min_ratio(approx) { let badness = if ratio < metrics.min_ratio(approx) {
// Overfull line always has maximum cost. // Overfull line always has maximum cost.
MAX_COST 1_000_000.0
} else if breakpoint == Breakpoint::Mandatory || at_end { } else if justify || ratio < 0.0 {
// - If ratio < 0, we always need to shrink the line (even the last one). // If the line shall be justified or needs shrinking, it has normal
// - If ratio > 0, we need to stretch the line only when it is justified // badness with cost 100|ratio|^3. We limit the ratio to 10 so as not to
// (last line is not justified by default even if `p.justify` is true). // get too close to our maximum cost.
if ratio < 0.0 || (ratio > 0.0 && justify) { 100.0 * ratio.abs().min(10.0).powi(3)
ratio.powi(3).abs()
} else {
0.0
}
} else { } else {
// Normal line with cost of |ratio^3|. // If the line shouldn't be justified and doesn't need shrink, we don't
ratio.powi(3).abs() // pay any cost.
0.0
}; };
// Penalize runts (lone words in the last line). // Compute penalties.
if unbreakable && at_end { let mut penalty = 0.0;
cost += metrics.runt_cost;
// Penalize runts (lone words before a mandatory break / at the end).
if unbreakable && breakpoint == Breakpoint::Mandatory {
penalty += metrics.runt_cost;
} }
// Penalize hyphenation. // Penalize hyphenation.
if breakpoint == Breakpoint::Hyphen { if let Breakpoint::Hyphen(l, r) = breakpoint {
cost += metrics.hyph_cost; // We penalize hyphenations close to the edges of the word (< LIMIT
// chars) extra. For each step of distance from the limit, we add 15%
// to the cost.
const LIMIT: u8 = 5;
let steps = LIMIT.saturating_sub(l) + LIMIT.saturating_sub(r);
let extra = 0.15 * steps as f64;
penalty += (1.0 + extra) * metrics.hyph_cost;
} }
// In the Knuth paper, cost = (1 + 100|r|^3 + p)^2 + a, // Penalize two consecutive dashes extra (not necessarily hyphens).
// where r is the ratio, p=50 is the penalty, and a=3000 is // Knuth-Plass does this separately after the squaring, with a higher cost,
// consecutive the penalty. We divide the whole formula by 10, // but I couldn't find any explanation as to why.
// resulting (0.01 + |r|^3 + p)^2 + a, where p=0.5 and a=0.3
let mut cost = (0.01 + cost).powi(2);
// Penalize two consecutive dashes (not necessarily hyphens) extra.
if consecutive_dash { if consecutive_dash {
cost += CONSECUTIVE_DASH_COST; penalty += metrics.hyph_cost;
} }
cost // From the Knuth-Plass Paper: $ (1 + beta_j + pi_j)^2 $.
//
// We add one to minimize the number of lines when everything else is more
// or less equal.
(1.0 + badness + penalty).powi(2)
} }
/// Calls `f` for all possible points in the text where lines can be broken. /// Calls `f` for all possible points in the text where lines can be broken.
@ -711,10 +714,13 @@ fn hyphenations(
mut f: impl FnMut(usize, Breakpoint), mut f: impl FnMut(usize, Breakpoint),
) { ) {
let Some(lang) = lang_at(p, offset) else { return }; let Some(lang) = lang_at(p, offset) else { return };
let count = word.chars().count();
let end = offset + word.len(); let end = offset + word.len();
let mut chars = 0;
for syllable in hypher::hyphenate(word, lang) { for syllable in hypher::hyphenate(word, lang) {
offset += syllable.len(); offset += syllable.len();
chars += syllable.chars().count();
// Don't hyphenate after the final syllable. // Don't hyphenate after the final syllable.
if offset == end { if offset == end {
@ -735,8 +741,12 @@ fn hyphenations(
continue; continue;
} }
// Determine the number of codepoints before and after the hyphenation.
let l = chars.saturating_as::<u8>();
let r = (count - chars).saturating_as::<u8>();
// Call `f` for the word-internal hyphenation opportunity. // Call `f` for the word-internal hyphenation opportunity.
f(offset, Breakpoint::Hyphen); f(offset, Breakpoint::Hyphen(l, r));
} }
} }
@ -825,9 +835,9 @@ fn lang_at(p: &Preparation, offset: usize) -> Option<hypher::Lang> {
struct CostMetrics { struct CostMetrics {
min_ratio: f64, min_ratio: f64,
min_approx_ratio: f64, min_approx_ratio: f64,
approx_hyphen_width: Abs,
hyph_cost: Cost, hyph_cost: Cost,
runt_cost: Cost, runt_cost: Cost,
approx_hyphen_width: Abs,
} }
impl CostMetrics { impl CostMetrics {
@ -837,10 +847,11 @@ impl CostMetrics {
// When justifying, we may stretch spaces below their natural width. // When justifying, we may stretch spaces below their natural width.
min_ratio: if p.justify { MIN_RATIO } else { 0.0 }, min_ratio: if p.justify { MIN_RATIO } else { 0.0 },
min_approx_ratio: if p.justify { MIN_APPROX_RATIO } else { 0.0 }, min_approx_ratio: if p.justify { MIN_APPROX_RATIO } else { 0.0 },
hyph_cost: DEFAULT_HYPH_COST * p.costs.hyphenation().get(),
runt_cost: DEFAULT_RUNT_COST * p.costs.runt().get(),
// Approximate hyphen width for estimates. // Approximate hyphen width for estimates.
approx_hyphen_width: Em::new(0.33).at(p.size), approx_hyphen_width: Em::new(0.33).at(p.size),
// Costs.
hyph_cost: DEFAULT_HYPH_COST * p.costs.hyphenation().get(),
runt_cost: DEFAULT_RUNT_COST * p.costs.runt().get(),
} }
} }

View File

@ -512,11 +512,6 @@ pub struct TextElem {
/// default of `auto`, prevents them. More nuanced cost specification for /// default of `auto`, prevents them. More nuanced cost specification for
/// these modifications is planned for the future.) /// these modifications is planned for the future.)
/// ///
/// The default costs are an acceptable balance, but some may find that it
/// hyphenates or avoids runs too eagerly, breaking the flow of dense prose.
/// A cost of 600% (six times the normal cost) may work better for such
/// contexts.
///
/// ```example /// ```example
/// #set text(hyphenate: true, size: 11.4pt) /// #set text(hyphenate: true, size: 11.4pt)
/// #set par(justify: true) /// #set par(justify: true)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.8 KiB

After

Width:  |  Height:  |  Size: 1.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.5 KiB

After

Width:  |  Height:  |  Size: 6.4 KiB