Fix hyphen duplication rule for some languages (#4058)

This commit is contained in:
Gabriel Araújo 2024-05-15 10:42:13 -03:00 committed by GitHub
parent 484a0e60d8
commit 017f2f4566
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 179 additions and 22 deletions

View File

@ -298,6 +298,19 @@ impl SpanMapper {
}
}
/// A dash at the end of a line.
///
/// Classifies the character a line ends with so that later stages can tell
/// the different kinds of hyphens and dashes apart. Only a line ending in a
/// [`Dash::HardHyphen`] is a candidate for repeating the hyphen at the start
/// of the next line.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub(super) enum Dash {
/// A hyphen added to break a word.
///
/// Used when the line was broken at a hyphenation breakpoint or when its
/// trimmed text ends with a soft hyphen (U+00AD).
SoftHyphen,
/// Regular hyphen, present in a compound word, e.g. beija-flor.
///
/// Used when the trimmed text of the line ends with a literal `-`.
HardHyphen,
/// An em dash (`—`).
Long,
/// An en dash (`–`).
Short,
}
/// A layouted line, consisting of a sequence of layouted paragraph items that
/// are mostly borrowed from the preparation phase. This type enables you to
/// measure the size of a line in a range before committing to building the
@ -327,7 +340,7 @@ struct Line<'a> {
justify: bool,
/// The kind of hyphen or dash the line ends with, if any, either naturally
/// or through hyphenation.
dash: bool,
dash: Option<Dash>,
}
impl<'a> Line<'a> {
@ -814,8 +827,10 @@ fn linebreak_simple<'a>(
let mut last = None;
breakpoints(p, |end, breakpoint| {
let prepend_hyphen = lines.last().map(should_repeat_hyphen).unwrap_or(false);
// Compute the line and its size.
let mut attempt = line(engine, p, start..end, breakpoint);
let mut attempt = line(engine, p, start..end, breakpoint, prepend_hyphen);
// If the line doesn't fit anymore, we push the last fitting attempt
// into the stack and rebuild the line from the attempt's end. The
@ -824,7 +839,7 @@ fn linebreak_simple<'a>(
if let Some((last_attempt, last_end)) = last.take() {
lines.push(last_attempt);
start = last_end;
attempt = line(engine, p, start..end, breakpoint);
attempt = line(engine, p, start..end, breakpoint, prepend_hyphen);
}
}
@ -894,7 +909,7 @@ fn linebreak_optimized<'a>(
let mut table = vec![Entry {
pred: 0,
total: 0.0,
line: line(engine, p, 0..0, Breakpoint::Mandatory),
line: line(engine, p, 0..0, Breakpoint::Mandatory, false),
}];
let em = p.size;
@ -908,8 +923,9 @@ fn linebreak_optimized<'a>(
for (i, pred) in table.iter().enumerate().skip(active) {
// Layout the line.
let start = pred.line.end;
let prepend_hyphen = should_repeat_hyphen(&pred.line);
let attempt = line(engine, p, start..end, breakpoint);
let attempt = line(engine, p, start..end, breakpoint, prepend_hyphen);
// Determine how much the line's spaces would need to be stretched
// to make it the desired width.
@ -987,7 +1003,7 @@ fn linebreak_optimized<'a>(
cost = (0.01 + cost).powi(2);
// Penalize two consecutive dashes (not necessarily hyphens) extra.
if attempt.dash && pred.line.dash {
if attempt.dash.is_some() && pred.line.dash.is_some() {
cost += CONSECUTIVE_DASH_COST;
}
@ -1022,6 +1038,7 @@ fn line<'a>(
p: &'a Preparation,
mut range: Range,
breakpoint: Breakpoint,
prepend_hyphen: bool,
) -> Line<'a> {
let end = range.end;
let mut justify =
@ -1037,7 +1054,7 @@ fn line<'a>(
last: None,
width: Abs::zero(),
justify,
dash: false,
dash: None,
};
}
@ -1047,7 +1064,7 @@ fn line<'a>(
// Reshape the last item if it's split in half or hyphenated.
let mut last = None;
let mut dash = false;
let mut dash = None;
if let Some((Item::Text(shaped), before)) = inner.split_last() {
// Compute the range we want to shape, trimming whitespace at the
// end of the line.
@ -1062,7 +1079,17 @@ fn line<'a>(
// Deal with hyphens, dashes and justification.
let shy = trimmed.ends_with('\u{ad}');
let hyphen = breakpoint == Breakpoint::Hyphen;
dash = hyphen || shy || trimmed.ends_with(['-', '–', '—']);
dash = if hyphen || shy {
Some(Dash::SoftHyphen)
} else if trimmed.ends_with('-') {
Some(Dash::HardHyphen)
} else if trimmed.ends_with('–') {
Some(Dash::Short)
} else if trimmed.ends_with('—') {
Some(Dash::Long)
} else {
None
};
justify |= text.ends_with('\u{2028}');
// Deal with CJK punctuation at line ends.
@ -1079,7 +1106,11 @@ fn line<'a>(
// need the shaped empty string to make the line the appropriate
// height. That is the case exactly if the string is empty and there
// are no other items in the line.
if hyphen || start + shaped.text.len() > range.end || maybe_adjust_last_glyph {
if hyphen
|| start + shaped.text.len() > range.end
|| maybe_adjust_last_glyph
|| prepend_hyphen
{
if hyphen || start < range.end || before.is_empty() {
let mut reshaped = shaped.reshape(engine, &p.spans, start..range.end);
if hyphen || shy {
@ -1131,7 +1162,10 @@ fn line<'a>(
let end = range.end.min(base + shaped.text.len());
// Reshape if necessary.
if range.start + shaped.text.len() > end || maybe_adjust_first_glyph {
if range.start + shaped.text.len() > end
|| maybe_adjust_first_glyph
|| prepend_hyphen
{
// If the range is empty, we don't want to push an empty text item.
if range.start < end {
let reshaped = shaped.reshape(engine, &p.spans, range.start..end);
@ -1143,6 +1177,15 @@ fn line<'a>(
}
}
if prepend_hyphen {
let reshaped = first.as_mut().or(last.as_mut()).and_then(Item::text_mut);
if let Some(reshaped) = reshaped {
let width_before = reshaped.width;
reshaped.prepend_hyphen(engine, p.fallback);
width += reshaped.width - width_before;
}
}
if maybe_adjust_first_glyph {
let reshaped = first.as_mut().or(last.as_mut()).and_then(Item::text_mut);
if let Some(reshaped) = reshaped {
@ -1446,3 +1489,49 @@ fn overhang(c: char) -> f64 {
_ => 0.0,
}
}
/// Decides whether the hyphen that ends `pred_line` has to be repeated at the
/// start of the line that follows it.
///
/// Returns `true` only if the predecessor line ends with a hard hyphen that
/// is not followed by trimmed-out whitespace, and the language of the line's
/// last text item is one whose orthography asks for the repetition.
fn should_repeat_hyphen(pred_line: &Line) -> bool {
    // Only a hard hyphen can be repeated on the next line. If whitespace was
    // trimmed after it, the hyphen is not joining two parts of a word. This
    // is the case in a text like "...kebab é a -melhor- comida que existe",
    // where the hyphens are a kind of emphasis marker.
    let ends_with_hard_hyphen = matches!(pred_line.dash, Some(Dash::HardHyphen));
    let whitespace_was_trimmed = pred_line.trimmed.end != pred_line.end;
    if !ends_with_hard_hyphen || whitespace_was_trimmed {
        return false;
    }

    // The language is read from the last item of the predecessor line, which
    // must be a text item.
    let shaped = match pred_line.last.as_ref() {
        Some(Item::Text(shaped)) => shaped,
        _ => return false,
    };

    // Repetition is limited to the languages that require it. For background
    // see the discussion at https://github.com/typst/typst/issues/3235
    match shaped.lang {
        // Spanish repeats the hyphen unless the word right after it starts
        // with a capital letter, in which case it must not be repeated.
        //
        // See § 4.1.1.1.2.e on the "Ortografía de la lengua española"
        // https://www.rae.es/ortografía/como-signo-de-división-de-palabras-a-final-de-línea
        Lang::SPANISH => pred_line.bidi.text[pred_line.end..]
            .chars()
            .next()
            .map_or(false, |following| !following.is_uppercase()),

        // Languages that always repeat the hyphen:
        // - Lower Sorbian: see https://dolnoserbski.de/ortografija/psawidla/K3
        // - Czech: see https://prirucka.ujc.cas.cz/?id=164
        // - Croatian: see http://pravopis.hr/pravilo/spojnica/68/
        // - Polish: see https://www.ortograf.pl/zasady-pisowni/lacznik-zasady-pisowni
        // - Portuguese: see https://www2.senado.leg.br/bdsf/bitstream/handle/id/508145/000997415.pdf (Base XX)
        // - Slovak: see https://www.zones.sk/studentske-prace/gramatika/10620-pravopis-rozdelovanie-slov/
        Lang::LOWER_SORBIAN
        | Lang::CZECH
        | Lang::CROATIAN
        | Lang::POLISH
        | Lang::PORTUGUESE
        | Lang::SLOVAK => true,

        // Every other language keeps the hyphen on the first line only.
        _ => false,
    }
}

View File

@ -447,6 +447,15 @@ impl<'a> ShapedText<'a> {
/// Appends a hyphen to the end of the text.
///
/// Thin wrapper that forwards to `insert_hyphen` with `Side::Right`;
/// `engine` and `fallback` are passed through unchanged.
pub fn push_hyphen(&mut self, engine: &Engine, fallback: bool) {
    self.insert_hyphen(engine, fallback, Side::Right);
}
/// Prepends a hyphen to the start of the text.
///
/// Thin wrapper that forwards to `insert_hyphen` with `Side::Left`;
/// `engine` and `fallback` are passed through unchanged.
pub fn prepend_hyphen(&mut self, engine: &Engine, fallback: bool) {
    self.insert_hyphen(engine, fallback, Side::Left);
}
fn insert_hyphen(&mut self, engine: &Engine, fallback: bool, side: Side) {
let world = engine.world;
let book = world.book();
let fallback_func = if fallback {
@ -464,17 +473,17 @@ impl<'a> ShapedText<'a> {
let ttf = font.ttf();
let glyph_id = ttf.glyph_index('-')?;
let x_advance = font.to_em(ttf.glyph_hor_advance(glyph_id)?);
let range = self
.glyphs
.last()
.map(|g| g.range.end..g.range.end)
// In the unlikely chance that we hyphenate after an empty line,
// ensure that the glyph range still falls after self.base so
// that subtracting either of the endpoints by self.base doesn't
// underflow. See <https://github.com/typst/typst/issues/2283>.
.unwrap_or_else(|| self.base..self.base);
let range = match side {
Side::Left => self.glyphs.first().map(|g| g.range.start..g.range.start),
Side::Right => self.glyphs.last().map(|g| g.range.end..g.range.end),
}
// In the unlikely chance that we hyphenate after an empty line,
// ensure that the glyph range still falls after self.base so
// that subtracting either of the endpoints by self.base doesn't
// underflow. See <https://github.com/typst/typst/issues/2283>.
.unwrap_or_else(|| self.base..self.base);
self.width += x_advance.at(self.size);
self.glyphs.to_mut().push(ShapedGlyph {
let glyph = ShapedGlyph {
font,
glyph_id: glyph_id.0,
x_advance,
@ -487,7 +496,11 @@ impl<'a> ShapedText<'a> {
span: (Span::detached(), 0),
is_justifiable: false,
script: Script::Common,
});
};
match side {
Side::Left => self.glyphs.to_mut().insert(0, glyph),
Side::Right => self.glyphs.to_mut().push(glyph),
}
Some(())
});
}

View File

@ -57,6 +57,7 @@ impl Lang {
pub const BOKMÅL: Self = Self(*b"nb ", 2);
pub const CATALAN: Self = Self(*b"ca ", 2);
pub const CHINESE: Self = Self(*b"zh ", 2);
pub const CROATIAN: Self = Self(*b"hr ", 2);
pub const CZECH: Self = Self(*b"cs ", 2);
pub const DANISH: Self = Self(*b"da ", 2);
pub const DUTCH: Self = Self(*b"nl ", 2);
@ -70,12 +71,14 @@ impl Lang {
pub const HUNGARIAN: Self = Self(*b"hu ", 2);
pub const ITALIAN: Self = Self(*b"it ", 2);
pub const JAPANESE: Self = Self(*b"ja ", 2);
pub const LOWER_SORBIAN: Self = Self(*b"dsb", 3);
pub const NYNORSK: Self = Self(*b"nn ", 2);
pub const POLISH: Self = Self(*b"pl ", 2);
pub const PORTUGUESE: Self = Self(*b"pt ", 2);
pub const ROMANIAN: Self = Self(*b"ro ", 2);
pub const RUSSIAN: Self = Self(*b"ru ", 2);
pub const SERBIAN: Self = Self(*b"sr ", 2);
pub const SLOVAK: Self = Self(*b"sk ", 2);
pub const SLOVENIAN: Self = Self(*b"sl ", 2);
pub const SPANISH: Self = Self(*b"es ", 2);
pub const SWEDISH: Self = Self(*b"sv ", 2);

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 983 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.3 KiB

View File

@ -50,6 +50,58 @@ It's a #emph[Tree]beard.
#set text(hyphenate: true)
#h(6pt) networks, the rest.
--- hyphenate-pt-repeat-hyphen-natural-word-breaking ---
// The word breaker naturally breaks arco-da-velha at arco-/-da-velha,
// so we shall repeat the hyphen, even though hyphenate is set to false.
#set page(width: 4cm)
#set text(lang: "pt")
Alguma coisa no arco-da-velha é algo que está muito longe.
--- hyphenate-pt-repeat-hyphen-hyphenate-true ---
#set page(width: 4cm)
#set text(lang: "pt", hyphenate: true)
Alguma coisa no arco-da-velha é algo que está muito longe.
--- hyphenate-pt-repeat-hyphen-hyphenate-true-with-emphasis ---
#set page(width: 4cm)
#set text(lang: "pt", hyphenate: true)
Alguma coisa no _arco-da-velha_ é algo que está muito longe.
--- hyphenate-pt-no-repeat-hyphen ---
#set page(width: 4cm)
#set text(lang: "pt", hyphenate: true)
Um médico otorrinolaringologista cuida da garganta do paciente.
--- hyphenate-pt-dash-emphasis ---
// If the hyphen is followed by a space we shall not repeat the hyphen
// at the next line
#set page(width: 4cm)
#set text(lang: "pt", hyphenate: true)
Quebabe é a -melhor- comida que existe.
--- hyphenate-es-repeat-hyphen ---
#set page(width: 6cm)
#set text(lang: "es", hyphenate: true)
Lo que entendemos por nivel léxico-semántico, en cuanto su sentido más
gramatical: es aquel que estudia el origen y forma de las palabras de
un idioma.
--- hyphenate-es-captalized-names ---
// If the hyphen is followed by a capitalized word we shall not repeat
// the hyphen at the next line
#set page(width: 6.2cm)
#set text(lang: "es", hyphenate: true)
Tras el estallido de la contienda Ruiz-Giménez fue detenido junto a sus
dos hermanos y puesto bajo custodia por las autoridades republicanas, con
el objetivo de protegerle de las patrullas de milicianos.
--- costs-widow-orphan ---
#set page(height: 60pt)