Fix hyphen duplication rule for some languages (#4058)
@ -298,6 +298,19 @@ impl SpanMapper {
|
||||
}
|
||||
}
|
||||
|
||||
/// The kind of dash or hyphen found at the end of a line.
///
/// A line stores this as `Option<Dash>`; `None` means the line does not end
/// with any of the characters below.
|
||||
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
|
||||
pub(super) enum Dash {
|
||||
/// A hyphen that exists only because a word was broken: either the
/// breakpoint is a hyphenation point or the text ends with a soft hyphen
/// (U+00AD).
|
||||
SoftHyphen,
|
||||
/// A regular hyphen (`-`) that is part of the text itself, as in a
/// compound word, e.g. beija-flor.
|
||||
HardHyphen,
|
||||
/// An em dash (`—`).
|
||||
Long,
|
||||
/// An en dash (`–`).
|
||||
Short,
|
||||
}
|
||||
|
||||
/// A layouted line, consisting of a sequence of layouted paragraph items that
|
||||
/// are mostly borrowed from the preparation phase. This type enables you to
|
||||
/// measure the size of a line in a range before committing to building the
|
||||
@ -327,7 +340,7 @@ struct Line<'a> {
|
||||
justify: bool,
|
||||
/// Whether the line ends with a hyphen or dash, either naturally or through
|
||||
/// hyphenation.
|
||||
dash: bool,
|
||||
dash: Option<Dash>,
|
||||
}
|
||||
|
||||
impl<'a> Line<'a> {
|
||||
@ -814,8 +827,10 @@ fn linebreak_simple<'a>(
|
||||
let mut last = None;
|
||||
|
||||
breakpoints(p, |end, breakpoint| {
|
||||
let prepend_hyphen = lines.last().map(should_repeat_hyphen).unwrap_or(false);
|
||||
|
||||
// Compute the line and its size.
|
||||
let mut attempt = line(engine, p, start..end, breakpoint);
|
||||
let mut attempt = line(engine, p, start..end, breakpoint, prepend_hyphen);
|
||||
|
||||
// If the line doesn't fit anymore, we push the last fitting attempt
|
||||
// into the stack and rebuild the line from the attempt's end. The
|
||||
@ -824,7 +839,7 @@ fn linebreak_simple<'a>(
|
||||
if let Some((last_attempt, last_end)) = last.take() {
|
||||
lines.push(last_attempt);
|
||||
start = last_end;
|
||||
attempt = line(engine, p, start..end, breakpoint);
|
||||
attempt = line(engine, p, start..end, breakpoint, prepend_hyphen);
|
||||
}
|
||||
}
|
||||
|
||||
@ -894,7 +909,7 @@ fn linebreak_optimized<'a>(
|
||||
let mut table = vec![Entry {
|
||||
pred: 0,
|
||||
total: 0.0,
|
||||
line: line(engine, p, 0..0, Breakpoint::Mandatory),
|
||||
line: line(engine, p, 0..0, Breakpoint::Mandatory, false),
|
||||
}];
|
||||
|
||||
let em = p.size;
|
||||
@ -908,8 +923,9 @@ fn linebreak_optimized<'a>(
|
||||
for (i, pred) in table.iter().enumerate().skip(active) {
|
||||
// Layout the line.
|
||||
let start = pred.line.end;
|
||||
let prepend_hyphen = should_repeat_hyphen(&pred.line);
|
||||
|
||||
let attempt = line(engine, p, start..end, breakpoint);
|
||||
let attempt = line(engine, p, start..end, breakpoint, prepend_hyphen);
|
||||
|
||||
// Determine how much the line's spaces would need to be stretched
|
||||
// to make it the desired width.
|
||||
@ -987,7 +1003,7 @@ fn linebreak_optimized<'a>(
|
||||
cost = (0.01 + cost).powi(2);
|
||||
|
||||
// Penalize two consecutive dashes (not necessarily hyphens) extra.
|
||||
if attempt.dash && pred.line.dash {
|
||||
if attempt.dash.is_some() && pred.line.dash.is_some() {
|
||||
cost += CONSECUTIVE_DASH_COST;
|
||||
}
|
||||
|
||||
@ -1022,6 +1038,7 @@ fn line<'a>(
|
||||
p: &'a Preparation,
|
||||
mut range: Range,
|
||||
breakpoint: Breakpoint,
|
||||
prepend_hyphen: bool,
|
||||
) -> Line<'a> {
|
||||
let end = range.end;
|
||||
let mut justify =
|
||||
@ -1037,7 +1054,7 @@ fn line<'a>(
|
||||
last: None,
|
||||
width: Abs::zero(),
|
||||
justify,
|
||||
dash: false,
|
||||
dash: None,
|
||||
};
|
||||
}
|
||||
|
||||
@ -1047,7 +1064,7 @@ fn line<'a>(
|
||||
|
||||
// Reshape the last item if it's split in half or hyphenated.
|
||||
let mut last = None;
|
||||
let mut dash = false;
|
||||
let mut dash = None;
|
||||
if let Some((Item::Text(shaped), before)) = inner.split_last() {
|
||||
// Compute the range we want to shape, trimming whitespace at the
|
||||
// end of the line.
|
||||
@ -1062,7 +1079,17 @@ fn line<'a>(
|
||||
// Deal with hyphens, dashes and justification.
|
||||
let shy = trimmed.ends_with('\u{ad}');
|
||||
let hyphen = breakpoint == Breakpoint::Hyphen;
|
||||
dash = hyphen || shy || trimmed.ends_with(['-', '–', '—']);
|
||||
dash = if hyphen || shy {
|
||||
Some(Dash::SoftHyphen)
|
||||
} else if trimmed.ends_with('-') {
|
||||
Some(Dash::HardHyphen)
|
||||
} else if trimmed.ends_with('–') {
|
||||
Some(Dash::Short)
|
||||
} else if trimmed.ends_with('—') {
|
||||
Some(Dash::Long)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
justify |= text.ends_with('\u{2028}');
|
||||
|
||||
// Deal with CJK punctuation at line ends.
|
||||
@ -1079,7 +1106,11 @@ fn line<'a>(
|
||||
// need the shaped empty string to make the line the appropriate
|
||||
// height. That is the case exactly if the string is empty and there
|
||||
// are no other items in the line.
|
||||
if hyphen || start + shaped.text.len() > range.end || maybe_adjust_last_glyph {
|
||||
if hyphen
|
||||
|| start + shaped.text.len() > range.end
|
||||
|| maybe_adjust_last_glyph
|
||||
|| prepend_hyphen
|
||||
{
|
||||
if hyphen || start < range.end || before.is_empty() {
|
||||
let mut reshaped = shaped.reshape(engine, &p.spans, start..range.end);
|
||||
if hyphen || shy {
|
||||
@ -1131,7 +1162,10 @@ fn line<'a>(
|
||||
let end = range.end.min(base + shaped.text.len());
|
||||
|
||||
// Reshape if necessary.
|
||||
if range.start + shaped.text.len() > end || maybe_adjust_first_glyph {
|
||||
if range.start + shaped.text.len() > end
|
||||
|| maybe_adjust_first_glyph
|
||||
|| prepend_hyphen
|
||||
{
|
||||
// If the range is empty, we don't want to push an empty text item.
|
||||
if range.start < end {
|
||||
let reshaped = shaped.reshape(engine, &p.spans, range.start..end);
|
||||
@ -1143,6 +1177,15 @@ fn line<'a>(
|
||||
}
|
||||
}
|
||||
|
||||
if prepend_hyphen {
|
||||
let reshaped = first.as_mut().or(last.as_mut()).and_then(Item::text_mut);
|
||||
if let Some(reshaped) = reshaped {
|
||||
let width_before = reshaped.width;
|
||||
reshaped.prepend_hyphen(engine, p.fallback);
|
||||
width += reshaped.width - width_before;
|
||||
}
|
||||
}
|
||||
|
||||
if maybe_adjust_first_glyph {
|
||||
let reshaped = first.as_mut().or(last.as_mut()).and_then(Item::text_mut);
|
||||
if let Some(reshaped) = reshaped {
|
||||
@ -1446,3 +1489,49 @@ fn overhang(c: char) -> f64 {
|
||||
_ => 0.0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Whether the hyphen should repeat at the start of the next line.
|
||||
fn should_repeat_hyphen(pred_line: &Line) -> bool {
|
||||
// If the predecessor line does not end with a Dash::HardHyphen, we shall
|
||||
// not place a hyphen at the start of the next line.
|
||||
if pred_line.dash != Some(Dash::HardHyphen) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// If there's a trimmed out space, we needn't repeat the hyphen. That's the
|
||||
// case of a text like "...kebab é a -melhor- comida que existe", where the
|
||||
// hyphens are a kind of emphasis marker.
|
||||
if pred_line.trimmed.end != pred_line.end {
|
||||
return false;
|
||||
}
|
||||
|
||||
// The hyphen should repeat only in the languages that require that feature.
|
||||
// For more information see the discussion at https://github.com/typst/typst/issues/3235
|
||||
let Some(Item::Text(shape)) = pred_line.last.as_ref() else { return false };
|
||||
|
||||
match shape.lang {
|
||||
// - Lower Sorbian: see https://dolnoserbski.de/ortografija/psawidla/K3
|
||||
// - Czech: see https://prirucka.ujc.cas.cz/?id=164
|
||||
// - Croatian: see http://pravopis.hr/pravilo/spojnica/68/
|
||||
// - Polish: see https://www.ortograf.pl/zasady-pisowni/lacznik-zasady-pisowni
|
||||
// - Portuguese: see https://www2.senado.leg.br/bdsf/bitstream/handle/id/508145/000997415.pdf (Base XX)
|
||||
// - Slovak: see https://www.zones.sk/studentske-prace/gramatika/10620-pravopis-rozdelovanie-slov/
|
||||
Lang::LOWER_SORBIAN
|
||||
| Lang::CZECH
|
||||
| Lang::CROATIAN
|
||||
| Lang::POLISH
|
||||
| Lang::PORTUGUESE
|
||||
| Lang::SLOVAK => true,
|
||||
// In Spanish the hyphen is required only if the word next to hyphen is
|
||||
// not capitalized. Otherwise, the hyphen must not be repeated.
|
||||
//
|
||||
// See § 4.1.1.1.2.e on the "Ortografía de la lengua española"
|
||||
// https://www.rae.es/ortografía/como-signo-de-división-de-palabras-a-final-de-línea
|
||||
Lang::SPANISH => pred_line.bidi.text[pred_line.end..]
|
||||
.chars()
|
||||
.next()
|
||||
.map(|c| !c.is_uppercase())
|
||||
.unwrap_or(false),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
@ -447,6 +447,15 @@ impl<'a> ShapedText<'a> {
|
||||
|
||||
/// Push a hyphen to the end of the text.
///
/// Delegates to `insert_hyphen` with `Side::Right`; `engine` and `fallback`
/// are passed through unchanged.
|
||||
pub fn push_hyphen(&mut self, engine: &Engine, fallback: bool) {
|
||||
self.insert_hyphen(engine, fallback, Side::Right)
|
||||
}
|
||||
|
||||
/// Prepend a hyphen to the start of the text.
///
/// Delegates to `insert_hyphen` with `Side::Left`; `engine` and `fallback`
/// are passed through unchanged.
|
||||
pub fn prepend_hyphen(&mut self, engine: &Engine, fallback: bool) {
|
||||
self.insert_hyphen(engine, fallback, Side::Left)
|
||||
}
|
||||
|
||||
fn insert_hyphen(&mut self, engine: &Engine, fallback: bool, side: Side) {
|
||||
let world = engine.world;
|
||||
let book = world.book();
|
||||
let fallback_func = if fallback {
|
||||
@ -464,17 +473,17 @@ impl<'a> ShapedText<'a> {
|
||||
let ttf = font.ttf();
|
||||
let glyph_id = ttf.glyph_index('-')?;
|
||||
let x_advance = font.to_em(ttf.glyph_hor_advance(glyph_id)?);
|
||||
let range = self
|
||||
.glyphs
|
||||
.last()
|
||||
.map(|g| g.range.end..g.range.end)
|
||||
// In the unlikely chance that we hyphenate after an empty line,
|
||||
// ensure that the glyph range still falls after self.base so
|
||||
// that subtracting either of the endpoints by self.base doesn't
|
||||
// underflow. See <https://github.com/typst/typst/issues/2283>.
|
||||
.unwrap_or_else(|| self.base..self.base);
|
||||
let range = match side {
|
||||
Side::Left => self.glyphs.first().map(|g| g.range.start..g.range.start),
|
||||
Side::Right => self.glyphs.last().map(|g| g.range.end..g.range.end),
|
||||
}
|
||||
// In the unlikely chance that we hyphenate after an empty line,
|
||||
// ensure that the glyph range still falls after self.base so
|
||||
// that subtracting either of the endpoints by self.base doesn't
|
||||
// underflow. See <https://github.com/typst/typst/issues/2283>.
|
||||
.unwrap_or_else(|| self.base..self.base);
|
||||
self.width += x_advance.at(self.size);
|
||||
self.glyphs.to_mut().push(ShapedGlyph {
|
||||
let glyph = ShapedGlyph {
|
||||
font,
|
||||
glyph_id: glyph_id.0,
|
||||
x_advance,
|
||||
@ -487,7 +496,11 @@ impl<'a> ShapedText<'a> {
|
||||
span: (Span::detached(), 0),
|
||||
is_justifiable: false,
|
||||
script: Script::Common,
|
||||
});
|
||||
};
|
||||
match side {
|
||||
Side::Left => self.glyphs.to_mut().insert(0, glyph),
|
||||
Side::Right => self.glyphs.to_mut().push(glyph),
|
||||
}
|
||||
Some(())
|
||||
});
|
||||
}
|
||||
|
@ -57,6 +57,7 @@ impl Lang {
|
||||
pub const BOKMÅL: Self = Self(*b"nb ", 2);
|
||||
pub const CATALAN: Self = Self(*b"ca ", 2);
|
||||
pub const CHINESE: Self = Self(*b"zh ", 2);
|
||||
pub const CROATIAN: Self = Self(*b"hr ", 2);
|
||||
pub const CZECH: Self = Self(*b"cs ", 2);
|
||||
pub const DANISH: Self = Self(*b"da ", 2);
|
||||
pub const DUTCH: Self = Self(*b"nl ", 2);
|
||||
@ -70,12 +71,14 @@ impl Lang {
|
||||
pub const HUNGARIAN: Self = Self(*b"hu ", 2);
|
||||
pub const ITALIAN: Self = Self(*b"it ", 2);
|
||||
pub const JAPANESE: Self = Self(*b"ja ", 2);
|
||||
pub const LOWER_SORBIAN: Self = Self(*b"dsb", 3);
|
||||
pub const NYNORSK: Self = Self(*b"nn ", 2);
|
||||
pub const POLISH: Self = Self(*b"pl ", 2);
|
||||
pub const PORTUGUESE: Self = Self(*b"pt ", 2);
|
||||
pub const ROMANIAN: Self = Self(*b"ro ", 2);
|
||||
pub const RUSSIAN: Self = Self(*b"ru ", 2);
|
||||
pub const SERBIAN: Self = Self(*b"sr ", 2);
|
||||
pub const SLOVAK: Self = Self(*b"sk ", 2);
|
||||
pub const SLOVENIAN: Self = Self(*b"sl ", 2);
|
||||
pub const SPANISH: Self = Self(*b"es ", 2);
|
||||
pub const SWEDISH: Self = Self(*b"sv ", 2);
|
||||
|
BIN
tests/ref/hyphenate-es-captalized-names.png
Normal file
After Width: | Height: | Size: 4.1 KiB |
BIN
tests/ref/hyphenate-es-repeat-hyphen.png
Normal file
After Width: | Height: | Size: 3.1 KiB |
BIN
tests/ref/hyphenate-pt-dash-emphasis.png
Normal file
After Width: | Height: | Size: 983 B |
BIN
tests/ref/hyphenate-pt-no-repeat-hyphen.png
Normal file
After Width: | Height: | Size: 1.5 KiB |
After Width: | Height: | Size: 1.3 KiB |
BIN
tests/ref/hyphenate-pt-repeat-hyphen-hyphenate-true.png
Normal file
After Width: | Height: | Size: 1.3 KiB |
BIN
tests/ref/hyphenate-pt-repeat-hyphen-natural-word-breaking.png
Normal file
After Width: | Height: | Size: 1.3 KiB |
@ -50,6 +50,58 @@ It's a #emph[Tree]beard.
|
||||
#set text(hyphenate: true)
|
||||
#h(6pt) networks, the rest.
|
||||
|
||||
--- hyphenate-pt-repeat-hyphen-natural-word-breaking ---
|
||||
// The word breaker naturally breaks arco-da-velha at arco-/-da-velha,
|
||||
// so we shall repeat the hyphen, even though hyphenate is set to false.
|
||||
#set page(width: 4cm)
|
||||
#set text(lang: "pt")
|
||||
|
||||
Alguma coisa no arco-da-velha é algo que está muito longe.
|
||||
|
||||
--- hyphenate-pt-repeat-hyphen-hyphenate-true ---
|
||||
#set page(width: 4cm)
|
||||
#set text(lang: "pt", hyphenate: true)
|
||||
|
||||
Alguma coisa no arco-da-velha é algo que está muito longe.
|
||||
|
||||
--- hyphenate-pt-repeat-hyphen-hyphenate-true-with-emphasis ---
|
||||
#set page(width: 4cm)
|
||||
#set text(lang: "pt", hyphenate: true)
|
||||
|
||||
Alguma coisa no _arco-da-velha_ é algo que está muito longe.
|
||||
|
||||
--- hyphenate-pt-no-repeat-hyphen ---
|
||||
#set page(width: 4cm)
|
||||
#set text(lang: "pt", hyphenate: true)
|
||||
|
||||
Um médico otorrinolaringologista cuida da garganta do paciente.
|
||||
|
||||
--- hyphenate-pt-dash-emphasis ---
|
||||
// If the hyphen is followed by a space we shall not repeat the hyphen
|
||||
// at the next line
|
||||
#set page(width: 4cm)
|
||||
#set text(lang: "pt", hyphenate: true)
|
||||
|
||||
Quebabe é a -melhor- comida que existe.
|
||||
|
||||
--- hyphenate-es-repeat-hyphen ---
|
||||
#set page(width: 6cm)
|
||||
#set text(lang: "es", hyphenate: true)
|
||||
|
||||
Lo que entendemos por nivel léxico-semántico, en cuanto su sentido más
|
||||
gramatical: es aquel que estudia el origen y forma de las palabras de
|
||||
un idioma.
|
||||
|
||||
--- hyphenate-es-captalized-names ---
|
||||
// If the hyphen is followed by a capitalized word we shall not repeat
|
||||
// the hyphen at the next line
|
||||
#set page(width: 6.2cm)
|
||||
#set text(lang: "es", hyphenate: true)
|
||||
|
||||
Tras el estallido de la contienda Ruiz-Giménez fue detenido junto a sus
|
||||
dos hermanos y puesto bajo custodia por las autoridades republicanas, con
|
||||
el objetivo de protegerle de las patrullas de milicianos.
|
||||
|
||||
--- costs-widow-orphan ---
|
||||
#set page(height: 60pt)
|
||||
|
||||
|