Fix hyphenation outside of words (#4498)

This commit is contained in:
Laurenz 2024-07-04 15:27:43 +02:00 committed by GitHub
parent 0ef672c347
commit 129a4d600c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 57 additions and 41 deletions

View File

@ -1,6 +1,6 @@
use std::ops::{Add, Sub};
use icu_properties::maps::CodePointMapData;
use icu_properties::maps::{CodePointMapData, CodePointMapDataBorrowed};
use icu_properties::sets::CodePointSetData;
use icu_properties::LineBreak;
use icu_provider::AsDeserializingBufferProvider;
@ -8,6 +8,7 @@ use icu_provider_adapters::fork::ForkByKeyProvider;
use icu_provider_blob::BlobDataProvider;
use icu_segmenter::LineSegmenter;
use once_cell::sync::Lazy;
use unicode_segmentation::UnicodeSegmentation;
use super::*;
use crate::engine::Engine;
@ -630,7 +631,7 @@ fn raw_cost(
/// This is an internal instead of an external iterator because it makes the
/// code much simpler and the consumers of this function don't need the
/// composability and flexibility of external iteration anyway.
fn breakpoints<'a>(p: &'a Preparation<'a>, mut f: impl FnMut(usize, Breakpoint)) {
fn breakpoints(p: &Preparation, mut f: impl FnMut(usize, Breakpoint)) {
let text = p.text;
// Single breakpoint at the end for empty text.
@ -661,7 +662,7 @@ fn breakpoints<'a>(p: &'a Preparation<'a>, mut f: impl FnMut(usize, Breakpoint))
}
}
// Get the UAX #14 linebreak opportunities.
// Get the next UAX #14 linebreak opportunity.
let Some(point) = iter.next() else { break };
// Skip breakpoint if there is no char before it. icu4x generates one
@ -686,32 +687,44 @@ fn breakpoints<'a>(p: &'a Preparation<'a>, mut f: impl FnMut(usize, Breakpoint))
};
// Hyphenate between the last and current breakpoint.
'hyphenate: {
if !hyphenate {
break 'hyphenate;
}
// Extract a hyphenatable "word".
let word = &text[last..point].trim_end_matches(|c: char| !c.is_alphabetic());
if word.is_empty() {
break 'hyphenate;
}
let end = last + word.len();
if hyphenate {
let mut offset = last;
for segment in text[last..point].split_word_bounds() {
if !segment.is_empty() && segment.chars().all(char::is_alphabetic) {
hyphenations(p, &lb, offset, segment, &mut f);
}
offset += segment.len();
}
}
// Determine the language to hyphenate this word in.
let Some(lang) = lang_at(p, last) else { break 'hyphenate };
// Call `f` for the UAX #14 break opportunity.
f(point, breakpoint);
last = point;
}
}
/// Generate breakpoints for hyphenations within a word.
fn hyphenations(
p: &Preparation,
lb: &CodePointMapDataBorrowed<LineBreak>,
mut offset: usize,
word: &str,
mut f: impl FnMut(usize, Breakpoint),
) {
let Some(lang) = lang_at(p, offset) else { return };
let end = offset + word.len();
for syllable in hypher::hyphenate(word, lang) {
// Don't hyphenate after the final syllable.
offset += syllable.len();
// Don't hyphenate after the final syllable.
if offset == end {
continue;
}
// Filter out hyphenation opportunities where hyphenation was
// actually disabled.
// Filter out hyphenation opportunities where hyphenation was actually
// disabled.
if !hyphenate_at(p, offset) {
continue;
}
@ -727,13 +740,6 @@ fn breakpoints<'a>(p: &'a Preparation<'a>, mut f: impl FnMut(usize, Breakpoint))
// Call `f` for the word-internal hyphenation opportunity.
f(offset, Breakpoint::Hyphen);
}
}
// Call `f` for the UAX #14 break opportunity.
f(point, breakpoint);
last = point;
}
}
/// Produce linebreak opportunities for a link.

Binary file not shown.

After

Width:  |  Height:  |  Size: 1011 B

View File

@ -50,6 +50,16 @@ It's a #emph[Tree]beard.
#set text(hyphenate: true)
#h(6pt) networks, the rest.
--- hyphenate-outside-of-words ---
// More tests for hyphenation of non-words.
#set text(hyphenate: true)
#block(width: 0pt, "doesn't")
#block(width: 0pt, "(OneNote)")
#block(width: 0pt, "(present)")
#set text(lang: "de")
#block(width: 0pt, "(bzw.)")
--- hyphenate-pt-repeat-hyphen-natural-word-breaking ---
// The word breaker naturally breaks arco-da-velha at arco-/-da-velha,
// so we shall repeat the hyphen, even though hyphenate is set to false.