Fix hyphenation outside of words (#4498)

This commit is contained in:
Laurenz 2024-07-04 15:27:43 +02:00 committed by GitHub
parent 0ef672c347
commit 129a4d600c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 57 additions and 41 deletions

View File

@ -1,6 +1,6 @@
use std::ops::{Add, Sub};
use icu_properties::maps::CodePointMapData;
use icu_properties::maps::{CodePointMapData, CodePointMapDataBorrowed};
use icu_properties::sets::CodePointSetData;
use icu_properties::LineBreak;
use icu_provider::AsDeserializingBufferProvider;
@ -8,6 +8,7 @@ use icu_provider_adapters::fork::ForkByKeyProvider;
use icu_provider_blob::BlobDataProvider;
use icu_segmenter::LineSegmenter;
use once_cell::sync::Lazy;
use unicode_segmentation::UnicodeSegmentation;
use super::*;
use crate::engine::Engine;
@ -630,7 +631,7 @@ fn raw_cost(
/// This is an internal instead of an external iterator because it makes the
/// code much simpler and the consumers of this function don't need the
/// composability and flexibility of external iteration anyway.
fn breakpoints<'a>(p: &'a Preparation<'a>, mut f: impl FnMut(usize, Breakpoint)) {
fn breakpoints(p: &Preparation, mut f: impl FnMut(usize, Breakpoint)) {
let text = p.text;
// Single breakpoint at the end for empty text.
@ -661,7 +662,7 @@ fn breakpoints<'a>(p: &'a Preparation<'a>, mut f: impl FnMut(usize, Breakpoint))
}
}
// Get the UAX #14 linebreak opportunities.
// Get the next UAX #14 linebreak opportunity.
let Some(point) = iter.next() else { break };
// Skip breakpoint if there is no char before it. icu4x generates one
@ -686,46 +687,13 @@ fn breakpoints<'a>(p: &'a Preparation<'a>, mut f: impl FnMut(usize, Breakpoint))
};
// Hyphenate between the last and current breakpoint.
'hyphenate: {
if !hyphenate {
break 'hyphenate;
}
// Extract a hyphenatable "word".
let word = &text[last..point].trim_end_matches(|c: char| !c.is_alphabetic());
if word.is_empty() {
break 'hyphenate;
}
let end = last + word.len();
if hyphenate {
let mut offset = last;
// Determine the language to hyphenate this word in.
let Some(lang) = lang_at(p, last) else { break 'hyphenate };
for syllable in hypher::hyphenate(word, lang) {
// Don't hyphenate after the final syllable.
offset += syllable.len();
if offset == end {
continue;
for segment in text[last..point].split_word_bounds() {
if !segment.is_empty() && segment.chars().all(char::is_alphabetic) {
hyphenations(p, &lb, offset, segment, &mut f);
}
// Filter out hyphenation opportunities where hyphenation was
// actually disabled.
if !hyphenate_at(p, offset) {
continue;
}
// Filter out forbidden hyphenation opportunities.
if matches!(
syllable.chars().next_back().map(|c| lb.get(c)),
Some(LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ)
) {
continue;
}
// Call `f` for the word-internal hyphenation opportunity.
f(offset, Breakpoint::Hyphen);
offset += segment.len();
}
}
@ -736,6 +704,44 @@ fn breakpoints<'a>(p: &'a Preparation<'a>, mut f: impl FnMut(usize, Breakpoint))
}
}
/// Report every permitted hyphenation opportunity inside a single word.
///
/// `word` is split into syllables by `hypher` and `f` is invoked with
/// `Breakpoint::Hyphen` at each syllable boundary that survives filtering.
/// `offset` is the position at which `word` starts; the positions handed to
/// `f` are `offset` plus the accumulated byte lengths of the syllables seen
/// so far.
///
/// Nothing is reported if `lang_at` yields no language for `offset`.
fn hyphenations(
    p: &Preparation,
    lb: &CodePointMapDataBorrowed<LineBreak>,
    offset: usize,
    word: &str,
    mut f: impl FnMut(usize, Breakpoint),
) {
    // Without a language there is nothing to hyphenate with.
    let Some(lang) = lang_at(p, offset) else { return };

    let word_end = offset + word.len();
    let mut cursor = offset;

    for syllable in hypher::hyphenate(word, lang) {
        // Move the cursor to the boundary just after this syllable.
        cursor += syllable.len();

        // A boundary is only reported when, checked in this order:
        // - it is not the end of the word (never hyphenate after the final
        //   syllable),
        // - hyphenation is actually enabled at that position, and
        // - the syllable does not end in a character whose line break class
        //   forbids breaking after it.
        let allowed = cursor != word_end
            && hyphenate_at(p, cursor)
            && !matches!(
                syllable.chars().next_back().map(|c| lb.get(c)),
                Some(LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ)
            );

        if allowed {
            // Word-internal hyphenation opportunity.
            f(cursor, Breakpoint::Hyphen);
        }
    }
}
/// Produce linebreak opportunities for a link.
fn linebreak_link(link: &str, mut f: impl FnMut(usize)) {
#[derive(PartialEq)]

Binary file not shown.

After

Width:  |  Height:  |  Size: 1011 B

View File

@ -50,6 +50,16 @@ It's a #emph[Tree]beard.
#set text(hyphenate: true)
#h(6pt) networks, the rest.
--- hyphenate-outside-of-words ---
// More tests for hyphenation of non-words.
#set text(hyphenate: true)
#block(width: 0pt, "doesn't")
#block(width: 0pt, "(OneNote)")
#block(width: 0pt, "(present)")
#set text(lang: "de")
#block(width: 0pt, "(bzw.)")
--- hyphenate-pt-repeat-hyphen-natural-word-breaking ---
// The word breaker naturally breaks arco-da-velha at arco-/-da-velha,
// so we shall repeat the hyphen, even though hyphenate is set to false.