Refine linebreak algorithm for better Chinese justification (#701)
@ -457,22 +457,35 @@ impl<'a> Line<'a> {
|
|||||||
self.items().skip(start).take(end - start)
|
self.items().skip(start).take(end - start)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// How many justifiable glyphs the line contains.
|
/// How many glyphs are in the text where we can insert additional
|
||||||
|
/// space when encountering underfull lines.
|
||||||
fn justifiables(&self) -> usize {
|
fn justifiables(&self) -> usize {
|
||||||
let mut count = 0;
|
let mut count = 0;
|
||||||
for shaped in self.items().filter_map(Item::text) {
|
for shaped in self.items().filter_map(Item::text) {
|
||||||
count += shaped.justifiables();
|
count += shaped.justifiables();
|
||||||
}
|
}
|
||||||
|
// CJK character at line end should not be adjusted.
|
||||||
|
if self
|
||||||
|
.items()
|
||||||
|
.last()
|
||||||
|
.and_then(Item::text)
|
||||||
|
.map(|s| s.cjk_justifiable_at_last())
|
||||||
|
.unwrap_or(false)
|
||||||
|
{
|
||||||
|
count -= 1;
|
||||||
|
}
|
||||||
|
|
||||||
count
|
count
|
||||||
}
|
}
|
||||||
|
|
||||||
/// How much of the line is stretchable spaces.
|
/// How much the line can stretch.
|
||||||
fn stretch(&self) -> Abs {
|
fn stretchability(&self) -> Abs {
|
||||||
let mut stretch = Abs::zero();
|
self.items().filter_map(Item::text).map(|s| s.stretchability()).sum()
|
||||||
for shaped in self.items().filter_map(Item::text) {
|
}
|
||||||
stretch += shaped.stretch();
|
|
||||||
}
|
/// How much the line can shrink.
|
||||||
stretch
|
fn shrinkability(&self) -> Abs {
|
||||||
|
self.items().filter_map(Item::text).map(|s| s.shrinkability()).sum()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The sum of fractions in the line.
|
/// The sum of fractions in the line.
|
||||||
@ -835,10 +848,9 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
|
|||||||
|
|
||||||
// Cost parameters.
|
// Cost parameters.
|
||||||
const HYPH_COST: Cost = 0.5;
|
const HYPH_COST: Cost = 0.5;
|
||||||
const CONSECUTIVE_DASH_COST: Cost = 30.0;
|
const CONSECUTIVE_DASH_COST: Cost = 300.0;
|
||||||
const MAX_COST: Cost = 1_000_000.0;
|
const MAX_COST: Cost = 1_000_000.0;
|
||||||
const MIN_COST: Cost = -MAX_COST;
|
const MIN_RATIO: f64 = -1.0;
|
||||||
const MIN_RATIO: f64 = -0.15;
|
|
||||||
|
|
||||||
// Dynamic programming table.
|
// Dynamic programming table.
|
||||||
let mut active = 0;
|
let mut active = 0;
|
||||||
@ -864,13 +876,30 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
|
|||||||
// Determine how much the line's spaces would need to be stretched
|
// Determine how much the line's spaces would need to be stretched
|
||||||
// to make it the desired width.
|
// to make it the desired width.
|
||||||
let delta = width - attempt.width;
|
let delta = width - attempt.width;
|
||||||
let mut ratio = delta / attempt.stretch();
|
// Determine how much stretching or shrinking is permitted.
|
||||||
if ratio.is_infinite() {
|
let adjust = if delta >= Abs::zero() {
|
||||||
ratio = delta / (em / 2.0);
|
attempt.stretchability()
|
||||||
|
} else {
|
||||||
|
attempt.shrinkability()
|
||||||
|
};
|
||||||
|
// Ideally, the ratio should be between -1.0 and 1.0, but sometimes a value above 1.0
|
||||||
|
// is possible, in which case the line is underfull.
|
||||||
|
let mut ratio = delta / adjust;
|
||||||
|
if ratio.is_nan() {
|
||||||
|
// The line is not stretchable, but it just fits.
|
||||||
|
// This often happens with monospace fonts and CJK texts.
|
||||||
|
ratio = 0.0;
|
||||||
|
}
|
||||||
|
if ratio.is_infinite() {
|
||||||
|
// The line's not stretchable, we calculate the ratio in another way...
|
||||||
|
ratio = delta / (em / 2.0);
|
||||||
|
// ...and because it is underfull/overfull, make sure the ratio is at least 1.0.
|
||||||
|
if ratio > 0.0 {
|
||||||
|
ratio += 1.0;
|
||||||
|
} else {
|
||||||
|
ratio -= 1.0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// At some point, it doesn't matter any more.
|
|
||||||
ratio = ratio.min(10.0);
|
|
||||||
|
|
||||||
// Determine the cost of the line.
|
// Determine the cost of the line.
|
||||||
let min_ratio = if attempt.justify { MIN_RATIO } else { 0.0 };
|
let min_ratio = if attempt.justify { MIN_RATIO } else { 0.0 };
|
||||||
@ -883,11 +912,15 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
|
|||||||
active = i + 1;
|
active = i + 1;
|
||||||
MAX_COST
|
MAX_COST
|
||||||
} else if mandatory || eof {
|
} else if mandatory || eof {
|
||||||
// This is a mandatory break and the line is not overfull, so it
|
// This is a mandatory break and the line is not overfull, so
|
||||||
// has minimum cost. All breakpoints before this one become
|
// all breakpoints before this one become inactive since no line
|
||||||
// inactive since no line can span above the mandatory break.
|
// can span above the mandatory break.
|
||||||
active = k;
|
active = k;
|
||||||
MIN_COST + if attempt.justify { ratio.powi(3).abs() } else { 0.0 }
|
if attempt.justify {
|
||||||
|
ratio.powi(3).abs()
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// Normal line with cost of |ratio^3|.
|
// Normal line with cost of |ratio^3|.
|
||||||
ratio.powi(3).abs()
|
ratio.powi(3).abs()
|
||||||
@ -898,6 +931,12 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
|
|||||||
cost += HYPH_COST;
|
cost += HYPH_COST;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// In Knuth paper, cost = (1 + 100|r|^3 + p)^2 + a,
|
||||||
|
// where r is the ratio, p=50 is the penalty, and a=3000 is the consecutive penalty.
|
||||||
|
// We scale the formula down (the terms inside the square by 100, a by 10), resulting in (0.01 + |r|^3 + p)^2 + a,
|
||||||
|
// where p=0.5 and a=300
|
||||||
|
cost = (0.01 + cost).powi(2);
|
||||||
|
|
||||||
// Penalize two consecutive dashes (not necessarily hyphens) extra.
|
// Penalize two consecutive dashes (not necessarily hyphens) extra.
|
||||||
if attempt.dash && pred.line.dash {
|
if attempt.dash && pred.line.dash {
|
||||||
cost += CONSECUTIVE_DASH_COST;
|
cost += CONSECUTIVE_DASH_COST;
|
||||||
@ -1233,13 +1272,32 @@ fn commit(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Determine how much to justify each space.
|
// Determine how much additional space is needed.
|
||||||
|
// The justification_ratio is for the first step justification,
|
||||||
|
// extra_justification is for the last step.
|
||||||
|
// For more info on multi-step justification, see Procedures for Inter-
|
||||||
|
// Character Space Expansion in W3C document Chinese Layout Requirements.
|
||||||
let fr = line.fr();
|
let fr = line.fr();
|
||||||
let mut justification = Abs::zero();
|
let mut justification_ratio = 0.0;
|
||||||
if remaining < Abs::zero() || (line.justify && fr.is_zero()) {
|
let mut extra_justification = Abs::zero();
|
||||||
|
|
||||||
|
let shrink = line.shrinkability();
|
||||||
|
let stretch = line.stretchability();
|
||||||
|
if remaining < Abs::zero() && shrink > Abs::zero() {
|
||||||
|
// Attempt to reduce the length of the line, using shrinkability.
|
||||||
|
justification_ratio = (remaining / shrink).max(-1.0);
|
||||||
|
remaining = (remaining + shrink).min(Abs::zero());
|
||||||
|
} else if line.justify && fr.is_zero() {
|
||||||
|
// Attempt to increase the length of the line, using stretchability.
|
||||||
|
if stretch > Abs::zero() {
|
||||||
|
justification_ratio = (remaining / stretch).min(1.0);
|
||||||
|
remaining = (remaining - stretch).max(Abs::zero());
|
||||||
|
}
|
||||||
|
|
||||||
let justifiables = line.justifiables();
|
let justifiables = line.justifiables();
|
||||||
if justifiables > 0 {
|
if justifiables > 0 && remaining > Abs::zero() {
|
||||||
justification = remaining / justifiables as f64;
|
// Underfull line, distribute the extra space.
|
||||||
|
extra_justification = remaining / justifiables as f64;
|
||||||
remaining = Abs::zero();
|
remaining = Abs::zero();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1275,7 +1333,7 @@ fn commit(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
Item::Text(shaped) => {
|
Item::Text(shaped) => {
|
||||||
let frame = shaped.build(vt, justification);
|
let frame = shaped.build(vt, justification_ratio, extra_justification);
|
||||||
push(&mut offset, frame);
|
push(&mut offset, frame);
|
||||||
}
|
}
|
||||||
Item::Frame(frame) => {
|
Item::Frame(frame) => {
|
||||||
|
@ -70,22 +70,42 @@ impl ShapedGlyph {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Whether the glyph is justifiable.
|
/// Whether the glyph is justifiable.
|
||||||
///
|
|
||||||
/// Typst's basic justification strategy is to stretch all the spaces
|
|
||||||
/// in a line until the line fills the available width. However, some
|
|
||||||
/// scripts (notably Chinese and Japanese) don't use spaces.
|
|
||||||
///
|
|
||||||
/// In Japanese typography, the convention is to insert space evenly
|
|
||||||
/// between all glyphs. I assume it's the same in Chinese.
|
|
||||||
pub fn is_justifiable(&self) -> bool {
|
pub fn is_justifiable(&self) -> bool {
|
||||||
self.is_space() || is_spaceless(self.c.script())
|
self.is_space() || self.is_cjk() || self.is_cjk_punctuation()
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/// Does this script separate its words using spaces?
|
pub fn is_cjk(&self) -> bool {
|
||||||
fn is_spaceless(script: Script) -> bool {
|
use Script::*;
|
||||||
use Script::*;
|
matches!(self.c.script(), Hiragana | Katakana | Han)
|
||||||
matches!(script, Hiragana | Katakana | Han)
|
}
|
||||||
|
|
||||||
|
pub fn is_cjk_punctuation(&self) -> bool {
|
||||||
|
matches!(self.c, ',' | '。' | '、' | ':' | ';')
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The stretchability of the character.
|
||||||
|
pub fn stretchability(&self) -> Em {
|
||||||
|
let width = self.x_advance;
|
||||||
|
if self.is_space() {
|
||||||
|
// The number for spaces is from Knuth-Plass' paper
|
||||||
|
width / 2.0
|
||||||
|
} else {
|
||||||
|
Em::zero()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The shrinkability of the character.
|
||||||
|
pub fn shrinkability(&self) -> Em {
|
||||||
|
let width = self.x_advance;
|
||||||
|
if self.is_space() {
|
||||||
|
// The number for spaces is from Knuth-Plass' paper
|
||||||
|
width / 3.0
|
||||||
|
} else if self.is_cjk_punctuation() {
|
||||||
|
width / 2.0
|
||||||
|
} else {
|
||||||
|
Em::zero()
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A side you can go toward.
|
/// A side you can go toward.
|
||||||
@ -101,7 +121,12 @@ impl<'a> ShapedText<'a> {
|
|||||||
///
|
///
|
||||||
/// The `justification` defines how much extra advance width each
|
/// The `justification` defines how much extra advance width each
|
||||||
/// [justifiable glyph](ShapedGlyph::is_justifiable) will get.
|
/// [justifiable glyph](ShapedGlyph::is_justifiable) will get.
|
||||||
pub fn build(&self, vt: &Vt, justification: Abs) -> Frame {
|
pub fn build(
|
||||||
|
&self,
|
||||||
|
vt: &Vt,
|
||||||
|
justification_ratio: f64,
|
||||||
|
extra_justification: Abs,
|
||||||
|
) -> Frame {
|
||||||
let (top, bottom) = self.measure(vt);
|
let (top, bottom) = self.measure(vt);
|
||||||
let size = Size::new(self.width, top + bottom);
|
let size = Size::new(self.width, top + bottom);
|
||||||
|
|
||||||
@ -120,19 +145,25 @@ impl<'a> ShapedText<'a> {
|
|||||||
let pos = Point::new(offset, top + shift - y_offset.at(self.size));
|
let pos = Point::new(offset, top + shift - y_offset.at(self.size));
|
||||||
let glyphs = group
|
let glyphs = group
|
||||||
.iter()
|
.iter()
|
||||||
.map(|glyph| Glyph {
|
.map(|glyph| {
|
||||||
id: glyph.glyph_id,
|
let mut justification = Em::zero();
|
||||||
x_advance: glyph.x_advance
|
if justification_ratio < 0.0 {
|
||||||
+ if glyph.is_justifiable() {
|
justification += glyph.shrinkability() * justification_ratio
|
||||||
frame.size_mut().x += justification;
|
} else {
|
||||||
Em::from_length(justification, self.size)
|
justification += glyph.stretchability() * justification_ratio
|
||||||
} else {
|
}
|
||||||
Em::zero()
|
if glyph.is_justifiable() {
|
||||||
},
|
justification += Em::from_length(extra_justification, self.size)
|
||||||
x_offset: glyph.x_offset,
|
}
|
||||||
c: glyph.c,
|
frame.size_mut().x += justification.at(self.size);
|
||||||
span: glyph.span,
|
Glyph {
|
||||||
offset: glyph.offset,
|
id: glyph.glyph_id,
|
||||||
|
x_advance: glyph.x_advance + justification,
|
||||||
|
x_offset: glyph.x_offset,
|
||||||
|
c: glyph.c,
|
||||||
|
span: glyph.span,
|
||||||
|
offset: glyph.offset,
|
||||||
|
}
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
@ -200,17 +231,35 @@ impl<'a> ShapedText<'a> {
|
|||||||
(top, bottom)
|
(top, bottom)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// How many justifiable glyphs the text contains.
|
/// How many glyphs are in the text where we can insert additional
|
||||||
|
/// space when encountering underfull lines.
|
||||||
pub fn justifiables(&self) -> usize {
|
pub fn justifiables(&self) -> usize {
|
||||||
self.glyphs.iter().filter(|g| g.is_justifiable()).count()
|
self.glyphs.iter().filter(|g| g.is_justifiable()).count()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The width of the spaces in the text.
|
/// Whether the last glyph is a CJK character which should not be justified
|
||||||
pub fn stretch(&self) -> Abs {
|
/// on line end.
|
||||||
|
pub fn cjk_justifiable_at_last(&self) -> bool {
|
||||||
|
self.glyphs
|
||||||
|
.last()
|
||||||
|
.map(|g| g.is_cjk() || g.is_cjk_punctuation())
|
||||||
|
.unwrap_or(false)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The stretchability of the text.
|
||||||
|
pub fn stretchability(&self) -> Abs {
|
||||||
self.glyphs
|
self.glyphs
|
||||||
.iter()
|
.iter()
|
||||||
.filter(|g| g.is_justifiable())
|
.map(|g| g.stretchability())
|
||||||
.map(|g| g.x_advance)
|
.sum::<Em>()
|
||||||
|
.at(self.size)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The shrinkability of the text
|
||||||
|
pub fn shrinkability(&self) -> Abs {
|
||||||
|
self.glyphs
|
||||||
|
.iter()
|
||||||
|
.map(|g| g.shrinkability())
|
||||||
.sum::<Em>()
|
.sum::<Em>()
|
||||||
.at(self.size)
|
.at(self.size)
|
||||||
}
|
}
|
||||||
|
BIN
tests/ref/layout/par-justify-cjk.png
Normal file
After Width: | Height: | Size: 43 KiB |
Before Width: | Height: | Size: 29 KiB After Width: | Height: | Size: 12 KiB |
Before Width: | Height: | Size: 68 KiB After Width: | Height: | Size: 67 KiB |
Before Width: | Height: | Size: 19 KiB After Width: | Height: | Size: 18 KiB |
23
tests/typ/layout/par-justify-cjk.typ
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
// Test Chinese text in narrow lines.
|
||||||
|
|
||||||
|
// In Chinese typography, line length should be multiples of the character size
|
||||||
|
// and the line ends should be aligned with each other.
|
||||||
|
// Most Chinese publications do not use hanging punctuation at line end.
|
||||||
|
#set page(width: auto)
|
||||||
|
#set par(justify: true)
|
||||||
|
#set text(overhang: false, lang: "zh")
|
||||||
|
|
||||||
|
#rect(inset: 0pt, width: 80pt, fill: rgb("eee"))[
|
||||||
|
中文维基百科使用汉字书写,汉字是汉族或华人的共同文字,是中国大陆、新加坡、马来西亚、台湾、香港、澳门的唯一官方文字或官方文字之一。25.9%,而美国和荷兰则分別占13.7%及8.2%。近年來,中国大陆地区的维基百科编辑者正在迅速增加;
|
||||||
|
]
|
||||||
|
|
||||||
|
---
|
||||||
|
// Japanese typography is more complex, make sure it is at least a bit sensible.
|
||||||
|
#set page(width: auto)
|
||||||
|
#set par(justify: true)
|
||||||
|
#set text(lang: "jp")
|
||||||
|
#rect(inset: 0pt, width: 80pt, fill: rgb("eee"))[
|
||||||
|
ウィキペディア(英: Wikipedia)は、世界中のボランティアの共同作業によって執筆及び作成されるフリーの多言語インターネット百科事典である。主に寄付に依って活動している非営利団体「ウィキメディア財団」が所有・運営している。
|
||||||
|
|
||||||
|
専門家によるオンライン百科事典プロジェクトNupedia(ヌーペディア)を前身として、2001年1月、ラリー・サンガーとジミー・ウェールズ(英: Jimmy Donal "Jimbo" Wales)により英語でプロジェクトが開始された。
|
||||||
|
]
|
@ -20,12 +20,6 @@ D
|
|||||||
A B C #linebreak(justify: true)
|
A B C #linebreak(justify: true)
|
||||||
D E F #linebreak(justify: true)
|
D E F #linebreak(justify: true)
|
||||||
|
|
||||||
---
|
|
||||||
// Test that justificating chinese text is at least a bit sensible.
|
|
||||||
#set page(width: 200pt)
|
|
||||||
#set par(justify: true)
|
|
||||||
中文维基百科使用汉字书写,汉字是汉族或华人的共同文字,是中国大陆、新加坡、马来西亚、台湾、香港、澳门的唯一官方文字或官方文字之一。25.9%,而美国和荷兰则分別占13.7%及8.2%。近年來,中国大陆地区的维基百科编辑者正在迅速增加;
|
|
||||||
|
|
||||||
---
|
---
|
||||||
// Test that there are no hiccups with justification enabled and
|
// Test that there are no hiccups with justification enabled and
|
||||||
// basically empty paragraph.
|
// basically empty paragraph.
|
||||||
|