diff --git a/Cargo.lock b/Cargo.lock index 10335d5ed..c515fb2cc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -338,6 +338,12 @@ dependencies = [ "roff", ] +[[package]] +name = "cobs" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67ba02a97a2bd10f4b59b25c7973101c79642302776489e030cd13cdab09ed15" + [[package]] name = "codespan-reporting" version = "0.11.1" @@ -803,6 +809,118 @@ dependencies = [ "cc", ] +[[package]] +name = "icu_collections" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef8302d8dfd6044d3ddb3f807a5ef3d7bbca9a574959c6d6e4dc39aa7012d0d5" +dependencies = [ + "displaydoc", + "serde", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3003f85dccfc0e238ff567693248c59153a46f4e6125ba4020b973cef4d1d335" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_properties" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce0e1aa26851f16c9e04412a5911c86b7f8768dac8f8d4c5f1c568a7e5d7a434" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_provider", + "serde", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_provider" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8dc312a7b6148f7dfe098047ae2494d12d4034f48ade58d4f353000db376e305" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "postcard", + "serde", + "stable_deref_trait", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_adapters" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4ae1e2bd0c41728b77e7c46e9afdec5e2127d1eedacc684724667d50c126bd3" +dependencies = [ + "icu_locid", + "icu_provider", + 
"tinystr", + "yoke", + "zerovec", +] + +[[package]] +name = "icu_provider_blob" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd364c9a01f791a4bc04a74cf2a1d01d9f6926a40fd5ae1c28004e1e70d8338b" +dependencies = [ + "icu_provider", + "postcard", + "serde", + "writeable", + "yoke", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd8b728b9421e93eff1d9f8681101b78fa745e0748c95c655c83f337044a7e10" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "icu_segmenter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3300a7b6bf187be98a57264ad094f11f2e062c2e8263132af010ff522ee5495" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locid", + "icu_provider", + "num-traits", + "serde", + "utf8_iter", + "zerovec", +] + [[package]] name = "idna" version = "0.3.0" @@ -1063,6 +1181,12 @@ dependencies = [ "libdeflate-sys", ] +[[package]] +name = "libm" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4" + [[package]] name = "linked-hash-map" version = "0.5.6" @@ -1085,6 +1209,12 @@ dependencies = [ "rand_chacha", ] +[[package]] +name = "litemap" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a04a5b2b6f54acba899926491d0a6c59d98012938ca2ab5befb281c034e8f94" + [[package]] name = "lock_api" version = "0.4.9" @@ -1227,6 +1357,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -1397,6 +1528,16 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "postcard" +version = "1.0.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfa512cd0d087cc9f99ad30a1bf64795b67871edbead083ffc3a4dfafa59aa00" +dependencies = [ + "cobs", + "serde", +] + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -1911,6 +2052,18 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "synstructure" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", + "unicode-xid", +] + [[package]] name = "syntect" version = "5.0.0" @@ -2056,6 +2209,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ac3f5b6856e931e15e07b478e98c8045239829a65f9156d4fa7e7788197a5ef" dependencies = [ "displaydoc", + "serde", + "zerovec", ] [[package]] @@ -2299,6 +2454,11 @@ dependencies = [ "ecow", "hayagriva", "hypher", + "icu_properties", + "icu_provider", + "icu_provider_adapters", + "icu_provider_blob", + "icu_segmenter", "kurbo", "lipsum", "log", @@ -2319,7 +2479,6 @@ dependencies = [ "unicode-math-class", "unicode-script", "unicode-segmentation", - "xi-unicode", ] [[package]] @@ -2447,6 +2606,12 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" +[[package]] +name = "unicode-xid" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" + [[package]] name = "unicode_names2" version = "0.6.0" @@ -2530,6 +2695,12 @@ dependencies = [ "svgtypes", ] +[[package]] +name = "utf8_iter" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64a8922555b9500e3d865caed19330172cd67cbf82203f1a3311d8c305cc9f33" + [[package]] name = "utf8parse" version = "0.2.1" @@ -2814,6 +2985,12 @@ dependencies = [ "memchr", ] +[[package]] 
+name = "writeable" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60e49e42bdb1d5dc76f4cd78102f8f0714d32edfa3efb82286eb0f0b1fc0da0f" + [[package]] name = "wyz" version = "0.5.1" @@ -2823,12 +3000,6 @@ dependencies = [ "tap", ] -[[package]] -name = "xi-unicode" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a67300977d3dc3f8034dae89778f502b6ba20b269527b3223ba59c0cf393bb8a" - [[package]] name = "xmlparser" version = "0.13.5" @@ -2866,6 +3037,75 @@ dependencies = [ "linked-hash-map", ] +[[package]] +name = "yoke" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1848075a23a28f9773498ee9a0f2cf58fcbad4f8c0ccf84a210ab33c6ae495de" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af46c169923ed7516eef0aa32b56d2651b229f57458ebe46b49ddd6efef5b7a2" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", + "synstructure", +] + +[[package]] +name = "zerofrom" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df54d76c3251de27615dfcce21e636c172dafb2549cd7fd93e21c66f6ca6bea2" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4eae7c1f7d4b8eafce526bc0771449ddc2f250881ae31c50d22c032b5a1c499" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", + "synstructure", +] + +[[package]] +name = "zerovec" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "198f54134cd865f437820aa3b43d0ad518af4e68ee161b444cdd15d8e567c8ea" +dependencies = [ + "serde", + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" 
+version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "486558732d5dde10d0f8cb2936507c1bb21bc539d924c949baf5f36a58e51bac" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", + "synstructure", +] + [[package]] name = "zopfli" version = "0.7.2" diff --git a/assets/data/cj_linebreak_data.postcard b/assets/data/cj_linebreak_data.postcard new file mode 100644 index 000000000..910dd1676 Binary files /dev/null and b/assets/data/cj_linebreak_data.postcard differ diff --git a/assets/data/icudata.postcard b/assets/data/icudata.postcard new file mode 100644 index 000000000..e910bf8e4 Binary files /dev/null and b/assets/data/icudata.postcard differ diff --git a/assets/fonts/NotoSansThai-Regular.ttf b/assets/fonts/NotoSansThai-Regular.ttf new file mode 100644 index 000000000..ced12d665 Binary files /dev/null and b/assets/fonts/NotoSansThai-Regular.ttf differ diff --git a/library/Cargo.toml b/library/Cargo.toml index b6e06c8e3..85bf4a2c6 100644 --- a/library/Cargo.toml +++ b/library/Cargo.toml @@ -25,6 +25,11 @@ csv = "1" ecow = "0.1" hayagriva = "0.3" hypher = "0.1" +icu_properties = { version = "1.2.0", features = ["serde"] } +icu_provider = { version = "1.2.0", features = ["sync"] } +icu_provider_adapters = "1.2.0" +icu_provider_blob = "1.2.0" +icu_segmenter = { version = "1.2.1", features = ["serde"] } kurbo = "0.9" lipsum = "0.9" log = "0.4" @@ -44,4 +49,3 @@ unicode-bidi = "0.3.13" unicode-math-class = "0.1" unicode-script = "0.5" unicode-segmentation = "1" -xi-unicode = "0.3" diff --git a/library/src/layout/par.rs b/library/src/layout/par.rs index e056b1c91..79b368138 100644 --- a/library/src/layout/par.rs +++ b/library/src/layout/par.rs @@ -1,7 +1,12 @@ +use icu_properties::{maps::CodePointMapData, LineBreak}; +use icu_provider::AsDeserializingBufferProvider; +use icu_provider_adapters::fork::ForkByKeyProvider; +use icu_provider_blob::BlobDataProvider; +use icu_segmenter::{LineBreakIteratorUtf8, LineSegmenter}; +use 
once_cell::sync::Lazy; use typst::eval::Tracer; use unicode_bidi::{BidiInfo, Level as BidiLevel}; use unicode_script::{Script, UnicodeScript}; -use xi_unicode::LineBreakIterator; use super::{BoxElem, HElem, Sizing, Spacing}; use crate::layout::AlignElem; @@ -998,15 +1003,65 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec = Lazy::new(|| { + let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap(); + LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap() +}); + +/// The line break segmenter for Chinese/Japanese text. +static CJ_SEGMENTER: Lazy = Lazy::new(|| { + let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap(); + let cj_blob = BlobDataProvider::try_new_from_static_blob(CJ_LINEBREAK_DATA).unwrap(); + let cj_provider = ForkByKeyProvider::new(cj_blob, provider); + LineSegmenter::try_new_lstm_with_buffer_provider(&cj_provider).unwrap() +}); + +/// The Unicode line break properties for each code point. +static LINEBREAK_DATA: Lazy> = Lazy::new(|| { + let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap(); + let deser_provider = provider.as_deserializing(); + icu_properties::maps::load_line_break(&deser_provider).unwrap() +}); + /// Determine all possible points in the text where lines can broken. /// /// Returns for each breakpoint the text index, whether the break is mandatory /// (after `\n`) and whether a hyphen is required (when breaking inside of a /// word). 
fn breakpoints<'a>(p: &'a Preparation<'a>) -> Breakpoints<'a> { + let mut linebreaks = if matches!(p.lang, Some(Lang::CHINESE | Lang::JAPANESE)) { + CJ_SEGMENTER.segment_str(p.bidi.text) + } else { + SEGMENTER.segment_str(p.bidi.text) + }; + // The iterator always yields a breakpoint at index 0, we want to ignore it + linebreaks.next(); Breakpoints { p, - linebreaks: LineBreakIterator::new(p.bidi.text), + linebreaks, syllables: None, offset: 0, suffix: 0, @@ -1020,7 +1075,7 @@ struct Breakpoints<'a> { /// The paragraph's items. p: &'a Preparation<'a>, /// The inner iterator over the unicode line break opportunities. - linebreaks: LineBreakIterator<'a>, + linebreaks: LineBreakIteratorUtf8<'a, 'a>, /// Iterator over syllables of the current word. syllables: Option>, /// The current text offset. @@ -1054,8 +1109,20 @@ impl Iterator for Breakpoints<'_> { return Some((self.offset, self.mandatory && !hyphen, hyphen)); } + let lb = LINEBREAK_DATA.as_borrowed(); + // Get the next "word". - (self.end, self.mandatory) = self.linebreaks.next()?; + self.end = self.linebreaks.next()?; + self.mandatory = + self.p.bidi.text[..self.end].chars().next_back().map_or(false, |c| { + matches!( + lb.get(c), + LineBreak::MandatoryBreak + | LineBreak::CarriageReturn + | LineBreak::LineFeed + | LineBreak::NextLine + ) || self.end == self.p.bidi.text.len() + }); // Hyphenate the next word. 
if self.p.hyphenate != Some(false) { diff --git a/src/doc.rs b/src/doc.rs index 7c9c7fc80..3fdcf80f9 100644 --- a/src/doc.rs +++ b/src/doc.rs @@ -524,6 +524,7 @@ impl Lang { pub const FRENCH: Self = Self(*b"fr ", 2); pub const GERMAN: Self = Self(*b"de ", 2); pub const ITALIAN: Self = Self(*b"it ", 2); + pub const JAPANESE: Self = Self(*b"ja ", 2); pub const NYNORSK: Self = Self(*b"nn ", 2); pub const POLISH: Self = Self(*b"pl ", 2); pub const PORTUGUESE: Self = Self(*b"pt ", 2); diff --git a/tests/ref/text/linebreak.png b/tests/ref/text/linebreak.png index b137e292d..1e94762bc 100644 Binary files a/tests/ref/text/linebreak.png and b/tests/ref/text/linebreak.png differ diff --git a/tests/typ/layout/par-justify-cjk.typ b/tests/typ/layout/par-justify-cjk.typ index 04b328a7f..41b3e7292 100644 --- a/tests/typ/layout/par-justify-cjk.typ +++ b/tests/typ/layout/par-justify-cjk.typ @@ -43,7 +43,7 @@ #set text(font: "Noto Serif CJK SC", lang: "zh") #set par(justify: true) -孔雀最早见于《山海经》中的《海内经》:\u{200b}“有孔雀。”东汉杨孚著《异物志》记载,岭南:“孔雀,其大如大雁而足高,毛皆有斑纹彩,捕而蓄之,拍手即舞。” +孔雀最早见于《山海经》中的《海内经》:“有孔雀。”东汉杨孚著《异物志》记载,岭南:“孔雀,其大如大雁而足高,毛皆有斑纹彩,捕而蓄之,拍手即舞。” #set text(font: "Noto Serif CJK TC", lang: "zh", region: "hk") 孔雀最早见于《山海经》中的《海内经》:「有孔雀。」东汉杨孚著《异物志》记载,岭南:「孔雀,其大如大雁而足高,毛皆有斑纹彩,捕而蓄之,拍手即舞。」 diff --git a/tests/typ/text/linebreak.typ b/tests/typ/text/linebreak.typ index c926789a2..7e409a804 100644 --- a/tests/typ/text/linebreak.typ +++ b/tests/typ/text/linebreak.typ @@ -43,3 +43,9 @@ Second part // Test comments at the end of a line with pre-spacing First part // Second part + +--- +// Test linebreak for East Asian languages +ทีวีตรวจทานนอร์ทแฟรีเลคเชอร์โกลด์อัลบัมเชอร์รี่เย้วสโตร์กฤษณ์เคลมเยอบีร่าพ่อค้าบลูเบอร์รี่สหัสวรรษโฮปแคนูโยโย่จูนสตรอว์เบอร์รีซื่อบื้อเยนแบ็กโฮเป็นไงโดนัททอมสเตริโอแคนูวิทย์แดรี่โดนัทวิทย์แอปพริคอทเซอร์ไพรส์ไฮบริดกิฟท์อินเตอร์โซนเซอร์วิสเทียมทานโคโยตี้ม็อบเที่ยงคืนบุญคุณ + +