Lexer change: Allow emphasis in CJK text without spaces (#2648)

2025-07-15 08:32:53 +08:00 · 2023-11-15 22:01:15 +08:00 · 2023-11-15 22:01:15 +08:00 · f4a81091f7
commit f4a81091f7
parent 50ea3b4f16
8 changed files with 20 additions and 2 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3117,6 +3117,7 @@ dependencies = [
  "tracing",
  "unicode-ident",
  "unicode-math-class",
+ "unicode-script",
 "unicode-segmentation",
 "unscanny",
 ]
--- a/assets/fonts/NotoSerifCJKsc-Bold.otf
+++ b/assets/fonts/NotoSerifCJKsc-Bold.otf
--- a/assets/fonts/NotoSerifCJKtc-Bold.otf
+++ b/assets/fonts/NotoSerifCJKtc-Bold.otf
--- a/crates/typst-syntax/Cargo.toml
+++ b/crates/typst-syntax/Cargo.toml
@@ -23,5 +23,6 @@ serde = { workspace = true }
 tracing = { workspace = true }
 unicode-ident = { workspace = true }
 unicode-math-class = { workspace = true }
+unicode-script = { workspace = true }
 unicode-segmentation = { workspace = true }
 unscanny = { workspace = true }
--- a/crates/typst-syntax/src/lexer.rs
+++ b/crates/typst-syntax/src/lexer.rs
@@ -1,5 +1,6 @@
 use ecow::{eco_format, EcoString};
 use unicode_ident::{is_xid_continue, is_xid_start};
+use unicode_script::{Script, UnicodeScript};
 use unicode_segmentation::UnicodeSegmentation;
 use unscanny::Scanner;
@@ -343,10 +344,18 @@ impl Lexer<'_> {
     }
     fn in_word(&self) -> bool {
-        let alphanum = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric());
+        let wordy = |c: Option<char>| {
+            c.map_or(false, |c| {
+                c.is_alphanumeric()
+                    && !matches!(
+                        c.script(),
+                        Script::Han | Script::Hiragana | Script::Katakana
+                    )
+            })
+        };
         let prev = self.s.scout(-2);
         let next = self.s.peek();
-        alphanum(prev) && alphanum(next)
+        wordy(prev) && wordy(next)
     }
     fn space_or_end(&self) -> bool {
--- a/tests/ref/text/emphasis.png
+++ b/tests/ref/text/emphasis.png
--- a/tests/ref/text/lang-with-region.png
+++ b/tests/ref/text/lang-with-region.png
--- a/tests/typ/text/emphasis.typ
+++ b/tests/typ/text/emphasis.typ
@@ -7,6 +7,13 @@ _Emphasized and *strong* words!_
 // Inside of a word it's a normal underscore or star.
 hello_world Nutzer*innen
+// CJK characters will not need spaces.
+中文一般使用*粗体*或者_楷体_来表示强调。
+日本語では、*太字*や_斜体_を使って強調します。
+中文中混有*Strong*和_Empasis_。
 // Can contain paragraph in nested content block.
 _Still #[