Lexer change: Allow emphasis in CJK text without spaces (#2648)

This commit is contained in:
Peng Guanwen 2023-11-15 22:01:15 +08:00 committed by GitHub
parent 50ea3b4f16
commit f4a81091f7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 20 additions and 2 deletions

1
Cargo.lock generated
View File

@ -3117,6 +3117,7 @@ dependencies = [
"tracing", "tracing",
"unicode-ident", "unicode-ident",
"unicode-math-class", "unicode-math-class",
"unicode-script",
"unicode-segmentation", "unicode-segmentation",
"unscanny", "unscanny",
] ]

Binary file not shown.

Binary file not shown.

View File

@ -23,5 +23,6 @@ serde = { workspace = true }
tracing = { workspace = true } tracing = { workspace = true }
unicode-ident = { workspace = true } unicode-ident = { workspace = true }
unicode-math-class = { workspace = true } unicode-math-class = { workspace = true }
unicode-script = { workspace = true }
unicode-segmentation = { workspace = true } unicode-segmentation = { workspace = true }
unscanny = { workspace = true } unscanny = { workspace = true }

View File

@ -1,5 +1,6 @@
use ecow::{eco_format, EcoString}; use ecow::{eco_format, EcoString};
use unicode_ident::{is_xid_continue, is_xid_start}; use unicode_ident::{is_xid_continue, is_xid_start};
use unicode_script::{Script, UnicodeScript};
use unicode_segmentation::UnicodeSegmentation; use unicode_segmentation::UnicodeSegmentation;
use unscanny::Scanner; use unscanny::Scanner;
@ -343,10 +344,18 @@ impl Lexer<'_> {
} }
fn in_word(&self) -> bool { fn in_word(&self) -> bool {
let alphanum = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric()); let wordy = |c: Option<char>| {
c.map_or(false, |c| {
c.is_alphanumeric()
&& !matches!(
c.script(),
Script::Han | Script::Hiragana | Script::Katakana
)
})
};
let prev = self.s.scout(-2); let prev = self.s.scout(-2);
let next = self.s.peek(); let next = self.s.peek();
alphanum(prev) && alphanum(next) wordy(prev) && wordy(next)
} }
fn space_or_end(&self) -> bool { fn space_or_end(&self) -> bool {

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.4 KiB

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 KiB

After

Width:  |  Height:  |  Size: 1.4 KiB

View File

@ -7,6 +7,13 @@ _Emphasized and *strong* words!_
// Inside of a word it's a normal underscore or star. // Inside of a word it's a normal underscore or star.
hello_world Nutzer*innen hello_world Nutzer*innen
// CJK text does not need spaces around the emphasis markers.
中文一般使用*粗体*或者_楷体_来表示强调。
日本語では、*太字*_斜体_を使って強調します。
中文中混有*Strong*_Emphasis_
// Can contain paragraph in nested content block. // Can contain paragraph in nested content block.
_Still #[ _Still #[