From 73253d465192454f0dfe3fe9eef46d495b343aef Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Tue, 17 Dec 2024 22:07:45 +0800 Subject: [PATCH] Support for defining which charset should be covered by a font (#5305) Co-authored-by: Laurenz --- Cargo.lock | 1 + Cargo.toml | 1 + crates/typst-layout/src/image.rs | 2 +- crates/typst-layout/src/inline/shaping.rs | 42 ++++-- crates/typst-layout/src/math/mod.rs | 2 +- crates/typst-library/Cargo.toml | 1 + crates/typst-library/src/text/mod.rs | 159 ++++++++++++++++++---- crates/typst-library/src/text/shift.rs | 6 +- crates/typst-macros/src/lib.rs | 2 +- tests/ref/text-font-covers-chinese.png | Bin 0 -> 2619 bytes tests/ref/text-font-covers-numbers.png | Bin 0 -> 500 bytes tests/suite/text/font.typ | 37 +++++ 12 files changed, 211 insertions(+), 42 deletions(-) create mode 100644 tests/ref/text-font-covers-chinese.png create mode 100644 tests/ref/text-font-covers-numbers.png diff --git a/Cargo.lock b/Cargo.lock index 94ce026e9..e6c1cf0f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2931,6 +2931,7 @@ dependencies = [ "qcms", "rayon", "regex", + "regex-syntax", "roxmltree", "rust_decimal", "rustybuzz", diff --git a/Cargo.toml b/Cargo.toml index b20d54e87..f4afefa43 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -94,6 +94,7 @@ qcms = "0.3.0" quote = "1" rayon = "1.7.0" regex = "1" +regex-syntax = "0.8" resvg = { version = "0.43", default-features = false, features = ["raster-images"] } roxmltree = "0.20" rust_decimal = { version = "1.36.0", default-features = false, features = ["maths"] } diff --git a/crates/typst-layout/src/image.rs b/crates/typst-layout/src/image.rs index 628fe10d6..f44d68873 100644 --- a/crates/typst-layout/src/image.rs +++ b/crates/typst-layout/src/image.rs @@ -54,7 +54,7 @@ pub fn layout_image( format, elem.alt(styles), engine.world, - &families(styles).collect::>(), + &families(styles).map(|f| f.as_str()).collect::>(), elem.flatten_text(styles), ) .at(span)?; diff --git 
a/crates/typst-layout/src/inline/shaping.rs b/crates/typst-layout/src/inline/shaping.rs index c2b892d82..d6b7632b6 100644 --- a/crates/typst-layout/src/inline/shaping.rs +++ b/crates/typst-layout/src/inline/shaping.rs @@ -11,8 +11,8 @@ use typst_library::engine::Engine; use typst_library::foundations::{Smart, StyleChain}; use typst_library::layout::{Abs, Dir, Em, Frame, FrameItem, Point, Size}; use typst_library::text::{ - families, features, is_default_ignorable, variant, Font, FontVariant, Glyph, Lang, - Region, TextEdgeBounds, TextElem, TextItem, + families, features, is_default_ignorable, variant, Font, FontFamily, FontVariant, + Glyph, Lang, Region, TextEdgeBounds, TextElem, TextItem, }; use typst_library::World; use typst_utils::SliceExt; @@ -351,7 +351,7 @@ impl<'a> ShapedText<'a> { for family in families(self.styles) { if let Some(font) = world .book() - .select(family, self.variant) + .select(family.as_str(), self.variant) .and_then(|id| world.font(id)) { expand(&font, TextEdgeBounds::Zero); @@ -463,7 +463,8 @@ impl<'a> ShapedText<'a> { None }; let mut chain = families(self.styles) - .map(|family| book.select(family, self.variant)) + .filter(|family| family.covers().map_or(true, |c| c.is_match("-"))) + .map(|family| book.select(family.as_str(), self.variant)) .chain(fallback_func.iter().map(|f| f())) .flatten(); @@ -719,7 +720,7 @@ fn shape_segment<'a>( ctx: &mut ShapingContext, base: usize, text: &str, - mut families: impl Iterator + Clone, + mut families: impl Iterator + Clone, ) { // Don't try shaping newlines, tabs, or default ignorables. if text @@ -732,11 +733,18 @@ fn shape_segment<'a>( // Find the next available family. 
let world = ctx.engine.world; let book = world.book(); - let mut selection = families.find_map(|family| { - book.select(family, ctx.variant) + let mut selection = None; + let mut covers = None; + for family in families.by_ref() { + selection = book + .select(family.as_str(), ctx.variant) .and_then(|id| world.font(id)) - .filter(|font| !ctx.used.contains(font)) - }); + .filter(|font| !ctx.used.contains(font)); + if selection.is_some() { + covers = family.covers(); + break; + } + } // Do font fallback if the families are exhausted and fallback is enabled. if selection.is_none() && ctx.fallback { @@ -795,6 +803,16 @@ fn shape_segment<'a>( let pos = buffer.glyph_positions(); let ltr = ctx.dir.is_positive(); + // Whether the character at the given offset is covered by the coverage. + let is_covered = |offset| { + let end = text[offset..] + .char_indices() + .nth(1) + .map(|(i, _)| offset + i) + .unwrap_or(text.len()); + covers.map_or(true, |cov| cov.is_match(&text[offset..end])) + }; + // Collect the shaped glyphs, doing fallback and shaping parts again with // the next font if necessary. let mut i = 0; @@ -803,7 +821,7 @@ fn shape_segment<'a>( let cluster = info.cluster as usize; // Add the glyph to the shaped output. - if info.glyph_id != 0 { + if info.glyph_id != 0 && is_covered(cluster) { // Determine the text range of the glyph. let start = base + cluster; let end = base @@ -836,7 +854,9 @@ fn shape_segment<'a>( } else { // First, search for the end of the tofu sequence. 
let k = i; - while infos.get(i + 1).is_some_and(|info| info.glyph_id == 0) { + while infos.get(i + 1).is_some_and(|info| { + info.glyph_id == 0 || !is_covered(info.cluster as usize) + }) { i += 1; } diff --git a/crates/typst-layout/src/math/mod.rs b/crates/typst-layout/src/math/mod.rs index 32059cef9..e642f6338 100644 --- a/crates/typst-layout/src/math/mod.rs +++ b/crates/typst-layout/src/math/mod.rs @@ -237,7 +237,7 @@ fn find_math_font( let variant = variant(styles); let world = engine.world; let Some(font) = families(styles).find_map(|family| { - let id = world.book().select(family, variant)?; + let id = world.book().select(family.as_str(), variant)?; let font = world.font(id)?; let _ = font.ttf().tables().math?.constants?; Some(font) diff --git a/crates/typst-library/Cargo.toml b/crates/typst-library/Cargo.toml index d854e4d53..cc5e26712 100644 --- a/crates/typst-library/Cargo.toml +++ b/crates/typst-library/Cargo.toml @@ -44,6 +44,7 @@ png = { workspace = true } qcms = { workspace = true } rayon = { workspace = true } regex = { workspace = true } +regex-syntax = { workspace = true } roxmltree = { workspace = true } rust_decimal = { workspace = true } rustybuzz = { workspace = true } diff --git a/crates/typst-library/src/text/mod.rs b/crates/typst-library/src/text/mod.rs index 91927b572..ee81e3f2a 100644 --- a/crates/typst-library/src/text/mod.rs +++ b/crates/typst-library/src/text/mod.rs @@ -29,6 +29,7 @@ pub use self::smartquote::*; pub use self::space::*; use std::fmt::{self, Debug, Formatter}; +use std::hash::Hash; use std::sync::LazyLock; use ecow::{eco_format, EcoString}; @@ -39,13 +40,14 @@ use rustybuzz::Feature; use smallvec::SmallVec; use ttf_parser::Tag; use typst_syntax::Spanned; +use typst_utils::singleton; use crate::diag::{bail, warning, HintedStrResult, SourceResult}; use crate::engine::Engine; use crate::foundations::{ cast, category, dict, elem, Args, Array, Cast, Category, Construct, Content, Dict, - Fold, IntoValue, NativeElement, Never, 
NoneValue, Packed, PlainText, Repr, Resolve, - Scope, Set, Smart, StyleChain, + Fold, IntoValue, NativeElement, Never, NoneValue, Packed, PlainText, Regex, Repr, + Resolve, Scope, Set, Smart, StyleChain, }; use crate::layout::{Abs, Axis, Dir, Em, Length, Ratio, Rel}; use crate::model::ParElem; @@ -94,7 +96,21 @@ pub(super) fn define(global: &mut Scope) { /// ``` #[elem(Debug, Construct, PlainText, Repr)] pub struct TextElem { - /// A font family name or priority list of font family names. + /// A font family descriptor or priority list of font family descriptors. + /// + /// A font family descriptor can be a plain string representing the family + /// name or a dictionary with the following keys: + /// + /// - `name` (required): The font family name. + /// - `covers` (optional): Defines the Unicode codepoints for which the + /// family shall be used. This can be: + /// - A predefined coverage set: + /// - `{"latin-in-cjk"}` covers all codepoints except for those which + /// exist in Latin fonts, but should preferably be taken from CJK + /// fonts. + /// - A [regular expression]($regex) that defines exactly which codepoints + /// shall be covered. Accepts only the subset of regular expressions + /// which consist of exactly one dot, letter, or character class. /// /// When processing text, Typst tries all specified font families in order /// until it finds a font that has the necessary glyphs. In the example @@ -129,6 +145,21 @@ pub struct TextElem { /// /// This is Latin. \ /// هذا عربي. + /// + /// // Change font only for numbers. + /// #set text(font: ( + /// (name: "PT Sans", covers: regex("[0-9]")), + /// "Libertinus Serif" + /// )) + /// + /// The number 123. + /// + /// // Mix Latin and CJK fonts. 
+ /// #set text(font: ( + /// (name: "Inria Serif", covers: "latin-in-cjk"), + /// "Noto Serif CJK SC" + /// )) + /// 分别设置“中文”和English字体 /// ``` #[parse({ let font_list: Option> = args.named("font")?; @@ -766,35 +797,107 @@ impl PlainText for Packed { } /// A lowercased font family like "arial". -#[derive(Clone, Eq, PartialEq, Hash)] -pub struct FontFamily(EcoString); +#[derive(Debug, Clone, PartialEq, Hash)] +pub struct FontFamily { + // The name of the font family + name: EcoString, + // A regex that defines the Unicode codepoints supported by the font. + covers: Option, +} impl FontFamily { /// Create a named font family variant. pub fn new(string: &str) -> Self { - Self(string.to_lowercase().into()) + Self::with_coverage(string, None) + } + + /// Create a font family by name and optional Unicode coverage. + pub fn with_coverage(string: &str, covers: Option) -> Self { + Self { name: string.to_lowercase().into(), covers } } /// The lowercased family name. pub fn as_str(&self) -> &str { - &self.0 + &self.name } -} -impl Debug for FontFamily { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - self.0.fmt(f) + /// The user-set coverage of the font family. + pub fn covers(&self) -> Option<&Regex> { + self.covers.as_ref().map(|covers| covers.as_regex()) } } cast! { FontFamily, - self => self.0.into_value(), + self => self.name.into_value(), string: EcoString => Self::new(&string), + mut v: Dict => { + let ret = Self::with_coverage( + &v.take("name")?.cast::()?, + v.take("covers").ok().map(|v| v.cast()).transpose()? + ); + v.finish(&["name", "covers"])?; + ret + }, +} + +/// Defines which codepoints a font family will be used for. +#[derive(Debug, Clone, PartialEq, Hash)] +pub enum Covers { + /// Covers all codepoints except those used both in Latin and CJK fonts. + LatinInCjk, + /// Covers the set of codepoints for which the regex matches. + Regex(Regex), +} + +impl Covers { + /// Retrieve the regex for the coverage. 
+ pub fn as_regex(&self) -> &Regex { + match self { + Self::LatinInCjk => singleton!( + Regex, + Regex::new( + "[^\u{00B7}\u{2013}\u{2014}\u{2018}\u{2019}\ + \u{201C}\u{201D}\u{2025}-\u{2027}\u{2E3A}]" + ) + .unwrap() + ), + Self::Regex(regex) => regex, + } + } +} + +cast! { + Covers, + self => match self { + Self::LatinInCjk => "latin-in-cjk".into_value(), + Self::Regex(regex) => regex.into_value(), + }, + + /// Covers all codepoints except those used both in Latin and CJK fonts. + "latin-in-cjk" => Covers::LatinInCjk, + + regex: Regex => { + let ast = regex_syntax::ast::parse::Parser::new().parse(regex.as_str()); + match ast { + Ok( + regex_syntax::ast::Ast::ClassBracketed(..) + | regex_syntax::ast::Ast::ClassUnicode(..) + | regex_syntax::ast::Ast::ClassPerl(..) + | regex_syntax::ast::Ast::Dot(..) + | regex_syntax::ast::Ast::Literal(..), + ) => {} + _ => bail!( + "coverage regex may only use dot, letters, and character classes"; + hint: "the regex is applied to each letter individually" + ), + } + Covers::Regex(regex) + }, } /// Font family fallback list. -#[derive(Debug, Default, Clone, Eq, PartialEq, Hash)] +#[derive(Debug, Default, Clone, PartialEq, Hash)] pub struct FontList(pub Vec); impl<'a> IntoIterator for &'a FontList { @@ -809,7 +912,7 @@ impl<'a> IntoIterator for &'a FontList { cast! { FontList, self => if self.0.len() == 1 { - self.0.into_iter().next().unwrap().0.into_value() + self.0.into_iter().next().unwrap().name.into_value() } else { self.0.into_value() }, @@ -818,20 +921,22 @@ cast! { } /// Resolve a prioritized iterator over the font families. 
-pub fn families(styles: StyleChain) -> impl Iterator + Clone { - const FALLBACKS: &[&str] = &[ - "libertinus serif", - "twitter color emoji", - "noto color emoji", - "apple color emoji", - "segoe ui emoji", - ]; - - let tail = if TextElem::fallback_in(styles) { FALLBACKS } else { &[] }; - TextElem::font_in(styles) +pub fn families(styles: StyleChain) -> impl Iterator + Clone { + let fallbacks = singleton!(Vec, { + [ + "libertinus serif", + "twitter color emoji", + "noto color emoji", + "apple color emoji", + "segoe ui emoji", + ] .into_iter() - .map(|family| family.as_str()) - .chain(tail.iter().copied()) + .map(FontFamily::new) + .collect() + }); + + let tail = if TextElem::fallback_in(styles) { fallbacks.as_slice() } else { &[] }; + TextElem::font_in(styles).into_iter().chain(tail.iter()) } /// Resolve the font variant. diff --git a/crates/typst-library/src/text/shift.rs b/crates/typst-library/src/text/shift.rs index 003ecf47c..9723bbf0c 100644 --- a/crates/typst-library/src/text/shift.rs +++ b/crates/typst-library/src/text/shift.rs @@ -157,7 +157,11 @@ fn is_shapable(engine: &Engine, text: &str, styles: StyleChain) -> bool { .select(family.as_str(), variant(styles)) .and_then(|id| world.font(id)) { - return text.chars().all(|c| font.ttf().glyph_index(c).is_some()); + let covers = family.covers(); + return text.chars().all(|c| { + covers.map_or(true, |cov| cov.is_match(c.encode_utf8(&mut [0; 4]))) + && font.ttf().glyph_index(c).is_some() + }); } } diff --git a/crates/typst-macros/src/lib.rs b/crates/typst-macros/src/lib.rs index e1c3c13ab..578389c7f 100644 --- a/crates/typst-macros/src/lib.rs +++ b/crates/typst-macros/src/lib.rs @@ -280,7 +280,7 @@ pub fn category(stream: BoundaryStream, item: BoundaryStream) -> BoundaryStream /// - `Reflect` makes Typst's runtime aware of the type's characteristics. /// It's important for autocompletion, error messages, etc. /// - `FromValue` defines how to cast from a value into this type. 
-/// - `IntoValue` defines how to cast fromthis type into a value. +/// - `IntoValue` defines how to cast from this type into a value. /// /// ```ignore /// /// An integer between 0 and 13. diff --git a/tests/ref/text-font-covers-chinese.png b/tests/ref/text-font-covers-chinese.png new file mode 100644 index 0000000000000000000000000000000000000000..5c9b4b1e177a50a8305e2345c124df45ab639d59 GIT binary patch literal 2619 zcmV-B3dHq^P)?(}o9v5jlkKkEZ1%-wQ@7n}TerKe()41iwzXQVEoi+I@mep7R&>OAXT6|x zs#Oc(9c2rMfC$wG4`>zXtfQh(fuaZkg+UMogqdL&hBI@%JCid(Xs|*Y)0och6$v4q z{CRPnd>`Xy+G9T02m9cM11n&qr*pJ>4<1>?E`S|EtgWoGZK&Wt9jVFUQyJnDV`7u` zrq6=wTnETqiD$Z80LORc8#s0I=eG44KC=;`48TnIq$R|tsmHvOHDyc(wBPPeI2&FY zM6F-eR1A=-g(4>apTniPs^`-unKKVYaekLrb*wjV>l_&@v&}F=mV;YDOq%)|ae|pv z+^{&8yE-N}cY-Z(`l@XNNK(WMNg8qOw%DC4#EHT2=aZMNiG_<%IVlP3%kBj%Z~vjm zHSNGGRtG1Bx4U7aqbJTmXz=M2fi93Nn4f!{vu6?*XsaavdNUa&SVNQ0SBy1D#5e< zLEnj9!a%`#Ve6$GNAzokX(Bcbsy?<(ere=4=jLXacl&hWiOA~bhP*I z0wa~50*wm^wuV~VaJna(5Pkr8r$9QGHGFNU^%6Z8tq6kT+kar)*&aqEK^#2 zI?*cccx8XFSmK5?%k&*^B!KB$S`*l!25tv{iw7t!qs-Jpxa*vt-2M4pb9VcCr|*$? 
z^kF0F3|8y~4++-LRt7+M6za@SvKnLs0 zkef|*Z^7Y%K7J;f;*5tj}IIe zO!y9xr}Fd_)Ky4`S;IPu*TDz?chnaTlLnRzWhndFj zO~@9w0{}TwE&*^O0gSu7{sCD&@VUjlZ?zBh!9KW`utLw~bRyH;?r^8~@Wd*j0HTH1 zT5VOfR}yO`^Oy*i;n>lH#A9EjFY>sD?1x$bVvpciS_6RniIS)C06@;1vcG5Wh2;=w z^n|%NE;5RvZwN6oFQd6)e8o2no<6>KSdw;z0RbXGh*#_A9%d)X zTV^`%>aTa(dvY;wJq*WGxKr$399-yu$*?o!R+m2R*xN#A!i0DWpfF-{3SG7#sNnM+ z!WskBO90b67PKrfyWym)*0p=0U5vsHUK+d1we6dkF^5O zf;t#j9*4fC`9UBi5< z5aO^9b7N?G|7OYgStWa~d7+)2HLn+-|u^wtbSvk*{^15;F$NXWj{6c*#iJr>g!8`&jJ&*-1c7_?bR?p zJd=L9jz0cf9j=QpJd3*GC2-r3pP^;z<K8;<<9M4nCCBmNE0yCWSY~ss9IBxVK(cE7Nd^EE#k|0kBhEz-k zyP7af0JN-x8xUcRu2I80Y4i^q5n`$Fey}pf3^hxXPGMe=d4U-wrUUS})xV^Xedl$=nZ)ssPwAyo{M)@xg6wxIHD|e~Ivs zJIue9J~$@DMU{8o$7R>y@U|{tF7=BK>q(dwR*Pd}gcx!zsl6r8lHcO#-<&=5GRAjM;UNF8gdk&ZZv1 zD@v$c47AK*q1gjpFl|oUN%wHEdhu-l7L20Hst@%z%ul4jsr|rLMFdfg3*P(gEmr>P zySrh|J;Izl2P4K5&}qdz4Rb^RHIYMV;rkTBhB?S7|Fm)b_PIk94fBS?6^qGkc;YAe z?qK`bBksQo9F)P`x#9gF?gzKG0aiQE3`Rg{0vgm*Z{;4l;{-6G&gh`i>V1#IqY2wl zX9LCZkl;OF1K@TV>Kt%o7XSq1P&>~n@NIwBOkKnL)Vp27JW2?$TL?`o^5~|f>kFDv zs=Uxn&y|&xIiVE*8*Z3(Xt%lGHUKK$HC9wql$`w1K}J}Pe&|bQ^agL$V$@$>d!dLc z17CeQjy~uo5i)GZwd_8xJ(C1D?KMS3xs#KD$yypmg~WO%%m)ox;Lg;8b9pJFqmQ37 z;kua8{-`TmCSxK$RZD~}bW^i8;1?A|4fFSu$DJMU?*6N7KMwQt5{25z?X{uRW6jP6HWL5<002ovPDHLkV1g?9|6~9F literal 0 HcmV?d00001 diff --git a/tests/ref/text-font-covers-numbers.png b/tests/ref/text-font-covers-numbers.png new file mode 100644 index 0000000000000000000000000000000000000000..9ed95c2f964b076c5018c3f77111ccde82845ac1 GIT binary patch literal 500 zcmVZG7SHXS(APVoKNYX~k_I$^E$TobB3kq8|Jg1M8vn?# zxajAk>~96vbum3-G-U9{TmLKb% z{NHl_?bc5Zjy~D+`{?=qdn)(7`v3pQjxRrUzt{=3(h96~#;pJUw-x@U#A35W<%|9& zy-hsz|NmRvo}C{5o2LGCyLw)ATS@XM>koT;uG&AlQ})*CyIHkF&|Hoea3$(cPf667G#g&I!rvl}Q4evRm@4A*D+Z99BtH~fEK z+`H_#S=t|