diff --git a/Cargo.lock b/Cargo.lock index 94ce026e9..e6c1cf0f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2931,6 +2931,7 @@ dependencies = [ "qcms", "rayon", "regex", + "regex-syntax", "roxmltree", "rust_decimal", "rustybuzz", diff --git a/Cargo.toml b/Cargo.toml index b20d54e87..f4afefa43 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -94,6 +94,7 @@ qcms = "0.3.0" quote = "1" rayon = "1.7.0" regex = "1" +regex-syntax = "0.8" resvg = { version = "0.43", default-features = false, features = ["raster-images"] } roxmltree = "0.20" rust_decimal = { version = "1.36.0", default-features = false, features = ["maths"] } diff --git a/crates/typst-layout/src/image.rs b/crates/typst-layout/src/image.rs index 628fe10d6..f44d68873 100644 --- a/crates/typst-layout/src/image.rs +++ b/crates/typst-layout/src/image.rs @@ -54,7 +54,7 @@ pub fn layout_image( format, elem.alt(styles), engine.world, - &families(styles).collect::>(), + &families(styles).map(|f| f.as_str()).collect::>(), elem.flatten_text(styles), ) .at(span)?; diff --git a/crates/typst-layout/src/inline/shaping.rs b/crates/typst-layout/src/inline/shaping.rs index c2b892d82..d6b7632b6 100644 --- a/crates/typst-layout/src/inline/shaping.rs +++ b/crates/typst-layout/src/inline/shaping.rs @@ -11,8 +11,8 @@ use typst_library::engine::Engine; use typst_library::foundations::{Smart, StyleChain}; use typst_library::layout::{Abs, Dir, Em, Frame, FrameItem, Point, Size}; use typst_library::text::{ - families, features, is_default_ignorable, variant, Font, FontVariant, Glyph, Lang, - Region, TextEdgeBounds, TextElem, TextItem, + families, features, is_default_ignorable, variant, Font, FontFamily, FontVariant, + Glyph, Lang, Region, TextEdgeBounds, TextElem, TextItem, }; use typst_library::World; use typst_utils::SliceExt; @@ -351,7 +351,7 @@ impl<'a> ShapedText<'a> { for family in families(self.styles) { if let Some(font) = world .book() - .select(family, self.variant) + .select(family.as_str(), self.variant) .and_then(|id| 
world.font(id)) { expand(&font, TextEdgeBounds::Zero); @@ -463,7 +463,8 @@ impl<'a> ShapedText<'a> { None }; let mut chain = families(self.styles) - .map(|family| book.select(family, self.variant)) + .filter(|family| family.covers().map_or(true, |c| c.is_match("-"))) + .map(|family| book.select(family.as_str(), self.variant)) .chain(fallback_func.iter().map(|f| f())) .flatten(); @@ -719,7 +720,7 @@ fn shape_segment<'a>( ctx: &mut ShapingContext, base: usize, text: &str, - mut families: impl Iterator + Clone, + mut families: impl Iterator + Clone, ) { // Don't try shaping newlines, tabs, or default ignorables. if text @@ -732,11 +733,18 @@ fn shape_segment<'a>( // Find the next available family. let world = ctx.engine.world; let book = world.book(); - let mut selection = families.find_map(|family| { - book.select(family, ctx.variant) + let mut selection = None; + let mut covers = None; + for family in families.by_ref() { + selection = book + .select(family.as_str(), ctx.variant) .and_then(|id| world.font(id)) - .filter(|font| !ctx.used.contains(font)) - }); + .filter(|font| !ctx.used.contains(font)); + if selection.is_some() { + covers = family.covers(); + break; + } + } // Do font fallback if the families are exhausted and fallback is enabled. if selection.is_none() && ctx.fallback { @@ -795,6 +803,16 @@ fn shape_segment<'a>( let pos = buffer.glyph_positions(); let ltr = ctx.dir.is_positive(); + // Whether the character at the given offset is covered by the coverage. + let is_covered = |offset| { + let end = text[offset..] + .char_indices() + .nth(1) + .map(|(i, _)| offset + i) + .unwrap_or(text.len()); + covers.map_or(true, |cov| cov.is_match(&text[offset..end])) + }; + // Collect the shaped glyphs, doing fallback and shaping parts again with // the next font if necessary. let mut i = 0; @@ -803,7 +821,7 @@ fn shape_segment<'a>( let cluster = info.cluster as usize; // Add the glyph to the shaped output. 
- if info.glyph_id != 0 { + if info.glyph_id != 0 && is_covered(cluster) { // Determine the text range of the glyph. let start = base + cluster; let end = base @@ -836,7 +854,9 @@ fn shape_segment<'a>( } else { // First, search for the end of the tofu sequence. let k = i; - while infos.get(i + 1).is_some_and(|info| info.glyph_id == 0) { + while infos.get(i + 1).is_some_and(|info| { + info.glyph_id == 0 || !is_covered(info.cluster as usize) + }) { i += 1; } diff --git a/crates/typst-layout/src/math/mod.rs b/crates/typst-layout/src/math/mod.rs index 32059cef9..e642f6338 100644 --- a/crates/typst-layout/src/math/mod.rs +++ b/crates/typst-layout/src/math/mod.rs @@ -237,7 +237,7 @@ fn find_math_font( let variant = variant(styles); let world = engine.world; let Some(font) = families(styles).find_map(|family| { - let id = world.book().select(family, variant)?; + let id = world.book().select(family.as_str(), variant)?; let font = world.font(id)?; let _ = font.ttf().tables().math?.constants?; Some(font) diff --git a/crates/typst-library/Cargo.toml b/crates/typst-library/Cargo.toml index d854e4d53..cc5e26712 100644 --- a/crates/typst-library/Cargo.toml +++ b/crates/typst-library/Cargo.toml @@ -44,6 +44,7 @@ png = { workspace = true } qcms = { workspace = true } rayon = { workspace = true } regex = { workspace = true } +regex-syntax = { workspace = true } roxmltree = { workspace = true } rust_decimal = { workspace = true } rustybuzz = { workspace = true } diff --git a/crates/typst-library/src/text/mod.rs b/crates/typst-library/src/text/mod.rs index 91927b572..ee81e3f2a 100644 --- a/crates/typst-library/src/text/mod.rs +++ b/crates/typst-library/src/text/mod.rs @@ -29,6 +29,7 @@ pub use self::smartquote::*; pub use self::space::*; use std::fmt::{self, Debug, Formatter}; +use std::hash::Hash; use std::sync::LazyLock; use ecow::{eco_format, EcoString}; @@ -39,13 +40,14 @@ use rustybuzz::Feature; use smallvec::SmallVec; use ttf_parser::Tag; use typst_syntax::Spanned; +use 
typst_utils::singleton; use crate::diag::{bail, warning, HintedStrResult, SourceResult}; use crate::engine::Engine; use crate::foundations::{ cast, category, dict, elem, Args, Array, Cast, Category, Construct, Content, Dict, - Fold, IntoValue, NativeElement, Never, NoneValue, Packed, PlainText, Repr, Resolve, - Scope, Set, Smart, StyleChain, + Fold, IntoValue, NativeElement, Never, NoneValue, Packed, PlainText, Regex, Repr, + Resolve, Scope, Set, Smart, StyleChain, }; use crate::layout::{Abs, Axis, Dir, Em, Length, Ratio, Rel}; use crate::model::ParElem; @@ -94,7 +96,21 @@ pub(super) fn define(global: &mut Scope) { /// ``` #[elem(Debug, Construct, PlainText, Repr)] pub struct TextElem { - /// A font family name or priority list of font family names. + /// A font family descriptor or priority list of font family descriptors. + /// + /// A font family descriptor can be a plain string representing the family + /// name or a dictionary with the following keys: + /// + /// - `name` (required): The font family name. + /// - `covers` (optional): Defines the Unicode codepoints for which the + /// family shall be used. This can be: + /// - A predefined coverage set: + /// - `{"latin-in-cjk"}` covers all codepoints except for those which + /// exist in Latin fonts, but should preferably be taken from CJK + /// fonts. + /// - A [regular expression]($regex) that defines exactly which codepoints + /// shall be covered. Accepts only the subset of regular expressions + /// which consist of exactly one dot, letter, or character class. /// /// When processing text, Typst tries all specified font families in order /// until it finds a font that has the necessary glyphs. In the example @@ -129,6 +145,21 @@ pub struct TextElem { /// /// This is Latin. \ /// هذا عربي. + /// + /// // Change font only for numbers. + /// #set text(font: ( + /// (name: "PT Sans", covers: regex("[0-9]")), + /// "Libertinus Serif" + /// )) + /// + /// The number 123. + /// + /// // Mix Latin and CJK fonts. 
+ /// #set text(font: ( + /// (name: "Inria Serif", covers: "latin-in-cjk"), + /// "Noto Serif CJK SC" + /// )) + /// 分别设置“中文”和English字体 /// ``` #[parse({ let font_list: Option> = args.named("font")?; @@ -766,35 +797,107 @@ impl PlainText for Packed { } /// A lowercased font family like "arial". -#[derive(Clone, Eq, PartialEq, Hash)] -pub struct FontFamily(EcoString); +#[derive(Debug, Clone, PartialEq, Hash)] +pub struct FontFamily { + // The name of the font family + name: EcoString, + // A regex that defines the Unicode codepoints supported by the font. + covers: Option, +} impl FontFamily { /// Create a named font family variant. pub fn new(string: &str) -> Self { - Self(string.to_lowercase().into()) + Self::with_coverage(string, None) + } + + /// Create a font family by name and optional Unicode coverage. + pub fn with_coverage(string: &str, covers: Option) -> Self { + Self { name: string.to_lowercase().into(), covers } } /// The lowercased family name. pub fn as_str(&self) -> &str { - &self.0 + &self.name } -} -impl Debug for FontFamily { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - self.0.fmt(f) + /// The user-set coverage of the font family. + pub fn covers(&self) -> Option<&Regex> { + self.covers.as_ref().map(|covers| covers.as_regex()) } } cast! { FontFamily, - self => self.0.into_value(), + self => self.name.into_value(), string: EcoString => Self::new(&string), + mut v: Dict => { + let ret = Self::with_coverage( + &v.take("name")?.cast::()?, + v.take("covers").ok().map(|v| v.cast()).transpose()? + ); + v.finish(&["name", "covers"])?; + ret + }, +} + +/// Defines which codepoints a font family will be used for. +#[derive(Debug, Clone, PartialEq, Hash)] +pub enum Covers { + /// Covers all codepoints except those used both in Latin and CJK fonts. + LatinInCjk, + /// Covers the set of codepoints for which the regex matches. + Regex(Regex), +} + +impl Covers { + /// Retrieve the regex for the coverage. 
+ pub fn as_regex(&self) -> &Regex { + match self { + Self::LatinInCjk => singleton!( + Regex, + Regex::new( + "[^\u{00B7}\u{2013}\u{2014}\u{2018}\u{2019}\ + \u{201C}\u{201D}\u{2025}-\u{2027}\u{2E3A}]" + ) + .unwrap() + ), + Self::Regex(regex) => regex, + } + } +} + +cast! { + Covers, + self => match self { + Self::LatinInCjk => "latin-in-cjk".into_value(), + Self::Regex(regex) => regex.into_value(), + }, + + /// Covers all codepoints except those used both in Latin and CJK fonts. + "latin-in-cjk" => Covers::LatinInCjk, + + regex: Regex => { + let ast = regex_syntax::ast::parse::Parser::new().parse(regex.as_str()); + match ast { + Ok( + regex_syntax::ast::Ast::ClassBracketed(..) + | regex_syntax::ast::Ast::ClassUnicode(..) + | regex_syntax::ast::Ast::ClassPerl(..) + | regex_syntax::ast::Ast::Dot(..) + | regex_syntax::ast::Ast::Literal(..), + ) => {} + _ => bail!( + "coverage regex may only use dot, letters, and character classes"; + hint: "the regex is applied to each letter individually" + ), + } + Covers::Regex(regex) + }, } /// Font family fallback list. -#[derive(Debug, Default, Clone, Eq, PartialEq, Hash)] +#[derive(Debug, Default, Clone, PartialEq, Hash)] pub struct FontList(pub Vec); impl<'a> IntoIterator for &'a FontList { @@ -809,7 +912,7 @@ impl<'a> IntoIterator for &'a FontList { cast! { FontList, self => if self.0.len() == 1 { - self.0.into_iter().next().unwrap().0.into_value() + self.0.into_iter().next().unwrap().name.into_value() } else { self.0.into_value() }, @@ -818,20 +921,22 @@ cast! { } /// Resolve a prioritized iterator over the font families. 
-pub fn families(styles: StyleChain) -> impl Iterator + Clone { - const FALLBACKS: &[&str] = &[ - "libertinus serif", - "twitter color emoji", - "noto color emoji", - "apple color emoji", - "segoe ui emoji", - ]; - - let tail = if TextElem::fallback_in(styles) { FALLBACKS } else { &[] }; - TextElem::font_in(styles) +pub fn families(styles: StyleChain) -> impl Iterator + Clone { + let fallbacks = singleton!(Vec, { + [ + "libertinus serif", + "twitter color emoji", + "noto color emoji", + "apple color emoji", + "segoe ui emoji", + ] .into_iter() - .map(|family| family.as_str()) - .chain(tail.iter().copied()) + .map(FontFamily::new) + .collect() + }); + + let tail = if TextElem::fallback_in(styles) { fallbacks.as_slice() } else { &[] }; + TextElem::font_in(styles).into_iter().chain(tail.iter()) } /// Resolve the font variant. diff --git a/crates/typst-library/src/text/shift.rs b/crates/typst-library/src/text/shift.rs index 003ecf47c..9723bbf0c 100644 --- a/crates/typst-library/src/text/shift.rs +++ b/crates/typst-library/src/text/shift.rs @@ -157,7 +157,11 @@ fn is_shapable(engine: &Engine, text: &str, styles: StyleChain) -> bool { .select(family.as_str(), variant(styles)) .and_then(|id| world.font(id)) { - return text.chars().all(|c| font.ttf().glyph_index(c).is_some()); + let covers = family.covers(); + return text.chars().all(|c| { + covers.map_or(true, |cov| cov.is_match(c.encode_utf8(&mut [0; 4]))) + && font.ttf().glyph_index(c).is_some() + }); } } diff --git a/crates/typst-macros/src/lib.rs b/crates/typst-macros/src/lib.rs index e1c3c13ab..578389c7f 100644 --- a/crates/typst-macros/src/lib.rs +++ b/crates/typst-macros/src/lib.rs @@ -280,7 +280,7 @@ pub fn category(stream: BoundaryStream, item: BoundaryStream) -> BoundaryStream /// - `Reflect` makes Typst's runtime aware of the type's characteristics. /// It's important for autocompletion, error messages, etc. /// - `FromValue` defines how to cast from a value into this type. 
-/// - `IntoValue` defines how to cast fromthis type into a value. +/// - `IntoValue` defines how to cast from this type into a value. /// /// ```ignore /// /// An integer between 0 and 13. diff --git a/tests/ref/text-font-covers-chinese.png b/tests/ref/text-font-covers-chinese.png new file mode 100644 index 000000000..5c9b4b1e1 Binary files /dev/null and b/tests/ref/text-font-covers-chinese.png differ diff --git a/tests/ref/text-font-covers-numbers.png b/tests/ref/text-font-covers-numbers.png new file mode 100644 index 000000000..9ed95c2f9 Binary files /dev/null and b/tests/ref/text-font-covers-numbers.png differ diff --git a/tests/suite/text/font.typ b/tests/suite/text/font.typ index bb75f4ae7..5af8dcb9c 100644 --- a/tests/suite/text/font.typ +++ b/tests/suite/text/font.typ @@ -112,3 +112,40 @@ I [ ] text(fill: t, "Hello") }) + +--- text-font-types --- +#let ubuntu = (name: "Ubuntu", covers: regex("[\u{20}-\u{FFFF}]")) +#set text(font: ubuntu) +#set text(font: (ubuntu, "Ubuntu")) + +--- text-font-covers-chinese --- +// Without ranges, the quotation mark is using the Latin font. +#set text(font: ("Ubuntu", "Noto Serif CJK SC")) +分别设置“中文”和English字体 + +// With ranges, the quotation mark is using the Chinese font. +#set text(font: ((name: "Noto Serif CJK SC", covers: regex("[\u{00B7}-\u{3134F}]")), "Ubuntu")) +分别设置“中文”和English字体 + +// With "latin-in-cjk", the quotation mark is also using the Chinese font. +#set text(font: ((name: "Ubuntu", covers: "latin-in-cjk"), "Noto Serif CJK SC")) +分别设置“中文”和English字体 + +--- text-font-covers-numbers --- +// Change font only for numbers. +#set text(font: ( + (name: "PT Sans", covers: regex("[0-9]")), + "Libertinus Serif" +)) + +The number 123. 
+ +--- text-font-covers-bad-1 --- +// Error: 17-59 coverage regex may only use dot, letters, and character classes +// Hint: 17-59 the regex is applied to each letter individually +#set text(font: (name: "Ubuntu", covers: regex("20-FFFF"))) + +--- text-font-covers-bad-2 --- +// Error: 17-65 coverage regex may only use dot, letters, and character classes +// Hint: 17-65 the regex is applied to each letter individually +#set text(font: (name: "Ubuntu", covers: regex("\u{20}-\u{10}")))