Support for defining which charset should be covered by a font (#5305)

Co-authored-by: Laurenz <laurmaedje@gmail.com>
This commit is contained in:
Peng Guanwen 2024-12-17 22:07:45 +08:00 committed by GitHub
parent 54cee16c31
commit 73253d4651
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 211 additions and 42 deletions

1
Cargo.lock generated
View File

@ -2931,6 +2931,7 @@ dependencies = [
"qcms",
"rayon",
"regex",
"regex-syntax",
"roxmltree",
"rust_decimal",
"rustybuzz",

View File

@ -94,6 +94,7 @@ qcms = "0.3.0"
quote = "1"
rayon = "1.7.0"
regex = "1"
regex-syntax = "0.8"
resvg = { version = "0.43", default-features = false, features = ["raster-images"] }
roxmltree = "0.20"
rust_decimal = { version = "1.36.0", default-features = false, features = ["maths"] }

View File

@ -54,7 +54,7 @@ pub fn layout_image(
format,
elem.alt(styles),
engine.world,
&families(styles).collect::<Vec<_>>(),
&families(styles).map(|f| f.as_str()).collect::<Vec<_>>(),
elem.flatten_text(styles),
)
.at(span)?;

View File

@ -11,8 +11,8 @@ use typst_library::engine::Engine;
use typst_library::foundations::{Smart, StyleChain};
use typst_library::layout::{Abs, Dir, Em, Frame, FrameItem, Point, Size};
use typst_library::text::{
families, features, is_default_ignorable, variant, Font, FontVariant, Glyph, Lang,
Region, TextEdgeBounds, TextElem, TextItem,
families, features, is_default_ignorable, variant, Font, FontFamily, FontVariant,
Glyph, Lang, Region, TextEdgeBounds, TextElem, TextItem,
};
use typst_library::World;
use typst_utils::SliceExt;
@ -351,7 +351,7 @@ impl<'a> ShapedText<'a> {
for family in families(self.styles) {
if let Some(font) = world
.book()
.select(family, self.variant)
.select(family.as_str(), self.variant)
.and_then(|id| world.font(id))
{
expand(&font, TextEdgeBounds::Zero);
@ -463,7 +463,8 @@ impl<'a> ShapedText<'a> {
None
};
let mut chain = families(self.styles)
.map(|family| book.select(family, self.variant))
.filter(|family| family.covers().map_or(true, |c| c.is_match("-")))
.map(|family| book.select(family.as_str(), self.variant))
.chain(fallback_func.iter().map(|f| f()))
.flatten();
@ -719,7 +720,7 @@ fn shape_segment<'a>(
ctx: &mut ShapingContext,
base: usize,
text: &str,
mut families: impl Iterator<Item = &'a str> + Clone,
mut families: impl Iterator<Item = &'a FontFamily> + Clone,
) {
// Don't try shaping newlines, tabs, or default ignorables.
if text
@ -732,11 +733,18 @@ fn shape_segment<'a>(
// Find the next available family.
let world = ctx.engine.world;
let book = world.book();
let mut selection = families.find_map(|family| {
book.select(family, ctx.variant)
let mut selection = None;
let mut covers = None;
for family in families.by_ref() {
selection = book
.select(family.as_str(), ctx.variant)
.and_then(|id| world.font(id))
.filter(|font| !ctx.used.contains(font))
});
.filter(|font| !ctx.used.contains(font));
if selection.is_some() {
covers = family.covers();
break;
}
}
// Do font fallback if the families are exhausted and fallback is enabled.
if selection.is_none() && ctx.fallback {
@ -795,6 +803,16 @@ fn shape_segment<'a>(
let pos = buffer.glyph_positions();
let ltr = ctx.dir.is_positive();
// Whether the character at the given offset is covered by the coverage.
let is_covered = |offset| {
let end = text[offset..]
.char_indices()
.nth(1)
.map(|(i, _)| offset + i)
.unwrap_or(text.len());
covers.map_or(true, |cov| cov.is_match(&text[offset..end]))
};
// Collect the shaped glyphs, doing fallback and shaping parts again with
// the next font if necessary.
let mut i = 0;
@ -803,7 +821,7 @@ fn shape_segment<'a>(
let cluster = info.cluster as usize;
// Add the glyph to the shaped output.
if info.glyph_id != 0 {
if info.glyph_id != 0 && is_covered(cluster) {
// Determine the text range of the glyph.
let start = base + cluster;
let end = base
@ -836,7 +854,9 @@ fn shape_segment<'a>(
} else {
// First, search for the end of the tofu sequence.
let k = i;
while infos.get(i + 1).is_some_and(|info| info.glyph_id == 0) {
while infos.get(i + 1).is_some_and(|info| {
info.glyph_id == 0 || !is_covered(info.cluster as usize)
}) {
i += 1;
}

View File

@ -237,7 +237,7 @@ fn find_math_font(
let variant = variant(styles);
let world = engine.world;
let Some(font) = families(styles).find_map(|family| {
let id = world.book().select(family, variant)?;
let id = world.book().select(family.as_str(), variant)?;
let font = world.font(id)?;
let _ = font.ttf().tables().math?.constants?;
Some(font)

View File

@ -44,6 +44,7 @@ png = { workspace = true }
qcms = { workspace = true }
rayon = { workspace = true }
regex = { workspace = true }
regex-syntax = { workspace = true }
roxmltree = { workspace = true }
rust_decimal = { workspace = true }
rustybuzz = { workspace = true }

View File

@ -29,6 +29,7 @@ pub use self::smartquote::*;
pub use self::space::*;
use std::fmt::{self, Debug, Formatter};
use std::hash::Hash;
use std::sync::LazyLock;
use ecow::{eco_format, EcoString};
@ -39,13 +40,14 @@ use rustybuzz::Feature;
use smallvec::SmallVec;
use ttf_parser::Tag;
use typst_syntax::Spanned;
use typst_utils::singleton;
use crate::diag::{bail, warning, HintedStrResult, SourceResult};
use crate::engine::Engine;
use crate::foundations::{
cast, category, dict, elem, Args, Array, Cast, Category, Construct, Content, Dict,
Fold, IntoValue, NativeElement, Never, NoneValue, Packed, PlainText, Repr, Resolve,
Scope, Set, Smart, StyleChain,
Fold, IntoValue, NativeElement, Never, NoneValue, Packed, PlainText, Regex, Repr,
Resolve, Scope, Set, Smart, StyleChain,
};
use crate::layout::{Abs, Axis, Dir, Em, Length, Ratio, Rel};
use crate::model::ParElem;
@ -94,7 +96,21 @@ pub(super) fn define(global: &mut Scope) {
/// ```
#[elem(Debug, Construct, PlainText, Repr)]
pub struct TextElem {
/// A font family name or priority list of font family names.
/// A font family descriptor or priority list of font family descriptors.
///
/// A font family descriptor can be a plain string representing the family
/// name or a dictionary with the following keys:
///
/// - `name` (required): The font family name.
/// - `covers` (optional): Defines the Unicode codepoints for which the
/// family shall be used. This can be:
/// - A predefined coverage set:
/// - `{"latin-in-cjk"}` covers all codepoints except for those which
/// exist in Latin fonts, but should preferably be taken from CJK
/// fonts.
/// - A [regular expression]($regex) that defines exactly which codepoints
/// shall be covered. Accepts only the subset of regular expressions
/// which consist of exactly one dot, letter, or character class.
///
/// When processing text, Typst tries all specified font families in order
/// until it finds a font that has the necessary glyphs. In the example
@ -129,6 +145,21 @@ pub struct TextElem {
///
/// This is Latin. \
/// هذا عربي.
///
/// // Change font only for numbers.
/// #set text(font: (
/// (name: "PT Sans", covers: regex("[0-9]")),
/// "Libertinus Serif"
/// ))
///
/// The number 123.
///
/// // Mix Latin and CJK fonts.
/// #set text(font: (
/// (name: "Inria Serif", covers: "latin-in-cjk"),
/// "Noto Serif CJK SC"
/// ))
/// 分别设置“中文”和English字体
/// ```
#[parse({
let font_list: Option<Spanned<FontList>> = args.named("font")?;
@ -766,35 +797,107 @@ impl PlainText for Packed<TextElem> {
}
/// A lowercased font family like "arial".
#[derive(Clone, Eq, PartialEq, Hash)]
pub struct FontFamily(EcoString);
#[derive(Debug, Clone, PartialEq, Hash)]
pub struct FontFamily {
/// The lowercased name of the font family.
name: EcoString,
/// The Unicode codepoints this family should be used for, if restricted.
covers: Option<Covers>,
}
impl FontFamily {
/// Create a named font family variant.
pub fn new(string: &str) -> Self {
Self(string.to_lowercase().into())
Self::with_coverage(string, None)
}
/// Create a font family by name and optional Unicode coverage.
pub fn with_coverage(string: &str, covers: Option<Covers>) -> Self {
Self { name: string.to_lowercase().into(), covers }
}
/// The lowercased family name.
pub fn as_str(&self) -> &str {
&self.0
}
&self.name
}
impl Debug for FontFamily {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
self.0.fmt(f)
/// The user-set coverage of the font family.
pub fn covers(&self) -> Option<&Regex> {
self.covers.as_ref().map(|covers| covers.as_regex())
}
}
cast! {
FontFamily,
self => self.0.into_value(),
self => self.name.into_value(),
string: EcoString => Self::new(&string),
mut v: Dict => {
let ret = Self::with_coverage(
&v.take("name")?.cast::<EcoString>()?,
v.take("covers").ok().map(|v| v.cast()).transpose()?
);
v.finish(&["name", "covers"])?;
ret
},
}
/// Defines which codepoints a font family will be used for.
///
/// Every variant can be turned into a regular expression through
/// `Covers::as_regex`; that regex is what callers match individual characters
/// against.
#[derive(Debug, Clone, PartialEq, Hash)]
pub enum Covers {
/// Covers all codepoints except those used both in Latin and CJK fonts.
///
/// `as_regex` resolves this to a negated character class, i.e. everything
/// matches except a fixed list of punctuation codepoints (middle dot,
/// dashes, curly quotation marks, U+2025 to U+2027, and the two-em dash).
LatinInCjk,
/// Covers the set of codepoints for which the regex matches.
///
/// When cast from a user-supplied value, only a regex consisting of a single
/// dot, literal, or character class is accepted, because the regex is
/// applied to each character individually.
Regex(Regex),
}
impl Covers {
    /// Returns the regular expression that decides, character by character,
    /// whether this coverage applies.
    ///
    /// A user-provided [`Covers::Regex`] is handed back unchanged, while
    /// [`Covers::LatinInCjk`] resolves to a shared regex obtained through
    /// `singleton!`.
    pub fn as_regex(&self) -> &Regex {
        match self {
            Self::Regex(custom) => custom,
            Self::LatinInCjk => {
                // A negated class: everything matches except the listed
                // punctuation (middle dot, en/em dash, curly single and
                // double quotes, U+2025 to U+2027, and the two-em dash).
                let shared: &Regex = singleton!(
                    Regex,
                    Regex::new(
                        "[^\u{00B7}\u{2013}\u{2014}\u{2018}\u{2019}\
                         \u{201C}\u{201D}\u{2025}-\u{2027}\u{2E3A}]"
                    )
                    .unwrap()
                );
                shared
            }
        }
    }
}
cast! {
    Covers,
    self => match self {
        Self::Regex(regex) => regex.into_value(),
        Self::LatinInCjk => "latin-in-cjk".into_value(),
    },

    /// Covers all codepoints except those used both in Latin and CJK fonts.
    "latin-in-cjk" => Covers::LatinInCjk,

    regex: Regex => {
        use regex_syntax::ast::{parse::Parser, Ast};

        // The coverage regex is tested against one character at a time, so
        // only a pattern whose top-level AST node is a single dot, literal,
        // or character class is accepted. Parse failures are rejected with
        // the same error.
        let parsed = Parser::new().parse(regex.as_str());
        let single_char = matches!(
            parsed,
            Ok(Ast::Dot(..)
                | Ast::Literal(..)
                | Ast::ClassPerl(..)
                | Ast::ClassUnicode(..)
                | Ast::ClassBracketed(..))
        );
        if !single_char {
            bail!(
                "coverage regex may only use dot, letters, and character classes";
                hint: "the regex is applied to each letter individually"
            );
        }
        Covers::Regex(regex)
    },
}
/// Font family fallback list.
#[derive(Debug, Default, Clone, Eq, PartialEq, Hash)]
#[derive(Debug, Default, Clone, PartialEq, Hash)]
pub struct FontList(pub Vec<FontFamily>);
impl<'a> IntoIterator for &'a FontList {
@ -809,7 +912,7 @@ impl<'a> IntoIterator for &'a FontList {
cast! {
FontList,
self => if self.0.len() == 1 {
self.0.into_iter().next().unwrap().0.into_value()
self.0.into_iter().next().unwrap().name.into_value()
} else {
self.0.into_value()
},
@ -818,20 +921,22 @@ cast! {
}
/// Resolve a prioritized iterator over the font families.
pub fn families(styles: StyleChain) -> impl Iterator<Item = &str> + Clone {
const FALLBACKS: &[&str] = &[
pub fn families(styles: StyleChain) -> impl Iterator<Item = &FontFamily> + Clone {
let fallbacks = singleton!(Vec<FontFamily>, {
[
"libertinus serif",
"twitter color emoji",
"noto color emoji",
"apple color emoji",
"segoe ui emoji",
];
let tail = if TextElem::fallback_in(styles) { FALLBACKS } else { &[] };
TextElem::font_in(styles)
]
.into_iter()
.map(|family| family.as_str())
.chain(tail.iter().copied())
.map(FontFamily::new)
.collect()
});
let tail = if TextElem::fallback_in(styles) { fallbacks.as_slice() } else { &[] };
TextElem::font_in(styles).into_iter().chain(tail.iter())
}
/// Resolve the font variant.

View File

@ -157,7 +157,11 @@ fn is_shapable(engine: &Engine, text: &str, styles: StyleChain) -> bool {
.select(family.as_str(), variant(styles))
.and_then(|id| world.font(id))
{
return text.chars().all(|c| font.ttf().glyph_index(c).is_some());
let covers = family.covers();
return text.chars().all(|c| {
covers.map_or(true, |cov| cov.is_match(c.encode_utf8(&mut [0; 4])))
&& font.ttf().glyph_index(c).is_some()
});
}
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 500 B

View File

@ -112,3 +112,40 @@ I
[ ]
text(fill: t, "Hello")
})
--- text-font-types ---
#let ubuntu = (name: "Ubuntu", covers: regex("[\u{20}-\u{FFFF}]"))
#set text(font: ubuntu)
#set text(font: (ubuntu, "Ubuntu"))
--- text-font-covers-chinese ---
// Without ranges, the quotation mark is using the Latin font.
#set text(font: ("Ubuntu", "Noto Serif CJK SC"))
分别设置“中文”和English字体
// With ranges, the quotation mark is using the Chinese font.
#set text(font: ((name: "Noto Serif CJK SC", covers: regex("[\u{00B7}-\u{3134F}]")), "Ubuntu"))
分别设置“中文”和English字体
// With "latin-in-cjk", the quotation mark is also using the Chinese font.
#set text(font: ((name: "Ubuntu", covers: "latin-in-cjk"), "Noto Serif CJK SC"))
分别设置“中文”和English字体
--- text-font-covers-numbers ---
// Change font only for numbers.
#set text(font: (
(name: "PT Sans", covers: regex("[0-9]")),
"Libertinus Serif"
))
The number 123.
--- text-font-covers-bad-1 ---
// Error: 17-59 coverage regex may only use dot, letters, and character classes
// Hint: 17-59 the regex is applied to each letter individually
#set text(font: (name: "Ubuntu", covers: regex("20-FFFF")))
--- text-font-covers-bad-2 ---
// Error: 17-65 coverage regex may only use dot, letters, and character classes
// Hint: 17-65 the regex is applied to each letter individually
#set text(font: (name: "Ubuntu", covers: regex("\u{20}-\u{10}")))