use std::cmp::Reverse; use std::collections::BTreeMap; use serde::{Deserialize, Serialize}; use ttf_parser::{name_id, PlatformId, Tag}; use unicode_segmentation::UnicodeSegmentation; use super::{Font, FontStretch, FontStyle, FontVariant, FontWeight}; /// Metadata about a collection of fonts. #[derive(Default, Clone, Hash)] pub struct FontBook { /// Maps from lowercased family names to font indices. families: BTreeMap>, /// Metadata about each font in the collection. infos: Vec, } impl FontBook { /// Create a new, empty font book. pub fn new() -> Self { Self { families: BTreeMap::new(), infos: vec![] } } /// Create a font book for a collection of fonts. pub fn from_fonts<'a>(fonts: impl IntoIterator) -> Self { let mut book = Self::new(); for font in fonts { book.push(font.info().clone()); } book } /// Insert metadata into the font book. pub fn push(&mut self, info: FontInfo) { let index = self.infos.len(); let family = info.family.to_lowercase(); self.families.entry(family).or_default().push(index); self.infos.push(info); } /// An ordered iterator over all font families this book knows and details /// about the fonts that are part of them. pub fn families( &self, ) -> impl Iterator)> + '_ { // Since the keys are lowercased, we instead use the family field of the // first face's info. self.families.values().map(|ids| { let family = self.infos[ids[0]].family.as_str(); let infos = ids.iter().map(|&id| &self.infos[id]); (family, infos) }) } /// Try to find and load a font from the given `family` that matches /// the given `variant` as closely as possible. /// /// The `family` should be all lowercase. pub fn select(&self, family: &str, variant: FontVariant) -> Option { let ids = self.families.get(family)?; self.find_best_variant(None, variant, ids.iter().copied()) } /// Try to find and load a fallback font that /// - is as close as possible to the font `like` (if any) /// - is as close as possible to the given `variant` /// - is suitable for shaping the given `text` pub fn select_fallback( &self, like: Option<&FontInfo>, variant: FontVariant, text: &str, ) -> Option { // Find the fonts that contain the text's first char ... let c = text.chars().next()?; let ids = self .infos .iter() .enumerate() .filter(|(_, info)| info.coverage.contains(c as u32)) .map(|(index, _)| index); // ... and find the best variant among them. self.find_best_variant(like, variant, ids) } /// Find the font in the passed iterator that /// - is closest to the font `like` (if any) /// - is closest to the given `variant` /// /// To do that we compute a key for all variants and select the one with the /// minimal key. This key prioritizes: /// - If `like` is some other font: /// - Are both fonts (not) monospaced? /// - Do both fonts (not) have serifs? /// - How many words do the families share in their prefix? E.g. "Noto /// Sans" and "Noto Sans Arabic" share two words, whereas "IBM Plex /// Arabic" shares none with "Noto Sans", so prefer "Noto Sans Arabic" /// if `like` is "Noto Sans". In case there are two equally good /// matches, we prefer the shorter one because it is less special (e.g. /// if `like` is "Noto Sans Arabic", we prefer "Noto Sans" over "Noto /// Sans CJK HK".) /// - The style (normal / italic / oblique). If we want italic or oblique /// but it doesn't exist, the other one of the two is still better than /// normal. /// - The absolute distance to the target stretch. /// - The absolute distance to the target weight. fn find_best_variant( &self, like: Option<&FontInfo>, variant: FontVariant, ids: impl IntoIterator, ) -> Option { let mut best = None; let mut best_key = None; for id in ids { let current = &self.infos[id]; let key = ( like.map(|like| { ( current.flags.contains(FontFlags::MONOSPACE) != like.flags.contains(FontFlags::MONOSPACE), current.flags.contains(FontFlags::SERIF) != like.flags.contains(FontFlags::SERIF), Reverse(shared_prefix_words(¤t.family, &like.family)), current.family.len(), ) }), current.variant.style.distance(variant.style), current.variant.stretch.distance(variant.stretch), current.variant.weight.distance(variant.weight), ); if best_key.map_or(true, |b| key < b) { best = Some(id); best_key = Some(key); } } best } } /// Properties of a single font. #[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)] pub struct FontInfo { /// The typographic font family this font is part of. pub family: String, /// Properties that distinguish this font from other fonts in the same /// family. pub variant: FontVariant, /// Properties of the font. pub flags: FontFlags, /// The unicode coverage of the font. pub coverage: Coverage, } bitflags::bitflags! { /// Bitflags describing characteristics of a font. #[derive(Serialize, Deserialize)] pub struct FontFlags: u32 { /// All glyphs have the same width. const MONOSPACE = 1 << 0; /// Glyphs have short strokes at their stems. const SERIF = 1 << 1; } } impl FontInfo { /// Compute metadata for all fonts in the given data. pub fn from_data(data: &[u8]) -> impl Iterator + '_ { let count = ttf_parser::fonts_in_collection(data).unwrap_or(1); (0 .. count).filter_map(move |index| { let ttf = ttf_parser::Face::parse(data, index).ok()?; Self::from_ttf(&ttf) }) } /// Compute metadata for a single ttf-parser face. pub fn from_ttf(ttf: &ttf_parser::Face) -> Option { // We cannot use Name ID 16 "Typographic Family", because for some // fonts it groups together more than just Style / Weight / Stretch // variants (e.g. Display variants of Noto fonts) and then some // variants become inaccessible from Typst. And even though the // fsSelection bit WWS should help us decide whether that is the // case, it's wrong for some fonts (e.g. for certain variants of "Noto // Sans Display"). // // So, instead we use Name ID 1 "Family" and trim many common // suffixes for which know that they just describe styling (e.g. // "ExtraBold"). // // Also, for Noto fonts we use Name ID 4 "Full Name" instead, // because Name ID 1 "Family" sometimes contains "Display" and // sometimes doesn't for the Display variants and that mixes things // up. let family = { let mut family = find_name(ttf, name_id::FAMILY)?; if family.starts_with("Noto") { family = find_name(ttf, name_id::FULL_NAME)?; } typographic_family(&family).to_string() }; let variant = { let mut full = find_name(ttf, name_id::FULL_NAME).unwrap_or_default(); full.make_ascii_lowercase(); // Some fonts miss the relevant bits for italic or oblique, so // we also try to infer that from the full name. let italic = ttf.is_italic() || full.contains("italic"); let oblique = ttf.is_oblique() || full.contains("oblique") || full.contains("slanted"); let style = match (italic, oblique) { (false, false) => FontStyle::Normal, (true, _) => FontStyle::Italic, (_, true) => FontStyle::Oblique, }; let weight = FontWeight::from_number(ttf.weight().to_number()); let stretch = FontStretch::from_number(ttf.width().to_number()); FontVariant { style, weight, stretch } }; // Determine the unicode coverage. let mut codepoints = vec![]; for subtable in ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) { if subtable.is_unicode() { subtable.codepoints(|c| codepoints.push(c)); } } let mut flags = FontFlags::empty(); flags.set(FontFlags::MONOSPACE, ttf.is_monospaced()); // Determine whether this is a serif or sans-serif font. if let Some(panose) = ttf .raw_face() .table(Tag::from_bytes(b"OS/2")) .and_then(|os2| os2.get(32 .. 45)) { if matches!(panose, [2, 2 ..= 10, ..]) { flags.insert(FontFlags::SERIF); } } Some(FontInfo { family, variant, flags, coverage: Coverage::from_vec(codepoints), }) } } /// Try to find and decode the name with the given id. pub(super) fn find_name(ttf: &ttf_parser::Face, name_id: u16) -> Option { ttf.names().into_iter().find_map(|entry| { if entry.name_id == name_id { if let Some(string) = entry.to_string() { return Some(string); } if entry.platform_id == PlatformId::Macintosh && entry.encoding_id == 0 { return Some(decode_mac_roman(entry.name)); } } None }) } /// Decode mac roman encoded bytes into a string. fn decode_mac_roman(coded: &[u8]) -> String { #[rustfmt::skip] const TABLE: [char; 128] = [ 'Ä', 'Å', 'Ç', 'É', 'Ñ', 'Ö', 'Ü', 'á', 'à', 'â', 'ä', 'ã', 'å', 'ç', 'é', 'è', 'ê', 'ë', 'í', 'ì', 'î', 'ï', 'ñ', 'ó', 'ò', 'ô', 'ö', 'õ', 'ú', 'ù', 'û', 'ü', '†', '°', '¢', '£', '§', '•', '¶', 'ß', '®', '©', '™', '´', '¨', '≠', 'Æ', 'Ø', '∞', '±', '≤', '≥', '¥', 'µ', '∂', '∑', '∏', 'π', '∫', 'ª', 'º', 'Ω', 'æ', 'ø', '¿', '¡', '¬', '√', 'ƒ', '≈', '∆', '«', '»', '…', '\u{a0}', 'À', 'Ã', 'Õ', 'Œ', 'œ', '–', '—', '“', '”', '‘', '’', '÷', '◊', 'ÿ', 'Ÿ', '⁄', '€', '‹', '›', 'fi', 'fl', '‡', '·', '‚', '„', '‰', 'Â', 'Ê', 'Á', 'Ë', 'È', 'Í', 'Î', 'Ï', 'Ì', 'Ó', 'Ô', '\u{f8ff}', 'Ò', 'Ú', 'Û', 'Ù', 'ı', 'ˆ', '˜', '¯', '˘', '˙', '˚', '¸', '˝', '˛', 'ˇ', ]; fn char_from_mac_roman(code: u8) -> char { if code < 128 { code as char } else { TABLE[(code - 128) as usize] } } coded.iter().copied().map(char_from_mac_roman).collect() } /// Trim style naming from a family name. fn typographic_family(mut family: &str) -> &str { // Separators between names, modifiers and styles. const SEPARATORS: [char; 3] = [' ', '-', '_']; // Modifiers that can appear in combination with suffixes. const MODIFIERS: &[&str] = &[ "extra", "ext", "ex", "x", "semi", "sem", "sm", "demi", "dem", "ultra", ]; // Style suffixes. #[rustfmt::skip] const SUFFIXES: &[&str] = &[ "normal", "italic", "oblique", "slanted", "thin", "th", "hairline", "light", "lt", "regular", "medium", "med", "md", "bold", "bd", "demi", "extb", "black", "blk", "bk", "heavy", "narrow", "condensed", "cond", "cn", "cd", "compressed", "expanded", "exp" ]; // Trim spacing and weird leading dots in Apple fonts. family = family.trim().trim_start_matches('.'); // Lowercase the string so that the suffixes match case-insensitively. let lower = family.to_ascii_lowercase(); let mut len = usize::MAX; let mut trimmed = lower.as_str(); // Trim style suffixes repeatedly. while trimmed.len() < len { len = trimmed.len(); // Find style suffix. let mut t = match SUFFIXES.iter().find_map(|s| trimmed.strip_suffix(s)) { Some(t) => t, None => break, }; // Strip optional separator. if let Some(s) = t.strip_suffix(SEPARATORS) { trimmed = s; t = s; } // Also allow an extra modifier, but apply it only if it is separated it // from the text before it (to prevent false positives). if let Some(t) = MODIFIERS.iter().find_map(|s| t.strip_suffix(s)) { if let Some(stripped) = t.strip_suffix(SEPARATORS) { trimmed = stripped; } } } &family[.. len] } /// How many words the two strings share in their prefix. fn shared_prefix_words(left: &str, right: &str) -> usize { left.unicode_words() .zip(right.unicode_words()) .take_while(|(l, r)| l == r) .count() } /// A compactly encoded set of codepoints. /// /// The set is represented by alternating specifications of how many codepoints /// are not in the set and how many are in the set. /// /// For example, for the set `{2, 3, 4, 9, 10, 11, 15, 18, 19}`, there are: /// - 2 codepoints not inside (0, 1) /// - 3 codepoints inside (2, 3, 4) /// - 4 codepoints not inside (5, 6, 7, 8) /// - 3 codepoints inside (9, 10, 11) /// - 3 codepoints not inside (12, 13, 14) /// - 1 codepoint inside (15) /// - 2 codepoints not inside (16, 17) /// - 2 codepoints inside (18, 19) /// /// So the resulting encoding is `[2, 3, 4, 3, 3, 1, 2, 2]`. #[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)] #[serde(transparent)] pub struct Coverage(Vec); impl Coverage { /// Encode a vector of codepoints. pub fn from_vec(mut codepoints: Vec) -> Self { codepoints.sort(); codepoints.dedup(); let mut runs = Vec::new(); let mut next = 0; for c in codepoints { if let Some(run) = runs.last_mut().filter(|_| c == next) { *run += 1; } else { runs.push(c - next); runs.push(1); } next = c + 1; } Self(runs) } /// Whether the codepoint is covered. pub fn contains(&self, c: u32) -> bool { let mut inside = false; let mut cursor = 0; for &run in &self.0 { if (cursor .. cursor + run).contains(&c) { return inside; } cursor += run; inside = !inside; } false } } #[cfg(test)] mod tests { use super::*; #[test] fn test_trim_styles() { assert_eq!(typographic_family("Atma Light"), "Atma"); assert_eq!(typographic_family("eras bold"), "eras"); assert_eq!(typographic_family("footlight mt light"), "footlight mt"); assert_eq!(typographic_family("times new roman"), "times new roman"); assert_eq!( typographic_family("noto sans mono cond sembd"), "noto sans mono" ); assert_eq!(typographic_family("noto serif SEMCOND sembd"), "noto serif"); assert_eq!(typographic_family("crimson text"), "crimson text"); assert_eq!(typographic_family("footlight light"), "footlight"); assert_eq!(typographic_family("Noto Sans"), "Noto Sans"); assert_eq!(typographic_family("Noto Sans Light"), "Noto Sans"); assert_eq!( typographic_family("Noto Sans Semicondensed Heavy"), "Noto Sans" ); assert_eq!(typographic_family("Familx"), "Familx"); assert_eq!(typographic_family("Font Ultra"), "Font Ultra"); assert_eq!(typographic_family("Font Ultra Bold"), "Font"); } #[test] fn test_coverage() { #[track_caller] fn test(set: &[u32], runs: &[u32]) { let coverage = Coverage::from_vec(set.to_vec()); assert_eq!(coverage.0, runs); let max = 5 + set.iter().copied().max().unwrap_or_default(); for c in 0 .. max { assert_eq!(set.contains(&c), coverage.contains(c)); } } test(&[], &[]); test(&[0], &[0, 1]); test(&[1], &[1, 1]); test(&[0, 1], &[0, 2]); test(&[0, 1, 3], &[0, 2, 1, 1]); test( // {2, 3, 4, 9, 10, 11, 15, 18, 19} &[18, 19, 2, 4, 9, 11, 15, 3, 3, 10], &[2, 3, 4, 3, 3, 1, 2, 2], ) } }