//! OpenType font subsetting. use std::borrow::Cow; use std::collections::HashSet; use std::convert::{TryFrom, TryInto}; use std::iter; use std::ops::Range; use ttf_parser::parser::{ FromData, LazyArray16, LazyArray32, Offset, Offset16, Offset32, Stream, F2DOT14, }; use ttf_parser::Tag; /// Subset a font face for PDF embedding. /// /// This will remove the outlines of all glyphs that are not part of the given /// slice. Furthmore, all character mapping and layout tables are dropped as /// shaping has already happened. /// /// Returns `None` if the font data is fatally broken (in which case /// `ttf-parser` would probably already have rejected the font, so this /// shouldn't happen if the font data has already passed through `ttf-parser`). pub fn subset(data: &[u8], index: u32, glyphs: &HashSet) -> Option> { Some(Subsetter::new(data, index, glyphs)?.subset()) } struct Subsetter<'a> { data: &'a [u8], magic: Magic, records: LazyArray16<'a, TableRecord>, num_glyphs: u16, glyphs: &'a HashSet, tables: Vec<(Tag, Cow<'a, [u8]>)>, } impl<'a> Subsetter<'a> { /// Parse the font header and create a new subsetter. fn new(data: &'a [u8], index: u32, glyphs: &'a HashSet) -> Option { let mut s = Stream::new(data); let mut magic = s.read::()?; if magic == Magic::Collection { // Parse font collection header if necessary. s.skip::(); let num_faces = s.read::()?; let offsets = s.read_array32::(num_faces)?; let offset = offsets.get(index)?.to_usize(); s = Stream::new_at(data, offset)?; magic = s.read::()?; if magic == Magic::Collection { return None; } } // Read number of table records. let count = s.read::()?; // Skip boring parts of header. s.skip::(); s.skip::(); s.skip::(); // Read the table records. let records = s.read_array16::(count)?; let mut subsetter = Self { data, magic, records, num_glyphs: 0, glyphs, tables: vec![], }; // Find out number of glyphs. 
let maxp = subsetter.table_data(MAXP)?; subsetter.num_glyphs = Stream::read_at::(maxp, 4)?; Some(subsetter) } /// Encode the subsetted font file. fn subset(mut self) -> Vec { self.subset_tables(); // Start writing a brand new font. let mut w = Vec::new(); w.write(self.magic); // Write table directory. let count = self.tables.len() as u16; let entry_selector = (count as f32).log2().floor() as u16; let search_range = entry_selector.pow(2) * 16; let range_shift = count * 16 - search_range; w.write(count); w.write(search_range); w.write(entry_selector); w.write(range_shift); // Tables shall be sorted by tag. self.tables.sort_by_key(|&(tag, _)| tag); // This variable will hold the offset to the checksum adjustment field // in the head table, which we'll have to write in the end (after // checksumming the whole font). let mut checksum_adjustment_offset = None; // Write table records. let mut offset = 12 + self.tables.len() * TableRecord::SIZE; for (tag, data) in &mut self.tables { if *tag == HEAD { // Zero out checksum field in head table. data.to_mut()[8 .. 12].copy_from_slice(&[0; 4]); checksum_adjustment_offset = Some(offset + 8); } let len = data.len(); w.write(TableRecord { tag: *tag, checksum: checksum(data), offset: offset as u32, length: len as u32, }); // Account for the padding to 4 bytes. offset += len + len % 4; } // Write tables. for (_, data) in &self.tables { // Write data plus padding zeros to align to 4 bytes. w.extend(data.as_ref()); w.extend(iter::repeat(0).take(data.len() % 4)); } // Write checksumAdjustment field in head table. if let Some(i) = checksum_adjustment_offset { let sum = checksum(&w); let val = 0xB1B0AFBA_u32.wrapping_sub(sum); w[i .. i + 4].copy_from_slice(&val.to_be_bytes()); } w } /// Subset, drop and copy tables. fn subset_tables(&mut self) { // Remove unnecessary name information. let handled_post = post::subset(self).is_some(); // Remove unnecessary glyph outlines. 
let handled_glyf_loca = glyf::subset(self).is_some(); let handled_cff1 = cff::subset_v1(self).is_some(); for record in self.records { // If `handled` is true, we don't take any further action, if it's // false, we copy the table. #[rustfmt::skip] let handled = match &record.tag.to_bytes() { // Drop: Glyphs are already mapped. b"cmap" => true, // Drop: Layout is already finished. b"GPOS" | b"GSUB" | b"BASE" | b"JSTF" | b"MATH" | b"ankr" | b"kern" | b"kerx" | b"mort" | b"morx" | b"trak" | b"bsln" | b"just" | b"feat" | b"prop" => true, // Drop: They don't render in PDF viewers anyway. // TODO: We probably have to convert fonts with one of these // tables into Type 3 fonts where glyphs are described by either // PDF graphics operators or XObject images. b"CBDT" | b"CBLC" | b"COLR" | b"CPAL" | b"sbix" | b"SVG " => true, // Subsetted: Subsetting happens outside the loop, but if it // failed, we simply copy the affected table(s). b"post" => handled_post, b"loca" | b"glyf" => handled_glyf_loca, b"CFF " => handled_cff1, // Copy: All other tables are simply copied. _ => false, }; if !handled { if let Some(data) = self.table_data(record.tag) { self.push_table(record.tag, data); } } } } /// Retrieve the table data for a table. fn table_data(&mut self, tag: Tag) -> Option<&'a [u8]> { let (_, record) = self.records.binary_search_by(|record| record.tag.cmp(&tag))?; let start = record.offset as usize; let end = start + (record.length as usize); self.data.get(start .. end) } /// Push a new table. fn push_table(&mut self, tag: Tag, data: impl Into>) { self.tables.push((tag, data.into())); } } // Some common tags. const HEAD: Tag = Tag::from_bytes(b"head"); const MAXP: Tag = Tag::from_bytes(b"maxp"); const POST: Tag = Tag::from_bytes(b"post"); const LOCA: Tag = Tag::from_bytes(b"loca"); const GLYF: Tag = Tag::from_bytes(b"glyf"); const CFF1: Tag = Tag::from_bytes(b"CFF "); /// Calculate a checksum over the sliced data as sum of u32's. The data length /// must be a multiple of four. 
fn checksum(data: &[u8]) -> u32 {
    data.chunks(4).fold(0u32, |acc, chunk| {
        // A short trailing chunk is extended with zeros on the right before
        // being interpreted as a big-endian word.
        let mut word = [0u8; 4];
        word[.. chunk.len()].copy_from_slice(chunk);
        acc.wrapping_add(u32::from_be_bytes(word))
    })
}

/// Overwrite every byte of the slice with zero.
fn memzero(slice: &mut [u8]) {
    slice.iter_mut().for_each(|byte| *byte = 0);
}

/// Extension trait that lets a byte buffer accept any `ToData` value.
trait BufExt {
    /// Append the binary encoding of `v` to the buffer.
    fn write<T: ToData>(&mut self, v: T);
}

impl BufExt for Vec<u8> {
    fn write<T: ToData>(&mut self, v: T) {
        // Delegate to the value: it knows its own encoding.
        ToData::write(&v, self);
    }
}

/// Types that can append their raw binary encoding to a byte buffer.
trait ToData {
    /// Append the encoding of `self` to `data`.
    fn write(&self, data: &mut Vec<u8>);
}

impl ToData for u8 {
    fn write(&self, data: &mut Vec<u8>) {
        data.push(*self);
    }
}

impl ToData for u16 {
    fn write(&self, data: &mut Vec<u8>) {
        // Big-endian byte order.
        data.extend_from_slice(&self.to_be_bytes());
    }
}

impl ToData for Offset16 {
    fn write(&self, data: &mut Vec<u8>) {
        // Encoded exactly like the wrapped integer.
        self.0.write(data);
    }
}

impl ToData for u32 {
    fn write(&self, data: &mut Vec<u8>) {
        // Big-endian byte order.
        data.extend_from_slice(&self.to_be_bytes());
    }
}

impl ToData for Offset32 {
    fn write(&self, data: &mut Vec<u8>) {
        // Encoded exactly like the wrapped integer.
        self.0.write(data);
    }
}

impl ToData for Tag {
    fn write(&self, data: &mut Vec<u8>) {
        self.as_u32().write(data);
    }
}

/// The magic number at the start of a font file or face.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
enum Magic {
    TrueType,
    OpenType,
    Collection,
}

impl FromData for Magic {
    const SIZE: usize = 4;

    fn parse(data: &[u8]) -> Option<Self> {
        let raw = u32::parse(data)?;
        let magic = match raw {
            // Version 1.0 or the ASCII bytes "true".
            0x00010000 | 0x74727565 => Magic::TrueType,
            // The ASCII bytes "OTTO".
            0x4F54544F => Magic::OpenType,
            // The ASCII bytes "ttcf".
            0x74746366 => Magic::Collection,
            _ => return None,
        };
        Some(magic)
    }
}

impl ToData for Magic {
    fn write(&self, data: &mut Vec<u8>) {
        // `TrueType` is always written as 0x00010000, even if it was parsed
        // from the alternative "true" spelling.
        let raw = match self {
            Magic::TrueType => 0x00010000_u32,
            Magic::OpenType => 0x4F54544F_u32,
            Magic::Collection => 0x74746366_u32,
        };
        raw.write(data);
    }
}

/// Locates a table in the font file.
#[derive(Debug, Copy, Clone, Eq, PartialEq)] struct TableRecord { tag: Tag, checksum: u32, offset: u32, length: u32, } impl FromData for TableRecord { const SIZE: usize = 16; fn parse(data: &[u8]) -> Option { let mut s = Stream::new(data); Some(TableRecord { tag: s.read::()?, checksum: s.read::()?, offset: s.read::()?, length: s.read::()?, }) } } impl ToData for TableRecord { fn write(&self, data: &mut Vec) { self.tag.write(data); self.checksum.write(data); self.offset.write(data); self.length.write(data); } } mod post { use super::*; /// Subset the post table by removing the name information. pub(super) fn subset(subsetter: &mut Subsetter) -> Option<()> { // Table version three is the one without names. let mut new = 0x00030000_u32.to_be_bytes().to_vec(); new.extend(subsetter.table_data(POST)?.get(4 .. 32)?); subsetter.push_table(POST, new); Some(()) } } mod glyf { use super::*; /// Subset the glyf and loca tables by clearing out glyph data for /// unused glyphs. pub(super) fn subset(subsetter: &mut Subsetter) -> Option<()> { let head = subsetter.table_data(HEAD)?; let short = Stream::read_at::(head, 50)? == 0; if short { subset_impl::(subsetter) } else { subset_impl::(subsetter) } } fn subset_impl(subsetter: &mut Subsetter) -> Option<()> where T: LocaOffset, { let loca = subsetter.table_data(LOCA)?; let glyf = subsetter.table_data(GLYF)?; let offsets = LazyArray32::::new(loca); let glyph_data = |id: u16| { let from = offsets.get(u32::from(id))?.loca_to_usize(); let to = offsets.get(u32::from(id) + 1)?.loca_to_usize(); glyf.get(from .. to) }; // The set of all glyphs we will include in the subset. let mut subset = HashSet::new(); // Because glyphs may depend on other glyphs as components (also with // multiple layers of nesting), we have to process all glyphs to find // their components. For notdef and all requested glyphs we simply use // an iterator, but to track other glyphs that need processing we create // a work stack. 
let mut iter = iter::once(0).chain(subsetter.glyphs.iter().copied()); let mut work = vec![]; // Find composite glyph descriptions. while let Some(id) = work.pop().or_else(|| iter.next()) { if subset.insert(id) { let mut s = Stream::new(glyph_data(id)?); if let Some(num_contours) = s.read::() { // Negative means this is a composite glyph. if num_contours < 0 { // Skip min/max metrics. s.read::(); s.read::(); s.read::(); s.read::(); // Read component glyphs. work.extend(component_glyphs(s)); } } } } let mut sub_loca = vec![]; let mut sub_glyf = vec![]; for id in 0 .. subsetter.num_glyphs { // If the glyph shouldn't be contained in the subset, it will // still get a loca entry, but the glyf data is simply empty. sub_loca.write(T::usize_to_loca(sub_glyf.len())?); if subset.contains(&id) { sub_glyf.extend(glyph_data(id)?); } } sub_loca.write(T::usize_to_loca(sub_glyf.len())?); subsetter.push_table(LOCA, sub_loca); subsetter.push_table(GLYF, sub_glyf); Some(()) } trait LocaOffset: Sized + FromData + ToData { fn loca_to_usize(self) -> usize; fn usize_to_loca(offset: usize) -> Option; } impl LocaOffset for Offset16 { fn loca_to_usize(self) -> usize { 2 * usize::from(self.0) } fn usize_to_loca(offset: usize) -> Option { if offset % 2 == 0 { (offset / 2).try_into().ok().map(Self) } else { None } } } impl LocaOffset for Offset32 { fn loca_to_usize(self) -> usize { self.0 as usize } fn usize_to_loca(offset: usize) -> Option { offset.try_into().ok().map(Self) } } /// Returns an iterator over the component glyphs referenced by the given /// `glyf` table composite glyph description. 
fn component_glyphs(mut s: Stream) -> impl Iterator + '_ { const ARG_1_AND_2_ARE_WORDS: u16 = 0x0001; const WE_HAVE_A_SCALE: u16 = 0x0008; const MORE_COMPONENTS: u16 = 0x0020; const WE_HAVE_AN_X_AND_Y_SCALE: u16 = 0x0040; const WE_HAVE_A_TWO_BY_TWO: u16 = 0x0080; let mut done = false; iter::from_fn(move || { if done { return None; } let flags = s.read::()?; let component = s.read::()?; if flags & ARG_1_AND_2_ARE_WORDS != 0 { s.skip::(); s.skip::(); } else { s.skip::(); } if flags & WE_HAVE_A_SCALE != 0 { s.skip::(); } else if flags & WE_HAVE_AN_X_AND_Y_SCALE != 0 { s.skip::(); s.skip::(); } else if flags & WE_HAVE_A_TWO_BY_TWO != 0 { s.skip::(); s.skip::(); s.skip::(); s.skip::(); } done = flags & MORE_COMPONENTS == 0; Some(component) }) } } mod cff { use super::*; /// Subset the CFF table by zeroing glyph data for unused glyphs. pub(super) fn subset_v1(subsetter: &mut Subsetter) -> Option<()> { let cff = subsetter.table_data(CFF1)?; let mut s = Stream::new(cff); let (major, _) = (s.read::()?, s.skip::()); if major != 1 { return None; } let header_size = s.read::()?; s = Stream::new_at(cff, usize::from(header_size))?; // Skip the name index. Index::parse_stream(&mut s); // Read the top dict. The index should contain only one item. let top_dict_index = Index::parse_stream(&mut s)?; let top_dict = Dict::parse(top_dict_index.get(0)?); let mut sub_cff = cff.to_vec(); // Because completely rebuilding the CFF structure would be pretty // complex, for now, we employ a peculiar strategy for CFF subsetting: // We simply replace unused data with zeros. This way, the font // structure and offsets can stay the same. And while the CFF table // itself doesn't shrink, the actual embedded font is compressed and // greatly benefits from the repeated zeros. zero_char_strings(subsetter, cff, &top_dict, &mut sub_cff); zero_subr_indices(subsetter, cff, &top_dict, &mut sub_cff); subsetter.push_table(CFF1, sub_cff); Some(()) } /// Zero unused char strings. 
fn zero_char_strings( subsetter: &Subsetter, cff: &[u8], top_dict: &Dict, sub_cff: &mut [u8], ) -> Option<()> { let char_strings_offset = top_dict.get_offset(Op::CHAR_STRINGS)?; let char_strings = Index::parse(cff.get(char_strings_offset ..)?)?; for (id, _, range) in char_strings.iter() { if !subsetter.glyphs.contains(&id) { let start = char_strings_offset + range.start; let end = char_strings_offset + range.end; memzero(sub_cff.get_mut(start .. end)?); } } Some(()) } /// Zero unused local subroutine indices. We don't currently remove /// individual subroutines because finding out which ones are used is /// complicated. fn zero_subr_indices( subsetter: &Subsetter, cff: &[u8], top_dict: &Dict, sub_cff: &mut [u8], ) -> Option<()> { // Parse FD Select data structure, which maps from glyph ids to find // dict indices. let fd_select_offset = top_dict.get_offset(Op::FD_SELECT)?; let fd_select = parse_fd_select(cff.get(fd_select_offset ..)?, subsetter.num_glyphs)?; // Clear local subrs from unused font dicts. let fd_array_offset = top_dict.get_offset(Op::FD_ARRAY)?; let fd_array = Index::parse(cff.get(fd_array_offset ..)?)?; // Determine which font dict's subrs to keep. let mut sub_fds = HashSet::new(); for &glyph in subsetter.glyphs { sub_fds.insert(fd_select.get(usize::from(glyph))?); } for (i, data, _) in fd_array.iter() { if !sub_fds.contains(&(i as u8)) { let font_dict = Dict::parse(data); if let Some(private_range) = font_dict.get_range(Op::PRIVATE) { let private_dict = Dict::parse(cff.get(private_range.clone())?); if let Some(subrs_offset) = private_dict.get_offset(Op::SUBRS) { let start = private_range.start + subrs_offset; let index = Index::parse(cff.get(start ..)?)?; let end = start + index.data.len(); memzero(sub_cff.get_mut(start .. end)?); } } } } Some(()) } /// Returns the font dict index for each glyph. 
fn parse_fd_select(data: &[u8], num_glyphs: u16) -> Option> { let mut s = Stream::new(data); let format = s.read::()?; Some(match format { 0 => Cow::Borrowed(s.read_bytes(usize::from(num_glyphs))?), 3 => { let count = usize::from(s.read::()?); let mut fds = vec![]; let mut start = s.read::()?; for _ in 0 .. count { let fd = s.read::()?; let end = s.read::()?; for _ in start .. end { fds.push(fd); } start = end; } Cow::Owned(fds) } _ => Cow::Borrowed(&[]), }) } struct Index<'a> { /// The data of the whole index (including its header). data: &'a [u8], /// The data ranges for the actual items. items: Vec>, } impl<'a> Index<'a> { fn parse(data: &'a [u8]) -> Option { let mut s = Stream::new(data); let count = usize::from(s.read::()?); let mut items = Vec::with_capacity(count); let mut len = 2; if count > 0 { let offsize = usize::from(s.read::()?); if !matches!(offsize, 1 ..= 4) { return None; } // Read an offset and transform it to be relative to the start // of the index. let data_offset = 3 + offsize * (count + 1); let mut read_offset = || { let mut bytes = [0u8; 4]; bytes[4 - offsize .. 4].copy_from_slice(s.read_bytes(offsize)?); Some(data_offset - 1 + u32::from_be_bytes(bytes) as usize) }; let mut last = read_offset()?; for _ in 0 .. count { let offset = read_offset()?; data.get(last .. offset)?; items.push(last .. offset); last = offset; } len = last; } Some(Self { data: data.get(.. 
len)?, items }) } fn parse_stream(s: &'a mut Stream) -> Option { let index = Index::parse(s.tail()?)?; s.advance(index.data.len()); Some(index) } fn get(&self, idx: usize) -> Option<&'a [u8]> { self.data.get(self.items.get(idx)?.clone()) } fn iter(&self) -> impl Iterator)> + '_ { self.items .iter() .enumerate() .map(move |(i, item)| (i as u16, &self.data[item.clone()], item.clone())) } } struct Dict<'a>(Vec>); impl<'a> Dict<'a> { fn parse(data: &'a [u8]) -> Self { let mut s = Stream::new(data); Self(iter::from_fn(|| Pair::parse(&mut s)).collect()) } fn get(&self, op: Op) -> Option<&[Operand<'a>]> { self.0 .iter() .find(|pair| pair.op == op) .map(|pair| pair.operands.as_slice()) } fn get_offset(&self, op: Op) -> Option { match self.get(op)? { &[Operand::Int(offset)] if offset > 0 => usize::try_from(offset).ok(), _ => None, } } fn get_range(&self, op: Op) -> Option> { match self.get(op)? { &[Operand::Int(len), Operand::Int(offset)] if offset > 0 => { let offset = usize::try_from(offset).ok()?; let len = usize::try_from(len).ok()?; Some(offset .. offset + len) } _ => None, } } } #[derive(Debug)] struct Pair<'a> { operands: Vec>, op: Op, } impl<'a> Pair<'a> { fn parse(s: &mut Stream<'a>) -> Option { let mut operands = vec![]; while s.clone().read::()? > 21 { operands.push(Operand::parse(s)?); } Some(Self { operands, op: Op::parse(s)? }) } } #[derive(Debug, Eq, PartialEq)] struct Op(u8, u8); impl Op { const CHAR_STRINGS: Self = Self(17, 0); const PRIVATE: Self = Self(18, 0); const SUBRS: Self = Self(19, 0); const FD_ARRAY: Self = Self(12, 36); const FD_SELECT: Self = Self(12, 37); fn parse(s: &mut Stream) -> Option { let b0 = s.read::()?; match b0 { 12 => Some(Self(b0, s.read::()?)), 0 ..= 21 => Some(Self(b0, 0)), _ => None, } } } #[derive(Debug)] enum Operand<'a> { Int(i32), Real(&'a [u8]), } impl<'a> Operand<'a> { fn parse(s: &mut Stream<'a>) -> Option { let b0 = i32::from(s.read::()?); Some(match b0 { 30 => { let mut len = 0; for &byte in s.tail()? 
{ len += 1; if byte & 0x0f == 0x0f { break; } } Self::Real(s.read_bytes(len)?) } 32 ..= 246 => Self::Int(b0 - 139), 247 ..= 250 => { let b1 = i32::from(s.read::()?); Self::Int((b0 - 247) * 256 + b1 + 108) } 251 ..= 254 => { let b1 = i32::from(s.read::()?); Self::Int(-(b0 - 251) * 256 - b1 - 108) } 28 => Self::Int(i32::from(s.read::()?)), 29 => Self::Int(s.read::()?), _ => return None, }) } } }