diff --git a/Cargo.toml b/Cargo.toml index 1a4442fd6..d7568c68a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,3 +9,4 @@ pdf = { path = "../pdf" } opentype = { path = "../opentype" } unicode-segmentation = "1.2" unicode-xid = "0.1.0" +byteorder = "1" diff --git a/src/font.rs b/src/font.rs new file mode 100644 index 000000000..ddbcf1bf7 --- /dev/null +++ b/src/font.rs @@ -0,0 +1,411 @@ +//! Font utility and subsetting. + +use std::fmt; +use std::io::{self, Cursor}; +use std::collections::HashMap; +use byteorder::{BE, ReadBytesExt, WriteBytesExt}; +use opentype::{OpenTypeReader, Outlines, TableRecord, Tag}; +use opentype::tables::{Header, CharMap, MaximumProfile, HorizontalMetrics}; + +/// A font wrapper which allows subsetting a font. +pub struct Font { + program: Vec, +} + +impl Font { + /// Create a new font from a font program. + pub fn new(program: Vec) -> Font { + Font { program } + } + + /// Generate a subsetted version of this font including only the chars listed in + /// `chars`. + /// + /// The resulting pair contains the new font data and the new glyph mapping. + /// + /// All needed tables will be included (returning an error if a table was not present + /// in the source font) and optional tables will be included if they were present + /// in the source font. 
+ pub fn subsetted( + &self, + chars: C, + needed_tables: I1, + optional_tables: I2 + ) -> Result<(Vec, HashMap), SubsettingError> + where + C: IntoIterator, + I1: IntoIterator, S1: AsRef, + I2: IntoIterator, S2: AsRef + { + let mut chars: Vec = chars.into_iter().collect(); + chars.sort(); + + let mut cursor = Cursor::new(&self.program); + let mut reader = OpenTypeReader::new(&mut cursor); + let outlines = reader.outlines()?; + let mut tables = reader.tables()?.to_vec(); + tables.sort_by_key(|r| r.tag); + + Subsetter { + program: &self.program, + reader, + outlines, + tables, + cmap: None, + hmtx: None, + loca: None, + chars, + records: Vec::new(), + body: Vec::new(), + }.subset(needed_tables, optional_tables) + } +} + +struct Subsetter<'p> { + // Original font + program: &'p [u8], + reader: OpenTypeReader<'p, Cursor<&'p Vec>>, + outlines: Outlines, + tables: Vec, + cmap: Option, + hmtx: Option, + loca: Option>, + + // Subsetted font + chars: Vec, + records: Vec, + body: Vec, +} + +impl<'p> Subsetter<'p> { + fn subset(mut self, needed_tables: I1, optional_tables: I2) + -> SubsetResult<(Vec, HashMap)> + where + I1: IntoIterator, S1: AsRef, + I2: IntoIterator, S2: AsRef + { + // Iterate through the needed tables first + for table in needed_tables.into_iter() { + let table = table.as_ref(); + let tag: Tag = table.parse() + .map_err(|_| SubsettingError::UnsupportedTable(table.to_string()))?; + + if self.contains(tag) { + self.write_table(tag)?; + } else { + return Err(SubsettingError::MissingTable(tag.to_string())); + } + } + + // Now iterate through the optional tables + for table in optional_tables.into_iter() { + let table = table.as_ref(); + let tag: Tag = table.parse() + .map_err(|_| SubsettingError::UnsupportedTable(table.to_string()))?; + + if self.contains(tag) { + self.write_table(tag)?; + } + } + + self.write_header()?; + + let mapping = self.chars.into_iter().enumerate().map(|(i, c)| (c, i as u16)) + .collect::>(); + + Ok((self.body, mapping)) + } + + fn 
write_header(&mut self) -> SubsetResult<()> { + // Create an output buffer + let header_len = 12 + self.records.len() * 16; + let mut header = Vec::with_capacity(header_len); + + let num_tables = self.records.len() as u16; + + // The highest power of two not exceeding the table count. + let mut max_power = 1u16; + while max_power * 2 <= num_tables { + max_power *= 2; + } + max_power = std::cmp::min(max_power, num_tables); + + let search_range = max_power * 16; + let entry_selector = (max_power as f32).log2() as u16; + let range_shift = num_tables * 16 - search_range; + + // Write the base header + header.write_u32::(match self.outlines { + Outlines::TrueType => 0x00010000, + Outlines::CFF => 0x4f54544f, + })?; + header.write_u16::(num_tables)?; + header.write_u16::(search_range)?; + header.write_u16::(entry_selector)?; + header.write_u16::(range_shift)?; + + // Write the table records + for record in &self.records { + header.extend(record.tag.value()); + header.write_u32::(record.check_sum)?; + header.write_u32::(header_len as u32 + record.offset)?; + header.write_u32::(record.length)?; + } + + header.append(&mut self.body); + self.body = header; + + Ok(()) + } + + fn write_table(&mut self, tag: Tag) -> SubsetResult<()> { + match tag.value() { + b"head" | b"cvt " | b"prep" | b"fpgm" | b"name" | b"post" | b"OS/2" => { + self.copy_table(tag) + }, + b"hhea" => { + let table = self.get_table_data(tag)?; + let glyph_count = self.chars.len() as u16; + self.write_table_body(tag, |this| { + this.body.extend(&table[..table.len() - 2]); + Ok(this.body.write_u16::(glyph_count)?) 
+ }) + }, + b"maxp" => { + let table = self.get_table_data(tag)?; + let glyph_count = self.chars.len() as u16; + self.write_table_body(tag, |this| { + this.body.extend(&table[..4]); + this.body.write_u16::(glyph_count)?; + Ok(this.body.extend(&table[6..])) + }) + }, + b"hmtx" => { + self.write_table_body(tag, |this| { + this.read_cmap()?; + this.read_hmtx()?; + let cmap = this.cmap.as_ref().unwrap(); + let metrics = this.hmtx.as_ref().unwrap(); + + for &c in &this.chars { + let glyph_id = take(cmap.get(c), c)?; + let metrics = take(metrics.get(glyph_id), c)?; + + this.body.write_i16::(metrics.advance_width)?; + this.body.write_i16::(metrics.left_side_bearing)?; + } + Ok(()) + }) + }, + b"loca" => { + self.write_table_body(tag, |this| { + this.read_cmap()?; + this.read_loca()?; + let cmap = this.cmap.as_ref().unwrap(); + let loca = this.loca.as_ref().unwrap(); + + let mut offset = 0; + for &c in &this.chars { + this.body.write_u32::(offset)?; + let glyph = take(cmap.get(c), c)? as usize; + let len = take(loca.get(glyph + 1), c)? - take(loca.get(glyph), c)?; + offset += len; + } + this.body.write_u32::(offset)?; + Ok(()) + }) + }, + b"glyf" => { + self.write_table_body(tag, |this| { + let table = this.get_table_data(tag)?; + this.read_cmap()?; + this.read_loca()?; + let cmap = this.cmap.as_ref().unwrap(); + let loca = this.loca.as_ref().unwrap(); + + for &c in &this.chars { + let glyph = take(cmap.get(c), c)? as usize; + let start = *take(loca.get(glyph), c)? as usize; + let end = *take(loca.get(glyph + 1), c)? 
as usize; + let shapes = table.get(start..end).ok_or(SubsettingError::InvalidFont)?; + this.body.extend(shapes); + } + Ok(()) + }) + }, + b"cmap" => { + // Always uses format 12 for simplicity + self.write_table_body(tag, |this| { + // Find out which chars are in consecutive groups + let mut groups = Vec::new(); + let len = this.chars.len(); + let mut i = 0; + while i < len { + let start = i; + while i + 1 < len && this.chars[i+1] as u32 == this.chars[i] as u32 + 1 { + i += 1; + } + groups.push((this.chars[start], this.chars[i], start)); + i += 1; + } + + // Table header + this.body.write_u16::(0)?; + this.body.write_u16::(1)?; + this.body.write_u16::(3)?; + this.body.write_u16::(1)?; + this.body.write_u32::(12)?; + + // Subtable header + this.body.write_u16::(12)?; + this.body.write_u16::(0)?; + this.body.write_u32::((16 + 12 * groups.len()) as u32)?; + this.body.write_u32::(0)?; + this.body.write_u32::(groups.len() as u32)?; + + // Subtable body + for group in &groups { + this.body.write_u32::(group.0 as u32)?; + this.body.write_u32::(group.1 as u32)?; + this.body.write_u32::(group.2 as u32)?; + } + + Ok(()) + }) + }, + + _ => Err(SubsettingError::UnsupportedTable(tag.to_string())), + } + } + + fn copy_table(&mut self, tag: Tag) -> SubsetResult<()> { + self.write_table_body(tag, |this| { + let table = this.get_table_data(tag)?; + Ok(this.body.extend(table)) + }) + } + + fn write_table_body(&mut self, tag: Tag, writer: F) -> SubsetResult<()> + where F: FnOnce(&mut Self) -> SubsetResult<()> { + let start = self.body.len(); + writer(self)?; + let end = self.body.len(); + while (self.body.len() - start) % 4 != 0 { + self.body.push(0); + } + + Ok(self.records.push(TableRecord { + tag, + check_sum: calculate_check_sum(&self.body[start..]), + offset: start as u32, + length: (end - start) as u32, + })) + } + + fn get_table_data(&self, tag: Tag) -> SubsetResult<&'p [u8]> { + let record = match self.tables.binary_search_by_key(&tag, |r| r.tag) { + Ok(index) => 
&self.tables[index], + Err(_) => return Err(SubsettingError::MissingTable(tag.to_string())), + }; + + self.program.get(record.offset as usize .. (record.offset + record.length) as usize) + .ok_or(SubsettingError::InvalidFont) + } + + fn contains(&self, tag: Tag) -> bool { + self.tables.binary_search_by_key(&tag, |r| r.tag).is_ok() + } + + fn read_cmap(&mut self) -> SubsetResult<()> { + Ok(if self.cmap.is_none() { + self.cmap = Some(self.reader.read_table::()?); + }) + } + + fn read_hmtx(&mut self) -> SubsetResult<()> { + Ok(if self.hmtx.is_none() { + self.hmtx = Some(self.reader.read_table::()?); + }) + } + + fn read_loca(&mut self) -> SubsetResult<()> { + Ok(if self.loca.is_none() { + let mut table = self.get_table_data("loca".parse().unwrap())?; + let format = self.reader.read_table::
()?.index_to_loc_format; + let count = self.reader.read_table::()?.num_glyphs + 1; + + let loca = if format == 0 { + (0..count).map(|_| table.read_u16::() + .map(|x| (x as u32) * 2)) + .collect::>>() + } else { + (0..count).map(|_| table.read_u32::()) + .collect::>>() + }?; + + self.loca = Some(loca); + }) + } +} + + +/// Calculate a checksum over the sliced data as a wrapping sum of big-endian u32 values. +/// The data length has to be a multiple of four. +fn calculate_check_sum(data: &[u8]) -> u32 { + let mut sum = 0u32; + data.chunks_exact(4).for_each(|c| { + sum = sum.wrapping_add( + ((c[0] as u32) << 24) + + ((c[1] as u32) << 16) + + ((c[2] as u32) << 8) + + (c[3] as u32) + ); + }); + sum +} + +/// Returns an error about a missing character or the wrapped data. +fn take(opt: Option, c: char) -> SubsetResult { + opt.ok_or(SubsettingError::MissingCharacter(c)) +} + + +type SubsetResult = Result; + +/// A failure when subsetting a font. +#[derive(Debug)] +pub enum SubsettingError { + MissingTable(String), + UnsupportedTable(String), + MissingCharacter(char), + InvalidFont, + FontError(opentype::Error), + IoError(io::Error), +} + +impl From for SubsettingError { + fn from(err: io::Error) -> SubsettingError { + SubsettingError::IoError(err) + } +} + +impl From for SubsettingError { + fn from(err: opentype::Error) -> SubsettingError { + SubsettingError::FontError(err) + } +} + +impl fmt::Display for SubsettingError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use SubsettingError::*; + write!(f, "subsetting error: ")?; + match self { + MissingTable(table) => write!(f, "missing table: {}", table), + UnsupportedTable(table) => write!(f, "unsupported table: {}", table), + MissingCharacter(c) => write!(f, "missing character: {}", c), + InvalidFont => write!(f, "invalid font"), + FontError(err) => write!(f, "font error: {}", err), + IoError(err) => write!(f, "io error: {}", err), + } + } +} diff --git a/src/lib.rs b/src/lib.rs index a22061a6c..09740fb11 100644 --- a/src/lib.rs +++ 
b/src/lib.rs @@ -23,6 +23,7 @@ mod pdf; mod utility; +pub mod font; pub mod parsing; pub mod doc; diff --git a/src/pdf.rs b/src/pdf.rs index 85c2fa163..68de5d3e2 100644 --- a/src/pdf.rs +++ b/src/pdf.rs @@ -2,7 +2,7 @@ use std::fmt; use std::io::{self, Write, Cursor}; -use crate::doc::Document; +use std::collections::{HashMap, HashSet}; use pdf::{PdfWriter, Id, Rect, Version, Trailer}; use pdf::doc::{Catalog, PageTree, Page, Resource, Content}; use pdf::text::Text; @@ -11,6 +11,8 @@ use pdf::font::{ WidthRecord, FontDescriptor, FontFlags, EmbeddedFont, GlyphUnit }; use opentype::{OpenTypeReader, tables::{self, NameEntry, MacStyleFlags}}; +use crate::doc::Document; +use crate::font::Font; /// A type that is a sink for documents that can be written in the _PDF_ format. @@ -47,6 +49,12 @@ impl From for PdfWritingError { } } +impl From for PdfWritingError { + fn from(err: crate::font::SubsettingError) -> PdfWritingError { + PdfWritingError { message: format!("{}", err) } + } +} + impl fmt::Display for PdfWritingError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "pdf writing error: {}", self.message) @@ -60,7 +68,7 @@ struct PdfCreator<'a, W: Write> { writer: PdfWriter<'a, W>, doc: &'a Document, offsets: Offsets, - font_data: FontData, + font: PdfFont, } /// Offsets for the various groups of ids. @@ -72,17 +80,6 @@ struct Offsets { fonts: (Id, Id), } -/// The data we need from the font. -struct FontData { - data: Vec, - name: tables::Name, - head: tables::Header, - post: tables::Post, - os2: tables::OS2, - hmtx: tables::HorizontalMetrics, - cmap: tables::CharMap, -} - impl<'a, W: Write> PdfCreator<'a, W> { /// Create a new _PDF_ Creator. pub fn new(target: &'a mut W, doc: &'a Document) -> PdfResult> { @@ -94,9 +91,17 @@ impl<'a, W: Write> PdfCreator<'a, W> { let contents = (pages.1 + 1, pages.1 + content_count); let fonts = (contents.1 + 1, contents.1 + 4); - // Read the font from a file. + // Find out which chars are used in this document. 
+ let mut chars = HashSet::new(); + for page in &doc.pages { + for content in &page.contents { + chars.extend(content.0.chars()); + } + } + + // Create a subsetted pdf font. let data = std::fs::read(format!("../fonts/{}.ttf", doc.font))?; - let font_data = FontData::load(data)?; + let font = PdfFont::new(&doc.font, data, chars)?; Ok(PdfCreator { writer: PdfWriter::new(target), @@ -108,7 +113,7 @@ impl<'a, W: Write> PdfCreator<'a, W> { contents, fonts, }, - font_data, + font, }) } @@ -184,17 +189,9 @@ impl<'a, W: Write> PdfCreator<'a, W> { /// Write the fonts. fn write_fonts(&mut self) -> PdfResult<()> { let id = self.offsets.fonts.0; - let font_data = &self.font_data; - - // Create conversion function from font units to PDF units. - let ratio = 1000.0 / (font_data.head.units_per_em as f32); - let convert = |x| (ratio * x as f32).round() as GlyphUnit; - - let font_name = font_data.name.get_decoded(NameEntry::PostScriptName); - let base_font = font_name.as_ref().unwrap_or(&self.doc.font); self.writer.write_obj(id, &Type0Font::new( - base_font.clone(), + self.font.name.clone(), CMapEncoding::Predefined("Identity-H".to_owned()), id + 1 )).unwrap(); @@ -202,80 +199,124 @@ impl<'a, W: Write> PdfCreator<'a, W> { self.writer.write_obj(id + 1, CIDFont::new( CIDFontType::Type2, - base_font.clone(), + self.font.name.clone(), CIDSystemInfo::new("(Adobe)", "(Identity)", 0), id + 2, - ).widths(vec![ - WidthRecord::start(0, font_data.hmtx.metrics.iter().map(|m| convert(m.advance_width)) - )]) + ).widths(vec![WidthRecord::start(0, self.font.widths.clone())]) ).unwrap(); - let mut flags = FontFlags::empty(); - flags.set(FontFlags::FIXED_PITCH, font_data.post.is_fixed_pitch); - flags.set(FontFlags::SERIF, base_font.contains("Serif")); - flags.insert(FontFlags::SYMBOLIC); - flags.set(FontFlags::ITALIC, font_data.head.mac_style.contains(MacStyleFlags::ITALIC)); - flags.insert(FontFlags::SMALL_CAP); - self.writer.write_obj(id + 2, FontDescriptor::new( - base_font.clone(), - flags, - 
font_data.post.italic_angle.to_f32(), + self.font.name.clone(), + self.font.flags, + self.font.italic_angle, ) - .font_bbox(Rect::new( - convert(font_data.head.x_min), - convert(font_data.head.y_min), - convert(font_data.head.x_max), - convert(font_data.head.y_max) - )) - .ascent(convert(font_data.os2.s_typo_ascender)) - .descent(convert(font_data.os2.s_typo_descender)) - .cap_height(convert(font_data.os2.s_cap_height - .unwrap_or(font_data.os2.s_typo_ascender))) - .stem_v((10.0 + 220.0 * (font_data.os2.us_weight_class as f32 - - 50.0) / 900.0) as GlyphUnit) + .font_bbox(self.font.bounding_box) + .ascent(self.font.ascender) + .descent(self.font.descender) + .cap_height(self.font.cap_height) + .stem_v(self.font.stem_v) .font_file_3(id + 3) ).unwrap(); - self.writer.write_obj(id + 3, &EmbeddedFont::OpenType(&font_data.data)).unwrap(); + + self.writer.write_obj(id + 3, &EmbeddedFont::OpenType(&self.font.data)).unwrap(); Ok(()) } /// Encode the given text for our font. fn encode(&self, text: &str) -> Vec { - let default = self.font_data.os2.us_default_char.unwrap_or(0); let mut bytes = Vec::with_capacity(2 * text.len()); - text.chars().map(|c| { - self.font_data.cmap.get(c).unwrap_or(default) - }) - .for_each(|glyph| { + for glyph in text.chars().map(|c| self.font.map(c)) { bytes.push((glyph >> 8) as u8); bytes.push((glyph & 0xff) as u8); - }); + } bytes } } -impl FontData { - /// Load various needed tables from the font data. - pub fn load(data: Vec) -> PdfResult { - let mut readable = Cursor::new(data); + +/// The data we need from the font. +struct PdfFont { + data: Vec, + mapping: HashMap, + default_glyph: u16, + name: String, + widths: Vec, + flags: FontFlags, + italic_angle: f32, + bounding_box: Rect, + ascender: GlyphUnit, + descender: GlyphUnit, + cap_height: GlyphUnit, + stem_v: GlyphUnit, +} + +impl PdfFont { + /// Create a subsetted version of the font and calculate some information + /// needed for creating the _PDF_. 
+ pub fn new(font_name: &str, data: Vec, chars: HashSet) -> PdfResult { + let mut readable = Cursor::new(&data); let mut reader = OpenTypeReader::new(&mut readable); - let name = reader.read_table::()?; let head = reader.read_table::()?; + let name = reader.read_table::()?; let post = reader.read_table::()?; let os2 = reader.read_table::()?; - let hmtx = reader.read_table::()?; - let cmap = reader.read_table::()?; - Ok(FontData { - data: readable.into_inner(), - name, head, post, os2, hmtx, cmap, + let font = Font::new(data); + let (subsetted, mapping) = font.subsetted( + chars, + &["head", "hhea", "maxp", "hmtx", "loca", "glyf"], + &["cvt ", "prep", "fpgm", "OS/2", "cmap", "name", "post"], + )?; + + let unit_ratio = 1000.0 / (head.units_per_em as f32); + let convert = |x| (unit_ratio * x as f32).round() as GlyphUnit; + + let base_font = name.get_decoded(NameEntry::PostScriptName); + let font_name = base_font.unwrap_or_else(|| font_name.to_owned()); + + + let mut flags = FontFlags::empty(); + flags.set(FontFlags::FIXED_PITCH, post.is_fixed_pitch); + flags.set(FontFlags::SERIF, font_name.contains("Serif")); + flags.insert(FontFlags::SYMBOLIC); + flags.set(FontFlags::ITALIC, head.mac_style.contains(MacStyleFlags::ITALIC)); + flags.insert(FontFlags::SMALL_CAP); + + let mut readable = Cursor::new(&subsetted); + let mut reader = OpenTypeReader::new(&mut readable); + let hmtx = reader.read_table::()?; + let widths = hmtx.metrics.iter().map(|m| convert(m.advance_width)).collect(); + + + Ok(PdfFont { + data: subsetted, + mapping, + default_glyph: os2.us_default_char.unwrap_or(0), + name: font_name, + widths, + flags, + italic_angle: post.italic_angle.to_f32(), + bounding_box: Rect::new( + convert(head.x_min), + convert(head.y_min), + convert(head.x_max), + convert(head.y_max) + ), + ascender: convert(os2.s_typo_ascender), + descender: convert(os2.s_typo_descender), + cap_height: convert(os2.s_cap_height.unwrap_or(os2.s_typo_ascender)), + stem_v: (10.0 + 220.0 * 
(os2.us_weight_class as f32 - 50.0) / 900.0) as GlyphUnit, }) } + + /// Map a character to its glyph index. + fn map(&self, c: char) -> u16 { + self.mapping.get(&c).map(|&g| g).unwrap_or(self.default_glyph) + } } @@ -304,4 +345,17 @@ mod pdf_tests { Stet clita kasd gubergren, no sea takimata sanctus est. "); } + + // #[test] + // fn pdf_fix_1() { + // use unicode_normalization::UnicodeNormalization; + + // let text = "Hello World! from Typeset‼"; + // let chars = text.nfd().collect::>(); + + // // Create a subsetted pdf font. + // let data = std::fs::read("../fonts/NotoSans-Regular.ttf").unwrap(); + // let font = PdfFont::new("NotoSans-Regular", data, chars).unwrap(); + // std::fs::write("../target/NotoTest.ttf", font.data).unwrap(); + // } }