commit 5a600eb354c65ec008cbf020e45705c2f401d669 Author: Laurenz Date: Tue Feb 12 21:31:35 2019 +0100 Move crate into workspace subfolder diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 000000000..701c87970 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "typeset" +version = "0.1.0" +authors = ["Laurenz Mรคdje "] +edition = "2018" + +[dependencies] +unicode-segmentation = "1.2" +unicode-xid = "0.1.0" +byteorder = "1" diff --git a/src/doc.rs b/src/doc.rs new file mode 100644 index 000000000..04e214a3c --- /dev/null +++ b/src/doc.rs @@ -0,0 +1,187 @@ +//! Generation of abstract documents from syntax trees. + +use std::fmt; +use crate::parsing::{SyntaxTree, Node}; +use crate::font::{Font, BuiltinFont}; + + +/// Abstract representation of a complete typesetted document. +/// +/// This abstract thing can then be serialized into a specific format like PDF. +#[derive(Debug, Clone, PartialEq)] +pub struct Document { + /// The pages of the document. + pub pages: Vec, + /// The fonts used by the document. + pub fonts: Vec, +} + +impl Document { + /// Create a new document without content. + pub fn new() -> Document { + Document { + pages: vec![], + fonts: vec![], + } + } +} + +/// A page of a document. +#[derive(Debug, Clone, PartialEq)] +pub struct Page { + /// The width and height of the page. + pub size: [Size; 2], + /// The contents of the page. + pub contents: Vec, +} + +/// Plain text. +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct Text(pub String); + +/// A font (either built-in or external). +#[derive(Debug, Clone, PartialEq)] +pub enum DocumentFont { + /// One of the 14 built-in fonts. + Builtin(BuiltinFont), + /// An externally loaded font. + Loaded(Font), +} + +/// A distance that can be created from different units of length. +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct Size { + /// The size in typographic points (1/72 inches). + pub points: f32, +} + +impl Size { + /// Create a size from a number of points. 
/// A distance that can be created from different units of length.
#[derive(Debug, Copy, Clone, PartialEq)]
pub struct Size {
    /// The size in typographic points (1/72 inches).
    pub points: f32,
}

impl Size {
    /// Create a size from a number of points.
    pub fn from_points(points: f32) -> Size {
        Size { points }
    }

    /// Create a size from a number of inches.
    pub fn from_inches(inches: f32) -> Size {
        // BUG FIX: one inch is 72 points by definition of the DTP point,
        // so we multiply; the old code divided (1 in came out as ~0.014 pt).
        Size { points: 72.0 * inches }
    }

    /// Create a size from a number of millimeters.
    pub fn from_mm(mm: f32) -> Size {
        // 72 pt / 25.4 mm ≈ 2.8345 pt per millimeter.
        Size { points: 2.8345 * mm }
    }

    /// Create a size from a number of centimeters.
    pub fn from_cm(cm: f32) -> Size {
        // BUG FIX: 1 cm = 10 mm ≈ 28.345 pt; the old factor 0.028345
        // was off by a factor of 1000.
        Size { points: 28.345 * cm }
    }
}
+ fn generate(&mut self) -> GenResult { + let fonts = vec![DocumentFont::Builtin(BuiltinFont::Helvetica)]; + + let mut text = String::new(); + for node in &self.tree.nodes { + match node { + Node::Space if !text.is_empty() => text.push(' '), + Node::Space | Node::Newline => (), + Node::Word(word) => text.push_str(word), + + Node::ToggleItalics | Node::ToggleBold | Node::ToggleMath => unimplemented!(), + Node::Func(_) => unimplemented!(), + + } + } + + let page = Page { + size: [Size::from_mm(210.0), Size::from_mm(297.0)], + contents: vec![ Text(text) ], + }; + + Ok(Document { + pages: vec![page], + fonts, + }) + } + + /// Gives a generation error with a message. + #[inline] + fn err>(&self, message: S) -> GenResult { + Err(GenerationError { message: message.into() }) + } +} + + +#[cfg(test)] +mod generator_tests { + use super::*; + use crate::parsing::{Tokenize, Parse}; + + /// Test if the source gets generated into the document. + fn test(src: &str, doc: Document) { + assert_eq!(src.tokenize().parse().unwrap().generate(), Ok(doc)); + } + + /// Test if generation gives this error for the source code. + fn test_err(src: &str, err: GenerationError) { + assert_eq!(src.tokenize().parse().unwrap().generate(), Err(err)); + } + + #[test] + fn generator_simple() { + test("This is an example of a sentence.", Document { + pages: vec![ + Page { + size: [Size::from_mm(210.0), Size::from_mm(297.0)], + contents: vec![ + Text("This is an example of a sentence.".to_owned()), + ] + } + ], + fonts: vec![DocumentFont::Builtin(BuiltinFont::Helvetica)], + }); + } +} diff --git a/src/font.rs b/src/font.rs new file mode 100644 index 000000000..1280aec36 --- /dev/null +++ b/src/font.rs @@ -0,0 +1,270 @@ +//! Reading of metrics and font data from _OpenType_ and _TrueType_ font files. + +#![allow(unused_variables)] + +use std::fmt; +use std::io::{self, Read, Seek, SeekFrom}; +use byteorder::{BE, ReadBytesExt}; + + +/// A loaded opentype (or truetype) font. 
/// The 14 built-in fonts that every PDF viewer must provide.
#[derive(Debug, Copy, Clone, PartialEq)]
#[allow(missing_docs)]
pub enum BuiltinFont {
    Courier,
    CourierBold,
    CourierOblique,
    CourierBoldOblique,
    Helvetica,
    HelveticaBold,
    HelveticaOblique,
    HelveticaBoldOblique,
    TimesRoman,
    TimesBold,
    // NOTE(review): the variant names `TimeItalic`/`TimeBoldItalic` drop the
    // 's' of "Times"; they are kept as-is because they are public API, but
    // the PostScript name strings below are corrected.
    TimeItalic,
    TimeBoldItalic,
    Symbol,
    ZapfDingbats,
}

impl BuiltinFont {
    /// The PostScript name of the font, as a PDF consumer expects it.
    pub fn name(&self) -> &'static str {
        use BuiltinFont::*;
        match self {
            Courier => "Courier",
            CourierBold => "Courier-Bold",
            CourierOblique => "Courier-Oblique",
            CourierBoldOblique => "Courier-BoldOblique",
            Helvetica => "Helvetica",
            HelveticaBold => "Helvetica-Bold",
            HelveticaOblique => "Helvetica-Oblique",
            HelveticaBoldOblique => "Helvetica-BoldOblique",
            TimesRoman => "Times-Roman",
            TimesBold => "Times-Bold",
            // BUG FIX: the standard-14 names are "Times-Italic" and
            // "Times-BoldItalic"; "Time-…" is not a valid base font name
            // and viewers would fail to resolve it.
            TimeItalic => "Times-Italic",
            TimeBoldItalic => "Times-BoldItalic",
            Symbol => "Symbol",
            ZapfDingbats => "ZapfDingbats",
        }
    }
}
+struct OpenTypeReader<'r, R> where R: Read + Seek { + data: &'r mut R, + font: Font, + table_records: Vec, +} + +/// Used to identify a table, design-variation axis, script, +/// language system, feature, or baseline. +#[derive(Clone, PartialEq)] +struct Tag(pub [u8; 4]); + +impl PartialEq<&str> for Tag { + fn eq(&self, other: &&str) -> bool { + other.as_bytes() == &self.0 + } +} + +impl fmt::Debug for Tag { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "\"{}\"", self) + } +} + +impl fmt::Display for Tag { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let a = self.0; + write!(f, "{}{}{}{}", a[0] as char, a[1] as char, a[2] as char, a[3] as char) + } +} + +/// Stores information about one table. +#[derive(Debug, Clone, PartialEq)] +struct TableRecord { + table: Tag, + check_sum: u32, + offset: u32, + length: u32, +} + +impl<'r, R> OpenTypeReader<'r, R> where R: Read + Seek { + /// Create a new reader from a byte source. + pub fn new(data: &'r mut R) -> OpenTypeReader<'r, R> { + OpenTypeReader { + data, + font: Font { + name: String::new(), + }, + table_records: vec![], + } + } + + /// Read the font from the byte source. + pub fn read(mut self) -> FontResult { + self.read_table_records()?; + self.read_name_table()?; + + Ok(self.font) + } + + /// Read the offset table. + fn read_table_records(&mut self) -> FontResult<()> { + let sfnt_version = self.data.read_u32::()?; + let num_tables = self.data.read_u16::()?; + let search_range = self.data.read_u16::()?; + let entry_selector = self.data.read_u16::()?; + let range_shift = self.data.read_u16::()?; + + let outlines = match sfnt_version { + 0x00010000 => "truetype", + 0x4F54544F => "cff", + _ => return self.err("unsuported font outlines"), + }; + + for _ in 0 .. 
num_tables { + let table = self.read_tag()?; + let check_sum = self.data.read_u32::()?; + let offset = self.data.read_u32::()?; + let length = self.data.read_u32::()?; + + self.table_records.push(TableRecord { + table, + check_sum, + offset, + length, + }); + } + + Ok(()) + } + + /// Read the name table (gives general information about the font). + fn read_name_table(&mut self) -> FontResult<()> { + let table = match self.table_records.iter().find(|record| record.table == "name") { + Some(table) => table, + None => return self.err("missing 'name' table"), + }; + + self.data.seek(SeekFrom::Start(table.offset as u64))?; + + let format = self.data.read_u16::()?; + let count = self.data.read_u16::()?; + let string_offset = self.data.read_u16::()?; + + let storage = (table.offset + string_offset as u32) as u64; + + let mut name = None; + + for _ in 0 .. count { + let platform_id = self.data.read_u16::()?; + let encoding_id = self.data.read_u16::()?; + let language_id = self.data.read_u16::()?; + let name_id = self.data.read_u16::()?; + let length = self.data.read_u16::()?; + let offset = self.data.read_u16::()?; + + // Postscript name is what we are interested in + if name_id == 6 && platform_id == 3 && encoding_id == 1 { + if length % 2 != 0 { + return self.err("invalid encoded name"); + } + + self.data.seek(SeekFrom::Start(storage + offset as u64))?; + let mut buffer = Vec::with_capacity(length as usize / 2); + + for _ in 0 .. length / 2 { + buffer.push(self.data.read_u16::()?); + } + + name = match String::from_utf16(&buffer) { + Ok(string) => Some(string), + Err(_) => return self.err("invalid encoded name"), + }; + + break; + } + } + + self.font.name = match name { + Some(name) => name, + None => return self.err("missing postscript font name"), + }; + + Ok(()) + } + + /// Read a tag (array of four u8's). 
+ fn read_tag(&mut self) -> FontResult { + let mut tag = [0u8; 4]; + self.data.read(&mut tag)?; + Ok(Tag(tag)) + } + + /// Gives a font loading error with a message. + fn err>(&self, message: S) -> FontResult { + Err(LoadingError { message: message.into() }) + } +} + + +#[cfg(test)] +mod font_tests { + use super::*; + + /// Test if the loaded font is the same as the expected font. + fn test(path: &str, font: Font) { + let mut file = std::fs::File::open(path).unwrap(); + assert_eq!(Font::new(&mut file), Ok(font)); + } + + #[test] + fn opentype() { + test("../fonts/NotoSerif-Regular.ttf", Font { + name: "NotoSerif".to_owned(), + }); + test("../fonts/NotoSansMath-Regular.ttf", Font { + name: "NotoSansMath-Regular".to_owned(), + }); + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 000000000..2959925e8 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,11 @@ +//! Typeset is a library for compiling _plain-text_ strings written in the +//! corresponding typesetting language into a typesetted document in a +//! file format like _PDF_. + +#![allow(unused)] + +pub mod parsing; +pub mod doc; +pub mod font; +pub mod pdf; +pub mod utility; diff --git a/src/parsing.rs b/src/parsing.rs new file mode 100644 index 000000000..5efa69e5e --- /dev/null +++ b/src/parsing.rs @@ -0,0 +1,696 @@ +//! Parsing of source code into tokens and syntax trees. + +use std::fmt; +use std::iter::Peekable; +use std::mem::swap; +use unicode_segmentation::{UnicodeSegmentation, UWordBounds}; +use crate::utility::{Splinor, Spline, Splined, StrExt}; + + +/// A logical unit of the incoming text stream. +#[derive(Debug, Clone, Eq, PartialEq)] +pub enum Token<'s> { + /// One or more whitespace (non-newline) codepoints. + Space, + /// A line feed (either `\n` or `\r\n`). + Newline, + /// A left bracket: `[`. + LeftBracket, + /// A right bracket: `]`. + RightBracket, + /// A colon (`:`) indicating the beginning of function arguments. 
+ /// + /// If a colon occurs outside of the function header, it will be + /// tokenized as a `Word`. + Colon, + /// Same as with `Colon`. + Equals, + /// Two underscores, indicating text in _italics_. + DoubleUnderscore, + /// Two stars, indicating **bold** text. + DoubleStar, + /// A dollar sign, indicating mathematical content. + Dollar, + /// A hashtag starting a comment. + Hashtag, + /// Everything else just is a literal word. + Word(&'s str), +} + + +/// A type that is seperable into logical units (tokens). +pub trait Tokenize { + /// Tokenize self into logical units. + fn tokenize<'s>(&'s self) -> Tokens<'s>; +} + +impl Tokenize for str { + fn tokenize<'s>(&'s self) -> Tokens<'s> { + Tokens::new(self) + } +} + + +/// An iterator over the tokens of a text. +#[derive(Clone)] +pub struct Tokens<'s> { + source: &'s str, + words: Peekable>, + state: TokensState<'s>, + stack: Vec>, +} + +impl fmt::Debug for Tokens<'_> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Tokens") + .field("source", &self.source) + .field("words", &"Peekable") + .field("state", &self.state) + .field("stack", &self.stack) + .finish() + } +} + +/// The state the tokenizer is in. +#[derive(Debug, Clone)] +enum TokensState<'s> { + /// The base state if there is nothing special we are in. + Body, + /// Inside a function header. Here colons and equal signs get parsed + /// as distinct tokens rather than text. + Function, + /// We expect either the end of the function or the beginning of the body. + MaybeBody, + /// We are inside one unicode word that consists of multiple tokens, + /// because it contains double underscores. 
+ DoubleUnderscore(Spline<'s, Token<'s>>), +} + +impl PartialEq for TokensState<'_> { + fn eq(&self, other: &TokensState) -> bool { + use TokensState as TS; + + match (self, other) { + (TS::Body, TS::Body) => true, + (TS::Function, TS::Function) => true, + (TS::MaybeBody, TS::MaybeBody) => true, + // They are not necessarily different, but we don't care + _ => false, + } + } +} + +impl<'s> Iterator for Tokens<'s> { + type Item = Token<'s>; + + /// Advance the iterator, return the next token or nothing. + fn next(&mut self) -> Option> { + use TokensState as TS; + + // Return the remaining words and double underscores. + if let TS::DoubleUnderscore(ref mut splinor) = self.state { + loop { + if let Some(splined) = splinor.next() { + return Some(match splined { + Splined::Value(word) if word != "" => Token::Word(word), + Splined::Splinor(s) => s, + _ => continue, + }); + } else { + self.unswitch(); + break; + } + } + } + + // Skip whitespace, but if at least one whitespace word existed, + // remember that, because we return a space token. + let mut whitespace = false; + while let Some(word) = self.words.peek() { + if !word.is_whitespace() { + break; + } + whitespace = true; + self.advance(); + } + if whitespace { + return Some(Token::Space); + } + + // Function maybe has a body + if self.state == TS::MaybeBody { + match *self.words.peek()? { + "[" => { + self.state = TS::Body; + return Some(self.consumed(Token::LeftBracket)); + }, + _ => self.unswitch(), + } + } + + // Now all special cases are handled and we can finally look at the + // next words. 
+ let next = self.words.next()?; + let afterwards = self.words.peek(); + + Some(match next { + // Special characters + "[" => { + self.switch(TS::Function); + Token::LeftBracket + }, + "]" => { + if self.state == TS::Function { + self.state = TS::MaybeBody; + } + Token::RightBracket + }, + "$" => Token::Dollar, + "#" => Token::Hashtag, + + // Context sensitive operators + ":" if self.state == TS::Function => Token::Colon, + "=" if self.state == TS::Function => Token::Equals, + + // Double star/underscore + "*" if afterwards == Some(&"*") => { + self.consumed(Token::DoubleStar) + }, + "__" => Token::DoubleUnderscore, + + // Newlines + "\n" | "\r\n" => Token::Newline, + + // Escaping + r"\" => { + if let Some(next) = afterwards { + let escapable = match *next { + "[" | "]" | "$" | "#" | r"\" | ":" | "=" | "*" | "_" => true, + w if w.starts_with("__") => true, + _ => false, + }; + + if escapable { + let next = *next; + self.advance(); + return Some(Token::Word(next)); + } + } + + Token::Word(r"\") + }, + + // Double underscores hidden in words. + word if word.contains("__") => { + let spline = word.spline("__", Token::DoubleUnderscore); + self.switch(TS::DoubleUnderscore(spline)); + return self.next(); + }, + + // Now it seems like it's just a normal word. + word => Token::Word(word), + }) + } +} + +impl<'s> Tokens<'s> { + /// Create a new token stream from text. + #[inline] + pub fn new(source: &'s str) -> Tokens<'s> { + Tokens { + source, + words: source.split_word_bounds().peekable(), + state: TokensState::Body, + stack: vec![], + } + } + + /// Advance the iterator by one step. + #[inline] + fn advance(&mut self) { + self.words.next(); + } + + /// Switch to the given state. + #[inline] + fn switch(&mut self, mut state: TokensState<'s>) { + swap(&mut state, &mut self.state); + self.stack.push(state); + } + + /// Go back to the top-of-stack state. 
+ #[inline] + fn unswitch(&mut self) { + self.state = self.stack.pop().unwrap_or(TokensState::Body); + } + + /// Advance and return the given token. + #[inline] + fn consumed(&mut self, token: Token<'s>) -> Token<'s> { + self.advance(); + token + } +} + + +/// A tree representation of the source. +#[derive(Debug, Clone, PartialEq)] +pub struct SyntaxTree<'s> { + /// The children. + pub nodes: Vec>, +} + +impl<'s> SyntaxTree<'s> { + /// Create an empty syntax tree. + pub fn new() -> SyntaxTree<'s> { + SyntaxTree { nodes: vec![] } + } +} + +/// A node in the abstract syntax tree. +#[derive(Debug, Clone, PartialEq)] +pub enum Node<'s> { + /// Whitespace between other nodes. + Space, + /// A line feed. + Newline, + /// Indicates that italics were enabled/disabled. + ToggleItalics, + /// Indicates that boldface was enabled/disabled. + ToggleBold, + /// Indicates that math mode was enabled/disabled. + ToggleMath, + /// A literal word. + Word(&'s str), + /// A function invocation. + Func(Function<'s>), +} + +/// A node representing a function invocation. +#[derive(Debug, Clone, PartialEq)] +pub struct Function<'s> { + /// The name of the function. + pub name: &'s str, + /// Some syntax tree if the function had a body (second set of brackets), + /// otherwise nothing. + pub body: Option>, +} + + +/// A type that is parseable into a syntax tree. +pub trait Parse<'s> { + /// Parse self into a syntax tree. + fn parse(self) -> ParseResult>; +} + +impl<'s> Parse<'s> for Tokens<'s> { + fn parse(self) -> ParseResult> { + Parser::new(self).parse() + } +} + +impl<'s> Parse<'s> for Vec> { + fn parse(self) -> ParseResult> { + Parser::new(self.into_iter()).parse() + } +} + +/// Result type used for parsing. +type ParseResult = std::result::Result; + +/// A failure when parsing. +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct ParseError { + /// A message describing the error. 
+ pub message: String, +} + +impl fmt::Display for ParseError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "parse error: {}", self.message) + } +} + + +/// Parses a token stream into an abstract syntax tree. +#[derive(Debug, Clone)] +struct Parser<'s, T> where T: Iterator> { + tokens: Peekable, + state: ParserState, + stack: Vec>, + tree: SyntaxTree<'s>, +} + +/// The state the parser is in. +#[derive(Debug, Clone, PartialEq)] +enum ParserState { + /// The base state of the parser. + Body, + /// Inside a function header. + Function, +} + +impl<'s, T> Parser<'s, T> where T: Iterator> { + /// Create a new parser from a type that emits results of tokens. + fn new(tokens: T) -> Parser<'s, T> { + Parser { + tokens: tokens.peekable(), + state: ParserState::Body, + stack: vec![], + tree: SyntaxTree::new(), + } + } + + /// Parse into an abstract syntax tree. + fn parse(mut self) -> ParseResult> { + use ParserState as PS; + + while let Some(token) = self.tokens.next() { + // Comment + if token == Token::Hashtag { + self.skip_while(|t| *t != Token::Newline); + self.advance(); + } + + match self.state { + PS::Body => match token { + // Whitespace + Token::Space => self.append(Node::Space), + Token::Newline => self.append(Node::Newline), + + // Words + Token::Word(word) => self.append(Node::Word(word)), + + // Functions + Token::LeftBracket => self.switch(PS::Function), + Token::RightBracket => { + match self.stack.pop() { + Some(func) => self.append(Node::Func(func)), + None => return self.err("unexpected closing bracket"), + } + }, + + // Modifiers + Token::DoubleUnderscore => self.append(Node::ToggleItalics), + Token::DoubleStar => self.append(Node::ToggleBold), + Token::Dollar => self.append(Node::ToggleMath), + + // Should not happen + Token::Colon | Token::Equals | Token::Hashtag => unreachable!(), + }, + + PS::Function => { + let name = match token { + Token::Word(word) if word.is_identifier() => word, + _ => return self.err("expected 
identifier"), + }; + + if self.tokens.next() != Some(Token::RightBracket) { + return self.err("expected closing bracket"); + } + + let mut func = Function { + name, + body: None, + }; + + // This function has a body. + if let Some(Token::LeftBracket) = self.tokens.peek() { + self.advance(); + func.body = Some(SyntaxTree::new()); + self.stack.push(func); + } else { + self.append(Node::Func(func)); + } + + self.switch(PS::Body); + }, + } + } + + if !self.stack.is_empty() { + return self.err("expected closing bracket"); + } + + Ok(self.tree) + } + + /// Advance the iterator by one step. + #[inline] + fn advance(&mut self) { + self.tokens.next(); + } + + /// Skip tokens until the condition is met. + #[inline] + fn skip_while(&mut self, f: F) where F: Fn(&Token) -> bool { + while let Some(token) = self.tokens.peek() { + if !f(token) { + break; + } + self.advance(); + } + } + + /// Switch the state. + #[inline] + fn switch(&mut self, state: ParserState) { + self.state = state; + } + + /// Append a node to the top-of-stack function or the main tree itself. + #[inline] + fn append(&mut self, node: Node<'s>) { + let tree = match self.stack.last_mut() { + Some(func) => func.body.get_or_insert_with(|| SyntaxTree::new()), + None => &mut self.tree, + }; + + tree.nodes.push(node); + } + + /// Gives a parsing error with a message. + #[inline] + fn err>(&self, message: S) -> ParseResult { + Err(ParseError { message: message.into() }) + } +} + + +#[cfg(test)] +mod token_tests { + use super::*; + use Token::{Space as S, Newline as N, LeftBracket as L, RightBracket as R, + Colon as C, Equals as E, DoubleUnderscore as DU, DoubleStar as DS, + Dollar as D, Hashtag as H, Word as W}; + + /// Test if the source code tokenizes to the tokens. + fn test(src: &str, tokens: Vec) { + assert_eq!(src.tokenize().collect::>(), tokens); + } + + /// Tokenizes the basic building blocks. 
+ #[test] + fn tokenize_base() { + test("", vec![]); + test("Hallo", vec![W("Hallo")]); + test("[", vec![L]); + test("]", vec![R]); + test("$", vec![D]); + test("#", vec![H]); + test("**", vec![DS]); + test("__", vec![DU]); + test("\n", vec![N]); + } + + /// Tests if escaping with backslash works as it should. + #[test] + fn tokenize_escape() { + test(r"\[", vec![W("[")]); + test(r"\]", vec![W("]")]); + test(r"\#", vec![W("#")]); + test(r"\$", vec![W("$")]); + test(r"\:", vec![W(":")]); + test(r"\=", vec![W("=")]); + test(r"\**", vec![W("*"), W("*")]); + test(r"\*", vec![W("*")]); + test(r"\__", vec![W("__")]); + test(r"\_", vec![W("_")]); + test(r"\hello", vec![W(r"\"), W("hello")]); + } + + /// Tokenizes some more realistic examples. + #[test] + fn tokenize_examples() { + test(r" + [function][ + Test [italic][example]! + ] + ", vec![ + N, S, L, W("function"), R, L, N, S, W("Test"), S, L, W("italic"), R, L, + W("example"), R, W("!"), N, S, R, N, S + ]); + + test(r" + [page: size=A4] + [font: size=12pt] + + Das ist ein Beispielsatz mit **fetter** Schrift. + ", vec![ + N, S, L, W("page"), C, S, W("size"), E, W("A4"), R, N, S, + L, W("font"), C, S, W("size"), E, W("12pt"), R, N, N, S, + W("Das"), S, W("ist"), S, W("ein"), S, W("Beispielsatz"), S, W("mit"), S, + DS, W("fetter"), DS, S, W("Schrift"), W("."), N, S + ]); + } + + /// This test checks whether the colon and equals symbols get parsed correctly + /// depending on the context: Either in a function header or in a body. 
+ #[test] + fn tokenize_symbols_context() { + test("[func: key=value][Answer: 7]", + vec![L, W("func"), C, S, W("key"), E, W("value"), R, L, + W("Answer"), W(":"), S, W("7"), R]); + test("[[n: k=v]:x][:[=]]:=", + vec![L, L, W("n"), C, S, W("k"), E, W("v"), R, C, W("x"), R, + L, W(":"), L, E, R, R, W(":"), W("=")]); + test("[func: __key__=value]", + vec![L, W("func"), C, S, DU, W("key"), DU, E, W("value"), R]); + } + + /// This test has a special look at the double underscore syntax, because + /// per Unicode standard they are not seperate words and thus harder to parse + /// than the stars. + #[test] + fn tokenize_double_underscore() { + test("he__llo__world_ _ __ Now this_ is__ special!", + vec![W("he"), DU, W("llo"), DU, W("world_"), S, W("_"), S, DU, S, W("Now"), S, + W("this_"), S, W("is"), DU, S, W("special"), W("!")]); + } + + /// This test is for checking if non-ASCII characters get parsed correctly. + #[test] + fn tokenize_unicode() { + test("[document][Hello ๐ŸŒ!]", + vec![L, W("document"), R, L, W("Hello"), S, W("๐ŸŒ"), W("!"), R]); + test("[f]โบ.", vec![L, W("f"), R, W("โบ"), W(".")]); + } + + /// This test looks if LF- and CRLF-style newlines get both identified correctly. + #[test] + fn tokenize_whitespace_newlines() { + test(" \t", vec![S]); + test("First line\r\nSecond line\nThird line\n", + vec![W("First"), S, W("line"), N, W("Second"), S, W("line"), N, + W("Third"), S, W("line"), N]); + } +} + + +#[cfg(test)] +mod parse_tests { + use super::*; + use Node::{Space as S, Newline as N, Word as W, Func as F}; + + /// Test if the source code parses into the syntax tree. + fn test(src: &str, tree: SyntaxTree) { + assert_eq!(src.tokenize().parse(), Ok(tree)); + } + + /// Test if the source parses into the error. + fn test_err(src: &str, err: ParseError) { + assert_eq!(src.tokenize().parse(), Err(err)); + } + + /// Short cut macro to create a syntax tree. + /// Is `vec`-like and the elements are the nodes. + macro_rules! 
tree { + ($($x:expr),*) => ( + SyntaxTree { nodes: vec![$($x),*] } + ); + ($($x:expr,)*) => (tree![$($x),*]) + } + + /// Parse the basic cases. + #[test] + fn parse_base() { + test("", tree! {}); + test("Hello World!", tree! { W("Hello"), S, W("World"), W("!")}); + } + + /// Parse things dealing with functions. + #[test] + fn parse_functions() { + test("[test]", tree! { F(Function { name: "test", body: None }) }); + test("This is an [modifier][example] of a function invocation.", tree! { + W("This"), S, W("is"), S, W("an"), S, + F(Function { name: "modifier", body: Some(tree! { W("example") }) }), S, + W("of"), S, W("a"), S, W("function"), S, W("invocation"), W(".") + }); + test("[func][Hello][links][Here][end]", tree! { + F(Function { + name: "func", + body: Some(tree! { W("Hello") }), + }), + F(Function { + name: "links", + body: Some(tree! { W("Here") }), + }), + F(Function { + name: "end", + body: None, + }), + }); + test("[bodyempty][]", tree! { + F(Function { + name: "bodyempty", + body: Some(tree! {}) + }) + }); + test("[nested][[func][call]] outside", tree! { + F(Function { + name: "nested", + body: Some(tree! { F(Function { + name: "func", + body: Some(tree! { W("call") }), + }), }), + }), + S, W("outside") + }); + } + + /// Tests if the parser handles non-ASCII stuff correctly. + #[test] + fn parse_unicode() { + test("[lib_parse] โบ.", tree! { + F(Function { + name: "lib_parse", + body: None + }), + S, W("โบ"), W(".") + }); + test("[func123][Hello ๐ŸŒ!]", tree! { + F(Function { + name: "func123", + body: Some(tree! { W("Hello"), S, W("๐ŸŒ"), W("!") }), + }) + }); + } + + /// Tests whether errors get reported correctly. 
/// A type that is a sink for types that can be converted to strings
/// and thus can be written string-like into a byte sink.
pub trait WriteByteString {
    /// Write the string-like type into self, returning how many
    /// bytes were written.
    fn write_str<S: ToString>(&mut self, string_like: S) -> io::Result<usize>;
}

impl<W: Write> WriteByteString for W {
    fn write_str<S: ToString>(&mut self, string_like: S) -> io::Result<usize> {
        // BUG FIX: `Write::write` may perform a short write and return a
        // partial count, which would silently corrupt the byte offsets the
        // PDF writer tracks for the xref table. `write_all` guarantees the
        // whole string is written, so the full length is the right count.
        let string = string_like.to_string();
        self.write_all(string.as_bytes())?;
        Ok(string.len())
    }
}
+#[derive(Debug, Clone)] +struct PdfWriter<'d> { + doc: &'d Document, + w: usize, + catalog_id: u32, + page_tree_id: u32, + resources_start: u32, + pages_start: u32, + content_start: u32, + xref_table: Vec, + offset_xref: u32, +} + +impl<'d> PdfWriter<'d> { + /// Create a new pdf writer from a document. + fn new(doc: &'d Document) -> PdfWriter<'d> { + // Calculate unique ids for each object + let catalog_id: u32 = 1; + let page_tree_id = catalog_id + 1; + let pages_start = page_tree_id + 1; + let resources_start = pages_start + doc.pages.len() as u32; + let content_start = resources_start + doc.fonts.len() as u32; + + PdfWriter { + doc, + catalog_id, + page_tree_id, + resources_start, + pages_start, + content_start, + w: 0, + xref_table: vec![], + offset_xref: 0, + } + } + + /// Write the document into a byte sink. + fn write(&mut self, target: &mut W) -> io::Result { + self.write_header(target)?; + + self.write_document_catalog(target)?; + self.write_page_tree(target)?; + self.write_pages(target)?; + + self.write_resources(target)?; + + self.write_content(target)?; + // self.write_fonts(target)?; + + self.write_xref_table(target)?; + self.write_trailer(target)?; + self.write_start_xref(target)?; + + Ok(self.w) + } + + /// Write the pdf header. + fn write_header(&mut self, target: &mut W) -> io::Result { + // Write the magic start + self.w += target.write(b"%PDF-1.7\n")?; + Ok(self.w) + } + + /// Write the document catalog (contains general info about the document). 
+ fn write_document_catalog(&mut self, target: &mut W) -> io::Result { + self.xref_table.push(self.w as u32); + + self.w += target.write_str(self.catalog_id)?; + self.w += target.write(b" 0 obj\n")?; + self.w += target.write(b"<<\n")?; + self.w += target.write(b"/Type /Catalog\n")?; + + self.w += target.write(b"/Pages ")?; + self.w += target.write_str(self.page_tree_id)?; + self.w += target.write(b" 0 R\n")?; + + self.w += target.write(b">>\n")?; + self.w += target.write(b"endobj\n")?; + + Ok(self.w) + } + + /// Write the page tree (overview over the pages of a document). + fn write_page_tree(&mut self, target: &mut W) -> io::Result { + self.xref_table.push(self.w as u32); + + // Create page tree + self.w += target.write_str(self.page_tree_id)?; + self.w += target.write(b" 0 obj\n")?; + self.w += target.write(b"<<\n")?; + self.w += target.write(b"/Type /Pages\n")?; + + self.w += target.write(b"/Count ")?; + self.w += target.write_str(self.doc.pages.len())?; + self.w += target.write(b"\n")?; + + self.w += target.write(b"/Kids [")?; + + for id in self.pages_start .. self.pages_start + self.doc.pages.len() as u32 { + self.w += target.write_str(id)?; + self.w += target.write(b" 0 R ")?; + } + + self.w += target.write(b"]\n")?; + + self.w += target.write(b"/Resources\n")?; + self.w += target.write(b"<<\n")?; + + self.w += target.write(b"/Font\n")?; + self.w += target.write(b"<<\n")?; + + let mut font_id = self.resources_start; + for nr in 1 ..= self.doc.fonts.len() as u32 { + self.w += target.write(b"/F")?; + self.w += target.write_str(nr)?; + self.w += target.write(b" ")?; + self.w += target.write_str(font_id)?; + self.w += target.write(b" 0 R\n")?; + font_id += 1; + } + + self.w += target.write(b">>\n")?; + self.w += target.write(b">>\n")?; + + self.w += target.write(b">>\n")?; + self.w += target.write(b"endobj\n")?; + + Ok(self.w) + } + + /// Write the page descriptions. 
+ fn write_pages(&mut self, target: &mut W) -> io::Result { + let mut page_id = self.pages_start; + let mut content_id = self.content_start; + + for page in &self.doc.pages { + self.xref_table.push(self.w as u32); + + self.w += target.write_str(page_id)?; + self.w += target.write(b" 0 obj\n")?; + self.w += target.write(b"<<\n")?; + self.w += target.write(b"/Type /Page\n")?; + + self.w += target.write(b"/Parent ")?; + self.w += target.write_str(self.page_tree_id)?; + self.w += target.write(b" 0 R\n")?; + + self.w += target.write(b"/MediaBox [0 0 ")?; + self.w += target.write_pdf(&page.size[0])?; + self.w += target.write(b" ")?; + self.w += target.write_pdf(&page.size[1])?; + self.w += target.write(b"]\n")?; + + self.w += target.write(b"/Contents [")?; + + for _ in &page.contents { + self.w += target.write_str(content_id)?; + self.w += target.write(b" 0 R ")?; + + content_id += 1; + } + + self.w += target.write(b"]\n")?; + + self.w += target.write(b">>\n")?; + self.w += target.write(b"endobj\n")?; + + page_id += 1; + } + + Ok(self.w) + } + + /// Write the resources used by the file (fonts and friends). 
+ fn write_resources(&mut self, target: &mut W) -> io::Result { + let mut id = self.resources_start; + + for font in &self.doc.fonts { + self.xref_table.push(self.w as u32); + + self.w += target.write_str(id)?; + self.w += target.write(b" 0 obj\n")?; + self.w += target.write(b"<<\n")?; + self.w += target.write(b"/Type /Font\n")?; + + match font { + DocumentFont::Builtin(builtin) => { + self.w += target.write(b"/Subtype /Type1\n")?; + self.w += target.write(b"/BaseFont /")?; + self.w += target.write_str(builtin.name())?; + self.w += target.write(b"\n")?; + }, + DocumentFont::Loaded(font) => { + self.w += target.write(b"/Subtype /TrueType\n")?; + self.w += target.write(b"/BaseFont /")?; + self.w += target.write_str(font.name.as_str())?; + self.w += target.write(b"\n")?; + unimplemented!(); + }, + } + + self.w += target.write(b">>\n")?; + self.w += target.write(b"endobj\n")?; + + id += 1; + } + + Ok(self.w) + } + + /// Write the page contents. + fn write_content(&mut self, target: &mut W) -> io::Result { + let mut id = self.content_start; + + for page in &self.doc.pages { + for content in &page.contents { + self.xref_table.push(self.w as u32); + + self.w += target.write_str(id)?; + self.w += target.write(b" 0 obj\n")?; + self.w += target.write(b"<<\n")?; + + let mut buffer = Vec::new(); + buffer.write(b"BT/\n")?; + + buffer.write(b"/F1 13 Tf\n")?; + buffer.write(b"108 734 Td\n")?; + buffer.write(b"(")?; + + let Text(string) = content; + buffer.write(string.as_bytes())?; + + buffer.write(b") Tj\n")?; + buffer.write(b"ET\n")?; + + self.w += target.write(b"/Length ")?; + self.w += target.write_str(buffer.len())?; + self.w += target.write(b"\n")?; + + self.w += target.write(b">>\n")?; + + self.w += target.write(b"stream\n")?; + self.w += target.write(&buffer)?; + self.w += target.write(b"endstream\n")?; + + self.w += target.write(b"endobj\n")?; + + id += 1; + } + } + + Ok(self.w) + } + + /// Write the cross-reference table. 
+ fn write_xref_table(&mut self, target: &mut W) -> io::Result { + self.offset_xref = self.w as u32; + + self.w += target.write(b"xref\n")?; + self.w += target.write(b"0 ")?; + self.w += target.write_str(self.xref_table.len())?; + self.w += target.write(b"\n")?; + + self.w += target.write(b"0000000000 65535 f\r\n")?; + + for offset in &self.xref_table { + self.w += target.write(format!("{:010}", offset).as_bytes())?; + self.w += target.write(b" 00000 n")?; + self.w += target.write(b"\r\n")?; + } + + Ok(self.w) + } + + /// Write the trailer (points to the root object). + fn write_trailer(&mut self, target: &mut W) -> io::Result { + self.w += target.write(b"trailer\n")?; + self.w += target.write(b"<<\n")?; + + self.w += target.write(b"/Root ")?; + self.w += target.write_str(self.catalog_id)?; + self.w += target.write(b" 0 R\n")?; + + self.w += target.write(b"/Size ")?; + self.w += target.write_str(self.xref_table.len() + 1)?; + self.w += target.write(b"\n")?; + + self.w += target.write(b">>\n")?; + + Ok(self.w) + } + + /// Write where the cross-reference table starts. + fn write_start_xref(&mut self, target: &mut W) -> io::Result { + self.w += target.write(b"startxref\n")?; + self.w += target.write_str(self.offset_xref)?; + self.w += target.write(b"\n")?; + + Ok(self.w) + } +} + + +#[cfg(test)] +mod pdf_tests { + use super::*; + use crate::parsing::{Tokenize, Parse}; + use crate::doc::Generate; + + /// Create a pdf with a name from the source code. + fn test(name: &str, src: &str) { + let mut file = std::fs::File::create(name).unwrap(); + let doc = src.tokenize() + .parse().unwrap() + .generate().unwrap(); + file.write_pdf(&doc).unwrap(); + } + + #[test] + fn pdf_simple() { + test("../target/write1.pdf", "This is an example of a sentence."); + test("../target/write2.pdf"," + Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed + diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam + voluptua. 
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd + gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor + sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut + labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et + justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est + Lorem ipsum dolor sit amet. + "); + } +} diff --git a/src/utility.rs b/src/utility.rs new file mode 100644 index 000000000..8304025d0 --- /dev/null +++ b/src/utility.rs @@ -0,0 +1,138 @@ +//! Utility functionality. + +use std::str::Split; +use std::iter::Peekable; +use unicode_xid::UnicodeXID; + + +/// Types that can be splined. +pub trait Splinor { + /// Returns an iterator over the substrings splitted by the pattern, + /// intertwined with the splinor. + /// + /// # Example + /// + /// ``` + /// # use typeset::utility::*; + /// #[derive(Debug, Copy, Clone, PartialEq)] + /// struct Space; + /// + /// let v: Vec> = "My airplane flies!".spline(" ", Space).collect(); + /// assert_eq!(v, [ + /// Splined::Value("My"), + /// Splined::Splinor(Space), + /// Splined::Value("airplane"), + /// Splined::Splinor(Space), + /// Splined::Value("flies!"), + /// ]); + /// ``` + fn spline<'s, T: Clone>(&'s self, pat: &'s str, splinor: T) -> Spline<'s, T>; +} + +impl Splinor for str { + fn spline<'s, T: Clone>(&'s self, pat: &'s str, splinor: T) -> Spline<'s, T> { + Spline { + splinor: Splined::Splinor(splinor), + split: self.split(pat).peekable(), + next_splinor: false, + } + } +} + +/// Iterator over splitted values and splinors. +/// +/// Created by the [`spline`](Splinor::spline) function. +#[derive(Debug, Clone)] +pub struct Spline<'s, T> { + splinor: Splined<'s, T>, + split: Peekable>, + next_splinor: bool, +} + +/// Represents either a splitted substring or a splinor. 
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub enum Splined<'s, T> { + /// A substring. + Value(&'s str), + /// An intertwined splinor. + Splinor(T), +} + +impl<'s, T: Clone> Iterator for Spline<'s, T> { + type Item = Splined<'s, T>; + + fn next(&mut self) -> Option> { + if self.next_splinor && self.split.peek().is_some() { + self.next_splinor = false; + return Some(self.splinor.clone()); + } else { + self.next_splinor = true; + return Some(Splined::Value(self.split.next()?)) + } + } +} + + +/// More useful functions on `str`'s. +pub trait StrExt { + /// Whether self consists only of whitespace. + fn is_whitespace(&self) -> bool; + + /// Whether this word is a valid unicode identifier. + fn is_identifier(&self) -> bool; +} + +impl StrExt for str { + #[inline] + fn is_whitespace(&self) -> bool { + self.chars().all(|c| c.is_whitespace() && c != '\n') + } + + fn is_identifier(&self) -> bool { + let mut chars = self.chars(); + + match chars.next() { + Some(c) if !UnicodeXID::is_xid_start(c) => return false, + None => return false, + _ => (), + } + + while let Some(c) = chars.next() { + if !UnicodeXID::is_xid_continue(c) { + return false; + } + } + + true + } +} + + +#[cfg(test)] +mod splinor_tests { + use super::*; + use Splined::{Value as V, Splinor as S}; + + #[derive(Debug, Copy, Clone, PartialEq)] + enum Token { DoubleUnderscore } + + fn test(string: &str, pat: &str, splinor: T, vec: Vec>) + where T: std::fmt::Debug + Clone + PartialEq { + assert_eq!(string.spline(pat, splinor).collect::>(), vec); + } + + #[test] + fn splinor() { + let s = S(Token::DoubleUnderscore); + test("__he__llo__world__", "__", Token::DoubleUnderscore, + vec![V(""), s, V("he"), s, V("llo"), s, V("world"), s, V("")]); + test("__Italic__", "__", Token::DoubleUnderscore, + vec![V(""), s, V("Italic"), s, V("")]); + test("Key__Value", "__", Token::DoubleUnderscore, + vec![V("Key"), s, V("Value")]); + test("__Start__NoEnd", "__", Token::DoubleUnderscore, + vec![V(""), s, 
V("Start"), s, V("NoEnd")]); + test("NoStart__End__", "__", Token::DoubleUnderscore, + vec![V("NoStart"), s, V("End"), s, V("")]); + } +}