From b1e956419d94a0c3876891b3d6a4976cc4a3ab09 Mon Sep 17 00:00:00 2001 From: Laurenz Date: Sat, 11 Jan 2020 10:11:14 +0100 Subject: [PATCH] =?UTF-8?q?Re-engineer=20tokenization=20=F0=9F=9A=BF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/func/mod.rs | 2 +- src/library/mod.rs | 7 +- src/size.rs | 2 +- src/style.rs | 13 +- src/syntax/mod.rs | 60 +-- src/syntax/parsing.rs | 835 +--------------------------------------- src/syntax/span.rs | 2 - src/syntax/tokens.rs | 765 ++++++++++++++---------------------- tests/parse.rs | 21 +- tests/parsing/base.rs | 78 ---- tests/parsing/tokens.rs | 62 +++ 11 files changed, 398 insertions(+), 1449 deletions(-) delete mode 100644 tests/parsing/base.rs create mode 100644 tests/parsing/tokens.rs diff --git a/src/func/mod.rs b/src/func/mod.rs index 69f28e007..01c77327e 100644 --- a/src/func/mod.rs +++ b/src/func/mod.rs @@ -15,7 +15,7 @@ pub mod prelude { pub use crate::func::{Scope, ParseFunc, LayoutFunc, Command, Commands}; pub use crate::layout::prelude::*; pub use crate::syntax::{ - parse, ParseContext, ParseResult, + ParseContext, ParseResult, SyntaxTree, FuncCall, FuncArgs, PosArg, KeyArg, Expression, Ident, ExpressionKind, Spanned, Span diff --git a/src/library/mod.rs b/src/library/mod.rs index 013e99627..92c3c9488 100644 --- a/src/library/mod.rs +++ b/src/library/mod.rs @@ -297,9 +297,10 @@ function! { parse!(forbidden: body); if let Some(name) = args.get_pos_opt::()? { - let flip = args.get_key_opt::("flip")? - .unwrap_or(false); - PageSizeFunc::Paper(Paper::from_name(name.as_str())?, flip) + let flip = args.get_key_opt::("flip")?.unwrap_or(false); + let paper = Paper::from_name(name.as_str()) + .ok_or_else(|| error!(@"invalid paper name: `{}`", name))?; + PageSizeFunc::Paper(paper, flip) } else { PageSizeFunc::Custom(ExtentMap::new(&mut args, true)?) 
} diff --git a/src/size.rs b/src/size.rs index 5b84c2ad8..a5bc5d7ff 100644 --- a/src/size.rs +++ b/src/size.rs @@ -72,7 +72,7 @@ impl Size { impl Display for Size { fn fmt(&self, f: &mut Formatter) -> fmt::Result { - write!(f, "{}cm", self.to_cm()) + write!(f, "{}pt", self.points) } } diff --git a/src/style.rs b/src/style.rs index e552a63d6..35de5da13 100644 --- a/src/style.rs +++ b/src/style.rs @@ -3,7 +3,6 @@ use toddle::query::{FontFallbackTree, FontVariant, FontStyle, FontWeight}; use crate::size::{Size, Size2D, SizeBox, ValueBox, PSize}; -use crate::syntax::ParseResult; /// Defines properties of pages and text. @@ -157,7 +156,7 @@ pub struct Paper { impl Paper { /// The paper with the given name. - pub fn from_name(name: &str) -> ParseResult { + pub fn from_name(name: &str) -> Option { parse_paper(name) } } @@ -193,11 +192,11 @@ macro_rules! papers { class: $class, };)* - fn parse_paper(paper: &str) -> ParseResult { - Ok(match paper.to_lowercase().as_str() { - $($($patterns)* => $var,)* - _ => error!("unknown paper size: `{}`", paper), - }) + fn parse_paper(paper: &str) -> Option { + match paper.to_lowercase().as_str() { + $($($patterns)* => Some($var),)* + _ => None, + } } }; } diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs index b0cbcafae..10a509d2f 100644 --- a/src/syntax/mod.rs +++ b/src/syntax/mod.rs @@ -11,48 +11,6 @@ pub_use_mod!(parsing); pub_use_mod!(span); -/// A logical unit of the incoming text stream. -#[derive(Debug, Copy, Clone, Eq, PartialEq)] -pub enum Token<'s> { - /// One or more whitespace (non-newline) codepoints. - Space, - /// A line feed (`\n`, `\r\n` and some more as defined by the Unicode standard). - Newline, - /// A left bracket: `[`. - LeftBracket, - /// A right bracket: `]`. - RightBracket, - /// A colon (`:`) indicating the beginning of function arguments (Function - /// header only). 
- /// - /// If a colon occurs outside of a function header, it will be tokenized as - /// [Text](Token::Text), just like the other tokens annotated with - /// _Header only_. - Colon, - /// An equals (`=`) sign assigning a function argument a value (Header only). - Equals, - /// A comma (`,`) separating two function arguments (Header only). - Comma, - /// Quoted text as a string value (Header only). - Quoted(&'s str), - /// An underscore, indicating text in italics (Body only). - Underscore, - /// A star, indicating bold text (Body only). - Star, - /// A backtick, indicating monospace text (Body only). - Backtick, - /// A line comment. - LineComment(&'s str), - /// A block comment. - BlockComment(&'s str), - /// A star followed by a slash unexpectedly ending a block comment - /// (the comment was not started before, otherwise a - /// [BlockComment](Token::BlockComment) would be returned). - StarSlash, - /// Any consecutive string which does not contain markup. - Text(&'s str), -} - /// A tree representation of source code. #[derive(Debug, PartialEq)] pub struct SyntaxTree { @@ -256,11 +214,11 @@ debug_display!(Expression); pub struct Ident(pub String); impl Ident { - pub fn new(string: String) -> ParseResult { - if is_identifier(&string) { - Ok(Ident(string)) + pub fn new(ident: S) -> Option where S: AsRef + Into { + if is_identifier(ident.as_ref()) { + Some(Ident(ident.into())) } else { - error!("invalid identifier: `{}`", string); + None } } @@ -277,20 +235,20 @@ impl Display for Ident { debug_display!(Ident); -/// Whether this word is a valid unicode identifier. +/// Whether this word is a valid identifier. fn is_identifier(string: &str) -> bool { let mut chars = string.chars(); match chars.next() { - Some('-') => (), - Some(c) if UnicodeXID::is_xid_start(c) => (), + Some('-') => {} + Some(c) if UnicodeXID::is_xid_start(c) => {} _ => return false, } while let Some(c) = chars.next() { match c { - '.' | '-' => (), - c if UnicodeXID::is_xid_continue(c) => (), + '.' 
| '-' => {} + c if UnicodeXID::is_xid_continue(c) => {} _ => return false, } } diff --git a/src/syntax/parsing.rs b/src/syntax/parsing.rs index dc39145ac..4a50ef963 100644 --- a/src/syntax/parsing.rs +++ b/src/syntax/parsing.rs @@ -1,7 +1,4 @@ -//! Parsing of token streams into syntax trees. - use crate::func::Scope; -use crate::size::Size; use super::*; @@ -10,7 +7,7 @@ pub type ParseResult = crate::TypesetResult; /// Parses source code into a syntax tree given a context. pub fn parse(src: &str, ctx: ParseContext) -> ParseResult { - Parser::new(src, ctx).parse() + unimplemented!() } /// The context for parsing. @@ -19,833 +16,3 @@ pub struct ParseContext<'a> { /// The scope containing function definitions. pub scope: &'a Scope, } - -/// Transforms token streams into syntax trees. -#[derive(Debug)] -struct Parser<'s> { - src: &'s str, - tokens: PeekableTokens<'s>, - ctx: ParseContext<'s>, - tree: SyntaxTree, - color_tokens: Vec>, -} - -#[derive(Debug, Copy, Clone, Eq, PartialEq)] -enum NewlineState { - /// No newline yet. - Zero, - /// We saw one newline with the given span already and are - /// looking for another. - One(Span), - /// We saw at least two newlines and wrote one, thus not - /// writing another one for more newlines. - TwoOrMore, -} - -impl<'s> Parser<'s> { - /// Create a new parser from the source code and the context. - fn new(src: &'s str, ctx: ParseContext<'s>) -> Parser<'s> { - Parser { - src, - tokens: PeekableTokens::new(tokenize(src)), - ctx, - tree: SyntaxTree::new(), - color_tokens: vec![], - } - } - - /// Parse the source into a syntax tree. - fn parse(mut self) -> ParseResult { - while self.tokens.peek().is_some() { - self.parse_white()?; - self.parse_body_part()?; - } - - Ok(self.tree) - } - - /// Parse the next part of the body. - fn parse_body_part(&mut self) -> ParseResult<()> { - use Token::*; - - if let Some(token) = self.tokens.peek() { - match token.v { - // Functions. 
- LeftBracket => self.parse_func()?, - RightBracket => error!("unexpected closing bracket"), - - // Modifiers. - Underscore => self.add_consumed(Node::ToggleItalics, token.span), - Star => self.add_consumed(Node::ToggleBolder, token.span), - Backtick => self.add_consumed(Node::ToggleMonospace, token.span), - - // Normal text. - Text(word) => self.add_consumed(Node::Text(word.to_owned()), token.span), - - // The rest is handled elsewhere or should not happen, because - // the tokenizer does not yield these in a body. - Space | Newline | LineComment(_) | BlockComment(_) | - Colon | Equals | Comma | Quoted(_) | StarSlash - => panic!("parse_body_part: unexpected token: {:?}", token), - } - } - - Ok(()) - } - - /// Parse a complete function from the current position. - fn parse_func(&mut self) -> ParseResult<()> { - // This should only be called if a left bracket was seen. - let token = self.tokens.next().expect("parse_func: expected token"); - assert!(token.v == Token::LeftBracket); - - self.add_color_token(ColorToken::Bracket, token.span); - - let mut span = token.span; - let name = self.parse_func_name()?; - - // Check for arguments - let args = match self.tokens.next() { - Some(Spanned { v: Token::RightBracket, span }) => { - self.add_color_token(ColorToken::Bracket, span); - FuncArgs::new() - }, - Some(Spanned { v: Token::Colon, span }) => { - self.add_color_token(ColorToken::Colon, span); - self.parse_func_args()? - } - _ => error!("expected arguments or closing bracket"), - }; - - span.end = self.tokens.get_position(); - let (func, body_span) = self.parse_func_call(name, args)?; - - if let Some(body_span) = body_span { - span.expand(body_span); - } - - // Finally this function is parsed to the end. - self.add(Node::Func(func), span); - - Ok(()) - } - - /// Parse a function header. 
- fn parse_func_name(&mut self) -> ParseResult> { - self.skip_white(); - - let name = match self.tokens.next() { - Some(Spanned { v: Token::Text(word), span }) => { - let ident = Ident::new(word.to_string())?; - Spanned::new(ident, span) - } - _ => error!("expected identifier"), - }; - - self.add_color_token(ColorToken::FuncName, name.span); - self.skip_white(); - - Ok(name) - } - - /// Parse the arguments to a function. - fn parse_func_args(&mut self) -> ParseResult { - let mut args = FuncArgs::new(); - - loop { - self.skip_white(); - - match self.parse_func_arg()? { - Some(DynArg::Pos(arg)) => args.add_pos(arg), - Some(DynArg::Key(arg)) => args.add_key(arg), - None => {}, - } - - match self.tokens.next() { - Some(Spanned { v: Token::Comma, span }) => { - self.add_color_token(ColorToken::Comma, span); - } - Some(Spanned { v: Token::RightBracket, span }) => { - self.add_color_token(ColorToken::Bracket, span); - break; - } - _ => error!("expected comma or closing bracket"), - } - } - - Ok(args) - } - - /// Parse one argument to a function. 
- fn parse_func_arg(&mut self) -> ParseResult> { - let token = match self.tokens.peek() { - Some(token) => token, - None => return Ok(None), - }; - - Ok(match token.v { - Token::Text(name) => { - self.advance(); - self.skip_white(); - - Some(match self.tokens.peek() { - Some(Spanned { v: Token::Equals, span }) => { - self.advance(); - self.skip_white(); - - let name = Ident::new(name.to_string())?; - let key = Spanned::new(name, token.span); - - self.add_color_token(ColorToken::KeyArg, key.span); - self.add_color_token(ColorToken::Equals, span); - - let next = self.tokens.next() - .ok_or_else(|| error!(@"expected expression"))?; - - let value = Self::parse_expression(next)?; - - self.add_expr_token(&value); - - let span = Span::merge(key.span, value.span); - let arg = KeyArg { key, value }; - - DynArg::Key(Spanned::new(arg, span)) - } - - _ => { - let expr = Self::parse_expression(token)?; - self.add_expr_token(&expr); - DynArg::Pos(expr) - } - }) - } - - Token::Quoted(_) => { - self.advance(); - self.skip_white(); - - self.add_color_token(ColorToken::ExprStr, token.span); - - Some(DynArg::Pos(Self::parse_expression(token)?)) - } - - _ => None, - }) - } - - /// Parse a function call. - fn parse_func_call(&mut self, name: Spanned, args: FuncArgs) - -> ParseResult<(FuncCall, Option)> { - // Now we want to parse this function dynamically. - let parser = self - .ctx - .scope - .get_parser(&name.v.0) - .ok_or_else(|| error!(@"unknown function: `{}`", &name.v))?; - - let has_body = self.tokens.peek().map(Spanned::value) == Some(Token::LeftBracket); - - // Do the parsing dependent on whether the function has a body. - Ok(if has_body { - self.advance(); - - // Find out the string which makes the body of this function. 
- let start_index = self.tokens.string_index(); - let mut start_pos = self.tokens.get_position(); - start_pos.column -= 1; - - let (mut end_index, mut end_pos) = - find_closing_bracket(&self.src[start_index..]) - .ok_or_else(|| error!(@"expected closing bracket"))?; - - end_index += start_index; - end_pos.column += 1; - - let span = Span::new(start_pos, end_pos); - - // Parse the body. - let body_string = &self.src[start_index..end_index]; - let body = parser(args, Some(body_string), self.ctx)?; - - // Skip to the end of the function in the token stream. - self.tokens.set_string_index(end_index); - - // Now the body should be closed. - let token = self.tokens.next().expect("parse_func_body: expected token"); - assert!(token.v == Token::RightBracket); - - (FuncCall(body), Some(span)) - } else { - (FuncCall(parser(args, None, self.ctx)?), None) - }) - } - - /// Parse an expression. - fn parse_expression(token: Spanned) -> ParseResult> { - Ok(Spanned::new(match token.v { - Token::Quoted(text) => Expression::Str(text.to_owned()), - Token::Text(text) => { - if let Ok(b) = text.parse::() { - Expression::Bool(b) - } else if let Ok(num) = text.parse::() { - Expression::Num(num) - } else if let Ok(size) = text.parse::() { - Expression::Size(size) - } else { - // This loop does not actually loop, but is used for breaking. - loop { - if text.ends_with('%') { - if let Ok(percent) = text[.. text.len()-1].parse::() { - break Expression::Num(percent / 100.0); - } - } - - break Expression::Ident(Ident::new(text.to_string())?); - } - } - } - _ => error!("expected expression"), - }, token.span)) - } - - /// Parse whitespace (as long as there is any) and skip over comments. 
- fn parse_white(&mut self) -> ParseResult<()> { - let mut state = NewlineState::Zero; - - while let Some(token) = self.tokens.peek() { - match token.v { - Token::Space => { - self.advance(); - match state { - NewlineState::Zero | NewlineState::TwoOrMore => { - self.add_space(token.span); - } - _ => {} - } - } - - Token::Newline => { - self.advance(); - match state { - NewlineState::Zero => state = NewlineState::One(token.span), - NewlineState::One(span) => { - self.add(Node::Newline, Span::merge(span, token.span)); - state = NewlineState::TwoOrMore; - }, - NewlineState::TwoOrMore => self.add_space(token.span), - } - } - - _ => { - if let NewlineState::One(span) = state { - self.add_space(Span::new(span.start, token.span.start)); - } - - state = NewlineState::Zero; - match token.v { - Token::LineComment(_) | Token::BlockComment(_) => self.advance(), - Token::StarSlash => error!("unexpected end of block comment"), - _ => break, - } - } - } - } - - Ok(()) - } - - /// Skip over whitespace and comments. - fn skip_white(&mut self) { - while let Some(token) = self.tokens.peek() { - match token.v { - Token::Space | Token::Newline | - Token::LineComment(_) | Token::BlockComment(_) => self.advance(), - _ => break, - } - } - } - - /// Advance the iterator by one step. - fn advance(&mut self) { - self.tokens.next(); - } - - /// Append a node to the tree. - fn add(&mut self, node: Node, span: Span) { - self.tree.nodes.push(Spanned::new(node, span)); - } - - /// Append a space, merging with a previous space if there is one. - fn add_space(&mut self, span: Span) { - match self.tree.nodes.last_mut() { - Some(ref mut node) if node.v == Node::Space => node.span.expand(span), - _ => self.add(Node::Space, span), - } - } - - /// Advance and return the given node. - fn add_consumed(&mut self, node: Node, span: Span) { - self.advance(); - self.add(node, span); - } - - /// Add a color token to the list. 
- fn add_color_token(&mut self, token: ColorToken, span: Span) { - self.color_tokens.push(Spanned::new(token, span)); - } - - /// Add a color token for an expression. - fn add_expr_token(&mut self, expr: &Spanned) { - let kind = match expr.v { - Expression::Bool(_) => ColorToken::ExprBool, - Expression::Ident(_) => ColorToken::ExprIdent, - Expression::Num(_) => ColorToken::ExprNumber, - Expression::Size(_) => ColorToken::ExprSize, - Expression::Str(_) => ColorToken::ExprStr, - }; - - self.add_color_token(kind, expr.span); - } -} - -/// Find the index of the first unbalanced and unescaped closing bracket. -fn find_closing_bracket(src: &str) -> Option<(usize, Position)> { - let mut parens = 0; - let mut escaped = false; - let mut line = 1; - let mut line_start_index = 0; - - for (index, c) in src.char_indices() { - match c { - '\\' => { - escaped = !escaped; - continue; - } - c if is_newline_char(c) => { - line += 1; - line_start_index = index + c.len_utf8(); - } - ']' if !escaped && parens == 0 => { - let position = Position { - line, - column: index - line_start_index, - }; - - return Some((index, position)) - } - '[' if !escaped => parens += 1, - ']' if !escaped => parens -= 1, - _ => {} - } - escaped = false; - } - None -} - -/// A peekable iterator for tokens which allows access to the original iterator -/// inside this module (which is needed by the parser). -#[derive(Debug, Clone)] -struct PeekableTokens<'s> { - tokens: Tokens<'s>, - peeked: Option>>>, -} - -impl<'s> PeekableTokens<'s> { - /// Create a new iterator from a string. - fn new(tokens: Tokens<'s>) -> PeekableTokens<'s> { - PeekableTokens { - tokens, - peeked: None, - } - } - - /// Peek at the next element. 
- fn peek(&mut self) -> Option>> { - let iter = &mut self.tokens; - *self.peeked.get_or_insert_with(|| iter.next()) - } - - fn get_position(&self) -> Position { - match self.peeked { - Some(Some(peeked)) => peeked.span.start, - _ => self.tokens.get_position(), - } - } - - fn string_index(&self) -> usize { - match self.peeked { - Some(Some(peeked)) => peeked.span.start.line, - _ => self.tokens.string_index(), - } - } - - fn set_string_index(&mut self, index: usize) { - self.tokens.set_string_index(index); - self.peeked = None; - } -} - -impl<'s> Iterator for PeekableTokens<'s> { - type Item = Spanned>; - - fn next(&mut self) -> Option { - match self.peeked.take() { - Some(value) => value, - None => self.tokens.next(), - } - } -} - - -#[cfg(test)] -#[allow(non_snake_case)] -mod tests { - use crate::func::{Commands, Scope}; - use crate::layout::{LayoutContext, LayoutResult}; - use crate::syntax::*; - use Node::{Func as F, Newline as N, Space as S}; - - function! { - /// A testing function which just parses it's body into a syntax - /// tree. - #[derive(Debug)] - pub struct TreeFn { pub tree: SyntaxTree } - - parse(args, body, ctx) { - args.clear(); - TreeFn { - tree: parse!(expected: body, ctx) - } - } - - layout() { vec![] } - } - - impl PartialEq for TreeFn { - fn eq(&self, other: &TreeFn) -> bool { - assert_tree_equal(&self.tree, &other.tree); - true - } - } - - function! { - /// A testing function without a body. 
- #[derive(Debug, Default, PartialEq)] - pub struct BodylessFn(Vec, Vec<(Ident, Expression)>); - - parse(args, body) { - parse!(forbidden: body); - BodylessFn( - args.pos().map(Spanned::value).collect(), - args.keys().map(|arg| (arg.v.key.v, arg.v.value.v)).collect(), - ) - } - - layout() { vec![] } - } - - mod args { - use super::*; - use super::Expression; - pub use Expression::{Num as N, Size as Z, Bool as B}; - - pub fn S(string: &str) -> Expression { Expression::Str(string.to_owned()) } - pub fn I(string: &str) -> Expression { - Expression::Ident(Ident::new(string.to_owned()).unwrap()) - } - } - - /// Asserts that two syntax trees are equal except for all spans inside them. - fn assert_tree_equal(a: &SyntaxTree, b: &SyntaxTree) { - for (x, y) in a.nodes.iter().zip(&b.nodes) { - if x.v != y.v { - panic!("trees are not equal: ({:#?}) != ({:#?})", x.v, y.v); - } - } - } - - /// Test if the source code parses into the syntax tree. - fn test(src: &str, tree: SyntaxTree) { - let ctx = ParseContext { - scope: &Scope::new(), - }; - assert_tree_equal(&parse(src, ctx).unwrap(), &tree); - } - - /// Test with a scope containing function definitions. - fn test_scoped(scope: &Scope, src: &str, tree: SyntaxTree) { - let ctx = ParseContext { scope }; - assert_tree_equal(&parse(src, ctx).unwrap(), &tree); - } - - /// Test if the source parses into the error. - fn test_err(src: &str, err: &str) { - let ctx = ParseContext { - scope: &Scope::new(), - }; - assert_eq!(parse(src, ctx).unwrap_err().to_string(), err); - } - - /// Test with a scope if the source parses into the error. 
- fn test_err_scoped(scope: &Scope, src: &str, err: &str) { - let ctx = ParseContext { scope }; - assert_eq!(parse(src, ctx).unwrap_err().to_string(), err); - } - - fn test_color(scope: &Scope, src: &str, tokens: Vec<(usize, usize, ColorToken)>) { - let ctx = ParseContext { scope }; - let tree = parse(src, ctx).unwrap(); - // assert_eq!(tree.tokens, - // tokens.into_iter() - // .map(|(s, e, t)| Spanned::new(t, Span::new(s, e))) - // .collect::>() - // ); - } - - /// Create a text node. - fn T(s: &str) -> Node { - Node::Text(s.to_owned()) - } - - fn zerospan(val: T) -> Spanned { - Spanned::new(val, Span::new(Position::new(0, 0), Position::new(0, 0))) - } - - /// Shortcut macro to create a syntax tree. Is `vec`-like and the elements - /// are the nodes without spans. - macro_rules! tree { - ($($x:expr),*) => ({ - #[allow(unused_mut)] let mut nodes = vec![]; - $( - nodes.push(zerospan($x)); - )* - SyntaxTree { nodes } - }); - ($($x:expr,)*) => (tree![$($x),*]) - } - - /// Shortcut macro to create a function. - macro_rules! func { - () => ( - FuncCall(Box::new(BodylessFn(vec![], vec![]))) - ); - (body: $tree:expr $(,)*) => ( - FuncCall(Box::new(TreeFn { tree: $tree })) - ); - (args: $pos:expr, $key:expr) => ( - FuncCall(Box::new(BodylessFn($pos, $key))) - ); - } - - /// Parse the basic cases. - #[test] - #[rustfmt::skip] - fn parse_base() { - test("", tree! []); - test("Hello World!", tree! [ T("Hello"), S, T("World!") ]); - } - - /// Test whether newlines generate the correct whitespace. - #[test] - #[rustfmt::skip] - fn parse_newlines_whitespace() { - test("Hello\nWorld", tree! [ T("Hello"), S, T("World") ]); - test("Hello \n World", tree! [ T("Hello"), S, T("World") ]); - test("Hello\n\nWorld", tree! [ T("Hello"), N, T("World") ]); - test("Hello \n\nWorld", tree! [ T("Hello"), S, N, T("World") ]); - test("Hello\n\n World", tree! [ T("Hello"), N, S, T("World") ]); - test("Hello \n \n \n World", tree! 
[ T("Hello"), S, N, S, T("World") ]); - test("Hello\n \n\n World", tree! [ T("Hello"), N, S, T("World") ]); - test("Hello\n \nWorld", tree! [ T("Hello"), N, T("World") ]); - } - - /// Parse things dealing with functions. - #[test] - #[rustfmt::skip] - fn parse_functions() { - let mut scope = Scope::new(); - scope.add::("test"); - scope.add::("end"); - scope.add::("modifier"); - scope.add::("func"); - - test_scoped(&scope,"[test]", tree! [ F(func! {}) ]); - test_scoped(&scope,"[ test]", tree! [ F(func! {}) ]); - test_scoped(&scope, "This is an [modifier][example] of a function invocation.", tree! [ - T("This"), S, T("is"), S, T("an"), S, - F(func! { body: tree! [ T("example") ] }), S, - T("of"), S, T("a"), S, T("function"), S, T("invocation.") - ]); - test_scoped(&scope, "[func][Hello][modifier][Here][end]", tree! [ - F(func! { body: tree! [ T("Hello") ] }), - F(func! { body: tree! [ T("Here") ] }), - F(func! {}), - ]); - test_scoped(&scope, "[func][]", tree! [ F(func! { body: tree! [] }) ]); - test_scoped(&scope, "[modifier][[func][call]] outside", tree! [ - F(func! { body: tree! [ F(func! { body: tree! [ T("call") ] }) ] }), S, T("outside") - ]); - - } - - /// Parse functions with arguments. - #[test] - #[rustfmt::skip] - fn parse_function_args() { - use args::*; - - fn func( - pos: Vec, - key: Vec<(&str, Expression)>, - ) -> SyntaxTree { - let key = key.into_iter() - .map(|s| (Ident::new(s.0.to_string()).unwrap(), s.1)) - .collect(); - - tree! 
[ F(func!(args: pos, key)) ] - } - - let mut scope = Scope::new(); - scope.add::("align"); - - test_scoped(&scope, "[align: left]", func(vec![I("left")], vec![])); - test_scoped(&scope, "[align: left,right]", func(vec![I("left"), I("right")], vec![])); - test_scoped(&scope, "[align: left, right]", func(vec![I("left"), I("right")], vec![])); - test_scoped(&scope, "[align: \"hello\"]", func(vec![S("hello")], vec![])); - test_scoped(&scope, r#"[align: "hello\"world"]"#, func(vec![S(r#"hello\"world"#)], vec![])); - test_scoped(&scope, "[align: 12]", func(vec![N(12.0)], vec![])); - test_scoped(&scope, "[align: 17.53pt]", func(vec![Z(Size::pt(17.53))], vec![])); - test_scoped(&scope, "[align: 2.4in]", func(vec![Z(Size::inches(2.4))], vec![])); - test_scoped(&scope, "[align: true, 10mm, left, \"hi, there\"]", - func(vec![B(true), Z(Size::mm(10.0)), I("left"), S("hi, there")], vec![])); - - test_scoped(&scope, "[align: right=true]", func(vec![], vec![("right", B(true))])); - test_scoped(&scope, "[align: flow = horizontal]", - func(vec![], vec![("flow", I("horizontal"))])); - test_scoped(&scope, "[align: x=1cm, y=20mm]", - func(vec![], vec![("x", Z(Size::cm(1.0))), ("y", Z(Size::mm(20.0)))])); - test_scoped(&scope, "[align: x=5.14,a, \"b\", c=me,d=you]", - func(vec![I("a"), S("b")], vec![("x", N(5.14)), ("c", I("me")), ("d", I("you"))])); - } - - /// Parse comments (line and block). - #[test] - #[rustfmt::skip] - fn parse_comments() { - let mut scope = Scope::new(); - scope.add::("test"); - scope.add::("func"); - - test_scoped(&scope, "Text\n// Comment\n More text", - tree! [ T("Text"), S, T("More"), S, T("text") ]); - test_scoped(&scope, "[test/*world*/]", - tree! [ F(func! {}) ]); - test_scoped(&scope, "[test/*]*/]", - tree! [ F(func! {}) ]); - } - - /// Test if escaped, but unbalanced parens are correctly parsed. 
- #[test] - #[rustfmt::skip] - fn parse_unbalanced_body_parens() { - let mut scope = Scope::new(); - scope.add::("code"); - - test_scoped(&scope, r"My [code][Close \]] end", tree! [ - T("My"), S, F(func! { body: tree! [ T("Close"), S, T("]") ] }), S, T("end") - ]); - test_scoped(&scope, r"My [code][\[ Open] end", tree! [ - T("My"), S, F(func! { body: tree! [ T("["), S, T("Open") ] }), S, T("end") - ]); - test_scoped(&scope, r"My [code][Open \] and \[ close]end", tree! [ - T("My"), S, F(func! { body: - tree! [ T("Open"), S, T("]"), S, T("and"), S, T("["), S, T("close") ] - }), T("end") - ]); - } - - /// Tests if the parser handles non-ASCII stuff correctly. - #[test] - #[rustfmt::skip] - fn parse_unicode() { - let mut scope = Scope::new(); - scope.add::("func"); - scope.add::("bold"); - - test_scoped(&scope, "[func] ⺐.", tree! [ F(func! {}), S, T("⺐.") ]); - test_scoped(&scope, "[bold][Hello 🌍!]", tree! [ - F(func! { body: tree! [ T("Hello"), S, T("🌍!") ] }) - ]); - } - - /// Tests whether spans get calculated correctly. - #[test] - #[rustfmt::skip] - fn parse_spans() { - fn test_span(src: &str, correct: Vec<(usize, usize, usize, usize)>) { - let mut scope = Scope::new(); - scope.add::("hello"); - let tree = parse(src, ParseContext { scope: &scope }).unwrap(); - let spans = tree.nodes.into_iter() - .map(|node| { - let Span { start, end } = node.span; - (start.line, start.column, end.line, end.column) - }) - .collect::>(); - - assert_eq!(spans, correct); - } - - test_span("hello world", vec![(1, 0, 1, 5), (1, 5, 1, 6), (1, 6, 1, 11)]); - test_span("p1\n \np2", vec![(1, 0, 1, 2), (1, 2, 2, 2), (3, 0, 3, 2)]); - - let src = "func\n [hello: pos, other][body\r\n _🌍_\n]"; - test_span(src, vec![ - (1, 0, 1, 4), - (1, 4, 2, 1), - (2, 1, 4, 1) - ]); - } - - /// Tests whether errors get reported correctly. 
- #[test] - #[rustfmt::skip] - fn parse_errors() { - let mut scope = Scope::new(); - scope.add::("hello"); - - test_err("No functions here]", "unexpected closing bracket"); - test_err_scoped(&scope, "[hello][world", "expected closing bracket"); - test_err("[hello world", "expected arguments or closing bracket"); - test_err("[ no^name][Why?]", "invalid identifier: `no^name`"); - test_err("Hello */", "unexpected end of block comment"); - } - - /// Tests syntax highlighting. - #[test] - #[rustfmt::skip] - fn test_highlighting() { - use ColorToken::{Bracket as B, FuncName as F, *}; - - let mut scope = Scope::new(); - scope.add::("func"); - scope.add::("tree"); - - test_color(&scope, "[func]", vec![(0, 1, B), (1, 5, F), (5, 6, B)]); - test_color(&scope, "[func: 12pt]", vec![ - (0, 1, B), (1, 5, F), (5, 6, Colon), (7, 11, ExprSize), (11, 12, B) - ]); - test_color(&scope, "[func: x=25.3, y=\"hi\"]", vec![ - (0, 1, B), (1, 5, F), (5, 6, Colon), - (7, 8, KeyArg), (8, 9, Equals), (9, 13, ExprNumber), - (13, 14, Comma), - (15, 16, KeyArg), (16, 17, Equals), (17, 21, ExprStr), - (21, 22, B), - ]); - - test_color(&scope, "Hello [tree][With [func: 3]]", vec![ - (6, 7, B), (7, 11, F), (11, 12, B), - (12, 13, B), (18, 19, B) - ]); - } -} diff --git a/src/syntax/span.rs b/src/syntax/span.rs index bc7001a96..bbb6a2061 100644 --- a/src/syntax/span.rs +++ b/src/syntax/span.rs @@ -45,8 +45,6 @@ impl Span { } pub fn merge(a: Span, b: Span) -> Span { - let start = a.start.min(b.start); - Span { start: a.start.min(b.start), end: a.end.max(b.end), diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs index cf37fe483..efcd1fc0a 100644 --- a/src/syntax/tokens.rs +++ b/src/syntax/tokens.rs @@ -1,88 +1,87 @@ -//! Tokenization of source code. - -use std::str::CharIndices; -use smallvec::SmallVec; +use std::iter::Peekable; +use std::str::Chars; use super::*; +use Token::*; +use State::*; -/// Builds an iterator over the tokens of the source code. 
pub fn tokenize(src: &str) -> Tokens { Tokens::new(src) } -/// An iterator over the tokens of source code. -#[derive(Debug, Clone)] -pub struct Tokens<'s> { - src: &'s str, - chars: PeekableChars<'s>, - state: TokensState, - stack: SmallVec<[TokensState; 1]>, - line: usize, - line_start_index: usize, +/// A minimal semantic entity of source code. +#[derive(Debug, Clone, PartialEq)] +pub enum Token<'s> { + /// One or more whitespace characters. The contained `usize` denotes the + /// number of newlines that were contained in the whitespace. + Whitespace(usize), + + /// A line comment with inner string contents `//<&'s str>\n`. + LineComment(&'s str), + /// A block comment with inner string contents `/*<&'s str>*/`. The comment + /// can contain nested block comments. + BlockComment(&'s str), + /// An erroneous `*/` without an opening block comment. + StarSlash, + + /// A left bracket: `[`. + LeftBracket, + /// A right bracket: `]`. + RightBracket, + + /// A left parenthesis in a function header: `(`. + LeftParen, + /// A right parenthesis in a function header: `)`. + RightParen, + /// A left brace in a function header: `{`. + LeftBrace, + /// A right brace in a function header: `}`. + RightBrace, + + /// A colon in a function header: `:`. + Colon, + /// A comma in a function header: `:`. + Comma, + /// An equals sign in a function header: `=`. + Equals, + + /// An expression in a function header. + Expr(Expression), + + /// A star in body-text. + Star, + /// An underscore in body-text. + Underscore, + /// A backtick in body-text. + Backtick, + + /// Any other consecutive string. + Text(&'s str), +} + +/// An iterator over the tokens of a string of source code. +pub struct Tokens<'s> { + src: &'s str, + chars: Characters<'s>, + state: State, + stack: Vec, } -/// The state the tokenizer is in. #[derive(Debug, Copy, Clone, Eq, PartialEq)] -enum TokensState { - /// The base state if there is nothing special we are in. 
+enum State { + Header, + StartBody, Body, - /// Inside a function header. Here colons and equal signs get parsed - /// as distinct tokens rather than text. - Function, - /// We expect either the end of the function or the beginning of the body. - MaybeBody, } impl<'s> Tokens<'s> { - /// Create a new token stream from source code. pub fn new(src: &'s str) -> Tokens<'s> { Tokens { src, - chars: PeekableChars::new(src), - state: TokensState::Body, - stack: SmallVec::new(), - line: 1, - line_start_index: 0, - } - } - - /// The index of the first character of the next token in the source string. - pub fn string_index(&self) -> usize { - self.chars.string_index() - } - - /// Go to a new position in the underlying string. - pub fn set_string_index(&mut self, index: usize) { - self.chars.set_string_index(index); - } - - /// The current position in the source. - pub fn get_position(&self) -> Position { - self.line_position(self.string_index()) - } - - /// Advance the iterator by one step. - fn advance(&mut self) { - self.chars.next(); - } - - /// Switch to the given state. - fn switch(&mut self, state: TokensState) { - self.stack.push(self.state); - self.state = state; - } - - /// Go back to the top-of-stack state. - fn unswitch(&mut self) { - self.state = self.stack.pop().unwrap_or(TokensState::Body); - } - - /// The `Position` with line and column for a string index. - fn line_position(&self, index: usize) -> Position { - Position { - line: self.line, - column: index - self.line_start_index, + chars: Characters::new(src), + state: State::Body, + stack: vec![], } } } @@ -90,455 +89,281 @@ impl<'s> Tokens<'s> { impl<'s> Iterator for Tokens<'s> { type Item = Spanned>; - /// Advance the iterator, return the next token or nothing. - fn next(&mut self) -> Option { - use TokensState as TS; + /// Parse the next token in the source code. 
+ fn next(&mut self) -> Option<Spanned<Token<'s>>> { + let start = self.chars.position(); + let first = self.chars.next()?; + let second = self.chars.peek(); - // Go to the body state if the function has a body or return to the top-of-stack - // state. - if self.state == TS::MaybeBody { - if let Some((index, '[')) = self.chars.peek() { - self.advance(); - self.state = TS::Body; - let span = Span::at(self.line_position(index)); - return Some(Spanned::new(Token::LeftBracket, span)); - } else { - self.unswitch(); - } - } + let token = match first { + // Comments. + '/' if second == Some('/') => self.parse_line_comment(), + '/' if second == Some('*') => self.parse_block_comment(), + '*' if second == Some('/') => { self.eat(); StarSlash } - // Take the next char and peek at the one behind. - let (pos, next) = self.chars.next()?; - let afterwards = self.chars.peekc(); + // Whitespace. + c if c.is_whitespace() => self.parse_whitespace(c), - /// The index at which the line ended, if it did. - let mut eol = None; - - let token = match next { - // Functions - '[' => { - self.switch(TS::Function); - Token::LeftBracket - } + // Functions. + '[' => { self.set_state(Header); LeftBracket } ']' => { - if self.state == TS::Function { - self.state = TS::MaybeBody; + if self.state == Header && second == Some('[') { + self.state = StartBody; } else { - self.unswitch(); + self.pop_state(); } - Token::RightBracket + RightBracket } - // Line comment - '/' if afterwards == Some('/') => { - let start = self.string_index() + 1; + // Syntactic elements in function headers. + '(' if self.state == Header => LeftParen, + ')' if self.state == Header => RightParen, + '{' if self.state == Header => LeftBrace, + '}' if self.state == Header => RightBrace, + ':' if self.state == Header => Colon, + ',' if self.state == Header => Comma, + '=' if self.state == Header => Equals, - while let Some(c) = self.chars.peekc() { - if is_newline_char(c) { - break; - } - self.advance(); - } + // String values.
+ '"' if self.state == Header => self.parse_string(), - let end = self.string_index(); - Token::LineComment(&self.src[start..end]) - } + // Style toggles. + '*' if self.state == Body => Star, + '_' if self.state == Body => Underscore, + '`' if self.state == Body => Backtick, - // Block comment - '/' if afterwards == Some('*') => { - let start = self.string_index() + 1; - let mut nested = 0; + // An escaped thing. + '\\' => self.parse_escaped(), - while let Some((_, c)) = self.chars.next() { - let after = self.chars.peekc(); - match (c, after) { - ('*', Some('/')) if nested == 0 => { - self.advance(); - break; - } - ('/', Some('*')) => { - self.advance(); - nested += 1 - } - ('*', Some('/')) => { - self.advance(); - nested -= 1 - } - _ => {} - } - } - - let end = self.string_index() - 2; - Token::BlockComment(&self.src[start..end]) - } - - // Unexpected end of block comment - '*' if afterwards == Some('/') => { - self.advance(); - Token::StarSlash - } - - // Whitespace - ' ' | '\t' => { - while let Some(c) = self.chars.peekc() { - match c { - ' ' | '\t' => self.advance(), - _ => break, - } - } - - Token::Space - } - - // Newlines - '\r' if afterwards == Some('\n') => { - self.advance(); - eol = Some(pos + "\r\n".len()); - Token::Newline - } - c if is_newline_char(c) => { - eol = Some(pos + c.len_utf8()); - Token::Newline - } - - // Star/Underscore/Backtick in bodies - '*' if self.state == TS::Body => Token::Star, - '_' if self.state == TS::Body => Token::Underscore, - '`' if self.state == TS::Body => Token::Backtick, - - // Context sensitive operators in headers - ':' if self.state == TS::Function => Token::Colon, - '=' if self.state == TS::Function => Token::Equals, - ',' if self.state == TS::Function => Token::Comma, - - // A string value. 
- '"' if self.state == TS::Function => { - let start = self.string_index(); - let mut end = start; - let mut escaped = false; - - while let Some((index, c)) = self.chars.next() { - end = index; - if c == '"' && !escaped { - break; - } - - escaped = c == '\\'; - } - - Token::Quoted(&self.src[start..end]) - } - - // Escaping - '\\' => { - if let Some((index, c)) = self.chars.peek() { - let escapable = match c { - '[' | ']' | '\\' | '*' | '_' | '`' | ':' | '=' | ',' | '/' => true, + // Expressions or just strings. + c => { + let word = self.read_string_until(|n| { + match n { + c if c.is_whitespace() => true, + '\\' | '[' | ']' | '*' | '_' | '`' | ':' | '=' | + ',' | '"' | '/' => true, _ => false, - }; - - if escapable { - self.advance(); - Token::Text(&self.src[index..index + c.len_utf8()]) - } else { - Token::Text("\\") } + }, false, -(c.len_utf8() as isize), 0); + + if self.state == Header { + self.parse_expr(word) } else { - Token::Text("\\") + Text(word) } } - - // Normal text - _ => { - // Find out when the word ends. - while let Some((_, c)) = self.chars.peek() { - let second = self.chars.peekn(1).map(|p| p.1); - - // Whether the next token is still from the text or not. 
- let continues = match c { - '[' | ']' | '\\' => false, - '*' | '_' | '`' if self.state == TS::Body => false, - ':' | '=' | ',' | '"' if self.state == TS::Function => false, - - '/' => second != Some('/') && second != Some('*'), - '*' => second != Some('/'), - - ' ' | '\t' => false, - c if is_newline_char(c) => false, - - _ => true, - }; - - if !continues { - break; - } - - self.advance(); - } - - let end = self.string_index(); - Token::Text(&self.src[pos..end]) - } }; - let start = self.line_position(pos); - let end = self.get_position(); - let span = Span::new(start, end); + let end = self.chars.position(); + let span = Span { start, end }; - if let Some(index) = eol { - self.line += 1; - self.line_start_index = index; - } - - Some(Spanned::new(token, span)) + Some(Spanned { v: token, span }) } } -/// Whether this character is a newline (or starts one). -pub(crate) fn is_newline_char(character: char) -> bool { +impl<'s> Tokens<'s> { + fn parse_line_comment(&mut self) -> Token<'s> { + LineComment(self.read_string_until(is_newline_char, false, 1, 0)) + } + + fn parse_block_comment(&mut self) -> Token<'s> { + enum Last { Slash, Star, Other } + use Last::*; + + self.eat(); + + let mut depth = 0; + let mut last = Last::Other; + + // Find the first `*/` that does not correspond to a nested `/*`. + // Remove the last two bytes to obtain the raw inner text without `*/`. 
+ BlockComment(self.read_string_until(|n| { + match n { + '/' => match last { + Star if depth == 0 => return true, + Star => depth -= 1, + _ => last = Slash + } + '*' => match last { + Slash => depth += 1, + _ => last = Star, + } + _ => last = Other, + } + + false + }, true, 0, -2)) + } + + fn parse_whitespace(&mut self, c: char) -> Token<'s> { + let mut newlines = if is_newline_char(c) { 1 } else { 0 }; + let mut last = c; + + self.read_string_until(|n| { + if is_newline_char(n) && !(last == '\r' && n == '\n') { + newlines += 1; + } + + last = n; + !n.is_whitespace() + }, false, 0, 0); + + Whitespace(newlines) + } + + fn parse_string(&mut self) -> Token<'s> { + let mut escaped = false; + Expr(Expression::Str(self.read_string_until(|n| { + if n == '"' && !escaped { + return true; + } else if n == '\\' { + escaped = !escaped; + } else { + escaped = false; + } + + false + }, true, 0, -1).to_string())) + } + + fn parse_escaped(&mut self) -> Token<'s> { + fn is_escapable(c: char) -> bool { + match c { + '\\' | '[' | ']' | '*' | '_' | '`' | '/' => true, + _ => false, + } + } + + let c = self.chars.peek().unwrap_or('n'); + if self.state == Body && is_escapable(c) { + let index = self.chars.index(); + self.eat(); + Text(&self.src[index .. 
index + c.len_utf8()]) + } else { + Text("\\") + } + } + + fn parse_expr(&mut self, word: &'s str) -> Token<'s> { + if let Ok(b) = word.parse::<bool>() { + Expr(Expression::Bool(b)) + } else if let Ok(num) = word.parse::<f64>() { + Expr(Expression::Num(num)) + } else if let Ok(num) = parse_percentage(word) { + Expr(Expression::Num(num / 100.0)) + } else if let Ok(size) = word.parse::<Size>() { + Expr(Expression::Size(size)) + } else if let Some(ident) = Ident::new(word) { + Expr(Expression::Ident(ident)) + } else { + Text(word) + } + } + + fn read_string_until<F>( + &mut self, + mut f: F, + eat_match: bool, + offset_start: isize, + offset_end: isize, + ) -> &'s str where F: FnMut(char) -> bool { + let start = ((self.chars.index() as isize) + offset_start) as usize; + let mut matched = false; + + while let Some(c) = self.chars.peek() { + if f(c) { + matched = true; + if eat_match { + self.chars.next(); + } + break; + } + + self.chars.next(); + } + + let mut end = self.chars.index(); + if matched { + end = ((end as isize) + offset_end) as usize; + } + + &self.src[start .. end] + } + + fn set_state(&mut self, state: State) { + self.stack.push(self.state); + self.state = state; + } + + fn pop_state(&mut self) { + self.state = self.stack.pop().unwrap_or(Body); + } + + fn eat(&mut self) { + self.chars.next(); + } +} + +fn parse_percentage(word: &str) -> Result<f64, ()> { + if word.ends_with('%') { + word[.. word.len() - 1].parse::<f64>().map_err(|_| ()) + } else { + Err(()) + } +} + +/// Whether this character denotes a newline. +fn is_newline_char(character: char) -> bool { match character { - '\n' | '\r' | '\u{000c}' | '\u{0085}' | '\u{2028}' | '\u{2029}' => true, + // Line Feed, Vertical Tab, Form Feed, Carriage Return. + '\x0A' ..= '\x0D' => true, + // Next Line, Line Separator, Paragraph Separator. + '\u{0085}' | '\u{2028}' | '\u{2029}' => true, _ => false, } } -/// A (index, char) iterator with double lookahead.
-#[derive(Debug, Clone)] -struct PeekableChars<'s> { - string: &'s str, - chars: CharIndices<'s>, - peeked: SmallVec<[Option<(usize, char)>; 2]>, - base: usize, +struct Characters<'s> { + iter: Peekable<Chars<'s>>, + position: Position, index: usize, } -impl<'s> PeekableChars<'s> { - /// Create a new iterator from a string. - fn new(string: &'s str) -> PeekableChars<'s> { - PeekableChars { - string, - chars: string.char_indices(), - peeked: SmallVec::new(), - base: 0, +impl<'s> Characters<'s> { + fn new(src: &'s str) -> Characters<'s> { + Characters { + iter: src.chars().peekable(), + position: Position::new(0, 0), index: 0, } } - /// Peek at the next element. - fn peek(&mut self) -> Option<(usize, char)> { - self.peekn(0) - } + fn next(&mut self) -> Option<char> { + let c = self.iter.next()?; + let len = c.len_utf8(); - /// Peek at the char of the next element. - fn peekc(&mut self) -> Option<char> { - self.peekn(0).map(|p| p.1) - } + self.index += len; - /// Peek at the element after the next element. - fn peekn(&mut self, n: usize) -> Option<(usize, char)> { - while self.peeked.len() <= n { - let next = self.next_inner(); - self.peeked.push(next); + if is_newline_char(c) && !(c == '\r' && self.peek() == Some('\n')) { + self.position.line += 1; + self.position.column = 0; + } else { + self.position.column += len; } - self.peeked[n] + Some(c) } - /// Return the next value of the inner iterator mapped with the offset.
- fn next_inner(&mut self) -> Option<(usize, char)> { - self.chars.next().map(|(i, c)| (self.base + i, c)) + fn peek(&mut self) -> Option<char> { + self.iter.peek().copied() } - fn string_index(&self) -> usize { + fn index(&self) -> usize { self.index } - fn set_string_index(&mut self, index: usize) { - self.chars = self.string[index..].char_indices(); - self.base = index; - self.index = 0; - self.peeked.clear(); - } -} - -impl Iterator for PeekableChars<'_> { - type Item = (usize, char); - - fn next(&mut self) -> Option<(usize, char)> { - let next = if !self.peeked.is_empty() { - self.peeked.remove(0) - } else { - self.next_inner() - }; - - if let Some((index, c)) = next { - self.index = index + c.len_utf8(); - } - - next - } -} - -#[cfg(test)] -mod tests { - use super::*; - use Token::{ - Backtick as TB, BlockComment as BC, Colon as C, Equals as E, LeftBracket as L, - LineComment as LC, Newline as N, Quoted as Q, RightBracket as R, Space as S, Star as TS, - StarSlash as SS, Text as T, Underscore as TU, - }; - - /// Test if the source code tokenizes to the tokens. - fn test(src: &str, tokens: Vec<Token>) { - assert_eq!(Tokens::new(src) - .map(|token| token.v) - .collect::<Vec<_>>(), tokens); - } - - /// Test if the tokens of the source code have the correct spans. - fn test_span(src: &str, spans: Vec<(usize, usize, usize, usize)>) { - assert_eq!(Tokens::new(src) - .map(|token| { - let Span { start, end } = token.span; - (start.line, start.column, end.line, end.column) - }) - .collect::<Vec<_>>(), spans); - } - - /// Tokenizes the basic building blocks. - #[test] - #[rustfmt::skip] - fn tokenize_base() { - test("", vec![]); - test("Hallo", vec![T("Hallo")]); - test("[", vec![L]); - test("]", vec![R]); - test("*", vec![TS]); - test("_", vec![TU]); - test("`", vec![TB]); - test("\n", vec![N]); - } - - /// This test looks if LF- and CRLF-style newlines get both identified correctly.
- #[test] - #[rustfmt::skip] - fn tokenize_whitespace_newlines() { - test(" \t", vec![S]); - test("First line\r\nSecond line\nThird line\n", vec![ - T("First"), S, T("line"), N, T("Second"), S, T("line"), N, - T("Third"), S, T("line"), N - ]); - test("Hello \n ", vec![T("Hello"), S, N, S]); - test("Dense\nTimes", vec![T("Dense"), N, T("Times")]); - } - - /// Tests if escaping with backslash works as it should. - #[test] - #[rustfmt::skip] - fn tokenize_escape() { - test(r"\[", vec![T("[")]); - test(r"\]", vec![T("]")]); - test(r"\**", vec![T("*"), TS]); - test(r"\*", vec![T("*")]); - test(r"\__", vec![T("_"), TU]); - test(r"\_", vec![T("_")]); - test(r"\hello", vec![T("\\"), T("hello")]); - } - - /// Tests if escaped strings work. - #[test] - #[rustfmt::skip] - fn tokenize_quoted() { - test(r#"[align: "hello\"world"]"#, vec![L, T("align"), C, S, Q(r#"hello\"world"#), R]); - } - - /// Tokenizes some more realistic examples. - #[test] - #[rustfmt::skip] - fn tokenize_examples() { - test(r" - [function][ - Test [italic][example]! - ] - ", vec![ - N, S, L, T("function"), R, L, N, S, T("Test"), S, L, T("italic"), R, L, - T("example"), R, T("!"), N, S, R, N, S - ]); - - test(r" - [page: size=A4] - [font: size=12pt] - - Das ist ein Beispielsatz mit *fetter* Schrift. - ", vec![ - N, S, L, T("page"), C, S, T("size"), E, T("A4"), R, N, S, - L, T("font"), C, S, T("size"), E, T("12pt"), R, N, N, S, - T("Das"), S, T("ist"), S, T("ein"), S, T("Beispielsatz"), S, T("mit"), S, - TS, T("fetter"), TS, S, T("Schrift."), N, S - ]); - } - - /// This test checks whether the colon and equals symbols get parsed correctly depending on the - /// context: Either in a function header or in a body. 
- #[test] - #[rustfmt::skip] - fn tokenize_symbols_context() { - test("[func: key=value][Answer: 7]", vec![ - L, T("func"), C, S, T("key"), E, T("value"), R, L, - T("Answer:"), S, T("7"), R - ]); - test("[[n: k=v]:x][:[=]]:=", vec![ - L, L, T("n"), C, S, T("k"), E, T("v"), R, C, T("x"), R, - L, T(":"), L, E, R, R, T(":=") - ]); - test("[hi: k=[func][body] v=1][hello]", vec![ - L, T("hi"), C, S, T("k"), E, L, T("func"), R, L, T("body"), R, S, - T("v"), E, T("1"), R, L, T("hello"), R - ]); - test("[func: __key__=value]", vec![L, T("func"), C, S, T("__key__"), E, T("value"), R]); - test("The /*[*/ answer: 7.", vec![T("The"), S, BC("["), S, T("answer:"), S, T("7.")]); - } - - /// Test if block and line comments get tokenized as expected. - #[test] - #[rustfmt::skip] - fn tokenize_comments() { - test("These // Line comments.", vec![T("These"), S, LC(" Line comments.")]); - test("This /* is */ a comment.", vec![T("This"), S, BC(" is "), S, T("a"), S, T("comment.")]); - test("[Head/*of*/][Body]", vec![L, T("Head"), BC("of"), R, L, T("Body"), R]); - test("/* Hey */ */", vec![BC(" Hey "), S, SS]); - test("Hey\n// Yoo /*\n*/", vec![T("Hey"), N, LC(" Yoo /*"), N, SS]); - test("/* My /* line // */ comment */", vec![BC(" My /* line // */ comment ")]) - } - - /// This test has a special look at the underscore syntax. - #[test] - #[rustfmt::skip] - fn tokenize_underscores() { - test("he_llo_world_ __ Now this_ is_ special!", - vec![T("he"), TU, T("llo"), TU, T("world"), TU, S, TU, TU, S, T("Now"), S, - T("this"), TU, S, T("is"), TU, S, T("special!")]); - } - - /// This test is for checking if non-ASCII characters get parsed correctly. - #[test] - #[rustfmt::skip] - fn tokenize_unicode() { - test("[document][Hello 🌍!]", vec![L, T("document"), R, L, T("Hello"), S, T("🌍!"), R]); - test("[f]⺐.", vec![L, T("f"), R, T("⺐.")]); - } - - /// This test checks if all tokens have the correct spans. 
- #[test] - #[rustfmt::skip] - fn tokenize_spans() { - test_span("Hello World", vec![(1, 0, 1, 5), (1, 5, 1, 6), (1, 6, 1, 11)]); - test_span("🌍_🎈", vec![(1, 0, 1, 4), (1, 4, 1, 5), (1, 5, 1, 9)]); - test_span("hello\nworld", vec![(1, 0, 1, 5), (1, 5, 1, 6), (2, 0, 2, 5)]); - test_span("[hello: world]", vec![ - (1, 0, 1, 1), (1, 1, 1, 6), (1, 6, 1, 7), - (1, 7, 1, 8), (1, 8, 1, 13), (1, 13, 1, 14) - ]); + fn position(&self) -> Position { + self.position } } diff --git a/tests/parse.rs b/tests/parse.rs index 953cc959f..e00b05d83 100644 --- a/tests/parse.rs +++ b/tests/parse.rs @@ -1,9 +1,26 @@ +#![allow(unused_imports)] +#![allow(non_snake_case)] + +use typstc::size::Size; use typstc::syntax::*; use Token::{ - Space as S, Newline as N, LeftBracket as LB, - RightBracket as RB, Text as T, * + Whitespace as W, + LineComment as LC, BlockComment as BC, StarSlash as SS, + LeftBracket as LB, RightBracket as RB, + LeftParen as LP, RightParen as RP, + LeftBrace as LBR, RightBrace as RBR, + Colon as CL, Comma as CM, Equals as EQ, Expr as E, + Star as ST, Underscore as U, Backtick as B, Text as T, }; +use Expression as Expr; +fn ID(ident: &str) -> Token { E(Expr::Ident(Ident::new(ident.to_string()).unwrap())) } +fn STR(ident: &str) -> Token { E(Expr::Str(ident.to_string())) } +fn SIZE(size: Size) -> Token<'static> { E(Expr::Size(size)) } +fn NUM(num: f64) -> Token<'static> { E(Expr::Num(num)) } +fn BOOL(b: bool) -> Token<'static> { E(Expr::Bool(b)) } + + /// Parses the test syntax. macro_rules! tokens { ($($src:expr =>($line:expr)=> $tokens:expr)*) => ({ diff --git a/tests/parsing/base.rs b/tests/parsing/base.rs deleted file mode 100644 index ad7d87c02..000000000 --- a/tests/parsing/base.rs +++ /dev/null @@ -1,78 +0,0 @@ -// Spaces, Newlines, Brackets. -"" => [] -" " => [S] -" " => [S] -"\t" => [S] -" \t" => [S] -"\n" => [N] -"\n " => [N, S] -" \n" => [S, N] -" \n " => [S, N, S] -"[" => [LB] -"]" => [RB] - -// Header only tokens. 
-"[:]" => [LB, Colon, RB] -"[=]" => [LB, Equals, RB] -"[,]" => [LB, Comma, RB] -":" => [T(":")] -"=" => [T("=")] -"," => [T(",")] -r#"["hi"]"# => [LB, Quoted("hi"), RB] -r#""hi""# => [T(r#""hi""#)] - -// Body only tokens. -"_" => [Underscore] -"*" => [Star] -"`" => [Backtick] -"[_]" => [LB, T("_"), RB] -"[*]" => [LB, T("*"), RB] -"[`]" => [LB, T("`"), RB] - -// Comments. -"//line" => [LineComment("line")] -"/*block*/" => [BlockComment("block")] -"*/" => [StarSlash] - -// Plain text. -"A" => [T("A")] -"Hello" => [T("Hello")] -"Hello-World" => [T("Hello-World")] -r#"A"B"# => [T(r#"A"B"#)] -"🌍" => [T("🌍")] - -// Escapes. -r"\[" => [T("[")] -r"\]" => [T("]")] -r"\\" => [T(r"\")] -r"[\[]" => [LB, T("["), RB] -r"[\]]" => [LB, T("]"), RB] -r"[\\]" => [LB, T(r"\"), RB] -r"\:" => [T(":")] -r"\=" => [T("=")] -r"\/" => [T("/")] -r"[\:]" => [LB, T(":"), RB] -r"[\=]" => [LB, T("="), RB] -r"[\,]" => [LB, T(","), RB] -r"\*" => [T("*")] -r"\_" => [T("_")] -r"\`" => [T("`")] -r"[\*]" => [LB, T("*"), RB] -r"[\_]" => [LB, T("_"), RB] -r"[\`]" => [LB, T("`"), RB] - -// Whitespace. -"Hello World" => [T("Hello"), S, T("World")] -"Hello World" => [T("Hello"), S, T("World")] -"Hello \t World" => [T("Hello"), S, T("World")] - -// Newline. -"First\n" => [T("First"), N] -"First \n" => [T("First"), S, N] -"First\n " => [T("First"), N, S] -"First \n " => [T("First"), S, N, S] -"First\nSecond" => [T("First"), N, T("Second")] -"First\r\nSecond" => [T("First"), N, T("Second")] -"First \nSecond" => [T("First"), S, N, T("Second")] -"First\n Second" => [T("First"), N, S, T("Second")] -"First \n Second" => [T("First"), S, N, S, T("Second")] diff --git a/tests/parsing/tokens.rs b/tests/parsing/tokens.rs new file mode 100644 index 000000000..4f5474bb2 --- /dev/null +++ b/tests/parsing/tokens.rs @@ -0,0 +1,62 @@ +// Whitespace. 
+"" => [] +" " => [W(0)] +" " => [W(0)] +"\t" => [W(0)] +" \t" => [W(0)] +"\n" => [W(1)] +"\n " => [W(1)] +" \n" => [W(1)] +" \n " => [W(1)] +" \n\t \n " => [W(2)] +"\r\n" => [W(1)] +" \r\r\n \x0D" => [W(3)] +"\n\r" => [W(2)] + +// Comments. +"a // bc\n " => [T("a"), W(0), LC(" bc"), W(1)] +"a //a//b\n " => [T("a"), W(0), LC("a//b"), W(1)] +"a //a//b\r\n" => [T("a"), W(0), LC("a//b"), W(1)] +"a //a//b\n\nhello" => [T("a"), W(0), LC("a//b"), W(2), T("hello")] +"/**/" => [BC("")] +"_/*_/*a*/*/" => [U, BC("_/*a*/")] +"/*/*/" => [BC("/*/")] +"abc*/" => [T("abc"), SS] + +// Header only tokens. +"[" => [LB] +"]" => [RB] +"[(){}:=,]" => [LB, LP, RP, LBR, RBR, CL, EQ, CM, RB] +"[a:b]" => [LB, ID("a"), CL, ID("b"), RB] +"[πŸŒ“, 🌍,]" => [LB, T("πŸŒ“"), CM, W(0), T("🌍"), CM, RB] +"[=]" => [LB, EQ, RB] +"[,]" => [LB, CM, RB] +"a: b" => [T("a"), T(":"), W(0), T("b")] +"c=d, " => [T("c"), T("=d"), T(","), W(0)] +r#"["hello\"world"]"# => [LB, STR(r#"hello\"world"#), RB] +r#"["hi", 12pt]"# => [LB, STR("hi"), CM, W(0), SIZE(Size::pt(12.0)), RB] +"\"hi\"" => [T("\"hi"), T("\"")] +"[a: true, x=1]" => [LB, ID("a"), CL, W(0), BOOL(true), CM, W(0), + ID("x"), EQ, NUM(1.0), RB] +"[120%]" => [LB, NUM(1.2), RB] + +// Body only tokens. +"_*`" => [U, ST, B] +"[_*`]" => [LB, T("_"), T("*"), T("`"), RB] +"hi_you_ there" => [T("hi"), U, T("you"), U, W(0), T("there")] + +// Escapes. +r"\[" => [T("[")] +r"\]" => [T("]")] +r"\\" => [T(r"\")] +r"\/" => [T("/")] +r"\*" => [T("*")] +r"\_" => [T("_")] +r"\`" => [T("`")] + +// Unescapable special symbols. +r"\:" => [T(r"\"), T(":")] +r"\=" => [T(r"\"), T("=")] +r"[\:]" => [LB, T(r"\"), CL, RB] +r"[\=]" => [LB, T(r"\"), EQ, RB] +r"[\,]" => [LB, T(r"\"), CM, RB]