From 5c66bac689f4551e30c20e57087d47245853b5fe Mon Sep 17 00:00:00 2001 From: Laurenz Date: Fri, 3 May 2019 12:41:18 +0200 Subject: [PATCH] =?UTF-8?q?Parse=20line=20and=20block=20comments=20?= =?UTF-8?q?=F0=9F=93=94?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/parsing.rs | 283 ++++++++++++++++++++++++++++++++----------------- src/syntax.rs | 24 +++-- 2 files changed, 202 insertions(+), 105 deletions(-) diff --git a/src/parsing.rs b/src/parsing.rs index 924c3ddec..a74a896c7 100644 --- a/src/parsing.rs +++ b/src/parsing.rs @@ -8,7 +8,7 @@ use smallvec::SmallVec; use unicode_xid::UnicodeXID; use crate::syntax::*; -use crate::func::Scope; +use crate::func::{Function, Scope}; /// Builds an iterator over the tokens of the source code. @@ -99,7 +99,7 @@ impl<'s> Iterator for Tokens<'s> { let afterwards = self.chars.peek().map(|p| p.1); Some(match next { - // Special characters + // Functions '[' => { self.switch(TS::Function); Token::LeftBracket @@ -112,8 +112,47 @@ impl<'s> Iterator for Tokens<'s> { } Token::RightBracket }, - '$' => Token::Dollar, - '#' => Token::Hashtag, + + // Line comment + '/' if afterwards == Some('/') => { + let mut end = self.chars.next().unwrap(); + let start = end.0 + end.1.len_utf8(); + + while let Some((index, c)) = self.chars.peek() { + if is_newline_char(c) { + break; + } + self.advance(); + end = (index, c); + } + + let end = end.0 + end.1.len_utf8(); + Token::LineComment(&self.src[start .. end]) + }, + + // Block comment + '/' if afterwards == Some('*') => { + let mut end = self.chars.next().unwrap(); + let start = end.0 + end.1.len_utf8(); + + let mut nested = 0; + while let Some((index, c)) = self.chars.next() { + let after = self.chars.peek().map(|p| p.1); + match (c, after) { + ('*', Some('/')) if nested == 0 => { self.advance(); break }, + ('/', Some('*')) => { self.advance(); nested += 1 }, + ('*', Some('/')) => { self.advance(); nested -= 1 }, + _ => {}, + } + end = (index, c); + } + + let end = end.0 + end.1.len_utf8(); + Token::BlockComment(&self.src[start .. end]) + }, + + // Unexpected end of block comment + '*' if afterwards == Some('/') => self.consumed(Token::StarSlash), // Whitespace ' ' | '\t' => { @@ -126,25 +165,26 @@ impl<'s> Iterator for Tokens<'s> { Token::Space } + // Newlines + '\r' if afterwards == Some('\n') => self.consumed(Token::Newline), + c if is_newline_char(c) => Token::Newline, + // Context sensitive operators in headers ':' if self.state == TS::Function => Token::Colon, '=' if self.state == TS::Function => Token::Equals, - // Double star/underscore in bodies + // Double star/underscore and dollar in bodies '*' if self.state == TS::Body && afterwards == Some('*') => self.consumed(Token::DoubleStar), '_' if self.state == TS::Body && afterwards == Some('_') => self.consumed(Token::DoubleUnderscore), - - // Newlines - '\r' if afterwards == Some('\n') => self.consumed(Token::Newline), - c if is_newline_char(c) => Token::Newline, + '$' if self.state == TS::Body => Token::Dollar, // Escaping '\\' => { if let Some((index, c)) = self.chars.peek() { let escapable = match c { - '[' | ']' | '$' | '#' | '\\' | '*' | '_' => true, + '[' | ']' | '$' | '#' | '\\' | '*' | '_' | '/' => true, _ => false, }; @@ -162,15 +202,18 @@ impl<'s> Iterator for Tokens<'s> { // Find out when the word ends. let mut end = (next_pos, next); while let Some((index, c)) = self.chars.peek() { + let second = self.chars.peek_second().map(|p| p.1); + // Whether the next token is still from the next or not. let continues = match c { '[' | ']' | '$' | '#' | '\\' => false, ':' | '=' if self.state == TS::Function => false, - '*' if self.state == TS::Body - => self.chars.peek_second().map(|p| p.1) != Some('*'), - '_' if self.state == TS::Body - => self.chars.peek_second().map(|p| p.1) != Some('_'), + '*' if self.state == TS::Body => second != Some('*'), + '_' if self.state == TS::Body => second != Some('_'), + + '/' => second != Some('/') && second != Some('*'), + '*' => second != Some('/'), ' ' | '\t' => false, c if is_newline_char(c) => false, @@ -321,94 +364,89 @@ impl<'s> Parser<'s> { /// Parse the source into an abstract syntax tree. fn parse(mut self) -> ParseResult { - use ParserState as PS; - - while let Some(token) = self.tokens.peek() { - // Skip over comments. - if token == Token::Hashtag { - self.skip_while(|t| t != Token::Newline); - self.advance(); - } - - // Handles all the states. - match self.state { - PS::FirstNewline => match token { - Token::Newline => { - self.append_consumed(Node::Newline); - self.switch(PS::WroteNewline); - }, - Token::Space => self.append_space_consumed(), - _ => { - self.append_space(); - self.switch(PS::Body); - }, - } - - PS::WroteNewline => match token { - Token::Newline | Token::Space => self.append_space_consumed(), - _ => self.switch(PS::Body), - } - - PS::Body => match token { - // Whitespace - Token::Space => self.append_space_consumed(), - Token::Newline => { - self.advance(); - self.switch(PS::FirstNewline); - }, - - // Text - Token::Text(word) => self.append_consumed(Node::Text(word.to_owned())), - - // Functions - Token::LeftBracket => self.parse_function()?, - Token::RightBracket => { - return Err(ParseError::new("unexpected closing bracket")); - }, - - // Modifiers - Token::DoubleUnderscore => self.append_consumed(Node::ToggleItalics), - Token::DoubleStar => self.append_consumed(Node::ToggleBold), - Token::Dollar => self.append_consumed(Node::ToggleMath), - - // Should not happen - Token::Colon | Token::Equals | Token::Hashtag => unreachable!(), - }, - } + // Loop through all the tokens. + while self.tokens.peek().is_some() { + self.parse_white()?; + self.parse_body_part()?; } Ok(self.tree) } - /// Parse a function from the current position. - fn parse_function(&mut self) -> ParseResult<()> { + /// Parse part of the body. + fn parse_body_part(&mut self) -> ParseResult<()> { + if let Some(token) = self.tokens.peek() { + match token { + // Functions + Token::LeftBracket => self.parse_func()?, + Token::RightBracket => return Err(ParseError::new("unexpected closing bracket")), + + // Modifiers + Token::DoubleUnderscore => self.append_consumed(Node::ToggleItalics), + Token::DoubleStar => self.append_consumed(Node::ToggleBold), + Token::Dollar => self.append_consumed(Node::ToggleMath), + + // Normal text + Token::Text(word) => self.append_consumed(Node::Text(word.to_owned())), + + Token::Colon | Token::Equals => panic!("bad token for body: {:?}", token), + + // The rest is handled elsewhere or should not happen, because Tokens does + // not yield colons or equals in the body, but their text equivalents instead. + _ => panic!("unexpected token: {:?}", token), + } + } + Ok(()) + } + + /// Parse a complete function from the current position. + fn parse_func(&mut self) -> ParseResult<()> { // This should only be called if a left bracket was seen. assert!(self.tokens.next() == Some(Token::LeftBracket)); + let header = self.parse_func_header()?; + let body = self.parse_func_body(&header)?; + + // Finally this function is parsed to the end. + self.append(Node::Func(FuncCall { + header, + body, + })); + + Ok(self.switch(ParserState::Body)) + } + + /// Parse a function header. + fn parse_func_header(&mut self) -> ParseResult { // The next token should be the name of the function. + self.parse_white()?; let name = match self.tokens.next() { Some(Token::Text(word)) => { if is_identifier(word) { Ok(word.to_owned()) } else { - Err(ParseError::new("invalid identifier")) + Err(ParseError::new(format!("invalid identifier: '{}'", word))) } }, _ => Err(ParseError::new("expected identifier")), }?; // Now the header should be closed. + self.parse_white()?; if self.tokens.next() != Some(Token::RightBracket) { return Err(ParseError::new("expected closing bracket")); } // Store the header information of the function invocation. - let header = FuncHeader { + Ok(FuncHeader { name, args: vec![], kwargs: HashMap::new(), - }; + }) + } + /// Parse the body of a function. + fn parse_func_body(&mut self, header: &FuncHeader) -> ParseResult> { // Whether the function has a body. let has_body = self.tokens.peek() == Some(Token::LeftBracket); if has_body { @@ -420,7 +458,7 @@ impl<'s> Parser<'s> { .ok_or_else(|| ParseError::new(format!("unknown function: '{}'", &header.name)))?; // Do the parsing dependent on whether the function has a body. - let body = if has_body { + Ok(if has_body { // Find out the string which makes the body of this function. let (start, end) = self.tokens.current_index().and_then(|index| { find_closing_bracket(&self.src[index..]) @@ -448,15 +486,48 @@ impl<'s> Parser<'s> { body: None, scope: &self.scope, })? - }; + }) + } - // Finally this function is parsed to the end. - self.append(Node::Func(FuncCall { - header, - body, - })); + /// Parse whitespace (as long as there is any) and skip over comments. + fn parse_white(&mut self) -> ParseResult<()> { + while let Some(token) = self.tokens.peek() { + match self.state { + ParserState::FirstNewline => match token { + Token::Newline => { + self.append_consumed(Node::Newline); + self.switch(ParserState::WroteNewline); + }, + Token::Space => self.append_space_consumed(), + _ => { + self.append_space(); + self.switch(ParserState::Body); + }, + }, + ParserState::WroteNewline => match token { + Token::Newline | Token::Space => self.append_space_consumed(), + _ => self.switch(ParserState::Body), + }, + ParserState::Body => match token { + // Whitespace + Token::Space => self.append_space_consumed(), + Token::Newline => { + self.advance(); + self.switch(ParserState::FirstNewline); + }, - Ok(self.switch(ParserState::Body)) + // Comments + Token::LineComment(_) | Token::BlockComment(_) => self.advance(), + Token::StarSlash => { + return Err(ParseError::new("unexpected end of block comment")); + }, + + // Anything else skips out of the function. + _ => break, + } + } + } + Ok(()) } /// Advance the iterator by one step. @@ -492,16 +563,6 @@ impl<'s> Parser<'s> { self.advance(); self.append_space(); } - - /// Skip tokens until the condition is met. - fn skip_while(&mut self, f: F) where F: Fn(Token) -> bool { - while let Some(token) = self.tokens.peek() { - if !f(token) { - break; - } - self.advance(); - } - } } /// Find the index of the first unbalanced (unescaped) closing bracket. @@ -623,7 +684,7 @@ mod token_tests { use super::*; use Token::{Space as S, Newline as N, LeftBracket as L, RightBracket as R, Colon as C, Equals as E, DoubleUnderscore as DU, DoubleStar as DS, - Dollar as D, Hashtag as H, Text as T}; + Dollar as D, Text as T, LineComment as LC, BlockComment as BC, StarSlash as SS}; /// Test if the source code tokenizes to the tokens. fn test(src: &str, tokens: Vec) { @@ -638,7 +699,6 @@ mod token_tests { test("[", vec![L]); test("]", vec![R]); test("$", vec![D]); - test("#", vec![H]); test("**", vec![DS]); test("__", vec![DU]); test("\n", vec![N]); @@ -709,11 +769,24 @@ mod token_tests { T("v"), E, T("1"), R, L, T("hello"), R]); test("[func: __key__=value]", vec![L, T("func"), C, S, T("__key__"), E, T("value"), R]); + test("The /*[*/ answer: 7.", + vec![T("The"), S, BC("["), S, T("answer:"), S, T("7.")]); } - /// This test has a special look at the double underscore syntax, because - /// per Unicode standard they are not separate words and thus harder to parse - /// than the stars. + /// Test if block and line comments get tokenized as expected. + #[test] + fn tokenize_comments() { + test("These // Line comments.", + vec![T("These"), S, LC(" Line comments.")]); + test("This /* is */ a comment.", + vec![T("This"), S, BC(" is "), S, T("a"), S, T("comment.")]); + test("[Head/*of*/][Body]", vec![L, T("Head"), BC("of"), R, L, T("Body"), R]); + test("/* Hey */ */", vec![BC(" Hey "), S, SS]); + test("Hey\n// Yoo /*\n*/", vec![T("Hey"), N, LC(" Yoo /*"), N, SS]); + test("/* My /* line // */ comment */", vec![BC(" My /* line // */ comment ")]) + } + + /// This test has a special look at the double underscore syntax. #[test] fn tokenize_double_underscore() { test("he__llo__world_ _ __ Now this_ is__ special!", @@ -876,6 +949,21 @@ mod parse_tests { ]); } + /// Parse comments (line and block). + #[test] + fn parse_comments() { + let mut scope = Scope::new(); + scope.add::("test"); + scope.add::("func"); + + test_scoped(&scope, "Text\n// Comment\n More text", + tree! [ T("Text"), S, T("More"), S, T("text") ]); + test_scoped(&scope, "[test/*world*/]", + tree! [ F(func! { name => "test", body => None }) ]); + test_scoped(&scope, "[test/*]*/]", + tree! [ F(func! { name => "test", body => None }) ]); + } + /// Test if escaped, but unbalanced parens are correctly parsed. #[test] fn parse_unbalanced_body_parens() { @@ -933,6 +1021,7 @@ mod parse_tests { test_err("No functions here]", "unexpected closing bracket"); test_err_scoped(&scope, "[hello][world", "expected closing bracket"); test_err("[hello world", "expected closing bracket"); - test_err("[ no-name][Why?]", "expected identifier"); + test_err("[ no-name][Why?]", "invalid identifier: 'no-name'"); + test_err("Hello */", "unexpected end of block comment"); } } diff --git a/src/syntax.rs b/src/syntax.rs index a8ae930d6..87592c430 100644 --- a/src/syntax.rs +++ b/src/syntax.rs @@ -9,7 +9,7 @@ use crate::func::Function; pub enum Token<'s> { /// One or more whitespace (non-newline) codepoints. Space, - /// A line feed (either `\n` or `\r\n`). + /// A line feed (`\n`, `\r\n` and some more as defined by the Unicode standard). Newline, /// A left bracket: `[`. LeftBracket, @@ -17,19 +17,27 @@ pub enum Token<'s> { RightBracket, /// A colon (`:`) indicating the beginning of function arguments. /// - /// If a colon occurs outside of the function header, it will be + /// If a colon occurs outside of a function header, it will be /// tokenized as a [Word](Token::Word). Colon, - /// Same as with [Colon](Token::Colon). + /// An equals (`=`) sign assigning a function argument a value. + /// + /// Outside of functions headers, same as with [Colon](Token::Colon). Equals, - /// Two underscores, indicating text in _italics_. + /// Two underscores, indicating text in italics. DoubleUnderscore, - /// Two stars, indicating **bold** text. + /// Two stars, indicating bold text. DoubleStar, - /// A dollar sign, indicating _mathematical_ content. + /// A dollar sign, indicating mathematical content. Dollar, - /// A hashtag starting a _comment_. - Hashtag, + /// A line comment. + LineComment(&'s str), + /// A block comment. + BlockComment(&'s str), + /// A star followed by a slash unexpectedly ending a block comment + /// (the comment was not started before, otherwise a + /// [BlockComment](Token::BlockComment) would be returned). + StarSlash, /// Everything else is just text. Text(&'s str), }