From 5c66bac689f4551e30c20e57087d47245853b5fe Mon Sep 17 00:00:00 2001
From: Laurenz <laurmaedje@gmail.com>
Date: Fri, 3 May 2019 12:41:18 +0200
Subject: [PATCH] =?UTF-8?q?Parse=20line=20and=20block=20comments=20?=
 =?UTF-8?q?=F0=9F=93=94?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/parsing.rs | 283 ++++++++++++++++++++++++++++++++-----------------
 src/syntax.rs  |  24 +++--
 2 files changed, 202 insertions(+), 105 deletions(-)

diff --git a/src/parsing.rs b/src/parsing.rs
index 924c3ddec..a74a896c7 100644
--- a/src/parsing.rs
+++ b/src/parsing.rs
@@ -8,7 +8,7 @@ use smallvec::SmallVec;
 use unicode_xid::UnicodeXID;
 
 use crate::syntax::*;
-use crate::func::Scope;
+use crate::func::{Function, Scope};
 
 
 /// Builds an iterator over the tokens of the source code.
@@ -99,7 +99,7 @@ impl<'s> Iterator for Tokens<'s> {
         let afterwards = self.chars.peek().map(|p| p.1);
 
         Some(match next {
-            // Special characters
+            // Functions
             '[' => {
                 self.switch(TS::Function);
                 Token::LeftBracket
@@ -112,8 +112,47 @@ impl<'s> Iterator for Tokens<'s> {
                 }
                 Token::RightBracket
             },
-            '$' => Token::Dollar,
-            '#' => Token::Hashtag,
+
+            // Line comment
+            '/' if afterwards == Some('/') => {
+                let mut end = self.chars.next().unwrap();
+                let start = end.0 + end.1.len_utf8();
+
+                while let Some((index, c)) = self.chars.peek() {
+                    if is_newline_char(c) {
+                        break;
+                    }
+                    self.advance();
+                    end = (index, c);
+                }
+
+                let end = end.0 + end.1.len_utf8();
+                Token::LineComment(&self.src[start .. end])
+            },
+
+            // Block comment (nestable); the token holds the text between the outer delimiters.
+            '/' if afterwards == Some('*') => {
+                let mut end = self.chars.next().unwrap();
+                let start = end.0 + end.1.len_utf8();
+
+                let mut nested = 0;
+                while let Some((index, c)) = self.chars.next() {
+                    let after = self.chars.peek().map(|p| p.1);
+                    // Track the last content char; a nested delimiter counts with both its chars.
+                    end = match (c, after) {
+                        ('*', Some('/')) if nested == 0 => { self.advance(); break },
+                        ('/', Some('*')) => { self.advance(); nested += 1; (index + 1, '*') },
+                        ('*', Some('/')) => { self.advance(); nested -= 1; (index + 1, '/') },
+                        _ => (index, c),
+                    };
+                }
+
+                let end = end.0 + end.1.len_utf8();
+                Token::BlockComment(&self.src[start .. end])
+            },
+
+            // Unexpected end of block comment
+            '*' if afterwards == Some('/') => self.consumed(Token::StarSlash),
 
             // Whitespace
             ' ' | '\t' => {
@@ -126,25 +165,26 @@ impl<'s> Iterator for Tokens<'s> {
                 Token::Space
             }
 
+            // Newlines
+            '\r' if afterwards == Some('\n') => self.consumed(Token::Newline),
+            c if is_newline_char(c) => Token::Newline,
+
             // Context sensitive operators in headers
             ':' if self.state == TS::Function => Token::Colon,
             '=' if self.state == TS::Function => Token::Equals,
 
-            // Double star/underscore in bodies
+            // Double star/underscore and dollar in bodies
             '*' if self.state == TS::Body && afterwards == Some('*')
                 => self.consumed(Token::DoubleStar),
             '_' if self.state == TS::Body && afterwards == Some('_')
                 => self.consumed(Token::DoubleUnderscore),
-
-            // Newlines
-            '\r' if afterwards == Some('\n') => self.consumed(Token::Newline),
-            c if is_newline_char(c) => Token::Newline,
+            '$' if self.state == TS::Body => Token::Dollar,
 
             // Escaping
             '\\' => {
                 if let Some((index, c)) = self.chars.peek() {
                     let escapable = match c {
-                        '[' | ']' | '$' | '#' | '\\' | '*' | '_' => true,
+                        '[' | ']' | '$' | '#' | '\\' | '*' | '_' | '/' => true,
                         _ => false,
                     };
 
@@ -162,15 +202,18 @@ impl<'s> Iterator for Tokens<'s> {
                 // Find out when the word ends.
                 let mut end = (next_pos, next);
                 while let Some((index, c)) = self.chars.peek() {
+                    let second = self.chars.peek_second().map(|p| p.1);
+
                     // Whether the next token is still from the next or not.
                     let continues = match c {
                         '[' | ']' | '$' | '#' | '\\' => false,
                         ':' | '=' if self.state == TS::Function => false,
 
-                        '*' if self.state == TS::Body
-                             => self.chars.peek_second().map(|p| p.1) != Some('*'),
-                        '_' if self.state == TS::Body
-                             => self.chars.peek_second().map(|p| p.1) != Some('_'),
+                        // `**` toggles bold in bodies; `*/` must end the word in every state.
+                        '*' if self.state == TS::Body => second != Some('*') && second != Some('/'),
+                        '_' if self.state == TS::Body => second != Some('_'),
+                        '/' => second != Some('/') && second != Some('*'),
+                        '*' => second != Some('/'),
 
                         ' ' | '\t' => false,
                         c if is_newline_char(c) => false,
@@ -321,94 +364,89 @@ impl<'s> Parser<'s> {
 
     /// Parse the source into an abstract syntax tree.
     fn parse(mut self) -> ParseResult<SyntaxTree> {
-        use ParserState as PS;
-
-        while let Some(token) = self.tokens.peek() {
-            // Skip over comments.
-            if token == Token::Hashtag {
-                self.skip_while(|t| t != Token::Newline);
-                self.advance();
-            }
-
-            // Handles all the states.
-            match self.state {
-                PS::FirstNewline => match token {
-                    Token::Newline => {
-                        self.append_consumed(Node::Newline);
-                        self.switch(PS::WroteNewline);
-                    },
-                    Token::Space => self.append_space_consumed(),
-                    _ => {
-                        self.append_space();
-                        self.switch(PS::Body);
-                    },
-                }
-
-                PS::WroteNewline => match token {
-                    Token::Newline | Token::Space => self.append_space_consumed(),
-                    _ => self.switch(PS::Body),
-                }
-
-                PS::Body => match token {
-                    // Whitespace
-                    Token::Space => self.append_space_consumed(),
-                    Token::Newline => {
-                        self.advance();
-                        self.switch(PS::FirstNewline);
-                    },
-
-                    // Text
-                    Token::Text(word) => self.append_consumed(Node::Text(word.to_owned())),
-
-                    // Functions
-                    Token::LeftBracket => self.parse_function()?,
-                    Token::RightBracket => {
-                        return Err(ParseError::new("unexpected closing bracket"));
-                    },
-
-                    // Modifiers
-                    Token::DoubleUnderscore => self.append_consumed(Node::ToggleItalics),
-                    Token::DoubleStar => self.append_consumed(Node::ToggleBold),
-                    Token::Dollar => self.append_consumed(Node::ToggleMath),
-
-                    // Should not happen
-                    Token::Colon | Token::Equals | Token::Hashtag => unreachable!(),
-                },
-            }
+        // Loop through all the tokens.
+        while self.tokens.peek().is_some() {
+            self.parse_white()?;
+            self.parse_body_part()?;
         }
 
         Ok(self.tree)
     }
 
-    /// Parse a function from the current position.
-    fn parse_function(&mut self) -> ParseResult<()> {
+    /// Parse one body token: a function call, a style toggle or a word of text.
+    fn parse_body_part(&mut self) -> ParseResult<()> {
+        if let Some(token) = self.tokens.peek() {
+            match token {
+                // Functions
+                Token::LeftBracket => self.parse_func()?,
+                Token::RightBracket => return Err(ParseError::new("unexpected closing bracket")),
+
+                // Modifiers
+                Token::DoubleUnderscore => self.append_consumed(Node::ToggleItalics),
+                Token::DoubleStar => self.append_consumed(Node::ToggleBold),
+                Token::Dollar => self.append_consumed(Node::ToggleMath),
+
+                // Normal text
+                Token::Text(word) => self.append_consumed(Node::Text(word.to_owned())),
+
+                // Should not happen: in the body, Tokens yields colons and equals as text.
+                Token::Colon | Token::Equals => panic!("bad token for body: {:?}", token),
+
+                // Whitespace and comments are expected to be consumed by `parse_white` beforehand.
+                _ => panic!("unexpected token: {:?}", token),
+            }
+        }
+        Ok(())
+    }
+
+    /// Parse a complete function from the current position.
+    fn parse_func(&mut self) -> ParseResult<()> {
         // This should only be called if a left bracket was seen.
         assert!(self.tokens.next() == Some(Token::LeftBracket));
 
+        let header = self.parse_func_header()?;
+        let body = self.parse_func_body(&header)?;
+
+        // Finally this function is parsed to the end.
+        self.append(Node::Func(FuncCall {
+            header,
+            body,
+        }));
+
+        Ok(self.switch(ParserState::Body))
+    }
+
+    /// Parse a function header.
+    fn parse_func_header(&mut self) -> ParseResult<FuncHeader> {
         // The next token should be the name of the function.
+        self.parse_white()?;
         let name = match self.tokens.next() {
             Some(Token::Text(word)) => {
                 if is_identifier(word) {
                     Ok(word.to_owned())
                 } else {
-                    Err(ParseError::new("invalid identifier"))
+                    Err(ParseError::new(format!("invalid identifier: '{}'", word)))
                 }
             },
             _ => Err(ParseError::new("expected identifier")),
         }?;
 
         // Now the header should be closed.
+        self.parse_white()?;
         if self.tokens.next() != Some(Token::RightBracket) {
             return Err(ParseError::new("expected closing bracket"));
         }
 
         // Store the header information of the function invocation.
-        let header = FuncHeader {
+        Ok(FuncHeader {
             name,
             args: vec![],
             kwargs: HashMap::new(),
-        };
+        })
+    }
 
+    /// Parse the body of a function.
+    fn parse_func_body(&mut self, header: &FuncHeader) -> ParseResult<Box<dyn Function>> {
         // Whether the function has a body.
         let has_body = self.tokens.peek() == Some(Token::LeftBracket);
         if has_body {
@@ -420,7 +458,7 @@ impl<'s> Parser<'s> {
             .ok_or_else(|| ParseError::new(format!("unknown function: '{}'", &header.name)))?;
 
         // Do the parsing dependent on whether the function has a body.
-        let body = if has_body {
+        Ok(if has_body {
             // Find out the string which makes the body of this function.
             let (start, end) = self.tokens.current_index().and_then(|index| {
                 find_closing_bracket(&self.src[index..])
@@ -448,15 +486,48 @@ impl<'s> Parser<'s> {
                 body: None,
                 scope: &self.scope,
             })?
-        };
+        })
+    }
 
-        // Finally this function is parsed to the end.
-        self.append(Node::Func(FuncCall {
-            header,
-            body,
-        }));
+    /// Consume whitespace (appending space/newline nodes to the tree) and skip over comments.
+    fn parse_white(&mut self) -> ParseResult<()> {
+        while let Some(token) = self.tokens.peek() {
+            match self.state {
+                ParserState::FirstNewline => match token {
+                    Token::Newline => { // a second newline in a row yields a Newline node
+                        self.append_consumed(Node::Newline);
+                        self.switch(ParserState::WroteNewline);
+                    },
+                    Token::Space => self.append_space_consumed(),
+                    _ => { // a single newline only counts as a space
+                        self.append_space();
+                        self.switch(ParserState::Body);
+                    },
+                },
+                ParserState::WroteNewline => match token {
+                    Token::Newline | Token::Space => self.append_space_consumed(),
+                    _ => self.switch(ParserState::Body),
+                },
+                ParserState::Body => match token {
+                    // Whitespace
+                    Token::Space => self.append_space_consumed(),
+                    Token::Newline => { // consumed without a node; the next token decides its meaning
+                        self.advance();
+                        self.switch(ParserState::FirstNewline);
+                    },
+
-        Ok(self.switch(ParserState::Body))
+                    // Comments
+                    Token::LineComment(_) | Token::BlockComment(_) => self.advance(),
+                    Token::StarSlash => {
+                        return Err(ParseError::new("unexpected end of block comment"));
+                    },
+
+                    // Anything else ends the whitespace run and is left for the caller.
+                    _ => break,
+                }
+            }
+        }
+        Ok(())
     }
 
     /// Advance the iterator by one step.
@@ -492,16 +563,6 @@ impl<'s> Parser<'s> {
         self.advance();
         self.append_space();
     }
-
-    /// Skip tokens until the condition is met.
-    fn skip_while<F>(&mut self, f: F) where F: Fn(Token) -> bool {
-        while let Some(token) = self.tokens.peek() {
-            if !f(token) {
-                break;
-            }
-            self.advance();
-        }
-    }
 }
 
 /// Find the index of the first unbalanced (unescaped) closing bracket.
@@ -623,7 +684,7 @@ mod token_tests {
     use super::*;
     use Token::{Space as S, Newline as N, LeftBracket as L, RightBracket as R,
                 Colon as C, Equals as E, DoubleUnderscore as DU, DoubleStar as DS,
-                Dollar as D, Hashtag as H, Text as T};
+                Dollar as D, Text as T, LineComment as LC, BlockComment as BC, StarSlash as SS};
 
     /// Test if the source code tokenizes to the tokens.
     fn test(src: &str, tokens: Vec<Token>) {
@@ -638,7 +699,6 @@ mod token_tests {
         test("[", vec![L]);
         test("]", vec![R]);
         test("$", vec![D]);
-        test("#", vec![H]);
         test("**", vec![DS]);
         test("__", vec![DU]);
         test("\n", vec![N]);
@@ -709,11 +769,24 @@ mod token_tests {
                  T("v"), E, T("1"), R, L, T("hello"), R]);
         test("[func: __key__=value]",
              vec![L, T("func"), C, S, T("__key__"), E, T("value"), R]);
+        test("The /*[*/ answer: 7.",
+            vec![T("The"), S, BC("["), S, T("answer:"), S, T("7.")]);
     }
 
-    /// This test has a special look at the double underscore syntax, because
-    /// per Unicode standard they are not separate words and thus harder to parse
-    /// than the stars.
+    /// Test if block and line comments get tokenized as expected.
+    #[test]
+    fn tokenize_comments() {
+        test("These // Line comments.",
+            vec![T("These"), S, LC(" Line comments.")]);
+        test("This /* is */ a comment.",
+            vec![T("This"), S, BC(" is "), S, T("a"), S, T("comment.")]);
+        test("[Head/*of*/][Body]", vec![L, T("Head"), BC("of"), R, L, T("Body"), R]);
+        test("/* Hey */ */", vec![BC(" Hey "), S, SS]);
+        test("Hey\n// Yoo /*\n*/", vec![T("Hey"), N, LC(" Yoo /*"), N, SS]);
+        test("/* My /* line // */ comment */", vec![BC(" My /* line // */ comment ")])
+    }
+
+    /// This test has a special look at the double underscore syntax.
     #[test]
     fn tokenize_double_underscore() {
         test("he__llo__world_ _ __ Now this_ is__ special!",
@@ -876,6 +949,21 @@ mod parse_tests {
         ]);
     }
 
+    /// Parse comments (line and block).
+    #[test]
+    fn parse_comments() {
+        let mut scope = Scope::new();
+        scope.add::<BodylessFn>("test");
+        scope.add::<TreeFn>("func");
+
+        test_scoped(&scope, "Text\n// Comment\n More text",
+            tree! [ T("Text"), S, T("More"), S, T("text") ]);
+        test_scoped(&scope, "[test/*world*/]",
+            tree! [ F(func! { name => "test", body => None }) ]);
+        test_scoped(&scope, "[test/*]*/]",
+            tree! [ F(func! { name => "test", body => None }) ]);
+    }
+
     /// Test if escaped, but unbalanced parens are correctly parsed.
     #[test]
     fn parse_unbalanced_body_parens() {
@@ -933,6 +1021,7 @@ mod parse_tests {
         test_err("No functions here]", "unexpected closing bracket");
         test_err_scoped(&scope, "[hello][world", "expected closing bracket");
         test_err("[hello world", "expected closing bracket");
-        test_err("[ no-name][Why?]", "expected identifier");
+        test_err("[ no-name][Why?]", "invalid identifier: 'no-name'");
+        test_err("Hello */", "unexpected end of block comment");
     }
 }
diff --git a/src/syntax.rs b/src/syntax.rs
index a8ae930d6..87592c430 100644
--- a/src/syntax.rs
+++ b/src/syntax.rs
@@ -9,7 +9,7 @@ use crate::func::Function;
 pub enum Token<'s> {
     /// One or more whitespace (non-newline) codepoints.
     Space,
-    /// A line feed (either `\n` or `\r\n`).
+    /// A line feed (`\n`, `\r\n` and some more as defined by the Unicode standard).
     Newline,
     /// A left bracket: `[`.
     LeftBracket,
@@ -17,19 +17,27 @@ pub enum Token<'s> {
     RightBracket,
     /// A colon (`:`) indicating the beginning of function arguments.
     ///
-    /// If a colon occurs outside of the function header, it will be
+    /// If a colon occurs outside of a function header, it will be
     /// tokenized as a [Word](Token::Word).
     Colon,
-    /// Same as with [Colon](Token::Colon).
+    /// An equals (`=`) sign assigning a function argument a value.
+    ///
+    /// Outside of function headers, same as with [Colon](Token::Colon).
     Equals,
-    /// Two underscores, indicating text in _italics_.
+    /// Two underscores, indicating text in italics.
     DoubleUnderscore,
-    /// Two stars, indicating **bold** text.
+    /// Two stars, indicating bold text.
     DoubleStar,
-    /// A dollar sign, indicating _mathematical_ content.
+    /// A dollar sign, indicating mathematical content.
     Dollar,
-    /// A hashtag starting a _comment_.
-    Hashtag,
+    /// A line comment.
+    LineComment(&'s str),
+    /// A block comment.
+    BlockComment(&'s str),
+    /// A star followed by a slash unexpectedly ending a block comment
+    /// (the comment was not started before, otherwise a
+    /// [BlockComment](Token::BlockComment) would be returned).
+    StarSlash,
     /// Everything else is just text.
     Text(&'s str),
 }