Parse line and block comments 📔

This commit is contained in:
Laurenz 2019-05-03 12:41:18 +02:00
parent bc78974fd2
commit 5c66bac689
2 changed files with 202 additions and 105 deletions

View File

@ -8,7 +8,7 @@ use smallvec::SmallVec;
use unicode_xid::UnicodeXID; use unicode_xid::UnicodeXID;
use crate::syntax::*; use crate::syntax::*;
use crate::func::Scope; use crate::func::{Function, Scope};
/// Builds an iterator over the tokens of the source code. /// Builds an iterator over the tokens of the source code.
@ -99,7 +99,7 @@ impl<'s> Iterator for Tokens<'s> {
let afterwards = self.chars.peek().map(|p| p.1); let afterwards = self.chars.peek().map(|p| p.1);
Some(match next { Some(match next {
// Special characters // Functions
'[' => { '[' => {
self.switch(TS::Function); self.switch(TS::Function);
Token::LeftBracket Token::LeftBracket
@ -112,8 +112,47 @@ impl<'s> Iterator for Tokens<'s> {
} }
Token::RightBracket Token::RightBracket
}, },
'$' => Token::Dollar,
'#' => Token::Hashtag, // Line comment
'/' if afterwards == Some('/') => {
let mut end = self.chars.next().unwrap();
let start = end.0 + end.1.len_utf8();
while let Some((index, c)) = self.chars.peek() {
if is_newline_char(c) {
break;
}
self.advance();
end = (index, c);
}
let end = end.0 + end.1.len_utf8();
Token::LineComment(&self.src[start .. end])
},
// Block comment
'/' if afterwards == Some('*') => {
let mut end = self.chars.next().unwrap();
let start = end.0 + end.1.len_utf8();
let mut nested = 0;
while let Some((index, c)) = self.chars.next() {
let after = self.chars.peek().map(|p| p.1);
match (c, after) {
('*', Some('/')) if nested == 0 => { self.advance(); break },
('/', Some('*')) => { self.advance(); nested += 1 },
('*', Some('/')) => { self.advance(); nested -= 1 },
_ => {},
}
end = (index, c);
}
let end = end.0 + end.1.len_utf8();
Token::BlockComment(&self.src[start .. end])
},
// Unexpected end of block comment
'*' if afterwards == Some('/') => self.consumed(Token::StarSlash),
// Whitespace // Whitespace
' ' | '\t' => { ' ' | '\t' => {
@ -126,25 +165,26 @@ impl<'s> Iterator for Tokens<'s> {
Token::Space Token::Space
} }
// Newlines
'\r' if afterwards == Some('\n') => self.consumed(Token::Newline),
c if is_newline_char(c) => Token::Newline,
// Context sensitive operators in headers // Context sensitive operators in headers
':' if self.state == TS::Function => Token::Colon, ':' if self.state == TS::Function => Token::Colon,
'=' if self.state == TS::Function => Token::Equals, '=' if self.state == TS::Function => Token::Equals,
// Double star/underscore in bodies // Double star/underscore and dollar in bodies
'*' if self.state == TS::Body && afterwards == Some('*') '*' if self.state == TS::Body && afterwards == Some('*')
=> self.consumed(Token::DoubleStar), => self.consumed(Token::DoubleStar),
'_' if self.state == TS::Body && afterwards == Some('_') '_' if self.state == TS::Body && afterwards == Some('_')
=> self.consumed(Token::DoubleUnderscore), => self.consumed(Token::DoubleUnderscore),
'$' if self.state == TS::Body => Token::Dollar,
// Newlines
'\r' if afterwards == Some('\n') => self.consumed(Token::Newline),
c if is_newline_char(c) => Token::Newline,
// Escaping // Escaping
'\\' => { '\\' => {
if let Some((index, c)) = self.chars.peek() { if let Some((index, c)) = self.chars.peek() {
let escapable = match c { let escapable = match c {
'[' | ']' | '$' | '#' | '\\' | '*' | '_' => true, '[' | ']' | '$' | '#' | '\\' | '*' | '_' | '/' => true,
_ => false, _ => false,
}; };
@ -162,15 +202,18 @@ impl<'s> Iterator for Tokens<'s> {
// Find out when the word ends. // Find out when the word ends.
let mut end = (next_pos, next); let mut end = (next_pos, next);
while let Some((index, c)) = self.chars.peek() { while let Some((index, c)) = self.chars.peek() {
let second = self.chars.peek_second().map(|p| p.1);
// Whether the next token is still from the next or not. // Whether the next token is still from the next or not.
let continues = match c { let continues = match c {
'[' | ']' | '$' | '#' | '\\' => false, '[' | ']' | '$' | '#' | '\\' => false,
':' | '=' if self.state == TS::Function => false, ':' | '=' if self.state == TS::Function => false,
'*' if self.state == TS::Body '*' if self.state == TS::Body => second != Some('*'),
=> self.chars.peek_second().map(|p| p.1) != Some('*'), '_' if self.state == TS::Body => second != Some('_'),
'_' if self.state == TS::Body
=> self.chars.peek_second().map(|p| p.1) != Some('_'), '/' => second != Some('/') && second != Some('*'),
'*' => second != Some('/'),
' ' | '\t' => false, ' ' | '\t' => false,
c if is_newline_char(c) => false, c if is_newline_char(c) => false,
@ -321,94 +364,89 @@ impl<'s> Parser<'s> {
/// Parse the source into an abstract syntax tree. /// Parse the source into an abstract syntax tree.
fn parse(mut self) -> ParseResult<SyntaxTree> { fn parse(mut self) -> ParseResult<SyntaxTree> {
use ParserState as PS; // Loop through all the tokens.
while self.tokens.peek().is_some() {
while let Some(token) = self.tokens.peek() { self.parse_white()?;
// Skip over comments. self.parse_body_part()?;
if token == Token::Hashtag {
self.skip_while(|t| t != Token::Newline);
self.advance();
}
// Handles all the states.
match self.state {
PS::FirstNewline => match token {
Token::Newline => {
self.append_consumed(Node::Newline);
self.switch(PS::WroteNewline);
},
Token::Space => self.append_space_consumed(),
_ => {
self.append_space();
self.switch(PS::Body);
},
}
PS::WroteNewline => match token {
Token::Newline | Token::Space => self.append_space_consumed(),
_ => self.switch(PS::Body),
}
PS::Body => match token {
// Whitespace
Token::Space => self.append_space_consumed(),
Token::Newline => {
self.advance();
self.switch(PS::FirstNewline);
},
// Text
Token::Text(word) => self.append_consumed(Node::Text(word.to_owned())),
// Functions
Token::LeftBracket => self.parse_function()?,
Token::RightBracket => {
return Err(ParseError::new("unexpected closing bracket"));
},
// Modifiers
Token::DoubleUnderscore => self.append_consumed(Node::ToggleItalics),
Token::DoubleStar => self.append_consumed(Node::ToggleBold),
Token::Dollar => self.append_consumed(Node::ToggleMath),
// Should not happen
Token::Colon | Token::Equals | Token::Hashtag => unreachable!(),
},
}
} }
Ok(self.tree) Ok(self.tree)
} }
/// Parse a function from the current position. /// Parse part of the body.
fn parse_function(&mut self) -> ParseResult<()> { fn parse_body_part(&mut self) -> ParseResult<()> {
if let Some(token) = self.tokens.peek() {
match token {
// Functions
Token::LeftBracket => self.parse_func()?,
Token::RightBracket => return Err(ParseError::new("unexpected closing bracket")),
// Modifiers
Token::DoubleUnderscore => self.append_consumed(Node::ToggleItalics),
Token::DoubleStar => self.append_consumed(Node::ToggleBold),
Token::Dollar => self.append_consumed(Node::ToggleMath),
// Normal text
Token::Text(word) => self.append_consumed(Node::Text(word.to_owned())),
Token::Colon | Token::Equals => panic!("bad token for body: {:?}", token),
// The rest is handled elsewhere or should not happen, because Tokens does
// not yield colons or equals in the body, but their text equivalents instead.
_ => panic!("unexpected token: {:?}", token),
}
}
Ok(())
}
/// Parse a complete function from the current position.
fn parse_func(&mut self) -> ParseResult<()> {
// This should only be called if a left bracket was seen. // This should only be called if a left bracket was seen.
assert!(self.tokens.next() == Some(Token::LeftBracket)); assert!(self.tokens.next() == Some(Token::LeftBracket));
let header = self.parse_func_header()?;
let body = self.parse_func_body(&header)?;
// Finally this function is parsed to the end.
self.append(Node::Func(FuncCall {
header,
body,
}));
Ok(self.switch(ParserState::Body))
}
/// Parse a function header.
fn parse_func_header(&mut self) -> ParseResult<FuncHeader> {
// The next token should be the name of the function. // The next token should be the name of the function.
self.parse_white()?;
let name = match self.tokens.next() { let name = match self.tokens.next() {
Some(Token::Text(word)) => { Some(Token::Text(word)) => {
if is_identifier(word) { if is_identifier(word) {
Ok(word.to_owned()) Ok(word.to_owned())
} else { } else {
Err(ParseError::new("invalid identifier")) Err(ParseError::new(format!("invalid identifier: '{}'", word)))
} }
}, },
_ => Err(ParseError::new("expected identifier")), _ => Err(ParseError::new("expected identifier")),
}?; }?;
// Now the header should be closed. // Now the header should be closed.
self.parse_white()?;
if self.tokens.next() != Some(Token::RightBracket) { if self.tokens.next() != Some(Token::RightBracket) {
return Err(ParseError::new("expected closing bracket")); return Err(ParseError::new("expected closing bracket"));
} }
// Store the header information of the function invocation. // Store the header information of the function invocation.
let header = FuncHeader { Ok(FuncHeader {
name, name,
args: vec![], args: vec![],
kwargs: HashMap::new(), kwargs: HashMap::new(),
}; })
}
/// Parse the body of a function.
fn parse_func_body(&mut self, header: &FuncHeader) -> ParseResult<Box<dyn Function>> {
// Whether the function has a body. // Whether the function has a body.
let has_body = self.tokens.peek() == Some(Token::LeftBracket); let has_body = self.tokens.peek() == Some(Token::LeftBracket);
if has_body { if has_body {
@ -420,7 +458,7 @@ impl<'s> Parser<'s> {
.ok_or_else(|| ParseError::new(format!("unknown function: '{}'", &header.name)))?; .ok_or_else(|| ParseError::new(format!("unknown function: '{}'", &header.name)))?;
// Do the parsing dependent on whether the function has a body. // Do the parsing dependent on whether the function has a body.
let body = if has_body { Ok(if has_body {
// Find out the string which makes the body of this function. // Find out the string which makes the body of this function.
let (start, end) = self.tokens.current_index().and_then(|index| { let (start, end) = self.tokens.current_index().and_then(|index| {
find_closing_bracket(&self.src[index..]) find_closing_bracket(&self.src[index..])
@ -448,15 +486,48 @@ impl<'s> Parser<'s> {
body: None, body: None,
scope: &self.scope, scope: &self.scope,
})? })?
}; })
}
// Finally this function is parsed to the end. /// Parse whitespace (as long as there is any) and skip over comments.
self.append(Node::Func(FuncCall { fn parse_white(&mut self) -> ParseResult<()> {
header, while let Some(token) = self.tokens.peek() {
body, match self.state {
})); ParserState::FirstNewline => match token {
Token::Newline => {
self.append_consumed(Node::Newline);
self.switch(ParserState::WroteNewline);
},
Token::Space => self.append_space_consumed(),
_ => {
self.append_space();
self.switch(ParserState::Body);
},
},
ParserState::WroteNewline => match token {
Token::Newline | Token::Space => self.append_space_consumed(),
_ => self.switch(ParserState::Body),
},
ParserState::Body => match token {
// Whitespace
Token::Space => self.append_space_consumed(),
Token::Newline => {
self.advance();
self.switch(ParserState::FirstNewline);
},
Ok(self.switch(ParserState::Body)) // Comments
Token::LineComment(_) | Token::BlockComment(_) => self.advance(),
Token::StarSlash => {
return Err(ParseError::new("unexpected end of block comment"));
},
// Anything else skips out of the function.
_ => break,
}
}
}
Ok(())
} }
/// Advance the iterator by one step. /// Advance the iterator by one step.
@ -492,16 +563,6 @@ impl<'s> Parser<'s> {
self.advance(); self.advance();
self.append_space(); self.append_space();
} }
/// Skip tokens until the condition is met.
fn skip_while<F>(&mut self, f: F) where F: Fn(Token) -> bool {
while let Some(token) = self.tokens.peek() {
if !f(token) {
break;
}
self.advance();
}
}
} }
/// Find the index of the first unbalanced (unescaped) closing bracket. /// Find the index of the first unbalanced (unescaped) closing bracket.
@ -623,7 +684,7 @@ mod token_tests {
use super::*; use super::*;
use Token::{Space as S, Newline as N, LeftBracket as L, RightBracket as R, use Token::{Space as S, Newline as N, LeftBracket as L, RightBracket as R,
Colon as C, Equals as E, DoubleUnderscore as DU, DoubleStar as DS, Colon as C, Equals as E, DoubleUnderscore as DU, DoubleStar as DS,
Dollar as D, Hashtag as H, Text as T}; Dollar as D, Text as T, LineComment as LC, BlockComment as BC, StarSlash as SS};
/// Test if the source code tokenizes to the tokens. /// Test if the source code tokenizes to the tokens.
fn test(src: &str, tokens: Vec<Token>) { fn test(src: &str, tokens: Vec<Token>) {
@ -638,7 +699,6 @@ mod token_tests {
test("[", vec![L]); test("[", vec![L]);
test("]", vec![R]); test("]", vec![R]);
test("$", vec![D]); test("$", vec![D]);
test("#", vec![H]);
test("**", vec![DS]); test("**", vec![DS]);
test("__", vec![DU]); test("__", vec![DU]);
test("\n", vec![N]); test("\n", vec![N]);
@ -709,11 +769,24 @@ mod token_tests {
T("v"), E, T("1"), R, L, T("hello"), R]); T("v"), E, T("1"), R, L, T("hello"), R]);
test("[func: __key__=value]", test("[func: __key__=value]",
vec![L, T("func"), C, S, T("__key__"), E, T("value"), R]); vec![L, T("func"), C, S, T("__key__"), E, T("value"), R]);
test("The /*[*/ answer: 7.",
vec![T("The"), S, BC("["), S, T("answer:"), S, T("7.")]);
} }
/// This test has a special look at the double underscore syntax, because /// Test if block and line comments get tokenized as expected.
/// per Unicode standard they are not separate words and thus harder to parse #[test]
/// than the stars. fn tokenize_comments() {
test("These // Line comments.",
vec![T("These"), S, LC(" Line comments.")]);
test("This /* is */ a comment.",
vec![T("This"), S, BC(" is "), S, T("a"), S, T("comment.")]);
test("[Head/*of*/][Body]", vec![L, T("Head"), BC("of"), R, L, T("Body"), R]);
test("/* Hey */ */", vec![BC(" Hey "), S, SS]);
test("Hey\n// Yoo /*\n*/", vec![T("Hey"), N, LC(" Yoo /*"), N, SS]);
test("/* My /* line // */ comment */", vec![BC(" My /* line // */ comment ")])
}
/// This test has a special look at the double underscore syntax.
#[test] #[test]
fn tokenize_double_underscore() { fn tokenize_double_underscore() {
test("he__llo__world_ _ __ Now this_ is__ special!", test("he__llo__world_ _ __ Now this_ is__ special!",
@ -876,6 +949,21 @@ mod parse_tests {
]); ]);
} }
/// Parse comments (line and block).
#[test]
fn parse_comments() {
let mut scope = Scope::new();
scope.add::<BodylessFn>("test");
scope.add::<TreeFn>("func");
test_scoped(&scope, "Text\n// Comment\n More text",
tree! [ T("Text"), S, T("More"), S, T("text") ]);
test_scoped(&scope, "[test/*world*/]",
tree! [ F(func! { name => "test", body => None }) ]);
test_scoped(&scope, "[test/*]*/]",
tree! [ F(func! { name => "test", body => None }) ]);
}
/// Test if escaped, but unbalanced parens are correctly parsed. /// Test if escaped, but unbalanced parens are correctly parsed.
#[test] #[test]
fn parse_unbalanced_body_parens() { fn parse_unbalanced_body_parens() {
@ -933,6 +1021,7 @@ mod parse_tests {
test_err("No functions here]", "unexpected closing bracket"); test_err("No functions here]", "unexpected closing bracket");
test_err_scoped(&scope, "[hello][world", "expected closing bracket"); test_err_scoped(&scope, "[hello][world", "expected closing bracket");
test_err("[hello world", "expected closing bracket"); test_err("[hello world", "expected closing bracket");
test_err("[ no-name][Why?]", "expected identifier"); test_err("[ no-name][Why?]", "invalid identifier: 'no-name'");
test_err("Hello */", "unexpected end of block comment");
} }
} }

View File

@ -9,7 +9,7 @@ use crate::func::Function;
pub enum Token<'s> { pub enum Token<'s> {
/// One or more whitespace (non-newline) codepoints. /// One or more whitespace (non-newline) codepoints.
Space, Space,
/// A line feed (either `\n` or `\r\n`). /// A line feed (`\n`, `\r\n` and some more as defined by the Unicode standard).
Newline, Newline,
/// A left bracket: `[`. /// A left bracket: `[`.
LeftBracket, LeftBracket,
@ -17,19 +17,27 @@ pub enum Token<'s> {
RightBracket, RightBracket,
/// A colon (`:`) indicating the beginning of function arguments. /// A colon (`:`) indicating the beginning of function arguments.
/// ///
/// If a colon occurs outside of the function header, it will be /// If a colon occurs outside of a function header, it will be
/// tokenized as a [Word](Token::Word). /// tokenized as a [Word](Token::Word).
Colon, Colon,
/// Same as with [Colon](Token::Colon). /// An equals (`=`) sign assigning a function argument a value.
///
/// Outside of functions headers, same as with [Colon](Token::Colon).
Equals, Equals,
/// Two underscores, indicating text in _italics_. /// Two underscores, indicating text in italics.
DoubleUnderscore, DoubleUnderscore,
/// Two stars, indicating **bold** text. /// Two stars, indicating bold text.
DoubleStar, DoubleStar,
/// A dollar sign, indicating _mathematical_ content. /// A dollar sign, indicating mathematical content.
Dollar, Dollar,
/// A hashtag starting a _comment_. /// A line comment.
Hashtag, LineComment(&'s str),
/// A block comment.
BlockComment(&'s str),
/// A star followed by a slash unexpectedly ending a block comment
/// (the comment was not started before, otherwise a
/// [BlockComment](Token::BlockComment) would be returned).
StarSlash,
/// Everything else is just text. /// Everything else is just text.
Text(&'s str), Text(&'s str),
} }