Parse line and block comments 📔

Laurenz 2019-05-03 12:41:18 +02:00
parent bc78974fd2
commit 5c66bac689
2 changed files with 202 additions and 105 deletions

View File

@@ -8,7 +8,7 @@ use smallvec::SmallVec;
use unicode_xid::UnicodeXID;
use crate::syntax::*;
use crate::func::Scope;
use crate::func::{Function, Scope};
/// Builds an iterator over the tokens of the source code.
@@ -99,7 +99,7 @@ impl<'s> Iterator for Tokens<'s> {
let afterwards = self.chars.peek().map(|p| p.1);
Some(match next {
// Special characters
// Functions
'[' => {
self.switch(TS::Function);
Token::LeftBracket
@@ -112,8 +112,47 @@ impl<'s> Iterator for Tokens<'s> {
}
Token::RightBracket
},
'$' => Token::Dollar,
'#' => Token::Hashtag,
// Line comment
'/' if afterwards == Some('/') => {
let mut end = self.chars.next().unwrap();
let start = end.0 + end.1.len_utf8();
while let Some((index, c)) = self.chars.peek() {
if is_newline_char(c) {
break;
}
self.advance();
end = (index, c);
}
let end = end.0 + end.1.len_utf8();
Token::LineComment(&self.src[start .. end])
},
// Block comment
'/' if afterwards == Some('*') => {
let mut end = self.chars.next().unwrap();
let start = end.0 + end.1.len_utf8();
let mut nested = 0;
while let Some((index, c)) = self.chars.next() {
let after = self.chars.peek().map(|p| p.1);
match (c, after) {
('*', Some('/')) if nested == 0 => { self.advance(); break },
('/', Some('*')) => { self.advance(); nested += 1 },
('*', Some('/')) => { self.advance(); nested -= 1 },
_ => {},
}
end = (index, c);
}
let end = end.0 + end.1.len_utf8();
Token::BlockComment(&self.src[start .. end])
},
// Unexpected end of block comment
'*' if afterwards == Some('/') => self.consumed(Token::StarSlash),
// Whitespace
' ' | '\t' => {
@@ -126,25 +165,26 @@ impl<'s> Iterator for Tokens<'s> {
Token::Space
}
// Newlines
'\r' if afterwards == Some('\n') => self.consumed(Token::Newline),
c if is_newline_char(c) => Token::Newline,
// Context sensitive operators in headers
':' if self.state == TS::Function => Token::Colon,
'=' if self.state == TS::Function => Token::Equals,
// Double star/underscore in bodies
// Double star/underscore and dollar in bodies
'*' if self.state == TS::Body && afterwards == Some('*')
=> self.consumed(Token::DoubleStar),
'_' if self.state == TS::Body && afterwards == Some('_')
=> self.consumed(Token::DoubleUnderscore),
// Newlines
'\r' if afterwards == Some('\n') => self.consumed(Token::Newline),
c if is_newline_char(c) => Token::Newline,
'$' if self.state == TS::Body => Token::Dollar,
// Escaping
'\\' => {
if let Some((index, c)) = self.chars.peek() {
let escapable = match c {
'[' | ']' | '$' | '#' | '\\' | '*' | '_' => true,
'[' | ']' | '$' | '#' | '\\' | '*' | '_' | '/' => true,
_ => false,
};
@@ -162,15 +202,18 @@ impl<'s> Iterator for Tokens<'s> {
// Find out when the word ends.
let mut end = (next_pos, next);
while let Some((index, c)) = self.chars.peek() {
let second = self.chars.peek_second().map(|p| p.1);
// Whether the next token is still from the next or not.
let continues = match c {
'[' | ']' | '$' | '#' | '\\' => false,
':' | '=' if self.state == TS::Function => false,
'*' if self.state == TS::Body
=> self.chars.peek_second().map(|p| p.1) != Some('*'),
'_' if self.state == TS::Body
=> self.chars.peek_second().map(|p| p.1) != Some('_'),
'*' if self.state == TS::Body => second != Some('*'),
'_' if self.state == TS::Body => second != Some('_'),
'/' => second != Some('/') && second != Some('*'),
'*' => second != Some('/'),
' ' | '\t' => false,
c if is_newline_char(c) => false,
@@ -321,94 +364,89 @@ impl<'s> Parser<'s> {
/// Parse the source into an abstract syntax tree.
fn parse(mut self) -> ParseResult<SyntaxTree> {
use ParserState as PS;
while let Some(token) = self.tokens.peek() {
// Skip over comments.
if token == Token::Hashtag {
self.skip_while(|t| t != Token::Newline);
self.advance();
}
// Handles all the states.
match self.state {
PS::FirstNewline => match token {
Token::Newline => {
self.append_consumed(Node::Newline);
self.switch(PS::WroteNewline);
},
Token::Space => self.append_space_consumed(),
_ => {
self.append_space();
self.switch(PS::Body);
},
}
PS::WroteNewline => match token {
Token::Newline | Token::Space => self.append_space_consumed(),
_ => self.switch(PS::Body),
}
PS::Body => match token {
// Whitespace
Token::Space => self.append_space_consumed(),
Token::Newline => {
self.advance();
self.switch(PS::FirstNewline);
},
// Text
Token::Text(word) => self.append_consumed(Node::Text(word.to_owned())),
// Functions
Token::LeftBracket => self.parse_function()?,
Token::RightBracket => {
return Err(ParseError::new("unexpected closing bracket"));
},
// Modifiers
Token::DoubleUnderscore => self.append_consumed(Node::ToggleItalics),
Token::DoubleStar => self.append_consumed(Node::ToggleBold),
Token::Dollar => self.append_consumed(Node::ToggleMath),
// Should not happen
Token::Colon | Token::Equals | Token::Hashtag => unreachable!(),
},
}
// Loop through all the tokens.
while self.tokens.peek().is_some() {
self.parse_white()?;
self.parse_body_part()?;
}
Ok(self.tree)
}
/// Parse a function from the current position.
fn parse_function(&mut self) -> ParseResult<()> {
/// Parse part of the body.
fn parse_body_part(&mut self) -> ParseResult<()> {
if let Some(token) = self.tokens.peek() {
match token {
// Functions
Token::LeftBracket => self.parse_func()?,
Token::RightBracket => return Err(ParseError::new("unexpected closing bracket")),
// Modifiers
Token::DoubleUnderscore => self.append_consumed(Node::ToggleItalics),
Token::DoubleStar => self.append_consumed(Node::ToggleBold),
Token::Dollar => self.append_consumed(Node::ToggleMath),
// Normal text
Token::Text(word) => self.append_consumed(Node::Text(word.to_owned())),
Token::Colon | Token::Equals => panic!("bad token for body: {:?}", token),
// The rest is handled elsewhere or should not happen, because Tokens does
// not yield colons or equals in the body, but their text equivalents instead.
_ => panic!("unexpected token: {:?}", token),
}
}
Ok(())
}
/// Parse a complete function from the current position.
fn parse_func(&mut self) -> ParseResult<()> {
// This should only be called if a left bracket was seen.
assert!(self.tokens.next() == Some(Token::LeftBracket));
let header = self.parse_func_header()?;
let body = self.parse_func_body(&header)?;
// Finally this function is parsed to the end.
self.append(Node::Func(FuncCall {
header,
body,
}));
Ok(self.switch(ParserState::Body))
}
/// Parse a function header.
fn parse_func_header(&mut self) -> ParseResult<FuncHeader> {
// The next token should be the name of the function.
self.parse_white()?;
let name = match self.tokens.next() {
Some(Token::Text(word)) => {
if is_identifier(word) {
Ok(word.to_owned())
} else {
Err(ParseError::new("invalid identifier"))
Err(ParseError::new(format!("invalid identifier: '{}'", word)))
}
},
_ => Err(ParseError::new("expected identifier")),
}?;
// Now the header should be closed.
self.parse_white()?;
if self.tokens.next() != Some(Token::RightBracket) {
return Err(ParseError::new("expected closing bracket"));
}
// Store the header information of the function invocation.
let header = FuncHeader {
Ok(FuncHeader {
name,
args: vec![],
kwargs: HashMap::new(),
};
})
}
/// Parse the body of a function.
fn parse_func_body(&mut self, header: &FuncHeader) -> ParseResult<Box<dyn Function>> {
// Whether the function has a body.
let has_body = self.tokens.peek() == Some(Token::LeftBracket);
if has_body {
@@ -420,7 +458,7 @@ impl<'s> Parser<'s> {
.ok_or_else(|| ParseError::new(format!("unknown function: '{}'", &header.name)))?;
// Do the parsing dependent on whether the function has a body.
let body = if has_body {
Ok(if has_body {
// Find out the string which makes the body of this function.
let (start, end) = self.tokens.current_index().and_then(|index| {
find_closing_bracket(&self.src[index..])
@@ -448,15 +486,48 @@ impl<'s> Parser<'s> {
body: None,
scope: &self.scope,
})?
};
})
}
// Finally this function is parsed to the end.
self.append(Node::Func(FuncCall {
header,
body,
}));
/// Parse whitespace (as long as there is any) and skip over comments.
fn parse_white(&mut self) -> ParseResult<()> {
while let Some(token) = self.tokens.peek() {
match self.state {
ParserState::FirstNewline => match token {
Token::Newline => {
self.append_consumed(Node::Newline);
self.switch(ParserState::WroteNewline);
},
Token::Space => self.append_space_consumed(),
_ => {
self.append_space();
self.switch(ParserState::Body);
},
},
ParserState::WroteNewline => match token {
Token::Newline | Token::Space => self.append_space_consumed(),
_ => self.switch(ParserState::Body),
},
ParserState::Body => match token {
// Whitespace
Token::Space => self.append_space_consumed(),
Token::Newline => {
self.advance();
self.switch(ParserState::FirstNewline);
},
Ok(self.switch(ParserState::Body))
// Comments
Token::LineComment(_) | Token::BlockComment(_) => self.advance(),
Token::StarSlash => {
return Err(ParseError::new("unexpected end of block comment"));
},
// Anything else skips out of the function.
_ => break,
}
}
}
Ok(())
}
/// Advance the iterator by one step.
@@ -492,16 +563,6 @@ impl<'s> Parser<'s> {
self.advance();
self.append_space();
}
/// Skip tokens until the condition is met.
fn skip_while<F>(&mut self, f: F) where F: Fn(Token) -> bool {
while let Some(token) = self.tokens.peek() {
if !f(token) {
break;
}
self.advance();
}
}
}
/// Find the index of the first unbalanced (unescaped) closing bracket.
@@ -623,7 +684,7 @@ mod token_tests {
use super::*;
use Token::{Space as S, Newline as N, LeftBracket as L, RightBracket as R,
Colon as C, Equals as E, DoubleUnderscore as DU, DoubleStar as DS,
Dollar as D, Hashtag as H, Text as T};
Dollar as D, Text as T, LineComment as LC, BlockComment as BC, StarSlash as SS};
/// Test if the source code tokenizes to the tokens.
fn test(src: &str, tokens: Vec<Token>) {
@@ -638,7 +699,6 @@ mod token_tests {
test("[", vec![L]);
test("]", vec![R]);
test("$", vec![D]);
test("#", vec![H]);
test("**", vec![DS]);
test("__", vec![DU]);
test("\n", vec![N]);
@@ -709,11 +769,24 @@ mod token_tests {
T("v"), E, T("1"), R, L, T("hello"), R]);
test("[func: __key__=value]",
vec![L, T("func"), C, S, T("__key__"), E, T("value"), R]);
test("The /*[*/ answer: 7.",
vec![T("The"), S, BC("["), S, T("answer:"), S, T("7.")]);
}
/// This test has a special look at the double underscore syntax, because
/// per Unicode standard they are not separate words and thus harder to parse
/// than the stars.
/// Test if block and line comments get tokenized as expected.
#[test]
fn tokenize_comments() {
test("These // Line comments.",
vec![T("These"), S, LC(" Line comments.")]);
test("This /* is */ a comment.",
vec![T("This"), S, BC(" is "), S, T("a"), S, T("comment.")]);
test("[Head/*of*/][Body]", vec![L, T("Head"), BC("of"), R, L, T("Body"), R]);
test("/* Hey */ */", vec![BC(" Hey "), S, SS]);
test("Hey\n// Yoo /*\n*/", vec![T("Hey"), N, LC(" Yoo /*"), N, SS]);
test("/* My /* line // */ comment */", vec![BC(" My /* line // */ comment ")])
}
/// This test has a special look at the double underscore syntax.
#[test]
fn tokenize_double_underscore() {
test("he__llo__world_ _ __ Now this_ is__ special!",
@@ -876,6 +949,21 @@ mod parse_tests {
]);
}
/// Parse comments (line and block).
#[test]
fn parse_comments() {
let mut scope = Scope::new();
scope.add::<BodylessFn>("test");
scope.add::<TreeFn>("func");
test_scoped(&scope, "Text\n// Comment\n More text",
tree! [ T("Text"), S, T("More"), S, T("text") ]);
test_scoped(&scope, "[test/*world*/]",
tree! [ F(func! { name => "test", body => None }) ]);
test_scoped(&scope, "[test/*]*/]",
tree! [ F(func! { name => "test", body => None }) ]);
}
/// Test if escaped, but unbalanced parens are correctly parsed.
#[test]
fn parse_unbalanced_body_parens() {
@@ -933,6 +1021,7 @@ mod parse_tests {
test_err("No functions here]", "unexpected closing bracket");
test_err_scoped(&scope, "[hello][world", "expected closing bracket");
test_err("[hello world", "expected closing bracket");
test_err("[ no-name][Why?]", "expected identifier");
test_err("[ no-name][Why?]", "invalid identifier: 'no-name'");
test_err("Hello */", "unexpected end of block comment");
}
}
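
Aside (not part of the commit): the depth-counting scan in the tokenizer's block comment arm above can be hard to follow in diff form. Below is a minimal, self-contained sketch of the same idea; the function name, signature, and `main` are illustrative only, whereas the real tokenizer walks a peekable char-index iterator and slices `self.src` directly.

/// Illustrative sketch: given input that starts right after an opening `/*`,
/// return the byte length of the comment body, or `None` if the comment is
/// never terminated.
fn block_comment_body_len(src: &str) -> Option<usize> {
    let mut iter = src.char_indices().peekable();
    let mut depth = 0usize;
    while let Some((index, c)) = iter.next() {
        let after = iter.peek().map(|&(_, c)| c);
        match (c, after) {
            // A `*/` at depth zero terminates the outermost comment.
            ('*', Some('/')) if depth == 0 => return Some(index),
            // A `/*` opens a nested comment, a `*/` closes one.
            ('/', Some('*')) => { iter.next(); depth += 1 },
            ('*', Some('/')) => { iter.next(); depth -= 1 },
            _ => {}
        }
    }
    None
}

fn main() {
    // Mirrors the nested case from `tokenize_comments` above.
    let src = " My /* line // */ comment */";
    assert_eq!(
        block_comment_body_len(src),
        Some(" My /* line // */ comment ".len()),
    );
}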

View File

@@ -9,7 +9,7 @@ use crate::func::Function;
pub enum Token<'s> {
/// One or more whitespace (non-newline) codepoints.
Space,
/// A line feed (either `\n` or `\r\n`).
/// A line feed (`\n`, `\r\n` and some more as defined by the Unicode standard).
Newline,
/// A left bracket: `[`.
LeftBracket,
@@ -17,19 +17,27 @@ pub enum Token<'s> {
RightBracket,
/// A colon (`:`) indicating the beginning of function arguments.
///
/// If a colon occurs outside of the function header, it will be
/// If a colon occurs outside of a function header, it will be
/// tokenized as a [Word](Token::Word).
Colon,
/// Same as with [Colon](Token::Colon).
/// An equals (`=`) sign assigning a function argument a value.
///
/// Outside of functions headers, same as with [Colon](Token::Colon).
Equals,
/// Two underscores, indicating text in _italics_.
/// Two underscores, indicating text in italics.
DoubleUnderscore,
/// Two stars, indicating **bold** text.
/// Two stars, indicating bold text.
DoubleStar,
/// A dollar sign, indicating _mathematical_ content.
/// A dollar sign, indicating mathematical content.
Dollar,
/// A hashtag starting a _comment_.
Hashtag,
/// A line comment.
LineComment(&'s str),
/// A block comment.
BlockComment(&'s str),
/// A star followed by a slash unexpectedly ending a block comment
/// (the comment was not started before, otherwise a
/// [BlockComment](Token::BlockComment) would be returned).
StarSlash,
/// Everything else is just text.
Text(&'s str),
}
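
Aside (not part of the commit): for context on how these variants are consumed, the parser's new `parse_white` skips both comment kinds and reports a stray `*/` as an error. A rough sketch of that classification, using a trimmed-down local copy of the enum (the real `Token` above has more variants), could look like this:

#[derive(Debug, Clone, Copy, PartialEq)]
enum Token<'s> {
    LineComment(&'s str),
    BlockComment(&'s str),
    StarSlash,
    Text(&'s str),
}

/// Sketch only: decide whether a token can be silently skipped while parsing
/// whitespace, or whether it is a hard error.
fn skippable(token: Token) -> Result<bool, &'static str> {
    match token {
        // Both comment kinds carry their text, but the parser ignores it.
        Token::LineComment(_) | Token::BlockComment(_) => Ok(true),
        // A `*/` without a matching `/*` is reported, as in `parse_white`.
        Token::StarSlash => Err("unexpected end of block comment"),
        _ => Ok(false),
    }
}

fn main() {
    assert_eq!(skippable(Token::LineComment(" hi")), Ok(true));
    assert_eq!(skippable(Token::Text("hi")), Ok(false));
    assert!(skippable(Token::StarSlash).is_err());
}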