mirror of
https://github.com/typst/typst
synced 2025-05-14 04:56:26 +08:00
Improve tokenizer 🛢
This commit is contained in:
parent
9d605c3128
commit
632bf2f2ef
152
src/parsing.rs
152
src/parsing.rs
@ -16,7 +16,7 @@ use crate::func::{ParseContext, Scope};
|
|||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct Tokens<'s> {
|
pub struct Tokens<'s> {
|
||||||
source: &'s str,
|
source: &'s str,
|
||||||
chars: Peekable<CharIndices<'s>>,
|
chars: PeekableChars<'s>,
|
||||||
state: TokensState,
|
state: TokensState,
|
||||||
stack: Vec<TokensState>,
|
stack: Vec<TokensState>,
|
||||||
}
|
}
|
||||||
@ -39,7 +39,7 @@ impl<'s> Tokens<'s> {
|
|||||||
pub fn new(source: &'s str) -> Tokens<'s> {
|
pub fn new(source: &'s str) -> Tokens<'s> {
|
||||||
Tokens {
|
Tokens {
|
||||||
source,
|
source,
|
||||||
chars: source.char_indices().peekable(),
|
chars: PeekableChars::new(source),
|
||||||
state: TokensState::Body,
|
state: TokensState::Body,
|
||||||
stack: vec![],
|
stack: vec![],
|
||||||
}
|
}
|
||||||
@ -80,20 +80,6 @@ impl<'s> Iterator for Tokens<'s> {
|
|||||||
fn next(&mut self) -> Option<Token<'s>> {
|
fn next(&mut self) -> Option<Token<'s>> {
|
||||||
use TokensState as TS;
|
use TokensState as TS;
|
||||||
|
|
||||||
// Skip whitespace, but if at least one whitespace character existed,
|
|
||||||
// remember that, because then we return a space token.
|
|
||||||
let mut whitespace = false;
|
|
||||||
while let Some(&(_, c)) = self.chars.peek() {
|
|
||||||
if !c.is_whitespace() || c == '\n' || c == '\r' {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
whitespace = true;
|
|
||||||
self.advance();
|
|
||||||
}
|
|
||||||
if whitespace {
|
|
||||||
return Some(Token::Space);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Function maybe has a body
|
// Function maybe has a body
|
||||||
if self.state == TS::MaybeBody {
|
if self.state == TS::MaybeBody {
|
||||||
if self.chars.peek()?.1 == '[' {
|
if self.chars.peek()?.1 == '[' {
|
||||||
@ -107,7 +93,7 @@ impl<'s> Iterator for Tokens<'s> {
|
|||||||
// Now all special cases are handled and we can finally look at the
|
// Now all special cases are handled and we can finally look at the
|
||||||
// next words.
|
// next words.
|
||||||
let (next_pos, next) = self.chars.next()?;
|
let (next_pos, next) = self.chars.next()?;
|
||||||
let afterwards = self.chars.peek().map(|&(_, c)| c);
|
let afterwards = self.chars.peek().map(|p| p.1);
|
||||||
|
|
||||||
Some(match next {
|
Some(match next {
|
||||||
// Special characters
|
// Special characters
|
||||||
@ -124,22 +110,40 @@ impl<'s> Iterator for Tokens<'s> {
|
|||||||
'$' => Token::Dollar,
|
'$' => Token::Dollar,
|
||||||
'#' => Token::Hashtag,
|
'#' => Token::Hashtag,
|
||||||
|
|
||||||
// Context sensitive operators
|
// Whitespace
|
||||||
|
' ' | '\t' => {
|
||||||
|
while let Some((_, c)) = self.chars.peek() {
|
||||||
|
match c {
|
||||||
|
' ' | '\t' => self.advance(),
|
||||||
|
_ => break,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Token::Space
|
||||||
|
}
|
||||||
|
|
||||||
|
// Context sensitive operators in headers
|
||||||
':' if self.state == TS::Function => Token::Colon,
|
':' if self.state == TS::Function => Token::Colon,
|
||||||
'=' if self.state == TS::Function => Token::Equals,
|
'=' if self.state == TS::Function => Token::Equals,
|
||||||
|
|
||||||
// Double star/underscore
|
// Double star/underscore in bodies
|
||||||
'*' if afterwards == Some('*') => self.consumed(Token::DoubleStar),
|
'*' if self.state == TS::Body && afterwards == Some('*')
|
||||||
'_' if afterwards == Some('_') => self.consumed(Token::DoubleUnderscore),
|
=> self.consumed(Token::DoubleStar),
|
||||||
|
'_' if self.state == TS::Body && afterwards == Some('_')
|
||||||
|
=> self.consumed(Token::DoubleUnderscore),
|
||||||
|
|
||||||
// Newlines
|
// Newlines
|
||||||
'\n' => Token::Newline,
|
|
||||||
'\r' if afterwards == Some('\n') => self.consumed(Token::Newline),
|
'\r' if afterwards == Some('\n') => self.consumed(Token::Newline),
|
||||||
|
c if is_newline_char(c) => Token::Newline,
|
||||||
|
|
||||||
// Escaping
|
// Escaping
|
||||||
'\\' => {
|
'\\' => {
|
||||||
if let Some(&(index, c)) = self.chars.peek() {
|
if let Some((index, c)) = self.chars.peek() {
|
||||||
if is_special_character(c) {
|
let escapable = match c {
|
||||||
|
'[' | ']' | '$' | '#' | '\\' | '*' | '_' => true,
|
||||||
|
_ => false,
|
||||||
|
};
|
||||||
|
|
||||||
|
if escapable {
|
||||||
self.advance();
|
self.advance();
|
||||||
return Some(self.text(index, index + c.len_utf8()));
|
return Some(self.text(index, index + c.len_utf8()));
|
||||||
}
|
}
|
||||||
@ -148,14 +152,31 @@ impl<'s> Iterator for Tokens<'s> {
|
|||||||
Token::Text("\\")
|
Token::Text("\\")
|
||||||
},
|
},
|
||||||
|
|
||||||
// Now it seems like it's just a normal word.
|
// Normal text
|
||||||
_ => {
|
_ => {
|
||||||
// Find out when the word ends.
|
// Find out when the word ends.
|
||||||
let mut end = (next_pos, next);
|
let mut end = (next_pos, next);
|
||||||
while let Some(&(index, c)) = self.chars.peek() {
|
while let Some((index, c)) = self.chars.peek() {
|
||||||
if is_special_character(c) || c.is_whitespace() {
|
// Whether the next character still belongs to the current word or not.
|
||||||
|
let continues = match c {
|
||||||
|
'[' | ']' | '$' | '#' | '\\' => false,
|
||||||
|
':' | '=' if self.state == TS::Function => false,
|
||||||
|
|
||||||
|
'*' if self.state == TS::Body
|
||||||
|
=> self.chars.peek_second().map(|p| p.1) != Some('*'),
|
||||||
|
'_' if self.state == TS::Body
|
||||||
|
=> self.chars.peek_second().map(|p| p.1) != Some('_'),
|
||||||
|
|
||||||
|
' ' | '\t' => false,
|
||||||
|
c if is_newline_char(c) => false,
|
||||||
|
|
||||||
|
_ => true,
|
||||||
|
};
|
||||||
|
|
||||||
|
if !continues {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
end = (index, c);
|
end = (index, c);
|
||||||
self.advance();
|
self.advance();
|
||||||
}
|
}
|
||||||
@ -167,14 +188,66 @@ impl<'s> Iterator for Tokens<'s> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Whether this character is a newline (or starts one).
///
/// Returns `true` for every single-character line terminator: line feed,
/// vertical tab, form feed, carriage return, next line (U+0085), line
/// separator (U+2028) and paragraph separator (U+2029). Everything else,
/// including spaces and tabs, yields `false`.
///
/// A carriage return is reported as a newline on its own; pairing it with a
/// following line feed is left to the caller.
fn is_newline_char(character: char) -> bool {
    match character {
        // Line feed, vertical tab, form feed, carriage return.
        '\n' | '\u{000b}' | '\u{000c}' | '\r' => true,
        // Next line, line separator, paragraph separator.
        '\u{0085}' | '\u{2028}' | '\u{2029}' => true,
        _ => false,
    }
}
|
||||||
|
|
||||||
|
/// An iterator over `(byte index, char)` pairs with two items of lookahead.
#[derive(Debug, Clone)]
struct PeekableChars<'s> {
    /// The underlying iterator, positioned after any buffered items.
    chars: CharIndices<'s>,
    /// Buffered next item. The outer `None` means nothing is buffered, while
    /// `Some(None)` means the end of the input has been buffered.
    peek1: Option<Option<(usize, char)>>,
    /// Buffered item after the next one. Only filled once `peek1` is filled.
    peek2: Option<Option<(usize, char)>>,
}

impl<'s> PeekableChars<'s> {
    /// Create a new iterator over the characters of `string`.
    fn new(string: &'s str) -> PeekableChars<'s> {
        let chars = string.char_indices();
        PeekableChars { chars, peek1: None, peek2: None }
    }

    /// Look at the next item without consuming it.
    fn peek(&mut self) -> Option<(usize, char)> {
        if let Some(buffered) = self.peek1 {
            return buffered;
        }

        let upcoming = self.chars.next();
        self.peek1 = Some(upcoming);
        upcoming
    }

    /// Look at the item after the next one without consuming anything.
    fn peek_second(&mut self) -> Option<(usize, char)> {
        if let Some(buffered) = self.peek2 {
            return buffered;
        }

        // The first slot has to be filled before the second one so that the
        // items stay in order.
        self.peek();

        let following = self.chars.next();
        self.peek2 = Some(following);
        following
    }
}

impl Iterator for PeekableChars<'_> {
    type Item = (usize, char);

    /// Yield the next item, draining the lookahead buffer first.
    fn next(&mut self) -> Option<(usize, char)> {
        if let Some(buffered) = self.peek1.take() {
            // Shift the second lookahead slot into the first one.
            self.peek1 = self.peek2.take();
            buffered
        } else {
            self.chars.next()
        }
    }
}
|
||||||
|
|
||||||
/// Transforms token streams to syntax trees.
|
/// Transforms token streams to syntax trees.
|
||||||
pub struct Parser<'s, 't> {
|
pub struct Parser<'s, 't> {
|
||||||
tokens: &'s mut ParseTokens<'t>,
|
tokens: &'s mut ParseTokens<'t>,
|
||||||
@ -506,18 +579,11 @@ impl<'s> Iterator for ParseTokens<'s> {
|
|||||||
|
|
||||||
/// More useful functions on `str`'s.
|
/// More useful functions on `str`'s.
|
||||||
trait StrExt {
|
trait StrExt {
|
||||||
/// Whether self consists only of whitespace.
|
|
||||||
fn is_whitespace(&self) -> bool;
|
|
||||||
|
|
||||||
/// Whether this word is a valid unicode identifier.
|
/// Whether this word is a valid unicode identifier.
|
||||||
fn is_identifier(&self) -> bool;
|
fn is_identifier(&self) -> bool;
|
||||||
}
|
}
|
||||||
|
|
||||||
impl StrExt for str {
|
impl StrExt for str {
|
||||||
fn is_whitespace(&self) -> bool {
|
|
||||||
self.chars().all(|c| c.is_whitespace() && c != '\n')
|
|
||||||
}
|
|
||||||
|
|
||||||
fn is_identifier(&self) -> bool {
|
fn is_identifier(&self) -> bool {
|
||||||
let mut chars = self.chars();
|
let mut chars = self.chars();
|
||||||
|
|
||||||
@ -599,8 +665,6 @@ mod token_tests {
|
|||||||
test(r"\]", vec![T("]")]);
|
test(r"\]", vec![T("]")]);
|
||||||
test(r"\#", vec![T("#")]);
|
test(r"\#", vec![T("#")]);
|
||||||
test(r"\$", vec![T("$")]);
|
test(r"\$", vec![T("$")]);
|
||||||
test(r"\:", vec![T(":")]);
|
|
||||||
test(r"\=", vec![T("=")]);
|
|
||||||
test(r"\**", vec![T("*"), T("*")]);
|
test(r"\**", vec![T("*"), T("*")]);
|
||||||
test(r"\*", vec![T("*")]);
|
test(r"\*", vec![T("*")]);
|
||||||
test(r"\__", vec![T("_"), T("_")]);
|
test(r"\__", vec![T("_"), T("_")]);
|
||||||
@ -639,12 +703,12 @@ mod token_tests {
|
|||||||
fn tokenize_symbols_context() {
|
fn tokenize_symbols_context() {
|
||||||
test("[func: key=value][Answer: 7]",
|
test("[func: key=value][Answer: 7]",
|
||||||
vec![L, T("func"), C, S, T("key"), E, T("value"), R, L,
|
vec![L, T("func"), C, S, T("key"), E, T("value"), R, L,
|
||||||
T("Answer"), T(":"), S, T("7"), R]);
|
T("Answer:"), S, T("7"), R]);
|
||||||
test("[[n: k=v]:x][:[=]]:=",
|
test("[[n: k=v]:x][:[=]]:=",
|
||||||
vec![L, L, T("n"), C, S, T("k"), E, T("v"), R, C, T("x"), R,
|
vec![L, L, T("n"), C, S, T("k"), E, T("v"), R, C, T("x"), R,
|
||||||
L, T(":"), L, E, R, R, T(":"), T("=")]);
|
L, T(":"), L, E, R, R, T(":=")]);
|
||||||
test("[func: __key__=value]",
|
test("[func: __key__=value]",
|
||||||
vec![L, T("func"), C, S, DU, T("key"), DU, E, T("value"), R]);
|
vec![L, T("func"), C, S, T("__key__"), E, T("value"), R]);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This test has a special look at the double underscore syntax, because
|
/// This test has a special look at the double underscore syntax, because
|
||||||
@ -653,8 +717,8 @@ mod token_tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn tokenize_double_underscore() {
|
fn tokenize_double_underscore() {
|
||||||
test("he__llo__world_ _ __ Now this_ is__ special!",
|
test("he__llo__world_ _ __ Now this_ is__ special!",
|
||||||
vec![T("he"), DU, T("llo"), DU, T("world"), T("_"), S, T("_"), S, DU, S, T("Now"), S,
|
vec![T("he"), DU, T("llo"), DU, T("world_"), S, T("_"), S, DU, S, T("Now"), S,
|
||||||
T("this"), T("_"), S, T("is"), DU, S, T("special!")]);
|
T("this_"), S, T("is"), DU, S, T("special!")]);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This test is for checking if non-ASCII characters get parsed correctly.
|
/// This test is for checking if non-ASCII characters get parsed correctly.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user