use std::str::CharIndices;
use smallvec::SmallVec;
use super::*;

/// Builds an iterator over the tokens of the source code.
pub fn tokenize(src: &str) -> Tokens {
    Tokens::new(src)
}
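
// A minimal usage sketch (illustrative, not part of the original file; it assumes
// `Token` and `Span` implement `Debug`). The iterator yields `Spanned` tokens
// whose `v` and `span` fields are also used by the tests at the bottom:
//
//     for token in tokenize("[page: size=A4]") {
//         println!("{:?} at {:?}", token.v, token.span);
//     }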

/// An iterator over the tokens of source code.
#[derive(Debug, Clone)]
pub struct Tokens<'s> {
    src: &'s str,
    chars: PeekableChars<'s>,
    state: TokensState,
    stack: SmallVec<[TokensState; 1]>,
}

/// The state the tokenizer is in.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
enum TokensState {
    /// The base state when we are not inside anything special.
    Body,
    /// Inside a function header. Here, colons and equals signs are parsed
    /// as distinct tokens rather than as text.
    Function,
    /// We expect either the end of the function or the beginning of the body.
    MaybeBody,
}
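
// Illustrative state trace (not from the original file) for the input
// `[page: size=A4][body]`:
//
//     Body --'['--> Function --']'--> MaybeBody --'['--> Body --']'--> Body
//
// Inside `Function`, `:` and `=` are emitted as `Colon`/`Equals` tokens; inside
// `Body`, the same characters are simply part of the text.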

impl<'s> Tokens<'s> {
    /// Create a new token stream from source code.
    pub fn new(src: &'s str) -> Tokens<'s> {
        Tokens {
            src,
            chars: PeekableChars::new(src),
            state: TokensState::Body,
            stack: SmallVec::new(),
        }
    }

    /// The index of the first character of the next token in the source string.
    pub fn string_index(&mut self) -> usize {
        self.chars.string_index()
    }

    /// Go to a new position in the underlying string.
    pub fn set_string_index(&mut self, index: usize) {
        self.chars.set_string_index(index);
    }
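
    // Rewind sketch (illustrative): a caller can save `string_index()`, consume
    // tokens speculatively, and jump back so that tokenization resumes at the
    // saved position. Note that this only moves the character cursor; the
    // tokenizer state is left untouched.
    //
    //     let saved = tokens.string_index();
    //     let _ = tokens.next();
    //     tokens.set_string_index(saved);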

    /// Advance the iterator by one step.
    fn advance(&mut self) {
        self.chars.next();
    }

    /// Switch to the given state.
    fn switch(&mut self, state: TokensState) {
        self.stack.push(self.state);
        self.state = state;
    }

    /// Go back to the top-of-stack state.
    fn unswitch(&mut self) {
        self.state = self.stack.pop().unwrap_or(TokensState::Body);
    }
}

impl<'s> Iterator for Tokens<'s> {
    type Item = Spanned<Token<'s>>;

    /// Advance the iterator and return the next token, or `None` when the
    /// source is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        use TokensState as TS;

        // Go to the body state if the function has a body, or return to the
        // top-of-stack state otherwise.
        if self.state == TS::MaybeBody {
            if let Some((index, '[')) = self.chars.peek() {
                self.advance();
                self.state = TS::Body;
                return Some(Spanned::new(Token::LeftBracket, Span::at(index)));
            } else {
                self.unswitch();
            }
        }

        // Take the next char and peek at the one after it.
        let (pos, next) = self.chars.next()?;
        let afterwards = self.chars.peekc();

        let token = match next {
            // Functions
            '[' => {
                self.switch(TS::Function);
                Token::LeftBracket
            }
            ']' => {
                if self.state == TS::Function {
                    self.state = TS::MaybeBody;
                } else {
                    self.unswitch();
                }

                Token::RightBracket
            }

            // Line comment
            '/' if afterwards == Some('/') => {
                let start = self.string_index() + 1;

                while let Some(c) = self.chars.peekc() {
                    if is_newline_char(c) {
                        break;
                    }
                    self.advance();
                }

                let end = self.string_index();
                Token::LineComment(&self.src[start..end])
            }

            // Block comment
            '/' if afterwards == Some('*') => {
                let start = self.string_index() + 1;
                let mut nested = 0;

                while let Some((_, c)) = self.chars.next() {
                    let after = self.chars.peekc();
                    match (c, after) {
                        ('*', Some('/')) if nested == 0 => {
                            self.advance();
                            break;
                        }
                        ('/', Some('*')) => {
                            self.advance();
                            nested += 1;
                        }
                        ('*', Some('/')) => {
                            self.advance();
                            nested -= 1;
                        }
                        _ => {}
                    }
                }

                let end = self.string_index() - 2;
                Token::BlockComment(&self.src[start..end])
            }
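
            // Nesting is honored: as `tokenize_comments` below checks,
            // `/* My /* line // */ comment */` yields a single
            // `BlockComment(" My /* line // */ comment ")` token.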

            // Unexpected end of block comment
            '*' if afterwards == Some('/') => {
                self.advance();
                Token::StarSlash
            }

            // Whitespace
            ' ' | '\t' => {
                while let Some(c) = self.chars.peekc() {
                    match c {
                        ' ' | '\t' => self.advance(),
                        _ => break,
                    }
                }

                Token::Space
            }

            // Newlines
            '\r' if afterwards == Some('\n') => {
                self.advance();
                Token::Newline
            }
            c if is_newline_char(c) => Token::Newline,

            // Star/Underscore/Backtick in bodies
            '*' if self.state == TS::Body => Token::Star,
            '_' if self.state == TS::Body => Token::Underscore,
            '`' if self.state == TS::Body => Token::Backtick,

            // Context-sensitive operators in headers
            ':' if self.state == TS::Function => Token::Colon,
            '=' if self.state == TS::Function => Token::Equals,
            ',' if self.state == TS::Function => Token::Comma,

            // A string value.
            '"' if self.state == TS::Function => {
                let start = self.string_index();
                let mut escaped = false;

                while let Some((_, c)) = self.chars.next() {
                    if c == '"' && !escaped {
                        break;
                    }

                    escaped = c == '\\';
                }

                let end = self.string_index() - 1;
                Token::Quoted(&self.src[start..end])
            }
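
            // The escape sequence stays verbatim in the token: as `tokenize_quoted`
            // below checks, `[align: "hello\"world"]` yields `Quoted(r#"hello\"world"#)`,
            // so actual unescaping presumably happens at a later stage.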

            // Escaping
            '\\' => {
                if let Some((index, c)) = self.chars.peek() {
                    let escapable = match c {
                        '[' | ']' | '\\' | '*' | '_' | '`' | ':' | '=' | ',' | '/' => true,
                        _ => false,
                    };

                    if escapable {
                        self.advance();
                        Token::Text(&self.src[index..index + c.len_utf8()])
                    } else {
                        Token::Text("\\")
                    }
                } else {
                    Token::Text("\\")
                }
            }
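
            // Examples from `tokenize_escape` below: `\*` yields `Text("*")`, while
            // `\hello` keeps the backslash and yields `Text("\\")` followed by `Text("hello")`.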

            // Normal text
            _ => {
                // Find out where the word ends.
                while let Some((_, c)) = self.chars.peek() {
                    let second = self.chars.peekn(1).map(|p| p.1);

                    // Whether the next character is still part of the text.
                    let continues = match c {
                        '[' | ']' | '\\' => false,
                        '*' | '_' | '`' if self.state == TS::Body => false,
                        ':' | '=' | ',' | '"' if self.state == TS::Function => false,

                        '/' => second != Some('/') && second != Some('*'),
                        '*' => second != Some('/'),

                        ' ' | '\t' => false,
                        c if is_newline_char(c) => false,

                        _ => true,
                    };

                    if !continues {
                        break;
                    }

                    self.advance();
                }

                let end = self.string_index();
                Token::Text(&self.src[pos..end])
            }
        };

        Some(Spanned::new(token, Span::new(pos, self.string_index())))
    }
}

/// Whether this character is a newline (or starts one).
fn is_newline_char(character: char) -> bool {
    match character {
        '\n' | '\r' | '\u{000c}' | '\u{0085}' | '\u{2028}' | '\u{2029}' => true,
        _ => false,
    }
}

/// An (index, char) iterator with double lookahead.
#[derive(Debug, Clone)]
pub struct PeekableChars<'s> {
    string: &'s str,
    chars: CharIndices<'s>,
    peeked: SmallVec<[Option<(usize, char)>; 2]>,
    base: usize,
    index: usize,
}

impl<'s> PeekableChars<'s> {
    /// Create a new iterator from a string.
    pub fn new(string: &'s str) -> PeekableChars<'s> {
        PeekableChars {
            string,
            chars: string.char_indices(),
            peeked: SmallVec::new(),
            base: 0,
            index: 0,
        }
    }

    /// Peek at the next element.
    pub fn peek(&mut self) -> Option<(usize, char)> {
        self.peekn(0)
    }

    /// Peek at the char of the next element.
    pub fn peekc(&mut self) -> Option<char> {
        self.peekn(0).map(|p| p.1)
    }

    /// Peek at the n-th upcoming element (`peekn(0)` is the next one).
    pub fn peekn(&mut self, n: usize) -> Option<(usize, char)> {
        while self.peeked.len() <= n {
            let next = self.next_inner();
            self.peeked.push(next);
        }

        self.peeked[n]
    }

    /// Return the next value of the inner iterator mapped with the offset.
    pub fn next_inner(&mut self) -> Option<(usize, char)> {
        self.chars.next().map(|(i, c)| (self.base + i, c))
    }

    /// The index of the next character to be consumed in the underlying string.
    pub fn string_index(&mut self) -> usize {
        self.index
    }

    /// Jump to a new position in the underlying string.
    pub fn set_string_index(&mut self, index: usize) {
        self.chars = self.string[index..].char_indices();
        self.base = index;
        // Keep the reported index in sync with the jump target.
        self.index = index;
        self.peeked.clear();
    }
}
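
// Lookahead sketch (illustrative, derived from the methods above): peeking
// buffers elements without consuming them, and `next` drains the buffer first.
//
//     let mut chars = PeekableChars::new("ab");
//     assert_eq!(chars.peek(), Some((0, 'a')));
//     assert_eq!(chars.peekn(1), Some((1, 'b')));
//     assert_eq!(chars.next(), Some((0, 'a')));
//     assert_eq!(chars.string_index(), 1);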

impl Iterator for PeekableChars<'_> {
    type Item = (usize, char);

    fn next(&mut self) -> Option<(usize, char)> {
        // Yield a previously peeked element first, if any.
        let next = if !self.peeked.is_empty() {
            self.peeked.remove(0)
        } else {
            self.next_inner()
        };

        if let Some((index, c)) = next {
            self.index = index + c.len_utf8();
        }

        next
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use Token::{
        Backtick as TB, BlockComment as BC, Colon as C, Equals as E, LeftBracket as L,
        LineComment as LC, Newline as N, Quoted as Q, RightBracket as R, Space as S, Star as TS,
        StarSlash as SS, Text as T, Underscore as TU,
    };

    /// Test whether the source code tokenizes into the expected tokens.
    fn test(src: &str, tokens: Vec<Token>) {
        assert_eq!(Tokens::new(src)
            .map(|token| token.v)
            .collect::<Vec<_>>(), tokens);
    }

    /// Test whether the tokens of the source code have the correct spans.
    fn test_span(src: &str, spans: Vec<(usize, usize)>) {
        assert_eq!(Tokens::new(src)
            .map(|token| token.span.pair())
            .collect::<Vec<_>>(), spans);
    }

    /// Tokenizes the basic building blocks.
    #[test]
    #[rustfmt::skip]
    fn tokenize_base() {
        test("", vec![]);
        test("Hallo", vec![T("Hallo")]);
        test("[", vec![L]);
        test("]", vec![R]);
        test("*", vec![TS]);
        test("_", vec![TU]);
        test("`", vec![TB]);
        test("\n", vec![N]);
    }

    /// Checks whether both LF- and CRLF-style newlines are identified correctly.
    #[test]
    #[rustfmt::skip]
    fn tokenize_whitespace_newlines() {
        test(" \t", vec![S]);
        test("First line\r\nSecond line\nThird line\n", vec![
            T("First"), S, T("line"), N, T("Second"), S, T("line"), N,
            T("Third"), S, T("line"), N
        ]);
        test("Hello \n ", vec![T("Hello"), S, N, S]);
        test("Dense\nTimes", vec![T("Dense"), N, T("Times")]);
    }

    /// Tests whether escaping with a backslash works as intended.
    #[test]
    #[rustfmt::skip]
    fn tokenize_escape() {
        test(r"\[", vec![T("[")]);
        test(r"\]", vec![T("]")]);
        test(r"\**", vec![T("*"), TS]);
        test(r"\*", vec![T("*")]);
        test(r"\__", vec![T("_"), TU]);
        test(r"\_", vec![T("_")]);
        test(r"\hello", vec![T("\\"), T("hello")]);
    }

    /// Tests whether quoted strings with escapes work.
    #[test]
    #[rustfmt::skip]
    fn tokenize_quoted() {
        test(r#"[align: "hello\"world"]"#, vec![L, T("align"), C, S, Q(r#"hello\"world"#), R]);
    }

    /// Tokenizes some more realistic examples.
    #[test]
    #[rustfmt::skip]
    fn tokenize_examples() {
        test(r"
            [function][
                Test [italic][example]!
            ]
            ", vec![
            N, S, L, T("function"), R, L, N, S, T("Test"), S, L, T("italic"), R, L,
            T("example"), R, T("!"), N, S, R, N, S
        ]);

        test(r"
            [page: size=A4]
            [font: size=12pt]

            Das ist ein Beispielsatz mit *fetter* Schrift.
            ", vec![
            N, S, L, T("page"), C, S, T("size"), E, T("A4"), R, N, S,
            L, T("font"), C, S, T("size"), E, T("12pt"), R, N, N, S,
            T("Das"), S, T("ist"), S, T("ein"), S, T("Beispielsatz"), S, T("mit"), S,
            TS, T("fetter"), TS, S, T("Schrift."), N, S
        ]);
    }

    /// Checks whether the colon and equals symbols are parsed correctly depending
    /// on the context: either in a function header or in a body.
    #[test]
    #[rustfmt::skip]
    fn tokenize_symbols_context() {
        test("[func: key=value][Answer: 7]", vec![
            L, T("func"), C, S, T("key"), E, T("value"), R, L,
            T("Answer:"), S, T("7"), R
        ]);
        test("[[n: k=v]:x][:[=]]:=", vec![
            L, L, T("n"), C, S, T("k"), E, T("v"), R, C, T("x"), R,
            L, T(":"), L, E, R, R, T(":=")
        ]);
        test("[hi: k=[func][body] v=1][hello]", vec![
            L, T("hi"), C, S, T("k"), E, L, T("func"), R, L, T("body"), R, S,
            T("v"), E, T("1"), R, L, T("hello"), R
        ]);
        test("[func: __key__=value]", vec![L, T("func"), C, S, T("__key__"), E, T("value"), R]);
        test("The /*[*/ answer: 7.", vec![T("The"), S, BC("["), S, T("answer:"), S, T("7.")]);
    }

    /// Tests whether block and line comments are tokenized as expected.
    #[test]
    #[rustfmt::skip]
    fn tokenize_comments() {
        test("These // Line comments.", vec![T("These"), S, LC(" Line comments.")]);
        test("This /* is */ a comment.", vec![T("This"), S, BC(" is "), S, T("a"), S, T("comment.")]);
        test("[Head/*of*/][Body]", vec![L, T("Head"), BC("of"), R, L, T("Body"), R]);
        test("/* Hey */ */", vec![BC(" Hey "), S, SS]);
        test("Hey\n// Yoo /*\n*/", vec![T("Hey"), N, LC(" Yoo /*"), N, SS]);
        test("/* My /* line // */ comment */", vec![BC(" My /* line // */ comment ")]);
    }

    /// Takes a closer look at the underscore syntax.
    #[test]
    #[rustfmt::skip]
    fn tokenize_underscores() {
        test("he_llo_world_ __ Now this_ is_ special!",
            vec![T("he"), TU, T("llo"), TU, T("world"), TU, S, TU, TU, S, T("Now"), S,
                 T("this"), TU, S, T("is"), TU, S, T("special!")]);
    }

    /// Checks whether non-ASCII characters are parsed correctly.
    #[test]
    #[rustfmt::skip]
    fn tokenize_unicode() {
        test("[document][Hello 🌍!]", vec![L, T("document"), R, L, T("Hello"), S, T("🌍!"), R]);
        test("[f]⺐.", vec![L, T("f"), R, T("⺐.")]);
    }

    /// Checks whether all tokens have the correct spans.
    #[test]
    #[rustfmt::skip]
    fn tokenize_spans() {
        test_span("Hello World", vec![(0, 5), (5, 6), (6, 11)]);
        test_span("🌍_🎈", vec![(0, 4), (4, 5), (5, 9)]);
        test_span("[hello: world]", vec![(0, 1), (1, 6), (6, 7), (7, 8), (8, 13), (13, 14)]);
    }
}