typst/src/syntax/tokens.rs
2022-11-03 13:35:39 +01:00

1177 lines
38 KiB
Rust
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use std::sync::Arc;
use unicode_xid::UnicodeXID;
use unscanny::Scanner;
use super::resolve::{resolve_hex, resolve_raw, resolve_string};
use super::{ErrorPos, NodeKind, RawKind, Unit};
use crate::geom::{AbsUnit, AngleUnit};
use crate::util::{format_eco, EcoString};
/// An iterator over the tokens of a string of source code.
#[derive(Clone)]
pub struct Tokens<'s> {
/// The underlying scanner.
s: Scanner<'s>,
/// The mode the scanner is in. This determines what tokens it recognizes.
mode: TokenMode,
/// Whether the last token has been terminated.
terminated: bool,
/// Offsets the indentation on the first line of the source.
column_offset: usize,
}
/// What kind of tokens to emit.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum TokenMode {
/// Text and markup.
Markup,
/// Math atoms, operators, etc.
Math,
/// Keywords, literals and operators.
Code,
}
impl<'s> Tokens<'s> {
/// Create a new token iterator with the given mode.
#[inline]
pub fn new(text: &'s str, mode: TokenMode) -> Self {
Self::with_prefix("", text, mode)
}
/// Create a new token iterator with the given mode and a prefix to offset
/// column calculations.
#[inline]
pub fn with_prefix(prefix: &str, text: &'s str, mode: TokenMode) -> Self {
Self {
s: Scanner::new(text),
mode,
terminated: true,
column_offset: column(prefix, prefix.len(), 0),
}
}
/// Get the current token mode.
#[inline]
pub fn mode(&self) -> TokenMode {
self.mode
}
/// Change the token mode.
#[inline]
pub fn set_mode(&mut self, mode: TokenMode) {
self.mode = mode;
}
/// The index in the string at which the last token ends and next token
/// will start.
#[inline]
pub fn cursor(&self) -> usize {
self.s.cursor()
}
/// Jump to the given index in the string.
#[inline]
pub fn jump(&mut self, index: usize) {
self.s.jump(index);
}
/// The underlying scanner.
#[inline]
pub fn scanner(&self) -> Scanner<'s> {
self.s
}
/// Whether the last token was terminated.
#[inline]
pub fn terminated(&self) -> bool {
self.terminated
}
/// The column index of a given index in the source string.
#[inline]
pub fn column(&self, index: usize) -> usize {
column(self.s.string(), index, self.column_offset)
}
}
impl<'s> Iterator for Tokens<'s> {
type Item = NodeKind;
/// Parse the next token in the source code.
#[inline]
fn next(&mut self) -> Option<Self::Item> {
let start = self.s.cursor();
let c = self.s.eat()?;
Some(match c {
// Trivia.
'/' if self.s.eat_if('/') => self.line_comment(),
'/' if self.s.eat_if('*') => self.block_comment(),
'*' if self.s.eat_if('/') => {
NodeKind::Error(ErrorPos::Full, "unexpected end of block comment".into())
}
c if c.is_whitespace() => self.whitespace(c),
// Other things.
_ => match self.mode {
TokenMode::Markup => self.markup(start, c),
TokenMode::Math => self.math(start, c),
TokenMode::Code => self.code(start, c),
},
})
}
}
impl<'s> Tokens<'s> {
fn line_comment(&mut self) -> NodeKind {
self.s.eat_until(is_newline);
if self.s.peek().is_none() {
self.terminated = false;
}
NodeKind::LineComment
}
fn block_comment(&mut self) -> NodeKind {
let mut state = '_';
let mut depth = 1;
self.terminated = false;
// Find the first `*/` that does not correspond to a nested `/*`.
while let Some(c) = self.s.eat() {
state = match (state, c) {
('*', '/') => {
depth -= 1;
if depth == 0 {
self.terminated = true;
break;
}
'_'
}
('/', '*') => {
depth += 1;
'_'
}
('/', '/') => {
self.line_comment();
'_'
}
_ => c,
}
}
NodeKind::BlockComment
}
fn whitespace(&mut self, c: char) -> NodeKind {
if c == ' ' && !self.s.at(char::is_whitespace) {
return NodeKind::Space { newlines: 0 };
}
self.s.uneat();
// Count the number of newlines.
let mut newlines = 0;
while let Some(c) = self.s.eat() {
if !c.is_whitespace() {
self.s.uneat();
break;
}
if is_newline(c) {
if c == '\r' {
self.s.eat_if('\n');
}
newlines += 1;
}
}
NodeKind::Space { newlines }
}
#[inline]
fn markup(&mut self, start: usize, c: char) -> NodeKind {
match c {
// Blocks.
'{' => NodeKind::LeftBrace,
'}' => NodeKind::RightBrace,
'[' => NodeKind::LeftBracket,
']' => NodeKind::RightBracket,
// Multi-char things.
'#' => self.hash(start),
'.' if self.s.eat_if("..") => NodeKind::Shorthand('\u{2026}'),
'-' => self.hyph(),
'h' if self.s.eat_if("ttp://") || self.s.eat_if("ttps://") => {
self.link(start)
}
'`' => self.raw(),
c if c.is_ascii_digit() => self.numbering(start),
'<' => self.label(),
'@' => self.reference(start),
// Escape sequences.
'\\' => self.backslash(),
// Single-char things.
'~' => NodeKind::Shorthand('\u{00A0}'),
'\'' => NodeKind::SmartQuote { double: false },
'"' => NodeKind::SmartQuote { double: true },
'*' if !self.in_word() => NodeKind::Star,
'_' if !self.in_word() => NodeKind::Underscore,
'$' => NodeKind::Dollar,
'=' => NodeKind::Eq,
'+' => NodeKind::Plus,
'/' => NodeKind::Slash,
':' => NodeKind::Colon,
// Plain text.
_ => self.text(start),
}
}
#[inline]
fn text(&mut self, start: usize) -> NodeKind {
macro_rules! table {
($(|$c:literal)*) => {{
let mut t = [false; 128];
$(t[$c as usize] = true;)*
t
}}
}
const TABLE: [bool; 128] = table! {
| ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/'
| '[' | ']' | '{' | '}' | '~' | '-' | '.' | '\'' | '"'
| '*' | '_' | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#'
};
loop {
self.s.eat_until(|c: char| {
TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
});
// Continue with the same text node if the thing would become text
// anyway.
let mut s = self.s;
match s.eat() {
Some('/') if !s.at(['/', '*']) => {}
Some(' ') if s.at(char::is_alphanumeric) => {}
Some('-') if !s.at(['-', '?']) => {}
Some('.') if !s.at("..") => {}
Some('h') if !s.at("ttp://") && !s.at("ttps://") => {}
Some('@' | '#') if !s.at(is_id_start) => {}
_ => break,
}
self.s = s;
}
NodeKind::Text(self.s.from(start).into())
}
fn backslash(&mut self) -> NodeKind {
match self.s.peek() {
Some('u') if self.s.eat_if("u{") => {
let sequence = self.s.eat_while(char::is_ascii_alphanumeric);
if self.s.eat_if('}') {
if let Some(c) = resolve_hex(sequence) {
NodeKind::Escape(c)
} else {
NodeKind::Error(
ErrorPos::Full,
"invalid unicode escape sequence".into(),
)
}
} else {
self.terminated = false;
NodeKind::Error(ErrorPos::End, "expected closing brace".into())
}
}
// Linebreaks.
Some(c) if c.is_whitespace() => NodeKind::Linebreak,
None => NodeKind::Linebreak,
// Escapes.
Some(c) => {
self.s.expect(c);
NodeKind::Escape(c)
}
}
}
fn hash(&mut self, start: usize) -> NodeKind {
if self.s.at(is_id_start) {
let read = self.s.eat_while(is_id_continue);
match keyword(read) {
Some(keyword) => keyword,
None => NodeKind::Ident(read.into()),
}
} else {
self.text(start)
}
}
fn hyph(&mut self) -> NodeKind {
if self.s.eat_if('-') {
if self.s.eat_if('-') {
NodeKind::Shorthand('\u{2014}')
} else {
NodeKind::Shorthand('\u{2013}')
}
} else if self.s.eat_if('?') {
NodeKind::Shorthand('\u{00AD}')
} else {
NodeKind::Minus
}
}
fn link(&mut self, start: usize) -> NodeKind {
#[rustfmt::skip]
self.s.eat_while(|c: char| matches!(c,
| '0' ..= '9'
| 'a' ..= 'z'
| 'A' ..= 'Z'
| '~' | '/' | '%' | '?' | '#' | '&' | '+' | '='
| '\'' | '.' | ',' | ';'
));
if self.s.scout(-1) == Some('.') {
self.s.uneat();
}
NodeKind::Link(self.s.from(start).into())
}
fn raw(&mut self) -> NodeKind {
let column = self.column(self.s.cursor() - 1);
let mut backticks = 1;
while self.s.eat_if('`') {
backticks += 1;
}
// Special case for empty inline block.
if backticks == 2 {
return NodeKind::Raw(Arc::new(RawKind {
text: EcoString::new(),
lang: None,
block: false,
}));
}
let start = self.s.cursor();
let mut found = 0;
while found < backticks {
match self.s.eat() {
Some('`') => found += 1,
Some(_) => found = 0,
None => break,
}
}
if found == backticks {
let end = self.s.cursor() - found as usize;
NodeKind::Raw(Arc::new(resolve_raw(
column,
backticks,
self.s.get(start .. end),
)))
} else {
self.terminated = false;
let remaining = backticks - found;
let noun = if remaining == 1 { "backtick" } else { "backticks" };
NodeKind::Error(
ErrorPos::End,
if found == 0 {
format_eco!("expected {} {}", remaining, noun)
} else {
format_eco!("expected {} more {}", remaining, noun)
},
)
}
}
fn numbering(&mut self, start: usize) -> NodeKind {
self.s.eat_while(char::is_ascii_digit);
let read = self.s.from(start);
if self.s.eat_if('.') {
if let Ok(number) = read.parse() {
return NodeKind::EnumNumbering(number);
}
}
self.text(start)
}
fn label(&mut self) -> NodeKind {
let label = self.s.eat_while(is_id_continue);
if self.s.eat_if('>') {
if !label.is_empty() {
NodeKind::Label(label.into())
} else {
NodeKind::Error(ErrorPos::Full, "label cannot be empty".into())
}
} else {
self.terminated = false;
NodeKind::Error(ErrorPos::End, "expected closing angle bracket".into())
}
}
fn reference(&mut self, start: usize) -> NodeKind {
let label = self.s.eat_while(is_id_continue);
if !label.is_empty() {
NodeKind::Ref(label.into())
} else {
self.text(start)
}
}
fn math(&mut self, start: usize, c: char) -> NodeKind {
match c {
// Escape sequences.
'\\' => self.backslash(),
// Single-char things.
'_' => NodeKind::Underscore,
'^' => NodeKind::Hat,
'/' => NodeKind::Slash,
'&' => NodeKind::Amp,
'$' => NodeKind::Dollar,
// Brackets.
'{' => NodeKind::LeftBrace,
'}' => NodeKind::RightBrace,
'[' => NodeKind::LeftBracket,
']' => NodeKind::RightBracket,
'(' => NodeKind::LeftParen,
')' => NodeKind::RightParen,
// Identifiers.
c if is_math_id_start(c) && self.s.at(is_math_id_continue) => {
self.s.eat_while(is_math_id_continue);
NodeKind::Ident(self.s.from(start).into())
}
// Numbers.
c if c.is_numeric() => {
self.s.eat_while(char::is_numeric);
NodeKind::Atom(self.s.from(start).into())
}
// Other math atoms.
c => NodeKind::Atom(c.into()),
}
}
fn code(&mut self, start: usize, c: char) -> NodeKind {
match c {
// Blocks.
'{' => NodeKind::LeftBrace,
'}' => NodeKind::RightBrace,
'[' => NodeKind::LeftBracket,
']' => NodeKind::RightBracket,
// Parentheses.
'(' => NodeKind::LeftParen,
')' => NodeKind::RightParen,
// Two-char operators.
'=' if self.s.eat_if('=') => NodeKind::EqEq,
'!' if self.s.eat_if('=') => NodeKind::ExclEq,
'<' if self.s.eat_if('=') => NodeKind::LtEq,
'>' if self.s.eat_if('=') => NodeKind::GtEq,
'+' if self.s.eat_if('=') => NodeKind::PlusEq,
'-' if self.s.eat_if('=') => NodeKind::HyphEq,
'*' if self.s.eat_if('=') => NodeKind::StarEq,
'/' if self.s.eat_if('=') => NodeKind::SlashEq,
'.' if self.s.eat_if('.') => NodeKind::Dots,
'=' if self.s.eat_if('>') => NodeKind::Arrow,
// Single-char operators.
',' => NodeKind::Comma,
';' => NodeKind::Semicolon,
':' => NodeKind::Colon,
'+' => NodeKind::Plus,
'-' => NodeKind::Minus,
'*' => NodeKind::Star,
'/' => NodeKind::Slash,
'=' => NodeKind::Eq,
'<' => NodeKind::Lt,
'>' => NodeKind::Gt,
'.' if !self.s.at(char::is_ascii_digit) => NodeKind::Dot,
// Identifiers.
c if is_id_start(c) => self.ident(start),
// Numbers.
c if c.is_ascii_digit() || (c == '.' && self.s.at(char::is_ascii_digit)) => {
self.number(start, c)
}
// Strings.
'"' => self.string(),
// Invalid token.
_ => NodeKind::Error(ErrorPos::Full, "not valid here".into()),
}
}
fn ident(&mut self, start: usize) -> NodeKind {
self.s.eat_while(is_id_continue);
match self.s.from(start) {
"none" => NodeKind::None,
"auto" => NodeKind::Auto,
"true" => NodeKind::Bool(true),
"false" => NodeKind::Bool(false),
id => keyword(id).unwrap_or_else(|| NodeKind::Ident(id.into())),
}
}
fn number(&mut self, start: usize, c: char) -> NodeKind {
// Read the first part (integer or fractional depending on `first`).
self.s.eat_while(char::is_ascii_digit);
// Read the fractional part if not already done.
// Make sure not to confuse a range for the decimal separator.
if c != '.' && !self.s.at("..") && self.s.eat_if('.') {
self.s.eat_while(char::is_ascii_digit);
}
// Read the exponent.
if !self.s.at("em") && self.s.eat_if(['e', 'E']) {
self.s.eat_if(['+', '-']);
self.s.eat_while(char::is_ascii_digit);
}
// Read the suffix.
let suffix_start = self.s.cursor();
if !self.s.eat_if('%') {
self.s.eat_while(char::is_ascii_alphanumeric);
}
let number = self.s.get(start .. suffix_start);
let suffix = self.s.from(suffix_start);
// Find out whether it is a simple number.
if suffix.is_empty() {
if let Ok(i) = number.parse::<i64>() {
return NodeKind::Int(i);
}
}
let v = match number.parse::<f64>() {
Ok(v) => v,
Err(_) => return NodeKind::Error(ErrorPos::Full, "invalid number".into()),
};
match suffix {
"" => NodeKind::Float(v),
"pt" => NodeKind::Numeric(v, Unit::Length(AbsUnit::Pt)),
"mm" => NodeKind::Numeric(v, Unit::Length(AbsUnit::Mm)),
"cm" => NodeKind::Numeric(v, Unit::Length(AbsUnit::Cm)),
"in" => NodeKind::Numeric(v, Unit::Length(AbsUnit::In)),
"deg" => NodeKind::Numeric(v, Unit::Angle(AngleUnit::Deg)),
"rad" => NodeKind::Numeric(v, Unit::Angle(AngleUnit::Rad)),
"em" => NodeKind::Numeric(v, Unit::Em),
"fr" => NodeKind::Numeric(v, Unit::Fr),
"%" => NodeKind::Numeric(v, Unit::Percent),
_ => NodeKind::Error(ErrorPos::Full, "invalid number suffix".into()),
}
}
fn string(&mut self) -> NodeKind {
let mut escaped = false;
let verbatim = self.s.eat_until(|c| {
if c == '"' && !escaped {
true
} else {
escaped = c == '\\' && !escaped;
false
}
});
let string = resolve_string(verbatim);
if self.s.eat_if('"') {
NodeKind::Str(string)
} else {
self.terminated = false;
NodeKind::Error(ErrorPos::End, "expected quote".into())
}
}
fn in_word(&self) -> bool {
let alphanumeric = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric());
let prev = self.s.scout(-2);
let next = self.s.peek();
alphanumeric(prev) && alphanumeric(next)
}
}
fn keyword(ident: &str) -> Option<NodeKind> {
Some(match ident {
"not" => NodeKind::Not,
"and" => NodeKind::And,
"or" => NodeKind::Or,
"let" => NodeKind::Let,
"set" => NodeKind::Set,
"show" => NodeKind::Show,
"wrap" => NodeKind::Wrap,
"if" => NodeKind::If,
"else" => NodeKind::Else,
"for" => NodeKind::For,
"in" => NodeKind::In,
"as" => NodeKind::As,
"while" => NodeKind::While,
"break" => NodeKind::Break,
"continue" => NodeKind::Continue,
"return" => NodeKind::Return,
"import" => NodeKind::Import,
"include" => NodeKind::Include,
"from" => NodeKind::From,
_ => return None,
})
}
/// The column index of a given index in the source string, given a column
/// offset for the first line.
#[inline]
fn column(string: &str, index: usize, offset: usize) -> usize {
let mut apply_offset = false;
let res = string[.. index]
.char_indices()
.rev()
.take_while(|&(_, c)| !is_newline(c))
.inspect(|&(i, _)| {
if i == 0 {
apply_offset = true
}
})
.count();
// The loop is never executed if the slice is empty, but we are of
// course still at the start of the first line.
if index == 0 {
apply_offset = true;
}
if apply_offset { res + offset } else { res }
}
/// Whether this character denotes a newline.
#[inline]
pub fn is_newline(character: char) -> bool {
matches!(
character,
// Line Feed, Vertical Tab, Form Feed, Carriage Return.
'\n' | '\x0B' | '\x0C' | '\r' |
// Next Line, Line Separator, Paragraph Separator.
'\u{0085}' | '\u{2028}' | '\u{2029}'
)
}
/// Whether a string is a valid unicode identifier.
///
/// In addition to what is specified in the [Unicode Standard][uax31], we allow:
/// - `_` as a starting character,
/// - `_` and `-` as continuing characters.
///
/// [uax31]: http://www.unicode.org/reports/tr31/
#[inline]
pub fn is_ident(string: &str) -> bool {
let mut chars = string.chars();
chars
.next()
.map_or(false, |c| is_id_start(c) && chars.all(is_id_continue))
}
/// Whether a character can start an identifier.
#[inline]
fn is_id_start(c: char) -> bool {
c.is_xid_start() || c == '_'
}
/// Whether a character can continue an identifier.
#[inline]
fn is_id_continue(c: char) -> bool {
c.is_xid_continue() || c == '_' || c == '-'
}
/// Whether a character can start an identifier in math.
#[inline]
fn is_math_id_start(c: char) -> bool {
c.is_xid_start()
}
/// Whether a character can continue an identifier in math.
#[inline]
fn is_math_id_continue(c: char) -> bool {
c.is_xid_continue() && c != '_'
}
#[cfg(test)]
#[allow(non_snake_case)]
mod tests {
use super::super::tests::check;
use super::*;
use ErrorPos::*;
use NodeKind::*;
use Option::None;
use TokenMode::{Code, Markup};
fn Space(newlines: usize) -> NodeKind {
NodeKind::Space { newlines }
}
fn Raw(text: &str, lang: Option<&str>, block: bool) -> NodeKind {
NodeKind::Raw(Arc::new(RawKind {
text: text.into(),
lang: lang.map(Into::into),
block,
}))
}
fn Str(string: &str) -> NodeKind {
NodeKind::Str(string.into())
}
fn Text(string: &str) -> NodeKind {
NodeKind::Text(string.into())
}
fn Ident(ident: &str) -> NodeKind {
NodeKind::Ident(ident.into())
}
fn Error(pos: ErrorPos, message: &str) -> NodeKind {
NodeKind::Error(pos, message.into())
}
/// Building blocks for suffix testing.
///
/// We extend each test case with a collection of different suffixes to make
/// sure tokens end at the correct position. These suffixes are split into
/// blocks, which can be disabled/enabled per test case. For example, when
/// testing identifiers we disable letter suffixes because these would
/// mingle with the identifiers.
///
/// Suffix blocks:
/// - ' ': spacing
/// - 'a': letters
/// - '1': numbers
/// - '/': symbols
const BLOCKS: &str = " a1/";
// Suffixes described by four-tuples of:
//
// - block the suffix is part of
// - mode in which the suffix is applicable
// - the suffix string
// - the resulting suffix NodeKind
fn suffixes()
-> impl Iterator<Item = (char, Option<TokenMode>, &'static str, NodeKind)> {
[
// Whitespace suffixes.
(' ', None, " ", Space(0)),
(' ', None, "\n", Space(1)),
(' ', None, "\r", Space(1)),
(' ', None, "\r\n", Space(1)),
// Letter suffixes.
('a', Some(Markup), "hello", Text("hello")),
('a', Some(Markup), "💚", Text("💚")),
('a', Some(Code), "val", Ident("val")),
('a', Some(Code), "α", Ident("α")),
('a', Some(Code), "_", Ident("_")),
// Number suffixes.
('1', Some(Code), "2", Int(2)),
('1', Some(Code), ".2", Float(0.2)),
// Symbol suffixes.
('/', None, "[", LeftBracket),
('/', None, "//", LineComment),
('/', None, "/**/", BlockComment),
('/', Some(Markup), "*", Star),
('/', Some(Markup), r"\\", Escape('\\')),
('/', Some(Markup), "#let", Let),
('/', Some(Code), "(", LeftParen),
('/', Some(Code), ":", Colon),
('/', Some(Code), "+=", PlusEq),
]
.into_iter()
}
macro_rules! t {
(Both $($tts:tt)*) => {
t!(Markup $($tts)*);
t!(Code $($tts)*);
};
($mode:ident $([$blocks:literal])?: $text:expr => $($token:expr),*) => {{
// Test without suffix.
t!(@$mode: $text => $($token),*);
// Test with each applicable suffix.
for (block, mode, suffix, ref token) in suffixes() {
let text = $text;
#[allow(unused_variables)]
let blocks = BLOCKS;
$(let blocks = $blocks;)?
assert!(!blocks.contains(|c| !BLOCKS.contains(c)));
if (mode.is_none() || mode == Some($mode)) && blocks.contains(block) {
t!(@$mode: format!("{}{}", text, suffix) => $($token,)* token);
}
}
}};
(@$mode:ident: $text:expr => $($token:expr),*) => {{
let text = $text;
let found = Tokens::new(&text, $mode).collect::<Vec<_>>();
let expected = vec![$($token.clone()),*];
check(&text, found, expected);
}};
}
#[test]
fn test_tokenize_brackets() {
// Test in markup.
t!(Markup: "{" => LeftBrace);
t!(Markup: "}" => RightBrace);
t!(Markup: "[" => LeftBracket);
t!(Markup: "]" => RightBracket);
t!(Markup[" /"]: "(" => Text("("));
t!(Markup[" /"]: ")" => Text(")"));
// Test in code.
t!(Code: "{" => LeftBrace);
t!(Code: "}" => RightBrace);
t!(Code: "[" => LeftBracket);
t!(Code: "]" => RightBracket);
t!(Code: "(" => LeftParen);
t!(Code: ")" => RightParen);
}
#[test]
fn test_tokenize_whitespace() {
// Test basic whitespace.
t!(Both["a1/"]: "" => );
t!(Both["a1/"]: " " => Space(0));
t!(Both["a1/"]: " " => Space(0));
t!(Both["a1/"]: "\t" => Space(0));
t!(Both["a1/"]: " \t" => Space(0));
t!(Both["a1/"]: "\u{202F}" => Space(0));
// Test newline counting.
t!(Both["a1/"]: "\n" => Space(1));
t!(Both["a1/"]: "\n " => Space(1));
t!(Both["a1/"]: " \n" => Space(1));
t!(Both["a1/"]: " \n " => Space(1));
t!(Both["a1/"]: "\r\n" => Space(1));
t!(Both["a1/"]: "\r\n\r" => Space(2));
t!(Both["a1/"]: " \n\t \n " => Space(2));
t!(Both["a1/"]: "\n\r" => Space(2));
t!(Both["a1/"]: " \r\r\n \x0D" => Space(3));
}
#[test]
fn test_tokenize_text() {
// Test basic text.
t!(Markup[" /"]: "hello" => Text("hello"));
t!(Markup[" /"]: "reha-world" => Text("reha-world"));
// Test code symbols in text.
t!(Markup[" /"]: "a():\"b" => Text("a()"), Colon, SmartQuote { double: true }, Text("b"));
t!(Markup[" /"]: ";,|/+" => Text(";,|/+"));
t!(Markup[" /"]: "=-a" => Eq, Minus, Text("a"));
t!(Markup[" "]: "#123" => Text("#123"));
// Test text ends.
t!(Markup[""]: "hello " => Text("hello"), Space(0));
t!(Markup[""]: "hello~" => Text("hello"), Shorthand('\u{00A0}'));
}
#[test]
fn test_tokenize_escape_sequences() {
// Test escapable symbols.
t!(Markup: r"\\" => Escape('\\'));
t!(Markup: r"\/" => Escape('/'));
t!(Markup: r"\[" => Escape('['));
t!(Markup: r"\]" => Escape(']'));
t!(Markup: r"\{" => Escape('{'));
t!(Markup: r"\}" => Escape('}'));
t!(Markup: r"\*" => Escape('*'));
t!(Markup: r"\_" => Escape('_'));
t!(Markup: r"\=" => Escape('='));
t!(Markup: r"\~" => Escape('~'));
t!(Markup: r"\'" => Escape('\''));
t!(Markup: r#"\""# => Escape('"'));
t!(Markup: r"\`" => Escape('`'));
t!(Markup: r"\$" => Escape('$'));
t!(Markup: r"\#" => Escape('#'));
t!(Markup: r"\a" => Escape('a'));
t!(Markup: r"\u" => Escape('u'));
t!(Markup: r"\1" => Escape('1'));
// Test basic unicode escapes.
t!(Markup: r"\u{}" => Error(Full, "invalid unicode escape sequence"));
t!(Markup: r"\u{2603}" => Escape('☃'));
t!(Markup: r"\u{P}" => Error(Full, "invalid unicode escape sequence"));
// Test unclosed unicode escapes.
t!(Markup[" /"]: r"\u{" => Error(End, "expected closing brace"));
t!(Markup[" /"]: r"\u{1" => Error(End, "expected closing brace"));
t!(Markup[" /"]: r"\u{26A4" => Error(End, "expected closing brace"));
t!(Markup[" /"]: r"\u{1Q3P" => Error(End, "expected closing brace"));
t!(Markup: r"\u{1🏕}" => Error(End, "expected closing brace"), Text("🏕"), RightBrace);
}
#[test]
fn test_tokenize_markup_symbols() {
// Test markup tokens.
t!(Markup[" a1"]: "*" => Star);
t!(Markup: "_" => Underscore);
t!(Markup[""]: "===" => Eq, Eq, Eq);
t!(Markup["a1/"]: "= " => Eq, Space(0));
t!(Markup[" "]: r"\" => Linebreak);
t!(Markup: "~" => Shorthand('\u{00A0}'));
t!(Markup["a1/"]: "-?" => Shorthand('\u{00AD}'));
t!(Markup["a "]: r"a--" => Text("a"), Shorthand('\u{2013}'));
t!(Markup["a1/"]: "- " => Minus, Space(0));
t!(Markup[" "]: "+" => Plus);
t!(Markup[" "]: "1." => EnumNumbering(1));
t!(Markup[" "]: "1.a" => EnumNumbering(1), Text("a"));
t!(Markup[" /"]: "a1." => Text("a1."));
}
#[test]
fn test_tokenize_code_symbols() {
// Test all symbols.
t!(Code: "," => Comma);
t!(Code: ";" => Semicolon);
t!(Code: ":" => Colon);
t!(Code: "+" => Plus);
t!(Code: "-" => Minus);
t!(Code[" a1"]: "*" => Star);
t!(Code[" a1"]: "/" => Slash);
t!(Code[" a/"]: "." => Dot);
t!(Code: "=" => Eq);
t!(Code: "==" => EqEq);
t!(Code: "!=" => ExclEq);
t!(Code: "<" => Lt);
t!(Code: "<=" => LtEq);
t!(Code: ">" => Gt);
t!(Code: ">=" => GtEq);
t!(Code: "+=" => PlusEq);
t!(Code: "-=" => HyphEq);
t!(Code: "*=" => StarEq);
t!(Code: "/=" => SlashEq);
t!(Code: ".." => Dots);
t!(Code: "=>" => Arrow);
// Test combinations.
t!(Code: "<=>" => LtEq, Gt);
t!(Code[" a/"]: "..." => Dots, Dot);
// Test hyphen as symbol vs part of identifier.
t!(Code[" /"]: "-1" => Minus, Int(1));
t!(Code[" /"]: "-a" => Minus, Ident("a"));
t!(Code[" /"]: "--1" => Minus, Minus, Int(1));
t!(Code[" /"]: "--_a" => Minus, Minus, Ident("_a"));
t!(Code[" /"]: "a-b" => Ident("a-b"));
// Test invalid.
t!(Code: r"\" => Error(Full, "not valid here"));
}
#[test]
fn test_tokenize_keywords() {
// A list of a few (not all) keywords.
let list = [
("not", Not),
("let", Let),
("if", If),
("else", Else),
("for", For),
("in", In),
("import", Import),
];
for (s, t) in list.clone() {
t!(Markup[" "]: format!("#{}", s) => t);
t!(Markup[" "]: format!("#{0}#{0}", s) => t, t);
t!(Markup[" /"]: format!("# {}", s) => Text(&format!("# {s}")));
}
for (s, t) in list {
t!(Code[" "]: s => t);
t!(Markup[" /"]: s => Text(s));
}
// Test simple identifier.
t!(Markup[" "]: "#letter" => Ident("letter"));
t!(Code[" /"]: "falser" => Ident("falser"));
t!(Code[" /"]: "None" => Ident("None"));
t!(Code[" /"]: "True" => Ident("True"));
}
#[test]
fn test_tokenize_raw_blocks() {
// Test basic raw block.
t!(Markup: "``" => Raw("", None, false));
t!(Markup: "`raw`" => Raw("raw", None, false));
t!(Markup[""]: "`]" => Error(End, "expected 1 backtick"));
// Test special symbols in raw block.
t!(Markup: "`[brackets]`" => Raw("[brackets]", None, false));
t!(Markup[""]: r"`\`` " => Raw(r"\", None, false), Error(End, "expected 1 backtick"));
// Test separated closing backticks.
t!(Markup: "```not `y`e`t```" => Raw("`y`e`t", Some("not"), false));
// Test more backticks.
t!(Markup: "``nope``" => Raw("", None, false), Text("nope"), Raw("", None, false));
t!(Markup: "````🚀````" => Raw("", None, false));
t!(Markup[""]: "`````👩🚀````noend" => Error(End, "expected 5 backticks"));
t!(Markup[""]: "````raw``````" => Raw("", Some("raw"), false), Raw("", None, false));
}
#[test]
fn test_tokenize_idents() {
// Test valid identifiers.
t!(Code[" /"]: "x" => Ident("x"));
t!(Code[" /"]: "value" => Ident("value"));
t!(Code[" /"]: "__main__" => Ident("__main__"));
t!(Code[" /"]: "_snake_case" => Ident("_snake_case"));
// Test non-ascii.
t!(Code[" /"]: "α" => Ident("α"));
t!(Code[" /"]: "" => Ident(""));
// Test hyphen parsed as identifier.
t!(Code[" /"]: "kebab-case" => Ident("kebab-case"));
t!(Code[" /"]: "one-10" => Ident("one-10"));
}
#[test]
fn test_tokenize_numeric() {
let ints = [("7", 7), ("012", 12)];
let floats = [
(".3", 0.3),
("0.3", 0.3),
("3.", 3.0),
("3.0", 3.0),
("14.3", 14.3),
("10e2", 1000.0),
("10e+0", 10.0),
("10e+1", 100.0),
("10e-2", 0.1),
("10.e1", 100.0),
("10.e-1", 1.0),
(".1e1", 1.0),
("10E2", 1000.0),
];
// Test integers.
for &(s, v) in &ints {
t!(Code[" /"]: s => Int(v));
}
// Test floats.
for &(s, v) in &floats {
t!(Code[" /"]: s => Float(v));
}
// Test attached numbers.
t!(Code[" /"]: ".2.3" => Float(0.2), Float(0.3));
t!(Code[" /"]: "1.2.3" => Float(1.2), Float(0.3));
t!(Code[" /"]: "1e-2+3" => Float(0.01), Plus, Int(3));
// Test float from too large integer.
let large = i64::MAX as f64 + 1.0;
t!(Code[" /"]: large.to_string() => Float(large));
// Combined integers and floats.
let nums = ints.iter().map(|&(k, v)| (k, v as f64)).chain(floats);
let suffixes: &[(&str, fn(f64) -> NodeKind)] = &[
("mm", |x| Numeric(x, Unit::Length(AbsUnit::Mm))),
("pt", |x| Numeric(x, Unit::Length(AbsUnit::Pt))),
("cm", |x| Numeric(x, Unit::Length(AbsUnit::Cm))),
("in", |x| Numeric(x, Unit::Length(AbsUnit::In))),
("rad", |x| Numeric(x, Unit::Angle(AngleUnit::Rad))),
("deg", |x| Numeric(x, Unit::Angle(AngleUnit::Deg))),
("em", |x| Numeric(x, Unit::Em)),
("fr", |x| Numeric(x, Unit::Fr)),
("%", |x| Numeric(x, Unit::Percent)),
];
// Numeric types.
for &(suffix, build) in suffixes {
for (s, v) in nums.clone() {
t!(Code[" /"]: format!("{}{}", s, suffix) => build(v));
}
}
// Multiple dots close the number.
t!(Code[" /"]: "1..2" => Int(1), Dots, Int(2));
t!(Code[" /"]: "1..2.3" => Int(1), Dots, Float(2.3));
t!(Code[" /"]: "1.2..3" => Float(1.2), Dots, Int(3));
// Test invalid.
t!(Code[" /"]: "1foo" => Error(Full, "invalid number suffix"));
}
#[test]
fn test_tokenize_strings() {
// Test basic strings.
t!(Code: "\"hi\"" => Str("hi"));
t!(Code: "\"hi\nthere\"" => Str("hi\nthere"));
t!(Code: "\"🌎\"" => Str("🌎"));
// Test unterminated.
t!(Code[""]: "\"hi" => Error(End, "expected quote"));
// Test escaped quote.
t!(Code: r#""a\"bc""# => Str("a\"bc"));
t!(Code[""]: r#""\""# => Error(End, "expected quote"));
}
#[test]
fn test_tokenize_line_comments() {
// Test line comment with no trailing newline.
t!(Both[""]: "//" => LineComment);
// Test line comment ends at newline.
t!(Both["a1/"]: "//bc\n" => LineComment, Space(1));
t!(Both["a1/"]: "// bc \n" => LineComment, Space(1));
t!(Both["a1/"]: "//bc\r\n" => LineComment, Space(1));
// Test nested line comments.
t!(Both["a1/"]: "//a//b\n" => LineComment, Space(1));
}
#[test]
fn test_tokenize_block_comments() {
// Test basic block comments.
t!(Both[""]: "/*" => BlockComment);
t!(Both: "/**/" => BlockComment);
t!(Both: "/*🏞*/" => BlockComment);
t!(Both: "/*\n*/" => BlockComment);
// Test depth 1 and 2 nested block comments.
t!(Both: "/* /* */ */" => BlockComment);
t!(Both: "/*/*/**/*/*/" => BlockComment);
// Test two nested, one unclosed block comments.
t!(Both[""]: "/*/*/**/*/" => BlockComment);
// Test all combinations of up to two following slashes and stars.
t!(Both[""]: "/*" => BlockComment);
t!(Both[""]: "/*/" => BlockComment);
t!(Both[""]: "/**" => BlockComment);
t!(Both[""]: "/*//" => BlockComment);
t!(Both[""]: "/*/*" => BlockComment);
t!(Both[""]: "/**/" => BlockComment);
t!(Both[""]: "/***" => BlockComment);
// Test unexpected terminator.
t!(Both: "/*Hi*/*/" => BlockComment,
Error(Full, "unexpected end of block comment"));
}
}