use std::sync::Arc;

use unicode_xid::UnicodeXID;
use unscanny::Scanner;

use super::resolve::{resolve_hex, resolve_raw, resolve_string};
use crate::geom::{AngleUnit, LengthUnit};
use crate::syntax::ast::{MathNode, RawNode, Unit};
use crate::syntax::{NodeKind, SpanPos};
use crate::util::EcoString;

/// An iterator over the tokens of a string of source code.
#[derive(Clone)]
pub struct Tokens<'s> {
    /// The underlying scanner.
    s: Scanner<'s>,
    /// The mode the scanner is in. This determines what tokens it recognizes.
    mode: TokenMode,
    /// Whether the last token has been terminated.
    terminated: bool,
    /// Offsets the indentation on the first line of the source.
    column_offset: usize,
}

/// What kind of tokens to emit.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum TokenMode {
    /// Text and markup.
    Markup,
    /// Keywords, literals and operators.
    Code,
}

impl<'s> Tokens<'s> {
    /// Create a new token iterator with the given mode.
    #[inline]
    pub fn new(text: &'s str, mode: TokenMode) -> Self {
        Self::with_prefix("", text, mode)
    }

    /// Create a new token iterator with the given mode and a prefix to offset
    /// column calculations.
    #[inline]
    pub fn with_prefix(prefix: &str, text: &'s str, mode: TokenMode) -> Self {
        Self {
            s: Scanner::new(text),
            mode,
            terminated: true,
            column_offset: column(prefix, prefix.len(), 0),
        }
    }

    /// Get the current token mode.
    #[inline]
    pub fn mode(&self) -> TokenMode {
        self.mode
    }

    /// Change the token mode.
    #[inline]
    pub fn set_mode(&mut self, mode: TokenMode) {
        self.mode = mode;
    }

    /// The index in the string at which the last token ends and the next
    /// token will start.
    #[inline]
    pub fn cursor(&self) -> usize {
        self.s.cursor()
    }

    /// Jump to the given index in the string.
    #[inline]
    pub fn jump(&mut self, index: usize) {
        self.s.jump(index);
    }

    /// The underlying scanner.
    #[inline]
    pub fn scanner(&self) -> Scanner<'s> {
        self.s
    }

    /// Whether the last token was terminated.
    #[inline]
    pub fn terminated(&self) -> bool {
        self.terminated
    }

    /// The column index of a given index in the source string.
    #[inline]
    pub fn column(&self, index: usize) -> usize {
        column(self.s.string(), index, self.column_offset)
    }
}

impl<'s> Iterator for Tokens<'s> {
    type Item = NodeKind;

    /// Parse the next token in the source code.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let start = self.s.cursor();
        let c = self.s.eat()?;
        Some(match c {
            // Comments.
            '/' if self.s.eat_if('/') => self.line_comment(),
            '/' if self.s.eat_if('*') => self.block_comment(),
            '*' if self.s.eat_if('/') => NodeKind::Unknown("*/".into()),

            // Blocks.
            '{' => NodeKind::LeftBrace,
            '}' => NodeKind::RightBrace,
            '[' => NodeKind::LeftBracket,
            ']' => NodeKind::RightBracket,

            // Whitespace.
            c if c.is_whitespace() => self.whitespace(c),

            // Other things.
            _ => match self.mode {
                TokenMode::Markup => self.markup(start, c),
                TokenMode::Code => self.code(start, c),
            },
        })
    }
}

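// Illustrative usage sketch (not a doc test): driving the iterator manually.
// The token sequence shown here follows from the rules below and mirrors the
// expectations in the tests module at the end of this file.
//
//     let kinds: Vec<NodeKind> = Tokens::new("*hi*", TokenMode::Markup).collect();
//     // roughly: [Star, Text("hi"), Star]
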
impl<'s> Tokens<'s> {
    fn line_comment(&mut self) -> NodeKind {
        self.s.eat_until(is_newline);
        if self.s.peek().is_none() {
            self.terminated = false;
        }
        NodeKind::LineComment
    }

    fn block_comment(&mut self) -> NodeKind {
        let mut state = '_';
        let mut depth = 1;
        self.terminated = false;

        // Find the first `*/` that does not correspond to a nested `/*`.
        while let Some(c) = self.s.eat() {
            state = match (state, c) {
                ('*', '/') => {
                    depth -= 1;
                    if depth == 0 {
                        self.terminated = true;
                        break;
                    }
                    '_'
                }
                ('/', '*') => {
                    depth += 1;
                    '_'
                }
                ('/', '/') => {
                    self.line_comment();
                    '_'
                }
                _ => c,
            }
        }

        NodeKind::BlockComment
    }

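    // Nesting sketch: `/*` and `*/` pairs are counted, so "/* outer /* inner */ */"
    // is a single `BlockComment` token, and an unclosed "/*" still yields a
    // `BlockComment` while leaving `terminated()` false (see the tests below).
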
    fn whitespace(&mut self, c: char) -> NodeKind {
        if c == ' ' && !self.s.at(char::is_whitespace) {
            return NodeKind::Space { newlines: 0 };
        }

        self.s.uneat();

        // Count the number of newlines.
        let mut newlines = 0;
        while let Some(c) = self.s.eat() {
            if !c.is_whitespace() {
                self.s.uneat();
                break;
            }

            if is_newline(c) {
                if c == '\r' {
                    self.s.eat_if('\n');
                }
                newlines += 1;
            }
        }

        NodeKind::Space { newlines }
    }

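    // Collapsing sketch: a whole run of whitespace becomes one `Space` token
    // that records how many newlines it spanned, so " \n\t \n " tokenizes to
    // `Space { newlines: 2 }` (a `\r\n` pair counts as a single newline).
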
    #[inline]
    fn markup(&mut self, start: usize, c: char) -> NodeKind {
        match c {
            // Escape sequences.
            '\\' => self.backslash(),

            // Single-char things.
            '~' => NodeKind::NonBreakingSpace,
            '.' if self.s.eat_if("..") => NodeKind::Ellipsis,
            '\'' => NodeKind::Quote { double: false },
            '"' => NodeKind::Quote { double: true },
            '*' if !self.in_word() => NodeKind::Star,
            '_' if !self.in_word() => NodeKind::Underscore,
            '=' => NodeKind::Eq,
            '+' => NodeKind::Plus,
            '/' => NodeKind::Slash,
            ':' => NodeKind::Colon,

            // Multi-char things.
            '#' => self.hash(start),
            '-' => self.hyph(),
            'h' if self.s.eat_if("ttp://") || self.s.eat_if("ttps://") => {
                self.link(start)
            }
            '`' => self.raw(),
            '$' => self.math(),
            c if c.is_ascii_digit() => self.numbering(start),
            '<' => self.label(),
            '@' => self.reference(start),

            // Plain text.
            _ => self.text(start),
        }
    }

    #[inline]
    fn text(&mut self, start: usize) -> NodeKind {
        macro_rules! table {
            ($(|$c:literal)*) => {{
                let mut t = [false; 128];
                $(t[$c as usize] = true;)*
                t
            }}
        }

        const TABLE: [bool; 128] = table! {
            | ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/'
            | '[' | ']' | '{' | '}' | '~' | '-' | '.' | '\'' | '"'
            | '*' | '_' | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#'
        };

        loop {
            self.s.eat_until(|c: char| {
                TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
            });

            // Continue with the same text node if the thing would become text
            // anyway.
            let mut s = self.s;
            match s.eat() {
                Some('/') if !s.at(['/', '*']) => {}
                Some(' ') if s.at(char::is_alphanumeric) => {}
                Some('-') if !s.at(['-', '?']) => {}
                Some('.') if !s.at("..") => {}
                Some('h') if !s.at("ttp://") && !s.at("ttps://") => {}
                Some('@' | '#') if !s.at(is_id_start) => {}
                _ => break,
            }

            self.s = s;
        }

        NodeKind::Text(self.s.from(start).into())
    }

    fn backslash(&mut self) -> NodeKind {
        match self.s.peek() {
            Some('u') if self.s.eat_if("u{") => {
                let sequence = self.s.eat_while(char::is_ascii_alphanumeric);
                if self.s.eat_if('}') {
                    if let Some(c) = resolve_hex(sequence) {
                        NodeKind::Escape(c)
                    } else {
                        NodeKind::Error(
                            SpanPos::Full,
                            "invalid unicode escape sequence".into(),
                        )
                    }
                } else {
                    self.terminated = false;
                    NodeKind::Error(SpanPos::End, "expected closing brace".into())
                }
            }

            // Linebreaks.
            Some(c) if c.is_whitespace() => NodeKind::Linebreak,
            None => NodeKind::Linebreak,

            // Escapes.
            Some(c) => {
                self.s.expect(c);
                NodeKind::Escape(c)
            }
        }
    }

    fn hash(&mut self, start: usize) -> NodeKind {
        if self.s.at(is_id_start) {
            let read = self.s.eat_while(is_id_continue);
            match keyword(read) {
                Some(keyword) => keyword,
                None => NodeKind::Ident(read.into()),
            }
        } else {
            self.text(start)
        }
    }

    fn hyph(&mut self) -> NodeKind {
        if self.s.eat_if('-') {
            if self.s.eat_if('-') {
                NodeKind::EmDash
            } else {
                NodeKind::EnDash
            }
        } else if self.s.eat_if('?') {
            NodeKind::Shy
        } else {
            NodeKind::Minus
        }
    }

    fn in_word(&self) -> bool {
        let alphanumeric = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric());
        let prev = self.s.scout(-2);
        let next = self.s.peek();
        alphanumeric(prev) && alphanumeric(next)
    }

    fn link(&mut self, start: usize) -> NodeKind {
        #[rustfmt::skip]
        self.s.eat_while(|c: char| matches!(c,
            | '0' ..= '9'
            | 'a' ..= 'z'
            | 'A' ..= 'Z'
            | '~' | '/' | '%' | '?' | '#' | '&' | '+' | '='
            | '\'' | '.' | ',' | ';'
        ));
        if self.s.scout(-1) == Some('.') {
            self.s.uneat();
        }
        NodeKind::Link(self.s.from(start).into())
    }

    fn raw(&mut self) -> NodeKind {
        let column = self.column(self.s.cursor() - 1);

        let mut backticks = 1;
        while self.s.eat_if('`') {
            backticks += 1;
        }

        // Special case for empty inline block.
        if backticks == 2 {
            return NodeKind::Raw(Arc::new(RawNode {
                text: EcoString::new(),
                lang: None,
                block: false,
            }));
        }

        let start = self.s.cursor();
        let mut found = 0;
        while found < backticks {
            match self.s.eat() {
                Some('`') => found += 1,
                Some(_) => found = 0,
                None => break,
            }
        }

        if found == backticks {
            let end = self.s.cursor() - found as usize;
            NodeKind::Raw(Arc::new(resolve_raw(
                column,
                backticks,
                self.s.get(start .. end),
            )))
        } else {
            self.terminated = false;
            let remaining = backticks - found;
            let noun = if remaining == 1 { "backtick" } else { "backticks" };
            NodeKind::Error(
                SpanPos::End,
                if found == 0 {
                    format_eco!("expected {} {}", remaining, noun)
                } else {
                    format_eco!("expected {} more {}", remaining, noun)
                },
            )
        }
    }

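    // Backtick sketch: "`raw`" becomes a raw token with text "raw", while an
    // unterminated opening such as "`]" produces the "expected 1 backtick"
    // error (both cases are exercised in the tests below).
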
    fn math(&mut self) -> NodeKind {
        let mut escaped = false;
        let formula = self.s.eat_until(|c| {
            if c == '$' && !escaped {
                true
            } else {
                escaped = c == '\\' && !escaped;
                false
            }
        });

        let display = formula.len() >= 2
            && formula.starts_with(char::is_whitespace)
            && formula.ends_with(char::is_whitespace);

        if self.s.eat_if('$') {
            NodeKind::Math(Arc::new(MathNode { formula: formula.into(), display }))
        } else {
            self.terminated = false;
            NodeKind::Error(SpanPos::End, "expected dollar sign".into())
        }
    }

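    // Display-mode sketch: "$x$" yields an inline formula, while whitespace
    // padding as in "$ x + y $" marks the formula as display math; that is what
    // the `display` flag above records (see `test_tokenize_math_formulas`).
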
    fn numbering(&mut self, start: usize) -> NodeKind {
        self.s.eat_while(char::is_ascii_digit);
        let read = self.s.from(start);
        if self.s.eat_if('.') {
            if let Ok(number) = read.parse() {
                return NodeKind::EnumNumbering(number);
            }
        }

        self.text(start)
    }

    fn label(&mut self) -> NodeKind {
        let label = self.s.eat_while(is_id_continue);
        if self.s.eat_if('>') {
            if !label.is_empty() {
                NodeKind::Label(label.into())
            } else {
                NodeKind::Error(SpanPos::Full, "label cannot be empty".into())
            }
        } else {
            self.terminated = false;
            NodeKind::Error(SpanPos::End, "expected closing angle bracket".into())
        }
    }

    fn reference(&mut self, start: usize) -> NodeKind {
        let label = self.s.eat_while(is_id_continue);
        if !label.is_empty() {
            NodeKind::Ref(label.into())
        } else {
            self.text(start)
        }
    }

    fn code(&mut self, start: usize, c: char) -> NodeKind {
        match c {
            // Parentheses.
            '(' => NodeKind::LeftParen,
            ')' => NodeKind::RightParen,

            // Two-char operators.
            '=' if self.s.eat_if('=') => NodeKind::EqEq,
            '!' if self.s.eat_if('=') => NodeKind::ExclEq,
            '<' if self.s.eat_if('=') => NodeKind::LtEq,
            '>' if self.s.eat_if('=') => NodeKind::GtEq,
            '+' if self.s.eat_if('=') => NodeKind::PlusEq,
            '-' if self.s.eat_if('=') => NodeKind::HyphEq,
            '*' if self.s.eat_if('=') => NodeKind::StarEq,
            '/' if self.s.eat_if('=') => NodeKind::SlashEq,
            '.' if self.s.eat_if('.') => NodeKind::Dots,
            '=' if self.s.eat_if('>') => NodeKind::Arrow,

            // Single-char operators.
            ',' => NodeKind::Comma,
            ';' => NodeKind::Semicolon,
            ':' => NodeKind::Colon,
            '+' => NodeKind::Plus,
            '-' => NodeKind::Minus,
            '*' => NodeKind::Star,
            '/' => NodeKind::Slash,
            '=' => NodeKind::Eq,
            '<' => NodeKind::Lt,
            '>' => NodeKind::Gt,
            '.' if !self.s.at(char::is_ascii_digit) => NodeKind::Dot,

            // Identifiers.
            c if is_id_start(c) => self.ident(start),

            // Numbers.
            c if c.is_ascii_digit() || (c == '.' && self.s.at(char::is_ascii_digit)) => {
                self.number(start, c)
            }

            // Strings.
            '"' => self.string(),

            // Invalid token.
            _ => NodeKind::Unknown(self.s.from(start).into()),
        }
    }

    fn ident(&mut self, start: usize) -> NodeKind {
        self.s.eat_while(is_id_continue);
        match self.s.from(start) {
            "none" => NodeKind::None,
            "auto" => NodeKind::Auto,
            "true" => NodeKind::Bool(true),
            "false" => NodeKind::Bool(false),
            id => keyword(id).unwrap_or_else(|| NodeKind::Ident(id.into())),
        }
    }

    fn number(&mut self, start: usize, c: char) -> NodeKind {
        // Read the first part (integer or fractional depending on `c`).
        self.s.eat_while(char::is_ascii_digit);

        // Read the fractional part if not already done.
        // Make sure not to confuse a range for the decimal separator.
        if c != '.' && !self.s.at("..") && self.s.eat_if('.') {
            self.s.eat_while(char::is_ascii_digit);
        }

        // Read the exponent.
        if !self.s.at("em") && self.s.eat_if(['e', 'E']) {
            self.s.eat_if(['+', '-']);
            self.s.eat_while(char::is_ascii_digit);
        }

        // Read the suffix.
        let suffix_start = self.s.cursor();
        if !self.s.eat_if('%') {
            self.s.eat_while(char::is_ascii_alphanumeric);
        }

        let number = self.s.get(start .. suffix_start);
        let suffix = self.s.from(suffix_start);
        let all = self.s.from(start);

        // Find out whether it is a simple number.
        if suffix.is_empty() {
            if let Ok(i) = number.parse::<i64>() {
                return NodeKind::Int(i);
            }
        }

        if let Ok(f) = number.parse::<f64>() {
            match suffix {
                "" => NodeKind::Float(f),
                "pt" => NodeKind::Numeric(f, Unit::Length(LengthUnit::Pt)),
                "mm" => NodeKind::Numeric(f, Unit::Length(LengthUnit::Mm)),
                "cm" => NodeKind::Numeric(f, Unit::Length(LengthUnit::Cm)),
                "in" => NodeKind::Numeric(f, Unit::Length(LengthUnit::In)),
                "deg" => NodeKind::Numeric(f, Unit::Angle(AngleUnit::Deg)),
                "rad" => NodeKind::Numeric(f, Unit::Angle(AngleUnit::Rad)),
                "em" => NodeKind::Numeric(f, Unit::Em),
                "fr" => NodeKind::Numeric(f, Unit::Fr),
                "%" => NodeKind::Numeric(f, Unit::Percent),
                _ => NodeKind::Unknown(all.into()),
            }
        } else {
            NodeKind::Unknown(all.into())
        }
    }

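    // Suffix sketch: "12" becomes `Int(12)`, "3.5" becomes `Float(3.5)`, and a
    // recognized unit suffix attaches to the value, e.g. "2cm" becomes
    // `Numeric(2.0, Unit::Length(LengthUnit::Cm))` and "1%" becomes
    // `Numeric(1.0, Unit::Percent)`; unknown suffixes such as "1foo" yield
    // `Unknown` (see `test_tokenize_numeric` and `test_tokenize_invalid`).
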
    fn string(&mut self) -> NodeKind {
        let mut escaped = false;
        let verbatim = self.s.eat_until(|c| {
            if c == '"' && !escaped {
                true
            } else {
                escaped = c == '\\' && !escaped;
                false
            }
        });

        let string = resolve_string(verbatim);
        if self.s.eat_if('"') {
            NodeKind::Str(string)
        } else {
            self.terminated = false;
            NodeKind::Error(SpanPos::End, "expected quote".into())
        }
    }
}

fn keyword(ident: &str) -> Option<NodeKind> {
    Some(match ident {
        "not" => NodeKind::Not,
        "and" => NodeKind::And,
        "or" => NodeKind::Or,
        "let" => NodeKind::Let,
        "set" => NodeKind::Set,
        "show" => NodeKind::Show,
        "wrap" => NodeKind::Wrap,
        "if" => NodeKind::If,
        "else" => NodeKind::Else,
        "for" => NodeKind::For,
        "in" => NodeKind::In,
        "as" => NodeKind::As,
        "while" => NodeKind::While,
        "break" => NodeKind::Break,
        "continue" => NodeKind::Continue,
        "return" => NodeKind::Return,
        "import" => NodeKind::Import,
        "include" => NodeKind::Include,
        "from" => NodeKind::From,
        _ => return None,
    })
}

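// Lookup sketch: `keyword("let")` is `Some(NodeKind::Let)`, while a plain
// identifier such as `keyword("letter")` is `None` and falls back to
// `NodeKind::Ident` in the callers above.
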
/// The column index of a given index in the source string, given a column
/// offset for the first line.
#[inline]
fn column(string: &str, index: usize, offset: usize) -> usize {
    let mut apply_offset = false;
    let res = string[.. index]
        .char_indices()
        .rev()
        .take_while(|&(_, c)| !is_newline(c))
        .inspect(|&(i, _)| {
            if i == 0 {
                apply_offset = true
            }
        })
        .count();

    // The loop is never executed if the slice is empty, but we are of
    // course still at the start of the first line.
    if index == 0 {
        apply_offset = true;
    }

    if apply_offset { res + offset } else { res }
}

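// Offset sketch (hypothetical values, derived from the definition above): the
// prefix offset only applies while the index is still on the first line, so
// `column("abc", 3, 10)` is 13 while `column("a\nbc", 4, 10)` is 2.
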
/// Whether this character denotes a newline.
#[inline]
pub fn is_newline(character: char) -> bool {
    matches!(
        character,
        // Line Feed, Vertical Tab, Form Feed, Carriage Return.
        '\n' | '\x0B' | '\x0C' | '\r' |
        // Next Line, Line Separator, Paragraph Separator.
        '\u{0085}' | '\u{2028}' | '\u{2029}'
    )
}

/// Whether a string is a valid unicode identifier.
///
/// In addition to what is specified in the [Unicode Standard][uax31], we allow:
/// - `_` as a starting character,
/// - `_` and `-` as continuing characters.
///
/// [uax31]: http://www.unicode.org/reports/tr31/
#[inline]
pub fn is_ident(string: &str) -> bool {
    let mut chars = string.chars();
    chars
        .next()
        .map_or(false, |c| is_id_start(c) && chars.all(is_id_continue))
}

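// Examples (sketch): `is_ident("kebab-case")` and `is_ident("_private")` hold,
// while `is_ident("")` and `is_ident("-leading")` do not, since `-` may only
// continue an identifier, not start one.
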
/// Whether a character can start an identifier.
#[inline]
fn is_id_start(c: char) -> bool {
    c.is_xid_start() || c == '_'
}

/// Whether a character can continue an identifier.
#[inline]
fn is_id_continue(c: char) -> bool {
    c.is_xid_continue() || c == '_' || c == '-'
}

#[cfg(test)]
#[allow(non_snake_case)]
mod tests {
    use super::*;
    use crate::parse::tests::check;

    use NodeKind::*;
    use Option::None;
    use SpanPos::*;
    use TokenMode::{Code, Markup};

    fn Space(newlines: usize) -> NodeKind {
        NodeKind::Space { newlines }
    }

    fn Raw(text: &str, lang: Option<&str>, block: bool) -> NodeKind {
        NodeKind::Raw(Arc::new(RawNode {
            text: text.into(),
            lang: lang.map(Into::into),
            block,
        }))
    }

    fn Math(formula: &str, display: bool) -> NodeKind {
        NodeKind::Math(Arc::new(MathNode { formula: formula.into(), display }))
    }

    fn Str(string: &str) -> NodeKind {
        NodeKind::Str(string.into())
    }

    fn Text(string: &str) -> NodeKind {
        NodeKind::Text(string.into())
    }

    fn Ident(ident: &str) -> NodeKind {
        NodeKind::Ident(ident.into())
    }

    fn Error(pos: SpanPos, message: &str) -> NodeKind {
        NodeKind::Error(pos, message.into())
    }

    fn Invalid(invalid: &str) -> NodeKind {
        NodeKind::Unknown(invalid.into())
    }

    /// Building blocks for suffix testing.
    ///
    /// We extend each test case with a collection of different suffixes to make
    /// sure tokens end at the correct position. These suffixes are split into
    /// blocks, which can be disabled/enabled per test case. For example, when
    /// testing identifiers we disable letter suffixes because these would
    /// mingle with the identifiers.
    ///
    /// Suffix blocks:
    /// - ' ': spacing
    /// - 'a': letters
    /// - '1': numbers
    /// - '/': symbols
    const BLOCKS: &str = " a1/";

    macro_rules! t {
        (Both $($tts:tt)*) => {
            t!(Markup $($tts)*);
            t!(Code $($tts)*);
        };
        ($mode:ident $([$blocks:literal])?: $text:expr => $($token:expr),*) => {{
            // Test without suffix.
            t!(@$mode: $text => $($token),*);

            // Suffixes described by four-tuples of:
            //
            // - block the suffix is part of
            // - mode in which the suffix is applicable
            // - the suffix string
            // - the resulting suffix NodeKind
            let suffixes: &[(char, Option<TokenMode>, &str, NodeKind)] = &[
                // Whitespace suffixes.
                (' ', None, " ", Space(0)),
                (' ', None, "\n", Space(1)),
                (' ', None, "\r", Space(1)),
                (' ', None, "\r\n", Space(1)),
                // Letter suffixes.
                ('a', Some(Markup), "hello", Text("hello")),
                ('a', Some(Markup), "💚", Text("💚")),
                ('a', Some(Code), "val", Ident("val")),
                ('a', Some(Code), "α", Ident("α")),
                ('a', Some(Code), "_", Ident("_")),
                // Number suffixes.
                ('1', Some(Code), "2", Int(2)),
                ('1', Some(Code), ".2", Float(0.2)),
                // Symbol suffixes.
                ('/', None, "[", LeftBracket),
                ('/', None, "//", LineComment),
                ('/', None, "/**/", BlockComment),
                ('/', Some(Markup), "*", Star),
                ('/', Some(Markup), "$ $", Math(" ", false)),
                ('/', Some(Markup), r"\\", Escape('\\')),
                ('/', Some(Markup), "#let", Let),
                ('/', Some(Code), "(", LeftParen),
                ('/', Some(Code), ":", Colon),
                ('/', Some(Code), "+=", PlusEq),
            ];

            // Test with each applicable suffix.
            for &(block, mode, suffix, ref token) in suffixes {
                let text = $text;
                #[allow(unused_variables)]
                let blocks = BLOCKS;
                $(let blocks = $blocks;)?
                assert!(!blocks.contains(|c| !BLOCKS.contains(c)));
                if (mode.is_none() || mode == Some($mode)) && blocks.contains(block) {
                    t!(@$mode: format!("{}{}", text, suffix) => $($token,)* token);
                }
            }
        }};
        (@$mode:ident: $text:expr => $($token:expr),*) => {{
            let text = $text;
            let found = Tokens::new(&text, $mode).collect::<Vec<_>>();
            let expected = vec![$($token.clone()),*];
            check(&text, found, expected);
        }};
    }

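    // How to read the macro (sketch): `t!(Code: "+=" => PlusEq)` asserts that
    // tokenizing "+=" in code mode yields exactly one `PlusEq` token, then
    // re-runs the assertion with each applicable suffix from the enabled
    // blocks appended to make sure the token ends where it should.
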
    #[test]
    fn test_tokenize_brackets() {
        // Test in markup.
        t!(Markup: "{" => LeftBrace);
        t!(Markup: "}" => RightBrace);
        t!(Markup: "[" => LeftBracket);
        t!(Markup: "]" => RightBracket);
        t!(Markup[" /"]: "(" => Text("("));
        t!(Markup[" /"]: ")" => Text(")"));

        // Test in code.
        t!(Code: "{" => LeftBrace);
        t!(Code: "}" => RightBrace);
        t!(Code: "[" => LeftBracket);
        t!(Code: "]" => RightBracket);
        t!(Code: "(" => LeftParen);
        t!(Code: ")" => RightParen);
    }

    #[test]
    fn test_tokenize_whitespace() {
        // Test basic whitespace.
        t!(Both["a1/"]: "" => );
        t!(Both["a1/"]: " " => Space(0));
        t!(Both["a1/"]: " " => Space(0));
        t!(Both["a1/"]: "\t" => Space(0));
        t!(Both["a1/"]: " \t" => Space(0));
        t!(Both["a1/"]: "\u{202F}" => Space(0));

        // Test newline counting.
        t!(Both["a1/"]: "\n" => Space(1));
        t!(Both["a1/"]: "\n " => Space(1));
        t!(Both["a1/"]: " \n" => Space(1));
        t!(Both["a1/"]: " \n " => Space(1));
        t!(Both["a1/"]: "\r\n" => Space(1));
        t!(Both["a1/"]: "\r\n\r" => Space(2));
        t!(Both["a1/"]: " \n\t \n " => Space(2));
        t!(Both["a1/"]: "\n\r" => Space(2));
        t!(Both["a1/"]: " \r\r\n \x0D" => Space(3));
    }

    #[test]
    fn test_tokenize_text() {
        // Test basic text.
        t!(Markup[" /"]: "hello" => Text("hello"));
        t!(Markup[" /"]: "reha-world" => Text("reha-world"));

        // Test code symbols in text.
        t!(Markup[" /"]: "a():\"b" => Text("a()"), Colon, Quote { double: true }, Text("b"));
        t!(Markup[" /"]: ";,|/+" => Text(";,|/+"));
        t!(Markup[" /"]: "=-a" => Eq, Minus, Text("a"));
        t!(Markup[" "]: "#123" => Text("#123"));

        // Test text ends.
        t!(Markup[""]: "hello " => Text("hello"), Space(0));
        t!(Markup[""]: "hello~" => Text("hello"), NonBreakingSpace);
    }

    #[test]
    fn test_tokenize_escape_sequences() {
        // Test escapable symbols.
        t!(Markup: r"\\" => Escape('\\'));
        t!(Markup: r"\/" => Escape('/'));
        t!(Markup: r"\[" => Escape('['));
        t!(Markup: r"\]" => Escape(']'));
        t!(Markup: r"\{" => Escape('{'));
        t!(Markup: r"\}" => Escape('}'));
        t!(Markup: r"\*" => Escape('*'));
        t!(Markup: r"\_" => Escape('_'));
        t!(Markup: r"\=" => Escape('='));
        t!(Markup: r"\~" => Escape('~'));
        t!(Markup: r"\'" => Escape('\''));
        t!(Markup: r#"\""# => Escape('"'));
        t!(Markup: r"\`" => Escape('`'));
        t!(Markup: r"\$" => Escape('$'));
        t!(Markup: r"\#" => Escape('#'));
        t!(Markup: r"\a" => Escape('a'));
        t!(Markup: r"\u" => Escape('u'));
        t!(Markup: r"\1" => Escape('1'));

        // Test basic unicode escapes.
        t!(Markup: r"\u{}" => Error(Full, "invalid unicode escape sequence"));
        t!(Markup: r"\u{2603}" => Escape('☃'));
        t!(Markup: r"\u{P}" => Error(Full, "invalid unicode escape sequence"));

        // Test unclosed unicode escapes.
        t!(Markup[" /"]: r"\u{" => Error(End, "expected closing brace"));
        t!(Markup[" /"]: r"\u{1" => Error(End, "expected closing brace"));
        t!(Markup[" /"]: r"\u{26A4" => Error(End, "expected closing brace"));
        t!(Markup[" /"]: r"\u{1Q3P" => Error(End, "expected closing brace"));
        t!(Markup: r"\u{1🏕}" => Error(End, "expected closing brace"), Text("🏕"), RightBrace);
    }

    #[test]
    fn test_tokenize_markup_symbols() {
        // Test markup tokens.
        t!(Markup[" a1"]: "*" => Star);
        t!(Markup: "_" => Underscore);
        t!(Markup[""]: "===" => Eq, Eq, Eq);
        t!(Markup["a1/"]: "= " => Eq, Space(0));
        t!(Markup[" "]: r"\" => Linebreak);
        t!(Markup: "~" => NonBreakingSpace);
        t!(Markup["a1/"]: "-?" => Shy);
        t!(Markup["a "]: r"a--" => Text("a"), EnDash);
        t!(Markup["a1/"]: "- " => Minus, Space(0));
        t!(Markup[" "]: "+" => Plus);
        t!(Markup[" "]: "1." => EnumNumbering(1));
        t!(Markup[" "]: "1.a" => EnumNumbering(1), Text("a"));
        t!(Markup[" /"]: "a1." => Text("a1."));
    }

    #[test]
    fn test_tokenize_code_symbols() {
        // Test all symbols.
        t!(Code: "," => Comma);
        t!(Code: ";" => Semicolon);
        t!(Code: ":" => Colon);
        t!(Code: "+" => Plus);
        t!(Code: "-" => Minus);
        t!(Code[" a1"]: "*" => Star);
        t!(Code[" a1"]: "/" => Slash);
        t!(Code[" a/"]: "." => Dot);
        t!(Code: "=" => Eq);
        t!(Code: "==" => EqEq);
        t!(Code: "!=" => ExclEq);
        t!(Code: "<" => Lt);
        t!(Code: "<=" => LtEq);
        t!(Code: ">" => Gt);
        t!(Code: ">=" => GtEq);
        t!(Code: "+=" => PlusEq);
        t!(Code: "-=" => HyphEq);
        t!(Code: "*=" => StarEq);
        t!(Code: "/=" => SlashEq);
        t!(Code: ".." => Dots);
        t!(Code: "=>" => Arrow);

        // Test combinations.
        t!(Code: "<=>" => LtEq, Gt);
        t!(Code[" a/"]: "..." => Dots, Dot);

        // Test hyphen as symbol vs part of identifier.
        t!(Code[" /"]: "-1" => Minus, Int(1));
        t!(Code[" /"]: "-a" => Minus, Ident("a"));
        t!(Code[" /"]: "--1" => Minus, Minus, Int(1));
        t!(Code[" /"]: "--_a" => Minus, Minus, Ident("_a"));
        t!(Code[" /"]: "a-b" => Ident("a-b"));
    }

    #[test]
    fn test_tokenize_keywords() {
        // A list of a few (not all) keywords.
        let list = [
            ("not", Not),
            ("let", Let),
            ("if", If),
            ("else", Else),
            ("for", For),
            ("in", In),
            ("import", Import),
        ];

        for (s, t) in list.clone() {
            t!(Markup[" "]: format!("#{}", s) => t);
            t!(Markup[" "]: format!("#{0}#{0}", s) => t, t);
            t!(Markup[" /"]: format!("# {}", s) => Text(&format!("# {s}")));
        }

        for (s, t) in list {
            t!(Code[" "]: s => t);
            t!(Markup[" /"]: s => Text(s));
        }

        // Test simple identifier.
        t!(Markup[" "]: "#letter" => Ident("letter"));
        t!(Code[" /"]: "falser" => Ident("falser"));
        t!(Code[" /"]: "None" => Ident("None"));
        t!(Code[" /"]: "True" => Ident("True"));
    }

    #[test]
    fn test_tokenize_raw_blocks() {
        // Test basic raw block.
        t!(Markup: "``" => Raw("", None, false));
        t!(Markup: "`raw`" => Raw("raw", None, false));
        t!(Markup[""]: "`]" => Error(End, "expected 1 backtick"));

        // Test special symbols in raw block.
        t!(Markup: "`[brackets]`" => Raw("[brackets]", None, false));
        t!(Markup[""]: r"`\`` " => Raw(r"\", None, false), Error(End, "expected 1 backtick"));

        // Test separated closing backticks.
        t!(Markup: "```not `y`e`t```" => Raw("`y`e`t", Some("not"), false));

        // Test more backticks.
        t!(Markup: "``nope``" => Raw("", None, false), Text("nope"), Raw("", None, false));
        t!(Markup: "````🚀````" => Raw("", None, false));
        t!(Markup[""]: "`````👩🚀````noend" => Error(End, "expected 5 backticks"));
        t!(Markup[""]: "````raw``````" => Raw("", Some("raw"), false), Raw("", None, false));
    }

    #[test]
    fn test_tokenize_math_formulas() {
        // Test basic formula.
        t!(Markup: "$$" => Math("", false));
        t!(Markup: "$x$" => Math("x", false));
        t!(Markup: r"$\\$" => Math(r"\\", false));
        t!(Markup: r"$[\\]$" => Math(r"[\\]", false));
        t!(Markup: "$ x + y $" => Math(" x + y ", true));

        // Test unterminated.
        t!(Markup[""]: "$x" => Error(End, "expected dollar sign"));
        t!(Markup[""]: "$[x]\n" => Error(End, "expected dollar sign"));

        // Test escape sequences.
        t!(Markup: r"$\$x$" => Math(r"\$x", false));
        t!(Markup: r"$\ \$ $" => Math(r"\ \$ ", false));
    }

    #[test]
    fn test_tokenize_idents() {
        // Test valid identifiers.
        t!(Code[" /"]: "x" => Ident("x"));
        t!(Code[" /"]: "value" => Ident("value"));
        t!(Code[" /"]: "__main__" => Ident("__main__"));
        t!(Code[" /"]: "_snake_case" => Ident("_snake_case"));

        // Test non-ascii.
        t!(Code[" /"]: "α" => Ident("α"));
        t!(Code[" /"]: "ម្តាយ" => Ident("ម្តាយ"));

        // Test hyphen parsed as identifier.
        t!(Code[" /"]: "kebab-case" => Ident("kebab-case"));
        t!(Code[" /"]: "one-10" => Ident("one-10"));
    }

    #[test]
    fn test_tokenize_numeric() {
        let ints = [("7", 7), ("012", 12)];
        let floats = [
            (".3", 0.3),
            ("0.3", 0.3),
            ("3.", 3.0),
            ("3.0", 3.0),
            ("14.3", 14.3),
            ("10e2", 1000.0),
            ("10e+0", 10.0),
            ("10e+1", 100.0),
            ("10e-2", 0.1),
            ("10.e1", 100.0),
            ("10.e-1", 1.0),
            (".1e1", 1.0),
            ("10E2", 1000.0),
        ];

        // Test integers.
        for &(s, v) in &ints {
            t!(Code[" /"]: s => Int(v));
        }

        // Test floats.
        for &(s, v) in &floats {
            t!(Code[" /"]: s => Float(v));
        }

        // Test attached numbers.
        t!(Code[" /"]: ".2.3" => Float(0.2), Float(0.3));
        t!(Code[" /"]: "1.2.3" => Float(1.2), Float(0.3));
        t!(Code[" /"]: "1e-2+3" => Float(0.01), Plus, Int(3));

        // Test float from too large integer.
        let large = i64::MAX as f64 + 1.0;
        t!(Code[" /"]: large.to_string() => Float(large));

        // Combined integers and floats.
        let nums = ints.iter().map(|&(k, v)| (k, v as f64)).chain(floats);

        let suffixes: &[(&str, fn(f64) -> NodeKind)] = &[
            ("mm", |x| Numeric(x, Unit::Length(LengthUnit::Mm))),
            ("pt", |x| Numeric(x, Unit::Length(LengthUnit::Pt))),
            ("cm", |x| Numeric(x, Unit::Length(LengthUnit::Cm))),
            ("in", |x| Numeric(x, Unit::Length(LengthUnit::In))),
            ("rad", |x| Numeric(x, Unit::Angle(AngleUnit::Rad))),
            ("deg", |x| Numeric(x, Unit::Angle(AngleUnit::Deg))),
            ("em", |x| Numeric(x, Unit::Em)),
            ("fr", |x| Numeric(x, Unit::Fr)),
            ("%", |x| Numeric(x, Unit::Percent)),
        ];

        // Numeric types.
        for &(suffix, build) in suffixes {
            for (s, v) in nums.clone() {
                t!(Code[" /"]: format!("{}{}", s, suffix) => build(v));
            }
        }

        // Multiple dots close the number.
        t!(Code[" /"]: "1..2" => Int(1), Dots, Int(2));
        t!(Code[" /"]: "1..2.3" => Int(1), Dots, Float(2.3));
        t!(Code[" /"]: "1.2..3" => Float(1.2), Dots, Int(3));
    }

    #[test]
    fn test_tokenize_strings() {
        // Test basic strings.
        t!(Code: "\"hi\"" => Str("hi"));
        t!(Code: "\"hi\nthere\"" => Str("hi\nthere"));
        t!(Code: "\"🌎\"" => Str("🌎"));

        // Test unterminated.
        t!(Code[""]: "\"hi" => Error(End, "expected quote"));

        // Test escaped quote.
        t!(Code: r#""a\"bc""# => Str("a\"bc"));
        t!(Code[""]: r#""\""# => Error(End, "expected quote"));
    }

    #[test]
    fn test_tokenize_line_comments() {
        // Test line comment with no trailing newline.
        t!(Both[""]: "//" => LineComment);

        // Test line comment ends at newline.
        t!(Both["a1/"]: "//bc\n" => LineComment, Space(1));
        t!(Both["a1/"]: "// bc \n" => LineComment, Space(1));
        t!(Both["a1/"]: "//bc\r\n" => LineComment, Space(1));

        // Test nested line comments.
        t!(Both["a1/"]: "//a//b\n" => LineComment, Space(1));
    }

    #[test]
    fn test_tokenize_block_comments() {
        // Test basic block comments.
        t!(Both[""]: "/*" => BlockComment);
        t!(Both: "/**/" => BlockComment);
        t!(Both: "/*🏞*/" => BlockComment);
        t!(Both: "/*\n*/" => BlockComment);

        // Test depth 1 and 2 nested block comments.
        t!(Both: "/* /* */ */" => BlockComment);
        t!(Both: "/*/*/**/*/*/" => BlockComment);

        // Test two nested, one unclosed block comments.
        t!(Both[""]: "/*/*/**/*/" => BlockComment);

        // Test all combinations of up to two following slashes and stars.
        t!(Both[""]: "/*" => BlockComment);
        t!(Both[""]: "/*/" => BlockComment);
        t!(Both[""]: "/**" => BlockComment);
        t!(Both[""]: "/*//" => BlockComment);
        t!(Both[""]: "/*/*" => BlockComment);
        t!(Both[""]: "/**/" => BlockComment);
        t!(Both[""]: "/***" => BlockComment);
    }

    #[test]
    fn test_tokenize_invalid() {
        // Test invalidly closed block comments.
        t!(Both: "*/" => Invalid("*/"));
        t!(Both: "/**/*/" => BlockComment, Invalid("*/"));

        // Test invalid expressions.
        t!(Code: r"\" => Invalid(r"\"));
        t!(Code: "🌓" => Invalid("🌓"));
        t!(Code: r"\:" => Invalid(r"\"), Colon);
        t!(Code: "meal⌚" => Ident("meal"), Invalid("⌚"));
        t!(Code[" /"]: r"\a" => Invalid(r"\"), Ident("a"));
        t!(Code[" /"]: "#" => Invalid("#"));

        // Test invalid number suffixes.
        t!(Code[" /"]: "1foo" => Invalid("1foo"));
        t!(Code: "1p%" => Invalid("1p"), Invalid("%"));
        t!(Code: "1%%" => Numeric(1.0, Unit::Percent), Invalid("%"));
    }
}