use ecow::{eco_format, EcoString};
use unicode_ident::{is_xid_continue, is_xid_start};
use unicode_script::{Script, UnicodeScript};
use unicode_segmentation::UnicodeSegmentation;
use unscanny::Scanner;
use crate::{SyntaxError, SyntaxKind, SyntaxNode};
/// Splits up a string of source code into tokens.
#[derive(Clone)]
pub(super) struct Lexer<'s> {
/// The underlying scanner.
s: Scanner<'s>,
/// The mode the lexer is in. This determines which kinds of tokens it
/// produces.
mode: LexMode,
/// Whether the last token contained a newline.
newline: bool,
/// The state held by raw line lexing.
raw: Vec<(SyntaxKind, usize)>,
/// An error for the last token.
error: Option<SyntaxError>,
}
/// What kind of tokens to emit.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub(super) enum LexMode {
/// Text and markup.
Markup,
/// Math atoms, operators, etc.
Math,
/// Keywords, literals and operators.
Code,
/// The contents of a raw block.
Raw,
}
impl<'s> Lexer<'s> {
/// Create a new lexer with the given mode.
pub fn new(text: &'s str, mode: LexMode) -> Self {
Self {
s: Scanner::new(text),
mode,
newline: false,
error: None,
raw: Vec::new(),
}
}
/// Get the current lexing mode.
pub fn mode(&self) -> LexMode {
self.mode
}
/// Change the lexing mode.
pub fn set_mode(&mut self, mode: LexMode) {
self.mode = mode;
}
/// The index in the string at which the last token ends and the next token
/// will start.
pub fn cursor(&self) -> usize {
self.s.cursor()
}
/// Jump to the given index in the string.
pub fn jump(&mut self, index: usize) {
self.s.jump(index);
}
/// Whether the last token contained a newline.
pub fn newline(&self) -> bool {
self.newline
}
/// Take out the last error, if any.
fn take_error(&mut self) -> Option<SyntaxError> {
self.error.take()
}
}
impl Lexer<'_> {
/// Construct a full-positioned syntax error.
fn error(&mut self, message: impl Into<EcoString>) -> SyntaxKind {
self.error = Some(SyntaxError::new(message));
SyntaxKind::Error
}
/// If the current node is an error, adds a hint.
fn hint(&mut self, message: impl Into<EcoString>) {
if let Some(error) = &mut self.error {
error.hints.push(message.into());
}
}
}
/// Methods shared by all [`LexMode`]s.
impl Lexer<'_> {
/// Proceed to the next token and return a [`SyntaxNode`] containing it.
///
/// Note that the token could be [trivia](SyntaxKind::is_trivia). Also, the
/// returned node is not always a leaf: it may be an inner node with a
/// subtree when it is simpler to parse at the character level than at the
/// token level, as in [`annotation`](Lexer::annotation).
pub fn next(&mut self) -> SyntaxNode {
if self.mode == LexMode::Raw {
let Some((kind, end)) = self.raw.pop() else {
return SyntaxNode::end();
};
let start = self.s.cursor();
self.s.jump(end);
return self.emit_token(kind, start);
}
self.newline = false;
self.error = None;
let start = self.s.cursor();
let token = match self.s.eat() {
Some(c) if is_space(c, self.mode) => self.whitespace(start, c),
Some('/') if self.s.eat_if('/') => {
return self.line_comment_or_annotation(start);
}
Some('/') if self.s.eat_if('*') => self.block_comment(),
Some('*') if self.s.eat_if('/') => {
let kind = self.error("unexpected end of block comment");
self.hint(
"consider escaping the `*` with a backslash or \
opening the block comment with `/*`",
);
kind
}
Some(c) => match self.mode {
LexMode::Markup => self.markup(start, c),
LexMode::Math => self.math(start, c),
LexMode::Code => self.code(start, c),
LexMode::Raw => unreachable!(),
},
None => SyntaxKind::End,
};
self.emit_token(token, start)
}
/// Constructs an error node with the given message.
///
/// The node's inner text is taken from the given start position up to and
/// including the current cursor position.
fn emit_error(&self, message: impl Into<EcoString>, start: usize) -> SyntaxNode {
let text = self.s.from(start);
SyntaxNode::error(SyntaxError::new(message), text)
}
/// Converts a token into a syntax node based on its kind. Produces an
/// error node if there are errors.
///
/// The node's inner text is taken from the given start position up to and
/// including the current cursor position.
fn emit_token(&mut self, kind: SyntaxKind, start: usize) -> SyntaxNode {
let text = self.s.from(start);
if kind == SyntaxKind::End {
SyntaxNode::end()
} else if let Some(error) = self.take_error() {
SyntaxNode::error(error, text)
} else {
SyntaxNode::leaf(kind, text)
}
}
/// Eat whitespace characters greedily.
fn whitespace(&mut self, start: usize, c: char) -> SyntaxKind {
let more = self.s.eat_while(|c| is_space(c, self.mode));
let newlines = match c {
' ' if more.is_empty() => 0,
_ => count_newlines(self.s.from(start)),
};
self.newline = newlines > 0;
if self.mode == LexMode::Markup && newlines >= 2 {
SyntaxKind::Parbreak
} else {
SyntaxKind::Space
}
}
/// Parses an annotation if the line comment has the form
/// `// @something`
///
/// Otherwise, parses a regular line comment.
fn line_comment_or_annotation(&mut self, start: usize) -> SyntaxNode {
self.s.eat_while(is_inline_whitespace);
if self.s.eat_if('@') {
return self.annotation(start);
}
self.s.eat_until(is_newline);
self.emit_token(SyntaxKind::LineComment, start)
}
/// Lex a block comment, keeping track of nesting so that, for example,
/// `/* a /* b */ c */` is lexed as a single comment.
fn block_comment(&mut self) -> SyntaxKind {
let mut state = '_';
let mut depth = 1;
// Find the first `*/` that does not correspond to a nested `/*`.
while let Some(c) = self.s.eat() {
state = match (state, c) {
('*', '/') => {
depth -= 1;
if depth == 0 {
break;
}
'_'
}
('/', '*') => {
depth += 1;
'_'
}
_ => c,
}
}
SyntaxKind::BlockComment
}
}
/// Annotation lexing and auxiliary methods.
impl Lexer<'_> {
/// Lexes and parses an annotation into a complete syntax subtree.
///
/// The lexer is fully responsible for the annotation, as it is simpler to
/// parse annotations at the character level, given that they follow a very
/// simple and rigid structure of the form
/// `// @annotation-name("string argument1", "string argument2")`
/// with optional whitespace and comments between arguments.
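///
/// For example (illustrative), the line `// @allow("unused")` lexes into an
/// [`Annotation`](SyntaxKind::Annotation) inner node whose children are an
/// `AnnotationMarker` covering `// @`, an `AnnotationName` (`allow`), a
/// `LeftParen`, a `Str`, and a `RightParen`.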
fn annotation(&mut self, start: usize) -> SyntaxNode {
// Start by lexing the marker.
let marker = self.emit_token(SyntaxKind::AnnotationMarker, start);
let mut subtree = vec![marker];
let current_start = self.s.cursor();
// Ignore initial non-newline whitespaces.
if !self.s.eat_while(is_inline_whitespace).is_empty() {
subtree.push(self.emit_token(SyntaxKind::Space, current_start));
}
// Lex the annotation name.
let current_start = self.s.cursor();
if !self.s.eat_if(is_id_start) {
self.s.eat_until(is_newline);
subtree.push(self.emit_error("expected identifier", current_start));
// Return the annotation subtree, whose last child is an error node
// covering the rest of the line.
return SyntaxNode::inner(SyntaxKind::Annotation, subtree);
}
let name = self.annotation_name(current_start);
subtree.push(self.emit_token(name, current_start));
// Optional left parenthesis before annotation arguments.
let current_start = self.s.cursor();
let has_opening_paren = self.s.eat_if('(');
if has_opening_paren {
subtree.push(self.emit_token(SyntaxKind::LeftParen, current_start));
}
// Annotation arguments:
// Keep reading until we find a right parenthesis (if we got a left
// parenthesis) or a newline. We have to check for the newline before
// eating (through `.at()`) to ensure it is not considered part of the
// annotation. Newlines are exceptionally allowed inside an annotation
// if arguments are surrounded by parentheses.
//
// Each argument may be either an identifier or a string, and arguments
// are separated by spaces. Any other characters are invalid.
let mut found_closing_paren = false;
while !self.s.at(is_newline)
|| has_opening_paren
&& !found_closing_paren
&& self.eat_annotation_linebreak(&mut subtree)
{
let current_start = self.s.cursor();
let token = match self.s.eat() {
Some(c) if c.is_whitespace() => {
self.s.eat_while(is_inline_whitespace);
SyntaxKind::Space
}
Some(_) if found_closing_paren => {
// After the arguments are finished, only whitespace may follow
// until the line ends.
self.s.eat_until(char::is_whitespace);
self.error("unexpected characters after end of annotation")
}
Some(c) if is_id_start(c) => {
self.s.eat_while(is_id_continue);
SyntaxKind::Ident
}
Some('"') => self.annotation_string(),
Some(')') if has_opening_paren => {
found_closing_paren = true;
SyntaxKind::RightParen
}
// Explicitly detect comments for more helpful errors.
Some('/') if self.s.at(['/', '*']) => {
if self.s.eat() == Some('*') {
// Found a block comment. Advance until the next newline or
// `*/` just for a more accurate error span, taking care not
// to loop forever at the end of input.
while !self.s.done() && !self.s.eat_if("*/") && !self.s.at(is_newline) {
self.s.eat();
}
} else {
self.s.eat_until(is_newline);
}
self.error("unexpected comment inside annotation")
}
Some(_) => {
self.s.eat_until(|c: char| {
c.is_whitespace() || has_opening_paren && c == ')'
});
self.error(eco_format!(
"expected identifier{} in annotation",
if has_opening_paren {
", string or closing paren"
} else {
" or string"
}
))
}
None => break,
};
let node = self.emit_token(token, current_start);
subtree.push(node);
}
// The closing parenthesis itself is handled in the loop above; here we
// only report one that is missing.
if has_opening_paren && !found_closing_paren {
subtree.push(
self.emit_error(
"expected closing paren after annotation",
self.s.cursor(),
),
);
}
SyntaxNode::inner(SyntaxKind::Annotation, subtree)
}
/// Lexes an annotation name.
///
/// An annotation name is an identifier within a specific subset of allowed
/// identifiers. Currently, `allow` is the only valid annotation name.
fn annotation_name(&mut self, start: usize) -> SyntaxKind {
self.s.eat_while(is_id_continue);
let ident = self.s.from(start);
if ident == "allow" {
SyntaxKind::AnnotationName
} else {
self.error(eco_format!("invalid annotation name"));
self.hint("must be 'allow'");
SyntaxKind::Error
}
}
/// Lexes a string in an annotation.
///
/// Currently, such strings only allow a very restricted set of characters.
/// These restrictions may be lifted in the future.
fn annotation_string(&mut self) -> SyntaxKind {
// TODO: Allow more characters in annotations' strings, perhaps allowing
// newlines somehow.
// Could perhaps use one // per line so we can break an annotation into
// multiple lines in a sensible way.
let start = self.s.cursor();
self.s.eat_while(|c| !is_newline(c) && c != '"');
let content = self.s.from(start);
if !self.s.eat_if('"') {
return self.error("unclosed string");
}
if let Some(c) = content.chars().find(|c| !is_valid_in_annotation_string(*c)) {
return self
.error(eco_format!("invalid character '{c}' in an annotation's string"));
}
SyntaxKind::Str
}
/// Tries to continue the annotation on the next line, which must start with
/// a comment marker (`//`). If the marker is not present, the lexer's
/// position is restored and the annotation is interrupted.
fn eat_annotation_linebreak(&mut self, subtree: &mut Vec<SyntaxNode>) -> bool {
let start = self.s.cursor();
self.s.eat_newline();
self.s.eat_while(is_inline_whitespace);
if self.s.at("//") {
subtree.push(self.emit_token(SyntaxKind::Space, start));
let marker_start = self.s.cursor();
self.s.eat();
self.s.eat();
subtree.push(self.emit_token(SyntaxKind::AnnotationMarker, marker_start));
true
} else {
// No annotation continuation marker on the next line, so we
// interrupt the annotation.
self.s.jump(start);
false
}
}
}
/// Markup.
impl Lexer<'_> {
fn markup(&mut self, start: usize, c: char) -> SyntaxKind {
match c {
'\\' => self.backslash(),
'`' => self.raw(),
'h' if self.s.eat_if("ttp://") => self.link(),
'h' if self.s.eat_if("ttps://") => self.link(),
'<' if self.s.at(is_id_continue) => self.label(),
'@' => self.ref_marker(),
'.' if self.s.eat_if("..") => SyntaxKind::Shorthand,
'-' if self.s.eat_if("--") => SyntaxKind::Shorthand,
'-' if self.s.eat_if('-') => SyntaxKind::Shorthand,
'-' if self.s.eat_if('?') => SyntaxKind::Shorthand,
'-' if self.s.at(char::is_numeric) => SyntaxKind::Shorthand,
'*' if !self.in_word() => SyntaxKind::Star,
'_' if !self.in_word() => SyntaxKind::Underscore,
'#' => SyntaxKind::Hash,
'[' => SyntaxKind::LeftBracket,
']' => SyntaxKind::RightBracket,
'\'' => SyntaxKind::SmartQuote,
'"' => SyntaxKind::SmartQuote,
'$' => SyntaxKind::Dollar,
'~' => SyntaxKind::Shorthand,
':' => SyntaxKind::Colon,
'=' => {
self.s.eat_while('=');
if self.space_or_end() {
SyntaxKind::HeadingMarker
} else {
self.text()
}
}
'-' if self.space_or_end() => SyntaxKind::ListMarker,
'+' if self.space_or_end() => SyntaxKind::EnumMarker,
'/' if self.space_or_end() => SyntaxKind::TermMarker,
'0'..='9' => self.numbering(start),
_ => self.text(),
}
}
fn backslash(&mut self) -> SyntaxKind {
if self.s.eat_if("u{") {
let hex = self.s.eat_while(char::is_ascii_alphanumeric);
if !self.s.eat_if('}') {
return self.error("unclosed Unicode escape sequence");
}
if u32::from_str_radix(hex, 16)
.ok()
.and_then(std::char::from_u32)
.is_none()
{
return self.error(eco_format!("invalid Unicode codepoint: {}", hex));
}
return SyntaxKind::Escape;
}
if self.s.done() || self.s.at(char::is_whitespace) {
SyntaxKind::Linebreak
} else {
self.s.eat();
SyntaxKind::Escape
}
}
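/// Lex an entire raw segment (`` `...` `` or a fenced block with three or
/// more backticks) in one go: the tokens after the opening delimiter are
/// precomputed into `self.raw` and then handed out one at a time by `next`
/// while the lexer is in [`LexMode::Raw`].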
fn raw(&mut self) -> SyntaxKind {
let start = self.s.cursor() - 1;
self.raw.clear();
// Determine number of opening backticks.
let mut backticks = 1;
while self.s.eat_if('`') {
backticks += 1;
}
// Special case for `` (an empty raw block): emit two delimiters with
// nothing in between.
if backticks == 2 {
self.push_raw(SyntaxKind::RawDelim);
self.s.jump(start + 1);
return SyntaxKind::RawDelim;
}
// Find end of raw text.
let mut found = 0;
while found < backticks {
match self.s.eat() {
Some('`') => found += 1,
Some(_) => found = 0,
None => break,
}
}
if found != backticks {
return self.error("unclosed raw text");
}
let end = self.s.cursor();
if backticks >= 3 {
self.blocky_raw(start, end, backticks);
} else {
self.inline_raw(start, end, backticks);
}
// Closing delimiter.
self.push_raw(SyntaxKind::RawDelim);
// Reverse the saved tokens, since they will be popped from the back in `next`.
self.raw.reverse();
// Opening delimiter.
self.s.jump(start + backticks);
SyntaxKind::RawDelim
}
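/// Lex the contents of a raw block with three or more backticks: an optional
/// language tag, then the lines of text dedented by their common leading
/// whitespace, recorded as `RawTrimmed` and `Text` segments.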
fn blocky_raw(&mut self, start: usize, end: usize, backticks: usize) {
// Language tag.
self.s.jump(start + backticks);
if self.s.eat_if(is_id_start) {
self.s.eat_while(is_id_continue);
self.push_raw(SyntaxKind::RawLang);
}
// Determine inner content between backticks.
self.s.eat_if(' ');
let inner = self.s.to(end - backticks);
// Determine dedent level.
let mut lines = split_newlines(inner);
let dedent = lines
.iter()
.skip(1)
.filter(|line| !line.chars().all(char::is_whitespace))
// The line with the closing ``` is always taken into account
.chain(lines.last())
.map(|line| line.chars().take_while(|c| c.is_whitespace()).count())
.min()
.unwrap_or(0);
// Trim single space in last line if text ends with a backtick. The last
// line is the one directly before the closing backticks and if it is
// just whitespace, it will be completely trimmed below.
if inner.trim_end().ends_with('`') {
if let Some(last) = lines.last_mut() {
*last = last.strip_suffix(' ').unwrap_or(last);
}
}
let is_whitespace = |line: &&str| line.chars().all(char::is_whitespace);
let starts_whitespace = lines.first().is_some_and(is_whitespace);
let ends_whitespace = lines.last().is_some_and(is_whitespace);
let mut lines = lines.into_iter();
let mut skipped = false;
// Trim whitespace + newline at start.
if starts_whitespace {
self.s.advance(lines.next().unwrap().len());
skipped = true;
}
// Trim whitespace + newline at end.
if ends_whitespace {
lines.next_back();
}
// Add lines.
for (i, line) in lines.enumerate() {
let dedent = if i == 0 && !skipped { 0 } else { dedent };
let offset: usize = line.chars().take(dedent).map(char::len_utf8).sum();
self.s.eat_newline();
self.s.advance(offset);
self.push_raw(SyntaxKind::RawTrimmed);
self.s.advance(line.len() - offset);
self.push_raw(SyntaxKind::Text);
}
// Add final trimmed.
if self.s.cursor() < end - backticks {
self.s.jump(end - backticks);
self.push_raw(SyntaxKind::RawTrimmed);
}
self.s.jump(end);
}
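/// Lex the contents of a single-backtick raw span as `Text` segments
/// separated by `RawTrimmed` newlines.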
fn inline_raw(&mut self, start: usize, end: usize, backticks: usize) {
self.s.jump(start + backticks);
while self.s.cursor() < end - backticks {
if self.s.at(is_newline) {
self.push_raw(SyntaxKind::Text);
self.s.eat_newline();
self.push_raw(SyntaxKind::RawTrimmed);
continue;
}
self.s.eat();
}
self.push_raw(SyntaxKind::Text);
self.s.jump(end);
}
/// Record the current cursor position as the end of a raw segment of the
/// given `kind`.
fn push_raw(&mut self, kind: SyntaxKind) {
let end = self.s.cursor();
self.raw.push((kind, end));
}
fn link(&mut self) -> SyntaxKind {
let (link, balanced) = link_prefix(self.s.after());
self.s.advance(link.len());
if !balanced {
return self.error(
"automatic links cannot contain unbalanced brackets, \
use the `link` function instead",
);
}
SyntaxKind::Link
}
fn numbering(&mut self, start: usize) -> SyntaxKind {
self.s.eat_while(char::is_ascii_digit);
let read = self.s.from(start);
if self.s.eat_if('.') && self.space_or_end() && read.parse::<usize>().is_ok() {
return SyntaxKind::EnumMarker;
}
self.text()
}
fn ref_marker(&mut self) -> SyntaxKind {
self.s.eat_while(is_valid_in_label_literal);
// Don't include the trailing characters likely to be part of text.
while matches!(self.s.scout(-1), Some('.' | ':')) {
self.s.uneat();
}
SyntaxKind::RefMarker
}
fn label(&mut self) -> SyntaxKind {
let label = self.s.eat_while(is_valid_in_label_literal);
if label.is_empty() {
return self.error("label cannot be empty");
}
if !self.s.eat_if('>') {
return self.error("unclosed label");
}
SyntaxKind::Label
}
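/// Lex plain markup text. The lookup table lists ASCII characters that may
/// start another token; after stopping at one of them, a one-character
/// lookahead merges characters that would become text anyway (e.g. a lone
/// `-` that is not part of a shorthand) back into the same `Text` node.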
fn text(&mut self) -> SyntaxKind {
macro_rules! table {
($(|$c:literal)*) => {
static TABLE: [bool; 128] = {
let mut t = [false; 128];
$(t[$c as usize] = true;)*
t
};
};
}
table! {
| ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/'
| '[' | ']' | '~' | '-' | '.' | '\'' | '"' | '*' | '_'
| ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#'
};
loop {
self.s.eat_until(|c: char| {
TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
});
// Continue with the same text node if the next character would become
// text anyway.
let mut s = self.s;
match s.eat() {
Some(' ') if s.at(char::is_alphanumeric) => {}
Some('/') if !s.at(['/', '*']) => {}
Some('-') if !s.at(['-', '?']) => {}
Some('.') if !s.at("..") => {}
Some('h') if !s.at("ttp://") && !s.at("ttps://") => {}
Some('@') if !s.at(is_valid_in_label_literal) => {}
_ => break,
}
self.s = s;
}
SyntaxKind::Text
}
fn in_word(&self) -> bool {
let wordy = |c: Option<char>| {
c.is_some_and(|c| {
c.is_alphanumeric()
&& !matches!(
c.script(),
Script::Han
| Script::Hiragana
| Script::Katakana
| Script::Hangul
)
})
};
let prev = self.s.scout(-2);
let next = self.s.peek();
wordy(prev) && wordy(next)
}
fn space_or_end(&self) -> bool {
self.s.done() || self.s.at(char::is_whitespace)
}
}
/// Math.
impl Lexer<'_> {
fn math(&mut self, start: usize, c: char) -> SyntaxKind {
match c {
'\\' => self.backslash(),
'"' => self.string(),
'-' if self.s.eat_if(">>") => SyntaxKind::Shorthand,
'-' if self.s.eat_if('>') => SyntaxKind::Shorthand,
'-' if self.s.eat_if("->") => SyntaxKind::Shorthand,
':' if self.s.eat_if('=') => SyntaxKind::Shorthand,
':' if self.s.eat_if(":=") => SyntaxKind::Shorthand,
'!' if self.s.eat_if('=') => SyntaxKind::Shorthand,
'.' if self.s.eat_if("..") => SyntaxKind::Shorthand,
'[' if self.s.eat_if('|') => SyntaxKind::Shorthand,
'<' if self.s.eat_if("==>") => SyntaxKind::Shorthand,
'<' if self.s.eat_if("-->") => SyntaxKind::Shorthand,
'<' if self.s.eat_if("--") => SyntaxKind::Shorthand,
'<' if self.s.eat_if("-<") => SyntaxKind::Shorthand,
'<' if self.s.eat_if("->") => SyntaxKind::Shorthand,
'<' if self.s.eat_if("<-") => SyntaxKind::Shorthand,
'<' if self.s.eat_if("<<") => SyntaxKind::Shorthand,
'<' if self.s.eat_if("=>") => SyntaxKind::Shorthand,
'<' if self.s.eat_if("==") => SyntaxKind::Shorthand,
'<' if self.s.eat_if("~~") => SyntaxKind::Shorthand,
'<' if self.s.eat_if('=') => SyntaxKind::Shorthand,
'<' if self.s.eat_if('<') => SyntaxKind::Shorthand,
'<' if self.s.eat_if('-') => SyntaxKind::Shorthand,
'<' if self.s.eat_if('~') => SyntaxKind::Shorthand,
'>' if self.s.eat_if("->") => SyntaxKind::Shorthand,
'>' if self.s.eat_if(">>") => SyntaxKind::Shorthand,
'=' if self.s.eat_if("=>") => SyntaxKind::Shorthand,
'=' if self.s.eat_if('>') => SyntaxKind::Shorthand,
'=' if self.s.eat_if(':') => SyntaxKind::Shorthand,
'>' if self.s.eat_if('=') => SyntaxKind::Shorthand,
'>' if self.s.eat_if('>') => SyntaxKind::Shorthand,
'|' if self.s.eat_if("->") => SyntaxKind::Shorthand,
'|' if self.s.eat_if("=>") => SyntaxKind::Shorthand,
'|' if self.s.eat_if(']') => SyntaxKind::Shorthand,
'|' if self.s.eat_if('|') => SyntaxKind::Shorthand,
'~' if self.s.eat_if("~>") => SyntaxKind::Shorthand,
'~' if self.s.eat_if('>') => SyntaxKind::Shorthand,
'*' | '-' => SyntaxKind::Shorthand,
'#' => SyntaxKind::Hash,
'_' => SyntaxKind::Underscore,
'$' => SyntaxKind::Dollar,
'/' => SyntaxKind::Slash,
'^' => SyntaxKind::Hat,
'\'' => SyntaxKind::Prime,
'&' => SyntaxKind::MathAlignPoint,
'√' | '∛' | '∜' => SyntaxKind::Root,
// Identifiers.
c if is_math_id_start(c) && self.s.at(is_math_id_continue) => {
self.s.eat_while(is_math_id_continue);
SyntaxKind::MathIdent
}
// Other math atoms.
_ => self.math_text(start, c),
}
}
fn math_text(&mut self, start: usize, c: char) -> SyntaxKind {
// Keep numbers and grapheme clusters together.
if c.is_numeric() {
self.s.eat_while(char::is_numeric);
let mut s = self.s;
if s.eat_if('.') && !s.eat_while(char::is_numeric).is_empty() {
self.s = s;
}
} else {
let len = self
.s
.get(start..self.s.string().len())
.graphemes(true)
.next()
.map_or(0, str::len);
self.s.jump(start + len);
}
SyntaxKind::Text
}
}
/// Code.
impl Lexer<'_> {
fn code(&mut self, start: usize, c: char) -> SyntaxKind {
match c {
'`' => self.raw(),
'<' if self.s.at(is_id_continue) => self.label(),
'0'..='9' => self.number(start, c),
'.' if self.s.at(char::is_ascii_digit) => self.number(start, c),
'"' => self.string(),
'=' if self.s.eat_if('=') => SyntaxKind::EqEq,
'!' if self.s.eat_if('=') => SyntaxKind::ExclEq,
'<' if self.s.eat_if('=') => SyntaxKind::LtEq,
'>' if self.s.eat_if('=') => SyntaxKind::GtEq,
'+' if self.s.eat_if('=') => SyntaxKind::PlusEq,
'-' | '\u{2212}' if self.s.eat_if('=') => SyntaxKind::HyphEq,
'*' if self.s.eat_if('=') => SyntaxKind::StarEq,
'/' if self.s.eat_if('=') => SyntaxKind::SlashEq,
'.' if self.s.eat_if('.') => SyntaxKind::Dots,
'=' if self.s.eat_if('>') => SyntaxKind::Arrow,
'{' => SyntaxKind::LeftBrace,
'}' => SyntaxKind::RightBrace,
'[' => SyntaxKind::LeftBracket,
']' => SyntaxKind::RightBracket,
'(' => SyntaxKind::LeftParen,
')' => SyntaxKind::RightParen,
'$' => SyntaxKind::Dollar,
',' => SyntaxKind::Comma,
';' => SyntaxKind::Semicolon,
':' => SyntaxKind::Colon,
'.' => SyntaxKind::Dot,
'+' => SyntaxKind::Plus,
'-' | '\u{2212}' => SyntaxKind::Minus,
'*' => SyntaxKind::Star,
'/' => SyntaxKind::Slash,
'=' => SyntaxKind::Eq,
'<' => SyntaxKind::Lt,
'>' => SyntaxKind::Gt,
c if is_id_start(c) => self.ident(start),
c => self.error(eco_format!("the character `{c}` is not valid in code")),
}
}
fn ident(&mut self, start: usize) -> SyntaxKind {
self.s.eat_while(is_id_continue);
let ident = self.s.from(start);
let prev = self.s.get(0..start);
if !prev.ends_with(['.', '@']) || prev.ends_with("..") {
if let Some(keyword) = keyword(ident) {
return keyword;
}
}
if ident == "_" {
SyntaxKind::Underscore
} else {
SyntaxKind::Ident
}
}
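/// Lex a number literal with an optional unit suffix, e.g. (illustrative)
/// `7`, `0x1f`, `2.5`, `1e-3`, `40%`, or `12pt`.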
fn number(&mut self, mut start: usize, c: char) -> SyntaxKind {
// Handle alternative integer bases.
let mut base = 10;
if c == '0' {
if self.s.eat_if('b') {
base = 2;
} else if self.s.eat_if('o') {
base = 8;
} else if self.s.eat_if('x') {
base = 16;
}
if base != 10 {
start = self.s.cursor();
}
}
// Read the first part (the integer part, or the fractional part if `c` was a dot).
self.s.eat_while(if base == 16 {
char::is_ascii_alphanumeric
} else {
char::is_ascii_digit
});
// Read the fractional part if not already done.
// Make sure not to confuse a range for the decimal separator.
if c != '.'
&& !self.s.at("..")
&& !self.s.scout(1).is_some_and(is_id_start)
&& self.s.eat_if('.')
&& base == 10
{
self.s.eat_while(char::is_ascii_digit);
}
// Read the exponent.
if !self.s.at("em") && self.s.eat_if(['e', 'E']) && base == 10 {
self.s.eat_if(['+', '-']);
self.s.eat_while(char::is_ascii_digit);
}
// Read the suffix.
let suffix_start = self.s.cursor();
if !self.s.eat_if('%') {
self.s.eat_while(char::is_ascii_alphanumeric);
}
let number = self.s.get(start..suffix_start);
let suffix = self.s.from(suffix_start);
let kind = if i64::from_str_radix(number, base).is_ok() {
SyntaxKind::Int
} else if base == 10 && number.parse::<f64>().is_ok() {
SyntaxKind::Float
} else {
return self.error(match base {
2 => eco_format!("invalid binary number: 0b{}", number),
8 => eco_format!("invalid octal number: 0o{}", number),
16 => eco_format!("invalid hexadecimal number: 0x{}", number),
_ => eco_format!("invalid number: {}", number),
});
};
if suffix.is_empty() {
return kind;
}
if !matches!(
suffix,
"pt" | "mm" | "cm" | "in" | "deg" | "rad" | "em" | "fr" | "%"
) {
return self.error(eco_format!("invalid number suffix: {}", suffix));
}
SyntaxKind::Numeric
}
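/// Lex a double-quoted string, skipping over backslash-escaped quotes.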
fn string(&mut self) -> SyntaxKind {
let mut escaped = false;
self.s.eat_until(|c| {
let stop = c == '"' && !escaped;
escaped = c == '\\' && !escaped;
stop
});
if !self.s.eat_if('"') {
return self.error("unclosed string");
}
SyntaxKind::Str
}
}
/// Try to parse an identifier into a keyword.
fn keyword(ident: &str) -> Option<SyntaxKind> {
Some(match ident {
"none" => SyntaxKind::None,
"auto" => SyntaxKind::Auto,
"true" => SyntaxKind::Bool,
"false" => SyntaxKind::Bool,
"not" => SyntaxKind::Not,
"and" => SyntaxKind::And,
"or" => SyntaxKind::Or,
"let" => SyntaxKind::Let,
"set" => SyntaxKind::Set,
"show" => SyntaxKind::Show,
"context" => SyntaxKind::Context,
"if" => SyntaxKind::If,
"else" => SyntaxKind::Else,
"for" => SyntaxKind::For,
"in" => SyntaxKind::In,
"while" => SyntaxKind::While,
"break" => SyntaxKind::Break,
"continue" => SyntaxKind::Continue,
"return" => SyntaxKind::Return,
"import" => SyntaxKind::Import,
"include" => SyntaxKind::Include,
"as" => SyntaxKind::As,
_ => return None,
})
}
/// Small extensions to [`Scanner`] used by the lexer.
trait ScannerExt {
fn advance(&mut self, by: usize);
fn eat_newline(&mut self) -> bool;
}
impl ScannerExt for Scanner<'_> {
fn advance(&mut self, by: usize) {
self.jump(self.cursor() + by);
}
fn eat_newline(&mut self) -> bool {
let ate = self.eat_if(is_newline);
if ate && self.before().ends_with('\r') {
self.eat_if('\n');
}
ate
}
}
/// Whether a character will become a [`SyntaxKind::Space`] token.
#[inline]
fn is_space(character: char, mode: LexMode) -> bool {
match mode {
LexMode::Markup => matches!(character, ' ' | '\t') || is_newline(character),
_ => character.is_whitespace(),
}
}
/// Whether a character is whitespace, but not interpreted as a newline by
/// Typst.
#[inline]
pub fn is_inline_whitespace(character: char) -> bool {
character.is_whitespace() && !is_newline(character)
}
/// Whether a character is interpreted as a newline by Typst.
#[inline]
pub fn is_newline(character: char) -> bool {
matches!(
character,
// Line Feed, Vertical Tab, Form Feed, Carriage Return.
'\n' | '\x0B' | '\x0C' | '\r' |
// Next Line, Line Separator, Paragraph Separator.
'\u{0085}' | '\u{2028}' | '\u{2029}'
)
}
/// Extracts a prefix of the text that is a link and also returns whether the
/// parentheses and brackets in the link were balanced.
pub fn link_prefix(text: &str) -> (&str, bool) {
let mut s = unscanny::Scanner::new(text);
let mut brackets = Vec::new();
#[rustfmt::skip]
s.eat_while(|c: char| {
match c {
| '0' ..= '9'
| 'a' ..= 'z'
| 'A' ..= 'Z'
| '!' | '#' | '$' | '%' | '&' | '*' | '+'
| ',' | '-' | '.' | '/' | ':' | ';' | '='
| '?' | '@' | '_' | '~' | '\'' => true,
'[' => {
brackets.push(b'[');
true
}
'(' => {
brackets.push(b'(');
true
}
']' => brackets.pop() == Some(b'['),
')' => brackets.pop() == Some(b'('),
_ => false,
}
});
// Don't include the trailing characters likely to be part of text.
while matches!(s.scout(-1), Some('!' | ',' | '.' | ':' | ';' | '?' | '\'')) {
s.uneat();
}
(s.before(), brackets.is_empty())
}
/// Split text at newlines. These newline characters are not kept.
pub fn split_newlines(text: &str) -> Vec<&str> {
let mut s = Scanner::new(text);
let mut lines = Vec::new();
let mut start = 0;
let mut end = 0;
while let Some(c) = s.eat() {
if is_newline(c) {
if c == '\r' {
s.eat_if('\n');
}
lines.push(&text[start..end]);
start = s.cursor();
}
end = s.cursor();
}
lines.push(&text[start..]);
lines
}
/// Count the number of newlines in text.
fn count_newlines(text: &str) -> usize {
let mut newlines = 0;
let mut s = Scanner::new(text);
while let Some(c) = s.eat() {
if is_newline(c) {
if c == '\r' {
s.eat_if('\n');
}
newlines += 1;
}
}
newlines
}
/// Count newlines in text. Only counts up to 2 newlines.
pub(crate) fn count_capped_newlines(text: &str) -> u8 {
let mut newlines = 0;
let mut s = Scanner::new(text);
while let Some(c) = s.eat() {
if is_newline(c) {
if c == '\r' {
s.eat_if('\n');
}
newlines += 1;
if newlines == 2 {
break;
}
}
}
newlines
}
/// Whether a string is a valid Typst identifier.
///
/// In addition to what is specified in the [Unicode Standard][uax31], we allow:
/// - `_` as a starting character,
/// - `_` and `-` as continuing characters.
///
/// [uax31]: http://www.unicode.org/reports/tr31/
#[inline]
pub fn is_ident(string: &str) -> bool {
let mut chars = string.chars();
chars
.next()
.is_some_and(|c| is_id_start(c) && chars.all(is_id_continue))
}
/// Whether a character can start an identifier.
#[inline]
pub fn is_id_start(c: char) -> bool {
is_xid_start(c) || c == '_'
}
/// Whether a character can continue an identifier.
#[inline]
pub fn is_id_continue(c: char) -> bool {
is_xid_continue(c) || c == '_' || c == '-'
}
/// Whether a character can start an identifier in math.
#[inline]
fn is_math_id_start(c: char) -> bool {
is_xid_start(c)
}
/// Whether a character can continue an identifier in math.
#[inline]
fn is_math_id_continue(c: char) -> bool {
is_xid_continue(c) && c != '_'
}
/// Whether a character can be part of a label literal's name.
#[inline]
fn is_valid_in_label_literal(c: char) -> bool {
is_id_continue(c) || matches!(c, ':' | '.')
}
/// Whether a character can be part of a string in an annotation.
#[inline]
fn is_valid_in_annotation_string(c: char) -> bool {
is_id_continue(c) || c == '@' || c == '/'
}
/// Whether a string is valid as a label literal's name.
pub fn is_valid_label_literal_id(id: &str) -> bool {
!id.is_empty() && id.chars().all(is_valid_in_label_literal)
}
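// A minimal, illustrative set of sanity checks for the free helper functions
// above. This is a sketch rather than an exhaustive suite and relies only on
// items defined or imported in this file.
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_is_ident() {
// `_` may start and `-` may continue an identifier.
assert!(is_ident("_foo-bar"));
assert!(!is_ident(""));
assert!(!is_ident("1abc"));
}
#[test]
fn test_newline_counting() {
// A `\r\n` pair counts as a single newline.
assert_eq!(count_newlines("a\r\nb\nc"), 2);
// The capped variant never reports more than 2.
assert_eq!(count_capped_newlines("\n\n\n\n"), 2);
}
#[test]
fn test_split_newlines() {
assert_eq!(split_newlines("a\nb\r\nc"), vec!["a", "b", "c"]);
}
#[test]
fn test_link_prefix() {
// Trailing punctuation is not considered part of the link.
let (link, balanced) = link_prefix("https://example.com/a, and more");
assert_eq!(link, "https://example.com/a");
assert!(balanced);
// An unclosed opening parenthesis makes the link unbalanced.
let (_, balanced) = link_prefix("https://example.com/a(b");
assert!(!balanced);
}
#[test]
fn test_keyword() {
assert!(matches!(keyword("let"), Some(SyntaxKind::Let)));
assert!(keyword("hello").is_none());
}
#[test]
fn test_label_literal_id() {
assert!(is_valid_label_literal_id("fig:intro.1"));
assert!(!is_valid_label_literal_id(""));
}
}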