12. Add the Token type and replace lex/skip/save methods

This commit is contained in:
Ian Wrzesinski 2024-10-10 11:57:27 -04:00
parent c466080fb2
commit 91b384ad7b

View File

@ -325,11 +325,7 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) {
_ => p.expected("expression"), _ => p.expected("expression"),
} }
if continuable if continuable && min_prec < 3 && !p.had_trivia() && maybe_delimited(p) {
&& min_prec < 3
&& p.prev_end() == p.current_start()
&& maybe_delimited(p)
{
p.wrap(m, SyntaxKind::Math); p.wrap(m, SyntaxKind::Math);
} }
@ -581,6 +577,8 @@ fn maybe_wrap_in_math(p: &mut Parser, arg: Marker, named: Option<Marker>) {
// Convert 0 exprs into a blank math element (so empty arguments are allowed). // Convert 0 exprs into a blank math element (so empty arguments are allowed).
// Convert 2+ exprs into a math element (so they become a joined sequence). // Convert 2+ exprs into a math element (so they become a joined sequence).
p.wrap_within(arg, p.marker(), SyntaxKind::Math); p.wrap_within(arg, p.marker(), SyntaxKind::Math);
// We need to update `n_trivia` since we no longer have any.
p.token.n_trivia = 0; // TODO: Maybe create a `flush_trivia()` method?
} }
if let Some(m) = named { if let Some(m) = named {
@ -625,14 +623,17 @@ fn embedded_code_expr(p: &mut Parser) {
p.with_mode(LexMode::Code, |p| { p.with_mode(LexMode::Code, |p| {
p.with_nl_mode(AtNewline::Stop, |p| { p.with_nl_mode(AtNewline::Stop, |p| {
p.assert(SyntaxKind::Hash); p.assert(SyntaxKind::Hash);
p.unskip(); if p.had_trivia() {
p.expected("expression");
return;
}
let stmt = p.at_set(set::STMT); let stmt = p.at_set(set::STMT);
let at = p.at_set(set::ATOMIC_CODE_EXPR); let at = p.at_set(set::ATOMIC_CODE_EXPR);
code_expr_prec(p, true, 0); code_expr_prec(p, true, 0);
// Consume error for things like `#12p` or `#"abc\"`.# // Consume error for things like `#12p` or `#"abc\"`.#
if !at && !p.current().is_trivia() && !p.end() { if !at && !p.end() {
p.unexpected(); p.unexpected();
} }
@ -1493,14 +1494,15 @@ fn pattern_leaf<'s>(
/// Manages parsing a stream of tokens into a tree of [`SyntaxNode`]s. /// Manages parsing a stream of tokens into a tree of [`SyntaxNode`]s.
/// ///
/// The implementation presents an interface that investigates a `current` token /// The implementation presents an interface that investigates a current `token`
/// and can take one of the following actions: /// with a [`SyntaxKind`] and can take one of the following actions:
/// ///
/// 1. Eat a token, pushing `current` into the `nodes` vector as a [leaf /// 1. Eat a token: push `token` onto the `nodes` vector as a [leaf
/// node](`SyntaxNode::leaf`) and prepare a new `current` by calling into the /// node](`SyntaxNode::leaf`) and prepare a new `token` by calling into the
/// lexer. /// lexer.
/// 2. Wrap nodes from a marker to the end of `nodes` (excluding `current`) into /// 2. Wrap nodes from a marker to the end of `nodes` (excluding `token` and any
/// an [inner node](`SyntaxNode::inner`) of a specific [`SyntaxKind`]. /// attached trivia) into an [inner node](`SyntaxNode::inner`) of a specific
/// `SyntaxKind`.
/// 3. Produce or convert nodes into an [error node](`SyntaxNode::error`) when /// 3. Produce or convert nodes into an [error node](`SyntaxNode::error`) when
/// something expected is missing or something unexpected is found. /// something expected is missing or something unexpected is found.
/// ///
@ -1525,9 +1527,9 @@ fn pattern_leaf<'s>(
/// pushing onto the end of the `nodes` vector until a non-trivia kind is found. /// pushing onto the end of the `nodes` vector until a non-trivia kind is found.
/// ///
/// The newline mode is used in Code to determine whether a newline should end /// The newline mode is used in Code to determine whether a newline should end
/// the current expression. If so, the parser temporarily changes the current /// the current expression. If so, the parser temporarily changes `token`'s kind
/// token's kind to a fake [`SyntaxKind::End`]. When the parser exits the mode /// to a fake [`SyntaxKind::End`]. When the parser exits the mode the original
/// the original `SyntaxKind` is restored. /// `SyntaxKind` is restored.
struct Parser<'s> { struct Parser<'s> {
/// The source text shared with the lexer. /// The source text shared with the lexer.
text: &'s str, text: &'s str,
@ -1537,21 +1539,16 @@ struct Parser<'s> {
lexer: Lexer<'s>, lexer: Lexer<'s>,
/// The newline mode: whether to insert a temporary end at newlines in Code. /// The newline mode: whether to insert a temporary end at newlines in Code.
nl_mode: AtNewline, nl_mode: AtNewline,
/// The index into `text` of the end of the previous token. /// The current token under inspection, not yet present in `nodes`. This
prev_end: usize, /// acts like a single item of lookahead for the parser.
/// The index into `text` of the start of our current token (the end is ///
/// stored as the lexer's cursor). /// When wrapping, this is _not_ included in the wrapped nodes.
current_start: usize, token: Token,
/// The [`SyntaxKind`] of the current token.
current: SyntaxKind,
/// The [`SyntaxNode`] of the current token, ready to be eaten and pushed
/// onto the end of `nodes`.
current_node: SyntaxNode,
/// Whether the parser has the expected set of open/close delimiters. This /// Whether the parser has the expected set of open/close delimiters. This
/// only ever transitions from `true` to `false`. /// only ever transitions from `true` to `false`.
balanced: bool, balanced: bool,
/// Nodes representing the concrete syntax tree of previously parsed text. /// Nodes representing the concrete syntax tree of previously parsed text.
/// In Code and Math, includes previously parsed trivia, but not `current`. /// In Code and Math, includes previously parsed trivia, but not `token`.
nodes: Vec<SyntaxNode>, nodes: Vec<SyntaxNode>,
/// Parser checkpoints for a given text index. Used for efficient parser /// Parser checkpoints for a given text index. Used for efficient parser
/// backtracking similar to packrat parsing. See comments above in /// backtracking similar to packrat parsing. See comments above in
@ -1559,6 +1556,26 @@ struct Parser<'s> {
memo: MemoArena, memo: MemoArena,
} }
/// A single token returned from the lexer with a cached [`SyntaxKind`] and a
/// record of preceding trivia.
///
/// The parser keeps one `Token` as its lookahead and replaces it with a fresh
/// one from `Parser::lex` each time the current token is eaten.
#[derive(Debug, Clone)]
struct Token {
/// The [`SyntaxKind`] the parser sees for this token. In Code this may be a
/// temporary [`SyntaxKind::End`] inserted because of the newline mode; the
/// kind that was actually lexed remains available via `node`.
kind: SyntaxKind,
/// The [`SyntaxNode`] of the current token, ready to be eaten and pushed
/// onto the end of `nodes`.
node: SyntaxNode,
/// The number of trivia nodes lexed immediately before this token. Those
/// nodes have already been pushed onto the parser's `nodes`. Trivia is only
/// counted outside of Markup, so `lex` always produces zero in Markup.
n_trivia: usize,
/// Whether the lexer reported a newline while lexing this token or any of
/// its preceding trivia.
had_newline: bool,
/// The index into `text` of the start of this token, after any preceding
/// trivia (the end is stored as the lexer's cursor).
start: usize,
/// The index into `text` of the end of the previous token, i.e. the
/// position before any trivia that precedes this token.
prev_end: usize,
}
/// How to proceed with parsing when at a newline in Code. /// How to proceed with parsing when at a newline in Code.
#[derive(Debug, Clone, Copy, PartialEq, Eq)] #[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum AtNewline { enum AtNewline {
@ -1572,11 +1589,12 @@ enum AtNewline {
impl AtNewline { impl AtNewline {
/// Whether to stop at a newline or continue based on the current context. /// Whether to stop at a newline or continue based on the current context.
fn stop(self, kind: impl FnOnce() -> SyntaxKind) -> bool { fn stop(self, kind: SyntaxKind) -> bool {
#[allow(clippy::match_like_matches_macro)]
match self { match self {
AtNewline::Continue => false, AtNewline::Continue => false,
AtNewline::Stop => true, AtNewline::Stop => true,
AtNewline::Contextual => match kind() { AtNewline::Contextual => match kind {
SyntaxKind::Else | SyntaxKind::Dot => false, SyntaxKind::Else | SyntaxKind::Dot => false,
_ => true, _ => true,
}, },
@ -1595,17 +1613,16 @@ impl<'s> Parser<'s> {
fn new(text: &'s str, offset: usize, mode: LexMode) -> Self { fn new(text: &'s str, offset: usize, mode: LexMode) -> Self {
let mut lexer = Lexer::new(text, mode); let mut lexer = Lexer::new(text, mode);
lexer.jump(offset); lexer.jump(offset);
let (current, current_node) = lexer.next(); let nl_mode = AtNewline::Continue;
let mut nodes = vec![];
let token = Self::lex(&mut nodes, &mut lexer, nl_mode);
Self { Self {
text, text,
lexer, lexer,
nl_mode: AtNewline::Continue, nl_mode,
prev_end: offset, token,
current_start: offset,
current,
current_node,
balanced: true, balanced: true,
nodes: vec![], nodes,
memo: Default::default(), memo: Default::default(),
} }
} }
@ -1623,18 +1640,18 @@ impl<'s> Parser<'s> {
/// The offset into `text` of the previous token's end. /// The offset into `text` of the previous token's end.
fn prev_end(&self) -> usize { fn prev_end(&self) -> usize {
self.prev_end self.token.prev_end
} }
/// Similar to a `peek()` function: returns the `kind` of the next token to /// Similar to a `peek()` function: returns the `kind` of the next token to
/// be eaten. /// be eaten.
fn current(&self) -> SyntaxKind { fn current(&self) -> SyntaxKind {
self.current self.token.kind
} }
/// The offset into `text` of the current token's start. /// The offset into `text` of the current token's start.
fn current_start(&self) -> usize { fn current_start(&self) -> usize {
self.current_start self.token.start
} }
/// The offset into `text` of the current token's end. /// The offset into `text` of the current token's end.
@ -1644,17 +1661,17 @@ impl<'s> Parser<'s> {
/// The current token's text. /// The current token's text.
fn current_text(&self) -> &'s str { fn current_text(&self) -> &'s str {
&self.text[self.current_start..self.current_end()] &self.text[self.token.start..self.current_end()]
} }
/// Whether the current token is a given [`SyntaxKind`]. /// Whether the current token is a given [`SyntaxKind`].
fn at(&self, kind: SyntaxKind) -> bool { fn at(&self, kind: SyntaxKind) -> bool {
self.current == kind self.token.kind == kind
} }
/// Whether the current token is contained in a [`SyntaxSet`]. /// Whether the current token is contained in a [`SyntaxSet`].
fn at_set(&self, set: SyntaxSet) -> bool { fn at_set(&self, set: SyntaxSet) -> bool {
set.contains(self.current) set.contains(self.token.kind)
} }
/// Whether we're at the end of the token stream. /// Whether we're at the end of the token stream.
@ -1666,24 +1683,21 @@ impl<'s> Parser<'s> {
/// If we're at the given `kind` with no preceding trivia tokens. /// If we're at the given `kind` with no preceding trivia tokens.
fn directly_at(&self, kind: SyntaxKind) -> bool { fn directly_at(&self, kind: SyntaxKind) -> bool {
self.current == kind && self.prev_end == self.current_start self.token.kind == kind && !self.had_trivia()
} }
/// Eat the current token by saving it to the `nodes` vector, then move /// Eat the current token by saving it to the `nodes` vector, then move
/// the lexer forward to prepare a new token. /// the lexer forward to prepare a new token.
fn eat(&mut self) { fn eat(&mut self) {
self.save(); self.nodes.push(std::mem::take(&mut self.token.node));
self.lex(); self.token = Self::lex(&mut self.nodes, &mut self.lexer, self.nl_mode);
self.skip();
} }
/// Eat the current node and return a reference for in-place mutation. /// Eat the current node and return a reference for in-place mutation.
#[track_caller] #[track_caller]
fn eat_and_get(&mut self) -> &mut SyntaxNode { fn eat_and_get(&mut self) -> &mut SyntaxNode {
let offset = self.nodes.len(); let offset = self.nodes.len();
self.save(); self.eat();
self.lex();
self.skip();
&mut self.nodes[offset] &mut self.nodes[offset]
} }
@ -1714,20 +1728,25 @@ impl<'s> Parser<'s> {
/// specific token. /// specific token.
#[track_caller] #[track_caller]
fn assert(&mut self, kind: SyntaxKind) { fn assert(&mut self, kind: SyntaxKind) {
assert_eq!(self.current, kind); assert_eq!(self.token.kind, kind);
self.eat(); self.eat();
} }
/// Convert the current token's [`SyntaxKind`] and eat it. /// Convert the current token's [`SyntaxKind`] and eat it.
fn convert_and_eat(&mut self, kind: SyntaxKind) { fn convert_and_eat(&mut self, kind: SyntaxKind) {
// Only need to replace the node here. // Only need to replace the node here.
self.current_node.convert_to_kind(kind); self.token.node.convert_to_kind(kind);
self.eat(); self.eat();
} }
/// Whether the current token is a newline, only used in Markup. /// Whether the current token is a newline, only used in Markup.
fn newline(&mut self) -> bool { fn newline(&self) -> bool {
self.lexer.newline() self.token.had_newline
}
/// Whether `token` had any trivia before it in Code/Math.
fn had_trivia(&self) -> bool {
self.token.n_trivia > 0
} }
/// The number of characters until the most recent newline in `text`. /// The number of characters until the most recent newline in `text`.
@ -1744,13 +1763,7 @@ impl<'s> Parser<'s> {
/// A marker that will point to first trivia before this token in the /// A marker that will point to first trivia before this token in the
/// parser (or the token itself if no trivia precede it). /// parser (or the token itself if no trivia precede it).
fn before_trivia(&self) -> Marker { fn before_trivia(&self) -> Marker {
let mut i = self.nodes.len(); Marker(self.nodes.len() - self.token.n_trivia)
if self.lexer.mode() != LexMode::Markup && self.prev_end != self.current_start {
while i > 0 && self.nodes[i - 1].kind().is_trivia() {
i -= 1;
}
}
Marker(i)
} }
/// Whether the last non-trivia node is an error. /// Whether the last non-trivia node is an error.
@ -1792,11 +1805,10 @@ impl<'s> Parser<'s> {
self.lexer.set_mode(mode); self.lexer.set_mode(mode);
func(self); func(self);
if mode != previous { if mode != previous {
self.unskip();
self.lexer.set_mode(previous); self.lexer.set_mode(previous);
self.lexer.jump(self.current_start); self.lexer.jump(self.token.prev_end);
self.lex(); self.nodes.truncate(self.nodes.len() - self.token.n_trivia);
self.skip(); self.token = Self::lex(&mut self.nodes, &mut self.lexer, self.nl_mode);
} }
} }
@ -1808,69 +1820,46 @@ impl<'s> Parser<'s> {
let previous = self.nl_mode; let previous = self.nl_mode;
self.nl_mode = mode; self.nl_mode = mode;
func(self); func(self);
self.unskip();
self.nl_mode = previous; self.nl_mode = previous;
self.lexer.jump(self.prev_end); if mode != previous && self.token.had_newline {
self.lex(); let actual_kind = self.token.node.kind();
self.skip(); if self.nl_mode.stop(actual_kind) {
} self.token.kind = SyntaxKind::End;
} else {
/// Move past trivia nodes in Code/Math. self.token.kind = actual_kind;
fn skip(&mut self) {
if self.lexer.mode() != LexMode::Markup {
while self.current.is_trivia() {
self.save();
self.lex();
}
}
}
/// Move the parser back to the start of this token or its leading trivia
/// (in Code/Math).
fn unskip(&mut self) {
if self.lexer.mode() != LexMode::Markup && self.prev_end != self.current_start {
while self.nodes.last().is_some_and(|last| last.kind().is_trivia()) {
self.nodes.pop();
}
self.lexer.jump(self.prev_end);
self.lex();
}
}
/// Save the current token to the `nodes` vector as an Inner or Error node.
fn save(&mut self) {
self.nodes.push(self.current_node.clone());
if self.lexer.mode() == LexMode::Markup || !self.current.is_trivia() {
self.prev_end = self.current_end();
}
}
/// Find the kind of the next non-trivia token in the lexer.
fn next_non_trivia(lexer: &mut Lexer<'s>) -> SyntaxKind {
loop {
let next = lexer.next().0;
// Loop is terminable, because `SyntaxKind::End` is not a trivia.
if !next.is_trivia() {
break next;
} }
} }
} }
/// Move the lexer forward and prepare the current token. In Code, this /// Move the lexer forward and prepare the current token. In Code, this
/// might insert a temporary [`SyntaxKind::End`] based on our newline mode. /// might insert a temporary [`SyntaxKind::End`] based on our newline mode.
fn lex(&mut self) { ///
self.current_start = self.lexer.cursor(); /// This is not a method on `self` because we need a valid token before we
(self.current, self.current_node) = self.lexer.next(); /// can initialize the parser.
fn lex(nodes: &mut Vec<SyntaxNode>, lexer: &mut Lexer, nl_mode: AtNewline) -> Token {
let prev_end = lexer.cursor();
let mut start = prev_end;
let (mut kind, mut node) = lexer.next();
let mut n_trivia = 0;
let mut had_newline = lexer.newline();
// Special cases to handle newlines in Code. if lexer.mode() != LexMode::Markup {
if self.lexer.mode() == LexMode::Code while kind.is_trivia() {
&& self.lexer.newline() n_trivia += 1;
&& self.nl_mode.stop(|| Self::next_non_trivia(&mut self.lexer.clone())) nodes.push(node);
{ start = lexer.cursor();
self.current = SyntaxKind::End; (kind, node) = lexer.next();
had_newline |= lexer.newline();
} }
if lexer.mode() == LexMode::Code && had_newline {
// Insert a temporary [`SyntaxKind::End`] to halt the parser.
// The actual `SyntaxKind` will be restored from `node` later.
if nl_mode.stop(kind) {
kind = SyntaxKind::End;
}
}
}
Token { kind, node, n_trivia, had_newline, start, prev_end }
} }
} }
@ -1906,10 +1895,7 @@ struct Checkpoint {
struct PartialState { struct PartialState {
cursor: usize, cursor: usize,
lex_mode: LexMode, lex_mode: LexMode,
prev_end: usize, token: Token,
current_start: usize,
current: SyntaxKind,
current_node: SyntaxNode,
} }
impl<'s> Parser<'s> { impl<'s> Parser<'s> {
@ -1951,10 +1937,7 @@ impl<'s> Parser<'s> {
fn restore_partial(&mut self, state: PartialState) { fn restore_partial(&mut self, state: PartialState) {
self.lexer.jump(state.cursor); self.lexer.jump(state.cursor);
self.lexer.set_mode(state.lex_mode); self.lexer.set_mode(state.lex_mode);
self.prev_end = state.prev_end; self.token = state.token;
self.current_start = state.current_start;
self.current = state.current;
self.current_node = state.current_node;
} }
/// Save a checkpoint of the parser state. /// Save a checkpoint of the parser state.
@ -1963,10 +1946,7 @@ impl<'s> Parser<'s> {
let state = PartialState { let state = PartialState {
cursor: self.lexer.cursor(), cursor: self.lexer.cursor(),
lex_mode: self.lexer.mode(), lex_mode: self.lexer.mode(),
prev_end: self.prev_end, token: self.token.clone(),
current_start: self.current_start,
current: self.current,
current_node: self.current_node.clone(),
}; };
Checkpoint { node_len, state } Checkpoint { node_len, state }
} }
@ -1978,7 +1958,7 @@ impl<'s> Parser<'s> {
let at = self.at(kind); let at = self.at(kind);
if at { if at {
self.eat(); self.eat();
} else if kind == SyntaxKind::Ident && self.current.is_keyword() { } else if kind == SyntaxKind::Ident && self.token.kind.is_keyword() {
self.trim_errors(); self.trim_errors();
self.eat_and_get().expected(kind.name()); self.eat_and_get().expected(kind.name());
} else { } else {
@ -2024,7 +2004,7 @@ impl<'s> Parser<'s> {
/// unexpected. /// unexpected.
fn unexpected(&mut self) { fn unexpected(&mut self) {
self.trim_errors(); self.trim_errors();
self.balanced &= !self.current.is_grouping(); self.balanced &= !self.token.kind.is_grouping();
self.eat_and_get().unexpected(); self.eat_and_get().unexpected();
} }