Refactor parser memoization to localize functionality

Ian Wrzesinski 2024-10-10 21:54:43 -04:00
parent 54eadb65a9
commit 16cc7eb472


@@ -1057,29 +1057,25 @@ fn return_stmt(p: &mut Parser) {
 /// An expression that starts with a parenthesis.
 fn expr_with_paren(p: &mut Parser, atomic: bool) {
-    // If we've seen this position before and have a memoized result, just use
-    // it. See below for more explanation about this memoization.
-    let start = p.current_start();
-    if let Some((range, end_point)) = p.memo.get(&start).cloned() {
-        // Restore the end point first, so that it doesn't truncate our freshly
-        // pushed nodes. If the current length of `p.nodes` doesn't match what
-        // we had in the memoized run, this might otherwise happen.
-        p.restore(end_point);
-        p.nodes.extend(p.memo_arena[range].iter().cloned());
+    if atomic {
+        // Atomic expressions aren't modified by operators that follow them, so
+        // our first guess of array/dict will be correct.
+        parenthesized_or_array_or_dict(p);
         return;
     }
-    let m = p.marker();
-    let checkpoint = p.checkpoint();
+
+    // If we've seen this position before and have a memoized result, restore it
+    // and return. Otherwise, get a key to this position and a checkpoint to
+    // restart from in case we make a wrong prediction.
+    let Some((memo_key, checkpoint)) = p.restore_memo_or_checkpoint() else { return };
+    // The node length from when we restored.
+    let prev_len = checkpoint.node_len;

     // When we reach a '(', we can't be sure what it is. First, we attempt to
     // parse as a simple parenthesized expression, array, or dictionary as
     // these are the most likely things. We can handle all of those in a single
     // pass.
     let kind = parenthesized_or_array_or_dict(p);
-    if atomic {
-        return;
-    }

     // If, however, '=>' or '=' follows, we must backtrack and reparse as either
     // a parameter list or a destructuring. To be able to do that, we created a
@@ -1100,6 +1096,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) {
     // case running time of O(2n).
     if p.at(SyntaxKind::Arrow) {
         p.restore(checkpoint);
+        let m = p.marker();
         params(p);
         if !p.expect(SyntaxKind::Arrow) {
             return;
@@ -1108,6 +1105,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) {
         p.wrap(m, SyntaxKind::Closure);
     } else if p.at(SyntaxKind::Eq) && kind != SyntaxKind::Parenthesized {
         p.restore(checkpoint);
+        let m = p.marker();
         destructuring_or_parenthesized(p, true, &mut HashSet::new());
         if !p.expect(SyntaxKind::Eq) {
             return;
@@ -1119,9 +1117,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) {
     }

     // Memoize result if we backtracked.
-    let offset = p.memo_arena.len();
-    p.memo_arena.extend(p.nodes[m.0..].iter().cloned());
-    p.memo.insert(start, (offset..p.memo_arena.len(), p.checkpoint()));
+    p.memoize_parsed_nodes(memo_key, prev_len);
 }

 /// Parses either
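
For intuition about the misprediction being handled above, consider an input like `(a, b) => a + b` (our illustration, not from the commit): the first pass parses `(a, b)` as an array, and only the `=>` that follows forces the restore and reparse as a parameter list. A minimal smoke test of both readings, assuming the crate's public `parse_code` entry point and `SyntaxNode::erroneous`:

    // Sketch: the same parenthesized text yields two different trees
    // depending on what follows it, which is why expr_with_paren must
    // speculate, checkpoint, and sometimes backtrack.
    let array = typst_syntax::parse_code("(a, b)");
    let closure = typst_syntax::parse_code("(a, b) => a + b");
    assert!(!array.erroneous());
    assert!(!closure.erroneous());
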
@@ -1456,6 +1452,9 @@ fn destructuring_item<'s>(
     // Parse a normal positional pattern or a destructuring key.
     let was_at_pat = p.at_set(set::PATTERN);
+
+    // We must use a full checkpoint here (can't just clone the lexer) because
+    // there may be trivia between the identifier and the colon that we need
+    // to skip.
     let checkpoint = p.checkpoint();
     if !(p.eat_if(SyntaxKind::Ident) && p.at(SyntaxKind::Colon)) {
         p.restore(checkpoint);
@@ -1579,9 +1578,7 @@ struct Parser<'s> {
     /// Parser checkpoints for a given text index. Used for efficient parser
     /// backtracking similar to packrat parsing. See comments above in
     /// [`expr_with_paren`].
-    memo: HashMap<usize, (Range<usize>, Checkpoint<'s>)>,
-    /// The stored parse results at each checkpoint.
-    memo_arena: Vec<SyntaxNode>,
+    memo: MemoArena<'s>,
 }

 /// How to proceed with parsing when at a newline in Code.
@@ -1601,15 +1598,6 @@ enum NewlineMode {
 #[derive(Debug, Copy, Clone, Eq, PartialEq)]
 struct Marker(usize);

-#[derive(Clone)]
-struct Checkpoint<'s> {
-    lexer: Lexer<'s>,
-    prev_end: usize,
-    current_start: usize,
-    current: SyntaxKind,
-    nodes: usize,
-}
-
 impl<'s> Parser<'s> {
     /// Create a new parser starting from the given text offset and lexer mode.
     fn new(text: &'s str, offset: usize, mode: LexMode) -> Self {
@@ -1626,8 +1614,7 @@ impl<'s> Parser<'s> {
             nodes: vec![],
             modes: vec![],
             newline_modes: vec![],
-            memo: HashMap::new(),
-            memo_arena: vec![],
+            memo: Default::default(),
         }
     }
@@ -1836,26 +1823,6 @@ impl<'s> Parser<'s> {
         self.skip();
     }

-    /// Save a checkpoint of the parser state.
-    fn checkpoint(&self) -> Checkpoint<'s> {
-        Checkpoint {
-            lexer: self.lexer.clone(),
-            prev_end: self.prev_end,
-            current_start: self.current_start,
-            current: self.current,
-            nodes: self.nodes.len(),
-        }
-    }
-
-    /// Reset the parser from a checkpoint.
-    fn restore(&mut self, checkpoint: Checkpoint<'s>) {
-        self.lexer = checkpoint.lexer;
-        self.prev_end = checkpoint.prev_end;
-        self.current_start = checkpoint.current_start;
-        self.current = checkpoint.current;
-        self.nodes.truncate(checkpoint.nodes);
-    }
-
     /// Move past trivia nodes in Code/Math.
     fn skip(&mut self) {
         if self.lexer.mode() != LexMode::Markup {
@@ -1929,6 +1896,90 @@ impl<'s> Parser<'s> {
     }
 }

+/// Extra parser state for efficiently recovering from mispredicted parses.
+///
+/// This is the same idea as packrat parsing, but we use it only in the limited
+/// case of parenthesized structures. See [`expr_with_paren`] for more.
+#[derive(Default)]
+struct MemoArena<'s> {
+    /// A single arena of previously parsed nodes (to reduce allocations).
+    /// Memoized ranges refer to unique sections of the arena.
+    arena: Vec<SyntaxNode>,
+    /// A map from the parser's current position to a range of previously
+    /// parsed nodes in the arena and a checkpoint of the parser's state.
+    /// These allow us to reset the parser to avoid parsing the same location
+    /// again.
+    memo_map: HashMap<MemoKey, (Range<usize>, Checkpoint<'s>)>,
+}
+
+/// A type alias for the memo key so it doesn't get confused with other usizes.
+///
+/// The memo is keyed by the index into `text` of the current token's start.
+type MemoKey = usize;
+
+/// A checkpoint of the parser which can fully restore it to a previous state.
+#[derive(Clone)]
+struct Checkpoint<'s> {
+    lexer: Lexer<'s>,
+    prev_end: usize,
+    current_start: usize,
+    current: SyntaxKind,
+    node_len: usize,
+}
+
+impl<'s> Parser<'s> {
+    /// Store the already parsed nodes and the parser state into the memo map
+    /// by extending the arena and storing the extended range and a checkpoint.
+    fn memoize_parsed_nodes(&mut self, key: MemoKey, prev_len: usize) {
+        let memo_start = self.memo.arena.len();
+        self.memo.arena.extend_from_slice(&self.nodes[prev_len..]);
+        let arena_range = memo_start..self.memo.arena.len();
+        self.memo.memo_map.insert(key, (arena_range, self.checkpoint()));
+    }
+
+    /// Try to load a memoized result, returning `None` if we did, or `Some`
+    /// (with a checkpoint and a key for the memo map) if we didn't.
+    fn restore_memo_or_checkpoint(&mut self) -> Option<(MemoKey, Checkpoint<'s>)> {
+        // We use the starting index of the current token as our key.
+        let key: MemoKey = self.current_start();
+        match self.memo.memo_map.get(&key).cloned() {
+            Some((range, checkpoint)) => {
+                self.nodes.extend_from_slice(&self.memo.arena[range]);
+                // It's important that we don't truncate the nodes vector since
+                // it may have grown or shrunk (due to other memoization or
+                // error reporting) since we made this checkpoint.
+                self.restore_partial(checkpoint);
+                None
+            }
+            None => Some((key, self.checkpoint())),
+        }
+    }
+
+    /// Restore the parser to the state at a checkpoint.
+    fn restore(&mut self, checkpoint: Checkpoint<'s>) {
+        self.nodes.truncate(checkpoint.node_len);
+        self.restore_partial(checkpoint);
+    }
+
+    /// Restore parts of the checkpoint excluding the nodes vector.
+    fn restore_partial(&mut self, checkpoint: Checkpoint<'s>) {
+        self.lexer = checkpoint.lexer;
+        self.prev_end = checkpoint.prev_end;
+        self.current_start = checkpoint.current_start;
+        self.current = checkpoint.current;
+    }
+
+    /// Save a checkpoint of the parser state.
+    fn checkpoint(&self) -> Checkpoint<'s> {
+        Checkpoint {
+            lexer: self.lexer.clone(),
+            prev_end: self.prev_end,
+            current_start: self.current_start,
+            current: self.current,
+            node_len: self.nodes.len(),
+        }
+    }
+}
+
 impl<'s> Parser<'s> {
     /// Consume the given `kind` or produce an error.
     fn expect(&mut self, kind: SyntaxKind) -> bool {
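
Taken together, these pieces form a small packrat-style cache. The following freestanding miniature (our illustration with toy types, not the parser's real API) isolates the arena pattern: one shared `Vec` owns all memoized nodes, the map stores only a range into it plus a resume point, and a hit splices nodes back without truncating anything the caller already pushed:

    use std::collections::HashMap;
    use std::ops::Range;

    #[derive(Clone, Debug, PartialEq)]
    struct Node(String);

    #[derive(Default)]
    struct Memo {
        /// All memoized nodes, in one allocation.
        arena: Vec<Node>,
        /// Text position -> (slice of the arena, cursor to resume at).
        map: HashMap<usize, (Range<usize>, usize)>,
    }

    struct Toy {
        cursor: usize,
        nodes: Vec<Node>,
        memo: Memo,
    }

    impl Toy {
        /// Reuse a previous parse at `cursor`; true on a memo hit.
        fn restore_memo(&mut self) -> bool {
            if let Some((range, resume)) = self.memo.map.get(&self.cursor).cloned() {
                // Splice cached nodes in WITHOUT truncating `self.nodes`,
                // mirroring `restore_partial` above.
                self.nodes.extend_from_slice(&self.memo.arena[range]);
                self.cursor = resume;
                true
            } else {
                false
            }
        }

        /// Cache everything parsed since `prev_len` under `key`.
        fn memoize(&mut self, key: usize, prev_len: usize) {
            let start = self.memo.arena.len();
            self.memo.arena.extend_from_slice(&self.nodes[prev_len..]);
            self.memo.map.insert(key, (start..self.memo.arena.len(), self.cursor));
        }
    }

    fn main() {
        let mut p = Toy { cursor: 0, nodes: vec![], memo: Memo::default() };

        // First pass: "parse" something at position 0, then memoize it.
        let (key, prev_len) = (p.cursor, p.nodes.len());
        p.nodes.push(Node("(a, b)".into()));
        p.cursor = 6;
        p.memoize(key, prev_len);

        // Backtrack to position 0 and hit the memo instead of reparsing.
        p.cursor = 0;
        p.nodes.clear();
        assert!(p.restore_memo());
        assert_eq!(p.nodes, vec![Node("(a, b)".into())]);
        assert_eq!(p.cursor, 6);
    }

One design point worth noting, grounded in the new doc comments: memoized ranges refer to unique sections of the arena, so a hit is a plain slice copy and entries never invalidate one another.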