mirror of
https://github.com/typst/typst
synced 2025-06-28 16:22:53 +08:00
5. Refactor parser memoization to localize functionality
This commit is contained in:
parent
54eadb65a9
commit
16cc7eb472
@ -1057,29 +1057,25 @@ fn return_stmt(p: &mut Parser) {
|
|||||||
|
|
||||||
/// An expression that starts with a parenthesis.
|
/// An expression that starts with a parenthesis.
|
||||||
fn expr_with_paren(p: &mut Parser, atomic: bool) {
|
fn expr_with_paren(p: &mut Parser, atomic: bool) {
|
||||||
// If we've seen this position before and have a memoized result, just use
|
if atomic {
|
||||||
// it. See below for more explanation about this memoization.
|
// Atomic expressions aren't modified by operators that follow them, so
|
||||||
let start = p.current_start();
|
// our first guess of array/dict will be correct.
|
||||||
if let Some((range, end_point)) = p.memo.get(&start).cloned() {
|
parenthesized_or_array_or_dict(p);
|
||||||
// Restore the end point first, so that it doesn't truncate our freshly
|
|
||||||
// pushed nodes. If the current length of `p.nodes` doesn't match what
|
|
||||||
// we had in the memoized run, this might otherwise happen.
|
|
||||||
p.restore(end_point);
|
|
||||||
p.nodes.extend(p.memo_arena[range].iter().cloned());
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
let m = p.marker();
|
// If we've seen this position before and have a memoized result, restore it
|
||||||
let checkpoint = p.checkpoint();
|
// and return. Otherwise, get a key to this position and a checkpoint to
|
||||||
|
// restart from in case we make a wrong prediction.
|
||||||
|
let Some((memo_key, checkpoint)) = p.restore_memo_or_checkpoint() else { return };
|
||||||
|
// The node length from when we restored.
|
||||||
|
let prev_len = checkpoint.node_len;
|
||||||
|
|
||||||
// When we reach a '(', we can't be sure what it is. First, we attempt to
|
// When we reach a '(', we can't be sure what it is. First, we attempt to
|
||||||
// parse as a simple parenthesized expression, array, or dictionary as
|
// parse as a simple parenthesized expression, array, or dictionary as
|
||||||
// these are the most likely things. We can handle all of those in a single
|
// these are the most likely things. We can handle all of those in a single
|
||||||
// pass.
|
// pass.
|
||||||
let kind = parenthesized_or_array_or_dict(p);
|
let kind = parenthesized_or_array_or_dict(p);
|
||||||
if atomic {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// If, however, '=>' or '=' follows, we must backtrack and reparse as either
|
// If, however, '=>' or '=' follows, we must backtrack and reparse as either
|
||||||
// a parameter list or a destructuring. To be able to do that, we created a
|
// a parameter list or a destructuring. To be able to do that, we created a
|
||||||
@ -1100,6 +1096,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) {
|
|||||||
// case running time of O(2n).
|
// case running time of O(2n).
|
||||||
if p.at(SyntaxKind::Arrow) {
|
if p.at(SyntaxKind::Arrow) {
|
||||||
p.restore(checkpoint);
|
p.restore(checkpoint);
|
||||||
|
let m = p.marker();
|
||||||
params(p);
|
params(p);
|
||||||
if !p.expect(SyntaxKind::Arrow) {
|
if !p.expect(SyntaxKind::Arrow) {
|
||||||
return;
|
return;
|
||||||
@ -1108,6 +1105,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) {
|
|||||||
p.wrap(m, SyntaxKind::Closure);
|
p.wrap(m, SyntaxKind::Closure);
|
||||||
} else if p.at(SyntaxKind::Eq) && kind != SyntaxKind::Parenthesized {
|
} else if p.at(SyntaxKind::Eq) && kind != SyntaxKind::Parenthesized {
|
||||||
p.restore(checkpoint);
|
p.restore(checkpoint);
|
||||||
|
let m = p.marker();
|
||||||
destructuring_or_parenthesized(p, true, &mut HashSet::new());
|
destructuring_or_parenthesized(p, true, &mut HashSet::new());
|
||||||
if !p.expect(SyntaxKind::Eq) {
|
if !p.expect(SyntaxKind::Eq) {
|
||||||
return;
|
return;
|
||||||
@ -1119,9 +1117,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Memoize result if we backtracked.
|
// Memoize result if we backtracked.
|
||||||
let offset = p.memo_arena.len();
|
p.memoize_parsed_nodes(memo_key, prev_len);
|
||||||
p.memo_arena.extend(p.nodes[m.0..].iter().cloned());
|
|
||||||
p.memo.insert(start, (offset..p.memo_arena.len(), p.checkpoint()));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parses either
|
/// Parses either
|
||||||
@ -1456,6 +1452,9 @@ fn destructuring_item<'s>(
|
|||||||
|
|
||||||
// Parse a normal positional pattern or a destructuring key.
|
// Parse a normal positional pattern or a destructuring key.
|
||||||
let was_at_pat = p.at_set(set::PATTERN);
|
let was_at_pat = p.at_set(set::PATTERN);
|
||||||
|
|
||||||
|
// We must use a full checkpoint here (can't just clone the lexer) because
|
||||||
|
// there may be trivia between the identifier and the colon we need to skip.
|
||||||
let checkpoint = p.checkpoint();
|
let checkpoint = p.checkpoint();
|
||||||
if !(p.eat_if(SyntaxKind::Ident) && p.at(SyntaxKind::Colon)) {
|
if !(p.eat_if(SyntaxKind::Ident) && p.at(SyntaxKind::Colon)) {
|
||||||
p.restore(checkpoint);
|
p.restore(checkpoint);
|
||||||
@ -1579,9 +1578,7 @@ struct Parser<'s> {
|
|||||||
/// Parser checkpoints for a given text index. Used for efficient parser
|
/// Parser checkpoints for a given text index. Used for efficient parser
|
||||||
/// backtracking similar to packrat parsing. See comments above in
|
/// backtracking similar to packrat parsing. See comments above in
|
||||||
/// [`expr_with_paren`].
|
/// [`expr_with_paren`].
|
||||||
memo: HashMap<usize, (Range<usize>, Checkpoint<'s>)>,
|
memo: MemoArena<'s>,
|
||||||
/// The stored parse results at each checkpoint.
|
|
||||||
memo_arena: Vec<SyntaxNode>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// How to proceed with parsing when at a newline in Code.
|
/// How to proceed with parsing when at a newline in Code.
|
||||||
@ -1601,15 +1598,6 @@ enum NewlineMode {
|
|||||||
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
|
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
|
||||||
struct Marker(usize);
|
struct Marker(usize);
|
||||||
|
|
||||||
#[derive(Clone)]
|
|
||||||
struct Checkpoint<'s> {
|
|
||||||
lexer: Lexer<'s>,
|
|
||||||
prev_end: usize,
|
|
||||||
current_start: usize,
|
|
||||||
current: SyntaxKind,
|
|
||||||
nodes: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'s> Parser<'s> {
|
impl<'s> Parser<'s> {
|
||||||
/// Create a new parser starting from the given text offset and lexer mode.
|
/// Create a new parser starting from the given text offset and lexer mode.
|
||||||
fn new(text: &'s str, offset: usize, mode: LexMode) -> Self {
|
fn new(text: &'s str, offset: usize, mode: LexMode) -> Self {
|
||||||
@ -1626,8 +1614,7 @@ impl<'s> Parser<'s> {
|
|||||||
nodes: vec![],
|
nodes: vec![],
|
||||||
modes: vec![],
|
modes: vec![],
|
||||||
newline_modes: vec![],
|
newline_modes: vec![],
|
||||||
memo: HashMap::new(),
|
memo: Default::default(),
|
||||||
memo_arena: vec![],
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1836,26 +1823,6 @@ impl<'s> Parser<'s> {
|
|||||||
self.skip();
|
self.skip();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Save a checkpoint of the parser state.
|
|
||||||
fn checkpoint(&self) -> Checkpoint<'s> {
|
|
||||||
Checkpoint {
|
|
||||||
lexer: self.lexer.clone(),
|
|
||||||
prev_end: self.prev_end,
|
|
||||||
current_start: self.current_start,
|
|
||||||
current: self.current,
|
|
||||||
nodes: self.nodes.len(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Reset the parser from a checkpoint.
|
|
||||||
fn restore(&mut self, checkpoint: Checkpoint<'s>) {
|
|
||||||
self.lexer = checkpoint.lexer;
|
|
||||||
self.prev_end = checkpoint.prev_end;
|
|
||||||
self.current_start = checkpoint.current_start;
|
|
||||||
self.current = checkpoint.current;
|
|
||||||
self.nodes.truncate(checkpoint.nodes);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Move past trivia nodes in Code/Math.
|
/// Move past trivia nodes in Code/Math.
|
||||||
fn skip(&mut self) {
|
fn skip(&mut self) {
|
||||||
if self.lexer.mode() != LexMode::Markup {
|
if self.lexer.mode() != LexMode::Markup {
|
||||||
@ -1929,6 +1896,90 @@ impl<'s> Parser<'s> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Extra parser state for efficiently recovering from mispredicted parses.
|
||||||
|
///
|
||||||
|
/// This is the same idea as packrat parsing, but we use it only in the limited
|
||||||
|
/// case of parenthesized structures. See [`expr_with_paren`] for more.
|
||||||
|
#[derive(Default)]
|
||||||
|
struct MemoArena<'s> {
|
||||||
|
/// A single arena of previously parsed nodes (to reduce allocations).
|
||||||
|
/// Memoized ranges refer to unique sections of the arena.
|
||||||
|
arena: Vec<SyntaxNode>,
|
||||||
|
/// A map from the parser's current position to a range of previously parsed
|
||||||
|
/// nodes in the arena and a checkpoint of the parser's state. These allow
|
||||||
|
/// us to reset the parser to avoid parsing the same location again.
|
||||||
|
memo_map: HashMap<MemoKey, (Range<usize>, Checkpoint<'s>)>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A type alias for the memo key so it doesn't get confused with other usizes.
|
||||||
|
///
|
||||||
|
/// The memo is keyed by the index into `text` of the current token's start.
|
||||||
|
type MemoKey = usize;
|
||||||
|
|
||||||
|
/// A checkpoint of the parser which can fully restore it to a previous state.
|
||||||
|
#[derive(Clone)]
|
||||||
|
struct Checkpoint<'s> {
|
||||||
|
lexer: Lexer<'s>,
|
||||||
|
prev_end: usize,
|
||||||
|
current_start: usize,
|
||||||
|
current: SyntaxKind,
|
||||||
|
node_len: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'s> Parser<'s> {
|
||||||
|
/// Store the already parsed nodes and the parser state into the memo map by
|
||||||
|
/// extending the arena and storing the extended range and a checkpoint.
|
||||||
|
fn memoize_parsed_nodes(&mut self, key: MemoKey, prev_len: usize) {
|
||||||
|
let memo_start = self.memo.arena.len();
|
||||||
|
self.memo.arena.extend_from_slice(&self.nodes[prev_len..]);
|
||||||
|
let arena_range = memo_start..self.memo.arena.len();
|
||||||
|
self.memo.memo_map.insert(key, (arena_range, self.checkpoint()));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Try to load a memoized result, return `None` if we did or `Some` (with a
|
||||||
|
/// checkpoint and a key for the memo map) if we didn't.
|
||||||
|
fn restore_memo_or_checkpoint(&mut self) -> Option<(MemoKey, Checkpoint<'s>)> {
|
||||||
|
// We use the starting index of the current token as our key.
|
||||||
|
let key: MemoKey = self.current_start();
|
||||||
|
match self.memo.memo_map.get(&key).cloned() {
|
||||||
|
Some((range, checkpoint)) => {
|
||||||
|
self.nodes.extend_from_slice(&self.memo.arena[range]);
|
||||||
|
// It's important that we don't truncate the nodes vector since
|
||||||
|
// it may have grown or shrunk (due to other memoization or
|
||||||
|
// error reporting) since we made this checkpoint.
|
||||||
|
self.restore_partial(checkpoint);
|
||||||
|
None
|
||||||
|
}
|
||||||
|
None => Some((key, self.checkpoint())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Restore the parser to the state at a checkpoint.
|
||||||
|
fn restore(&mut self, checkpoint: Checkpoint<'s>) {
|
||||||
|
self.nodes.truncate(checkpoint.node_len);
|
||||||
|
self.restore_partial(checkpoint);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Restore parts of the checkpoint excluding the nodes vector.
|
||||||
|
fn restore_partial(&mut self, checkpoint: Checkpoint<'s>) {
|
||||||
|
self.lexer = checkpoint.lexer;
|
||||||
|
self.prev_end = checkpoint.prev_end;
|
||||||
|
self.current_start = checkpoint.current_start;
|
||||||
|
self.current = checkpoint.current;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Save a checkpoint of the parser state.
|
||||||
|
fn checkpoint(&self) -> Checkpoint<'s> {
|
||||||
|
Checkpoint {
|
||||||
|
lexer: self.lexer.clone(),
|
||||||
|
prev_end: self.prev_end,
|
||||||
|
current_start: self.current_start,
|
||||||
|
current: self.current,
|
||||||
|
node_len: self.nodes.len(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<'s> Parser<'s> {
|
impl<'s> Parser<'s> {
|
||||||
/// Consume the given `kind` or produce an error.
|
/// Consume the given `kind` or produce an error.
|
||||||
fn expect(&mut self, kind: SyntaxKind) -> bool {
|
fn expect(&mut self, kind: SyntaxKind) -> bool {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user