Merge pull request #48 from typst/parser-incr

Incremental parsing
This commit is contained in:
Laurenz 2022-01-04 00:27:05 +01:00 committed by GitHub
commit 4c81a5d43e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 1244 additions and 86 deletions

View File

@ -49,6 +49,11 @@ fn bench_parse(iai: &mut Iai) {
iai.run(|| parse(SRC)); iai.run(|| parse(SRC));
} }
fn bench_edit(iai: &mut Iai) {
let (mut ctx, id) = context();
iai.run(|| black_box(ctx.sources.edit(id, 1168 .. 1171, "_Uhr_")));
}
fn bench_eval(iai: &mut Iai) { fn bench_eval(iai: &mut Iai) {
let (mut ctx, id) = context(); let (mut ctx, id) = context();
iai.run(|| ctx.evaluate(id).unwrap()); iai.run(|| ctx.evaluate(id).unwrap());
@ -66,6 +71,7 @@ main!(
bench_scan, bench_scan,
bench_tokenize, bench_tokenize,
bench_parse, bench_parse,
bench_edit,
bench_eval, bench_eval,
bench_layout bench_layout
); );

672
src/parse/incremental.rs Normal file
View File

@ -0,0 +1,672 @@
use std::ops::Range;
use std::rc::Rc;
use crate::syntax::{Green, GreenNode, NodeKind};
use super::{
is_newline, parse, parse_atomic, parse_atomic_markup, parse_block, parse_comment,
parse_markup, parse_markup_elements, parse_template, Scanner, TokenMode,
};
/// The conditions that a node has to fulfill in order to be replaced.
///
/// This can dictate if a node can be replaced at all and if yes, what can take
/// its place.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum SuccessionRule {
    /// Changing this node can never have an influence on the other nodes.
    Safe,
    /// This node has to be replaced with a single token of the same kind.
    ///
    /// If a [`TokenMode`] is given, the rule only applies when the node is in
    /// that mode; `None` applies it in any mode.
    SameKind(Option<TokenMode>),
    /// In code mode, this node can only be changed into a single atomic
    /// expression, otherwise it is safe.
    AtomicPrimary,
    /// Changing an unsafe layer node in code mode changes what the parents or
    /// the surrounding nodes would be and is therefore disallowed. Change the
    /// parents or children instead. If it appears in Markup, however, it is
    /// safe to change.
    UnsafeLayer,
    /// Changing an unsafe node or any of its children is not allowed. Change
    /// the parents instead.
    Unsafe,
}
/// The conditions under which a node can be inserted or remain in a tree.
///
/// These conditions all search the neighbors of the node and see if its
/// existence is plausible with them present. This can be used to encode some
/// context-free language components for incremental parsing.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum NeighbourRule {
    /// These nodes depend on being at the start of a line. Reparsing of safe
    /// left neighbors has to check this invariant. Additionally, when
    /// exchanging the right sibling or inserting such a node the indentation of
    /// the first right non-trivia, non-whitespace sibling must not be greater
    /// than the current indentation.
    AtStart,
    /// These nodes depend on not being at the start of a line. Reparsing of
    /// safe left neighbors has to check this invariant. Otherwise, this node is
    /// safe.
    NotAtStart,
    /// These nodes could end up somewhere else up the tree if the parse was
    /// happening from scratch. The parse result has to be checked for such
    /// nodes. They are safe to add if followed up by other nodes.
    NotAtEnd,
    /// No additional requirements.
    None,
}
/// Allows partial refreshes of the [`Green`] node tree.
///
/// This struct holds a description of a change. Its methods can be used to try
/// and apply the change to a green tree.
pub struct Reparser<'a> {
    /// The new source code, with the change applied.
    src: &'a str,
    /// Which byte range in the old source file was changed.
    replace_range: Range<usize>,
    /// How many bytes replaced the text in `replace_range` (the length of the
    /// inserted text, not a character count — all offsets here are byte
    /// offsets into `src`).
    replace_len: usize,
}
impl<'a> Reparser<'a> {
    /// Create a new reparser from the post-edit source, the byte range that
    /// was replaced in the *old* source, and the byte length of the
    /// replacement text.
    pub fn new(src: &'a str, replace_range: Range<usize>, replace_len: usize) -> Self {
        Self { src, replace_range, replace_len }
    }
}
impl Reparser<'_> {
    /// Find the innermost child that is incremental safe.
    ///
    /// Returns the byte range of the new source that was actually reparsed.
    /// If no incremental replacement is possible, falls back to reparsing the
    /// whole source from scratch.
    pub fn reparse(&self, green: &mut Rc<GreenNode>) -> Range<usize> {
        self.reparse_step(Rc::make_mut(green), 0, TokenMode::Markup, true)
            .unwrap_or_else(|| {
                *green = parse(self.src);
                0 .. self.src.len()
            })
    }

    /// Try to reparse inside the given node.
    ///
    /// - `offset` is the byte offset of `green` in the old source.
    /// - `parent_mode` is the token mode the parent imposes when the node has
    ///   no mode of its own.
    /// - `outermost` tracks whether the replaced range touches the right edge
    ///   of the tree; only then may unterminated nodes be accepted.
    ///
    /// Returns `None` if no replacement could be performed at this level.
    fn reparse_step(
        &self,
        green: &mut GreenNode,
        mut offset: usize,
        parent_mode: TokenMode,
        mut outermost: bool,
    ) -> Option<Range<usize>> {
        let mode = green.kind().mode().unwrap_or(parent_mode);
        let child_mode = green.kind().mode().unwrap_or(TokenMode::Code);
        let original_count = green.children().len();

        // Save the current indent if this is a markup node.
        let indent = match green.kind() {
            NodeKind::Markup(n) => *n,
            _ => 0,
        };

        let mut first = None;
        let mut at_start = true;

        // Find the first child in the range of children to reparse.
        for (i, child) in green.children_mut().iter_mut().enumerate() {
            let child_span = offset .. offset + child.len();

            // We look for the start in the element but we only take a position
            // at the right border if this is markup or the last element.
            //
            // This is because in Markup mode, we want to examine all nodes
            // touching a replacement but in code we want to atomically replace.
            if child_span.contains(&self.replace_range.start)
                || (mode == TokenMode::Markup
                    && self.replace_range.start == child_span.end)
            {
                first = Some((i, offset));
                break;
            }

            offset += child.len();
            at_start = child.kind().is_at_start(at_start);
        }

        let (first_idx, first_start) = first?;
        let mut last = None;

        // Find the last child in the range of children to reparse.
        // Note: `offset` still equals `first_start` here, so the scan resumes
        // exactly where the first loop stopped.
        for (i, child) in green.children_mut().iter_mut().enumerate().skip(first_idx) {
            let child_span = offset .. offset + child.len();

            // Similarly to above, the end of the edit must be in the node but
            // if it is at the edge and we are in markup node, we also want its
            // neighbor!
            if child_span.contains(&self.replace_range.end)
                || self.replace_range.end == child_span.end
                    && (mode != TokenMode::Markup || i + 1 == original_count)
            {
                outermost &= i + 1 == original_count;
                last = Some((i, offset + child.len()));
                break;
            } else if mode != TokenMode::Markup
                || !child.kind().succession_rule().safe_in_markup()
            {
                break;
            }

            offset += child.len();
        }

        let (last_idx, last_end) = last?;
        let superseded_range = first_idx .. last_idx + 1;
        let superseded_span = first_start .. last_end;
        let last_kind = green.children()[last_idx].kind().clone();

        // First, we try if the child itself has another, more specific
        // applicable child.
        if superseded_range.len() == 1 {
            let child = &mut green.children_mut()[superseded_range.start];
            let prev_len = child.len();

            if last_kind.succession_rule() != SuccessionRule::Unsafe {
                if let Some(range) = match child {
                    Green::Node(node) => self.reparse_step(
                        Rc::make_mut(node),
                        first_start,
                        child_mode,
                        outermost,
                    ),
                    Green::Token(_) => None,
                } {
                    // A deeper level handled the change; propagate the length
                    // delta up to this node's metadata.
                    let new_len = child.len();
                    green.update_parent(new_len, prev_len);
                    return Some(range);
                }
            }
        }

        // We only replace multiple children in markup mode.
        if superseded_range.len() > 1 && mode == TokenMode::Code {
            return None;
        }

        // We now have a child that we can replace and a function to do so.
        let func = last_kind.reparsing_func(child_mode, indent)?;
        let succession = last_kind.succession_rule();

        let mut markup_min_column = 0;

        // If this is a markup node, we want to save its indent instead to pass
        // the right indent argument.
        if superseded_range.len() == 1 {
            let child = &mut green.children_mut()[superseded_range.start];
            if let NodeKind::Markup(n) = child.kind() {
                markup_min_column = *n;
            }
        }

        // The span of the to-be-reparsed children in the new source.
        let newborn_span = superseded_span.start
            ..
            superseded_span.end + self.replace_len - self.replace_range.len();

        // For atomic primaries we need to pass in the whole remaining string to
        // check whether the parser would eat more stuff illicitly.
        let reparse_span = if succession == SuccessionRule::AtomicPrimary {
            newborn_span.start .. self.src.len()
        } else {
            newborn_span.clone()
        };

        // Collect everything from the start of the current line up to the
        // reparse start; the parser needs it for column calculations.
        let mut prefix = "";
        for (i, c) in self.src[.. reparse_span.start].char_indices().rev() {
            if is_newline(c) {
                break;
            }
            prefix = &self.src[i .. reparse_span.start];
        }

        // Do the reparsing!
        let (mut newborns, terminated) = func(
            &prefix,
            &self.src[reparse_span.clone()],
            at_start,
            markup_min_column,
        )?;

        // Make sure that atomic primaries ate only what they were supposed to.
        if succession == SuccessionRule::AtomicPrimary {
            let len = newborn_span.len();
            if newborns.len() > 1 && newborns[0].len() == len {
                newborns.truncate(1);
            } else if newborns.iter().map(Green::len).sum::<usize>() != len {
                return None;
            }
        }

        // Do not accept unclosed nodes if the old node wasn't at the right edge
        // of the tree.
        if !outermost && !terminated {
            return None;
        }

        // If all post- and preconditions match, we are good to go!
        if validate(
            green.children(),
            superseded_range.clone(),
            at_start,
            &newborns,
            mode,
            succession,
            newborn_span.clone(),
            self.src,
        ) {
            green.replace_children(superseded_range, newborns);
            Some(newborn_span)
        } else {
            None
        }
    }
}
/// Validate that a node replacement is allowed by post- and preconditions.
///
/// - `superseded` are the old children of the parent node; the slice at
///   `superseded_range` is about to be replaced by `newborns`.
/// - `at_start` is the line-start state in front of the replaced range.
/// - `post` is the succession rule of the last superseded node.
/// - `newborn_span` is the byte span of the new children in `src`.
fn validate(
    superseded: &[Green],
    superseded_range: Range<usize>,
    mut at_start: bool,
    newborns: &[Green],
    mode: TokenMode,
    post: SuccessionRule,
    newborn_span: Range<usize>,
    src: &str,
) -> bool {
    // Atomic primaries must only generate one new child.
    if post == SuccessionRule::AtomicPrimary && newborns.len() != 1 {
        return false;
    }

    // Same kind in mode `inside` must generate only one child and that child
    // must be of the same kind as previously.
    if let SuccessionRule::SameKind(inside) = post {
        let superseded_kind = superseded[superseded_range.start].kind();
        let superseded_mode = superseded_kind.mode().unwrap_or(mode);
        if inside.map_or(true, |m| m == superseded_mode)
            && (newborns.len() != 1 || superseded_kind != newborns[0].kind())
        {
            return false;
        }
    }

    // Neighbor invariants are only relevant in markup mode.
    if mode == TokenMode::Code {
        return true;
    }

    // Check if there are any `AtStart` predecessors which require a certain
    // indentation.
    let s = Scanner::new(src);
    let mut prev_pos = newborn_span.start;
    for child in (&superseded[.. superseded_range.start]).iter().rev() {
        prev_pos -= child.len();
        if !child.kind().is_trivia() {
            if child.kind().neighbour_rule() == NeighbourRule::AtStart {
                let left_col = s.column(prev_pos);

                // Search for the first non-trivia newborn.
                let mut new_pos = newborn_span.start;
                let mut child_col = None;
                for child in newborns {
                    if !child.kind().is_trivia() {
                        child_col = Some(s.column(new_pos));
                        break;
                    }

                    new_pos += child.len();
                }

                // The newborn must not be indented further than the `AtStart`
                // predecessor.
                if let Some(child_col) = child_col {
                    if child_col > left_col {
                        return false;
                    }
                }
            }

            // Only the nearest non-trivia predecessor matters.
            break;
        }
    }

    // Compute the at_start state behind the new children.
    for child in newborns {
        at_start = child.kind().is_at_start(at_start);
    }

    // Ensure that a possible at-start or not-at-start precondition of
    // a node after the replacement range is satisfied.
    for child in &superseded[superseded_range.end ..] {
        let neighbour_rule = child.kind().neighbour_rule();
        if (neighbour_rule == NeighbourRule::AtStart && !at_start)
            || (neighbour_rule == NeighbourRule::NotAtStart && at_start)
        {
            return false;
        }

        if !child.kind().is_trivia() {
            break;
        }

        at_start = child.kind().is_at_start(at_start);
    }

    // Verify that the last of the newborns is not `NotAtEnd`.
    if newborns.last().map_or(false, |child| {
        child.kind().neighbour_rule() == NeighbourRule::NotAtEnd
    }) {
        return false;
    }

    // We have to check whether the last non-trivia newborn is `AtStart` and
    // verify the indent of its right neighbors in order to make sure its
    // indentation requirements are fulfilled.
    let mut child_pos = newborn_span.end;
    for child in newborns.iter().rev() {
        child_pos -= child.len();

        if child.kind().is_trivia() {
            continue;
        }

        if child.kind().neighbour_rule() == NeighbourRule::AtStart {
            let child_col = s.column(child_pos);

            // Find the first non-trivia right neighbor and compare columns.
            let mut right_pos = newborn_span.end;
            for child in &superseded[superseded_range.end ..] {
                if child.kind().is_trivia() {
                    right_pos += child.len();
                    continue;
                }

                if s.column(right_pos) > child_col {
                    return false;
                }
                break;
            }
        }
        break;
    }

    true
}
impl NodeKind {
    /// Return the correct reparsing function given the postconditions for the
    /// type.
    ///
    /// The returned function takes `(prefix, src, at_start, min_column)` and
    /// yields the new children plus whether the last token was terminated.
    fn reparsing_func(
        &self,
        parent_mode: TokenMode,
        indent: usize,
    ) -> Option<fn(&str, &str, bool, usize) -> Option<(Vec<Green>, bool)>> {
        let mode = self.mode().unwrap_or(parent_mode);
        match self.succession_rule() {
            SuccessionRule::Unsafe | SuccessionRule::UnsafeLayer => None,
            SuccessionRule::AtomicPrimary if mode == TokenMode::Code => {
                Some(parse_atomic)
            }
            SuccessionRule::AtomicPrimary => Some(parse_atomic_markup),
            SuccessionRule::SameKind(x) if x == None || x == Some(mode) => match self {
                NodeKind::Markup(_) => Some(parse_markup),
                NodeKind::Template => Some(parse_template),
                NodeKind::Block => Some(parse_block),
                NodeKind::LineComment | NodeKind::BlockComment => Some(parse_comment),
                _ => None,
            },
            _ => match mode {
                // Reparsing single markup elements is only supported at the
                // top level (indent zero).
                TokenMode::Markup if indent == 0 => Some(parse_markup_elements),
                _ => return None,
            },
        }
    }

    /// Whether it is safe to do incremental parsing on this node. Never allow
    /// non-termination errors if this is not already the last leaf node.
    pub fn succession_rule(&self) -> SuccessionRule {
        match self {
            // Replacing parentheses changes if the expression is balanced and
            // is therefore not safe.
            Self::LeftBracket
            | Self::RightBracket
            | Self::LeftBrace
            | Self::RightBrace
            | Self::LeftParen
            | Self::RightParen => SuccessionRule::Unsafe,

            // Replacing an operator can change whether the parent is an
            // operation which makes it unsafe. The star can appear in markup.
            Self::Star
            | Self::Comma
            | Self::Semicolon
            | Self::Colon
            | Self::Plus
            | Self::Minus
            | Self::Slash
            | Self::Eq
            | Self::EqEq
            | Self::ExclEq
            | Self::Lt
            | Self::LtEq
            | Self::Gt
            | Self::GtEq
            | Self::PlusEq
            | Self::HyphEq
            | Self::StarEq
            | Self::SlashEq
            | Self::Not
            | Self::And
            | Self::Or
            | Self::With
            | Self::Dots
            | Self::Arrow => SuccessionRule::Unsafe,

            // These keywords change what kind of expression the parent is and
            // how far the expression would go.
            Self::Let
            | Self::Set
            | Self::If
            | Self::Else
            | Self::For
            | Self::In
            | Self::While
            | Self::Break
            | Self::Continue
            | Self::Return
            | Self::Import
            | Self::Include
            | Self::From => SuccessionRule::Unsafe,

            // Changing the heading level, enum numbering, or list bullet
            // changes the next layer.
            Self::EnumNumbering(_) => SuccessionRule::Unsafe,

            // This can be anything, so we don't make any promises.
            Self::Error(_, _) | Self::Unknown(_) => SuccessionRule::Unsafe,

            // These are complex expressions which may screw with their
            // environments.
            Self::Call
            | Self::Unary
            | Self::Binary
            | Self::CallArgs
            | Self::Named
            | Self::Spread => SuccessionRule::UnsafeLayer,

            // The closure is a bit magic with the let expression, and also it
            // is not atomic.
            Self::Closure | Self::ClosureParams => SuccessionRule::UnsafeLayer,

            // Missing these creates errors for the parents.
            Self::WithExpr | Self::ForPattern | Self::ImportItems => {
                SuccessionRule::UnsafeLayer
            }

            // Only markup is expected at the points where it does occur. The
            // indentation must be preserved as well, also for the children.
            Self::Markup(_) => SuccessionRule::SameKind(None),

            // These can appear everywhere and must not change to other stuff
            // because that could change the outer expression.
            Self::LineComment | Self::BlockComment => SuccessionRule::SameKind(None),

            // These can appear as bodies and would trigger an error if they
            // became something else.
            Self::Template => SuccessionRule::SameKind(None),
            Self::Block => SuccessionRule::SameKind(Some(TokenMode::Code)),

            // Whitespace in code mode has to remain whitespace or else the type
            // of things would change.
            Self::Space(_) => SuccessionRule::SameKind(Some(TokenMode::Code)),

            // These are expressions that can be replaced by other expressions.
            Self::Ident(_)
            | Self::Bool(_)
            | Self::Int(_)
            | Self::Float(_)
            | Self::Length(_, _)
            | Self::Angle(_, _)
            | Self::Percentage(_)
            | Self::Str(_)
            | Self::Fraction(_)
            | Self::Array
            | Self::Dict
            | Self::Group
            | Self::None
            | Self::Auto => SuccessionRule::AtomicPrimary,

            // More complex, but still an expression.
            Self::ForExpr
            | Self::WhileExpr
            | Self::IfExpr
            | Self::LetExpr
            | Self::SetExpr
            | Self::ImportExpr
            | Self::IncludeExpr => SuccessionRule::AtomicPrimary,

            // This element always has to remain in the same column so better
            // reparse the whole parent.
            Self::Raw(_) => SuccessionRule::Unsafe,

            // These are all replaceable by other tokens.
            Self::Parbreak
            | Self::Linebreak
            | Self::Text(_)
            | Self::TextInLine(_)
            | Self::NonBreakingSpace
            | Self::EnDash
            | Self::EmDash
            | Self::Escape(_)
            | Self::Strong
            | Self::Emph
            | Self::Heading
            | Self::Enum
            | Self::List
            | Self::Math(_) => SuccessionRule::Safe,
        }
    }

    /// The appropriate precondition for the type.
    pub fn neighbour_rule(&self) -> NeighbourRule {
        match self {
            Self::Heading | Self::Enum | Self::List => NeighbourRule::AtStart,
            Self::TextInLine(_) => NeighbourRule::NotAtStart,
            Self::Error(_, _) => NeighbourRule::NotAtEnd,
            _ => NeighbourRule::None,
        }
    }
}
impl SuccessionRule {
    /// Whether a node with this condition can be reparsed in markup mode.
    pub fn safe_in_markup(&self) -> bool {
        // Same-kind nodes are safe in markup unless they are bound to the
        // markup mode itself (or to no particular mode at all).
        if let Self::SameKind(mode) = self {
            return matches!(mode, Some(m) if *m != TokenMode::Markup);
        }

        matches!(self, Self::Safe | Self::UnsafeLayer)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parse::parse;
    use crate::source::SourceFile;

    #[test]
    #[rustfmt::skip]
    fn test_incremental_parse() {
        // Applies the edit `with` over `range`, asserts that exactly `goal`
        // was reparsed, and that the incrementally updated tree equals a
        // from-scratch parse of the edited source.
        #[track_caller]
        fn test(prev: &str, range: Range<usize>, with: &str, goal: Range<usize>) {
            let mut source = SourceFile::detached(prev);
            let range = source.edit(range, with);
            assert_eq!(range, goal);
            assert_eq!(parse(source.src()), *source.root());
        }

        // Test simple replacements.
        test("hello world", 6 .. 11, "walkers", 5 .. 13);
        test("some content", 0..12, "", 0..0);
        test("", 0..0, "do it", 0..5);
        test("a d e", 1 .. 3, " b c d", 0 .. 8);
        test("a #f() e", 1 .. 6, " b c d", 0 .. 8);
        test("{(0, 1, 2)}", 5 .. 6, "11pt", 5 .. 9);
        test("= A heading", 3 .. 3, "n evocative", 2 .. 22);
        test("your thing", 5 .. 5, "a", 4 .. 11);
        test("a your thing a", 6 .. 7, "a", 2 .. 12);
        test("{call(); abc}", 7 .. 7, "[]", 0 .. 15);
        test("#call() abc", 7 .. 7, "[]", 0 .. 10);
        test("hi[\n- item\n- item 2\n - item 3]", 11 .. 11, " ", 3 .. 34);
        test("hi\n- item\nno item\n - item 3", 10 .. 10, "- ", 0 .. 32);
        test("#grid(columns: (auto, 1fr, 40%), [*plonk*], rect(width: 100%, height: 1pt, fill: conifer), [thing])", 16 .. 20, "none", 16 .. 20);
        test("#grid(columns: (auto, 1fr, 40%), [*plonk*], rect(width: 100%, height: 1pt, fill: conifer), [thing])", 33 .. 42, "[_gronk_]", 33 .. 42);
        test("#grid(columns: (auto, 1fr, 40%), [*plonk*], rect(width: 100%, height: 1pt, fill: conifer), [thing])", 34 .. 41, "_bar_", 34 .. 39);
        test("{let i=1; for x in range(5) {i}}", 6 .. 6, " ", 1 .. 9);
        test("{let i=1; for x in range(5) {i}}", 13 .. 14, " ", 10 .. 32);
        test("hello {x}", 6 .. 9, "#f()", 5 .. 10);
        test("this is -- in my opinion -- spectacular", 8 .. 10, "---", 7 .. 12);
        test("understanding `code` is complicated", 15 .. 15, "C ", 0 .. 37);
        test("{ let x = g() }", 10 .. 12, "f(54", 2 .. 15);
        test("a #let rect with (fill: eastern)\nb", 16 .. 31, " (stroke: conifer", 2 .. 34);

        // Test the whitespace invariants.
        test("hello \\ world", 7 .. 8, "a ", 6 .. 14);
        test("hello \\ world", 7 .. 8, " a", 6 .. 14);
        test("x = y", 1 .. 1, " + y", 0 .. 6);
        test("x = y", 1 .. 1, " + y\n", 0 .. 10);
        test("abc\n= a heading\njoke", 3 .. 4, "\nmore\n\n", 0 .. 21);
        test("abc\n= a heading\njoke", 3 .. 4, "\nnot ", 0 .. 19);
        test("#let x = (1, 2 + ; Five\r\n\r", 19..22, "2.", 18..22);
        test("hey #myfriend", 4 .. 4, "\\", 0 .. 14);
        test("hey #myfriend", 4 .. 4, "\\", 3 .. 6);

        // Test type invariants.
        test("a #for x in array {x}", 18 .. 21, "[#x]", 2 .. 22);
        test("a #let x = 1 {5}", 3 .. 6, "if", 0 .. 15);
        test("a {let x = 1 {5}} b", 3 .. 6, "if", 2 .. 16);
        test("#let x = 1 {5}", 4 .. 4, " if", 0 .. 17);
        test("{let x = 1 {5}}", 4 .. 4, " if", 0 .. 18);
        test("a // b c #f()", 3 .. 4, "", 0 .. 12);
        test("{\nf()\n//g(a)\n}", 6 .. 8, "", 0 .. 12);
        test("a{\nf()\n//g(a)\n}b", 7 .. 9, "", 1 .. 13);
        test("a #while x {\n g(x) \n} b", 11 .. 11, "//", 0 .. 26);
        test("{(1, 2)}", 1 .. 1, "while ", 0 .. 14);
        test("a b c", 1 .. 1, "{[}", 0 .. 8);

        // Test unclosed things.
        test(r#"{"hi"}"#, 4 .. 5, "c", 0 .. 6);
        test(r"this \u{abcd}", 8 .. 9, "", 5 .. 12);
        test(r"this \u{abcd} that", 12 .. 13, "", 0 .. 17);
        test(r"{{let x = z}; a = 1} b", 6 .. 6, "//", 0 .. 24);
        test("a b c", 1 .. 1, " /* letters */", 0 .. 16);
        test("a b c", 1 .. 1, " /* letters", 0 .. 16);
        test("{if i==1 {a} else [b]; b()}", 12 .. 12, " /* letters */", 1 .. 35);
        test("{if i==1 {a} else [b]; b()}", 12 .. 12, " /* letters", 0 .. 38);

        // Test raw tokens.
        test(r#"a ```typst hello``` b"#, 16 .. 17, "", 0 .. 20);
        test(r#"a ```typst hello```"#, 16 .. 17, "", 0 .. 18);
    }
}

View File

@ -1,10 +1,12 @@
//! Parsing and tokenization. //! Parsing and tokenization.
mod incremental;
mod parser; mod parser;
mod resolve; mod resolve;
mod scanner; mod scanner;
mod tokens; mod tokens;
pub use incremental::*;
pub use parser::*; pub use parser::*;
pub use resolve::*; pub use resolve::*;
pub use scanner::*; pub use scanner::*;
@ -14,10 +16,11 @@ use std::rc::Rc;
use crate::syntax::ast::{Associativity, BinOp, UnOp}; use crate::syntax::ast::{Associativity, BinOp, UnOp};
use crate::syntax::{ErrorPos, Green, GreenNode, NodeKind}; use crate::syntax::{ErrorPos, Green, GreenNode, NodeKind};
use crate::util::EcoString;
/// Parse a source file. /// Parse a source file.
pub fn parse(src: &str) -> Rc<GreenNode> { pub fn parse(src: &str) -> Rc<GreenNode> {
let mut p = Parser::new(src); let mut p = Parser::new(src, TokenMode::Markup);
markup(&mut p); markup(&mut p);
match p.finish().into_iter().next() { match p.finish().into_iter().next() {
Some(Green::Node(node)) => node, Some(Green::Node(node)) => node,
@ -25,9 +28,108 @@ pub fn parse(src: &str) -> Rc<GreenNode> {
} }
} }
/// Parse an atomic primary. Returns `Some` if all of the input was consumed.
pub fn parse_atomic(
    prefix: &str,
    src: &str,
    _: bool,
    _: usize,
) -> Option<(Vec<Green>, bool)> {
    let mut parser = Parser::with_prefix(prefix, src, TokenMode::Code);

    // Bail out if not even a single atomic primary could be parsed.
    if primary(&mut parser, true).is_err() {
        return None;
    }

    parser.consume_unterminated()
}
/// Parse an atomic primary. Returns `Some` if all of the input was consumed.
pub fn parse_atomic_markup(
    prefix: &str,
    src: &str,
    _: bool,
    _: usize,
) -> Option<(Vec<Green>, bool)> {
    // In markup mode, an expression cannot fail to start, so no error check
    // is needed before collecting the result.
    let mut parser = Parser::with_prefix(prefix, src, TokenMode::Markup);
    markup_expr(&mut parser);
    parser.consume_unterminated()
}
/// Parse some markup. Returns `Some` if all of the input was consumed.
pub fn parse_markup(
    prefix: &str,
    src: &str,
    _: bool,
    min_column: usize,
) -> Option<(Vec<Green>, bool)> {
    let mut parser = Parser::with_prefix(prefix, src, TokenMode::Markup);

    match min_column {
        // No column restriction: parse unrestricted markup.
        0 => markup(&mut parser),
        // Otherwise only content indented past `column` belongs to this node.
        column => markup_indented(&mut parser, column),
    }

    parser.consume()
}
/// Parse some markup without the topmost node. Returns `Some` if all of the
/// input was consumed.
pub fn parse_markup_elements(
    prefix: &str,
    src: &str,
    mut at_start: bool,
    _: usize,
) -> Option<(Vec<Green>, bool)> {
    let mut parser = Parser::with_prefix(prefix, src, TokenMode::Markup);

    // Consume markup nodes one by one until the input is exhausted, threading
    // the line-start state through each of them.
    loop {
        if parser.eof() {
            break;
        }
        markup_node(&mut parser, &mut at_start);
    }

    parser.consume()
}
/// Parse a template literal. Returns `Some` if all of the input was consumed.
pub fn parse_template(
    prefix: &str,
    src: &str,
    _: bool,
    _: usize,
) -> Option<(Vec<Green>, bool)> {
    let mut parser = Parser::with_prefix(prefix, src, TokenMode::Code);

    // A template must begin with an opening bracket; anything else means the
    // replaced node is no longer a template and cannot be reparsed here.
    match parser.at(&NodeKind::LeftBracket) {
        true => {
            template(&mut parser);
            parser.consume()
        }
        false => None,
    }
}
/// Parse a code block. Returns `Some` if all of the input was consumed.
pub fn parse_block(
    prefix: &str,
    src: &str,
    _: bool,
    _: usize,
) -> Option<(Vec<Green>, bool)> {
    let mut parser = Parser::with_prefix(prefix, src, TokenMode::Code);

    // A code block must begin with an opening brace; anything else means the
    // replaced node is no longer a block and cannot be reparsed here.
    match parser.at(&NodeKind::LeftBrace) {
        true => {
            block(&mut parser);
            parser.consume()
        }
        false => None,
    }
}
/// Parse a comment. Returns `Some` if all of the input was consumed.
pub fn parse_comment(
    prefix: &str,
    src: &str,
    _: bool,
    _: usize,
) -> Option<(Vec<Green>, bool)> {
    let mut parser = Parser::with_prefix(prefix, src, TokenMode::Code);

    // If the current token is not a comment, the node cannot be reparsed as
    // one.
    if comment(&mut parser).is_err() {
        return None;
    }

    parser.consume()
}
/// Parse markup. /// Parse markup.
fn markup(p: &mut Parser) { fn markup(p: &mut Parser) {
markup_while(p, true, &mut |_| true) markup_while(p, true, 0, &mut |_| true)
} }
/// Parse markup that stays right of the given column. /// Parse markup that stays right of the given column.
@ -38,7 +140,7 @@ fn markup_indented(p: &mut Parser, column: usize) {
_ => false, _ => false,
}); });
markup_while(p, false, &mut |p| match p.peek() { markup_while(p, false, column, &mut |p| match p.peek() {
Some(NodeKind::Space(n)) if *n >= 1 => p.column(p.current_end()) >= column, Some(NodeKind::Space(n)) if *n >= 1 => p.column(p.current_end()) >= column,
_ => true, _ => true,
}) })
@ -48,11 +150,11 @@ fn markup_indented(p: &mut Parser, column: usize) {
/// ///
/// If `at_start` is true, things like headings that may only appear at the /// If `at_start` is true, things like headings that may only appear at the
/// beginning of a line or template are allowed. /// beginning of a line or template are allowed.
fn markup_while<F>(p: &mut Parser, mut at_start: bool, f: &mut F) fn markup_while<F>(p: &mut Parser, mut at_start: bool, column: usize, f: &mut F)
where where
F: FnMut(&mut Parser) -> bool, F: FnMut(&mut Parser) -> bool,
{ {
p.perform(NodeKind::Markup, |p| { p.perform(NodeKind::Markup(column), |p| {
while !p.eof() && f(p) { while !p.eof() && f(p) {
markup_node(p, &mut at_start); markup_node(p, &mut at_start);
} }
@ -98,14 +200,9 @@ fn markup_node(p: &mut Parser, at_start: &mut bool) {
p.eat(); p.eat();
} }
NodeKind::Eq if *at_start => heading(p), NodeKind::Eq => heading(p, *at_start),
NodeKind::Minus if *at_start => list_node(p), NodeKind::Minus => list_node(p, *at_start),
NodeKind::EnumNumbering(_) if *at_start => enum_node(p), NodeKind::EnumNumbering(_) => enum_node(p, *at_start),
// Line-based markup that is not currently at the start of the line.
NodeKind::Eq | NodeKind::Minus | NodeKind::EnumNumbering(_) => {
p.convert(NodeKind::Text(p.peek_src().into()));
}
// Hashtag + keyword / identifier. // Hashtag + keyword / identifier.
NodeKind::Ident(_) NodeKind::Ident(_)
@ -115,17 +212,7 @@ fn markup_node(p: &mut Parser, at_start: &mut bool) {
| NodeKind::While | NodeKind::While
| NodeKind::For | NodeKind::For
| NodeKind::Import | NodeKind::Import
| NodeKind::Include => { | NodeKind::Include => markup_expr(p),
let stmt = matches!(token, NodeKind::Let | NodeKind::Set | NodeKind::Import);
let group = if stmt { Group::Stmt } else { Group::Expr };
p.start_group(group);
let res = expr_prec(p, true, 0);
if stmt && res.is_ok() && !p.eof() {
p.expected_at("semicolon or line break");
}
p.end_group();
}
// Block and template. // Block and template.
NodeKind::LeftBrace => block(p), NodeKind::LeftBrace => block(p),
@ -139,31 +226,65 @@ fn markup_node(p: &mut Parser, at_start: &mut bool) {
} }
/// Parse a heading. /// Parse a heading.
fn heading(p: &mut Parser) { fn heading(p: &mut Parser, at_start: bool) {
p.perform(NodeKind::Heading, |p| { let marker = p.marker();
p.eat_assert(&NodeKind::Eq); let current_start = p.current_start();
while p.eat_if(&NodeKind::Eq) {} p.eat_assert(&NodeKind::Eq);
while p.eat_if(&NodeKind::Eq) {}
if at_start && p.peek().map_or(true, |kind| kind.is_whitespace()) {
let column = p.column(p.prev_end()); let column = p.column(p.prev_end());
markup_indented(p, column); markup_indented(p, column);
}); marker.end(p, NodeKind::Heading);
} else {
let text = p.get(current_start .. p.prev_end()).into();
marker.convert(p, NodeKind::TextInLine(text));
}
} }
/// Parse a single list item. /// Parse a single list item.
fn list_node(p: &mut Parser) { fn list_node(p: &mut Parser, at_start: bool) {
p.perform(NodeKind::List, |p| { let marker = p.marker();
p.eat_assert(&NodeKind::Minus); let text: EcoString = p.peek_src().into();
p.eat_assert(&NodeKind::Minus);
if at_start && p.peek().map_or(true, |kind| kind.is_whitespace()) {
let column = p.column(p.prev_end()); let column = p.column(p.prev_end());
markup_indented(p, column); markup_indented(p, column);
}); marker.end(p, NodeKind::List);
} else {
marker.convert(p, NodeKind::TextInLine(text));
}
} }
/// Parse a single enum item. /// Parse a single enum item.
fn enum_node(p: &mut Parser) { fn enum_node(p: &mut Parser, at_start: bool) {
p.perform(NodeKind::Enum, |p| { let marker = p.marker();
p.eat(); let text: EcoString = p.peek_src().into();
p.eat();
if at_start && p.peek().map_or(true, |kind| kind.is_whitespace()) {
let column = p.column(p.prev_end()); let column = p.column(p.prev_end());
markup_indented(p, column); markup_indented(p, column);
}); marker.end(p, NodeKind::Enum);
} else {
marker.convert(p, NodeKind::TextInLine(text));
}
}
/// Parse an expression within markup mode.
fn markup_expr(p: &mut Parser) {
if let Some(token) = p.peek() {
let stmt = matches!(token, NodeKind::Let | NodeKind::Set | NodeKind::Import);
let group = if stmt { Group::Stmt } else { Group::Expr };
p.start_group(group);
let res = expr_prec(p, true, 0);
if stmt && res.is_ok() && !p.eof() {
p.expected_at("semicolon or line break");
}
p.end_group();
}
} }
/// Parse an expression. /// Parse an expression.
@ -183,13 +304,13 @@ fn expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) -> ParseResult {
// Start the unary expression. // Start the unary expression.
match p.peek().and_then(UnOp::from_token) { match p.peek().and_then(UnOp::from_token) {
Some(op) => { Some(op) if !atomic => {
p.eat(); p.eat();
let prec = op.precedence(); let prec = op.precedence();
expr_prec(p, atomic, prec)?; expr_prec(p, atomic, prec)?;
marker.end(p, NodeKind::Unary); marker.end(p, NodeKind::Unary);
} }
None => primary(p, atomic)?, _ => primary(p, atomic)?,
}; };
loop { loop {
@ -254,7 +375,7 @@ fn primary(p: &mut Parser, atomic: bool) -> ParseResult {
} }
// Structures. // Structures.
Some(NodeKind::LeftParen) => parenthesized(p), Some(NodeKind::LeftParen) => parenthesized(p, atomic),
Some(NodeKind::LeftBracket) => { Some(NodeKind::LeftBracket) => {
template(p); template(p);
Ok(()) Ok(())
@ -315,7 +436,7 @@ fn literal(p: &mut Parser) -> bool {
/// - Dictionary literal /// - Dictionary literal
/// - Parenthesized expression /// - Parenthesized expression
/// - Parameter list of closure expression /// - Parameter list of closure expression
fn parenthesized(p: &mut Parser) -> ParseResult { fn parenthesized(p: &mut Parser, atomic: bool) -> ParseResult {
let marker = p.marker(); let marker = p.marker();
p.start_group(Group::Paren); p.start_group(Group::Paren);
@ -330,7 +451,7 @@ fn parenthesized(p: &mut Parser) -> ParseResult {
} }
// Arrow means this is a closure's parameter list. // Arrow means this is a closure's parameter list.
if p.at(&NodeKind::Arrow) { if !atomic && p.at(&NodeKind::Arrow) {
params(p, marker); params(p, marker);
p.eat_assert(&NodeKind::Arrow); p.eat_assert(&NodeKind::Arrow);
return marker.perform(p, NodeKind::Closure, expr); return marker.perform(p, NodeKind::Closure, expr);
@ -706,3 +827,14 @@ fn body(p: &mut Parser) -> ParseResult {
} }
Ok(()) Ok(())
} }
/// Parse a comment.
fn comment(p: &mut Parser) -> ParseResult {
    // Only line and block comments qualify; everything else is an error for
    // the caller to handle.
    if matches!(p.peek(), Some(NodeKind::LineComment | NodeKind::BlockComment)) {
        p.eat();
        Ok(())
    } else {
        Err(ParseError)
    }
}

View File

@ -1,7 +1,8 @@
use core::slice::SliceIndex;
use std::fmt::{self, Display, Formatter}; use std::fmt::{self, Display, Formatter};
use std::mem; use std::mem;
use super::{TokenMode, Tokens}; use super::{Scanner, TokenMode, Tokens};
use crate::syntax::{ErrorPos, Green, GreenData, GreenNode, NodeKind}; use crate::syntax::{ErrorPos, Green, GreenData, GreenNode, NodeKind};
use crate::util::EcoString; use crate::util::EcoString;
@ -21,12 +22,17 @@ pub struct Parser<'s> {
groups: Vec<GroupEntry>, groups: Vec<GroupEntry>,
/// The children of the currently built node. /// The children of the currently built node.
children: Vec<Green>, children: Vec<Green>,
/// Is `Some` if there is an unterminated group at the last position where
/// groups were terminated.
last_unterminated: Option<usize>,
/// Offsets the indentation on the first line of the source.
column_offset: usize,
} }
impl<'s> Parser<'s> { impl<'s> Parser<'s> {
/// Create a new parser for the source string. /// Create a new parser for the source string.
pub fn new(src: &'s str) -> Self { pub fn new(src: &'s str, mode: TokenMode) -> Self {
let mut tokens = Tokens::new(src, TokenMode::Markup); let mut tokens = Tokens::new(src, mode);
let current = tokens.next(); let current = tokens.next();
Self { Self {
tokens, tokens,
@ -36,14 +42,38 @@ impl<'s> Parser<'s> {
current_start: 0, current_start: 0,
groups: vec![], groups: vec![],
children: vec![], children: vec![],
last_unterminated: None,
column_offset: 0,
} }
} }
/// Create a new parser for the source string that is prefixed by some text
/// that does not need to be parsed but taken into account for column
/// calculation.
pub fn with_prefix(prefix: &str, src: &'s str, mode: TokenMode) -> Self {
    // The column of the prefix's end offsets all columns on the first line.
    let offset = Scanner::new(prefix).column(prefix.len());
    let mut parser = Self::new(src, mode);
    parser.column_offset = offset;
    parser
}
/// End the parsing process and return the last child. /// End the parsing process and return the last child.
pub fn finish(self) -> Vec<Green> { pub fn finish(self) -> Vec<Green> {
self.children self.children
} }
/// End the parsing process, returning the parsed children and whether the
/// last token was terminated — but only if the parser has reached the end
/// of its input and all groups were terminated.
pub fn consume(self) -> Option<(Vec<Green>, bool)> {
    if self.eof() && self.terminated() {
        Some((self.children, self.tokens.terminated()))
    } else {
        None
    }
}
/// End the parsing process, returning the parsed children and whether the
/// last token was terminated, even if there remains stuff in the string.
pub fn consume_unterminated(self) -> Option<(Vec<Green>, bool)> {
    if !self.terminated() {
        return None;
    }
    Some((self.children, self.tokens.terminated()))
}
/// Create a new marker. /// Create a new marker.
pub fn marker(&mut self) -> Marker { pub fn marker(&mut self) -> Marker {
Marker(self.children.len()) Marker(self.children.len())
@ -170,6 +200,14 @@ impl<'s> Parser<'s> {
self.tokens.scanner().get(self.current_start() .. self.current_end()) self.tokens.scanner().get(self.current_start() .. self.current_end())
} }
/// Obtain a range of the source code.
pub fn get<I>(&self, index: I) -> &'s str
where
    I: SliceIndex<str, Output = str>,
{
    // The scanner is `Copy` and borrows the source for `'s`, so the
    // returned slice outlives this parser borrow.
    let scanner = self.tokens.scanner();
    scanner.get(index)
}
/// The byte index at which the last non-trivia token ended. /// The byte index at which the last non-trivia token ended.
pub fn prev_end(&self) -> usize { pub fn prev_end(&self) -> usize {
self.prev_end self.prev_end
@ -187,7 +225,7 @@ impl<'s> Parser<'s> {
/// Determine the column index for the given byte index. /// Determine the column index for the given byte index.
pub fn column(&self, index: usize) -> usize { pub fn column(&self, index: usize) -> usize {
self.tokens.scanner().column(index) self.tokens.scanner().column_offset(index, self.column_offset)
} }
/// Continue parsing in a group. /// Continue parsing in a group.
@ -225,6 +263,9 @@ impl<'s> Parser<'s> {
let group = self.groups.pop().expect("no started group"); let group = self.groups.pop().expect("no started group");
self.tokens.set_mode(group.prev_mode); self.tokens.set_mode(group.prev_mode);
self.repeek(); self.repeek();
if self.last_unterminated != Some(self.prev_end()) {
self.last_unterminated = None;
}
let mut rescan = self.tokens.mode() != group_mode; let mut rescan = self.tokens.mode() != group_mode;
@ -243,6 +284,7 @@ impl<'s> Parser<'s> {
rescan = false; rescan = false;
} else if required { } else if required {
self.push_error(format_eco!("expected {}", end)); self.push_error(format_eco!("expected {}", end));
self.last_unterminated = Some(self.prev_end());
} }
} }
@ -260,6 +302,11 @@ impl<'s> Parser<'s> {
} }
} }
/// Whether all groups opened so far were correctly terminated.
pub fn terminated(&self) -> bool {
    self.last_unterminated.is_none() && self.groups.is_empty()
}
/// Low-level bump that consumes exactly one token without special trivia /// Low-level bump that consumes exactly one token without special trivia
/// handling. /// handling.
fn bump(&mut self) { fn bump(&mut self) {
@ -320,7 +367,8 @@ impl Parser<'_> {
/// Push an error into the children list. /// Push an error into the children list.
pub fn push_error(&mut self, msg: impl Into<EcoString>) { pub fn push_error(&mut self, msg: impl Into<EcoString>) {
let error = NodeKind::Error(ErrorPos::Full, msg.into()); let error = NodeKind::Error(ErrorPos::Full, msg.into());
self.children.push(GreenData::new(error, 0).into()); let idx = self.trivia_start();
self.children.insert(idx.0, GreenData::new(error, 0).into());
} }
/// Eat the current token and add an error that it is unexpected. /// Eat the current token and add an error that it is unexpected.
@ -419,6 +467,7 @@ impl Marker {
} }
/// A logical group of tokens, e.g. `[...]`. /// A logical group of tokens, e.g. `[...]`.
#[derive(Debug)]
struct GroupEntry { struct GroupEntry {
/// The kind of group this is. This decides which tokens will end the group. /// The kind of group this is. This decides which tokens will end the group.
/// For example, a [`Group::Paren`] will be ended by /// For example, a [`Group::Paren`] will be ended by

View File

@ -162,11 +162,26 @@ impl<'s> Scanner<'s> {
/// The column index of a given index in the source string. /// The column index of a given index in the source string.
#[inline] #[inline]
pub fn column(&self, index: usize) -> usize { pub fn column(&self, index: usize) -> usize {
self.src[.. index] self.column_offset(index, 0)
.chars() }
/// The column index of a given index in the source string when an offset is
/// applied to the first line of the string.
#[inline]
pub fn column_offset(&self, index: usize, offset: usize) -> usize {
let mut apply_offset = false;
let res = self.src[.. index]
.char_indices()
.rev() .rev()
.take_while(|&c| !is_newline(c)) .take_while(|&(_, c)| !is_newline(c))
.count() .inspect(|&(i, _)| {
if i == 0 {
apply_offset = true
}
})
.count();
if apply_offset { res + offset } else { res }
} }
} }

View File

@ -13,6 +13,7 @@ use crate::util::EcoString;
pub struct Tokens<'s> { pub struct Tokens<'s> {
s: Scanner<'s>, s: Scanner<'s>,
mode: TokenMode, mode: TokenMode,
terminated: bool,
} }
/// What kind of tokens to emit. /// What kind of tokens to emit.
@ -28,7 +29,11 @@ impl<'s> Tokens<'s> {
/// Create a new token iterator with the given mode. /// Create a new token iterator with the given mode.
#[inline] #[inline]
pub fn new(src: &'s str, mode: TokenMode) -> Self { pub fn new(src: &'s str, mode: TokenMode) -> Self {
Self { s: Scanner::new(src), mode } Self {
s: Scanner::new(src),
mode,
terminated: true,
}
} }
/// Get the current token mode. /// Get the current token mode.
@ -63,6 +68,12 @@ impl<'s> Tokens<'s> {
pub fn scanner(&self) -> Scanner<'s> { pub fn scanner(&self) -> Scanner<'s> {
self.s self.s
} }
/// Whether the last token was terminated.
///
/// The tokenizer sets this to `false` when it produces a token whose
/// closing delimiter was missing: an unclosed brace escape, raw block,
/// math node, string, or block comment, or a line comment that runs to
/// the end of input.
#[inline]
pub fn terminated(&self) -> bool {
    self.terminated
}
} }
impl<'s> Iterator for Tokens<'s> { impl<'s> Iterator for Tokens<'s> {
@ -117,9 +128,7 @@ impl<'s> Tokens<'s> {
'`' => self.raw(), '`' => self.raw(),
'$' => self.math(), '$' => self.math(),
'-' => self.hyph(), '-' => self.hyph(),
'=' if self.s.check_or(true, |c| c == '=' || c.is_whitespace()) => { '=' => NodeKind::Eq,
NodeKind::Eq
}
c if c == '.' || c.is_ascii_digit() => self.numbering(start, c), c if c == '.' || c.is_ascii_digit() => self.numbering(start, c),
// Plain text. // Plain text.
@ -248,6 +257,7 @@ impl<'s> Tokens<'s> {
) )
} }
} else { } else {
self.terminated = false;
NodeKind::Error( NodeKind::Error(
ErrorPos::End, ErrorPos::End,
"expected closing brace".into(), "expected closing brace".into(),
@ -281,10 +291,8 @@ impl<'s> Tokens<'s> {
} else { } else {
NodeKind::EnDash NodeKind::EnDash
} }
} else if self.s.check_or(true, char::is_whitespace) {
NodeKind::Minus
} else { } else {
NodeKind::Text('-'.into()) NodeKind::Minus
} }
} }
@ -300,11 +308,7 @@ impl<'s> Tokens<'s> {
None None
}; };
if self.s.check_or(true, char::is_whitespace) { NodeKind::EnumNumbering(number)
NodeKind::EnumNumbering(number)
} else {
NodeKind::Text(self.s.eaten_from(start).into())
}
} }
fn raw(&mut self) -> NodeKind { fn raw(&mut self) -> NodeKind {
@ -346,6 +350,7 @@ impl<'s> Tokens<'s> {
let remaining = backticks - found; let remaining = backticks - found;
let noun = if remaining == 1 { "backtick" } else { "backticks" }; let noun = if remaining == 1 { "backtick" } else { "backticks" };
self.terminated = false;
NodeKind::Error( NodeKind::Error(
ErrorPos::End, ErrorPos::End,
if found == 0 { if found == 0 {
@ -393,6 +398,7 @@ impl<'s> Tokens<'s> {
display, display,
})) }))
} else { } else {
self.terminated = false;
NodeKind::Error( NodeKind::Error(
ErrorPos::End, ErrorPos::End,
if !display || (!escaped && dollar) { if !display || (!escaped && dollar) {
@ -481,18 +487,23 @@ impl<'s> Tokens<'s> {
if self.s.eat_if('"') { if self.s.eat_if('"') {
NodeKind::Str(string) NodeKind::Str(string)
} else { } else {
self.terminated = false;
NodeKind::Error(ErrorPos::End, "expected quote".into()) NodeKind::Error(ErrorPos::End, "expected quote".into())
} }
} }
fn line_comment(&mut self) -> NodeKind { fn line_comment(&mut self) -> NodeKind {
self.s.eat_until(is_newline); self.s.eat_until(is_newline);
if self.s.peek().is_none() {
self.terminated = false;
}
NodeKind::LineComment NodeKind::LineComment
} }
fn block_comment(&mut self) -> NodeKind { fn block_comment(&mut self) -> NodeKind {
let mut state = '_'; let mut state = '_';
let mut depth = 1; let mut depth = 1;
self.terminated = false;
// Find the first `*/` that does not correspond to a nested `/*`. // Find the first `*/` that does not correspond to a nested `/*`.
while let Some(c) = self.s.eat() { while let Some(c) = self.s.eat() {
@ -500,6 +511,7 @@ impl<'s> Tokens<'s> {
('*', '/') => { ('*', '/') => {
depth -= 1; depth -= 1;
if depth == 0 { if depth == 0 {
self.terminated = true;
break; break;
} }
'_' '_'
@ -713,6 +725,7 @@ mod tests {
t!(Both["a1/"]: " \n" => Space(1)); t!(Both["a1/"]: " \n" => Space(1));
t!(Both["a1/"]: " \n " => Space(1)); t!(Both["a1/"]: " \n " => Space(1));
t!(Both["a1/"]: "\r\n" => Space(1)); t!(Both["a1/"]: "\r\n" => Space(1));
t!(Both["a1/"]: "\r\n\r" => Space(2));
t!(Both["a1/"]: " \n\t \n " => Space(2)); t!(Both["a1/"]: " \n\t \n " => Space(2));
t!(Both["a1/"]: "\n\r" => Space(2)); t!(Both["a1/"]: "\n\r" => Space(2));
t!(Both["a1/"]: " \r\r\n \x0D" => Space(3)); t!(Both["a1/"]: " \r\r\n \x0D" => Space(3));
@ -722,12 +735,12 @@ mod tests {
fn test_tokenize_text() { fn test_tokenize_text() {
// Test basic text. // Test basic text.
t!(Markup[" /"]: "hello" => Text("hello")); t!(Markup[" /"]: "hello" => Text("hello"));
t!(Markup[" /"]: "hello-world" => Text("hello"), Text("-"), Text("world")); t!(Markup[" /"]: "hello-world" => Text("hello"), Minus, Text("world"));
// Test code symbols in text. // Test code symbols in text.
t!(Markup[" /"]: "a():\"b" => Text("a():\"b")); t!(Markup[" /"]: "a():\"b" => Text("a():\"b"));
t!(Markup[" /"]: ";:,|/+" => Text(";:,|"), Text("/+")); t!(Markup[" /"]: ";:,|/+" => Text(";:,|"), Text("/+"));
t!(Markup[" /"]: "=-a" => Text("="), Text("-"), Text("a")); t!(Markup[" /"]: "=-a" => Eq, Minus, Text("a"));
t!(Markup[" "]: "#123" => Text("#"), Text("123")); t!(Markup[" "]: "#123" => Text("#"), Text("123"));
// Test text ends. // Test text ends.
@ -784,7 +797,7 @@ mod tests {
t!(Markup["a1/"]: "- " => Minus, Space(0)); t!(Markup["a1/"]: "- " => Minus, Space(0));
t!(Markup[" "]: "." => EnumNumbering(None)); t!(Markup[" "]: "." => EnumNumbering(None));
t!(Markup[" "]: "1." => EnumNumbering(Some(1))); t!(Markup[" "]: "1." => EnumNumbering(Some(1)));
t!(Markup[" "]: "1.a" => Text("1."), Text("a")); t!(Markup[" "]: "1.a" => EnumNumbering(Some(1)), Text("a"));
t!(Markup[" /"]: "a1." => Text("a1.")); t!(Markup[" /"]: "a1." => Text("a1."));
} }

View File

@ -10,7 +10,7 @@ use serde::{Deserialize, Serialize};
use crate::diag::TypResult; use crate::diag::TypResult;
use crate::loading::{FileHash, Loader}; use crate::loading::{FileHash, Loader};
use crate::parse::{is_newline, parse, Scanner}; use crate::parse::{is_newline, parse, Reparser, Scanner};
use crate::syntax::ast::Markup; use crate::syntax::ast::Markup;
use crate::syntax::{self, Category, GreenNode, RedNode}; use crate::syntax::{self, Category, GreenNode, RedNode};
use crate::util::PathExt; use crate::util::PathExt;
@ -154,9 +154,14 @@ impl SourceFile {
&self.root &self.root
} }
/// The root red node of the file's untyped red tree.
pub fn red(&self) -> RedNode {
RedNode::from_root(self.root.clone(), self.id)
}
/// The root node of the file's typed abstract syntax tree. /// The root node of the file's typed abstract syntax tree.
pub fn ast(&self) -> TypResult<Markup> { pub fn ast(&self) -> TypResult<Markup> {
let red = RedNode::from_root(self.root.clone(), self.id); let red = self.red();
let errors = red.errors(); let errors = red.errors();
if errors.is_empty() { if errors.is_empty() {
Ok(red.cast().unwrap()) Ok(red.cast().unwrap())
@ -265,10 +270,11 @@ impl SourceFile {
/// Edit the source file by replacing the given range. /// Edit the source file by replacing the given range.
/// ///
/// This panics if the `replace` range is out of bounds. /// Returns the range of the section in the new source that was ultimately
pub fn edit(&mut self, replace: Range<usize>, with: &str) { /// reparsed. The method panics if the `replace` range is out of bounds.
pub fn edit(&mut self, replace: Range<usize>, with: &str) -> Range<usize> {
let start = replace.start; let start = replace.start;
self.src.replace_range(replace, with); self.src.replace_range(replace.clone(), with);
// Remove invalidated line starts. // Remove invalidated line starts.
let line = self.byte_to_line(start).unwrap(); let line = self.byte_to_line(start).unwrap();
@ -283,8 +289,8 @@ impl SourceFile {
self.line_starts self.line_starts
.extend(newlines(&self.src[start ..]).map(|idx| start + idx)); .extend(newlines(&self.src[start ..]).map(|idx| start + idx));
// Reparse. // Incrementally reparse the replaced range.
self.root = parse(&self.src); Reparser::new(&self.src, replace, with.len()).reparse(&mut self.root)
} }
/// Provide highlighting categories for the given range of the source file. /// Provide highlighting categories for the given range of the source file.

View File

@ -53,7 +53,7 @@ macro_rules! node {
node! { node! {
/// The syntactical root capable of representing a full parsed document. /// The syntactical root capable of representing a full parsed document.
Markup Markup: NodeKind::Markup(_)
} }
impl Markup { impl Markup {
@ -65,7 +65,9 @@ impl Markup {
NodeKind::Parbreak => Some(MarkupNode::Parbreak), NodeKind::Parbreak => Some(MarkupNode::Parbreak),
NodeKind::Strong => Some(MarkupNode::Strong), NodeKind::Strong => Some(MarkupNode::Strong),
NodeKind::Emph => Some(MarkupNode::Emph), NodeKind::Emph => Some(MarkupNode::Emph),
NodeKind::Text(s) => Some(MarkupNode::Text(s.clone())), NodeKind::Text(s) | NodeKind::TextInLine(s) => {
Some(MarkupNode::Text(s.clone()))
}
NodeKind::Escape(c) => Some(MarkupNode::Text((*c).into())), NodeKind::Escape(c) => Some(MarkupNode::Text((*c).into())),
NodeKind::EnDash => Some(MarkupNode::Text('\u{2013}'.into())), NodeKind::EnDash => Some(MarkupNode::Text('\u{2013}'.into())),
NodeKind::EmDash => Some(MarkupNode::Text('\u{2014}'.into())), NodeKind::EmDash => Some(MarkupNode::Text('\u{2014}'.into())),

View File

@ -154,10 +154,11 @@ impl Category {
NodeKind::Str(_) => Some(Category::String), NodeKind::Str(_) => Some(Category::String),
NodeKind::Error(_, _) => Some(Category::Invalid), NodeKind::Error(_, _) => Some(Category::Invalid),
NodeKind::Unknown(_) => Some(Category::Invalid), NodeKind::Unknown(_) => Some(Category::Invalid),
NodeKind::Markup => None, NodeKind::Markup(_) => None,
NodeKind::Space(_) => None, NodeKind::Space(_) => None,
NodeKind::Parbreak => None, NodeKind::Parbreak => None,
NodeKind::Text(_) => None, NodeKind::Text(_) => None,
NodeKind::TextInLine(_) => None,
NodeKind::List => None, NodeKind::List => None,
NodeKind::Enum => None, NodeKind::Enum => None,
NodeKind::Array => None, NodeKind::Array => None,

View File

@ -6,6 +6,7 @@ mod pretty;
mod span; mod span;
use std::fmt::{self, Debug, Display, Formatter}; use std::fmt::{self, Debug, Display, Formatter};
use std::ops::Range;
use std::rc::Rc; use std::rc::Rc;
pub use highlight::*; pub use highlight::*;
@ -15,6 +16,7 @@ pub use span::*;
use self::ast::{MathNode, RawNode, TypedNode}; use self::ast::{MathNode, RawNode, TypedNode};
use crate::diag::Error; use crate::diag::Error;
use crate::geom::{AngularUnit, LengthUnit}; use crate::geom::{AngularUnit, LengthUnit};
use crate::parse::TokenMode;
use crate::source::SourceId; use crate::source::SourceId;
use crate::util::EcoString; use crate::util::EcoString;
@ -62,6 +64,14 @@ impl Green {
} }
} }
/// Whether the node is a leaf node in the green tree.
///
/// Tokens are always leafs; inner nodes count as leafs when they have no
/// children.
pub fn is_leaf(&self) -> bool {
    match self {
        Green::Token(_) => true,
        Green::Node(node) => node.children().is_empty(),
    }
}
/// Change the type of the node. /// Change the type of the node.
pub fn convert(&mut self, kind: NodeKind) { pub fn convert(&mut self, kind: NodeKind) {
match self { match self {
@ -127,6 +137,52 @@ impl GreenNode {
pub fn children(&self) -> &[Green] { pub fn children(&self) -> &[Green] {
&self.children &self.children
} }
/// The node's metadata (kind and length).
fn data(&self) -> &GreenData {
    &self.data
}

/// The node's type.
pub fn kind(&self) -> &NodeKind {
    self.data().kind()
}

/// The node's length in the source.
pub fn len(&self) -> usize {
    self.data().len()
}

/// The node's children, mutably. Crate-internal, used when patching the
/// tree in place.
pub(crate) fn children_mut(&mut self) -> &mut [Green] {
    &mut self.children
}
/// Replaces a range of children with some replacement.
///
/// Keeps the node's length and error flag consistent with the new set of
/// children.
pub(crate) fn replace_children(
    &mut self,
    range: Range<usize>,
    replacement: Vec<Green>,
) {
    let sum_len = |children: &[Green]| -> usize { children.iter().map(Green::len).sum() };
    let old_len = sum_len(&self.children[range.clone()]);
    let new_len = sum_len(&replacement);

    // If the node was erroneous due to a child outside of the superseded
    // range, it stays erroneous regardless of the replacement. This must be
    // computed before splicing, while the superseded children still exist.
    let still_erroneous =
        self.erroneous && !self.children[range.clone()].iter().any(Green::erroneous);

    self.children.splice(range, replacement);
    self.data.len = self.data.len + new_len - old_len;
    self.erroneous = still_erroneous || self.children.iter().any(Green::erroneous);
}
/// Update the length of this node given the old and new length of
/// replaced children, and recompute the error flag from the current
/// children.
pub(crate) fn update_parent(&mut self, new_len: usize, old_len: usize) {
    // Grow first, then shrink, mirroring `len + new - old`.
    self.data.len += new_len;
    self.data.len -= old_len;
    self.erroneous = self.children.iter().any(Green::erroneous);
}
} }
impl From<GreenNode> for Green { impl From<GreenNode> for Green {
@ -266,7 +322,7 @@ impl Debug for RedNode {
} }
} }
/// A borrowed wrapper for a green node with span information. /// A borrowed wrapper for a [`GreenNode`] with span information.
/// ///
/// Borrowed variant of [`RedNode`]. Can be [cast](Self::cast) to an AST node. /// Borrowed variant of [`RedNode`]. Can be [cast](Self::cast) to an AST node.
#[derive(Copy, Clone, PartialEq)] #[derive(Copy, Clone, PartialEq)]
@ -301,6 +357,11 @@ impl<'a> RedRef<'a> {
Span::new(self.id, self.offset, self.offset + self.green.len()) Span::new(self.id, self.offset, self.offset + self.green.len())
} }
/// Whether the node is a leaf node.
///
/// Delegates to the underlying green node's leaf check.
pub fn is_leaf(self) -> bool {
    self.green.is_leaf()
}
/// The error messages for this node and its descendants. /// The error messages for this node and its descendants.
pub fn errors(self) -> Vec<Error> { pub fn errors(self) -> Vec<Error> {
if !self.green.erroneous() { if !self.green.erroneous() {
@ -325,6 +386,15 @@ impl<'a> RedRef<'a> {
} }
} }
/// Returns all leaf descendants of this node (may include itself),
/// collected by a depth-first traversal.
pub fn leafs(self) -> Vec<Self> {
    if self.is_leaf() {
        return vec![self];
    }
    self.children().flat_map(Self::leafs).collect()
}
/// Convert the node to a typed AST node. /// Convert the node to a typed AST node.
pub fn cast<T>(self) -> Option<T> pub fn cast<T>(self) -> Option<T>
where where
@ -502,8 +572,8 @@ pub enum NodeKind {
Include, Include,
/// The `from` keyword. /// The `from` keyword.
From, From,
/// Template markup. /// Template markup of which all lines must start in some column.
Markup, Markup(usize),
/// One or more whitespace characters. /// One or more whitespace characters.
Space(usize), Space(usize),
/// A forced line break: `\`. /// A forced line break: `\`.
@ -512,6 +582,8 @@ pub enum NodeKind {
Parbreak, Parbreak,
/// A consecutive non-markup string. /// A consecutive non-markup string.
Text(EcoString), Text(EcoString),
/// A text node that cannot appear at the beginning of a source line.
TextInLine(EcoString),
/// A non-breaking space: `~`. /// A non-breaking space: `~`.
NonBreakingSpace, NonBreakingSpace,
/// An en-dash: `--`. /// An en-dash: `--`.
@ -648,11 +720,71 @@ impl NodeKind {
matches!(self, Self::LeftParen | Self::RightParen) matches!(self, Self::LeftParen | Self::RightParen)
} }
/// Whether this is whitespace (a space run or a paragraph break).
pub fn is_whitespace(&self) -> bool {
    matches!(self, Self::Parbreak | Self::Space(_))
}

/// Whether this is trivia: a comment or whitespace.
pub fn is_trivia(&self) -> bool {
    matches!(self, Self::LineComment | Self::BlockComment) || self.is_whitespace()
}
/// Whether this is some kind of error. /// Whether this is some kind of error.
pub fn is_error(&self) -> bool { pub fn is_error(&self) -> bool {
matches!(self, NodeKind::Error(_, _) | NodeKind::Unknown(_)) matches!(self, NodeKind::Error(_, _) | NodeKind::Unknown(_))
} }
/// Whether this node is `at_start` given the previous value of the
/// property: tokens containing a newline set it, comments preserve the
/// previous value, and everything else clears it.
pub fn is_at_start(&self, prev: bool) -> bool {
    match self {
        Self::Parbreak => true,
        Self::Space(newlines) => *newlines > 0,
        Self::LineComment | Self::BlockComment => prev,
        _ => false,
    }
}
/// The token mode in which this kind of node can appear.
///
/// Returns `Some(TokenMode::Markup)` for markup-only nodes,
/// `Some(TokenMode::Code)` for code-only nodes (the catch-all arm), and
/// `None` for nodes that are not tied to a single mode (e.g. trivia,
/// errors, and kinds valid in both modes).
pub fn mode(&self) -> Option<TokenMode> {
    match self {
        Self::Markup(_)
        | Self::Linebreak
        | Self::Parbreak
        | Self::Text(_)
        | Self::TextInLine(_)
        | Self::NonBreakingSpace
        | Self::EnDash
        | Self::EmDash
        | Self::Escape(_)
        | Self::Strong
        | Self::Emph
        | Self::Heading
        | Self::Enum
        | Self::EnumNumbering(_)
        | Self::List
        | Self::Raw(_)
        | Self::Math(_) => Some(TokenMode::Markup),
        Self::Template
        | Self::Space(_)
        | Self::Block
        | Self::Ident(_)
        | Self::LetExpr
        | Self::IfExpr
        | Self::WhileExpr
        | Self::ForExpr
        | Self::ImportExpr
        | Self::Call
        | Self::IncludeExpr
        | Self::LineComment
        | Self::BlockComment
        | Self::Error(_, _)
        | Self::Minus
        | Self::Eq => None,
        // Everything else is code-only.
        _ => Some(TokenMode::Code),
    }
}
/// A human-readable name for the kind. /// A human-readable name for the kind.
pub fn as_str(&self) -> &'static str { pub fn as_str(&self) -> &'static str {
match self { match self {
@ -701,11 +833,11 @@ impl NodeKind {
Self::Import => "keyword `import`", Self::Import => "keyword `import`",
Self::Include => "keyword `include`", Self::Include => "keyword `include`",
Self::From => "keyword `from`", Self::From => "keyword `from`",
Self::Markup => "markup", Self::Markup(_) => "markup",
Self::Space(_) => "space", Self::Space(_) => "space",
Self::Linebreak => "forced linebreak", Self::Linebreak => "forced linebreak",
Self::Parbreak => "paragraph break", Self::Parbreak => "paragraph break",
Self::Text(_) => "text", Self::Text(_) | Self::TextInLine(_) => "text",
Self::NonBreakingSpace => "non-breaking space", Self::NonBreakingSpace => "non-breaking space",
Self::EnDash => "en dash", Self::EnDash => "en dash",
Self::EmDash => "em dash", Self::EmDash => "em dash",

View File

@ -129,7 +129,7 @@
} }
--- ---
// Error: 2:1 expected closing brace // Error: 2 expected closing brace
{ {
--- ---

View File

@ -57,7 +57,7 @@ Three
// Terminated by semicolon even though we are in a paren group. // Terminated by semicolon even though we are in a paren group.
// Error: 18 expected expression // Error: 18 expected expression
// Error: 19 expected closing paren // Error: 18 expected closing paren
#let v5 = (1, 2 + ; Five #let v5 = (1, 2 + ; Five
--- ---

View File

@ -1,6 +1,7 @@
use std::env; use std::env;
use std::ffi::OsStr; use std::ffi::OsStr;
use std::fs; use std::fs;
use std::ops::Range;
use std::path::Path; use std::path::Path;
use std::rc::Rc; use std::rc::Rc;
@ -186,6 +187,7 @@ fn test(
let mut line = 0; let mut line = 0;
let mut compare_ref = true; let mut compare_ref = true;
let mut compare_ever = false; let mut compare_ever = false;
let mut rng = LinearShift::new();
let parts: Vec<_> = src.split("\n---").collect(); let parts: Vec<_> = src.split("\n---").collect();
for (i, &part) in parts.iter().enumerate() { for (i, &part) in parts.iter().enumerate() {
@ -202,8 +204,16 @@ fn test(
} }
} }
} else { } else {
let (part_ok, compare_here, part_frames) = let (part_ok, compare_here, part_frames) = test_part(
test_part(ctx, src_path, part.into(), i, compare_ref, line, debug); ctx,
src_path,
part.into(),
i,
compare_ref,
line,
debug,
&mut rng,
);
ok &= part_ok; ok &= part_ok;
compare_ever |= compare_here; compare_ever |= compare_here;
frames.extend(part_frames); frames.extend(part_frames);
@ -252,14 +262,15 @@ fn test_part(
compare_ref: bool, compare_ref: bool,
line: usize, line: usize,
debug: bool, debug: bool,
rng: &mut LinearShift,
) -> (bool, bool, Vec<Rc<Frame>>) { ) -> (bool, bool, Vec<Rc<Frame>>) {
let id = ctx.sources.provide(src_path, src); let id = ctx.sources.provide(src_path, src);
let source = ctx.sources.get(id); let source = ctx.sources.get(id);
let (local_compare_ref, mut ref_errors) = parse_metadata(&source); let (local_compare_ref, mut ref_errors) = parse_metadata(&source);
let compare_ref = local_compare_ref.unwrap_or(compare_ref); let compare_ref = local_compare_ref.unwrap_or(compare_ref);
let mut ok = test_reparse(ctx.sources.get(id).src(), i, rng);
let mut ok = true;
let (frames, mut errors) = match ctx.evaluate(id) { let (frames, mut errors) = match ctx.evaluate(id) {
Ok(module) => { Ok(module) => {
let tree = module.into_root(); let tree = module.into_root();
@ -366,6 +377,104 @@ fn test_incremental(
ok ok
} }
/// Pseudorandomly edit the source file and test whether a reparse produces
/// the same result as a clean parse.
///
/// For roughly every 400 source characters, one randomly chosen supplement
/// string replaces a random range of the source. Afterwards, one leaf node
/// boundary is picked and another randomly chosen supplement is inserted
/// there.
fn test_reparse(src: &str, i: usize, rng: &mut LinearShift) -> bool {
    // Strings injected into the source. They deliberately include
    // unterminated constructs (`//`, `/*`, "```typst", `\`) to stress the
    // incremental reparser.
    let supplements = [
        "[",
        ")",
        "#rect()",
        "a word",
        ", a: 1",
        "10.0",
        ":",
        "if i == 0 {true}",
        "for",
        "* hello *",
        "//",
        "/*",
        "\\u{12e4}",
        "```typst",
        " ",
        "trees",
        "\\",
        "$ a $",
        "2.",
        "-",
        "5",
    ];

    let mut ok = true;

    // Apply one edit incrementally, then compare against a from-scratch
    // parse of the edited source. Returns whether the trees matched.
    let apply = |replace: std::ops::Range<usize>, with| {
        let mut incr_source = SourceFile::detached(src);
        // Sanity check: the green tree must cover the source exactly.
        if incr_source.root().len() != src.len() {
            println!(
                " Subtest {} tree length {} does not match string length {} ❌",
                i,
                incr_source.root().len(),
                src.len(),
            );
            return false;
        }

        incr_source.edit(replace.clone(), with);

        let edited_src = incr_source.src();
        let ref_source = SourceFile::detached(edited_src);
        let incr_root = incr_source.root();
        let ref_root = ref_source.root();

        if incr_root != ref_root {
            println!(
                " Subtest {} reparse differs from clean parse when inserting '{}' at {}-{} ❌",
                i, with, replace.start, replace.end,
            );
            println!(
                "\n Expected reference tree:\n{:#?}\n\n Found incremental tree:\n{:#?}",
                ref_root, incr_root
            );
            println!("Full source ({}):\n\"{:?}\"", edited_src.len(), edited_src);
            false
        } else {
            true
        }
    };

    // Pick a pseudo-random index inside the given range.
    let mut pick = |range: Range<usize>| {
        let ratio = rng.next();
        (range.start as f64 + ratio * (range.end - range.start) as f64).floor() as usize
    };

    // One random replacement per ~400 characters of source.
    let insertions = (src.len() as f64 / 400.0).ceil() as usize;
    for _ in 0 .. insertions {
        let supplement = supplements[pick(0 .. supplements.len())];
        let start = pick(0 .. src.len());
        let end = pick(start .. src.len());

        // Skip edits that would split a multi-byte character.
        if !src.is_char_boundary(start) || !src.is_char_boundary(end) {
            continue;
        }

        ok &= apply(start .. end, supplement);
    }

    // Additionally insert one supplement exactly at a leaf node boundary.
    let red = SourceFile::detached(src).red();
    let leafs = red.as_ref().leafs();
    let leaf_start = leafs[pick(0 .. leafs.len())].span().start;
    let supplement = supplements[pick(0 .. supplements.len())];
    ok &= apply(leaf_start .. leaf_start, supplement);

    ok
}
fn parse_metadata(source: &SourceFile) -> (Option<bool>, Vec<Error>) { fn parse_metadata(source: &SourceFile) -> (Option<bool>, Vec<Error>) {
let mut compare_ref = None; let mut compare_ref = None;
let mut errors = vec![]; let mut errors = vec![];
@ -823,3 +932,24 @@ where
FileDescriptor::redirect_stdio(&stdout, Stdout).unwrap(); FileDescriptor::redirect_stdio(&stdout, Stdout).unwrap();
result result
} }
/// A linear-feedback shift register using XOR as its shifting function,
/// usable as a simple deterministic PRNG.
struct LinearShift(u64);

impl LinearShift {
    /// Initialize the shift register with a pre-set seed.
    pub fn new() -> Self {
        Self(0xACE5)
    }

    /// Return a pseudo-random number between `0.0` and `1.0`.
    pub fn next(&mut self) -> f64 {
        // Apply the fixed shift/XOR schedule; `true` means shift left.
        for &(left, amount) in &[(false, 3u32), (true, 14), (false, 28), (true, 36), (false, 52)] {
            let shifted = if left { self.0 << amount } else { self.0 >> amount };
            self.0 ^= shifted;
        }
        self.0 as f64 / u64::MAX as f64
    }
}