diff --git a/benches/oneshot.rs b/benches/oneshot.rs
index 9a57825d1..a42a710d9 100644
--- a/benches/oneshot.rs
+++ b/benches/oneshot.rs
@@ -44,17 +44,13 @@ fn bench_scan(iai: &mut Iai) {
 }

 fn bench_tokenize(iai: &mut Iai) {
-    iai.run(|| {
-        Tokens::new(
-            black_box(&SourceFile::detached(SRC)),
-            black_box(TokenMode::Markup),
-        )
-        .count()
-    });
+    let src = SourceFile::detached(SRC);
+    iai.run(|| Tokens::new(black_box(&src), black_box(TokenMode::Markup)).count());
 }

 fn bench_parse(iai: &mut Iai) {
-    iai.run(|| parse(&SourceFile::detached(SRC)));
+    let src = SourceFile::detached(SRC);
+    iai.run(|| parse(&src));
 }

 fn bench_eval(iai: &mut Iai) {
diff --git a/src/parse/mod.rs b/src/parse/mod.rs
index ce992834c..8775e8a17 100644
--- a/src/parse/mod.rs
+++ b/src/parse/mod.rs
@@ -54,7 +54,10 @@ where
     while !p.eof() && f(p) {
         markup_node(p, &mut at_start);
         if let Some(node) = p.last_child() {
-            at_start &= matches!(node.kind(), &NodeKind::Space(_) | &NodeKind::Parbreak | &NodeKind::LineComment | &NodeKind::BlockComment);
+            at_start &= matches!(node.kind(),
+                &NodeKind::Space(_) | &NodeKind::Parbreak |
+                &NodeKind::LineComment | &NodeKind::BlockComment
+            );
         }
     }

@@ -88,22 +91,8 @@ fn markup_node(p: &mut Parser, at_start: &mut bool) {
         | NodeKind::Emph
         | NodeKind::Strong
         | NodeKind::Linebreak
-        | NodeKind::Raw(_) => p.eat(),
-
-        NodeKind::UnicodeEscape(u) => {
-            if u.character.is_none() {
-                let src = p.peek_src();
-                p.convert(NodeKind::Error(
-                    ErrorPosition::Full,
-                    "invalid unicode escape sequence".into(),
-                ));
-                p.start();
-                p.end(NodeKind::Text(src.into()));
-                return;
-            }
-
-            p.eat();
-        }
+        | NodeKind::Raw(_)
+        | NodeKind::UnicodeEscape(_) => p.eat(),

         NodeKind::Eq if *at_start => heading(p),
         NodeKind::ListBullet if *at_start => list_node(p),
@@ -503,9 +492,8 @@ fn item(p: &mut Parser) -> NodeKind {
 /// Convert a collection into an array, producing errors for anything other than
 /// expressions.
 fn array(p: &mut Parser, items: usize) {
-    p.start_with(items);
     p.filter_children(
-        0,
+        p.child_count() - items,
         |x| match x.kind() {
             NodeKind::Named | NodeKind::ParameterSink => false,
             _ => true,
@@ -522,15 +510,14 @@
         },
     );

-    p.end(NodeKind::Array)
+    p.convert_with(items, NodeKind::Array);
 }

 /// Convert a collection into a dictionary, producing errors for anything other
 /// than named pairs.
 fn dict(p: &mut Parser, items: usize) {
-    p.start_with(items);
     p.filter_children(
-        0,
+        p.child_count() - items,
         |x| {
             x.kind() == &NodeKind::Named
                 || x.kind().is_parenthesis()
@@ -547,7 +534,7 @@
             ),
         },
     );
-    p.end(NodeKind::Dict);
+    p.convert_with(items, NodeKind::Dict);
 }

 /// Convert a collection into a list of parameters, producing errors for
@@ -684,8 +671,7 @@ fn let_expr(p: &mut Parser) {
             return;
         }

-        p.start_with(p.child_count() - offset);
-        p.end(NodeKind::Closure)
+        p.convert_with(p.child_count() - offset, NodeKind::Closure);
     }
 }

diff --git a/src/parse/parser.rs b/src/parse/parser.rs
index e6fcc1aed..240de43d7 100644
--- a/src/parse/parser.rs
+++ b/src/parse/parser.rs
@@ -186,9 +186,27 @@ impl<'s> Parser<'s> {
     }

     pub fn convert(&mut self, kind: NodeKind) {
-        self.start();
-        self.eat();
-        self.end(kind);
+        let len = self.tokens.index() - self.next_start;
+
+        self.children.push(
+            GreenNode::with_child(
+                kind,
+                len,
+                GreenData::new(self.next.clone().unwrap(), len),
+            )
+            .into(),
+        );
+        self.fast_forward();
+        self.success = true;
+    }
+
+    pub fn convert_with(&mut self, preserve: usize, kind: NodeKind) {
+        let preserved: Vec<_> =
+            self.children.drain(self.children.len() - preserve ..).collect();
+        let len = preserved.iter().map(|c| c.len()).sum();
+        self.children
+            .push(GreenNode::with_children(kind, len, preserved).into());
+        self.success = true;
     }

     /// End the current node and undo its existence, inling all accumulated
diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs
index 7c500ce79..1d2e32ec5 100644
--- a/src/parse/tokens.rs
+++ b/src/parse/tokens.rs
@@ -200,7 +200,7 @@ impl<'s> Tokens<'s> {
             TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
         });

-        NodeKind::Text(resolve_string(self.s.eaten_from(start)))
+        NodeKind::Text(self.s.eaten_from(start).into())
     }

     fn whitespace(&mut self) -> NodeKind {
@@ -243,10 +243,16 @@ impl<'s> Tokens<'s> {
         let sequence: EcoString = self.s.eat_while(|c| c.is_ascii_alphanumeric()).into();

         if self.s.eat_if('}') {
-            NodeKind::UnicodeEscape(Rc::new(UnicodeEscapeToken {
-                character: resolve_hex(&sequence),
-                sequence,
-            }))
+            if let Some(character) = resolve_hex(&sequence) {
+                NodeKind::UnicodeEscape(UnicodeEscapeToken {
+                    character,
+                })
+            } else {
+                NodeKind::Error(
+                    ErrorPosition::Full,
+                    "invalid unicode escape sequence".into(),
+                )
+            }
         } else {
             NodeKind::Error(
                 ErrorPosition::End,
@@ -560,35 +566,21 @@ mod tests {
     use Option::None;
     use TokenMode::{Code, Markup};

-    fn UnicodeEscape(sequence: &str, terminated: bool) -> NodeKind {
-        if terminated {
-            NodeKind::UnicodeEscape(Rc::new(UnicodeEscapeToken {
-                character: resolve_hex(sequence),
-                sequence: sequence.into(),
-            }))
-        } else {
-            NodeKind::Error(ErrorPosition::End, "expected closing brace".into())
-        }
+    fn UnicodeEscape(character: char) -> NodeKind {
+        NodeKind::UnicodeEscape(UnicodeEscapeToken { character })
     }

-    fn Raw(
-        text: &str,
-        lang: Option<&str>,
-        backticks_left: u8,
-        err_msg: Option<&str>,
-        block: bool,
-    ) -> NodeKind {
-        match err_msg {
-            None => NodeKind::Raw(Rc::new(RawToken {
-                text: text.into(),
-                lang: lang.map(Into::into),
-                backticks: backticks_left,
-                block,
-            })),
-            Some(msg) => {
-                NodeKind::Error(ErrorPosition::End, format!("expected {}", msg).into())
-            }
-        }
+    fn Error(pos: ErrorPosition, message: &str) -> NodeKind {
+        NodeKind::Error(pos, message.into())
+    }
+
+    fn Raw(text: &str, lang: Option<&str>, backticks_left: u8, block: bool) -> NodeKind {
+        NodeKind::Raw(Rc::new(RawToken {
+            text: text.into(),
+            lang: lang.map(Into::into),
+            backticks: backticks_left,
+            block,
+        }))
     }

     fn Math(formula: &str, display: bool, err_msg: Option<&str>) -> NodeKind {
@@ -795,16 +787,16 @@
         t!(Markup[" /"]: r#"\""# => Text(r"\"), Text("\""));

         // Test basic unicode escapes.
-        t!(Markup: r"\u{}" => UnicodeEscape("", true));
-        t!(Markup: r"\u{2603}" => UnicodeEscape("2603", true));
-        t!(Markup: r"\u{P}" => UnicodeEscape("P", true));
+        t!(Markup: r"\u{}" => Error(ErrorPosition::Full, "invalid unicode escape sequence"));
+        t!(Markup: r"\u{2603}" => UnicodeEscape('☃'));
+        t!(Markup: r"\u{P}" => Error(ErrorPosition::Full, "invalid unicode escape sequence"));

         // Test unclosed unicode escapes.
-        t!(Markup[" /"]: r"\u{" => UnicodeEscape("", false));
-        t!(Markup[" /"]: r"\u{1" => UnicodeEscape("1", false));
-        t!(Markup[" /"]: r"\u{26A4" => UnicodeEscape("26A4", false));
-        t!(Markup[" /"]: r"\u{1Q3P" => UnicodeEscape("1Q3P", false));
-        t!(Markup: r"\u{1🏕}" => UnicodeEscape("1", false), Text("🏕"), RightBrace);
+        t!(Markup[" /"]: r"\u{" => Error(ErrorPosition::End, "expected closing brace"));
+        t!(Markup[" /"]: r"\u{1" => Error(ErrorPosition::End, "expected closing brace"));
+        t!(Markup[" /"]: r"\u{26A4" => Error(ErrorPosition::End, "expected closing brace"));
+        t!(Markup[" /"]: r"\u{1Q3P" => Error(ErrorPosition::End, "expected closing brace"));
+        t!(Markup: r"\u{1🏕}" => Error(ErrorPosition::End, "expected closing brace"), Text("🏕"), RightBrace);
     }

     #[test]
@@ -894,22 +886,22 @@
     #[test]
     fn test_tokenize_raw_blocks() {
         // Test basic raw block.
-        t!(Markup: "``" => Raw("", None, 1, None, false));
-        t!(Markup: "`raw`" => Raw("raw", None, 1, None, false));
-        t!(Markup[""]: "`]" => Raw("]", None, 1, Some("1 backtick"), false));
+        t!(Markup: "``" => Raw("", None, 1, false));
+        t!(Markup: "`raw`" => Raw("raw", None, 1, false));
+        t!(Markup[""]: "`]" => Error(ErrorPosition::End, "expected 1 backtick"));

         // Test special symbols in raw block.
-        t!(Markup: "`[brackets]`" => Raw("[brackets]", None, 1, None, false));
-        t!(Markup[""]: r"`\`` " => Raw(r"\", None, 1, None, false), Raw(" ", None, 1, Some("1 backtick"), false));
+        t!(Markup: "`[brackets]`" => Raw("[brackets]", None, 1, false));
+        t!(Markup[""]: r"`\`` " => Raw(r"\", None, 1, false), Error(ErrorPosition::End, "expected 1 backtick"));

         // Test separated closing backticks.
-        t!(Markup: "```not `y`e`t```" => Raw("`y`e`t", Some("not"), 3, None, false));
+        t!(Markup: "```not `y`e`t```" => Raw("`y`e`t", Some("not"), 3, false));

         // Test more backticks.
- t!(Markup: "``nope``" => Raw("", None, 1, None, false), Text("nope"), Raw("", None, 1, None, false)); - t!(Markup: "````🚀````" => Raw("", Some("🚀"), 4, None, false)); - t!(Markup[""]: "`````👩‍🚀````noend" => Raw("````noend", Some("👩‍🚀"), 5, Some("5 backticks"), false)); - t!(Markup[""]: "````raw``````" => Raw("", Some("raw"), 4, None, false), Raw("", None, 1, None, false)); + t!(Markup: "``nope``" => Raw("", None, 1, false), Text("nope"), Raw("", None, 1, false)); + t!(Markup: "````🚀````" => Raw("", Some("🚀"), 4, false)); + t!(Markup[""]: "`````👩‍🚀````noend" => Error(ErrorPosition::End, "expected 5 backticks")); + t!(Markup[""]: "````raw``````" => Raw("", Some("raw"), 4, false), Raw("", None, 1, false)); } #[test] diff --git a/src/syntax/markup.rs b/src/syntax/markup.rs index 49b2a519e..f43a618a0 100644 --- a/src/syntax/markup.rs +++ b/src/syntax/markup.rs @@ -1,7 +1,6 @@ use super::{Expr, Ident, NodeKind, RedNode, RedRef, Span, TypedNode}; use crate::node; use crate::util::EcoString; -use std::fmt::Write; node! { /// The syntactical root capable of representing a full parsed document. @@ -50,14 +49,7 @@ impl TypedNode for MarkupNode { NodeKind::Strong => Some(MarkupNode::Strong), NodeKind::Emph => Some(MarkupNode::Emph), NodeKind::Text(s) => Some(MarkupNode::Text(s.clone())), - NodeKind::UnicodeEscape(u) => Some(MarkupNode::Text(match u.character { - Some(c) => c.into(), - None => { - let mut eco = EcoString::with_capacity(u.sequence.len() + 4); - write!(&mut eco, "\\u{{{}}}", u.sequence).unwrap(); - eco - } - })), + NodeKind::UnicodeEscape(u) => Some(MarkupNode::Text(u.character.into())), NodeKind::EnDash => Some(MarkupNode::Text(EcoString::from("\u{2013}"))), NodeKind::EmDash => Some(MarkupNode::Text(EcoString::from("\u{2014}"))), NodeKind::NonBreakingSpace => { diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs index afa0ab86a..9d4beb6cb 100644 --- a/src/syntax/mod.rs +++ b/src/syntax/mod.rs @@ -121,7 +121,7 @@ pub enum NodeKind { Text(EcoString), /// A slash and the letter "u" followed by a hexadecimal unicode entity /// enclosed in curly braces: `\u{1F5FA}`. - UnicodeEscape(Rc), + UnicodeEscape(UnicodeEscapeToken), /// An arbitrary number of backticks followed by inner contents, terminated /// with the same number of backticks: `` `...` ``. Raw(Rc), diff --git a/src/syntax/token.rs b/src/syntax/token.rs index 5a6214958..4f43bb4f5 100644 --- a/src/syntax/token.rs +++ b/src/syntax/token.rs @@ -33,9 +33,8 @@ pub struct MathToken { /// A unicode escape sequence token: `\u{1F5FA}`. #[derive(Debug, Clone, PartialEq)] +#[repr(transparent)] pub struct UnicodeEscapeToken { - /// The escape sequence between the braces. - pub sequence: EcoString, /// The resulting unicode character. - pub character: Option, + pub character: char, }
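
Reviewer note, not part of the patch: `convert_with(n, kind)` replaces the `start_with(n)` / `end(kind)` pairs at the call sites above by wrapping the last `n` children on the parser's stack into one new node. The sketch below models that logic on a simplified node type so the behavior can be checked in isolation; `Node`, `wrap_last`, and the string kinds are illustrative stand-ins, not the crate's actual `GreenNode` API.

    // Simplified model of Parser::convert_with: drain the last `preserve`
    // children and push one parent node that spans their combined length.
    #[derive(Debug)]
    struct Node {
        kind: &'static str,
        len: usize,
        children: Vec<Node>,
    }

    fn wrap_last(children: &mut Vec<Node>, preserve: usize, kind: &'static str) {
        let preserved: Vec<Node> = children.drain(children.len() - preserve ..).collect();
        let len = preserved.iter().map(|c| c.len).sum();
        children.push(Node { kind, len, children: preserved });
    }

    fn main() {
        // Three collected child nodes sitting on top of the stack.
        let mut stack = vec![
            Node { kind: "Int", len: 1, children: vec![] },
            Node { kind: "Int", len: 1, children: vec![] },
            Node { kind: "Int", len: 1, children: vec![] },
        ];
        // Corresponds to `p.convert_with(items, NodeKind::Array)` in the patch.
        wrap_last(&mut stack, 3, "Array");
        assert_eq!(stack.len(), 1);
        assert_eq!(stack[0].len, 3);
        assert_eq!(stack[0].children.len(), 3);
    }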