Unicode escape error moved to tokenizer

This commit is contained in:
Martin Haug 2021-10-31 18:52:48 +01:00
parent c569e14c07
commit 2e7d359e59
7 changed files with 82 additions and 99 deletions

View File

@ -44,17 +44,13 @@ fn bench_scan(iai: &mut Iai) {
} }
fn bench_tokenize(iai: &mut Iai) { fn bench_tokenize(iai: &mut Iai) {
iai.run(|| { let src = SourceFile::detached(SRC);
Tokens::new( iai.run(|| Tokens::new(black_box(&src), black_box(TokenMode::Markup)).count());
black_box(&SourceFile::detached(SRC)),
black_box(TokenMode::Markup),
)
.count()
});
} }
fn bench_parse(iai: &mut Iai) { fn bench_parse(iai: &mut Iai) {
iai.run(|| parse(&SourceFile::detached(SRC))); let src = SourceFile::detached(SRC);
iai.run(|| parse(&src));
} }
fn bench_eval(iai: &mut Iai) { fn bench_eval(iai: &mut Iai) {

View File

@ -54,7 +54,10 @@ where
while !p.eof() && f(p) { while !p.eof() && f(p) {
markup_node(p, &mut at_start); markup_node(p, &mut at_start);
if let Some(node) = p.last_child() { if let Some(node) = p.last_child() {
at_start &= matches!(node.kind(), &NodeKind::Space(_) | &NodeKind::Parbreak | &NodeKind::LineComment | &NodeKind::BlockComment); at_start &= matches!(node.kind(),
&NodeKind::Space(_) | &NodeKind::Parbreak |
&NodeKind::LineComment | &NodeKind::BlockComment
);
} }
} }
@ -88,22 +91,8 @@ fn markup_node(p: &mut Parser, at_start: &mut bool) {
| NodeKind::Emph | NodeKind::Emph
| NodeKind::Strong | NodeKind::Strong
| NodeKind::Linebreak | NodeKind::Linebreak
| NodeKind::Raw(_) => p.eat(), | NodeKind::Raw(_)
| NodeKind::UnicodeEscape(_) => p.eat(),
NodeKind::UnicodeEscape(u) => {
if u.character.is_none() {
let src = p.peek_src();
p.convert(NodeKind::Error(
ErrorPosition::Full,
"invalid unicode escape sequence".into(),
));
p.start();
p.end(NodeKind::Text(src.into()));
return;
}
p.eat();
}
NodeKind::Eq if *at_start => heading(p), NodeKind::Eq if *at_start => heading(p),
NodeKind::ListBullet if *at_start => list_node(p), NodeKind::ListBullet if *at_start => list_node(p),
@ -503,9 +492,8 @@ fn item(p: &mut Parser) -> NodeKind {
/// Convert a collection into an array, producing errors for anything other than /// Convert a collection into an array, producing errors for anything other than
/// expressions. /// expressions.
fn array(p: &mut Parser, items: usize) { fn array(p: &mut Parser, items: usize) {
p.start_with(items);
p.filter_children( p.filter_children(
0, p.child_count() - items,
|x| match x.kind() { |x| match x.kind() {
NodeKind::Named | NodeKind::ParameterSink => false, NodeKind::Named | NodeKind::ParameterSink => false,
_ => true, _ => true,
@ -522,15 +510,14 @@ fn array(p: &mut Parser, items: usize) {
}, },
); );
p.end(NodeKind::Array) p.convert_with(items, NodeKind::Array);
} }
/// Convert a collection into a dictionary, producing errors for anything other /// Convert a collection into a dictionary, producing errors for anything other
/// than named pairs. /// than named pairs.
fn dict(p: &mut Parser, items: usize) { fn dict(p: &mut Parser, items: usize) {
p.start_with(items);
p.filter_children( p.filter_children(
0, p.child_count() - items,
|x| { |x| {
x.kind() == &NodeKind::Named x.kind() == &NodeKind::Named
|| x.kind().is_parenthesis() || x.kind().is_parenthesis()
@ -547,7 +534,7 @@ fn dict(p: &mut Parser, items: usize) {
), ),
}, },
); );
p.end(NodeKind::Dict); p.convert_with(items, NodeKind::Dict);
} }
/// Convert a collection into a list of parameters, producing errors for /// Convert a collection into a list of parameters, producing errors for
@ -684,8 +671,7 @@ fn let_expr(p: &mut Parser) {
return; return;
} }
p.start_with(p.child_count() - offset); p.convert_with(p.child_count() - offset, NodeKind::Closure);
p.end(NodeKind::Closure)
} }
} }

View File

@ -186,9 +186,27 @@ impl<'s> Parser<'s> {
} }
pub fn convert(&mut self, kind: NodeKind) { pub fn convert(&mut self, kind: NodeKind) {
self.start(); let len = self.tokens.index() - self.next_start;
self.eat();
self.end(kind); self.children.push(
GreenNode::with_child(
kind,
len,
GreenData::new(self.next.clone().unwrap(), len),
)
.into(),
);
self.fast_forward();
self.success = true;
}
pub fn convert_with(&mut self, preserve: usize, kind: NodeKind) {
let preserved: Vec<_> =
self.children.drain(self.children.len() - preserve ..).collect();
let len = preserved.iter().map(|c| c.len()).sum();
self.children
.push(GreenNode::with_children(kind, len, preserved).into());
self.success = true;
} }
/// End the current node and undo its existence, inlining all accumulated /// End the current node and undo its existence, inlining all accumulated

View File

@ -200,7 +200,7 @@ impl<'s> Tokens<'s> {
TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace()) TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
}); });
NodeKind::Text(resolve_string(self.s.eaten_from(start))) NodeKind::Text(self.s.eaten_from(start).into())
} }
fn whitespace(&mut self) -> NodeKind { fn whitespace(&mut self) -> NodeKind {
@ -243,10 +243,16 @@ impl<'s> Tokens<'s> {
let sequence: EcoString = self.s.eat_while(|c| c.is_ascii_alphanumeric()).into(); let sequence: EcoString = self.s.eat_while(|c| c.is_ascii_alphanumeric()).into();
if self.s.eat_if('}') { if self.s.eat_if('}') {
NodeKind::UnicodeEscape(Rc::new(UnicodeEscapeToken { if let Some(character) = resolve_hex(&sequence) {
character: resolve_hex(&sequence), NodeKind::UnicodeEscape(UnicodeEscapeToken {
sequence, character,
})) })
} else {
NodeKind::Error(
ErrorPosition::Full,
"invalid unicode escape sequence".into(),
)
}
} else { } else {
NodeKind::Error( NodeKind::Error(
ErrorPosition::End, ErrorPosition::End,
@ -560,35 +566,21 @@ mod tests {
use Option::None; use Option::None;
use TokenMode::{Code, Markup}; use TokenMode::{Code, Markup};
fn UnicodeEscape(sequence: &str, terminated: bool) -> NodeKind { fn UnicodeEscape(character: char) -> NodeKind {
if terminated { NodeKind::UnicodeEscape(UnicodeEscapeToken { character })
NodeKind::UnicodeEscape(Rc::new(UnicodeEscapeToken {
character: resolve_hex(sequence),
sequence: sequence.into(),
}))
} else {
NodeKind::Error(ErrorPosition::End, "expected closing brace".into())
}
} }
fn Raw( fn Error(pos: ErrorPosition, message: &str) -> NodeKind {
text: &str, NodeKind::Error(pos, message.into())
lang: Option<&str>, }
backticks_left: u8,
err_msg: Option<&str>, fn Raw(text: &str, lang: Option<&str>, backticks_left: u8, block: bool) -> NodeKind {
block: bool, NodeKind::Raw(Rc::new(RawToken {
) -> NodeKind {
match err_msg {
None => NodeKind::Raw(Rc::new(RawToken {
text: text.into(), text: text.into(),
lang: lang.map(Into::into), lang: lang.map(Into::into),
backticks: backticks_left, backticks: backticks_left,
block, block,
})), }))
Some(msg) => {
NodeKind::Error(ErrorPosition::End, format!("expected {}", msg).into())
}
}
} }
fn Math(formula: &str, display: bool, err_msg: Option<&str>) -> NodeKind { fn Math(formula: &str, display: bool, err_msg: Option<&str>) -> NodeKind {
@ -795,16 +787,16 @@ mod tests {
t!(Markup[" /"]: r#"\""# => Text(r"\"), Text("\"")); t!(Markup[" /"]: r#"\""# => Text(r"\"), Text("\""));
// Test basic unicode escapes. // Test basic unicode escapes.
t!(Markup: r"\u{}" => UnicodeEscape("", true)); t!(Markup: r"\u{}" => Error(ErrorPosition::Full, "invalid unicode escape sequence"));
t!(Markup: r"\u{2603}" => UnicodeEscape("2603", true)); t!(Markup: r"\u{2603}" => UnicodeEscape('☃'));
t!(Markup: r"\u{P}" => UnicodeEscape("P", true)); t!(Markup: r"\u{P}" => Error(ErrorPosition::Full, "invalid unicode escape sequence"));
// Test unclosed unicode escapes. // Test unclosed unicode escapes.
t!(Markup[" /"]: r"\u{" => UnicodeEscape("", false)); t!(Markup[" /"]: r"\u{" => Error(ErrorPosition::End, "expected closing brace"));
t!(Markup[" /"]: r"\u{1" => UnicodeEscape("1", false)); t!(Markup[" /"]: r"\u{1" => Error(ErrorPosition::End, "expected closing brace"));
t!(Markup[" /"]: r"\u{26A4" => UnicodeEscape("26A4", false)); t!(Markup[" /"]: r"\u{26A4" => Error(ErrorPosition::End, "expected closing brace"));
t!(Markup[" /"]: r"\u{1Q3P" => UnicodeEscape("1Q3P", false)); t!(Markup[" /"]: r"\u{1Q3P" => Error(ErrorPosition::End, "expected closing brace"));
t!(Markup: r"\u{1🏕}" => UnicodeEscape("1", false), Text("🏕"), RightBrace); t!(Markup: r"\u{1🏕}" => Error(ErrorPosition::End, "expected closing brace"), Text("🏕"), RightBrace);
} }
#[test] #[test]
@ -894,22 +886,22 @@ mod tests {
#[test] #[test]
fn test_tokenize_raw_blocks() { fn test_tokenize_raw_blocks() {
// Test basic raw block. // Test basic raw block.
t!(Markup: "``" => Raw("", None, 1, None, false)); t!(Markup: "``" => Raw("", None, 1, false));
t!(Markup: "`raw`" => Raw("raw", None, 1, None, false)); t!(Markup: "`raw`" => Raw("raw", None, 1, false));
t!(Markup[""]: "`]" => Raw("]", None, 1, Some("1 backtick"), false)); t!(Markup[""]: "`]" => Error(ErrorPosition::End, "expected 1 backtick"));
// Test special symbols in raw block. // Test special symbols in raw block.
t!(Markup: "`[brackets]`" => Raw("[brackets]", None, 1, None, false)); t!(Markup: "`[brackets]`" => Raw("[brackets]", None, 1, false));
t!(Markup[""]: r"`\`` " => Raw(r"\", None, 1, None, false), Raw(" ", None, 1, Some("1 backtick"), false)); t!(Markup[""]: r"`\`` " => Raw(r"\", None, 1, false), Error(ErrorPosition::End, "expected 1 backtick"));
// Test separated closing backticks. // Test separated closing backticks.
t!(Markup: "```not `y`e`t```" => Raw("`y`e`t", Some("not"), 3, None, false)); t!(Markup: "```not `y`e`t```" => Raw("`y`e`t", Some("not"), 3, false));
// Test more backticks. // Test more backticks.
t!(Markup: "``nope``" => Raw("", None, 1, None, false), Text("nope"), Raw("", None, 1, None, false)); t!(Markup: "``nope``" => Raw("", None, 1, false), Text("nope"), Raw("", None, 1, false));
t!(Markup: "````🚀````" => Raw("", Some("🚀"), 4, None, false)); t!(Markup: "````🚀````" => Raw("", Some("🚀"), 4, false));
t!(Markup[""]: "`````👩‍🚀````noend" => Raw("````noend", Some("👩‍🚀"), 5, Some("5 backticks"), false)); t!(Markup[""]: "`````👩‍🚀````noend" => Error(ErrorPosition::End, "expected 5 backticks"));
t!(Markup[""]: "````raw``````" => Raw("", Some("raw"), 4, None, false), Raw("", None, 1, None, false)); t!(Markup[""]: "````raw``````" => Raw("", Some("raw"), 4, false), Raw("", None, 1, false));
} }
#[test] #[test]

View File

@ -1,7 +1,6 @@
use super::{Expr, Ident, NodeKind, RedNode, RedRef, Span, TypedNode}; use super::{Expr, Ident, NodeKind, RedNode, RedRef, Span, TypedNode};
use crate::node; use crate::node;
use crate::util::EcoString; use crate::util::EcoString;
use std::fmt::Write;
node! { node! {
/// The syntactical root capable of representing a full parsed document. /// The syntactical root capable of representing a full parsed document.
@ -50,14 +49,7 @@ impl TypedNode for MarkupNode {
NodeKind::Strong => Some(MarkupNode::Strong), NodeKind::Strong => Some(MarkupNode::Strong),
NodeKind::Emph => Some(MarkupNode::Emph), NodeKind::Emph => Some(MarkupNode::Emph),
NodeKind::Text(s) => Some(MarkupNode::Text(s.clone())), NodeKind::Text(s) => Some(MarkupNode::Text(s.clone())),
NodeKind::UnicodeEscape(u) => Some(MarkupNode::Text(match u.character { NodeKind::UnicodeEscape(u) => Some(MarkupNode::Text(u.character.into())),
Some(c) => c.into(),
None => {
let mut eco = EcoString::with_capacity(u.sequence.len() + 4);
write!(&mut eco, "\\u{{{}}}", u.sequence).unwrap();
eco
}
})),
NodeKind::EnDash => Some(MarkupNode::Text(EcoString::from("\u{2013}"))), NodeKind::EnDash => Some(MarkupNode::Text(EcoString::from("\u{2013}"))),
NodeKind::EmDash => Some(MarkupNode::Text(EcoString::from("\u{2014}"))), NodeKind::EmDash => Some(MarkupNode::Text(EcoString::from("\u{2014}"))),
NodeKind::NonBreakingSpace => { NodeKind::NonBreakingSpace => {

View File

@ -121,7 +121,7 @@ pub enum NodeKind {
Text(EcoString), Text(EcoString),
/// A backslash and the letter "u" followed by a hexadecimal unicode entity /// A backslash and the letter "u" followed by a hexadecimal unicode entity
/// enclosed in curly braces: `\u{1F5FA}`. /// enclosed in curly braces: `\u{1F5FA}`.
UnicodeEscape(Rc<UnicodeEscapeToken>), UnicodeEscape(UnicodeEscapeToken),
/// An arbitrary number of backticks followed by inner contents, terminated /// An arbitrary number of backticks followed by inner contents, terminated
/// with the same number of backticks: `` `...` ``. /// with the same number of backticks: `` `...` ``.
Raw(Rc<RawToken>), Raw(Rc<RawToken>),

View File

@ -33,9 +33,8 @@ pub struct MathToken {
/// A unicode escape sequence token: `\u{1F5FA}`. /// A unicode escape sequence token: `\u{1F5FA}`.
#[derive(Debug, Clone, PartialEq)] #[derive(Debug, Clone, PartialEq)]
#[repr(transparent)]
pub struct UnicodeEscapeToken { pub struct UnicodeEscapeToken {
/// The escape sequence between the braces.
pub sequence: EcoString,
/// The resulting unicode character. /// The resulting unicode character.
pub character: Option<char>, pub character: char,
} }