Unicode escape error moved to tokenizer

Martin Haug 2021-10-31 18:52:48 +01:00
parent c569e14c07
commit 2e7d359e59
7 changed files with 82 additions and 99 deletions

View File

@ -44,17 +44,13 @@ fn bench_scan(iai: &mut Iai) {
}
fn bench_tokenize(iai: &mut Iai) {
iai.run(|| {
Tokens::new(
black_box(&SourceFile::detached(SRC)),
black_box(TokenMode::Markup),
)
.count()
});
let src = SourceFile::detached(SRC);
iai.run(|| Tokens::new(black_box(&src), black_box(TokenMode::Markup)).count());
}
fn bench_parse(iai: &mut Iai) {
iai.run(|| parse(&SourceFile::detached(SRC)));
let src = SourceFile::detached(SRC);
iai.run(|| parse(&src));
}
fn bench_eval(iai: &mut Iai) {

View File

@ -54,7 +54,10 @@ where
while !p.eof() && f(p) {
markup_node(p, &mut at_start);
if let Some(node) = p.last_child() {
at_start &= matches!(node.kind(), &NodeKind::Space(_) | &NodeKind::Parbreak | &NodeKind::LineComment | &NodeKind::BlockComment);
at_start &= matches!(node.kind(),
&NodeKind::Space(_) | &NodeKind::Parbreak |
&NodeKind::LineComment | &NodeKind::BlockComment
);
}
}
@ -88,22 +91,8 @@ fn markup_node(p: &mut Parser, at_start: &mut bool) {
| NodeKind::Emph
| NodeKind::Strong
| NodeKind::Linebreak
| NodeKind::Raw(_) => p.eat(),
NodeKind::UnicodeEscape(u) => {
if u.character.is_none() {
let src = p.peek_src();
p.convert(NodeKind::Error(
ErrorPosition::Full,
"invalid unicode escape sequence".into(),
));
p.start();
p.end(NodeKind::Text(src.into()));
return;
}
p.eat();
}
| NodeKind::Raw(_)
| NodeKind::UnicodeEscape(_) => p.eat(),
NodeKind::Eq if *at_start => heading(p),
NodeKind::ListBullet if *at_start => list_node(p),
@ -503,9 +492,8 @@ fn item(p: &mut Parser) -> NodeKind {
/// Convert a collection into an array, producing errors for anything other than
/// expressions.
fn array(p: &mut Parser, items: usize) {
p.start_with(items);
p.filter_children(
0,
p.child_count() - items,
|x| match x.kind() {
NodeKind::Named | NodeKind::ParameterSink => false,
_ => true,
@ -522,15 +510,14 @@ fn array(p: &mut Parser, items: usize) {
},
);
p.end(NodeKind::Array)
p.convert_with(items, NodeKind::Array);
}
/// Convert a collection into a dictionary, producing errors for anything other
/// than named pairs.
fn dict(p: &mut Parser, items: usize) {
p.start_with(items);
p.filter_children(
0,
p.child_count() - items,
|x| {
x.kind() == &NodeKind::Named
|| x.kind().is_parenthesis()
@ -547,7 +534,7 @@ fn dict(p: &mut Parser, items: usize) {
),
},
);
p.end(NodeKind::Dict);
p.convert_with(items, NodeKind::Dict);
}
/// Convert a collection into a list of parameters, producing errors for
@ -684,8 +671,7 @@ fn let_expr(p: &mut Parser) {
return;
}
p.start_with(p.child_count() - offset);
p.end(NodeKind::Closure)
p.convert_with(p.child_count() - offset, NodeKind::Closure);
}
}

View File

@ -186,9 +186,27 @@ impl<'s> Parser<'s> {
}
pub fn convert(&mut self, kind: NodeKind) {
self.start();
self.eat();
self.end(kind);
let len = self.tokens.index() - self.next_start;
self.children.push(
GreenNode::with_child(
kind,
len,
GreenData::new(self.next.clone().unwrap(), len),
)
.into(),
);
self.fast_forward();
self.success = true;
}
pub fn convert_with(&mut self, preserve: usize, kind: NodeKind) {
let preserved: Vec<_> =
self.children.drain(self.children.len() - preserve ..).collect();
let len = preserved.iter().map(|c| c.len()).sum();
self.children
.push(GreenNode::with_children(kind, len, preserved).into());
self.success = true;
}
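The new convert_with above replaces the start_with/end pairs removed from the parser: it drains the trailing `preserve` children and wraps them in a single node of the given kind. Below is a standalone sketch of that drain-and-wrap step on a simplified tree type; the `Node` enum and the free `convert_with` function here are illustrative stand-ins, not the crate's GreenNode API.

#[derive(Debug)]
enum Node {
    Leaf(&'static str),
    Inner(&'static str, Vec<Node>),
}

// Wrap the last `preserve` children into one inner node of the given kind,
// mirroring the drain-and-wrap step in `Parser::convert_with`.
fn convert_with(children: &mut Vec<Node>, preserve: usize, kind: &'static str) {
    let preserved: Vec<Node> = children.drain(children.len() - preserve ..).collect();
    children.push(Node::Inner(kind, preserved));
}

fn main() {
    // E.g. `array` calls `p.convert_with(items, NodeKind::Array)` once the
    // collection items sit at the end of the children list.
    let mut children =
        vec![Node::Leaf("("), Node::Leaf("1"), Node::Leaf(","), Node::Leaf("2")];
    convert_with(&mut children, 3, "Array");
    println!("{:?}", children);
}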
/// End the current node and undo its existence, inlining all accumulated

View File

@ -200,7 +200,7 @@ impl<'s> Tokens<'s> {
TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
});
NodeKind::Text(resolve_string(self.s.eaten_from(start)))
NodeKind::Text(self.s.eaten_from(start).into())
}
fn whitespace(&mut self) -> NodeKind {
@ -243,10 +243,16 @@ impl<'s> Tokens<'s> {
let sequence: EcoString = self.s.eat_while(|c| c.is_ascii_alphanumeric()).into();
if self.s.eat_if('}') {
NodeKind::UnicodeEscape(Rc::new(UnicodeEscapeToken {
character: resolve_hex(&sequence),
sequence,
}))
if let Some(character) = resolve_hex(&sequence) {
NodeKind::UnicodeEscape(UnicodeEscapeToken {
character,
})
} else {
NodeKind::Error(
ErrorPosition::Full,
"invalid unicode escape sequence".into(),
)
}
} else {
NodeKind::Error(
ErrorPosition::End,
@ -560,35 +566,21 @@ mod tests {
use Option::None;
use TokenMode::{Code, Markup};
fn UnicodeEscape(sequence: &str, terminated: bool) -> NodeKind {
if terminated {
NodeKind::UnicodeEscape(Rc::new(UnicodeEscapeToken {
character: resolve_hex(sequence),
sequence: sequence.into(),
}))
} else {
NodeKind::Error(ErrorPosition::End, "expected closing brace".into())
}
fn UnicodeEscape(character: char) -> NodeKind {
NodeKind::UnicodeEscape(UnicodeEscapeToken { character })
}
fn Raw(
text: &str,
lang: Option<&str>,
backticks_left: u8,
err_msg: Option<&str>,
block: bool,
) -> NodeKind {
match err_msg {
None => NodeKind::Raw(Rc::new(RawToken {
text: text.into(),
lang: lang.map(Into::into),
backticks: backticks_left,
block,
})),
Some(msg) => {
NodeKind::Error(ErrorPosition::End, format!("expected {}", msg).into())
}
}
fn Error(pos: ErrorPosition, message: &str) -> NodeKind {
NodeKind::Error(pos, message.into())
}
fn Raw(text: &str, lang: Option<&str>, backticks_left: u8, block: bool) -> NodeKind {
NodeKind::Raw(Rc::new(RawToken {
text: text.into(),
lang: lang.map(Into::into),
backticks: backticks_left,
block,
}))
}
fn Math(formula: &str, display: bool, err_msg: Option<&str>) -> NodeKind {
@ -795,16 +787,16 @@ mod tests {
t!(Markup[" /"]: r#"\""# => Text(r"\"), Text("\""));
// Test basic unicode escapes.
t!(Markup: r"\u{}" => UnicodeEscape("", true));
t!(Markup: r"\u{2603}" => UnicodeEscape("2603", true));
t!(Markup: r"\u{P}" => UnicodeEscape("P", true));
t!(Markup: r"\u{}" => Error(ErrorPosition::Full, "invalid unicode escape sequence"));
t!(Markup: r"\u{2603}" => UnicodeEscape('☃'));
t!(Markup: r"\u{P}" => Error(ErrorPosition::Full, "invalid unicode escape sequence"));
// Test unclosed unicode escapes.
t!(Markup[" /"]: r"\u{" => UnicodeEscape("", false));
t!(Markup[" /"]: r"\u{1" => UnicodeEscape("1", false));
t!(Markup[" /"]: r"\u{26A4" => UnicodeEscape("26A4", false));
t!(Markup[" /"]: r"\u{1Q3P" => UnicodeEscape("1Q3P", false));
t!(Markup: r"\u{1🏕}" => UnicodeEscape("1", false), Text("🏕"), RightBrace);
t!(Markup[" /"]: r"\u{" => Error(ErrorPosition::End, "expected closing brace"));
t!(Markup[" /"]: r"\u{1" => Error(ErrorPosition::End, "expected closing brace"));
t!(Markup[" /"]: r"\u{26A4" => Error(ErrorPosition::End, "expected closing brace"));
t!(Markup[" /"]: r"\u{1Q3P" => Error(ErrorPosition::End, "expected closing brace"));
t!(Markup: r"\u{1🏕}" => Error(ErrorPosition::End, "expected closing brace"), Text("🏕"), RightBrace);
}
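The tokenizer branch above chooses between a UnicodeEscape token and an error token based on resolve_hex, which is not part of this diff. A minimal sketch consistent with its call site and with the escape tests above; the real helper lives elsewhere in the crate and may differ in detail.

// Parse the hex digits between the braces and map them to a character.
// An empty or non-hex sequence, or a value outside the Unicode range,
// yields `None`, which the tokenizer turns into an error token.
fn resolve_hex(sequence: &str) -> Option<char> {
    u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
}

fn main() {
    assert_eq!(resolve_hex("2603"), Some('☃')); // \u{2603} => UnicodeEscape('☃')
    assert_eq!(resolve_hex("P"), None);         // \u{P}    => invalid escape error
    assert_eq!(resolve_hex(""), None);          // \u{}     => invalid escape error
}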
#[test]
@ -894,22 +886,22 @@ mod tests {
#[test]
fn test_tokenize_raw_blocks() {
// Test basic raw block.
t!(Markup: "``" => Raw("", None, 1, None, false));
t!(Markup: "`raw`" => Raw("raw", None, 1, None, false));
t!(Markup[""]: "`]" => Raw("]", None, 1, Some("1 backtick"), false));
t!(Markup: "``" => Raw("", None, 1, false));
t!(Markup: "`raw`" => Raw("raw", None, 1, false));
t!(Markup[""]: "`]" => Error(ErrorPosition::End, "expected 1 backtick"));
// Test special symbols in raw block.
t!(Markup: "`[brackets]`" => Raw("[brackets]", None, 1, None, false));
t!(Markup[""]: r"`\`` " => Raw(r"\", None, 1, None, false), Raw(" ", None, 1, Some("1 backtick"), false));
t!(Markup: "`[brackets]`" => Raw("[brackets]", None, 1, false));
t!(Markup[""]: r"`\`` " => Raw(r"\", None, 1, false), Error(ErrorPosition::End, "expected 1 backtick"));
// Test separated closing backticks.
t!(Markup: "```not `y`e`t```" => Raw("`y`e`t", Some("not"), 3, None, false));
t!(Markup: "```not `y`e`t```" => Raw("`y`e`t", Some("not"), 3, false));
// Test more backticks.
t!(Markup: "``nope``" => Raw("", None, 1, None, false), Text("nope"), Raw("", None, 1, None, false));
t!(Markup: "````🚀````" => Raw("", Some("🚀"), 4, None, false));
t!(Markup[""]: "`````👩‍🚀````noend" => Raw("````noend", Some("👩‍🚀"), 5, Some("5 backticks"), false));
t!(Markup[""]: "````raw``````" => Raw("", Some("raw"), 4, None, false), Raw("", None, 1, None, false));
t!(Markup: "``nope``" => Raw("", None, 1, false), Text("nope"), Raw("", None, 1, false));
t!(Markup: "````🚀````" => Raw("", Some("🚀"), 4, false));
t!(Markup[""]: "`````👩‍🚀````noend" => Error(ErrorPosition::End, "expected 5 backticks"));
t!(Markup[""]: "````raw``````" => Raw("", Some("raw"), 4, false), Raw("", None, 1, false));
}
#[test]

View File

@ -1,7 +1,6 @@
use super::{Expr, Ident, NodeKind, RedNode, RedRef, Span, TypedNode};
use crate::node;
use crate::util::EcoString;
use std::fmt::Write;
node! {
/// The syntactical root capable of representing a full parsed document.
@ -50,14 +49,7 @@ impl TypedNode for MarkupNode {
NodeKind::Strong => Some(MarkupNode::Strong),
NodeKind::Emph => Some(MarkupNode::Emph),
NodeKind::Text(s) => Some(MarkupNode::Text(s.clone())),
NodeKind::UnicodeEscape(u) => Some(MarkupNode::Text(match u.character {
Some(c) => c.into(),
None => {
let mut eco = EcoString::with_capacity(u.sequence.len() + 4);
write!(&mut eco, "\\u{{{}}}", u.sequence).unwrap();
eco
}
})),
NodeKind::UnicodeEscape(u) => Some(MarkupNode::Text(u.character.into())),
NodeKind::EnDash => Some(MarkupNode::Text(EcoString::from("\u{2013}"))),
NodeKind::EmDash => Some(MarkupNode::Text(EcoString::from("\u{2014}"))),
NodeKind::NonBreakingSpace => {

View File

@ -121,7 +121,7 @@ pub enum NodeKind {
Text(EcoString),
/// A slash and the letter "u" followed by a hexadecimal unicode entity
/// enclosed in curly braces: `\u{1F5FA}`.
UnicodeEscape(Rc<UnicodeEscapeToken>),
UnicodeEscape(UnicodeEscapeToken),
/// An arbitrary number of backticks followed by inner contents, terminated
/// with the same number of backticks: `` `...` ``.
Raw(Rc<RawToken>),

View File

@ -33,9 +33,8 @@ pub struct MathToken {
/// A unicode escape sequence token: `\u{1F5FA}`.
#[derive(Debug, Clone, PartialEq)]
#[repr(transparent)]
pub struct UnicodeEscapeToken {
/// The escape sequence between the braces.
pub sequence: EcoString,
/// The resulting unicode character.
pub character: Option<char>,
pub character: char,
}
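With the sequence field gone, the token is a single char behind #[repr(transparent)], which is presumably why NodeKind can now hold it inline instead of behind an Rc. A quick standalone check of that layout, using a local copy of the struct rather than the crate's own type:

#[derive(Debug, Clone, PartialEq)]
#[repr(transparent)]
struct UnicodeEscapeToken {
    character: char,
}

fn main() {
    // The transparent wrapper has exactly the layout of its single field.
    assert_eq!(
        std::mem::size_of::<UnicodeEscapeToken>(),
        std::mem::size_of::<char>()
    );
}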