Remove the concept of words from tokenization 🎈
parent 90848df5de
commit 9d605c3128
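In short: the tokenizer no longer walks Unicode word boundaries (unicode-segmentation) but plain char_indices, slicing maximal text runs straight out of the source. A before/after sketch of the observable change, with the expected values taken from the updated tests below (it assumes the `Tokens` iterator and `Token` enum from this diff are in scope and that `Token` supports `Debug`/`PartialEq`, as the tests imply; exact module paths may differ):

    // Sketch only, not part of the commit.
    use crate::parsing::Tokens; // struct Tokens from src/parsing.rs below
    use crate::syntax::Token;   // enum Token, with Word renamed to Text below

    fn demo() {
        let tokens: Vec<Token> = Tokens::new("Hello 🌍!").collect();
        // before: [Word("Hello"), Space, Word("🌍"), Word("!")]
        // after:  [Text("Hello"), Space, Text("🌍!")]
        assert_eq!(tokens, vec![
            Token::Text("Hello"),
            Token::Space,
            Token::Text("🌍!"),
        ]);
    }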
@@ -7,7 +7,6 @@ edition = "2018"
 [dependencies]
 pdf = { path = "../pdf" }
 opentype = { path = "../opentype" }
-unicode-segmentation = "1.2"
 unicode-xid = "0.1.0"
 byteorder = "1"
 smallvec = "0.6.9"
@@ -62,7 +62,7 @@ impl<'a> Engine<'a> {
         // Iterate through the documents nodes.
         for node in &self.tree.nodes {
             match node {
-                Node::Word(word) => self.write_word(word)?,
+                Node::Text(text) => self.write_word(text)?,
                 Node::Space => self.write_space()?,
                 Node::Newline => {
                     self.write_buffered_text();
@@ -53,7 +53,6 @@ use crate::syntax::SyntaxTree;
 
 #[macro_use]
 mod error;
-mod utility;
 pub mod doc;
 pub mod engine;
 pub mod export;
src/parsing.rs (294 changed lines)

@@ -1,30 +1,29 @@
 //! Tokenization and parsing of source code into syntax trees.
 
 use std::collections::HashMap;
-use std::fmt;
 use std::iter::Peekable;
 use std::mem::swap;
 use std::ops::Deref;
+use std::str::CharIndices;
 
-use unicode_segmentation::{UnicodeSegmentation, UWordBounds};
+use unicode_xid::UnicodeXID;
 
 use crate::syntax::*;
 use crate::func::{ParseContext, Scope};
-use crate::utility::{Splinor, Spline, Splined, StrExt};
 
 
 /// An iterator over the tokens of source code.
-#[derive(Clone)]
+#[derive(Debug, Clone)]
 pub struct Tokens<'s> {
     source: &'s str,
-    words: Peekable<UWordBounds<'s>>,
-    state: TokensState<'s>,
-    stack: Vec<TokensState<'s>>,
+    chars: Peekable<CharIndices<'s>>,
+    state: TokensState,
+    stack: Vec<TokensState>,
 }
 
 /// The state the tokenizer is in.
-#[derive(Debug, Clone)]
-enum TokensState<'s> {
+#[derive(Debug, Clone, PartialEq)]
+enum TokensState {
     /// The base state if there is nothing special we are in.
     Body,
     /// Inside a function header. Here colons and equal signs get parsed
@@ -32,9 +31,6 @@ enum TokensState<'s> {
     Function,
     /// We expect either the end of the function or the beginning of the body.
     MaybeBody,
-    /// We are inside one unicode word that consists of multiple tokens,
-    /// because it contains double underscores.
-    DoubleUnderscore(Spline<'s, Token<'s>>),
 }
 
 impl<'s> Tokens<'s> {
@@ -43,7 +39,7 @@ impl<'s> Tokens<'s> {
     pub fn new(source: &'s str) -> Tokens<'s> {
         Tokens {
             source,
-            words: source.split_word_bounds().peekable(),
+            chars: source.char_indices().peekable(),
             state: TokensState::Body,
             stack: vec![],
         }
@@ -51,11 +47,11 @@ impl<'s> Tokens<'s> {
 
     /// Advance the iterator by one step.
     fn advance(&mut self) {
-        self.words.next();
+        self.chars.next();
     }
 
     /// Switch to the given state.
-    fn switch(&mut self, mut state: TokensState<'s>) {
+    fn switch(&mut self, mut state: TokensState) {
         swap(&mut state, &mut self.state);
         self.stack.push(state);
     }
@@ -70,6 +66,11 @@ impl<'s> Tokens<'s> {
         self.advance();
         token
     }
+
+    /// Returns a word containing the string bounded by the given indices.
+    fn text(&self, start: usize, end: usize) -> Token<'s> {
+        Token::Text(&self.source[start .. end])
+    }
 }
 
 impl<'s> Iterator for Tokens<'s> {
@@ -79,27 +80,11 @@ impl<'s> Iterator for Tokens<'s> {
     fn next(&mut self) -> Option<Token<'s>> {
         use TokensState as TS;
 
-        // Return the remaining words and double underscores.
-        if let TS::DoubleUnderscore(splinor) = &mut self.state {
-            loop {
-                if let Some(splined) = splinor.next() {
-                    return Some(match splined {
-                        Splined::Value(word) if word != "" => Token::Word(word),
-                        Splined::Splinor(s) => s,
-                        _ => continue,
-                    });
-                } else {
-                    self.unswitch();
-                    break;
-                }
-            }
-        }
-
-        // Skip whitespace, but if at least one whitespace word existed,
-        // remember that, because we return a space token.
+        // Skip whitespace, but if at least one whitespace character existed,
+        // remember that, because then we return a space token.
         let mut whitespace = false;
-        while let Some(word) = self.words.peek() {
-            if !word.is_whitespace() {
+        while let Some(&(_, c)) = self.chars.peek() {
+            if !c.is_whitespace() || c == '\n' || c == '\r' {
                 break;
             }
             whitespace = true;
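A side effect of switching to characters: '\n' and '\r' are excluded from the whitespace skip above, so newlines survive to be reported as Newline tokens while runs of other whitespace collapse into a single Space. A standalone sketch of that loop (the function name is illustrative, not from the commit):

    use std::iter::Peekable;
    use std::str::CharIndices;

    /// Consume whitespace except '\n' and '\r'; report whether any was seen.
    fn skip_space(chars: &mut Peekable<CharIndices<'_>>) -> bool {
        let mut seen = false;
        while let Some(&(_, c)) = chars.peek() {
            if !c.is_whitespace() || c == '\n' || c == '\r' {
                break;
            }
            seen = true;
            chars.next();
        }
        seen
    }

    fn main() {
        let mut chars = " \trest\n".char_indices().peekable();
        assert!(skip_space(&mut chars));
        // The loop stopped at 'r'; the trailing '\n' is left for the
        // Newline arm of the tokenizer.
        assert_eq!(chars.peek().map(|&(_, c)| c), Some('r'));
    }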
@@ -111,100 +96,82 @@ impl<'s> Iterator for Tokens<'s> {
 
         // Function maybe has a body
         if self.state == TS::MaybeBody {
-            match *self.words.peek()? {
-                "[" => {
-                    self.state = TS::Body;
-                    return Some(self.consumed(Token::LeftBracket));
-                },
-                _ => self.unswitch(),
+            if self.chars.peek()?.1 == '[' {
+                self.state = TS::Body;
+                return Some(self.consumed(Token::LeftBracket));
+            } else {
+                self.unswitch();
             }
         }
 
         // Now all special cases are handled and we can finally look at the
         // next words.
-        let next = self.words.next()?;
-        let afterwards = self.words.peek();
+        let (next_pos, next) = self.chars.next()?;
+        let afterwards = self.chars.peek().map(|&(_, c)| c);
 
         Some(match next {
             // Special characters
-            "[" => {
+            '[' => {
                 self.switch(TS::Function);
                 Token::LeftBracket
             },
-            "]" => {
+            ']' => {
                 if self.state == TS::Function {
                     self.state = TS::MaybeBody;
                 }
                 Token::RightBracket
             },
-            "$" => Token::Dollar,
-            "#" => Token::Hashtag,
+            '$' => Token::Dollar,
+            '#' => Token::Hashtag,
 
             // Context sensitive operators
-            ":" if self.state == TS::Function => Token::Colon,
-            "=" if self.state == TS::Function => Token::Equals,
+            ':' if self.state == TS::Function => Token::Colon,
+            '=' if self.state == TS::Function => Token::Equals,
 
             // Double star/underscore
-            "*" if afterwards == Some(&"*") => self.consumed(Token::DoubleStar),
-            "__" => Token::DoubleUnderscore,
+            '*' if afterwards == Some('*') => self.consumed(Token::DoubleStar),
+            '_' if afterwards == Some('_') => self.consumed(Token::DoubleUnderscore),
 
             // Newlines
-            "\n" | "\r\n" => Token::Newline,
+            '\n' => Token::Newline,
+            '\r' if afterwards == Some('\n') => self.consumed(Token::Newline),
 
             // Escaping
-            r"\" => {
-                if let Some(next) = afterwards {
-                    let escapable = match *next {
-                        "[" | "]" | "$" | "#" | r"\" | ":" | "=" | "*" | "_" => true,
-                        w if w.starts_with("__") => true,
-                        _ => false,
-                    };
-
-                    if escapable {
-                        let next = *next;
+            '\\' => {
+                if let Some(&(index, c)) = self.chars.peek() {
+                    if is_special_character(c) {
                         self.advance();
-                        return Some(Token::Word(next));
+                        return Some(self.text(index, index + c.len_utf8()));
                     }
                 }
 
-                Token::Word(r"\")
-            },
-
-            // Double underscores hidden in words.
-            word if word.contains("__") => {
-                let spline = word.spline("__", Token::DoubleUnderscore);
-                self.switch(TS::DoubleUnderscore(spline));
-                return self.next();
+                Token::Text("\\")
             },
 
             // Now it seems like it's just a normal word.
-            word => Token::Word(word),
+            _ => {
+                // Find out when the word ends.
+                let mut end = (next_pos, next);
+                while let Some(&(index, c)) = self.chars.peek() {
+                    if is_special_character(c) || c.is_whitespace() {
+                        break;
+                    }
+                    end = (index, c);
+                    self.advance();
+                }
+
+                let end_pos = end.0 + end.1.len_utf8();
+                self.text(next_pos, end_pos)
+            },
         })
     }
 }
 
-impl fmt::Debug for Tokens<'_> {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        f.debug_struct("Tokens")
-            .field("source", &self.source)
-            .field("words", &"Peekable<UWordBounds>")
-            .field("state", &self.state)
-            .field("stack", &self.stack)
-            .finish()
-    }
-}
-
-impl PartialEq for TokensState<'_> {
-    fn eq(&self, other: &TokensState) -> bool {
-        use TokensState as TS;
-
-        match (self, other) {
-            (TS::Body, TS::Body) => true,
-            (TS::Function, TS::Function) => true,
-            (TS::MaybeBody, TS::MaybeBody) => true,
-            // They are not necessarily different, but we don't care
-            _ => false,
-        }
-    }
-}
+/// Whether this character has a special meaning in the language.
+fn is_special_character(character: char) -> bool {
+    match character {
+        '[' | ']' | '$' | '#' | '\\' | ':' | '=' | '*' | '_' => true,
+        _ => false,
+    }
+}
 
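The catch-all `_` arm above is the heart of the change: a text token now extends to the next special character or whitespace, so "Schrift." stays one token while "he__llo" splits at the underscores, exactly as the updated tests expect. A minimal standalone sketch of the same scan written as free functions over char_indices (names are illustrative, not from the commit):

    /// Same character set as is_special_character above.
    fn is_special(c: char) -> bool {
        match c {
            '[' | ']' | '$' | '#' | '\\' | ':' | '=' | '*' | '_' => true,
            _ => false,
        }
    }

    /// Byte length of the text run at the start of `source`.
    fn text_run_len(source: &str) -> usize {
        source
            .char_indices()
            .find(|&(_, c)| is_special(c) || c.is_whitespace())
            .map(|(index, _)| index)
            .unwrap_or(source.len())
    }

    fn main() {
        assert_eq!(text_run_len("Schrift."), "Schrift.".len()); // one token
        assert_eq!(text_run_len("he__llo"), 2);                 // stops at '_'
        assert_eq!(text_run_len("🌍! end"), "🌍!".len());       // char-boundary safe
    }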
@@ -285,8 +252,8 @@ impl<'s, 't> Parser<'s, 't> {
             Token::Space => self.append_space_consumed(),
             Token::Newline => self.switch_consumed(PS::FirstNewline),
 
-            // Words
-            Token::Word(word) => self.append_consumed(Node::Word(word.to_owned())),
+            // Text
+            Token::Text(word) => self.append_consumed(Node::Text(word.to_owned())),
 
             // Functions
             Token::LeftBracket => self.parse_function()?,
@@ -315,7 +282,7 @@ impl<'s, 't> Parser<'s, 't> {
 
         // The next token should be the name of the function.
         let name = match self.tokens.next() {
-            Some(Token::Word(word)) => {
+            Some(Token::Text(word)) => {
                 if word.is_identifier() {
                     Ok(word.to_owned())
                 } else {
@@ -537,6 +504,39 @@ impl<'s> Iterator for ParseTokens<'s> {
     }
 }
 
+/// More useful functions on `str`'s.
+trait StrExt {
+    /// Whether self consists only of whitespace.
+    fn is_whitespace(&self) -> bool;
+
+    /// Whether this word is a valid unicode identifier.
+    fn is_identifier(&self) -> bool;
+}
+
+impl StrExt for str {
+    fn is_whitespace(&self) -> bool {
+        self.chars().all(|c| c.is_whitespace() && c != '\n')
+    }
+
+    fn is_identifier(&self) -> bool {
+        let mut chars = self.chars();
+
+        match chars.next() {
+            Some(c) if !UnicodeXID::is_xid_start(c) => return false,
+            None => return false,
+            _ => (),
+        }
+
+        while let Some(c) = chars.next() {
+            if !UnicodeXID::is_xid_continue(c) {
+                return false;
+            }
+        }
+
+        true
+    }
+}
+
 /// The error type for parsing.
 pub struct ParseError(String);
 
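With unicode-segmentation gone, identifier validation is the one place that still needs Unicode tables, now via unicode-xid: a name must start with an XID_Start character and continue with XID_Continue characters. A free-function sketch of the `is_identifier` method above:

    use unicode_xid::UnicodeXID;

    /// Free-function version of StrExt::is_identifier above.
    fn is_identifier(s: &str) -> bool {
        let mut chars = s.chars();
        match chars.next() {
            Some(c) if UnicodeXID::is_xid_start(c) => {}
            _ => return false,
        }
        chars.all(|c| UnicodeXID::is_xid_continue(c))
    }

    fn main() {
        assert!(is_identifier("func"));
        assert!(is_identifier("größe"));  // XID classes cover non-ASCII letters
        assert!(!is_identifier("1page")); // digits may continue but not start
        assert!(!is_identifier(""));
    }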
@@ -560,7 +560,7 @@ mod token_tests {
     use super::*;
     use Token::{Space as S, Newline as N, LeftBracket as L, RightBracket as R,
                 Colon as C, Equals as E, DoubleUnderscore as DU, DoubleStar as DS,
-                Dollar as D, Hashtag as H, Word as W};
+                Dollar as D, Hashtag as H, Text as T};
 
     /// Test if the source code tokenizes to the tokens.
     fn test(src: &str, tokens: Vec<Token>) {
@@ -571,7 +571,7 @@ mod token_tests {
     #[test]
     fn tokenize_base() {
         test("", vec![]);
-        test("Hallo", vec![W("Hallo")]);
+        test("Hallo", vec![T("Hallo")]);
         test("[", vec![L]);
         test("]", vec![R]);
         test("$", vec![D]);
@@ -586,26 +586,26 @@ mod token_tests {
     fn tokenize_whitespace_newlines() {
         test(" \t", vec![S]);
         test("First line\r\nSecond line\nThird line\n",
-             vec![W("First"), S, W("line"), N, W("Second"), S, W("line"), N,
-                  W("Third"), S, W("line"), N]);
-        test("Hello \n ", vec![W("Hello"), S, N, S]);
-        test("Dense\nTimes", vec![W("Dense"), N, W("Times")]);
+             vec![T("First"), S, T("line"), N, T("Second"), S, T("line"), N,
+                  T("Third"), S, T("line"), N]);
+        test("Hello \n ", vec![T("Hello"), S, N, S]);
+        test("Dense\nTimes", vec![T("Dense"), N, T("Times")]);
     }
 
     /// Tests if escaping with backslash works as it should.
     #[test]
     fn tokenize_escape() {
-        test(r"\[", vec![W("[")]);
-        test(r"\]", vec![W("]")]);
-        test(r"\#", vec![W("#")]);
-        test(r"\$", vec![W("$")]);
-        test(r"\:", vec![W(":")]);
-        test(r"\=", vec![W("=")]);
-        test(r"\**", vec![W("*"), W("*")]);
-        test(r"\*", vec![W("*")]);
-        test(r"\__", vec![W("__")]);
-        test(r"\_", vec![W("_")]);
-        test(r"\hello", vec![W(r"\"), W("hello")]);
+        test(r"\[", vec![T("[")]);
+        test(r"\]", vec![T("]")]);
+        test(r"\#", vec![T("#")]);
+        test(r"\$", vec![T("$")]);
+        test(r"\:", vec![T(":")]);
+        test(r"\=", vec![T("=")]);
+        test(r"\**", vec![T("*"), T("*")]);
+        test(r"\*", vec![T("*")]);
+        test(r"\__", vec![T("_"), T("_")]);
+        test(r"\_", vec![T("_")]);
+        test(r"\hello", vec![T("\\"), T("hello")]);
     }
 
     /// Tokenizes some more realistic examples.
@@ -616,8 +616,8 @@ mod token_tests {
             Test [italic][example]!
         ]
         ", vec![
-            N, S, L, W("function"), R, L, N, S, W("Test"), S, L, W("italic"), R, L,
-            W("example"), R, W("!"), N, S, R, N, S
+            N, S, L, T("function"), R, L, N, S, T("Test"), S, L, T("italic"), R, L,
+            T("example"), R, T("!"), N, S, R, N, S
         ]);
 
         test(r"
@@ -626,10 +626,10 @@ mod token_tests {
 
         Das ist ein Beispielsatz mit **fetter** Schrift.
        ", vec![
-            N, S, L, W("page"), C, S, W("size"), E, W("A4"), R, N, S,
-            L, W("font"), C, S, W("size"), E, W("12pt"), R, N, N, S,
-            W("Das"), S, W("ist"), S, W("ein"), S, W("Beispielsatz"), S, W("mit"), S,
-            DS, W("fetter"), DS, S, W("Schrift"), W("."), N, S
+            N, S, L, T("page"), C, S, T("size"), E, T("A4"), R, N, S,
+            L, T("font"), C, S, T("size"), E, T("12pt"), R, N, N, S,
+            T("Das"), S, T("ist"), S, T("ein"), S, T("Beispielsatz"), S, T("mit"), S,
+            DS, T("fetter"), DS, S, T("Schrift."), N, S
         ]);
     }
 
@@ -638,13 +638,13 @@ mod token_tests {
     #[test]
     fn tokenize_symbols_context() {
         test("[func: key=value][Answer: 7]",
-             vec![L, W("func"), C, S, W("key"), E, W("value"), R, L,
-                  W("Answer"), W(":"), S, W("7"), R]);
+             vec![L, T("func"), C, S, T("key"), E, T("value"), R, L,
+                  T("Answer"), T(":"), S, T("7"), R]);
         test("[[n: k=v]:x][:[=]]:=",
-             vec![L, L, W("n"), C, S, W("k"), E, W("v"), R, C, W("x"), R,
-                  L, W(":"), L, E, R, R, W(":"), W("=")]);
+             vec![L, L, T("n"), C, S, T("k"), E, T("v"), R, C, T("x"), R,
+                  L, T(":"), L, E, R, R, T(":"), T("=")]);
         test("[func: __key__=value]",
-             vec![L, W("func"), C, S, DU, W("key"), DU, E, W("value"), R]);
+             vec![L, T("func"), C, S, DU, T("key"), DU, E, T("value"), R]);
     }
 
     /// This test has a special look at the double underscore syntax, because
@@ -653,16 +653,16 @@ mod token_tests {
     #[test]
     fn tokenize_double_underscore() {
         test("he__llo__world_ _ __ Now this_ is__ special!",
-             vec![W("he"), DU, W("llo"), DU, W("world_"), S, W("_"), S, DU, S, W("Now"), S,
-                  W("this_"), S, W("is"), DU, S, W("special"), W("!")]);
+             vec![T("he"), DU, T("llo"), DU, T("world"), T("_"), S, T("_"), S, DU, S, T("Now"), S,
+                  T("this"), T("_"), S, T("is"), DU, S, T("special!")]);
     }
 
     /// This test is for checking if non-ASCII characters get parsed correctly.
     #[test]
     fn tokenize_unicode() {
         test("[document][Hello 🌍!]",
-             vec![L, W("document"), R, L, W("Hello"), S, W("🌍"), W("!"), R]);
-        test("[f]⺐.", vec![L, W("f"), R, W("⺐"), W(".")]);
+             vec![L, T("document"), R, L, T("Hello"), S, T("🌍!"), R]);
+        test("[f]⺐.", vec![L, T("f"), R, T("⺐.")]);
     }
 }
 
@@ -674,7 +674,7 @@ mod parse_tests {
     use Node::{Space as S, Newline as N, Func as F};
 
     #[allow(non_snake_case)]
-    fn W(s: &str) -> Node { Node::Word(s.to_owned()) }
+    fn T(s: &str) -> Node { Node::Text(s.to_owned()) }
 
     /// A testing function which just parses it's body into a syntax tree.
     #[derive(Debug, PartialEq)]
@@ -764,19 +764,19 @@ mod parse_tests {
     #[test]
     fn parse_base() {
         test("", tree! []);
-        test("Hello World!", tree! [ W("Hello"), S, W("World"), W("!") ]);
+        test("Hello World!", tree! [ T("Hello"), S, T("World!") ]);
     }
 
     /// Test whether newlines generate the correct whitespace.
     #[test]
     fn parse_newlines_whitespace() {
-        test("Hello\nWorld", tree! [ W("Hello"), S, W("World") ]);
-        test("Hello \n World", tree! [ W("Hello"), S, W("World") ]);
-        test("Hello\n\nWorld", tree! [ W("Hello"), N, W("World") ]);
-        test("Hello \n\nWorld", tree! [ W("Hello"), S, N, W("World") ]);
-        test("Hello\n\n World", tree! [ W("Hello"), N, S, W("World") ]);
-        test("Hello \n \n \n World", tree! [ W("Hello"), S, N, S, W("World") ]);
-        test("Hello\n \n\n World", tree! [ W("Hello"), S, N, S, W("World") ]);
+        test("Hello\nWorld", tree! [ T("Hello"), S, T("World") ]);
+        test("Hello \n World", tree! [ T("Hello"), S, T("World") ]);
+        test("Hello\n\nWorld", tree! [ T("Hello"), N, T("World") ]);
+        test("Hello \n\nWorld", tree! [ T("Hello"), S, N, T("World") ]);
+        test("Hello\n\n World", tree! [ T("Hello"), N, S, T("World") ]);
+        test("Hello \n \n \n World", tree! [ T("Hello"), S, N, S, T("World") ]);
+        test("Hello\n \n\n World", tree! [ T("Hello"), S, N, S, T("World") ]);
     }
 
     /// Parse things dealing with functions.
@@ -790,18 +790,18 @@ mod parse_tests {
 
         test_scoped(&scope,"[test]", tree! [ F(func! { name => "test", body => None }) ]);
         test_scoped(&scope, "This is an [modifier][example] of a function invocation.", tree! [
-            W("This"), S, W("is"), S, W("an"), S,
-            F(func! { name => "modifier", body => tree! [ W("example") ] }), S,
-            W("of"), S, W("a"), S, W("function"), S, W("invocation"), W(".")
+            T("This"), S, T("is"), S, T("an"), S,
+            F(func! { name => "modifier", body => tree! [ T("example") ] }), S,
+            T("of"), S, T("a"), S, T("function"), S, T("invocation.")
         ]);
         test_scoped(&scope, "[func][Hello][modifier][Here][end]", tree! [
             F(func! {
                 name => "func",
-                body => tree! [ W("Hello") ],
+                body => tree! [ T("Hello") ],
             }),
             F(func! {
                 name => "modifier",
-                body => tree! [ W("Here") ],
+                body => tree! [ T("Here") ],
            }),
            F(func! {
                name => "end",
@@ -820,11 +820,11 @@ mod parse_tests {
                 body => tree! [
                     F(func! {
                         name => "func",
-                        body => tree! [ W("call") ],
+                        body => tree! [ T("call") ],
                     }),
                 ],
             }),
-            S, W("outside")
+            S, T("outside")
         ]);
     }
 
@@ -839,12 +839,12 @@ mod parse_tests {
                 name => "func",
                 body => None,
             }),
-            S, W("⺐"), W(".")
+            S, T("⺐.")
         ]);
         test_scoped(&scope, "[bold][Hello 🌍!]", tree! [
             F(func! {
                 name => "bold",
-                body => tree! [ W("Hello"), S, W("🌍"), W("!") ],
+                body => tree! [ T("Hello"), S, T("🌍!") ],
             })
         ]);
    }
@@ -30,8 +30,8 @@ pub enum Token<'s> {
     Dollar,
     /// A hashtag starting a _comment_.
     Hashtag,
-    /// Everything else just is a literal word.
-    Word(&'s str),
+    /// Everything else is just text.
+    Text(&'s str),
 }
 
 /// A tree representation of the source.
@@ -62,8 +62,8 @@ pub enum Node {
     ToggleBold,
     /// Indicates that math mode was enabled/disabled.
     ToggleMath,
-    /// A literal word.
-    Word(String),
+    /// Literal text.
+    Text(String),
     /// A function invocation.
     Func(FuncCall),
 }
src/utility.rs (135 changed lines, file deleted)

@@ -1,135 +0,0 @@
-//! Utility functionality.
-
-use std::iter::Peekable;
-use std::str::Split;
-use unicode_xid::UnicodeXID;
-
-
-/// Types that can be splined.
-pub trait Splinor {
-    /// Returns an iterator over the substrings splitted by the pattern,
-    /// intertwined with the splinor.
-    ///
-    /// # Example
-    ///
-    /// ```ignore
-    /// #[derive(Debug, Copy, Clone, PartialEq)]
-    /// struct Space;
-    ///
-    /// let v: Vec<Splined<Space>> = "My airplane flies!".spline(" ", Space).collect();
-    /// assert_eq!(v, [
-    ///     Splined::Value("My"),
-    ///     Splined::Splinor(Space),
-    ///     Splined::Value("airplane"),
-    ///     Splined::Splinor(Space),
-    ///     Splined::Value("flies!"),
-    /// ]);
-    /// ```
-    fn spline<'s, T: Clone>(&'s self, pat: &'s str, splinor: T) -> Spline<'s, T>;
-}
-
-impl Splinor for str {
-    fn spline<'s, T: Clone>(&'s self, pat: &'s str, splinor: T) -> Spline<'s, T> {
-        Spline {
-            splinor: Splined::Splinor(splinor),
-            split: self.split(pat).peekable(),
-            next_splinor: false,
-        }
-    }
-}
-
-/// Iterator over splitted values and splinors.
-///
-/// Created by the [`spline`](Splinor::spline) function.
-#[derive(Debug, Clone)]
-pub struct Spline<'s, T> {
-    splinor: Splined<'s, T>,
-    split: Peekable<Split<'s, &'s str>>,
-    next_splinor: bool,
-}
-
-/// Represents either a splitted substring or a splinor.
-#[derive(Debug, Copy, Clone, Eq, PartialEq)]
-pub enum Splined<'s, T> {
-    /// A substring.
-    Value(&'s str),
-    /// An intertwined splinor.
-    Splinor(T),
-}
-
-impl<'s, T: Clone> Iterator for Spline<'s, T> {
-    type Item = Splined<'s, T>;
-
-    fn next(&mut self) -> Option<Splined<'s, T>> {
-        if self.next_splinor && self.split.peek().is_some() {
-            self.next_splinor = false;
-            return Some(self.splinor.clone());
-        } else {
-            self.next_splinor = true;
-            return Some(Splined::Value(self.split.next()?))
-        }
-    }
-}
-
-/// More useful functions on `str`'s.
-pub trait StrExt {
-    /// Whether self consists only of whitespace.
-    fn is_whitespace(&self) -> bool;
-
-    /// Whether this word is a valid unicode identifier.
-    fn is_identifier(&self) -> bool;
-}
-
-impl StrExt for str {
-    fn is_whitespace(&self) -> bool {
-        self.chars().all(|c| c.is_whitespace() && c != '\n')
-    }
-
-    fn is_identifier(&self) -> bool {
-        let mut chars = self.chars();
-
-        match chars.next() {
-            Some(c) if !UnicodeXID::is_xid_start(c) => return false,
-            None => return false,
-            _ => (),
-        }
-
-        while let Some(c) = chars.next() {
-            if !UnicodeXID::is_xid_continue(c) {
-                return false;
-            }
-        }
-
-        true
-    }
-}
-
-
-#[cfg(test)]
-mod splinor_tests {
-    use super::*;
-    use Splined::{Value as V, Splinor as S};
-
-    #[derive(Debug, Copy, Clone, PartialEq)]
-    enum Token { DoubleUnderscore }
-
-    fn test<T>(string: &str, pat: &str, splinor: T, vec: Vec<Splined<T>>)
-    where T: std::fmt::Debug + Clone + PartialEq {
-        assert_eq!(string.spline(pat, splinor).collect::<Vec<_>>(), vec);
-    }
-
-    #[test]
-    fn splinor() {
-        let s = S(Token::DoubleUnderscore);
-        test("__he__llo__world__", "__", Token::DoubleUnderscore,
-             vec![V(""), s, V("he"), s, V("llo"), s, V("world"), s, V("")]);
-        test("__Italic__", "__", Token::DoubleUnderscore,
-             vec![V(""), s, V("Italic"), s, V("")]);
-        test("Key__Value", "__", Token::DoubleUnderscore,
-             vec![V("Key"), s, V("Value")]);
-        test("__Start__NoEnd", "__", Token::DoubleUnderscore,
-             vec![V(""), s, V("Start"), s, V("NoEnd")]);
-        test("NoStart__End__", "__", Token::DoubleUnderscore,
-             vec![V("NoStart"), s, V("End"), s, V("")]);
-    }
-}
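Nothing replaces the deleted Spline machinery: since '_' is now a special character, the tokenizer splits double underscores out of a word directly while scanning, as the tokenize_double_underscore test shows. A sketch of the resulting token stream, under the same assumptions as the sketch at the top:

    use crate::parsing::Tokens;
    use crate::syntax::Token;

    fn demo() {
        let tokens: Vec<Token> = Tokens::new("he__llo").collect();
        assert_eq!(tokens, vec![
            Token::Text("he"),
            Token::DoubleUnderscore,
            Token::Text("llo"),
        ]);
    }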