Remove the concept of words from tokenization 🎈

Laurenz 2019-04-30 09:15:31 +02:00
parent 90848df5de
commit 9d605c3128
6 changed files with 152 additions and 289 deletions


@@ -7,7 +7,6 @@ edition = "2018"
 [dependencies]
 pdf = { path = "../pdf" }
 opentype = { path = "../opentype" }
-unicode-segmentation = "1.2"
 unicode-xid = "0.1.0"
 byteorder = "1"
 smallvec = "0.6.9"
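For context: unicode-segmentation provided the word-bound iterator the old tokenizer was built on, which is why the dependency can be dropped. A minimal sketch contrasting the two iteration styles, not part of this commit (assumes the unicode-segmentation 1.x API):

    use unicode_segmentation::UnicodeSegmentation;

    fn main() {
        let source = "Hello __world__!";

        // Old approach: Unicode word boundaries. Underscores have the
        // ExtendNumLet word-break property, so "__world__" comes out as a
        // single segment -- the reason the old tokenizer needed extra
        // machinery to split double underscores out of words.
        let words: Vec<&str> = source.split_word_bounds().collect();
        assert_eq!(words, ["Hello", " ", "__world__", "!"]);

        // New approach: plain chars with their byte offsets; '_' can now be
        // recognized directly while scanning.
        let first: Vec<(usize, char)> = source.char_indices().take(2).collect();
        assert_eq!(first, [(0, 'H'), (1, 'e')]);
    }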


@@ -62,7 +62,7 @@ impl<'a> Engine<'a> {
         // Iterate through the document's nodes.
         for node in &self.tree.nodes {
             match node {
-                Node::Word(word) => self.write_word(word)?,
+                Node::Text(text) => self.write_word(text)?,
                 Node::Space => self.write_space()?,
                 Node::Newline => {
                     self.write_buffered_text();


@@ -53,7 +53,6 @@ use crate::syntax::SyntaxTree;
 #[macro_use]
 mod error;
-mod utility;
 pub mod doc;
 pub mod engine;
 pub mod export;


@@ -1,30 +1,29 @@
 //! Tokenization and parsing of source code into syntax trees.
 
 use std::collections::HashMap;
-use std::fmt;
 use std::iter::Peekable;
 use std::mem::swap;
 use std::ops::Deref;
+use std::str::CharIndices;
 
-use unicode_segmentation::{UnicodeSegmentation, UWordBounds};
+use unicode_xid::UnicodeXID;
 
 use crate::syntax::*;
 use crate::func::{ParseContext, Scope};
-use crate::utility::{Splinor, Spline, Splined, StrExt};
 
 /// An iterator over the tokens of source code.
-#[derive(Clone)]
+#[derive(Debug, Clone)]
 pub struct Tokens<'s> {
     source: &'s str,
-    words: Peekable<UWordBounds<'s>>,
-    state: TokensState<'s>,
-    stack: Vec<TokensState<'s>>,
+    chars: Peekable<CharIndices<'s>>,
+    state: TokensState,
+    stack: Vec<TokensState>,
 }
 
 /// The state the tokenizer is in.
-#[derive(Debug, Clone)]
-enum TokensState<'s> {
+#[derive(Debug, Clone, PartialEq)]
+enum TokensState {
     /// The base state if there is nothing special we are in.
     Body,
     /// Inside a function header. Here colons and equal signs get parsed

@@ -32,9 +31,6 @@ enum TokensState<'s> {
     Function,
     /// We expect either the end of the function or the beginning of the body.
     MaybeBody,
-    /// We are inside one unicode word that consists of multiple tokens,
-    /// because it contains double underscores.
-    DoubleUnderscore(Spline<'s, Token<'s>>),
 }
 
 impl<'s> Tokens<'s> {

@@ -43,7 +39,7 @@ impl<'s> Tokens<'s> {
     pub fn new(source: &'s str) -> Tokens<'s> {
         Tokens {
             source,
-            words: source.split_word_bounds().peekable(),
+            chars: source.char_indices().peekable(),
             state: TokensState::Body,
             stack: vec![],
         }

@@ -51,11 +47,11 @@ impl<'s> Tokens<'s> {
     /// Advance the iterator by one step.
     fn advance(&mut self) {
-        self.words.next();
+        self.chars.next();
     }
 
     /// Switch to the given state.
-    fn switch(&mut self, mut state: TokensState<'s>) {
+    fn switch(&mut self, mut state: TokensState) {
         swap(&mut state, &mut self.state);
         self.stack.push(state);
     }

@@ -70,6 +66,11 @@ impl<'s> Tokens<'s> {
         self.advance();
         token
     }
+
+    /// Returns a text token containing the string bounded by the given indices.
+    fn text(&self, start: usize, end: usize) -> Token<'s> {
+        Token::Text(&self.source[start .. end])
+    }
 }
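The indices handed out by char_indices are byte offsets into the UTF-8 source, which is why a slice must end at index + len_utf8 rather than index + 1. A standalone sketch of the pattern used by the new text helper (illustrative, not part of the commit):

    fn main() {
        let source = "Hä!";

        // Byte offsets, not character counts: 'ä' is two bytes in UTF-8,
        // so '!' sits at offset 3.
        let pairs: Vec<(usize, char)> = source.char_indices().collect();
        assert_eq!(pairs, [(0, 'H'), (1, 'ä'), (3, '!')]);

        // A token's exclusive end is its start plus the last character's
        // UTF-8 length -- the same computation as in `text` above.
        let (index, c) = pairs[1];
        assert_eq!(&source[index .. index + c.len_utf8()], "ä");
    }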
@@ -79,27 +80,11 @@ impl<'s> Iterator for Tokens<'s> {
     fn next(&mut self) -> Option<Token<'s>> {
         use TokensState as TS;
 
-        // Return the remaining words and double underscores.
-        if let TS::DoubleUnderscore(splinor) = &mut self.state {
-            loop {
-                if let Some(splined) = splinor.next() {
-                    return Some(match splined {
-                        Splined::Value(word) if word != "" => Token::Word(word),
-                        Splined::Splinor(s) => s,
-                        _ => continue,
-                    });
-                } else {
-                    self.unswitch();
-                    break;
-                }
-            }
-        }
-
-        // Skip whitespace, but if at least one whitespace word existed,
-        // remember that, because we return a space token.
+        // Skip whitespace, but if at least one whitespace character existed,
+        // remember that, because then we return a space token.
         let mut whitespace = false;
-        while let Some(word) = self.words.peek() {
-            if !word.is_whitespace() {
+        while let Some(&(_, c)) = self.chars.peek() {
+            if !c.is_whitespace() || c == '\n' || c == '\r' {
                 break;
             }
             whitespace = true;

@@ -111,100 +96,82 @@ impl<'s> Iterator for Tokens<'s> {
         // Function maybe has a body
         if self.state == TS::MaybeBody {
-            match *self.words.peek()? {
-                "[" => {
-                    self.state = TS::Body;
-                    return Some(self.consumed(Token::LeftBracket));
-                },
-                _ => self.unswitch(),
-            }
+            if self.chars.peek()?.1 == '[' {
+                self.state = TS::Body;
+                return Some(self.consumed(Token::LeftBracket));
+            } else {
+                self.unswitch();
+            }
         }
 
         // Now all special cases are handled and we can finally look at the
         // next words.
-        let next = self.words.next()?;
-        let afterwards = self.words.peek();
+        let (next_pos, next) = self.chars.next()?;
+        let afterwards = self.chars.peek().map(|&(_, c)| c);
 
         Some(match next {
             // Special characters
-            "[" => {
+            '[' => {
                 self.switch(TS::Function);
                 Token::LeftBracket
             },
-            "]" => {
+            ']' => {
                 if self.state == TS::Function {
                     self.state = TS::MaybeBody;
                 }
                 Token::RightBracket
             },
-            "$" => Token::Dollar,
-            "#" => Token::Hashtag,
+            '$' => Token::Dollar,
+            '#' => Token::Hashtag,
 
             // Context sensitive operators
-            ":" if self.state == TS::Function => Token::Colon,
-            "=" if self.state == TS::Function => Token::Equals,
+            ':' if self.state == TS::Function => Token::Colon,
+            '=' if self.state == TS::Function => Token::Equals,
 
             // Double star/underscore
-            "*" if afterwards == Some(&"*") => self.consumed(Token::DoubleStar),
-            "__" => Token::DoubleUnderscore,
+            '*' if afterwards == Some('*') => self.consumed(Token::DoubleStar),
+            '_' if afterwards == Some('_') => self.consumed(Token::DoubleUnderscore),
 
             // Newlines
-            "\n" | "\r\n" => Token::Newline,
+            '\n' => Token::Newline,
+            '\r' if afterwards == Some('\n') => self.consumed(Token::Newline),
 
             // Escaping
-            r"\" => {
-                if let Some(next) = afterwards {
-                    let escapable = match *next {
-                        "[" | "]" | "$" | "#" | r"\" | ":" | "=" | "*" | "_" => true,
-                        w if w.starts_with("__") => true,
-                        _ => false,
-                    };
-                    if escapable {
-                        let next = *next;
+            '\\' => {
+                if let Some(&(index, c)) = self.chars.peek() {
+                    if is_special_character(c) {
                         self.advance();
-                        return Some(Token::Word(next));
+                        return Some(self.text(index, index + c.len_utf8()));
                     }
                 }
-                Token::Word(r"\")
+                Token::Text("\\")
             },
 
-            // Double underscores hidden in words.
-            word if word.contains("__") => {
-                let spline = word.spline("__", Token::DoubleUnderscore);
-                self.switch(TS::DoubleUnderscore(spline));
-                return self.next();
-            },
-
             // Now it seems like it's just a normal word.
-            word => Token::Word(word),
+            _ => {
+                // Find out when the word ends.
+                let mut end = (next_pos, next);
+                while let Some(&(index, c)) = self.chars.peek() {
+                    if is_special_character(c) || c.is_whitespace() {
+                        break;
+                    }
+                    end = (index, c);
+                    self.advance();
+                }
+
+                let end_pos = end.0 + end.1.len_utf8();
+                self.text(next_pos, end_pos)
+            },
         })
     }
 }
 
-impl fmt::Debug for Tokens<'_> {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        f.debug_struct("Tokens")
-            .field("source", &self.source)
-            .field("words", &"Peekable<UWordBounds>")
-            .field("state", &self.state)
-            .field("stack", &self.stack)
-            .finish()
-    }
-}
-
-impl PartialEq for TokensState<'_> {
-    fn eq(&self, other: &TokensState) -> bool {
-        use TokensState as TS;
-        match (self, other) {
-            (TS::Body, TS::Body) => true,
-            (TS::Function, TS::Function) => true,
-            (TS::MaybeBody, TS::MaybeBody) => true,
-            // They are not necessarily different, but we don't care
-            _ => false,
-        }
-    }
-}
+/// Whether this character has a special meaning in the language.
+fn is_special_character(character: char) -> bool {
+    match character {
+        '[' | ']' | '$' | '#' | '\\' | ':' | '=' | '*' | '_' => true,
+        _ => false,
+    }
+}
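Putting the pieces together, a hedged usage sketch in the style of the tests further down (assumes this module's Tokens and Token types are in scope; not part of the commit):

    fn demo() {
        // An escaped bracket comes back as plain text; an unescaped one
        // would switch the tokenizer into the Function state instead.
        let tokens: Vec<Token> = Tokens::new(r"Hi \[there").collect();
        // Expected: [Text("Hi"), Space, Text("["), Text("there")]
    }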
@@ -285,8 +252,8 @@ impl<'s, 't> Parser<'s, 't> {
             Token::Space => self.append_space_consumed(),
             Token::Newline => self.switch_consumed(PS::FirstNewline),
 
-            // Words
-            Token::Word(word) => self.append_consumed(Node::Word(word.to_owned())),
+            // Text
+            Token::Text(word) => self.append_consumed(Node::Text(word.to_owned())),
 
             // Functions
             Token::LeftBracket => self.parse_function()?,

@@ -315,7 +282,7 @@ impl<'s, 't> Parser<'s, 't> {
         // The next token should be the name of the function.
         let name = match self.tokens.next() {
-            Some(Token::Word(word)) => {
+            Some(Token::Text(word)) => {
                 if word.is_identifier() {
                     Ok(word.to_owned())
                 } else {

@@ -537,6 +504,39 @@ impl<'s> Iterator for ParseTokens<'s> {
     }
 }
 
+/// More useful functions on `str`'s.
+trait StrExt {
+    /// Whether self consists only of whitespace.
+    fn is_whitespace(&self) -> bool;
+
+    /// Whether this word is a valid unicode identifier.
+    fn is_identifier(&self) -> bool;
+}
+
+impl StrExt for str {
+    fn is_whitespace(&self) -> bool {
+        self.chars().all(|c| c.is_whitespace() && c != '\n')
+    }
+
+    fn is_identifier(&self) -> bool {
+        let mut chars = self.chars();
+
+        match chars.next() {
+            Some(c) if !UnicodeXID::is_xid_start(c) => return false,
+            None => return false,
+            _ => (),
+        }
+
+        while let Some(c) = chars.next() {
+            if !UnicodeXID::is_xid_continue(c) {
+                return false;
+            }
+        }
+
+        true
+    }
+}
+
 /// The error type for parsing.
 pub struct ParseError(String);
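The identifier check relies on the Unicode XID properties. A self-contained sketch of the same logic (assumes the unicode-xid 0.1 API; is_identifier here is a free-function stand-in for the trait method above, not part of the commit):

    use unicode_xid::UnicodeXID;

    fn is_identifier(s: &str) -> bool {
        let mut chars = s.chars();
        match chars.next() {
            // The first character must be XID_Start, the rest XID_Continue.
            Some(c) if UnicodeXID::is_xid_start(c) => chars.all(UnicodeXID::is_xid_continue),
            _ => false,
        }
    }

    fn main() {
        assert!(is_identifier("größe"));  // non-ASCII letters are valid
        assert!(!is_identifier("12pt"));  // digits cannot start an identifier
        assert!(!is_identifier(""));      // empty is not an identifier
    }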
@@ -560,7 +560,7 @@ mod token_tests {
     use super::*;
     use Token::{Space as S, Newline as N, LeftBracket as L, RightBracket as R,
                 Colon as C, Equals as E, DoubleUnderscore as DU, DoubleStar as DS,
-                Dollar as D, Hashtag as H, Word as W};
+                Dollar as D, Hashtag as H, Text as T};
 
     /// Test if the source code tokenizes to the tokens.
     fn test(src: &str, tokens: Vec<Token>) {

@@ -571,7 +571,7 @@ mod token_tests {
     #[test]
     fn tokenize_base() {
         test("", vec![]);
-        test("Hallo", vec![W("Hallo")]);
+        test("Hallo", vec![T("Hallo")]);
         test("[", vec![L]);
         test("]", vec![R]);
         test("$", vec![D]);

@@ -586,26 +586,26 @@ mod token_tests {
     fn tokenize_whitespace_newlines() {
         test(" \t", vec![S]);
         test("First line\r\nSecond line\nThird line\n",
-             vec![W("First"), S, W("line"), N, W("Second"), S, W("line"), N,
-                  W("Third"), S, W("line"), N]);
-        test("Hello \n ", vec![W("Hello"), S, N, S]);
-        test("Dense\nTimes", vec![W("Dense"), N, W("Times")]);
+             vec![T("First"), S, T("line"), N, T("Second"), S, T("line"), N,
+                  T("Third"), S, T("line"), N]);
+        test("Hello \n ", vec![T("Hello"), S, N, S]);
+        test("Dense\nTimes", vec![T("Dense"), N, T("Times")]);
     }
 
     /// Tests if escaping with backslash works as it should.
     #[test]
     fn tokenize_escape() {
-        test(r"\[", vec![W("[")]);
-        test(r"\]", vec![W("]")]);
-        test(r"\#", vec![W("#")]);
-        test(r"\$", vec![W("$")]);
-        test(r"\:", vec![W(":")]);
-        test(r"\=", vec![W("=")]);
-        test(r"\**", vec![W("*"), W("*")]);
-        test(r"\*", vec![W("*")]);
-        test(r"\__", vec![W("__")]);
-        test(r"\_", vec![W("_")]);
-        test(r"\hello", vec![W(r"\"), W("hello")]);
+        test(r"\[", vec![T("[")]);
+        test(r"\]", vec![T("]")]);
+        test(r"\#", vec![T("#")]);
+        test(r"\$", vec![T("$")]);
+        test(r"\:", vec![T(":")]);
+        test(r"\=", vec![T("=")]);
+        test(r"\**", vec![T("*"), T("*")]);
+        test(r"\*", vec![T("*")]);
+        test(r"\__", vec![T("_"), T("_")]);
+        test(r"\_", vec![T("_")]);
+        test(r"\hello", vec![T("\\"), T("hello")]);
     }
 
     /// Tokenizes some more realistic examples.

@@ -616,8 +616,8 @@ mod token_tests {
             Test [italic][example]!
         ]
         ", vec![
-            N, S, L, W("function"), R, L, N, S, W("Test"), S, L, W("italic"), R, L,
-            W("example"), R, W("!"), N, S, R, N, S
+            N, S, L, T("function"), R, L, N, S, T("Test"), S, L, T("italic"), R, L,
+            T("example"), R, T("!"), N, S, R, N, S
         ]);
 
         test(r"

@@ -626,10 +626,10 @@ mod token_tests {
            Das ist ein Beispielsatz mit **fetter** Schrift.
         ", vec![
-            N, S, L, W("page"), C, S, W("size"), E, W("A4"), R, N, S,
-            L, W("font"), C, S, W("size"), E, W("12pt"), R, N, N, S,
-            W("Das"), S, W("ist"), S, W("ein"), S, W("Beispielsatz"), S, W("mit"), S,
-            DS, W("fetter"), DS, S, W("Schrift"), W("."), N, S
+            N, S, L, T("page"), C, S, T("size"), E, T("A4"), R, N, S,
+            L, T("font"), C, S, T("size"), E, T("12pt"), R, N, N, S,
+            T("Das"), S, T("ist"), S, T("ein"), S, T("Beispielsatz"), S, T("mit"), S,
+            DS, T("fetter"), DS, S, T("Schrift."), N, S
         ]);
     }

@@ -638,13 +638,13 @@ mod token_tests {
     #[test]
     fn tokenize_symbols_context() {
         test("[func: key=value][Answer: 7]",
-             vec![L, W("func"), C, S, W("key"), E, W("value"), R, L,
-                  W("Answer"), W(":"), S, W("7"), R]);
+             vec![L, T("func"), C, S, T("key"), E, T("value"), R, L,
+                  T("Answer"), T(":"), S, T("7"), R]);
         test("[[n: k=v]:x][:[=]]:=",
-             vec![L, L, W("n"), C, S, W("k"), E, W("v"), R, C, W("x"), R,
-                  L, W(":"), L, E, R, R, W(":"), W("=")]);
+             vec![L, L, T("n"), C, S, T("k"), E, T("v"), R, C, T("x"), R,
+                  L, T(":"), L, E, R, R, T(":"), T("=")]);
         test("[func: __key__=value]",
-             vec![L, W("func"), C, S, DU, W("key"), DU, E, W("value"), R]);
+             vec![L, T("func"), C, S, DU, T("key"), DU, E, T("value"), R]);
     }
 
     /// This test has a special look at the double underscore syntax, because

@@ -653,16 +653,16 @@ mod token_tests {
     #[test]
     fn tokenize_double_underscore() {
         test("he__llo__world_ _ __ Now this_ is__ special!",
-             vec![W("he"), DU, W("llo"), DU, W("world_"), S, W("_"), S, DU, S, W("Now"), S,
-                  W("this_"), S, W("is"), DU, S, W("special"), W("!")]);
+             vec![T("he"), DU, T("llo"), DU, T("world"), T("_"), S, T("_"), S, DU, S, T("Now"), S,
+                  T("this"), T("_"), S, T("is"), DU, S, T("special!")]);
     }
 
     /// This test is for checking if non-ASCII characters get parsed correctly.
     #[test]
     fn tokenize_unicode() {
         test("[document][Hello 🌍!]",
-             vec![L, W("document"), R, L, W("Hello"), S, W("🌍"), W("!"), R]);
-        test("[f]⺐.", vec![L, W("f"), R, W("⺐"), W(".")]);
+             vec![L, T("document"), R, L, T("Hello"), S, T("🌍!"), R]);
+        test("[f]⺐.", vec![L, T("f"), R, T("⺐.")]);
     }
 }
@@ -674,7 +674,7 @@ mod parse_tests {
     use Node::{Space as S, Newline as N, Func as F};
 
     #[allow(non_snake_case)]
-    fn W(s: &str) -> Node { Node::Word(s.to_owned()) }
+    fn T(s: &str) -> Node { Node::Text(s.to_owned()) }
 
     /// A testing function which just parses its body into a syntax tree.
     #[derive(Debug, PartialEq)]

@@ -764,19 +764,19 @@ mod parse_tests {
     #[test]
     fn parse_base() {
         test("", tree! []);
-        test("Hello World!", tree! [ W("Hello"), S, W("World"), W("!") ]);
+        test("Hello World!", tree! [ T("Hello"), S, T("World!") ]);
     }
 
     /// Test whether newlines generate the correct whitespace.
     #[test]
     fn parse_newlines_whitespace() {
-        test("Hello\nWorld", tree! [ W("Hello"), S, W("World") ]);
-        test("Hello \n World", tree! [ W("Hello"), S, W("World") ]);
-        test("Hello\n\nWorld", tree! [ W("Hello"), N, W("World") ]);
-        test("Hello \n\nWorld", tree! [ W("Hello"), S, N, W("World") ]);
-        test("Hello\n\n World", tree! [ W("Hello"), N, S, W("World") ]);
-        test("Hello \n \n \n World", tree! [ W("Hello"), S, N, S, W("World") ]);
-        test("Hello\n \n\n World", tree! [ W("Hello"), S, N, S, W("World") ]);
+        test("Hello\nWorld", tree! [ T("Hello"), S, T("World") ]);
+        test("Hello \n World", tree! [ T("Hello"), S, T("World") ]);
+        test("Hello\n\nWorld", tree! [ T("Hello"), N, T("World") ]);
+        test("Hello \n\nWorld", tree! [ T("Hello"), S, N, T("World") ]);
+        test("Hello\n\n World", tree! [ T("Hello"), N, S, T("World") ]);
+        test("Hello \n \n \n World", tree! [ T("Hello"), S, N, S, T("World") ]);
+        test("Hello\n \n\n World", tree! [ T("Hello"), S, N, S, T("World") ]);
     }
 
     /// Parse things dealing with functions.

@@ -790,18 +790,18 @@ mod parse_tests {
         test_scoped(&scope, "[test]", tree! [ F(func! { name => "test", body => None }) ]);
         test_scoped(&scope, "This is an [modifier][example] of a function invocation.", tree! [
-            W("This"), S, W("is"), S, W("an"), S,
-            F(func! { name => "modifier", body => tree! [ W("example") ] }), S,
-            W("of"), S, W("a"), S, W("function"), S, W("invocation"), W(".")
+            T("This"), S, T("is"), S, T("an"), S,
+            F(func! { name => "modifier", body => tree! [ T("example") ] }), S,
+            T("of"), S, T("a"), S, T("function"), S, T("invocation.")
         ]);
         test_scoped(&scope, "[func][Hello][modifier][Here][end]", tree! [
             F(func! {
                 name => "func",
-                body => tree! [ W("Hello") ],
+                body => tree! [ T("Hello") ],
             }),
             F(func! {
                 name => "modifier",
-                body => tree! [ W("Here") ],
+                body => tree! [ T("Here") ],
             }),
             F(func! {
                 name => "end",

@@ -820,11 +820,11 @@ mod parse_tests {
             body => tree! [
                 F(func! {
                     name => "func",
-                    body => tree! [ W("call") ],
+                    body => tree! [ T("call") ],
                 }),
             ],
         }),
-        S, W("outside")
+        S, T("outside")
         ]);
     }

@@ -839,12 +839,12 @@ mod parse_tests {
                 name => "func",
                 body => None,
             }),
-            S, W("⺐"), W(".")
+            S, T("⺐.")
         ]);
 
         test_scoped(&scope, "[bold][Hello 🌍!]", tree! [
             F(func! {
                 name => "bold",
-                body => tree! [ W("Hello"), S, W("🌍"), W("!") ],
+                body => tree! [ T("Hello"), S, T("🌍!") ],
             })
         ]);
     }


@@ -30,8 +30,8 @@ pub enum Token<'s> {
     Dollar,
     /// A hashtag starting a _comment_.
     Hashtag,
-    /// Everything else just is a literal word.
-    Word(&'s str),
+    /// Everything else is just text.
+    Text(&'s str),
 }
 
 /// A tree representation of the source.

@@ -62,8 +62,8 @@ pub enum Node {
     ToggleBold,
     /// Indicates that math mode was enabled/disabled.
     ToggleMath,
-    /// A literal word.
-    Word(String),
+    /// Literal text.
+    Text(String),
     /// A function invocation.
     Func(FuncCall),
 }
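With Word gone, downstream consumers simply match on Text. A hypothetical sketch showing how a node list is typically consumed (plain_text is illustrative, not part of this commit):

    /// Collect a node list back into plain text, dropping styling and functions.
    fn plain_text(nodes: &[Node]) -> String {
        let mut out = String::new();
        for node in nodes {
            match node {
                Node::Text(text) => out.push_str(text),
                Node::Space => out.push(' '),
                Node::Newline => out.push('\n'),
                // Style toggles and Func nodes carry no plain text here.
                _ => {}
            }
        }
        out
    }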


@@ -1,135 +0,0 @@
-//! Utility functionality.
-
-use std::iter::Peekable;
-use std::str::Split;
-use unicode_xid::UnicodeXID;
-
-/// Types that can be splined.
-pub trait Splinor {
-    /// Returns an iterator over the substrings splitted by the pattern,
-    /// intertwined with the splinor.
-    ///
-    /// # Example
-    ///
-    /// ```ignore
-    /// #[derive(Debug, Copy, Clone, PartialEq)]
-    /// struct Space;
-    ///
-    /// let v: Vec<Splined<Space>> = "My airplane flies!".spline(" ", Space).collect();
-    /// assert_eq!(v, [
-    ///     Splined::Value("My"),
-    ///     Splined::Splinor(Space),
-    ///     Splined::Value("airplane"),
-    ///     Splined::Splinor(Space),
-    ///     Splined::Value("flies!"),
-    /// ]);
-    /// ```
-    fn spline<'s, T: Clone>(&'s self, pat: &'s str, splinor: T) -> Spline<'s, T>;
-}
-
-impl Splinor for str {
-    fn spline<'s, T: Clone>(&'s self, pat: &'s str, splinor: T) -> Spline<'s, T> {
-        Spline {
-            splinor: Splined::Splinor(splinor),
-            split: self.split(pat).peekable(),
-            next_splinor: false,
-        }
-    }
-}
-
-/// Iterator over splitted values and splinors.
-///
-/// Created by the [`spline`](Splinor::spline) function.
-#[derive(Debug, Clone)]
-pub struct Spline<'s, T> {
-    splinor: Splined<'s, T>,
-    split: Peekable<Split<'s, &'s str>>,
-    next_splinor: bool,
-}
-
-/// Represents either a splitted substring or a splinor.
-#[derive(Debug, Copy, Clone, Eq, PartialEq)]
-pub enum Splined<'s, T> {
-    /// A substring.
-    Value(&'s str),
-    /// An intertwined splinor.
-    Splinor(T),
-}
-
-impl<'s, T: Clone> Iterator for Spline<'s, T> {
-    type Item = Splined<'s, T>;
-
-    fn next(&mut self) -> Option<Splined<'s, T>> {
-        if self.next_splinor && self.split.peek().is_some() {
-            self.next_splinor = false;
-            return Some(self.splinor.clone());
-        } else {
-            self.next_splinor = true;
-            return Some(Splined::Value(self.split.next()?))
-        }
-    }
-}
-
-/// More useful functions on `str`'s.
-pub trait StrExt {
-    /// Whether self consists only of whitespace.
-    fn is_whitespace(&self) -> bool;
-
-    /// Whether this word is a valid unicode identifier.
-    fn is_identifier(&self) -> bool;
-}
-
-impl StrExt for str {
-    fn is_whitespace(&self) -> bool {
-        self.chars().all(|c| c.is_whitespace() && c != '\n')
-    }
-
-    fn is_identifier(&self) -> bool {
-        let mut chars = self.chars();
-
-        match chars.next() {
-            Some(c) if !UnicodeXID::is_xid_start(c) => return false,
-            None => return false,
-            _ => (),
-        }
-
-        while let Some(c) = chars.next() {
-            if !UnicodeXID::is_xid_continue(c) {
-                return false;
-            }
-        }
-
-        true
-    }
-}
-
-#[cfg(test)]
-mod splinor_tests {
-    use super::*;
-    use Splined::{Value as V, Splinor as S};
-
-    #[derive(Debug, Copy, Clone, PartialEq)]
-    enum Token { DoubleUnderscore }
-
-    fn test<T>(string: &str, pat: &str, splinor: T, vec: Vec<Splined<T>>)
-    where T: std::fmt::Debug + Clone + PartialEq {
-        assert_eq!(string.spline(pat, splinor).collect::<Vec<_>>(), vec);
-    }
-
-    #[test]
-    fn splinor() {
-        let s = S(Token::DoubleUnderscore);
-        test("__he__llo__world__", "__", Token::DoubleUnderscore,
-             vec![V(""), s, V("he"), s, V("llo"), s, V("world"), s, V("")]);
-        test("__Italic__", "__", Token::DoubleUnderscore,
-             vec![V(""), s, V("Italic"), s, V("")]);
-        test("Key__Value", "__", Token::DoubleUnderscore,
-             vec![V("Key"), s, V("Value")]);
-        test("__Start__NoEnd", "__", Token::DoubleUnderscore,
-             vec![V(""), s, V("Start"), s, V("NoEnd")]);
-        test("NoStart__End__", "__", Token::DoubleUnderscore,
-             vec![V("NoStart"), s, V("End"), s, V("")]);
-    }
-}