diff --git a/benches/oneshot.rs b/benches/oneshot.rs index a42a710d9..63f201ac5 100644 --- a/benches/oneshot.rs +++ b/benches/oneshot.rs @@ -6,7 +6,7 @@ use typst::eval::eval; use typst::layout::layout; use typst::loading::MemLoader; use typst::parse::{parse, Scanner, TokenMode, Tokens}; -use typst::source::{SourceFile, SourceId}; +use typst::source::SourceId; use typst::Context; const SRC: &str = include_str!("bench.typ"); @@ -44,13 +44,11 @@ fn bench_scan(iai: &mut Iai) { } fn bench_tokenize(iai: &mut Iai) { - let src = SourceFile::detached(SRC); - iai.run(|| Tokens::new(black_box(&src), black_box(TokenMode::Markup)).count()); + iai.run(|| Tokens::new(black_box(&SRC), black_box(TokenMode::Markup)).count()); } fn bench_parse(iai: &mut Iai) { - let src = SourceFile::detached(SRC); - iai.run(|| parse(&src)); + iai.run(|| parse(&SRC)); } fn bench_eval(iai: &mut Iai) { diff --git a/src/parse/mod.rs b/src/parse/mod.rs index 22288d01a..c6def4dcc 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -12,12 +12,11 @@ pub use tokens::*; use std::rc::Rc; -use crate::source::SourceFile; use crate::syntax::*; use crate::util::EcoString; /// Parse a source file. -pub fn parse(source: &SourceFile) -> Rc { +pub fn parse(source: &str) -> Rc { let mut p = Parser::new(source); markup(&mut p); p.finish() diff --git a/src/parse/parser.rs b/src/parse/parser.rs index 240de43d7..374e7c09f 100644 --- a/src/parse/parser.rs +++ b/src/parse/parser.rs @@ -1,15 +1,14 @@ use std::ops::Range; use std::rc::Rc; -use super::{TokenMode, Tokens}; -use crate::source::{SourceFile, SourceId}; +use super::{is_newline, TokenMode, Tokens}; use crate::syntax::{ErrorPosition, Green, GreenData, GreenNode, NodeKind}; use crate::util::EcoString; /// A convenient token-based parser. pub struct Parser<'s> { /// The parsed file. - source: &'s SourceFile, + src: &'s str, /// An iterator over the source tokens. tokens: Tokens<'s>, /// The stack of open groups. @@ -61,11 +60,11 @@ pub enum Group { impl<'s> Parser<'s> { /// Create a new parser for the source string. - pub fn new(source: &'s SourceFile) -> Self { - let mut tokens = Tokens::new(source, TokenMode::Markup); + pub fn new(src: &'s str) -> Self { + let mut tokens = Tokens::new(src, TokenMode::Markup); let next = tokens.next(); Self { - source, + src, tokens, groups: vec![], next: next.clone(), @@ -78,11 +77,6 @@ impl<'s> Parser<'s> { } } - /// The id of the parsed source file. - pub fn id(&self) -> SourceId { - self.source.id() - } - /// Start a nested node. /// /// Each start call has to be matched with a call to `end`, @@ -366,12 +360,16 @@ impl<'s> Parser<'s> { /// Determine the column index for the given byte index. pub fn column(&self, index: usize) -> usize { - self.source.byte_to_column(index).unwrap() + self.src[.. index] + .chars() + .rev() + .take_while(|&c| !is_newline(c)) + .count() } /// Slice out part of the source string. pub fn get(&self, range: Range) -> &'s str { - self.source.get(range).unwrap() + self.src.get(range).unwrap() } /// Continue parsing in a group. diff --git a/src/parse/resolve.rs b/src/parse/resolve.rs index 8d4c04d49..3fab98a4b 100644 --- a/src/parse/resolve.rs +++ b/src/parse/resolve.rs @@ -1,5 +1,5 @@ use super::{is_newline, Scanner}; -use crate::syntax::RawToken; +use crate::syntax::RawData; use crate::util::EcoString; /// Resolve all escape sequences in a string. @@ -46,18 +46,18 @@ pub fn resolve_hex(sequence: &str) -> Option { } /// Resolve the language tag and trims the raw text. -pub fn resolve_raw(column: usize, backticks: u8, text: &str) -> RawToken { +pub fn resolve_raw(column: usize, backticks: u8, text: &str) -> RawData { if backticks > 1 { let (tag, inner) = split_at_lang_tag(text); let (text, block) = trim_and_split_raw(column, inner); - RawToken { + RawData { lang: Some(tag.into()), text: text.into(), backticks, block, } } else { - RawToken { + RawData { lang: None, text: split_lines(text).join("\n").into(), backticks, diff --git a/src/parse/scanner.rs b/src/parse/scanner.rs index 8e3e42782..edf28e179 100644 --- a/src/parse/scanner.rs +++ b/src/parse/scanner.rs @@ -106,6 +106,16 @@ impl<'s> Scanner<'s> { self.index } + /// The column index of a given index in the source string. + #[inline] + pub fn column(&self, index: usize) -> usize { + self.src[.. index] + .chars() + .rev() + .take_while(|&c| !is_newline(c)) + .count() + } + /// Jump to an index in the source string. #[inline] pub fn jump(&mut self, index: usize) { diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs index 1d2e32ec5..ef2678d4c 100644 --- a/src/parse/tokens.rs +++ b/src/parse/tokens.rs @@ -1,7 +1,6 @@ use super::{is_newline, resolve_raw, Scanner}; use crate::geom::{AngularUnit, LengthUnit}; use crate::parse::resolve::{resolve_hex, resolve_string}; -use crate::source::SourceFile; use crate::syntax::*; use crate::util::EcoString; @@ -9,7 +8,6 @@ use std::rc::Rc; /// An iterator over the tokens of a string of source code. pub struct Tokens<'s> { - source: &'s SourceFile, s: Scanner<'s>, mode: TokenMode, } @@ -26,12 +24,8 @@ pub enum TokenMode { impl<'s> Tokens<'s> { /// Create a new token iterator with the given mode. #[inline] - pub fn new(source: &'s SourceFile, mode: TokenMode) -> Self { - Self { - s: Scanner::new(source.src()), - source, - mode, - } + pub fn new(source: &'s str, mode: TokenMode) -> Self { + Self { s: Scanner::new(source), mode } } /// Get the current token mode. @@ -244,7 +238,7 @@ impl<'s> Tokens<'s> { if self.s.eat_if('}') { if let Some(character) = resolve_hex(&sequence) { - NodeKind::UnicodeEscape(UnicodeEscapeToken { + NodeKind::UnicodeEscape(UnicodeEscapeData { character, }) } else { @@ -314,7 +308,7 @@ impl<'s> Tokens<'s> { } fn raw(&mut self) -> NodeKind { - let column = self.source.byte_to_column(self.s.index() - 1).unwrap(); + let column = self.s.column(self.s.index() - 1); let mut backticks = 1; while self.s.eat_if('`') && backticks < u8::MAX { backticks += 1; @@ -322,7 +316,7 @@ impl<'s> Tokens<'s> { // Special case for empty inline block. if backticks == 2 { - return NodeKind::Raw(Rc::new(RawToken { + return NodeKind::Raw(Rc::new(RawData { text: EcoString::new(), lang: None, backticks: 1, @@ -397,7 +391,7 @@ impl<'s> Tokens<'s> { }; if terminated { - NodeKind::Math(Rc::new(MathToken { + NodeKind::Math(Rc::new(MathData { formula: self.s.get(start .. end).into(), display, })) @@ -492,7 +486,7 @@ impl<'s> Tokens<'s> { } })); if self.s.eat_if('"') { - NodeKind::Str(StrToken { string }) + NodeKind::Str(StrData { string }) } else { NodeKind::Error(ErrorPosition::End, "expected quote".into()) } @@ -567,7 +561,7 @@ mod tests { use TokenMode::{Code, Markup}; fn UnicodeEscape(character: char) -> NodeKind { - NodeKind::UnicodeEscape(UnicodeEscapeToken { character }) + NodeKind::UnicodeEscape(UnicodeEscapeData { character }) } fn Error(pos: ErrorPosition, message: &str) -> NodeKind { @@ -575,7 +569,7 @@ mod tests { } fn Raw(text: &str, lang: Option<&str>, backticks_left: u8, block: bool) -> NodeKind { - NodeKind::Raw(Rc::new(RawToken { + NodeKind::Raw(Rc::new(RawData { text: text.into(), lang: lang.map(Into::into), backticks: backticks_left, @@ -586,7 +580,7 @@ mod tests { fn Math(formula: &str, display: bool, err_msg: Option<&str>) -> NodeKind { match err_msg { None => { - NodeKind::Math(Rc::new(MathToken { formula: formula.into(), display })) + NodeKind::Math(Rc::new(MathData { formula: formula.into(), display })) } Some(msg) => NodeKind::Error( ErrorPosition::End, @@ -597,7 +591,7 @@ mod tests { fn Str(string: &str, terminated: bool) -> NodeKind { if terminated { - NodeKind::Str(StrToken { string: string.into() }) + NodeKind::Str(StrData { string: string.into() }) } else { NodeKind::Error(ErrorPosition::End, "expected quote".into()) } @@ -687,7 +681,7 @@ mod tests { }}; (@$mode:ident: $src:expr => $($token:expr),*) => {{ let src = $src; - let found = Tokens::new(&SourceFile::detached(src.clone()), $mode).collect::>(); + let found = Tokens::new(&src, $mode).collect::>(); let expected = vec![$($token.clone()),*]; check(&src, found, expected); }}; diff --git a/src/source.rs b/src/source.rs index e3803f575..3b7212514 100644 --- a/src/source.rs +++ b/src/source.rs @@ -8,10 +8,10 @@ use std::rc::Rc; use serde::{Deserialize, Serialize}; -use crate::diag::{Error, TypResult}; +use crate::diag::TypResult; use crate::loading::{FileHash, Loader}; use crate::parse::{is_newline, parse, Scanner}; -use crate::syntax::{GreenNode, Markup, NodeKind, RedNode}; +use crate::syntax::{GreenNode, Markup, RedNode}; use crate::util::PathExt; #[cfg(feature = "codespan-reporting")] @@ -134,28 +134,22 @@ impl SourceFile { pub fn new(id: SourceId, path: &Path, src: String) -> Self { let mut line_starts = vec![0]; line_starts.extend(newlines(&src)); - let mut init = Self { + Self { id, path: path.normalize(), + root: parse(&src), src, line_starts, - root: Rc::new(GreenNode::new(NodeKind::Markup, 0)), - }; - - let root = parse(&init); - init.root = root; - init + } } pub fn ast(&self) -> TypResult { let red = RedNode::new_root(self.root.clone(), self.id); let errors = red.errors(); if errors.is_empty() { - Ok(red.as_ref().cast().unwrap()) + Ok(red.cast().unwrap()) } else { - Err(Box::new( - errors.into_iter().map(|(span, msg)| Error::new(span, msg)).collect(), - )) + Err(Box::new(errors)) } } diff --git a/src/syntax/ast.rs b/src/syntax/ast.rs index bdd0767de..6ca271a96 100644 --- a/src/syntax/ast.rs +++ b/src/syntax/ast.rs @@ -1,8 +1,39 @@ use super::{Ident, NodeKind, RedNode, RedRef, Span, TypedNode}; use crate::geom::{AngularUnit, LengthUnit}; -use crate::node; use crate::util::EcoString; +macro_rules! node { + ($(#[$attr:meta])* $name:ident) => { + node!{$(#[$attr])* $name => $name} + }; + ($(#[$attr:meta])* $variant:ident => $name:ident) => { + #[derive(Debug, Clone, PartialEq)] + #[repr(transparent)] + $(#[$attr])* + pub struct $name(RedNode); + + impl TypedNode for $name { + fn cast_from(node: RedRef) -> Option { + if node.kind() != &NodeKind::$variant { + return None; + } + + Some(Self(node.own())) + } + } + + impl $name { + pub fn span(&self) -> Span { + self.0.span() + } + + pub fn underlying(&self) -> RedRef { + self.0.as_ref() + } + } + }; +} + node! { /// The syntactical root capable of representing a full parsed document. Markup diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs index 9fd2b21d2..ca41d33f7 100644 --- a/src/syntax/mod.rs +++ b/src/syntax/mod.rs @@ -15,6 +15,7 @@ pub use ident::*; pub use pretty::*; pub use span::*; +use crate::diag::Error; use crate::geom::{AngularUnit, LengthUnit}; use crate::source::SourceId; use crate::util::EcoString; @@ -94,9 +95,9 @@ impl GreenNode { } pub fn with_children(kind: NodeKind, len: usize, children: Vec) -> Self { - let mut meta = GreenData::new(kind, len); - meta.erroneous |= children.iter().any(|c| c.erroneous()); - Self { data: meta, children } + let mut data = GreenData::new(kind, len); + data.erroneous |= children.iter().any(|c| c.erroneous()); + Self { data, children } } pub fn with_child(kind: NodeKind, len: usize, child: impl Into) -> Self { @@ -180,6 +181,10 @@ impl<'a> RedRef<'a> { Span::new(self.id, self.offset, self.offset + self.green.len()) } + pub fn len(&self) -> usize { + self.green.len() + } + pub fn cast(self) -> Option where T: TypedNode, @@ -205,6 +210,29 @@ impl<'a> RedRef<'a> { }) } + pub fn errors(&self) -> Vec { + if !self.green.erroneous() { + return vec![]; + } + + match self.kind() { + NodeKind::Error(pos, msg) => { + let span = match pos { + ErrorPosition::Start => self.span().at_start(), + ErrorPosition::Full => self.span(), + ErrorPosition::End => self.span().at_end(), + }; + + vec![Error::new(span, msg.to_string())] + } + _ => self + .children() + .filter(|red| red.green.erroneous()) + .flat_map(|red| red.errors()) + .collect(), + } + } + pub(crate) fn typed_child(&self, kind: &NodeKind) -> Option { self.children() .find(|x| mem::discriminant(x.kind()) == mem::discriminant(kind)) @@ -219,6 +247,18 @@ impl<'a> RedRef<'a> { } } +impl Debug for RedRef<'_> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "{:?}: {:?}", self.kind(), self.span())?; + let mut children = self.children().peekable(); + if children.peek().is_some() { + f.write_str(" ")?; + f.debug_list().entries(children.map(RedRef::own)).finish()?; + } + Ok(()) + } +} + #[derive(Clone, PartialEq)] pub struct RedNode { id: SourceId, @@ -231,12 +271,27 @@ impl RedNode { Self { id, offset: 0, green: root.into() } } + pub fn as_ref<'a>(&'a self) -> RedRef<'a> { + RedRef { + id: self.id, + offset: self.offset, + green: &self.green, + } + } + pub fn span(&self) -> Span { self.as_ref().span() } pub fn len(&self) -> usize { - self.green.len() + self.as_ref().len() + } + + pub fn cast(self) -> Option + where + T: TypedNode, + { + T::cast_from(self.as_ref()) } pub fn kind(&self) -> &NodeKind { @@ -247,36 +302,8 @@ impl RedNode { self.as_ref().children() } - pub fn errors(&self) -> Vec<(Span, EcoString)> { - if !self.green.erroneous() { - return vec![]; - } - - match self.kind() { - NodeKind::Error(pos, msg) => { - let span = match pos { - ErrorPosition::Start => self.span().at_start(), - ErrorPosition::Full => self.span(), - ErrorPosition::End => self.span().at_end(), - }; - - vec![(span, msg.clone())] - } - _ => self - .as_ref() - .children() - .filter(|red| red.green.erroneous()) - .flat_map(|red| red.own().errors()) - .collect(), - } - } - - pub fn as_ref<'a>(&'a self) -> RedRef<'a> { - RedRef { - id: self.id, - offset: self.offset, - green: &self.green, - } + pub fn errors<'a>(&'a self) -> Vec { + self.as_ref().errors() } pub(crate) fn typed_child(&self, kind: &NodeKind) -> Option { @@ -294,15 +321,7 @@ impl RedNode { impl Debug for RedNode { fn fmt(&self, f: &mut Formatter) -> fmt::Result { - write!(f, "{:?}: {:?}", self.kind(), self.span())?; - let children = self.as_ref().children().collect::>(); - if !children.is_empty() { - f.write_str(" ")?; - f.debug_list() - .entries(children.into_iter().map(RedRef::own)) - .finish()?; - } - Ok(()) + self.as_ref().fmt(f) } } @@ -419,7 +438,7 @@ pub enum NodeKind { EmDash, /// A slash and the letter "u" followed by a hexadecimal unicode entity /// enclosed in curly braces: `\u{1F5FA}`. - UnicodeEscape(UnicodeEscapeToken), + UnicodeEscape(UnicodeEscapeData), /// Strong text was enabled / disabled: `*`. Strong, /// Emphasized text was enabled / disabled: `_`. @@ -440,9 +459,9 @@ pub enum NodeKind { ListBullet, /// An arbitrary number of backticks followed by inner contents, terminated /// with the same number of backticks: `` `...` ``. - Raw(Rc), + Raw(Rc), /// Dollar signs surrounding inner contents. - Math(Rc), + Math(Rc), /// An identifier: `center`. Ident(EcoString), /// A boolean: `true`, `false`. @@ -463,7 +482,7 @@ pub enum NodeKind { /// A fraction unit: `3fr`. Fraction(f64), /// A quoted string: `"..."`. - Str(StrToken), + Str(StrData), /// An array expression: `(1, "hi", 12cm)`. Array, /// A dictionary expression: `(thickness: 3pt, pattern: dashed)`. @@ -534,15 +553,14 @@ pub enum ErrorPosition { /// A quoted string token: `"..."`. #[derive(Debug, Clone, PartialEq)] -#[repr(transparent)] -pub struct StrToken { +pub struct StrData { /// The string inside the quotes. pub string: EcoString, } /// A raw block token: `` `...` ``. #[derive(Debug, Clone, PartialEq)] -pub struct RawToken { +pub struct RawData { /// The raw text in the block. pub text: EcoString, /// The programming language of the raw text. @@ -555,7 +573,7 @@ pub struct RawToken { /// A math formula token: `$2pi + x$` or `$[f'(x) = x^2]$`. #[derive(Debug, Clone, PartialEq)] -pub struct MathToken { +pub struct MathData { /// The formula between the dollars. pub formula: EcoString, /// Whether the formula is display-level, that is, it is surrounded by @@ -565,8 +583,7 @@ pub struct MathToken { /// A unicode escape sequence token: `\u{1F5FA}`. #[derive(Debug, Clone, PartialEq)] -#[repr(transparent)] -pub struct UnicodeEscapeToken { +pub struct UnicodeEscapeData { /// The resulting unicode character. pub character: char, } @@ -712,36 +729,3 @@ impl NodeKind { } } } - -#[macro_export] -macro_rules! node { - ($(#[$attr:meta])* $name:ident) => { - node!{$(#[$attr])* $name => $name} - }; - ($(#[$attr:meta])* $variant:ident => $name:ident) => { - #[derive(Debug, Clone, PartialEq)] - #[repr(transparent)] - $(#[$attr])* - pub struct $name(RedNode); - - impl TypedNode for $name { - fn cast_from(node: RedRef) -> Option { - if node.kind() != &NodeKind::$variant { - return None; - } - - Some(Self(node.own())) - } - } - - impl $name { - pub fn span(&self) -> Span { - self.0.span() - } - - pub fn underlying(&self) -> RedRef { - self.0.as_ref() - } - } - }; -}