From c2b6f2dc359d3b5c5b09996b8902c09e27271b4c Mon Sep 17 00:00:00 2001 From: Martin Haug Date: Sat, 29 Aug 2020 13:53:59 +0200 Subject: [PATCH] =?UTF-8?q?Added=20code=20blocks=20=F0=9F=9A=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/layout/tree.rs | 18 ++++- src/syntax/parsing.rs | 162 ++++++++++++++++++++++++++++++++++++++++-- src/syntax/tokens.rs | 80 ++++++++++++++++++++- src/syntax/tree.rs | 8 +++ 4 files changed, 261 insertions(+), 7 deletions(-) diff --git a/src/layout/tree.rs b/src/layout/tree.rs index e500c4ba2..f039d9b09 100644 --- a/src/layout/tree.rs +++ b/src/layout/tree.rs @@ -3,7 +3,7 @@ use crate::style::LayoutStyle; use crate::syntax::decoration::Decoration; use crate::syntax::span::{Span, Spanned}; -use crate::syntax::tree::{CallExpr, SyntaxNode, SyntaxTree}; +use crate::syntax::tree::{CallExpr, SyntaxNode, SyntaxTree, CodeBlockExpr}; use crate::{DynFuture, Feedback, Pass}; use super::line::{LineContext, LineLayouter}; use super::text::{layout_text, TextContext}; @@ -80,6 +80,7 @@ impl<'a> TreeLayouter<'a> { } SyntaxNode::Raw(lines) => self.layout_raw(lines).await, + SyntaxNode::CodeBlock(block) => self.layout_code(block).await, SyntaxNode::Par(par) => self.layout_par(par).await, SyntaxNode::Call(call) => { self.layout_call(Spanned::new(call, node.span)).await; @@ -128,6 +129,21 @@ impl<'a> TreeLayouter<'a> { self.style.text.fallback = fallback; } + async fn layout_code(&mut self, block: &CodeBlockExpr) { + let fallback = self.style.text.fallback.clone(); + self.style.text.fallback + .list_mut() + .insert(0, "monospace".to_string()); + self.style.text.fallback.flatten(); + + for line in &block.raw { + self.layout_text(line).await; + self.layouter.finish_line(); + } + + self.style.text.fallback = fallback; + } + async fn layout_par(&mut self, par: &SyntaxTree) { self.layout_tree(par).await; self.layouter.add_secondary_spacing( diff --git a/src/syntax/parsing.rs b/src/syntax/parsing.rs index 
29a9d788f..e9bbf2e58 100644 --- a/src/syntax/parsing.rs +++ b/src/syntax/parsing.rs @@ -7,8 +7,15 @@ use crate::color::RgbaColor; use crate::compute::table::SpannedEntry; use super::decoration::Decoration; use super::span::{Pos, Span, Spanned}; -use super::tokens::{is_newline_char, Token, TokenMode, Tokens}; -use super::tree::{CallExpr, Expr, SyntaxNode, SyntaxTree, TableExpr}; +use super::tokens::{is_newline_char, Token, TokenMode, Tokens, is_identifier}; +use super::tree::{ + CallExpr, + Expr, + SyntaxNode, + SyntaxTree, + TableExpr, + CodeBlockExpr, +}; use super::Ident; /// Parse a string of source code. @@ -84,6 +91,34 @@ impl Parser<'_> { self.with_span(SyntaxNode::Raw(unescape_raw(raw))) } + Token::Code { lang, raw, terminated } => { + if !terminated { + error!( + @self.feedback, Span::at(token.span.end), + "expected code block to close", + ); + } + let mut valid_ident = false; + let mut lang = lang.map(|s| s.map(|v| { + if is_identifier(v) { + valid_ident = true; + } + Ident(v.to_string()) + })); + + if !valid_ident { + if let Some(l) = lang { + error!( + @self.feedback, l.span, + "expected language to be a valid identifier", + ); + } + lang = None; + } + + self.with_span(SyntaxNode::CodeBlock(CodeBlockExpr { raw: unescape_code(raw), lang })) + } + Token::Text(text) => { self.with_span(SyntaxNode::Text(text.to_string())) } @@ -627,6 +662,84 @@ fn unescape_raw(raw: &str) -> Vec { lines } +/// Unescape raw markup and split it into lines. +fn unescape_code(raw: &str) -> Vec { + let mut iter = raw.chars().peekable(); + let mut line = String::new(); + let mut lines = Vec::new(); + let mut backticks: usize = 0; + + // This assignment is used in line 731, 733; + // the compiler does not want to acknowledge that, however. 
+ #[allow(unused_assignments)] + let mut update_backtick_count = true; + + while let Some(c) = iter.next() { + update_backtick_count = true; + if is_newline_char(c) { + if c == '\r' && iter.peek() == Some(&'\n') { + iter.next(); + } + + lines.push(std::mem::take(&mut line)); + } else { + if c == '\\' && backticks > 0 { + let mut tail = String::new(); + let mut escape_success = false; + + let mut backticks_after_slash: u8 = 0; + + while let Some(&s) = iter.peek() { + match s { + '\\' => { + if backticks_after_slash == 0 { + tail.push(s); + } else { + // Pattern like `\`\` should fail + // escape and just be printed verbatim. + break; + } + } + '`' => { + tail.push(s); + backticks_after_slash += 1; + if backticks_after_slash == 2 { + escape_success = true; + iter.next(); + break; + } + } + _ => { break } + } + + iter.next(); + } + + if !escape_success { + line.push(c); + backticks = backticks_after_slash as usize; + update_backtick_count = false; + } else { + backticks = 0; + } + + line.push_str(&tail); + } else { + line.push(c); + } + } + + if update_backtick_count && c == '`' { + backticks += 1; + } else if update_backtick_count { + backticks = 0; + } + } + + lines.push(line); + lines +} + #[cfg(test)] #[allow(non_snake_case)] mod tests { @@ -652,6 +765,14 @@ mod tests { }; } + fn Lang(text: &str) -> Option> { Some(Spanned::zero(Ident(text.to_string()))) } + + macro_rules! C { + ($lang:expr, $($line:expr),* $(,)?) => { + SyntaxNode::CodeBlock(CodeBlockExpr { raw: vec![$($line.to_string()) ,*], lang: $lang }) + }; + } + macro_rules! 
P { ($($tts:tt)*) => { SyntaxNode::Par(Tree![@$($tts)*]) }; } @@ -799,6 +920,28 @@ mod tests { test("raw\\", vec!["raw\\"]); } + #[test] + fn test_unescape_code() { + fn test(raw: &str, expected: Vec<&str>) { + assert_eq!(unescape_code(raw), expected); + } + + test("code\\`", vec!["code\\`"]); + test("code`\\``", vec!["code```"]); + test("code`\\`a", vec!["code`\\`a"]); + test("code``hi`\\``", vec!["code``hi```"]); + test("code`\\\\``", vec!["code`\\``"]); + test("code`\\`\\`go", vec!["code`\\`\\`go"]); + test("code`\\`\\``", vec!["code`\\```"]); + test("code\ntext", vec!["code", "text"]); + test("a\r\nb", vec!["a", "b"]); + test("a\n\nb", vec!["a", "", "b"]); + test("a\r\x0Bb", vec!["a", "", "b"]); + test("a\r\n\r\nb", vec!["a", "", "b"]); + test("code\\a", vec!["code\\a"]); + test("code\\", vec!["code\\"]); + } + #[test] fn test_parse_simple_nodes() { t!("" => ); @@ -811,8 +954,19 @@ mod tests { t!("`py`" => P![R!["py"]]); t!("`hi\nyou" => P![R!["hi", "you"]]); e!("`hi\nyou" => s(1,3, 1,3, "expected backtick")); - t!("`hi\\`du`" => P![R!["hi`du"]]); - t!("💜\n\n 🌍" => P![T("💜")], P![T("🌍")]); + t!("`hi\\`du`" => P![R!["hi`du"]]); + t!("```java System.out.print```" => P![ + C![Lang("java"), "System.out.print"] + ]); + t!("``` console.log(\n\"alert\"\n)" => P![ + C![None, "console.log(", "\"alert\"", ")"] + ]); + t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => P![ + C![Lang("typst"), " Typst uses ``` to indicate code blocks"] + ]); + e!("``` hi\nyou" => s(1,3, 1,3, "expected code block to close")); + e!("```🌍 hi\nyou```" => s(0,3, 0,4, "expected language to be a valid identifier")); + t!("💜\n\n 🌍" => P![T("💜")], P![T("🌍")]); ts!("hi" => s(0,0, 0,2, P![s(0,0, 0,2, T("hi"))])); ts!("*Hi*" => s(0,0, 0,4, P![ diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs index 1dcf9022e..dbba175dc 100644 --- a/src/syntax/tokens.rs +++ b/src/syntax/tokens.rs @@ -90,6 +90,16 @@ pub enum Token<'s> { terminated: bool, }, + /// Multi-line code block. 
+ Code { + /// The language of the code block, if specified. + lang: Option>, + /// The raw text (not yet unescaped as for strings). + raw: &'s str, + /// Whether the closing backticks were present. + terminated: bool, + }, + /// Any other consecutive string. Text(&'s str), @@ -127,6 +137,7 @@ impl<'s> Token<'s> { Underscore => "underscore", Backslash => "backslash", Raw { .. } => "raw text", + Code { .. } => "code block", Text(_) => "text", Invalid("*/") => "end of block comment", Invalid(_) => "invalid token", @@ -241,7 +252,7 @@ impl<'s> Iterator for Tokens<'s> { // Style toggles. '_' if self.mode == Body => Underscore, - '`' if self.mode == Body => self.read_raw(), + '`' if self.mode == Body => self.read_raw_and_code(), // An escaped thing. '\\' if self.mode == Body => self.read_escaped(), @@ -330,8 +341,65 @@ impl<'s> Tokens<'s> { Str { string, terminated } } - fn read_raw(&mut self) -> Token<'s> { + fn read_raw_and_code(&mut self) -> Token<'s> { let (raw, terminated) = self.read_until_unescaped('`'); + if raw.len() == 0 && terminated && self.peek() == Some('`') { + // Third tick found; this is a code block + self.eat(); + let mut backticks = 0; + let mut terminated = true; + // Reads the lang tag (until newline or whitespace) + let lang_start = self.pos(); + let (lang_opt, _) = self.read_string_until( + |c| c == '`' || c.is_whitespace() || is_newline_char(c), + false, 0, 0); + let lang_end = self.pos(); + + #[derive(Debug, PartialEq)] + enum WhitespaceIngestion { All, ExceptNewline, Never } + let mut ingest_whitespace = WhitespaceIngestion::Never; + let mut start = self.index(); + + while backticks < 3 { + match self.eat() { + Some('`') => backticks += 1, + Some('\\') if backticks == 1 && self.peek() == Some('`') => { + backticks = 0; + } + Some(c) => { + // Remove whitespace between language and content or + // first line break, deal with CRLF and CR line endings. 
+ if ingest_whitespace != WhitespaceIngestion::All + && c == '\n' { + start += 1; + ingest_whitespace = WhitespaceIngestion::All; + } else if ingest_whitespace != WhitespaceIngestion::All + && c == '\r' { + start += 1; + ingest_whitespace = WhitespaceIngestion::ExceptNewline; + } else if ingest_whitespace == WhitespaceIngestion::Never + && c.is_whitespace() { + start += 1; + } else { + ingest_whitespace = WhitespaceIngestion::All; + } + } + None => { + terminated = false; + break; + } + } + } + let end = self.index() - (if terminated { 3 } else { 0 }); + + return Code { + lang: if lang_opt.len() == 0 { None } else { + Some(Spanned::new(lang_opt, Span::new(lang_start, lang_end))) + }, + raw: &self.src[start..end], + terminated + } + } Raw { raw, terminated } } @@ -494,6 +562,7 @@ mod tests { use crate::length::Length; use crate::syntax::tests::*; use super::*; + use super::super::span::Spanned; use Token::{ Space as S, LineComment as LC, BlockComment as BC, @@ -515,6 +584,9 @@ mod tests { fn Str(string: &str, terminated: bool) -> Token { Token::Str { string, terminated } } fn Raw(raw: &str, terminated: bool) -> Token { Token::Raw { raw, terminated } } + fn Code<'a>(lang: Option<&'a str>, raw: &'a str, terminated: bool) -> Token<'a> { + Token::Code { lang: lang.map(Spanned::zero), raw, terminated } + } macro_rules! t { ($($tts:tt)*) => {test!(@spans=false, $($tts)*)} } macro_rules! 
ts { ($($tts:tt)*) => {test!(@spans=true, $($tts)*)} } @@ -568,6 +640,10 @@ mod tests { t!(Body, "`[func]`" => Raw("[func]", true)); t!(Body, "`]" => Raw("]", false)); t!(Body, "`\\``" => Raw("\\`", true)); + t!(Body, "``not code`" => Raw("", true), T("not"), S(0), T("code"), Raw("", false)); + t!(Body, "```rust hi```" => Code(Some("rust"), "hi", true)); + t!(Body, "``` hi`\\``" => Code(None, "hi`\\``", false)); + t!(Body, "```js \r\n document.write(\"go\")" => Code(Some("js"), " document.write(\"go\")", false)); t!(Body, "\\ " => Backslash, S(0)); t!(Header, "_`" => Invalid("_`")); } diff --git a/src/syntax/tree.rs b/src/syntax/tree.rs index ae2e98920..ace5ad8ec 100644 --- a/src/syntax/tree.rs +++ b/src/syntax/tree.rs @@ -31,6 +31,8 @@ pub enum SyntaxNode { Text(String), /// Lines of raw text. Raw(Vec), + /// An optionally highlighted multi-line code block. + CodeBlock(CodeBlockExpr), /// A paragraph of child nodes. Par(SyntaxTree), /// A function call. @@ -199,3 +201,9 @@ impl CallExpr { } } } +/// A code block. +#[derive(Debug, Clone, PartialEq)] +pub struct CodeBlockExpr { + pub lang: Option>, + pub raw: Vec, +}