From ede65dcf3ed99974e7989a672f61e9727062c3a7 Mon Sep 17 00:00:00 2001
From: Jesse Braham
Date: Wed, 11 Dec 2024 17:39:11 +0100
Subject: [PATCH] Fix separator-related bug in lexer, rewrite lexer unit tests

---
 onihime/src/lexer/mod.rs | 201 ++++++++++++++++++++-------------------
 1 file changed, 103 insertions(+), 98 deletions(-)

diff --git a/onihime/src/lexer/mod.rs b/onihime/src/lexer/mod.rs
index 90d11b0..b996410 100644
--- a/onihime/src/lexer/mod.rs
+++ b/onihime/src/lexer/mod.rs
@@ -14,21 +14,17 @@ mod error;
 mod symbol;
 mod token;

-/// A trait for checking if a character is a separator.
-pub trait Separator {
-    /// Check if the character is a separator.
-    fn is_separator(&self) -> bool;
-}
-
-impl Separator for char {
-    fn is_separator(&self) -> bool {
-        self.is_ascii_whitespace() || matches!(self, '(' | ')' | '[' | ']' | '{' | '}')
-    }
+/// Determine if the current character is a separator, performing 1-character
+/// lookahead as needed to handle multi-character separators.
+fn is_separator(current: char, next: Option<char>) -> bool {
+    current.is_ascii_whitespace()
+        || matches!(current, '(' | ')' | '[' | ']' | '{' | '}' | ';')
+        || (current == '#' && next.is_some_and(|c| matches!(c, '|' | '{')))
 }

 /// A lexer, used by the parser.
 #[derive(Debug)]
-pub struct Lexer<'lexer> {
+pub(crate) struct Lexer<'lexer> {
     input: Chars<'lexer>,
     byte: usize,
     source: Arc<Source>,
@@ -37,7 +33,7 @@ pub struct Lexer<'lexer> {
 impl<'lexer> Lexer<'lexer> {
     /// Create a new lexer instance from a string.
     #[must_use]
-    pub fn new(input: &'lexer str) -> Self {
+    pub(crate) fn new(input: &'lexer str) -> Self {
         let source = Arc::new(Source::new(None, input.to_string()));

         Self {
@@ -48,25 +44,19 @@ impl<'lexer> Lexer<'lexer> {
     }

     /// Set the name of the [Source] being lexically analyzed.
-    pub fn set_name(&mut self, name: String) {
-        // FIXME: Avoid unwrapping here (if possible?)
+    pub(crate) fn set_name(&mut self, name: String) {
+        // TODO: Avoid unwrapping here (if possible?)
         Arc::get_mut(&mut self.source).unwrap().set_name(name);
     }

     /// The source being lexically analyzed.
+    #[cfg(test)]
     #[must_use]
-    pub fn source(&self) -> Arc<Source> {
+    pub(crate) fn source(&self) -> Arc<Source> {
         self.source.clone()
     }

-    /// Get the unparsed input.
-    #[must_use]
-    pub fn get_unparsed(&self) -> &str {
-        self.input.as_str()
-    }
-
     /// Get the current position of the lexer.
-    #[inline]
     #[must_use]
     pub(crate) fn span(&self) -> Span {
         Span::new(self.byte..self.byte, self.source.clone())
@@ -79,7 +69,7 @@ impl<'lexer> Lexer<'lexer> {
     }

     /// Get the nth character ahead of the current character without advancing.
-    #[inline]
+    #[must_use]
     fn peek(&self, n: usize) -> Option<char> {
         self.input.as_str().chars().nth(n)
     }
@@ -105,7 +95,7 @@ impl<'lexer> Lexer<'lexer> {
     fn read_word(&mut self) -> String {
         let mut word = String::new();
         while let Some(c) = self.current() {
-            if c.is_separator() || (c == '#' && self.peek(1) == Some('{')) {
+            if is_separator(c, self.peek(1)) {
                 break;
             }

@@ -117,6 +107,7 @@ impl<'lexer> Lexer<'lexer> {
     }

     /// Parse a value from the input or return an error.
+    #[must_use]
     fn parse_or<T>(&mut self, err: impl Fn(String) -> LexerErrorKind) -> Result<T, LexerError>
     where
         T: FromStr,
@@ -129,7 +120,7 @@ impl<'lexer> Lexer<'lexer> {
     }

     /// Read the next token from the input.
-    pub fn read(&mut self) -> Result<Option<Token>, LexerError> {
+    pub(crate) fn read(&mut self) -> Result<Option<Token>, LexerError> {
         // Eat whitespace until we encounter a meaningful character, or simply return if
         // we have reached the end of input and no additional characters can be read:
         let c = loop {
@@ -143,7 +134,9 @@ impl<'lexer> Lexer<'lexer> {
         };

         let mut span = self.span();
+
         let kind = match c {
+            // TODO: This allows for unclosed block comments; do we care?
             '#' if self.peek(1) == Some('|') => {
                 self.advance(); // '#'
                 self.advance(); // '|'
@@ -167,6 +160,7 @@ impl<'lexer> Lexer<'lexer> {
                     self.advance();
                 }

+                // Line comments continue until a newline character is encountered:
                 let mut comment = String::new();
                 while let Some(c) = self.advance() {
                     if c == '\n' {
@@ -199,6 +193,8 @@ impl<'lexer> Lexer<'lexer> {
                     Some('\\') => match self.advance() {
                         Some(c @ ('"' | '\\')) => c,
                         Some('n') => '\n',
+                        Some('r') => '\r',
+                        Some('t') => '\t',
                         Some('e') => '\x1b',
                         Some(c) => {
                             self.read_word(); // Recover from the error
@@ -250,12 +246,15 @@ impl<'lexer> Lexer<'lexer> {
                     Some('\\') => match self.advance() {
                         Some(c @ ('"' | '\\')) => c,
                         Some('n') => '\n',
+                        Some('r') => '\r',
+                        Some('t') => '\t',
                         Some('e') => '\x1b',
                         Some(c) => {
+                            self.read_word(); // Recover from the error
                             return Err(LexerError::new(
                                 LexerErrorKind::InvalidEscape(c),
                                 ch_span.join(&self.span()),
-                            ))
+                            ));
                         }
                         None => {
                             return Err(LexerError::new(
@@ -271,7 +270,10 @@ impl<'lexer> Lexer<'lexer> {
                 });
             }

-            if self.current().is_some_and(|c| !c.is_separator()) {
+            if self
+                .current()
+                .is_some_and(|c| !is_separator(c, self.peek(1)))
+            {
                 self.read_word(); // Recover from the error
                 return Err(LexerError::new(
                     LexerErrorKind::InvalidString,
@@ -282,7 +284,7 @@ impl<'lexer> Lexer<'lexer> {
                 TokenKind::String(string)
             }
             ':' => {
-                self.advance();
+                self.advance(); // ':'

                 TokenKind::Keyword(Symbol(self.read_word()))
             }
@@ -320,35 +322,40 @@ mod tests {
         #[test]
         fn $name() {
             let mut lexer = Lexer::new($input);
-
             for token in $tokens {
-                let x = lexer.next().map(|r| match r {
+                let kind = lexer.next().map(|r| match r {
                     Ok(t) => Ok(t.kind),
                     Err(e) => Err(e.kind),
                 });
-
-                assert_eq!(x, Some(token));
+                assert_eq!(kind, Some(token));
             }
-
             assert_eq!(lexer.next(), None);
         }
     };
 }

-    test!(block_comment: "#| foo\nbar |#(- 1)", [
-        Ok(TokenKind::BlockComment("foo\nbar".into())),
+    test!(block_comment: "foo#| bar\nbaz |#qux", [
+        Ok(TokenKind::Symbol(Symbol::from("foo"))),
+        Ok(TokenKind::BlockComment("bar\nbaz".into())),
+        Ok(TokenKind::Symbol(Symbol::from("qux"))),
+    ]);
+
+    test!(line_comment: "nil ;; foo; bar\nnil", [
+        Ok(TokenKind::Nil),
+        Ok(TokenKind::LineComment("foo; bar".into())),
+        Ok(TokenKind::Nil),
+    ]);
+
+    test!(list: "(1 () -2.3)", [
         Ok(TokenKind::OpenParen),
-        Ok(TokenKind::Symbol(Symbol::from("-"))),
         Ok(TokenKind::Number(1.0)),
+        Ok(TokenKind::OpenParen),
+        Ok(TokenKind::CloseParen),
+        Ok(TokenKind::Number(-2.3)),
         Ok(TokenKind::CloseParen),
     ]);

-    test!(line_comment: "; foo\n;; bar baz; qux", [
-        Ok(TokenKind::LineComment("foo".into())),
-        Ok(TokenKind::LineComment("bar baz; qux".into())),
-    ]);
-
-    test!(hashset: "{{} true false}", [
+    test!(set: "{{} true false}", [
         Ok(TokenKind::OpenBrace),
         Ok(TokenKind::OpenBrace),
         Ok(TokenKind::CloseBrace),
         Ok(TokenKind::True),
         Ok(TokenKind::False),
         Ok(TokenKind::CloseBrace),
     ]);

-    test!(hashmap: "#{:bar 0 :baz 1}", [
-        Ok(TokenKind::OpenHashBrace),
-        Ok(TokenKind::Keyword(Symbol::from("bar"))),
+    test!(vector: "[0 10 200]", [
+        Ok(TokenKind::OpenBracket),
         Ok(TokenKind::Number(0.0)),
-        Ok(TokenKind::Keyword(Symbol::from("baz"))),
+        Ok(TokenKind::Number(10.0)),
+        Ok(TokenKind::Number(200.0)),
+        Ok(TokenKind::CloseBracket),
+    ]);
+
+    test!(map: "#{:a 0 :b 1}", [
+        Ok(TokenKind::OpenHashBrace),
+        Ok(TokenKind::Keyword(Symbol::from("a"))),
+        Ok(TokenKind::Number(0.0)),
+        Ok(TokenKind::Keyword(Symbol::from("b"))),
         Ok(TokenKind::Number(1.0)),
         Ok(TokenKind::CloseBrace),
     ]);

-    test!(vector: "[0 1 2]", [
-        Ok(TokenKind::OpenBracket),
-        Ok(TokenKind::Number(0.0)),
-        Ok(TokenKind::Number(1.0)),
-        Ok(TokenKind::Number(2.0)),
-        Ok(TokenKind::CloseBracket),
-    ]);
-
-    test!(char_literal: r"'x' '\n' '\e' '\\' '\q' 'b", [
+    test!(char_literal: r"'x' '\n' '\r' '\t' '\e' '\\' '\q' 'b", [
         Ok(TokenKind::Char('x')),
         Ok(TokenKind::Char('\n')),
+        Ok(TokenKind::Char('\r')),
+        Ok(TokenKind::Char('\t')),
         Ok(TokenKind::Char('\x1b')),
         Ok(TokenKind::Char('\\')),
         Err(LexerErrorKind::InvalidEscape('q')),
         Err(LexerErrorKind::UnclosedChar),
     ]);

-    test!(nested_lists: "(+ 14 25.5 333 (* 2 5))", [
-        Ok(TokenKind::OpenParen),
-        Ok(TokenKind::Symbol(Symbol::from("+"))),
-        Ok(TokenKind::Number(14.0)),
-        Ok(TokenKind::Number(25.5)),
-        Ok(TokenKind::Number(333.0)),
-        Ok(TokenKind::OpenParen),
-        Ok(TokenKind::Symbol(Symbol::from("*"))),
-        Ok(TokenKind::Number(2.0)),
-        Ok(TokenKind::Number(5.0)),
-        Ok(TokenKind::CloseParen),
-        Ok(TokenKind::CloseParen),
-    ]);
-
-    test!(newline: "(+ 14 25.5 333\n(* 2 5 5.x))", [
-        Ok(TokenKind::OpenParen),
-        Ok(TokenKind::Symbol(Symbol::from("+"))),
-        Ok(TokenKind::Number(14.0)),
-        Ok(TokenKind::Number(25.5)),
-        Ok(TokenKind::Number(333.0)),
-        Ok(TokenKind::OpenParen),
-        Ok(TokenKind::Symbol(Symbol::from("*"))),
-        Ok(TokenKind::Number(2.0)),
-        Ok(TokenKind::Number(5.0)),
-        Err(LexerErrorKind::InvalidNumber("5.x".into())),
-        Ok(TokenKind::CloseParen),
-        Ok(TokenKind::CloseParen),
-    ]);
-
-    test!(negative_minus: "(- 1 -2 3)", [
-        Ok(TokenKind::OpenParen),
-        Ok(TokenKind::Symbol(Symbol::from("-"))),
-        Ok(TokenKind::Number(1.0)),
-        Ok(TokenKind::Number(-2.0)),
-        Ok(TokenKind::Number(3.0)),
-        Ok(TokenKind::CloseParen),
-    ]);
-
     test!(error_unclosed_char_escape: r"'\", [
         Err(LexerErrorKind::UnclosedChar),
     ]);

     test!(error_unclosed_char: r"'b", [
         Err(LexerErrorKind::UnclosedChar),
     ]);

-    test!(error_parse_numbers: "2 55 3.144 0.0001 1.1.1", [
-        Ok(TokenKind::Number(2.0)),
-        Ok(TokenKind::Number(55.0)),
-        Ok(TokenKind::Number(3.144)),
-        Ok(TokenKind::Number(0.0001)),
+    test!(number: "0 -1 20.0 +0.003", [
+        Ok(TokenKind::Number(0.0)),
+        Ok(TokenKind::Number(-1.0)),
+        Ok(TokenKind::Number(20.0)),
+        Ok(TokenKind::Number(0.003)),
+    ]);
+
+    test!(error_parse_number: "1.1.1 0.x", [
         Err(LexerErrorKind::InvalidNumber("1.1.1".into())),
+        Err(LexerErrorKind::InvalidNumber("0.x".into())),
+    ]);
+
+    test!(string: "\"\" \"xyz\" \"This is a string!\"", [
+        Ok(TokenKind::String("".into())),
+        Ok(TokenKind::String("xyz".into())),
+        Ok(TokenKind::String("This is a string!".into())),
+    ]);
+
+    test!(string_with_escapes: "\"\\e[0mfoo\\nbar\\r\\t\"", [
+        Ok(TokenKind::String("\x1b[0mfoo\nbar\r\t".into())),
+    ]);
+
+    test!(error_invalid_escape_string: "\"oh no \\p\"", [
+        Err(LexerErrorKind::InvalidEscape('p')),
     ]);

     test!(error_unclosed_string: "\"hiii", [
         Err(LexerErrorKind::UnclosedString),
     ]);

     test!(error_invalid_string: "\"hiii\"222", [
         Err(LexerErrorKind::InvalidString),
     ]);
+
+    test!(nested_lists: "(+ (- 0 -1)\n(* 2 3))", [
+        Ok(TokenKind::OpenParen),
+        Ok(TokenKind::Symbol(Symbol::from("+"))),
+        Ok(TokenKind::OpenParen),
+        Ok(TokenKind::Symbol(Symbol::from("-"))),
+        Ok(TokenKind::Number(0.0)),
+        Ok(TokenKind::Number(-1.0)),
+        Ok(TokenKind::CloseParen),
+        Ok(TokenKind::OpenParen),
+        Ok(TokenKind::Symbol(Symbol::from("*"))),
+        Ok(TokenKind::Number(2.0)),
+        Ok(TokenKind::Number(3.0)),
+        Ok(TokenKind::CloseParen),
+        Ok(TokenKind::CloseParen),
+    ]);
 }
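
As a quick illustration of the lookahead rule this patch introduces, here is a minimal standalone sketch. The body of is_separator is copied verbatim from the diff above; the main wrapper and the sample characters are only illustrative and are not part of the onihime sources.

    // Standalone sketch of the separator check added by this patch.
    fn is_separator(current: char, next: Option<char>) -> bool {
        current.is_ascii_whitespace()
            || matches!(current, '(' | ')' | '[' | ']' | '{' | '}' | ';')
            || (current == '#' && next.is_some_and(|c| matches!(c, '|' | '{')))
    }

    fn main() {
        // Single-character separators need no lookahead.
        assert!(is_separator(' ', None));
        assert!(is_separator(';', Some('x')));

        // '#' only separates when it begins a '#|' block comment or a '#{' hash brace.
        assert!(is_separator('#', Some('|')));
        assert!(is_separator('#', Some('{')));
        assert!(!is_separator('#', Some('x')));
        assert!(!is_separator('#', None));

        // Ordinary word characters are never separators.
        assert!(!is_separator('a', Some('(')));
    }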