From 11917bb1835c0ecb5a4097e47ca97f4dc796a9ea Mon Sep 17 00:00:00 2001
From: Jesse Braham <jesse@beta7.io>
Date: Thu, 5 Dec 2024 17:16:45 +0100
Subject: [PATCH] Even more lexer improvements, increased test coverage too!

---
 onihime/Cargo.toml         |  3 ++
 onihime/src/lexer/error.rs |  8 ++--
 onihime/src/lexer/mod.rs   | 89 +++++++++++++++++++++-----------------
 onihime/src/span.rs        |  6 +--
 4 files changed, 58 insertions(+), 48 deletions(-)

diff --git a/onihime/Cargo.toml b/onihime/Cargo.toml
index ef48661..76c5369 100644
--- a/onihime/Cargo.toml
+++ b/onihime/Cargo.toml
@@ -8,3 +8,6 @@ repository.workspace = true
 license.workspace    = true
 
 [dependencies]
+
+[lints.rust]
+unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tarpaulin_include)'] }
diff --git a/onihime/src/lexer/error.rs b/onihime/src/lexer/error.rs
index 58e6d7c..f50081a 100644
--- a/onihime/src/lexer/error.rs
+++ b/onihime/src/lexer/error.rs
@@ -1,10 +1,8 @@
 use crate::span::Span;
 
 /// Errors that can occur during lexical analysis.
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub enum LexerErrorKind {
-    /// An invalid character was encountered.
-    InvalidChar,
     /// An invalid escape sequence was encountered.
     InvalidEscape(char),
     /// An invalid number was encountered.
@@ -18,7 +16,7 @@ pub enum LexerErrorKind {
 }
 
 /// Lexer error, with a start and end location.
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Hash)]
 pub struct LexerError {
     /// The type of error encountered.
     pub kind: LexerErrorKind,
@@ -36,12 +34,12 @@ impl LexerError {
 
 impl std::error::Error for LexerError {}
 
+#[cfg(not(tarpaulin_include))]
 impl std::fmt::Display for LexerError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         use LexerErrorKind::*;
 
         match &self.kind {
-            InvalidChar => write!(f, "Invalid character literal"),
             InvalidEscape(c) => write!(f, "Unknown escape sequence '\\{c}' in string"),
             InvalidNumber(n) => write!(f, "`{n}` is not a valid numeric literal"),
             InvalidString => write!(f, "Invalid string literal"),
diff --git a/onihime/src/lexer/mod.rs b/onihime/src/lexer/mod.rs
index a4b53a7..c725a91 100644
--- a/onihime/src/lexer/mod.rs
+++ b/onihime/src/lexer/mod.rs
@@ -92,6 +92,14 @@ impl<'lexer> Lexer<'lexer> {
         Some(c)
     }
 
+    /// Advance the lexer by one character, and then return the specified
+    /// `TokenKind`.
+    fn advance_and(&mut self, kind: TokenKind) -> TokenKind {
+        self.advance();
+
+        kind
+    }
+
     /// Read a word from the input until a separator is reached.
     fn read_word(&mut self) -> String {
         let mut word = String::new();
@@ -135,13 +143,12 @@ impl<'lexer> Lexer<'lexer> {
         let kind = match c {
             '#' if matches!(self.peek(1), Some('|')) => {
                 self.advance(); // '#'
-                self.advance(); // '|#'
+                self.advance(); // '|'
 
                 let mut comment = String::new();
                 while let Some(c) = self.advance() {
                     match c {
                         '|' if matches!(self.peek(0), Some('#')) => {
-                            self.advance(); // '|'
                             self.advance(); // '#'
                             break;
                         }
@@ -154,44 +161,29 @@ impl<'lexer> Lexer<'lexer> {
                 TokenKind::BlockComment(comment.trim().into())
             }
             ';' => {
+                while self.current().is_some_and(|c| c == ';') {
+                    self.advance();
+                }
+
                 let mut comment = String::new();
                 while let Some(c) = self.advance() {
-                    match c {
-                        ';' => continue,
-                        '\n' => break,
-                        c => {
-                            comment.push(c);
-                        }
+                    if c == '\n' {
+                        break;
                     }
+
+                    comment.push(c);
                 }
 
                 TokenKind::LineComment(comment.trim().into())
             }
 
-            '(' => {
-                self.advance();
-                TokenKind::OpenParen
-            }
-            ')' => {
-                self.advance();
-                TokenKind::CloseParen
-            }
-            '{' => {
-                self.advance();
-                TokenKind::OpenBrace
-            }
-            '}' => {
-                self.advance();
-                TokenKind::CloseBrace
-            }
-            '[' => {
-                self.advance();
-                TokenKind::OpenBracket
-            }
-            ']' => {
-                self.advance();
-                TokenKind::CloseBracket
-            }
+            '(' => self.advance_and(TokenKind::OpenParen),
+            ')' => self.advance_and(TokenKind::CloseParen),
+            '{' => self.advance_and(TokenKind::OpenBrace),
+            '}' => self.advance_and(TokenKind::CloseBrace),
+            '[' => self.advance_and(TokenKind::OpenBracket),
+            ']' => self.advance_and(TokenKind::CloseBracket),
+
             '#' if matches!(self.peek(1), Some('{')) => {
                 self.advance(); // '#'
                 self.advance(); // '{'
@@ -207,6 +199,7 @@ impl<'lexer> Lexer<'lexer> {
                         Some('n') => '\n',
                         Some('e') => '\x1b',
                         Some(c) => {
+                            self.read_word(); // Recover from the error
                             return Err(LexerError::new(
                                 LexerErrorKind::InvalidEscape(c),
                                 span.join(&self.span()),
@@ -229,9 +222,9 @@ impl<'lexer> Lexer<'lexer> {
                 };
 
                 if self.advance() != Some('\'') {
-                    self.read_word();
+                    self.read_word(); // Recover from the error
                     return Err(LexerError::new(
-                        LexerErrorKind::InvalidChar,
+                        LexerErrorKind::UnclosedChar,
                         span.join(&self.span()),
                     ));
                 }
@@ -277,7 +270,7 @@ impl<'lexer> Lexer<'lexer> {
                 }
 
                 if self.current().is_some_and(|c| !c.is_separator()) {
-                    self.read_word();
+                    self.read_word(); // Recover from the error
                     return Err(LexerError::new(
                         LexerErrorKind::InvalidString,
                         span.join(&self.span()),
@@ -339,13 +332,17 @@ mod tests {
         };
     }
 
-    test!(block_comment: "#| foo\nbar |#", [
-        Ok(TokenKind::BlockComment("foo\nbar".into()))
+    test!(block_comment: "#| foo\nbar |#(- 1)", [
+        Ok(TokenKind::BlockComment("foo\nbar".into())),
+        Ok(TokenKind::OpenParen),
+        Ok(TokenKind::Symbol(Symbol::from("-"))),
+        Ok(TokenKind::Number(1.0)),
+        Ok(TokenKind::CloseParen),
     ]);
 
-    test!(line_comment: "; foo\n;; bar baz qux", [
+    test!(line_comment: "; foo\n;; bar baz; qux", [
         Ok(TokenKind::LineComment("foo".into())),
-        Ok(TokenKind::LineComment("bar baz qux".into())),
+        Ok(TokenKind::LineComment("bar baz; qux".into())),
     ]);
 
     test!(hashset: "{{} true false}", [
@@ -369,9 +366,21 @@ mod tests {
         Ok(TokenKind::CloseParen),
     ]);
 
-    test!(char_literal: "'x' '\n'", [
+    test!(vector: "[0 1 2]", [
+        Ok(TokenKind::OpenBracket),
+        Ok(TokenKind::Number(0.0)),
+        Ok(TokenKind::Number(1.0)),
+        Ok(TokenKind::Number(2.0)),
+        Ok(TokenKind::CloseBracket),
+    ]);
+
+    test!(char_literal: r"'x' '\n' '\e' '\\' '\q' 'b", [
         Ok(TokenKind::Char('x')),
         Ok(TokenKind::Char('\n')),
+        Ok(TokenKind::Char('\x1b')),
+        Ok(TokenKind::Char('\\')),
+        Err(LexerErrorKind::InvalidEscape('q')),
+        Err(LexerErrorKind::UnclosedChar),
     ]);
 
     test!(lex: "(+ 14 25.5 333 (* 2 5))", [
diff --git a/onihime/src/span.rs b/onihime/src/span.rs
index 1c78a8b..22d61c5 100644
--- a/onihime/src/span.rs
+++ b/onihime/src/span.rs
@@ -1,7 +1,7 @@
 use std::{cmp::Ordering, iter, ops::Range, sync::Arc};
 
 /// A location within some source text.
-#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
+#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
 pub struct Location {
     line: usize,
     column: usize,
@@ -25,7 +25,7 @@ impl PartialOrd for Location {
 }
 
 /// Some (optionally named) source text.
-#[derive(Debug, Default, Clone, PartialEq, Eq)]
+#[derive(Debug, Default, Clone, PartialEq, Eq, Hash)]
 pub struct Source {
     name: Option<String>,
     contents: String,
@@ -87,7 +87,7 @@ impl Source {
 }
 
 /// A contiguous sequence of bytes within some source.
-#[derive(Debug, Default, Clone)]
+#[derive(Debug, Default, Clone, Hash)]
 pub struct Span {
     bytes: Range<usize>,
     source: Arc<Source>,