From 4cdbccbc8a2f49cc477b56fd0d2e5605f20096b7 Mon Sep 17 00:00:00 2001
From: Jesse Braham <jesse@beta7.io>
Date: Fri, 6 Dec 2024 18:27:45 +0100
Subject: [PATCH] Lexer is starting to look pretty okay

---
 onihime/src/lexer/error.rs  | 16 +++++++++-----
 onihime/src/lexer/mod.rs    | 40 +++++++++++++++++++--------------
 onihime/src/lexer/symbol.rs | 13 +----------
 onihime/src/lexer/token.rs  | 44 +++++++++++++++++++++----------------
 4 files changed, 61 insertions(+), 52 deletions(-)
diff --git a/onihime/src/lexer/error.rs b/onihime/src/lexer/error.rs
index f50081a..303dbc7 100644
--- a/onihime/src/lexer/error.rs
+++ b/onihime/src/lexer/error.rs
@@ -1,13 +1,13 @@
 use crate::span::Span;
 
-/// Errors that can occur during lexical analysis.
+/// Kinds of errors that may occur during lexical analysis.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub enum LexerErrorKind {
     /// An invalid escape sequence was encountered.
     InvalidEscape(char),
-    /// An invalid number was encountered.
+    /// An invalid numeric literal was encountered.
     InvalidNumber(String),
-    /// An invalid string was encountered.
+    /// An invalid string literal was encountered.
     InvalidString,
     /// An unclosed character literal was encountered.
     UnclosedChar,
@@ -15,10 +15,16 @@ pub enum LexerErrorKind {
     UnclosedString,
 }
 
-/// Lexer error, with a start and end location.
+/// An error which occurred during lexical analysis.
+///
+/// `LexerError`s contain the kind of error which occurred, as well as a [Span]
+/// specifying the [Source] and [Location] of the error.
+///
+/// [Source]: crate::span::Source
+/// [Location]: crate::span::Location
 #[derive(Debug, Clone, PartialEq, Hash)]
 pub struct LexerError {
-    /// The type of error encountered.
+    /// The kind of error encountered.
     pub kind: LexerErrorKind,
     /// The span in which the error occurred.
     pub span: Span,
diff --git a/onihime/src/lexer/mod.rs b/onihime/src/lexer/mod.rs
index c725a91..90d11b0 100644
--- a/onihime/src/lexer/mod.rs
+++ b/onihime/src/lexer/mod.rs
@@ -94,6 +94,7 @@ impl<'lexer> Lexer<'lexer> {
 
     /// Advance the lexer by one character, and then return the specified
     /// `TokenKind`:
+    #[must_use]
     fn advance_and(&mut self, kind: TokenKind) -> TokenKind {
         self.advance();
 
@@ -129,6 +130,8 @@ impl<'lexer> Lexer<'lexer> {
 
     /// Read the next token from the input.
     pub fn read(&mut self) -> Result<Option<Token>, LexerError> {
+        // Eat whitespace until we encounter a meaningful character, or simply return if
+        // we have reached the end of input and no additional characters can be read:
         let c = loop {
             match self.current() {
                 Some(c) if c.is_ascii_whitespace() => {
@@ -141,26 +144,25 @@ impl<'lexer> Lexer<'lexer> {
 
         let mut span = self.span();
         let kind = match c {
-            '#' if matches!(self.peek(1), Some('|')) => {
+            '#' if self.peek(1) == Some('|') => {
                 self.advance(); // '#'
                 self.advance(); // '|'
 
                 let mut comment = String::new();
                 while let Some(c) = self.advance() {
-                    match c {
-                        '|' if matches!(self.peek(0), Some('#')) => {
-                            self.advance(); // '#'
-                            break;
-                        }
-                        c => {
-                            comment.push(c);
-                        }
+                    if c == '|' && self.peek(0) == Some('#') {
+                        self.advance(); // '#'
+                        break; // `|#` closes the block comment
                     }
+
+                    comment.push(c);
                 }
 
                 TokenKind::BlockComment(comment.trim().into())
             }
             ';' => {
+                // Line comments may start with any number of semicolons, so consume however
+                // many are present at the beginning of the comment:
                 while self.current().is_some_and(|c| c == ';') {
                     self.advance();
                 }
@@ -183,10 +185,10 @@ impl<'lexer> Lexer<'lexer> {
             '}' => self.advance_and(TokenKind::CloseBrace),
             '[' => self.advance_and(TokenKind::OpenBracket),
             ']' => self.advance_and(TokenKind::CloseBracket),
-
-            '#' if matches!(self.peek(1), Some('{')) => {
+            '#' if self.peek(1) == Some('{') => {
                 self.advance(); // '#'
                 self.advance(); // '{'
+
                 TokenKind::OpenHashBrace
             }
 
@@ -281,6 +283,7 @@ impl<'lexer> Lexer<'lexer> {
             }
             ':' => {
                 self.advance();
+
                 TokenKind::Keyword(Symbol(self.read_word()))
             }
             _ => {
@@ -354,16 +357,13 @@ mod tests {
         Ok(TokenKind::CloseBrace),
     ]);
 
-    test!(hashmap: "(foo #{:bar 0 :baz 1})", [
-        Ok(TokenKind::OpenParen),
-        Ok(TokenKind::Symbol(Symbol::from("foo"))),
+    test!(hashmap: "#{:bar 0 :baz 1}", [
         Ok(TokenKind::OpenHashBrace),
         Ok(TokenKind::Keyword(Symbol::from("bar"))),
         Ok(TokenKind::Number(0.0)),
         Ok(TokenKind::Keyword(Symbol::from("baz"))),
         Ok(TokenKind::Number(1.0)),
         Ok(TokenKind::CloseBrace),
-        Ok(TokenKind::CloseParen),
     ]);
 
     test!(vector: "[0 1 2]", [
@@ -383,7 +383,7 @@ mod tests {
         Err(LexerErrorKind::UnclosedChar),
     ]);
 
-    test!(lex: "(+ 14 25.5 333 (* 2 5))", [
+    test!(nested_lists: "(+ 14 25.5 333 (* 2 5))", [
         Ok(TokenKind::OpenParen),
         Ok(TokenKind::Symbol(Symbol::from("+"))),
         Ok(TokenKind::Number(14.0)),
@@ -421,6 +421,14 @@ mod tests {
         Ok(TokenKind::CloseParen),
     ]);
 
+    test!(error_unclosed_char_escape: r"'\", [
+        Err(LexerErrorKind::UnclosedChar),
+    ]);
+
+    test!(error_unclosed_char_empty: r"'", [
+        Err(LexerErrorKind::UnclosedChar),
+    ]);
+
     test!(error_parse_numbers: "2 55 3.144 0.0001 1.1.1", [
         Ok(TokenKind::Number(2.0)),
         Ok(TokenKind::Number(55.0)),
diff --git a/onihime/src/lexer/symbol.rs b/onihime/src/lexer/symbol.rs
index 0eac9fa..b33cc5b 100644
--- a/onihime/src/lexer/symbol.rs
+++ b/onihime/src/lexer/symbol.rs
@@ -12,20 +12,9 @@ impl Symbol {
     }
 }
 
+#[cfg(not(tarpaulin_include))] // Intended to exclude this impl from cargo-tarpaulin coverage.
 impl std::fmt::Display for Symbol {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "{}", self.0)
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn display() {
-        assert_eq!(Symbol::from("foo").to_string(), "foo");
-        assert_eq!(Symbol::from("+").to_string(), "+");
-        assert_eq!(Symbol::from("bar0").to_string(), "bar0");
-    }
-}
diff --git a/onihime/src/lexer/token.rs b/onihime/src/lexer/token.rs
index 6ebc0ee..1263c14 100644
--- a/onihime/src/lexer/token.rs
+++ b/onihime/src/lexer/token.rs
@@ -1,49 +1,55 @@
 use super::Symbol;
 use crate::span::Span;
 
-/// The type of a [Token].
+/// Possible kinds of a [Token].
 #[derive(Debug, Clone, PartialEq)]
 pub enum TokenKind {
-    /// Block comment, e.g. '#| ... |#'
+    /// Block comment, e.g. `#| ... |#`
     BlockComment(String),
-    /// Line comment, e.g. '; ...'
+    /// Line comment, e.g. `; ...`
     LineComment(String),
 
-    /// Opening parenthesis, e.g. '('
+    /// Opening parenthesis, e.g. `(`
     OpenParen,
-    /// Closing parenthesis, e.g. ')'
+    /// Closing parenthesis, e.g. `)`
     CloseParen,
-    /// Opening brace, e.g. '{'
+    /// Opening brace, e.g. `{`
     OpenBrace,
-    /// Closing brace, e.g. '}'
+    /// Closing brace, e.g. `}`
     CloseBrace,
-    /// Opening bracket, e.g. '['
+    /// Opening bracket, e.g. `[`
     OpenBracket,
-    /// Closing bracket, e.g. ']'
+    /// Closing bracket, e.g. `]`
     CloseBracket,
-    /// Opening hash-brace, e.g. '#{'
+    /// Opening hash-brace, e.g. `#{`
     OpenHashBrace,
 
-    /// Boolean, e.g. 'true', 'false'
+    /// Boolean, e.g. `true`, `false`
     Bool(bool),
-    /// Character, e.g. 'c', '\n'
+    /// Character, e.g. `'c'`, `'\n'`
     Char(char),
-    /// Number, e.g. '1', '2.0', '0.003'
+    /// Number, e.g. `1`, `2.0`, `0.003`
     Number(f64),
-    /// String, e.g. '"foo bar"'
+    /// String, e.g. `"foo bar"`
     String(String),
-    /// Keyword, e.g. ':baz'
+    /// Keyword, e.g. `:baz`
     Keyword(Symbol),
-    /// Symbol, e.g. 'qux', '+'
+    /// Symbol, e.g. `qux`, `+`
     Symbol(Symbol),
-    /// Nil, e.g. 'nil'
+    /// Nil, e.g. `nil`
     Nil,
 }
 
-/// A token with a start and end location.
+/// A token encountered during lexical analysis.
+///
+/// `Token`s contain the kind of token which was found, as well as a [Span]
+/// specifying the [Source] and [Location] of the token.
+///
+/// [Source]: crate::span::Source
+/// [Location]: crate::span::Location
 #[derive(Debug, Clone, PartialEq)]
 pub struct Token {
-    /// The type of token.
+    /// The kind of token.
     pub kind: TokenKind,
     /// The span in which the token occurs.
     pub span: Span,