Lexer is starting to look pretty okay

2024-12-06 18:27:45 +01:00 · 2024-12-06 18:27:45 +01:00 · 4cdbccbc8a
commit 4cdbccbc8a
parent 11917bb183
4 changed files with 61 additions and 52 deletions
--- a/onihime/src/lexer/error.rs
+++ b/onihime/src/lexer/error.rs
@ -1,13 +1,13 @@
 use crate::span::Span;

-/// Errors that can occur during lexical analysis.
+/// Kinds of errors that may occur during lexical analysis.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub enum LexerErrorKind {
    /// An invalid escape sequence was encountered.
    InvalidEscape(char),
-    /// An invalid number was encountered.
+    /// An invalid numeric literal was encountered.
    InvalidNumber(String),
-    /// An invalid string was encountered.
+    /// An invalid string literal was encountered.
    InvalidString,
    /// An unclosed character literal was encountered.
    UnclosedChar,
@ -15,10 +15,16 @@ pub enum LexerErrorKind {
    UnclosedString,
 }

-/// Lexer error, with a start and end location.
+/// An error which occurred during lexical analysis.
+///
+/// `LexerError`s contain the kind of error which occurred, as well as a [Span]
+/// specifying the [Source] and [Location] of the error.
+///
+/// [Source]: crate::span::Source
+/// [Location]: crate::span::Location
 #[derive(Debug, Clone, PartialEq, Hash)]
 pub struct LexerError {
-    /// The type of error encountered.
+    /// The kind of error encountered.
    pub kind: LexerErrorKind,
    /// The span in which the error occurred.
    pub span: Span,
--- a/onihime/src/lexer/mod.rs
+++ b/onihime/src/lexer/mod.rs
@ -94,6 +94,7 @@ impl<'lexer> Lexer<'lexer> {

    /// Advance the lexer by one character, and then return the specified
    /// `TokenKind`:
+    #[must_use]
    fn advance_and(&mut self, kind: TokenKind) -> TokenKind {
        self.advance();

@ -129,6 +130,8 @@ impl<'lexer> Lexer<'lexer> {

    /// Read the next token from the input.
    pub fn read(&mut self) -> Result<Option<Token>, LexerError> {
+        // Skip whitespace until we reach a meaningful character; if the input is
+        // exhausted first, there is nothing left to read, so simply return:
        let c = loop {
            match self.current() {
                Some(c) if c.is_ascii_whitespace() => {
@ -141,26 +144,25 @@ impl<'lexer> Lexer<'lexer> {

        let mut span = self.span();
        let kind = match c {
-            '#' if matches!(self.peek(1), Some('|')) => {
+            '#' if self.peek(1) == Some('|') => {
                self.advance(); // '#'
                self.advance(); // '|'

                let mut comment = String::new();
                while let Some(c) = self.advance() {
-                    match c {
-                        '|' if matches!(self.peek(0), Some('#')) => {
+                    if c == '|' && matches!(self.peek(0), Some('#')) {
                        self.advance(); // '#'
                        break;
                    }
-                        c => {
+
                    comment.push(c);
                }
-                    }
-                }

                TokenKind::BlockComment(comment.trim().into())
            }
            ';' => {
+                // Line comments may start with any number of semicolons, so consume however
+                // many are present at the beginning of the comment:
                while self.current().is_some_and(|c| c == ';') {
                    self.advance();
                }
@ -183,10 +185,10 @@ impl<'lexer> Lexer<'lexer> {
            '}' => self.advance_and(TokenKind::CloseBrace),
            '[' => self.advance_and(TokenKind::OpenBracket),
            ']' => self.advance_and(TokenKind::CloseBracket),
-
-            '#' if matches!(self.peek(1), Some('{')) => {
+            '#' if self.peek(1) == Some('{') => {
                self.advance(); // '#'
                self.advance(); // '{'
+
                TokenKind::OpenHashBrace
            }

@ -281,6 +283,7 @@ impl<'lexer> Lexer<'lexer> {
            }
            ':' => {
                self.advance();
+
                TokenKind::Keyword(Symbol(self.read_word()))
            }
            _ => {
@ -354,16 +357,13 @@ mod tests {
        Ok(TokenKind::CloseBrace),
    ]);

-    test!(hashmap: "(foo #{:bar 0 :baz 1})", [
-        Ok(TokenKind::OpenParen),
-        Ok(TokenKind::Symbol(Symbol::from("foo"))),
+    test!(hashmap: "#{:bar 0 :baz 1}", [
        Ok(TokenKind::OpenHashBrace),
        Ok(TokenKind::Keyword(Symbol::from("bar"))),
        Ok(TokenKind::Number(0.0)),
        Ok(TokenKind::Keyword(Symbol::from("baz"))),
        Ok(TokenKind::Number(1.0)),
        Ok(TokenKind::CloseBrace),
-        Ok(TokenKind::CloseParen),
    ]);

    test!(vector: "[0 1 2]", [
@ -383,7 +383,7 @@ mod tests {
        Err(LexerErrorKind::UnclosedChar),
    ]);

-    test!(lex: "(+ 14 25.5 333 (* 2 5))", [
+    test!(nested_lists: "(+ 14 25.5 333 (* 2 5))", [
        Ok(TokenKind::OpenParen),
        Ok(TokenKind::Symbol(Symbol::from("+"))),
        Ok(TokenKind::Number(14.0)),
@ -421,6 +421,14 @@ mod tests {
        Ok(TokenKind::CloseParen),
    ]);

+    test!(error_unclosed_char_escape: r"'\", [
+        Err(LexerErrorKind::UnclosedChar),
+    ]);
+
+    test!(error_unclosed_char_empty: r"'", [
+        Err(LexerErrorKind::UnclosedChar),
+    ]);
+
    test!(error_parse_numbers: "2 55 3.144 0.0001 1.1.1", [
        Ok(TokenKind::Number(2.0)),
        Ok(TokenKind::Number(55.0)),
--- a/onihime/src/lexer/symbol.rs
+++ b/onihime/src/lexer/symbol.rs
@ -12,20 +12,9 @@ impl Symbol {
    }
 }

+#[cfg(not(tarpaulin_include))]
 impl std::fmt::Display for Symbol {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.0)
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn display() {
-        assert_eq!(Symbol::from("foo").to_string(), "foo");
-        assert_eq!(Symbol::from("+").to_string(), "+");
-        assert_eq!(Symbol::from("bar0").to_string(), "bar0");
-    }
-}
--- a/onihime/src/lexer/token.rs
+++ b/onihime/src/lexer/token.rs
@ -1,49 +1,55 @@
 use super::Symbol;
 use crate::span::Span;

-/// The type of a [Token].
+/// Possible kinds of a [Token].
 #[derive(Debug, Clone, PartialEq)]
 pub enum TokenKind {
-    /// Block comment, e.g. '#| ... |#'
+    /// Block comment, e.g. `#| ... |#`
    BlockComment(String),
-    /// Line comment, e.g. '; ...'
+    /// Line comment, e.g. `; ...`
    LineComment(String),

-    /// Opening parenthesis, e.g. '('
+    /// Opening parenthesis, e.g. `(`
    OpenParen,
-    /// Closing parenthesis, e.g. ')'
+    /// Closing parenthesis, e.g. `)`
    CloseParen,
-    /// Opening brace, e.g. '{'
+    /// Opening brace, e.g. `{`
    OpenBrace,
-    /// Closing brace, e.g. '}'
+    /// Closing brace, e.g. `}`
    CloseBrace,
-    /// Opening bracket, e.g. '['
+    /// Opening bracket, e.g. `[`
    OpenBracket,
-    /// Closing bracket, e.g. ']'
+    /// Closing bracket, e.g. `]`
    CloseBracket,
-    /// Opening hash-brace, e.g. '#{'
+    /// Opening hash-brace, e.g. `#{`
    OpenHashBrace,

-    /// Boolean, e.g. 'true', 'false'
+    /// Boolean, e.g. `true`, `false`
    Bool(bool),
-    /// Character, e.g. 'c', '\n'
+    /// Character, e.g. `'c'`, `'\n'`
    Char(char),
-    /// Number, e.g. '1', '2.0', '0.003'
+    /// Number, e.g. `1`, `2.0`, `0.003`
    Number(f64),
-    /// String, e.g. '"foo bar"'
+    /// String, e.g. `"foo bar"`
    String(String),
-    /// Keyword, e.g. ':baz'
+    /// Keyword, e.g. `:baz`
    Keyword(Symbol),
-    /// Symbol, e.g. 'qux', '+'
+    /// Symbol, e.g. `qux`, `+`
    Symbol(Symbol),
-    /// Nil, e.g. 'nil'
+    /// Nil, e.g. `nil`
    Nil,
 }

-/// A token with a start and end location.
+/// A token encountered during lexical analysis.
+///
+/// `Token`s contain the kind of token which was found, as well as a [Span]
+/// specifying the [Source] and [Location] of the token.
+///
+/// [Source]: crate::span::Source
+/// [Location]: crate::span::Location
 #[derive(Debug, Clone, PartialEq)]
 pub struct Token {
-    /// The type of token.
+    /// The kind of token.
    pub kind: TokenKind,
    /// The span in which the token occurs.
    pub span: Span,