From 4cdbccbc8a2f49cc477b56fd0d2e5605f20096b7 Mon Sep 17 00:00:00 2001 From: Jesse Braham Date: Fri, 6 Dec 2024 18:27:45 +0100 Subject: [PATCH] Lexer is starting to look pretty okay --- onihime/src/lexer/error.rs | 16 +++++++++----- onihime/src/lexer/mod.rs | 40 +++++++++++++++++++-------------- onihime/src/lexer/symbol.rs | 13 +---------- onihime/src/lexer/token.rs | 44 +++++++++++++++++++++---------------- 4 files changed, 61 insertions(+), 52 deletions(-) diff --git a/onihime/src/lexer/error.rs b/onihime/src/lexer/error.rs index f50081a..303dbc7 100644 --- a/onihime/src/lexer/error.rs +++ b/onihime/src/lexer/error.rs @@ -1,13 +1,13 @@ use crate::span::Span; -/// Errors that can occur during lexical analysis. +/// Kinds of errors that may occur during lexical analysis. #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum LexerErrorKind { /// An invalid escape sequence was encountered. InvalidEscape(char), - /// An invalid number was encountered. + /// An invalid numeric literal was encountered. InvalidNumber(String), - /// An invalid string was encountered. + /// An invalid string literal was encountered. InvalidString, /// An unclosed character literal was encountered. UnclosedChar, @@ -15,10 +15,16 @@ pub enum LexerErrorKind { UnclosedString, } -/// Lexer error, with a start and end location. +/// An error which occurred during lexical analysis. +/// +/// `LexerError`s contain the kind of error which occurred, as well as a [Span] +/// specifying the [Source] and [Location] of the error. +/// +/// [Source]: crate::span::Source +/// [Location]: crate::span::Location #[derive(Debug, Clone, PartialEq, Hash)] pub struct LexerError { - /// The type of error encountered. + /// The kind of error encountered. pub kind: LexerErrorKind, /// The span in which the error occurred. pub span: Span, diff --git a/onihime/src/lexer/mod.rs b/onihime/src/lexer/mod.rs index c725a91..90d11b0 100644 --- a/onihime/src/lexer/mod.rs +++ b/onihime/src/lexer/mod.rs @@ -94,6 +94,7 @@ impl<'lexer> Lexer<'lexer> { /// Advance the lexer by one character, and then return the specified /// `TokenKind`: + #[must_use] fn advance_and(&mut self, kind: TokenKind) -> TokenKind { self.advance(); @@ -129,6 +130,8 @@ impl<'lexer> Lexer<'lexer> { /// Read the next token from the input. pub fn read(&mut self) -> Result, LexerError> { + // Eat whitespace until we encounter a meaningful character, or simply return if + // we have reached the end of input and no additional characters can be read: let c = loop { match self.current() { Some(c) if c.is_ascii_whitespace() => { @@ -141,26 +144,25 @@ impl<'lexer> Lexer<'lexer> { let mut span = self.span(); let kind = match c { - '#' if matches!(self.peek(1), Some('|')) => { + '#' if self.peek(1) == Some('|') => { self.advance(); // '#' self.advance(); // '|' let mut comment = String::new(); while let Some(c) = self.advance() { - match c { - '|' if matches!(self.peek(0), Some('#')) => { - self.advance(); // '#' - break; - } - c => { - comment.push(c); - } + if c == '|' && matches!(self.peek(0), Some('#')) { + self.advance(); // '#' + break; } + + comment.push(c); } TokenKind::BlockComment(comment.trim().into()) } ';' => { + // Line comments may start with any number of semicolons, so consume however + // many are present at the beginning of the comment: while self.current().is_some_and(|c| c == ';') { self.advance(); } @@ -183,10 +185,10 @@ impl<'lexer> Lexer<'lexer> { '}' => self.advance_and(TokenKind::CloseBrace), '[' => self.advance_and(TokenKind::OpenBracket), ']' => self.advance_and(TokenKind::CloseBracket), - - '#' if matches!(self.peek(1), Some('{')) => { + '#' if self.peek(1) == Some('{') => { self.advance(); // '#' self.advance(); // '{' + TokenKind::OpenHashBrace } @@ -281,6 +283,7 @@ impl<'lexer> Lexer<'lexer> { } ':' => { self.advance(); + TokenKind::Keyword(Symbol(self.read_word())) } _ => { @@ -354,16 +357,13 @@ mod tests { Ok(TokenKind::CloseBrace), ]); - test!(hashmap: "(foo #{:bar 0 :baz 1})", [ - Ok(TokenKind::OpenParen), - Ok(TokenKind::Symbol(Symbol::from("foo"))), + test!(hashmap: "#{:bar 0 :baz 1}", [ Ok(TokenKind::OpenHashBrace), Ok(TokenKind::Keyword(Symbol::from("bar"))), Ok(TokenKind::Number(0.0)), Ok(TokenKind::Keyword(Symbol::from("baz"))), Ok(TokenKind::Number(1.0)), Ok(TokenKind::CloseBrace), - Ok(TokenKind::CloseParen), ]); test!(vector: "[0 1 2]", [ @@ -383,7 +383,7 @@ mod tests { Err(LexerErrorKind::UnclosedChar), ]); - test!(lex: "(+ 14 25.5 333 (* 2 5))", [ + test!(nested_lists: "(+ 14 25.5 333 (* 2 5))", [ Ok(TokenKind::OpenParen), Ok(TokenKind::Symbol(Symbol::from("+"))), Ok(TokenKind::Number(14.0)), @@ -421,6 +421,14 @@ mod tests { Ok(TokenKind::CloseParen), ]); + test!(error_unclosed_char_escape: r"'\", [ + Err(LexerErrorKind::UnclosedChar), + ]); + + test!(error_unclosed_char_empty: r"'", [ + Err(LexerErrorKind::UnclosedChar), + ]); + test!(error_parse_numbers: "2 55 3.144 0.0001 1.1.1", [ Ok(TokenKind::Number(2.0)), Ok(TokenKind::Number(55.0)), diff --git a/onihime/src/lexer/symbol.rs b/onihime/src/lexer/symbol.rs index 0eac9fa..b33cc5b 100644 --- a/onihime/src/lexer/symbol.rs +++ b/onihime/src/lexer/symbol.rs @@ -12,20 +12,9 @@ impl Symbol { } } +#[cfg(not(tarpaulin_include))] impl std::fmt::Display for Symbol { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn display() { - assert_eq!(Symbol::from("foo").to_string(), "foo"); - assert_eq!(Symbol::from("+").to_string(), "+"); - assert_eq!(Symbol::from("bar0").to_string(), "bar0"); - } -} diff --git a/onihime/src/lexer/token.rs b/onihime/src/lexer/token.rs index 6ebc0ee..1263c14 100644 --- a/onihime/src/lexer/token.rs +++ b/onihime/src/lexer/token.rs @@ -1,49 +1,55 @@ use super::Symbol; use crate::span::Span; -/// The type of a [Token]. +/// Possible kinds of a [Token]. #[derive(Debug, Clone, PartialEq)] pub enum TokenKind { - /// Block comment, e.g. '#| ... |#' + /// Block comment, e.g. `#| ... |#` BlockComment(String), - /// Line comment, e.g. '; ...' + /// Line comment, e.g. `; ...` LineComment(String), - /// Opening parenthesis, e.g. '(' + /// Opening parenthesis, e.g. `(` OpenParen, - /// Closing parenthesis, e.g. ')' + /// Closing parenthesis, e.g. `)` CloseParen, - /// Opening brace, e.g. '{' + /// Opening brace, e.g. `{` OpenBrace, - /// Closing brace, e.g. '}' + /// Closing brace, e.g. `}` CloseBrace, - /// Opening bracket, e.g. '[' + /// Opening bracket, e.g. `[` OpenBracket, - /// Closing bracket, e.g. ']' + /// Closing bracket, e.g. `]` CloseBracket, - /// Opening hash-brace, e.g. '#{' + /// Opening hash-brace, e.g. `#{` OpenHashBrace, - /// Boolean, e.g. 'true', 'false' + /// Boolean, e.g. `true`, `false` Bool(bool), - /// Character, e.g. 'c', '\n' + /// Character, e.g. `'c'`, `'\n'` Char(char), - /// Number, e.g. '1', '2.0', '0.003' + /// Number, e.g. `1`, `2.0`, `0.003` Number(f64), - /// String, e.g. '"foo bar"' + /// String, e.g. `"foo bar"` String(String), - /// Keyword, e.g. ':baz' + /// Keyword, e.g. `:baz` Keyword(Symbol), - /// Symbol, e.g. 'qux', '+' + /// Symbol, e.g. `qux`, `+` Symbol(Symbol), - /// Nil, e.g. 'nil' + /// Nil, e.g. `nil` Nil, } -/// A token with a start and end location. +/// A token encountered during lexical analysis. +/// +/// `Token`s contain the kind of token which was found, as well as a [Span] +/// specifying the [Source] and [Location] of the token. +/// +/// [Source]: crate::span::Source +/// [Location]: crate::span::Location #[derive(Debug, Clone, PartialEq)] pub struct Token { - /// The type of token. + /// The kind of token. pub kind: TokenKind, /// The span in which the token occurs. pub span: Span,