Lexer is starting to look pretty okay

Jesse Braham 2024-12-06 18:27:45 +01:00
parent 11917bb183
commit 4cdbccbc8a
4 changed files with 61 additions and 52 deletions

View File

@@ -1,13 +1,13 @@
 use crate::span::Span;

-/// Errors that can occur during lexical analysis.
+/// Kinds of errors that may occur during lexical analysis.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub enum LexerErrorKind {
     /// An invalid escape sequence was encountered.
     InvalidEscape(char),
-    /// An invalid number was encountered.
+    /// An invalid numeric literal was encountered.
     InvalidNumber(String),
-    /// An invalid string was encountered.
+    /// An invalid string literal was encountered.
     InvalidString,
     /// An unclosed character literal was encountered.
     UnclosedChar,
@@ -15,10 +15,16 @@ pub enum LexerErrorKind {
     UnclosedString,
 }

-/// Lexer error, with a start and end location.
+/// An error which occurred during lexical analysis.
+///
+/// `LexerError`s contain the kind of error which occurred, as well as a [Span]
+/// specifying the [Source] and [Location] of the error.
+///
+/// [Source]: crate::span::Source
+/// [Location]: crate::span::Location
 #[derive(Debug, Clone, PartialEq, Hash)]
 pub struct LexerError {
-    /// The type of error encountered.
+    /// The kind of error encountered.
     pub kind: LexerErrorKind,
     /// The span in which the error occurred.
     pub span: Span,
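For reference, a minimal sketch of how a caller might turn these error kinds into messages; the `describe` helper and its wording are illustrative only and are not part of this commit:

// Illustrative helper (not from the crate): maps each LexerErrorKind, as
// defined above, to a human-readable message.
fn describe(kind: &LexerErrorKind) -> String {
    match kind {
        LexerErrorKind::InvalidEscape(c) => format!("invalid escape sequence '\\{c}'"),
        LexerErrorKind::InvalidNumber(n) => format!("invalid numeric literal '{n}'"),
        LexerErrorKind::InvalidString => "invalid string literal".to_string(),
        LexerErrorKind::UnclosedChar => "unclosed character literal".to_string(),
        LexerErrorKind::UnclosedString => "unclosed string literal".to_string(),
    }
}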

View File

@@ -94,6 +94,7 @@ impl<'lexer> Lexer<'lexer> {

     /// Advance the lexer by one character, and then return the specified
     /// `TokenKind`:
+    #[must_use]
     fn advance_and(&mut self, kind: TokenKind) -> TokenKind {
         self.advance();

@@ -129,6 +130,8 @@ impl<'lexer> Lexer<'lexer> {

     /// Read the next token from the input.
     pub fn read(&mut self) -> Result<Option<Token>, LexerError> {
+        // Eat whitespace until we encounter a meaningful character, or simply return if
+        // we have reached the end of input and no additional characters can be read:
         let c = loop {
             match self.current() {
                 Some(c) if c.is_ascii_whitespace() => {
@@ -141,26 +144,25 @@ impl<'lexer> Lexer<'lexer> {
         let mut span = self.span();
         let kind = match c {
-            '#' if matches!(self.peek(1), Some('|')) => {
+            '#' if self.peek(1) == Some('|') => {
                 self.advance(); // '#'
                 self.advance(); // '|'

                 let mut comment = String::new();

                 while let Some(c) = self.advance() {
-                    match c {
-                        '|' if matches!(self.peek(0), Some('#')) => {
-                            self.advance(); // '#'
-                            break;
-                        }
-                        c => {
-                            comment.push(c);
-                        }
-                    }
+                    if c == '|' && matches!(self.peek(0), Some('#')) {
+                        self.advance(); // '#'
+                        break;
+                    }
+
+                    comment.push(c);
                 }

                 TokenKind::BlockComment(comment.trim().into())
             }
             ';' => {
+                // Line comments may start with any number of semicolons, so consume however
+                // many are present at the beginning of the comment:
                 while self.current().is_some_and(|c| c == ';') {
                     self.advance();
                 }
@@ -183,10 +185,10 @@ impl<'lexer> Lexer<'lexer> {
             '}' => self.advance_and(TokenKind::CloseBrace),
             '[' => self.advance_and(TokenKind::OpenBracket),
             ']' => self.advance_and(TokenKind::CloseBracket),
-            '#' if matches!(self.peek(1), Some('{')) => {
+            '#' if self.peek(1) == Some('{') => {
                 self.advance(); // '#'
                 self.advance(); // '{'

                 TokenKind::OpenHashBrace
             }
@@ -281,6 +283,7 @@ impl<'lexer> Lexer<'lexer> {
             }
             ':' => {
                 self.advance();
+
                 TokenKind::Keyword(Symbol(self.read_word()))
             }
             _ => {
@@ -354,16 +357,13 @@ mod tests {
         Ok(TokenKind::CloseBrace),
     ]);

-    test!(hashmap: "(foo #{:bar 0 :baz 1})", [
-        Ok(TokenKind::OpenParen),
-        Ok(TokenKind::Symbol(Symbol::from("foo"))),
+    test!(hashmap: "#{:bar 0 :baz 1}", [
         Ok(TokenKind::OpenHashBrace),
         Ok(TokenKind::Keyword(Symbol::from("bar"))),
         Ok(TokenKind::Number(0.0)),
         Ok(TokenKind::Keyword(Symbol::from("baz"))),
         Ok(TokenKind::Number(1.0)),
         Ok(TokenKind::CloseBrace),
-        Ok(TokenKind::CloseParen),
     ]);

     test!(vector: "[0 1 2]", [
@@ -383,7 +383,7 @@ mod tests {
         Err(LexerErrorKind::UnclosedChar),
     ]);

-    test!(lex: "(+ 14 25.5 333 (* 2 5))", [
+    test!(nested_lists: "(+ 14 25.5 333 (* 2 5))", [
         Ok(TokenKind::OpenParen),
         Ok(TokenKind::Symbol(Symbol::from("+"))),
         Ok(TokenKind::Number(14.0)),
@@ -421,6 +421,14 @@ mod tests {
         Ok(TokenKind::CloseParen),
     ]);

+    test!(error_unclosed_char_escape: r"'\", [
+        Err(LexerErrorKind::UnclosedChar),
+    ]);
+
+    test!(error_unclosed_char_empty: r"'", [
+        Err(LexerErrorKind::UnclosedChar),
+    ]);
+
     test!(error_parse_numbers: "2 55 3.144 0.0001 1.1.1", [
         Ok(TokenKind::Number(2.0)),
         Ok(TokenKind::Number(55.0)),
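Since `read` returns `Result<Option<Token>, LexerError>` (with `Ok(None)` once the end of input is reached), a driver loop might look roughly like the sketch below; `Lexer::new` is assumed here and does not appear in this diff:

// Rough sketch of driving the lexer to completion. `Lexer::new(source)` is an
// assumed constructor; only `read` is shown in the diff above.
fn tokenize(source: &str) -> Result<Vec<Token>, LexerError> {
    let mut lexer = Lexer::new(source);
    let mut tokens = Vec::new();

    // `read` yields Ok(Some(token)) per token and Ok(None) at end of input.
    while let Some(token) = lexer.read()? {
        tokens.push(token);
    }

    Ok(tokens)
}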

View File

@@ -12,20 +12,9 @@ impl Symbol {
     }
 }

+#[cfg(not(tarpaulin_include))]
 impl std::fmt::Display for Symbol {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "{}", self.0)
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn display() {
-        assert_eq!(Symbol::from("foo").to_string(), "foo");
-        assert_eq!(Symbol::from("+").to_string(), "+");
-        assert_eq!(Symbol::from("bar0").to_string(), "bar0");
-    }
-}
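The removed unit test only exercised the trivial `Display` forwarding, which is now excluded from tarpaulin's coverage via `#[cfg(not(tarpaulin_include))]` instead. The behaviour itself is unchanged, roughly:

// Display forwards to the wrapped string, so formatting a Symbol yields its
// name unchanged (this mirrors the assertions removed above).
fn demo_symbol_display() {
    let sym = Symbol::from("foo");
    assert_eq!(sym.to_string(), "foo");
    assert_eq!(format!("{sym}"), "foo");
}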

View File

@@ -1,49 +1,55 @@
 use super::Symbol;
 use crate::span::Span;

-/// The type of a [Token].
+/// Possible kinds of a [Token].
 #[derive(Debug, Clone, PartialEq)]
 pub enum TokenKind {
-    /// Block comment, e.g. '#| ... |#'
+    /// Block comment, e.g. `#| ... |#`
     BlockComment(String),
-    /// Line comment, e.g. '; ...'
+    /// Line comment, e.g. `; ...`
     LineComment(String),
-    /// Opening parenthesis, e.g. '('
+    /// Opening parenthesis, e.g. `(`
     OpenParen,
-    /// Closing parenthesis, e.g. ')'
+    /// Closing parenthesis, e.g. `)`
     CloseParen,
-    /// Opening brace, e.g. '{'
+    /// Opening brace, e.g. `{`
     OpenBrace,
-    /// Closing brace, e.g. '}'
+    /// Closing brace, e.g. `}`
     CloseBrace,
-    /// Opening bracket, e.g. '['
+    /// Opening bracket, e.g. `[`
     OpenBracket,
-    /// Closing bracket, e.g. ']'
+    /// Closing bracket, e.g. `]`
     CloseBracket,
-    /// Opening hash-brace, e.g. '#{'
+    /// Opening hash-brace, e.g. `#{`
     OpenHashBrace,
-    /// Boolean, e.g. 'true', 'false'
+    /// Boolean, e.g. `true`, `false`
     Bool(bool),
-    /// Character, e.g. 'c', '\n'
+    /// Character, e.g. `'c'`, `'\n'`
     Char(char),
-    /// Number, e.g. '1', '2.0', '0.003'
+    /// Number, e.g. `1`, `2.0`, `0.003`
     Number(f64),
-    /// String, e.g. '"foo bar"'
+    /// String, e.g. `"foo bar"`
     String(String),
-    /// Keyword, e.g. ':baz'
+    /// Keyword, e.g. `:baz`
     Keyword(Symbol),
-    /// Symbol, e.g. 'qux', '+'
+    /// Symbol, e.g. `qux`, `+`
     Symbol(Symbol),
-    /// Nil, e.g. 'nil'
+    /// Nil, e.g. `nil`
     Nil,
 }

-/// A token with a start and end location.
+/// A token encountered during lexical analysis.
+///
+/// `Token`s contain the kind of token which was found, as well as a [Span]
+/// specifying the [Source] and [Location] of the token.
+///
+/// [Source]: crate::span::Source
+/// [Location]: crate::span::Location
 #[derive(Debug, Clone, PartialEq)]
 pub struct Token {
-    /// The type of token.
+    /// The kind of token.
     pub kind: TokenKind,
     /// The span in which the token occurs.
     pub span: Span,
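As a rough illustration of how downstream code might consume these token kinds, here is a hypothetical helper (not part of the crate) that checks whether a token opens a collection form:

// Hypothetical helper: true for any TokenKind that opens a collection form.
fn opens_collection(kind: &TokenKind) -> bool {
    matches!(
        kind,
        TokenKind::OpenParen
            | TokenKind::OpenBrace
            | TokenKind::OpenBracket
            | TokenKind::OpenHashBrace
    )
}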