Lexer is starting to look pretty okay

This commit is contained in:
Jesse Braham 2024-12-06 18:27:45 +01:00
parent 11917bb183
commit 4cdbccbc8a
4 changed files with 61 additions and 52 deletions

View File

@ -1,13 +1,13 @@
use crate::span::Span;
/// Kinds of errors that may occur during lexical analysis.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LexerErrorKind {
/// An invalid escape sequence was encountered.
InvalidEscape(char),
/// An invalid numeric literal was encountered.
InvalidNumber(String),
/// An invalid string literal was encountered.
InvalidString,
/// An unclosed character literal was encountered.
UnclosedChar,
@ -15,10 +15,16 @@ pub enum LexerErrorKind {
UnclosedString,
}
/// An error which occurred during lexical analysis.
///
/// `LexerError`s contain the kind of error which occurred, as well as a [Span]
/// specifying the [Source] and [Location] of the error.
///
/// [Source]: crate::span::Source
/// [Location]: crate::span::Location
#[derive(Debug, Clone, PartialEq, Hash)]
pub struct LexerError {
/// The kind of error encountered.
pub kind: LexerErrorKind,
/// The span in which the error occurred.
pub span: Span,

View File

@ -94,6 +94,7 @@ impl<'lexer> Lexer<'lexer> {
/// Advance the lexer by one character, and then return the specified
/// `TokenKind`:
#[must_use]
fn advance_and(&mut self, kind: TokenKind) -> TokenKind {
self.advance();
@ -129,6 +130,8 @@ impl<'lexer> Lexer<'lexer> {
/// Read the next token from the input.
pub fn read(&mut self) -> Result<Option<Token>, LexerError> {
// Eat whitespace until we encounter a meaningful character, or simply return if
// we have reached the end of input and no additional characters can be read:
let c = loop {
match self.current() {
Some(c) if c.is_ascii_whitespace() => {
@ -141,26 +144,25 @@ impl<'lexer> Lexer<'lexer> {
let mut span = self.span();
let kind = match c {
'#' if matches!(self.peek(1), Some('|')) => {
'#' if self.peek(1) == Some('|') => {
self.advance(); // '#'
self.advance(); // '|'
let mut comment = String::new();
while let Some(c) = self.advance() {
match c {
'|' if matches!(self.peek(0), Some('#')) => {
if c == '|' && matches!(self.peek(0), Some('#')) {
self.advance(); // '#'
break;
}
c => {
comment.push(c);
}
}
}
TokenKind::BlockComment(comment.trim().into())
}
';' => {
// Line comments may start with any number of semicolons, so consume however
// many are present at the beginning of the comment:
while self.current().is_some_and(|c| c == ';') {
self.advance();
}
@ -183,10 +185,10 @@ impl<'lexer> Lexer<'lexer> {
'}' => self.advance_and(TokenKind::CloseBrace),
'[' => self.advance_and(TokenKind::OpenBracket),
']' => self.advance_and(TokenKind::CloseBracket),
'#' if matches!(self.peek(1), Some('{')) => {
'#' if self.peek(1) == Some('{') => {
self.advance(); // '#'
self.advance(); // '{'
TokenKind::OpenHashBrace
}
@ -281,6 +283,7 @@ impl<'lexer> Lexer<'lexer> {
}
':' => {
self.advance();
TokenKind::Keyword(Symbol(self.read_word()))
}
_ => {
@ -354,16 +357,13 @@ mod tests {
Ok(TokenKind::CloseBrace),
]);
test!(hashmap: "(foo #{:bar 0 :baz 1})", [
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("foo"))),
test!(hashmap: "#{:bar 0 :baz 1}", [
Ok(TokenKind::OpenHashBrace),
Ok(TokenKind::Keyword(Symbol::from("bar"))),
Ok(TokenKind::Number(0.0)),
Ok(TokenKind::Keyword(Symbol::from("baz"))),
Ok(TokenKind::Number(1.0)),
Ok(TokenKind::CloseBrace),
Ok(TokenKind::CloseParen),
]);
test!(vector: "[0 1 2]", [
@ -383,7 +383,7 @@ mod tests {
Err(LexerErrorKind::UnclosedChar),
]);
test!(lex: "(+ 14 25.5 333 (* 2 5))", [
test!(nested_lists: "(+ 14 25.5 333 (* 2 5))", [
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("+"))),
Ok(TokenKind::Number(14.0)),
@ -421,6 +421,14 @@ mod tests {
Ok(TokenKind::CloseParen),
]);
// Input is an opening quote followed only by a backslash: the literal ends
// before the escape sequence is finished, and the expected result is a single
// `UnclosedChar` error.
test!(error_unclosed_char_escape: r"'\", [
Err(LexerErrorKind::UnclosedChar),
]);
// Input is a lone opening quote with nothing after it; the expected result is
// likewise a single `UnclosedChar` error.
test!(error_unclosed_char_empty: r"'", [
Err(LexerErrorKind::UnclosedChar),
]);
test!(error_parse_numbers: "2 55 3.144 0.0001 1.1.1", [
Ok(TokenKind::Number(2.0)),
Ok(TokenKind::Number(55.0)),

View File

@ -12,20 +12,9 @@ impl Symbol {
}
}
// Compiled only when the `tarpaulin_include` cfg is not set.
#[cfg(not(tarpaulin_include))]
impl std::fmt::Display for Symbol {
    /// Writes the symbol's underlying text.
    ///
    /// Formatting is delegated to the wrapped value's own `Display`
    /// implementation so that any width, fill, alignment, or precision flags
    /// supplied by the caller (e.g. `{:>8}`) are honoured. Going through
    /// `write!(f, "{}", ..)` would build fresh format arguments and silently
    /// drop those flags.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Display::fmt(&self.0, f)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A symbol's `Display` output must be exactly the text it was built from.
    #[test]
    fn display() {
        for name in ["foo", "+", "bar0"] {
            assert_eq!(Symbol::from(name).to_string(), name);
        }
    }
}

View File

@ -1,49 +1,55 @@
use super::Symbol;
use crate::span::Span;
/// Possible kinds of a [Token].
///
/// Variants that carry data hold the already-decoded value of the token rather
/// than its raw source text.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
/// Block comment, e.g. `#| ... |#`
///
/// The payload is the comment text with the `#|` / `|#` delimiters removed and
/// surrounding whitespace trimmed.
BlockComment(String),
/// Line comment, e.g. `; ...`
LineComment(String),
/// Opening parenthesis, e.g. `(`
OpenParen,
/// Closing parenthesis, e.g. `)`
CloseParen,
/// Opening brace, e.g. `{`
OpenBrace,
/// Closing brace, e.g. `}`
CloseBrace,
/// Opening bracket, e.g. `[`
OpenBracket,
/// Closing bracket, e.g. `]`
CloseBracket,
/// Opening hash-brace, e.g. `#{`
OpenHashBrace,
/// Boolean, e.g. `true`, `false`
Bool(bool),
/// Character, e.g. `'c'`, `'\n'`
Char(char),
/// Number, e.g. `1`, `2.0`, `0.003`
///
/// Integer-looking literals are stored as `f64` too (e.g. `14` is `14.0`).
Number(f64),
/// String, e.g. `"foo bar"`
String(String),
/// Keyword, e.g. `:baz`
///
/// The payload does not include the leading `:`.
Keyword(Symbol),
/// Symbol, e.g. `qux`, `+`
Symbol(Symbol),
/// Nil, e.g. `nil`
Nil,
}
/// A token encountered during lexical analysis.
///
/// `Token`s contain the kind of token which was found, as well as a [Span]
/// specifying the [Source] and [Location] of the token.
///
/// [Source]: crate::span::Source
/// [Location]: crate::span::Location
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
/// The kind of token.
pub kind: TokenKind,
/// The span in which the token occurs.
pub span: Span,