Miscellaneous improvements/cleanup/tweaks in the lexer module

Jesse Braham 2024-12-20 20:49:46 +01:00
parent 1b021f1a89
commit b7f28b32f0
4 changed files with 33 additions and 18 deletions

View File

@@ -46,8 +46,8 @@ impl std::fmt::Display for LexerError {
         use LexerErrorKind::*;
         match &self.kind {
-            InvalidEscape(c) => write!(f, "Unknown escape sequence '\\{c}' in string"),
-            InvalidNumber(n) => write!(f, "`{n}` is not a valid numeric literal"),
+            InvalidEscape(c) => write!(f, "Invalid escape sequence '\\{c}'"),
+            InvalidNumber(n) => write!(f, "Invalid numeric literal `{n}`"),
             InvalidString => write!(f, "Invalid string literal"),
             UnclosedChar => write!(f, "Unclosed character literal"),
             UnclosedString => write!(f, "Unclosed string literal"),
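Note: the two reworded arms now lead with "Invalid ..." like the surrounding messages. A minimal standalone sketch of how the new wording renders, using hypothetical helper functions that mirror the changed arms rather than the crate's actual types:

fn invalid_escape(c: char) -> String {
    // Mirrors the new `InvalidEscape` arm; `\\` renders as a single backslash.
    format!("Invalid escape sequence '\\{c}'")
}

fn invalid_number(n: &str) -> String {
    // Mirrors the new `InvalidNumber` arm; backticks quote the offending word.
    format!("Invalid numeric literal `{n}`")
}

fn main() {
    assert_eq!(invalid_escape('q'), r"Invalid escape sequence '\q'");
    assert_eq!(invalid_number("1.1.1"), "Invalid numeric literal `1.1.1`");
}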

View File

@@ -1,6 +1,6 @@
 use std::{str::Chars, sync::Arc};
 
-pub use self::{
+pub(crate) use self::{
     error::{LexerError, LexerErrorKind},
     symbol::Symbol,
     token::{Token, TokenKind},
@@ -30,7 +30,7 @@ pub(crate) struct Lexer<'lexer> {
 impl<'lexer> Lexer<'lexer> {
     /// Create a new lexer instance from a string.
     #[must_use]
-    pub fn new(input: &'lexer str) -> Self {
+    pub(crate) fn new(input: &'lexer str) -> Self {
         let source = Arc::new(Source::new(None, input.to_string()));
 
         Self {
@@ -41,20 +41,21 @@ impl<'lexer> Lexer<'lexer> {
     }
 
     /// Set the name of the [Source] being lexically analyzed.
-    pub fn set_name(&mut self, name: String) {
+    pub(crate) fn set_name(&mut self, name: String) {
         // TODO: Avoid unwrapping here (if possible?)
         Arc::get_mut(&mut self.source).unwrap().set_name(name);
     }
 
     /// The source being lexically analyzed.
+    #[cfg(test)]
     #[must_use]
-    pub fn source(&self) -> Arc<Source> {
+    pub(crate) fn source(&self) -> Arc<Source> {
         self.source.clone()
     }
 
     /// Get the current position of the lexer.
     #[must_use]
-    pub fn span(&self) -> Span {
+    pub(crate) fn span(&self) -> Span {
         Span::new(self.byte..self.byte, self.source.clone())
     }
@@ -103,7 +104,8 @@ impl<'lexer> Lexer<'lexer> {
     }
 
     /// Read the next token from the input.
-    pub fn read(&mut self) -> Result<Option<Token>, LexerError> {
+    #[must_use]
+    pub(crate) fn read(&mut self) -> Result<Option<Token>, LexerError> {
         // Eat whitespace until we encounter a meaningful character, or simply return if
         // we have reached the end of input and no additional characters can be read:
         let c = loop {
@@ -178,6 +180,7 @@ impl<'lexer> Lexer<'lexer> {
         Ok(Some(Token::new(kind, span)))
     }
 
+    #[must_use]
     fn line_comment(&mut self) -> TokenKind {
         // Line comments may start with any number of semicolons, so consume however
         // many are present at the beginning of the comment:
@@ -198,6 +201,7 @@ impl<'lexer> Lexer<'lexer> {
         TokenKind::LineComment(comment.trim().into())
     }
 
+    #[must_use]
     fn block_comment(&mut self) -> TokenKind {
         // TODO: This currently allows for unclosed block comments; do we care?
         self.advance(); // '#'
@@ -216,6 +220,7 @@ impl<'lexer> Lexer<'lexer> {
         TokenKind::BlockComment(comment.trim().into())
     }
 
+    #[must_use]
     fn float_literal(&self, word: String, span: Span) -> Result<TokenKind, LexerError> {
         let float = word.parse().map_err(|_| {
             LexerError::new(LexerErrorKind::InvalidNumber(word), span.join(&self.span()))
@@ -224,6 +229,7 @@ impl<'lexer> Lexer<'lexer> {
         Ok(TokenKind::Float(float))
     }
 
+    #[must_use]
     fn integer_literal(
         &self,
         word: String,
@@ -244,6 +250,7 @@ impl<'lexer> Lexer<'lexer> {
         Ok(TokenKind::Integer(integer))
     }
 
+    #[must_use]
     fn numeric_literal(&mut self, span: Span) -> Result<TokenKind, LexerError> {
         let word = self.read_word();
@@ -256,6 +263,7 @@ impl<'lexer> Lexer<'lexer> {
         Ok(kind)
     }
 
+    #[must_use]
     fn char_literal(&mut self, span: Span) -> Result<TokenKind, LexerError> {
         self.advance(); // '\''
@@ -300,6 +308,7 @@ impl<'lexer> Lexer<'lexer> {
         Ok(TokenKind::Char(c))
     }
 
+    #[must_use]
     fn string_literal(&mut self, span: Span) -> Result<TokenKind, LexerError> {
         self.advance(); // '"'
@@ -436,9 +445,10 @@ mod tests {
         Ok(TokenKind::Integer(255)),
     ]);
 
-    test!(error_parse_number: "1.1.1 0.x", [
+    test!(error_parse_number: "1.1.1 0.x 7b", [
         Err(LexerErrorKind::InvalidNumber("1.1.1".into())),
         Err(LexerErrorKind::InvalidNumber("0.x".into())),
+        Err(LexerErrorKind::InvalidNumber("7b".into())),
     ]);
 
     test!(char_literal: r"'x' '\n' '\r' '\t' '\e' '\\' '\q' 'b", [
@@ -479,6 +489,10 @@ mod tests {
         Err(LexerErrorKind::UnclosedString),
     ]);
 
+    test!(error_escape_unclosed_string: "\"oops\\", [
+        Err(LexerErrorKind::UnclosedString),
+    ]);
+
     test!(error_invalid_string: "\"hiii\"222", [
         Err(LexerErrorKind::InvalidString),
     ]);
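Note: with the API now `pub(crate)`, the lexer is only driven from inside the crate. A minimal sketch of such a caller, assuming the module is reachable as `crate::lexer` and re-exports `Lexer`, `Token`, and `LexerError` as shown above (a sketch, not code from this commit):

use crate::lexer::{Lexer, LexerError, Token};

fn tokenize(input: &str) -> Result<Vec<Token>, LexerError> {
    let mut lexer = Lexer::new(input);
    let mut tokens = Vec::new();

    // `read` returns `Ok(None)` once the input is exhausted, so collect until then.
    while let Some(token) = lexer.read()? {
        tokens.push(token);
    }

    Ok(tokens)
}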

View File

@@ -1,10 +1,11 @@
 /// A symbol used to identify a function or variable.
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct Symbol(pub String);
+#[repr(transparent)]
+pub(crate) struct Symbol(String);
 
 impl Symbol {
     /// Create a new `Symbol` from a string.
-    pub fn from<S>(s: S) -> Self
+    pub(crate) fn from<S>(s: S) -> Self
     where
         S: Into<String>,
     {
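Note: with the inner `String` made private (and the struct marked `#[repr(transparent)]`, so it keeps the layout of a plain `String`), crate code constructs symbols through `Symbol::from`, which accepts anything implementing `Into<String>`. A small usage sketch, assuming the `crate::lexer::Symbol` re-export shown earlier:

use crate::lexer::Symbol;

fn example_symbols() -> Vec<Symbol> {
    vec![
        Symbol::from("qux"),             // from a &str
        Symbol::from(String::from("+")), // from an owned String
    ]
}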

View File

@@ -3,7 +3,7 @@ use crate::span::Span;
 /// Possible kinds of a [Token].
 #[derive(Debug, Clone, PartialEq)]
-pub enum TokenKind {
+pub(crate) enum TokenKind {
     /// Block comment, e.g. `#| ... |#`
     BlockComment(String),
     /// Line comment, e.g. `; ...`
@@ -32,10 +32,10 @@ pub enum TokenKind {
     Float(f64),
     /// Integer, e.g. `0`, `-1`, `+200`
     Integer(i64),
-    /// String, e.g. `"foo bar"`
-    String(String),
     /// Keyword, e.g. `:baz`
     Keyword(Symbol),
+    /// String, e.g. `"foo bar"`
+    String(String),
     /// Symbol, e.g. `qux`, `+`
     Symbol(Symbol),
     /// Nil, e.g. `nil`
@@ -50,17 +50,17 @@ pub enum TokenKind {
 /// [Source]: crate::span::Source
 /// [Location]: crate::span::Location
 #[derive(Debug, Clone, PartialEq)]
-pub struct Token {
+pub(crate) struct Token {
     /// The kind of token.
-    pub kind: TokenKind,
+    pub(crate) kind: TokenKind,
     /// The span in which the token occurs.
-    pub span: Span,
+    pub(crate) span: Span,
 }
 
 impl Token {
     /// Construct a new instance of `Token`.
     #[must_use]
-    pub const fn new(kind: TokenKind, span: Span) -> Self {
+    pub(crate) const fn new(kind: TokenKind, span: Span) -> Self {
         Self { kind, span }
     }
 }
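Note: for completeness, a sketch of constructing a `Token` by hand from crate code, assuming the `Source::new(None, text)` and `Span::new(byte_range, source)` constructors used elsewhere in this diff (an illustration, not code from this commit):

use std::sync::Arc;

use crate::{
    lexer::{Token, TokenKind},
    span::{Source, Span},
};

fn example_token() -> Token {
    // A two-byte, unnamed source containing just the literal `42`.
    let source = Arc::new(Source::new(None, "42".to_string()));
    let span = Span::new(0..2, source);

    // `Token::new` is `const` and `#[must_use]`.
    Token::new(TokenKind::Integer(42), span)
}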