Miscellaneous improvements/cleanup/tweaks in the lexer module

This commit is contained in:
Jesse Braham 2024-12-20 20:49:46 +01:00
parent 1b021f1a89
commit b7f28b32f0
4 changed files with 33 additions and 18 deletions

View File

@ -46,8 +46,8 @@ impl std::fmt::Display for LexerError {
use LexerErrorKind::*;
match &self.kind {
InvalidEscape(c) => write!(f, "Unknown escape sequence '\\{c}' in string"),
InvalidNumber(n) => write!(f, "`{n}` is not a valid numeric literal"),
InvalidEscape(c) => write!(f, "Invalid escape sequence '\\{c}'"),
InvalidNumber(n) => write!(f, "Invalid numeric literal `{n}`"),
InvalidString => write!(f, "Invalid string literal"),
UnclosedChar => write!(f, "Unclosed character literal"),
UnclosedString => write!(f, "Unclosed string literal"),

View File

@ -1,6 +1,6 @@
use std::{str::Chars, sync::Arc};
pub use self::{
pub(crate) use self::{
error::{LexerError, LexerErrorKind},
symbol::Symbol,
token::{Token, TokenKind},
@ -30,7 +30,7 @@ pub(crate) struct Lexer<'lexer> {
impl<'lexer> Lexer<'lexer> {
/// Create a new lexer instance from a string.
#[must_use]
pub fn new(input: &'lexer str) -> Self {
pub(crate) fn new(input: &'lexer str) -> Self {
let source = Arc::new(Source::new(None, input.to_string()));
Self {
@ -41,20 +41,21 @@ impl<'lexer> Lexer<'lexer> {
}
/// Set the name of the [Source] being lexically analyzed.
pub fn set_name(&mut self, name: String) {
pub(crate) fn set_name(&mut self, name: String) {
// TODO: Avoid unwrapping here (if possible?)
Arc::get_mut(&mut self.source).unwrap().set_name(name);
}
/// The source being lexically analyzed.
#[cfg(test)]
#[must_use]
pub fn source(&self) -> Arc<Source> {
pub(crate) fn source(&self) -> Arc<Source> {
self.source.clone()
}
/// Get the current position of the lexer.
#[must_use]
pub fn span(&self) -> Span {
pub(crate) fn span(&self) -> Span {
Span::new(self.byte..self.byte, self.source.clone())
}
@ -103,7 +104,8 @@ impl<'lexer> Lexer<'lexer> {
}
/// Read the next token from the input.
pub fn read(&mut self) -> Result<Option<Token>, LexerError> {
#[must_use]
pub(crate) fn read(&mut self) -> Result<Option<Token>, LexerError> {
// Eat whitespace until we encounter a meaningful character, or simply return if
// we have reached the end of input and no additional characters can be read:
let c = loop {
@ -178,6 +180,7 @@ impl<'lexer> Lexer<'lexer> {
Ok(Some(Token::new(kind, span)))
}
#[must_use]
fn line_comment(&mut self) -> TokenKind {
// Line comments may start with any number of semicolons, so consume however
// many are present at the beginning of the comment:
@ -198,6 +201,7 @@ impl<'lexer> Lexer<'lexer> {
TokenKind::LineComment(comment.trim().into())
}
#[must_use]
fn block_comment(&mut self) -> TokenKind {
// TODO: This currently allows for unclosed block comments; do we care?
self.advance(); // '#'
@ -216,6 +220,7 @@ impl<'lexer> Lexer<'lexer> {
TokenKind::BlockComment(comment.trim().into())
}
#[must_use]
fn float_literal(&self, word: String, span: Span) -> Result<TokenKind, LexerError> {
let float = word.parse().map_err(|_| {
LexerError::new(LexerErrorKind::InvalidNumber(word), span.join(&self.span()))
@ -224,6 +229,7 @@ impl<'lexer> Lexer<'lexer> {
Ok(TokenKind::Float(float))
}
#[must_use]
fn integer_literal(
&self,
word: String,
@ -244,6 +250,7 @@ impl<'lexer> Lexer<'lexer> {
Ok(TokenKind::Integer(integer))
}
#[must_use]
fn numeric_literal(&mut self, span: Span) -> Result<TokenKind, LexerError> {
let word = self.read_word();
@ -256,6 +263,7 @@ impl<'lexer> Lexer<'lexer> {
Ok(kind)
}
#[must_use]
fn char_literal(&mut self, span: Span) -> Result<TokenKind, LexerError> {
self.advance(); // '\''
@ -300,6 +308,7 @@ impl<'lexer> Lexer<'lexer> {
Ok(TokenKind::Char(c))
}
#[must_use]
fn string_literal(&mut self, span: Span) -> Result<TokenKind, LexerError> {
self.advance(); // '"'
@ -436,9 +445,10 @@ mod tests {
Ok(TokenKind::Integer(255)),
]);
test!(error_parse_number: "1.1.1 0.x", [
test!(error_parse_number: "1.1.1 0.x 7b", [
Err(LexerErrorKind::InvalidNumber("1.1.1".into())),
Err(LexerErrorKind::InvalidNumber("0.x".into())),
Err(LexerErrorKind::InvalidNumber("7b".into())),
]);
test!(char_literal: r"'x' '\n' '\r' '\t' '\e' '\\' '\q' 'b", [
@ -479,6 +489,10 @@ mod tests {
Err(LexerErrorKind::UnclosedString),
]);
test!(error_escape_unclosed_string: "\"oops\\", [
Err(LexerErrorKind::UnclosedString),
]);
test!(error_invalid_string: "\"hiii\"222", [
Err(LexerErrorKind::InvalidString),
]);

View File

@ -1,10 +1,11 @@
/// A symbol used to identify a function or variable.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Symbol(pub String);
#[repr(transparent)]
pub(crate) struct Symbol(String);
impl Symbol {
/// Create a new `Symbol` from a string.
pub fn from<S>(s: S) -> Self
pub(crate) fn from<S>(s: S) -> Self
where
S: Into<String>,
{

View File

@ -3,7 +3,7 @@ use crate::span::Span;
/// Possible kinds of a [Token].
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
pub(crate) enum TokenKind {
/// Block comment, e.g. `#| ... |#`
BlockComment(String),
/// Line comment, e.g. `; ...`
@ -32,10 +32,10 @@ pub enum TokenKind {
Float(f64),
/// Integer, e.g. `0`, `-1`, `+200`
Integer(i64),
/// String, e.g. `"foo bar"`
String(String),
/// Keyword, e.g. `:baz`
Keyword(Symbol),
/// String, e.g. `"foo bar"`
String(String),
/// Symbol, e.g. `qux`, `+`
Symbol(Symbol),
/// Nil, e.g. `nil`
@ -50,17 +50,17 @@ pub enum TokenKind {
/// [Source]: crate::span::Source
/// [Location]: crate::span::Location
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
pub(crate) struct Token {
/// The kind of token.
pub kind: TokenKind,
pub(crate) kind: TokenKind,
/// The span in which the token occurs.
pub span: Span,
pub(crate) span: Span,
}
impl Token {
/// Construct a new instance of `Token`.
#[must_use]
pub const fn new(kind: TokenKind, span: Span) -> Self {
pub(crate) const fn new(kind: TokenKind, span: Span) -> Self {
Self { kind, span }
}
}