From b7f28b32f030f015f990b5e71fbf808cc100224e Mon Sep 17 00:00:00 2001 From: Jesse Braham Date: Fri, 20 Dec 2024 20:49:46 +0100 Subject: [PATCH] Miscellaneous improvements/cleanup/tweaks in the lexer module --- onihime/src/lexer/error.rs | 4 ++-- onihime/src/lexer/mod.rs | 28 +++++++++++++++++++++------- onihime/src/lexer/symbol.rs | 5 +++-- onihime/src/lexer/token.rs | 14 +++++++------- 4 files changed, 33 insertions(+), 18 deletions(-) diff --git a/onihime/src/lexer/error.rs b/onihime/src/lexer/error.rs index 303dbc7..6d8c9aa 100644 --- a/onihime/src/lexer/error.rs +++ b/onihime/src/lexer/error.rs @@ -46,8 +46,8 @@ impl std::fmt::Display for LexerError { use LexerErrorKind::*; match &self.kind { - InvalidEscape(c) => write!(f, "Unknown escape sequence '\\{c}' in string"), - InvalidNumber(n) => write!(f, "`{n}` is not a valid numeric literal"), + InvalidEscape(c) => write!(f, "Invalid escape sequence '\\{c}'"), + InvalidNumber(n) => write!(f, "Invalid numeric literal `{n}`"), InvalidString => write!(f, "Invalid string literal"), UnclosedChar => write!(f, "Unclosed character literal"), UnclosedString => write!(f, "Unclosed string literal"), diff --git a/onihime/src/lexer/mod.rs b/onihime/src/lexer/mod.rs index f423401..b0e808c 100644 --- a/onihime/src/lexer/mod.rs +++ b/onihime/src/lexer/mod.rs @@ -1,6 +1,6 @@ use std::{str::Chars, sync::Arc}; -pub use self::{ +pub(crate) use self::{ error::{LexerError, LexerErrorKind}, symbol::Symbol, token::{Token, TokenKind}, @@ -30,7 +30,7 @@ pub(crate) struct Lexer<'lexer> { impl<'lexer> Lexer<'lexer> { /// Create a new lexer instance from a string. #[must_use] - pub fn new(input: &'lexer str) -> Self { + pub(crate) fn new(input: &'lexer str) -> Self { let source = Arc::new(Source::new(None, input.to_string())); Self { @@ -41,20 +41,21 @@ impl<'lexer> Lexer<'lexer> { } /// Set the name of the [Source] being lexically analyzed. 
- pub fn set_name(&mut self, name: String) { + pub(crate) fn set_name(&mut self, name: String) { // TODO: Avoid unwrapping here (if possible?) Arc::get_mut(&mut self.source).unwrap().set_name(name); } /// The source being lexically analyzed. + #[cfg(test)] #[must_use] - pub fn source(&self) -> Arc { + pub(crate) fn source(&self) -> Arc { self.source.clone() } /// Get the current position of the lexer. #[must_use] - pub fn span(&self) -> Span { + pub(crate) fn span(&self) -> Span { Span::new(self.byte..self.byte, self.source.clone()) } @@ -103,7 +104,8 @@ impl<'lexer> Lexer<'lexer> { } /// Read the next token from the input. - pub fn read(&mut self) -> Result, LexerError> { + #[must_use] + pub(crate) fn read(&mut self) -> Result, LexerError> { // Eat whitespace until we encounter a meaningful character, or simply return if // we have reached the end of input and no additional characters can be read: let c = loop { @@ -178,6 +180,7 @@ impl<'lexer> Lexer<'lexer> { Ok(Some(Token::new(kind, span))) } + #[must_use] fn line_comment(&mut self) -> TokenKind { // Line comments may start with any number of semicolons, so consume however // many are present at the beginning of the comment: @@ -198,6 +201,7 @@ impl<'lexer> Lexer<'lexer> { TokenKind::LineComment(comment.trim().into()) } + #[must_use] fn block_comment(&mut self) -> TokenKind { // TODO: This currently allows for unclosed block comments; do we care? 
self.advance(); // '#' @@ -216,6 +220,7 @@ impl<'lexer> Lexer<'lexer> { TokenKind::BlockComment(comment.trim().into()) } + #[must_use] fn float_literal(&self, word: String, span: Span) -> Result { let float = word.parse().map_err(|_| { LexerError::new(LexerErrorKind::InvalidNumber(word), span.join(&self.span())) @@ -224,6 +229,7 @@ impl<'lexer> Lexer<'lexer> { Ok(TokenKind::Float(float)) } + #[must_use] fn integer_literal( &self, word: String, @@ -244,6 +250,7 @@ impl<'lexer> Lexer<'lexer> { Ok(TokenKind::Integer(integer)) } + #[must_use] fn numeric_literal(&mut self, span: Span) -> Result { let word = self.read_word(); @@ -256,6 +263,7 @@ impl<'lexer> Lexer<'lexer> { Ok(kind) } + #[must_use] fn char_literal(&mut self, span: Span) -> Result { self.advance(); // '\'' @@ -300,6 +308,7 @@ impl<'lexer> Lexer<'lexer> { Ok(TokenKind::Char(c)) } + #[must_use] fn string_literal(&mut self, span: Span) -> Result { self.advance(); // '"' @@ -436,9 +445,10 @@ mod tests { Ok(TokenKind::Integer(255)), ]); - test!(error_parse_number: "1.1.1 0.x", [ + test!(error_parse_number: "1.1.1 0.x 7b", [ Err(LexerErrorKind::InvalidNumber("1.1.1".into())), Err(LexerErrorKind::InvalidNumber("0.x".into())), + Err(LexerErrorKind::InvalidNumber("7b".into())), ]); test!(char_literal: r"'x' '\n' '\r' '\t' '\e' '\\' '\q' 'b", [ @@ -479,6 +489,10 @@ mod tests { Err(LexerErrorKind::UnclosedString), ]); + test!(error_escape_unclosed_string: "\"oops\\", [ + Err(LexerErrorKind::UnclosedString), + ]); + test!(error_invalid_string: "\"hiii\"222", [ Err(LexerErrorKind::InvalidString), ]); diff --git a/onihime/src/lexer/symbol.rs b/onihime/src/lexer/symbol.rs index b33cc5b..16c6a3d 100644 --- a/onihime/src/lexer/symbol.rs +++ b/onihime/src/lexer/symbol.rs @@ -1,10 +1,11 @@ /// A symbol used to identify a function or variable. 
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct Symbol(pub String); +#[repr(transparent)] +pub(crate) struct Symbol(String); impl Symbol { /// Create a new `Symbol` from a string. - pub fn from(s: S) -> Self + pub(crate) fn from(s: S) -> Self where S: Into, { diff --git a/onihime/src/lexer/token.rs b/onihime/src/lexer/token.rs index 62c8a54..5c8641e 100644 --- a/onihime/src/lexer/token.rs +++ b/onihime/src/lexer/token.rs @@ -3,7 +3,7 @@ use crate::span::Span; /// Possible kinds of a [Token]. #[derive(Debug, Clone, PartialEq)] -pub enum TokenKind { +pub(crate) enum TokenKind { /// Block comment, e.g. `#| ... |#` BlockComment(String), /// Line comment, e.g. `; ...` @@ -32,10 +32,10 @@ pub enum TokenKind { Float(f64), /// Integer, e.g. `0`, `-1`, `+200` Integer(i64), - /// String, e.g. `"foo bar"` - String(String), /// Keyword, e.g. `:baz` Keyword(Symbol), + /// String, e.g. `"foo bar"` + String(String), /// Symbol, e.g. `qux`, `+` Symbol(Symbol), /// Nil, e.g. `nil` @@ -50,17 +50,17 @@ pub enum TokenKind { /// [Source]: crate::span::Source /// [Location]: crate::span::Location #[derive(Debug, Clone, PartialEq)] -pub struct Token { +pub(crate) struct Token { /// The kind of token. - pub kind: TokenKind, + pub(crate) kind: TokenKind, /// The span in which the token occurs. - pub span: Span, + pub(crate) span: Span, } impl Token { /// Construct a new instance of `Token`. #[must_use] - pub const fn new(kind: TokenKind, span: Span) -> Self { + pub(crate) const fn new(kind: TokenKind, span: Span) -> Self { Self { kind, span } } }