diff --git a/onihime/src/lexer/error.rs b/onihime/src/lexer/error.rs
new file mode 100644
index 0000000..58e6d7c
--- /dev/null
+++ b/onihime/src/lexer/error.rs
@@ -0,0 +1,52 @@
+use crate::span::Span;
+
+/// Errors that can occur during lexical analysis.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum LexerErrorKind {
+    /// An invalid character was encountered.
+    InvalidChar,
+    /// An invalid escape sequence was encountered.
+    InvalidEscape(char),
+    /// An invalid number was encountered.
+    InvalidNumber(String),
+    /// An invalid string was encountered.
+    InvalidString,
+    /// An unclosed character literal was encountered.
+    UnclosedChar,
+    /// An unclosed string literal was encountered.
+    UnclosedString,
+}
+
+/// Lexer error, with a start and end location.
+#[derive(Debug, Clone, PartialEq)]
+pub struct LexerError {
+    /// The type of error encountered.
+    pub kind: LexerErrorKind,
+    /// The span in which the error occurred.
+    pub span: Span,
+}
+
+impl LexerError {
+    /// Construct a new instance of `LexerError`.
+    #[must_use]
+    pub const fn new(kind: LexerErrorKind, span: Span) -> Self {
+        Self { kind, span }
+    }
+}
+
+impl std::error::Error for LexerError {}
+
+impl std::fmt::Display for LexerError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        use LexerErrorKind::*;
+
+        match &self.kind {
+            InvalidChar => write!(f, "Invalid character literal"),
+            InvalidEscape(c) => write!(f, "Unknown escape sequence '\\{c}' in string"),
+            InvalidNumber(n) => write!(f, "`{n}` is not a valid numeric literal"),
+            InvalidString => write!(f, "Invalid string literal"),
+            UnclosedChar => write!(f, "Unclosed character literal"),
+            UnclosedString => write!(f, "Unclosed string literal"),
+        }
+    }
+}
diff --git a/onihime/src/lexer/mod.rs b/onihime/src/lexer/mod.rs
new file mode 100644
index 0000000..b065461
--- /dev/null
+++ b/onihime/src/lexer/mod.rs
@@ -0,0 +1,399 @@
+use std::{
+    str::{Chars, FromStr},
+    sync::Arc,
+};
+
+pub use self::{
+    error::{LexerError, LexerErrorKind},
+    symbol::Symbol,
+    token::{Token, TokenKind},
+};
+use crate::span::{Source, Span};
+
+mod error;
+mod symbol;
+mod token;
+
+/// A trait for checking if a character is a separator.
+pub trait Separator {
+    /// Check if the character is a separator.
+    fn is_separator(&self) -> bool;
+}
+
+impl Separator for char {
+    fn is_separator(&self) -> bool {
+        self.is_ascii_whitespace() || matches!(self, '(' | ')' | '[' | ']' | '{' | '}' | ',')
+    }
+}
+
+/// A lexer, used by the parser.
+#[derive(Debug)]
+pub struct Lexer<'lexer> {
+    input: Chars<'lexer>,
+    byte: usize,
+    source: Arc<Source>,
+}
+
+impl<'lexer> Lexer<'lexer> {
+    /// Create a new lexer instance from a string.
+    #[must_use]
+    pub fn new(input: &'lexer str) -> Self {
+        let source = Arc::new(Source::new(None, input.to_string()));
+
+        Self {
+            input: input.chars(),
+            byte: 0,
+            source,
+        }
+    }
+
+    /// Set the name of the [Source] being lexically analyzed.
+    pub fn set_name(&mut self, name: String) {
+        if let Some(source) = Arc::get_mut(&mut self.source) {
+            source.set_name(name);
+        } else {
+            unimplemented!(); // FIXME: What should we do in this case?
+        }
+    }
+
+    /// The source being lexically analyzed.
+    #[must_use]
+    pub fn source(&self) -> Arc<Source> {
+        self.source.clone()
+    }
+
+    /// Get the current character.
+    #[must_use]
+    pub fn current(&self) -> Option<char> {
+        self.input.as_str().chars().next()
+    }
+
+    /// Get the unparsed input.
+    #[must_use]
+    pub fn get_unparsed(&self) -> &str {
+        self.input.as_str()
+    }
+
+    /// Get the current position of the lexer.
+    #[inline]
+    #[must_use]
+    pub(crate) fn span(&self) -> Span {
+        Span::new(self.byte..self.byte, self.source.clone())
+    }
+
+    /// Peek at the nth character of the remaining input without advancing (`peek(0)` is the current character).
+    fn peek(&self, n: usize) -> Option<char> {
+        self.input.as_str().chars().nth(n)
+    }
+
+    /// Advance the lexer by one character.
+    fn advance(&mut self) -> Option<char> {
+        let c = self.input.next()?;
+        self.byte += c.len_utf8();
+
+        Some(c)
+    }
+
+    /// Read a word from the input until a separator is reached.
+    fn read_word(&mut self) -> String {
+        let mut word = String::new();
+        while let Some(c) = self.current() {
+            if c.is_separator() {
+                break;
+            }
+
+            word.push(c);
+            self.advance();
+        }
+
+        word
+    }
+
+    /// Parse a value from the input or return an error.
+    fn parse_or<T>(&mut self, err: impl Fn(String) -> LexerErrorKind) -> Result<T, LexerError>
+    where
+        T: FromStr,
+    {
+        let span = self.span();
+        let word = self.read_word();
+
+        word.parse()
+            .map_err(|_| LexerError::new(err(word), span.join(&self.span())))
+    }
+
+    /// Read the next token from the input.
+    pub fn read(&mut self) -> Result<Option<Token>, LexerError> {
+        let c = loop {
+            match self.current() {
+                Some(c) if c.is_ascii_whitespace() || c == ',' => {
+                    self.advance();
+                }
+                Some(c) => break c,
+                None => return Ok(None),
+            }
+        };
+
+        let mut span = self.span();
+        let kind = match c {
+            '(' => {
+                self.advance();
+                TokenKind::OpenParen
+            }
+            ')' => {
+                self.advance();
+                TokenKind::CloseParen
+            }
+            '{' => {
+                self.advance();
+                TokenKind::OpenBrace
+            }
+            '}' => {
+                self.advance();
+                TokenKind::CloseBrace
+            }
+            '[' => {
+                self.advance();
+                TokenKind::OpenBracket
+            }
+            ']' => {
+                self.advance();
+                TokenKind::CloseBracket
+            }
+            '0'..='9' => TokenKind::Number(self.parse_or(LexerErrorKind::InvalidNumber)?),
+            '+' | '-' if matches!(self.peek(1), Some('0'..='9')) => {
+                TokenKind::Number(self.parse_or(LexerErrorKind::InvalidNumber)?)
+            }
+            ';' => {
+                let mut comment = String::new();
+                while let Some(c) = self.advance() {
+                    match c {
+                        ';' => continue,
+                        '\n' => break,
+                        c => {
+                            comment.push(c);
+                        }
+                    }
+                }
+
+                TokenKind::LineComment(comment.trim().into())
+            }
+            '#' if matches!(self.peek(1), Some('|')) => {
+                self.advance(); // '#'
+                self.advance(); // '|'
+
+                let mut comment = String::new();
+                while let Some(c) = self.advance() {
+                    match c {
+                        '|' if matches!(self.peek(0), Some('#')) => {
+                            // The '|' was already consumed by `advance` above.
+                            self.advance(); // '#'
+                            break;
+                        }
+                        c => {
+                            comment.push(c);
+                        }
+                    }
+                }
+
+                TokenKind::BlockComment(comment.trim().into())
+            }
+            ':' => {
+                self.advance();
+                TokenKind::Keyword(Symbol(self.read_word()))
+            }
+            '"' => {
+                self.advance(); // '"'
+
+                let quote_span = span.clone().join(&self.span());
+                let mut string = String::new();
+
+                loop {
+                    let ch_span = self.span();
+                    string.push(match self.advance() {
+                        Some('"') => break,
+                        Some('\\') => match self.advance() {
+                            Some(c @ ('"' | '\\')) => c,
+                            Some('n') => '\n',
+                            Some('e') => '\x1b',
+                            Some(c) => {
+                                return Err(LexerError::new(
+                                    LexerErrorKind::InvalidEscape(c),
+                                    ch_span.join(&self.span()),
+                                ))
+                            }
+                            None => {
+                                return Err(LexerError::new(
+                                    LexerErrorKind::UnclosedString,
+                                    quote_span,
+                                ))
+                            }
+                        },
+                        Some(c) => c,
+                        None => {
+                            return Err(LexerError::new(LexerErrorKind::UnclosedString, quote_span))
+                        }
+                    });
+                }
+
+                if self.current().is_some_and(|c| !c.is_separator()) {
+                    self.read_word();
+                    return Err(LexerError::new(
+                        LexerErrorKind::InvalidString,
+                        span.join(&self.span()),
+                    ));
+                }
+
+                TokenKind::String(string)
+            }
+            '\'' => {
+                self.advance(); // '\''
+
+                let c = match self.advance() {
+                    Some('\\') => match self.advance() {
+                        Some(c @ ('"' | '\\')) => c,
+                        Some('n') => '\n',
+                        Some('e') => '\x1b',
+                        Some(c) => {
+                            return Err(LexerError::new(
+                                LexerErrorKind::InvalidEscape(c),
+                                span.join(&self.span()),
+                            ));
+                        }
+                        None => {
+                            return Err(LexerError::new(
+                                LexerErrorKind::UnclosedChar,
+                                span.join(&self.span()),
+                            ));
+                        }
+                    },
+                    Some(c) => c,
+                    None => {
+                        return Err(LexerError::new(
+                            LexerErrorKind::UnclosedChar,
+                            span.join(&self.span()),
+                        ))
+                    }
+                };
+
+                if self.advance() != Some('\'') {
+                    self.read_word();
+                    return Err(LexerError::new(
+                        LexerErrorKind::InvalidChar,
+                        span.join(&self.span()),
+                    ));
+                }
+
+                TokenKind::Char(c)
+            }
+            _ => {
+                let word = self.read_word();
+                match word.as_str() {
+                    "true" => TokenKind::Bool(true),
+                    "false" => TokenKind::Bool(false),
+                    "nil" => TokenKind::Nil,
+                    _ => TokenKind::Symbol(Symbol::from(word)),
+                }
+            }
+        };
+
+        span.extend(&self.span());
+
+        Ok(Some(Token::new(kind, span)))
+    }
+}
+
+impl Iterator for Lexer<'_> {
+    type Item = Result<Token, LexerError>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.read().transpose()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    macro_rules! test {
+        ( $name:ident: $input:literal, $tokens:expr ) => {
+            #[test]
+            fn $name() {
+                let mut lexer = Lexer::new($input);
+
+                for token in $tokens {
+                    let x = lexer.next().map(|r| match r {
+                        Ok(t) => Ok(t.kind),
+                        Err(e) => Err(e.kind),
+                    });
+
+                    assert_eq!(x, Some(token));
+                }
+
+                assert_eq!(lexer.next(), None);
+            }
+        };
+    }
+
+    test!(lex: "(+ 14 25.5 333 (* 2 5))", [
+        Ok(TokenKind::OpenParen),
+        Ok(TokenKind::Symbol(Symbol::from("+"))),
+        Ok(TokenKind::Number(14.0)),
+        Ok(TokenKind::Number(25.5)),
+        Ok(TokenKind::Number(333.0)),
+        Ok(TokenKind::OpenParen),
+        Ok(TokenKind::Symbol(Symbol::from("*"))),
+        Ok(TokenKind::Number(2.0)),
+        Ok(TokenKind::Number(5.0)),
+        Ok(TokenKind::CloseParen),
+        Ok(TokenKind::CloseParen),
+    ]);
+
+    test!(newline: "(+ 14 25.5 333\n(* 2 5 5.x))", [
+        Ok(TokenKind::OpenParen),
+        Ok(TokenKind::Symbol(Symbol::from("+"))),
+        Ok(TokenKind::Number(14.0)),
+        Ok(TokenKind::Number(25.5)),
+        Ok(TokenKind::Number(333.0)),
+        Ok(TokenKind::OpenParen),
+        Ok(TokenKind::Symbol(Symbol::from("*"))),
+        Ok(TokenKind::Number(2.0)),
+        Ok(TokenKind::Number(5.0)),
+        Err(LexerErrorKind::InvalidNumber("5.x".into())),
+        Ok(TokenKind::CloseParen),
+        Ok(TokenKind::CloseParen),
+    ]);
+
+    test!(negative_minus: "(- 1 -2 3)", [
+        Ok(TokenKind::OpenParen),
+        Ok(TokenKind::Symbol(Symbol::from("-"))),
+        Ok(TokenKind::Number(1.0)),
+        Ok(TokenKind::Number(-2.0)),
+        Ok(TokenKind::Number(3.0)),
+        Ok(TokenKind::CloseParen),
+    ]);
+
+    test!(line_comment: "; foo\n;; bar baz qux", [
+        Ok(TokenKind::LineComment("foo".into())),
+        Ok(TokenKind::LineComment("bar baz qux".into())),
+    ]);
+
+    test!(block_comment: "#| foo\nbar |#", [
+        Ok(TokenKind::BlockComment("foo\nbar".into()))
+    ]);
+
+    test!(error_parse_numbers: "2 55 3.144 0.0001 1.1.1", [
+        Ok(TokenKind::Number(2.0)),
+        Ok(TokenKind::Number(55.0)),
+        Ok(TokenKind::Number(3.144)),
+        Ok(TokenKind::Number(0.0001)),
+        Err(LexerErrorKind::InvalidNumber("1.1.1".into())),
+    ]);
+
+    test!(error_unclosed_string: "\"hiii", [
+        Err(LexerErrorKind::UnclosedString),
+    ]);
+
+    test!(error_invalid_string: "\"hiii\"222", [
+        Err(LexerErrorKind::InvalidString),
+    ]);
+}
diff --git a/onihime/src/lexer/symbol.rs b/onihime/src/lexer/symbol.rs
new file mode 100644
index 0000000..3f7c697
--- /dev/null
+++ b/onihime/src/lexer/symbol.rs
@@ -0,0 +1,19 @@
+/// A symbol used to identify a function or variable.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct Symbol(pub String);
+
+impl Symbol {
+    /// Create a new `Symbol` from a string.
+    pub fn from<S>(s: S) -> Self
+    where
+        S: Into<String>,
+    {
+        Self(s.into())
+    }
+}
+
+impl std::fmt::Display for Symbol {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
diff --git a/onihime/src/lexer/token.rs b/onihime/src/lexer/token.rs
new file mode 100644
index 0000000..f3f9c6f
--- /dev/null
+++ b/onihime/src/lexer/token.rs
@@ -0,0 +1,56 @@
+use super::Symbol;
+use crate::span::Span;
+
+/// The type of a [Token].
+#[derive(Debug, Clone, PartialEq)]
+pub enum TokenKind {
+    /// Opening parenthesis, e.g. '('
+    OpenParen,
+    /// Closing parenthesis, e.g. ')'
+    CloseParen,
+    /// Opening brace, e.g. '{'
+    OpenBrace,
+    /// Closing brace, e.g. '}'
+    CloseBrace,
+    /// Opening bracket, e.g. '['
+    OpenBracket,
+    /// Closing bracket, e.g. ']'
+    CloseBracket,
+
+    /// Block comment, e.g. '#| ... |#'
+    BlockComment(String),
+    /// Line comment, e.g. '; ...'
+    LineComment(String),
+
+    /// Boolean, e.g. 'true', 'false'
+    Bool(bool),
+    /// Character, e.g. 'c', '\n'
+    Char(char),
+    /// Number, e.g. '1', '2.0', '0.003'
+    Number(f64),
+    /// String, e.g. '"foo bar"'
+    String(String),
+    /// Keyword, e.g. ':baz'
+    Keyword(Symbol),
+    /// Symbol, e.g. 'qux', '+'
+    Symbol(Symbol),
+    /// Nil, e.g. 'nil'
+    Nil,
+}
+
+/// A token with a start and end location.
+#[derive(Debug, Clone, PartialEq)]
+pub struct Token {
+    /// The type of token.
+    pub kind: TokenKind,
+    /// The span in which the token occurs.
+    pub span: Span,
+}
+
+impl Token {
+    /// Construct a new instance of `Token`.
+    #[must_use]
+    pub const fn new(kind: TokenKind, span: Span) -> Self {
+        Self { kind, span }
+    }
+}
diff --git a/onihime/src/lib.rs b/onihime/src/lib.rs
index 0cffc7e..b5ef643 100644
--- a/onihime/src/lib.rs
+++ b/onihime/src/lib.rs
@@ -2,4 +2,5 @@
 
 #![deny(missing_debug_implementations, missing_docs, rust_2018_idioms)]
 
+mod lexer;
 mod span;
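
Not part of the patch: a minimal sketch of how the new lexer might be driven from elsewhere in the crate once this lands. The `count_symbols` helper and its location are hypothetical; only `Lexer::new`, the `Iterator` implementation, and `TokenKind` come from this diff.

use crate::lexer::{Lexer, TokenKind};

/// Count the symbol tokens in `input`, stopping at the first lexical error.
fn count_symbols(input: &str) -> Result<usize, Box<dyn std::error::Error>> {
    let lexer = Lexer::new(input);
    let mut symbols = 0;

    // `Lexer` implements `Iterator<Item = Result<Token, LexerError>>`, so
    // lexical errors surface as soon as the offending token is reached.
    for token in lexer {
        if matches!(token?.kind, TokenKind::Symbol(_)) {
            symbols += 1;
        }
    }

    Ok(symbols)
}

Using the first test input above, `count_symbols("(+ 14 25.5 333 (* 2 5))")` would return `Ok(2)`, one each for `+` and `*`.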