Add the initial implementation of the lexer
This commit is contained in:
parent
a345c35f80
commit
cd76ceaa77
52
onihime/src/lexer/error.rs
Normal file
52
onihime/src/lexer/error.rs
Normal file
@ -0,0 +1,52 @@
|
||||
use crate::span::Span;
|
||||
|
||||
/// The ways in which lexical analysis can fail.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerErrorKind {
    /// A character literal was malformed.
    InvalidChar,
    /// A backslash was followed by a character that is not a known escape.
    ///
    /// Carries the offending character.
    InvalidEscape(char),
    /// A word could not be parsed as a number.
    ///
    /// Carries the offending text.
    InvalidNumber(String),
    /// A string literal was malformed.
    InvalidString,
    /// A character literal was never closed.
    UnclosedChar,
    /// A string literal was never closed.
    UnclosedString,
}
|
||||
|
||||
/// Lexer error, with a start and end location.
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub struct LexerError {
|
||||
/// The type of error encountered.
|
||||
pub kind: LexerErrorKind,
|
||||
/// The span in which the error occurred.
|
||||
pub span: Span,
|
||||
}
|
||||
|
||||
impl LexerError {
|
||||
/// Construct a new instance of `LexerError`.
|
||||
#[must_use]
|
||||
pub const fn new(kind: LexerErrorKind, span: Span) -> Self {
|
||||
Self { kind, span }
|
||||
}
|
||||
}
|
||||
|
||||
// Marker impl with no overridden methods: it lets `LexerError` be used wherever
// a `std::error::Error` is expected. The trait's `Debug` and `Display`
// requirements are met by the derive on the struct and the `Display` impl below.
impl std::error::Error for LexerError {}
|
||||
|
||||
impl std::fmt::Display for LexerError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
use LexerErrorKind::*;
|
||||
|
||||
match &self.kind {
|
||||
InvalidChar => write!(f, "Invalid character literal"),
|
||||
InvalidEscape(c) => write!(f, "Unknown escape sequence '\\{c}' in string"),
|
||||
InvalidNumber(n) => write!(f, "`{n}` is not a valid numeric literal"),
|
||||
InvalidString => write!(f, "Invalid string literal"),
|
||||
UnclosedChar => write!(f, "Unclosed character literal"),
|
||||
UnclosedString => write!(f, "Unclosed string literal"),
|
||||
}
|
||||
}
|
||||
}
|
399
onihime/src/lexer/mod.rs
Normal file
399
onihime/src/lexer/mod.rs
Normal file
@ -0,0 +1,399 @@
|
||||
use std::{
|
||||
str::{Chars, FromStr},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
pub use self::{
|
||||
error::{LexerError, LexerErrorKind},
|
||||
symbol::Symbol,
|
||||
token::{Token, TokenKind},
|
||||
};
|
||||
use crate::span::{Source, Span};
|
||||
|
||||
mod error;
|
||||
mod symbol;
|
||||
mod token;
|
||||
|
||||
/// Classifies values as separators, i.e. things that terminate a word.
pub trait Separator {
    /// Returns `true` when `self` is a separator.
    fn is_separator(&self) -> bool;
}
|
||||
|
||||
impl Separator for char {
|
||||
fn is_separator(&self) -> bool {
|
||||
self.is_ascii_whitespace() || matches!(self, '(' | ')' | '[' | ']' | '{' | '}' | ',')
|
||||
}
|
||||
}
|
||||
|
||||
/// A lexer, used by the parser.
|
||||
#[derive(Debug)]
|
||||
pub struct Lexer<'lexer> {
|
||||
input: Chars<'lexer>,
|
||||
byte: usize,
|
||||
source: Arc<Source>,
|
||||
}
|
||||
|
||||
impl<'lexer> Lexer<'lexer> {
|
||||
/// Create a new lexer instance from a string.
|
||||
#[must_use]
|
||||
pub fn new(input: &'lexer str) -> Self {
|
||||
let source = Arc::new(Source::new(None, input.to_string()));
|
||||
|
||||
Self {
|
||||
input: input.chars(),
|
||||
byte: 0,
|
||||
source,
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the name of the [Source] being lexically analyzed.
|
||||
pub fn set_name(&mut self, name: String) {
|
||||
if let Some(source) = Arc::get_mut(&mut self.source) {
|
||||
source.set_name(name);
|
||||
} else {
|
||||
unimplemented!(); // FIXME: What should we do in this case?
|
||||
}
|
||||
}
|
||||
|
||||
/// The source being lexically analyzed.
|
||||
#[must_use]
|
||||
pub fn source(&self) -> Arc<Source> {
|
||||
self.source.clone()
|
||||
}
|
||||
|
||||
/// Get the current character.
|
||||
#[must_use]
|
||||
pub fn current(&self) -> Option<char> {
|
||||
self.input.as_str().chars().next()
|
||||
}
|
||||
|
||||
/// Get the unparsed input.
|
||||
#[must_use]
|
||||
pub fn get_unparsed(&self) -> &str {
|
||||
self.input.as_str()
|
||||
}
|
||||
|
||||
/// Get the current position of the lexer.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub(crate) fn span(&self) -> Span {
|
||||
Span::new(self.byte..self.byte, self.source.clone())
|
||||
}
|
||||
|
||||
/// Get the nth character ahead of the current character without advancing.
|
||||
fn peek(&self, n: usize) -> Option<char> {
|
||||
self.input.as_str().chars().nth(n)
|
||||
}
|
||||
|
||||
/// Advance the lexer by one character.
|
||||
fn advance(&mut self) -> Option<char> {
|
||||
let c = self.input.next()?;
|
||||
self.byte += c.len_utf8();
|
||||
|
||||
Some(c)
|
||||
}
|
||||
|
||||
/// Read a word from the input until a separator is reached.
|
||||
fn read_word(&mut self) -> String {
|
||||
let mut word = String::new();
|
||||
while let Some(c) = self.current() {
|
||||
if c.is_separator() {
|
||||
break;
|
||||
}
|
||||
|
||||
word.push(c);
|
||||
self.advance();
|
||||
}
|
||||
|
||||
word
|
||||
}
|
||||
|
||||
/// Parse a value from the input or return an error.
|
||||
fn parse_or<T>(&mut self, err: impl Fn(String) -> LexerErrorKind) -> Result<T, LexerError>
|
||||
where
|
||||
T: FromStr,
|
||||
{
|
||||
let span = self.span();
|
||||
let word = self.read_word();
|
||||
|
||||
word.parse()
|
||||
.map_err(|_| LexerError::new(err(word), span.join(&self.span())))
|
||||
}
|
||||
|
||||
/// Read the next token from the input.
|
||||
pub fn read(&mut self) -> Result<Option<Token>, LexerError> {
|
||||
let c = loop {
|
||||
match self.current() {
|
||||
Some(c) if c.is_ascii_whitespace() || c == ',' => {
|
||||
self.advance();
|
||||
}
|
||||
Some(c) => break c,
|
||||
None => return Ok(None),
|
||||
}
|
||||
};
|
||||
|
||||
let mut span = self.span();
|
||||
let kind = match c {
|
||||
'(' => {
|
||||
self.advance();
|
||||
TokenKind::OpenParen
|
||||
}
|
||||
')' => {
|
||||
self.advance();
|
||||
TokenKind::CloseParen
|
||||
}
|
||||
'{' => {
|
||||
self.advance();
|
||||
TokenKind::OpenBrace
|
||||
}
|
||||
'}' => {
|
||||
self.advance();
|
||||
TokenKind::CloseBrace
|
||||
}
|
||||
'[' => {
|
||||
self.advance();
|
||||
TokenKind::OpenBracket
|
||||
}
|
||||
']' => {
|
||||
self.advance();
|
||||
TokenKind::CloseBracket
|
||||
}
|
||||
'0'..='9' => TokenKind::Number(self.parse_or(LexerErrorKind::InvalidNumber)?),
|
||||
'+' | '-' if matches!(self.peek(1), Some('0'..='9')) => {
|
||||
TokenKind::Number(self.parse_or(LexerErrorKind::InvalidNumber)?)
|
||||
}
|
||||
';' => {
|
||||
let mut comment = String::new();
|
||||
while let Some(c) = self.advance() {
|
||||
match c {
|
||||
';' => continue,
|
||||
'\n' => break,
|
||||
c => {
|
||||
comment.push(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TokenKind::LineComment(comment.trim().into())
|
||||
}
|
||||
'#' if matches!(self.peek(1), Some('|')) => {
|
||||
self.advance(); // '#'
|
||||
self.advance(); // '|#'
|
||||
|
||||
let mut comment = String::new();
|
||||
while let Some(c) = self.advance() {
|
||||
match c {
|
||||
'|' if matches!(self.peek(0), Some('#')) => {
|
||||
self.advance(); // '|'
|
||||
self.advance(); // '#'
|
||||
break;
|
||||
}
|
||||
c => {
|
||||
comment.push(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TokenKind::BlockComment(comment.trim().into())
|
||||
}
|
||||
':' => {
|
||||
self.advance();
|
||||
TokenKind::Keyword(Symbol(self.read_word()))
|
||||
}
|
||||
'"' => {
|
||||
self.advance(); // '"'
|
||||
|
||||
let quote_span = span.clone().join(&self.span());
|
||||
let mut string = String::new();
|
||||
|
||||
loop {
|
||||
let ch_span = self.span();
|
||||
string.push(match self.advance() {
|
||||
Some('"') => break,
|
||||
Some('\\') => match self.advance() {
|
||||
Some(c @ ('"' | '\\')) => c,
|
||||
Some('n') => '\n',
|
||||
Some('e') => '\x1b',
|
||||
Some(c) => {
|
||||
return Err(LexerError::new(
|
||||
LexerErrorKind::InvalidEscape(c),
|
||||
ch_span.join(&self.span()),
|
||||
))
|
||||
}
|
||||
None => {
|
||||
return Err(LexerError::new(
|
||||
LexerErrorKind::UnclosedString,
|
||||
quote_span,
|
||||
))
|
||||
}
|
||||
},
|
||||
Some(c) => c,
|
||||
None => {
|
||||
return Err(LexerError::new(LexerErrorKind::UnclosedString, quote_span))
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if self.current().is_some_and(|c| !c.is_separator()) {
|
||||
self.read_word();
|
||||
return Err(LexerError::new(
|
||||
LexerErrorKind::InvalidString,
|
||||
span.join(&self.span()),
|
||||
));
|
||||
}
|
||||
|
||||
TokenKind::String(string)
|
||||
}
|
||||
'\'' => {
|
||||
self.advance(); // '\''
|
||||
|
||||
let c = match self.advance() {
|
||||
Some('\\') => match self.advance() {
|
||||
Some(c @ ('"' | '\\')) => c,
|
||||
Some('n') => '\n',
|
||||
Some('e') => '\x1b',
|
||||
Some(c) => {
|
||||
return Err(LexerError::new(
|
||||
LexerErrorKind::InvalidEscape(c),
|
||||
span.join(&self.span()),
|
||||
));
|
||||
}
|
||||
None => {
|
||||
return Err(LexerError::new(
|
||||
LexerErrorKind::UnclosedChar,
|
||||
span.join(&self.span()),
|
||||
));
|
||||
}
|
||||
},
|
||||
Some(c) => c,
|
||||
None => {
|
||||
return Err(LexerError::new(
|
||||
LexerErrorKind::UnclosedChar,
|
||||
span.join(&self.span()),
|
||||
))
|
||||
}
|
||||
};
|
||||
|
||||
if self.advance() != Some('\'') {
|
||||
self.read_word();
|
||||
return Err(LexerError::new(
|
||||
LexerErrorKind::InvalidChar,
|
||||
span.join(&self.span()),
|
||||
));
|
||||
}
|
||||
|
||||
TokenKind::Char(c)
|
||||
}
|
||||
_ => {
|
||||
let word = self.read_word();
|
||||
match word.as_str() {
|
||||
"true" => TokenKind::Bool(true),
|
||||
"false" => TokenKind::Bool(false),
|
||||
"nil" => TokenKind::Nil,
|
||||
_ => TokenKind::Symbol(Symbol::from(word)),
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
span.extend(&self.span());
|
||||
|
||||
Ok(Some(Token::new(kind, span)))
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for Lexer<'_> {
|
||||
type Item = Result<Token, LexerError>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.read().transpose()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Declares a test that lexes `$input` and checks the produced token kinds
    // (or error kinds) one by one against `$tokens`, then checks that the
    // lexer is exhausted.
    macro_rules! test {
        ( $name:ident: $input:literal, $tokens:expr ) => {
            #[test]
            fn $name() {
                let mut lexer = Lexer::new($input);

                for expected in $tokens {
                    // Spans are ignored here; only the kinds are compared.
                    let actual = lexer
                        .next()
                        .map(|result| result.map(|token| token.kind).map_err(|error| error.kind));

                    assert_eq!(actual, Some(expected));
                }

                assert_eq!(lexer.next(), None);
            }
        };
    }

    test!(lex: "(+ 14 25.5 333 (* 2 5))", [
        Ok(TokenKind::OpenParen),
        Ok(TokenKind::Symbol(Symbol::from("+"))),
        Ok(TokenKind::Number(14.0)),
        Ok(TokenKind::Number(25.5)),
        Ok(TokenKind::Number(333.0)),
        Ok(TokenKind::OpenParen),
        Ok(TokenKind::Symbol(Symbol::from("*"))),
        Ok(TokenKind::Number(2.0)),
        Ok(TokenKind::Number(5.0)),
        Ok(TokenKind::CloseParen),
        Ok(TokenKind::CloseParen),
    ]);

    test!(newline: "(+ 14 25.5 333\n(* 2 5 5.x))", [
        Ok(TokenKind::OpenParen),
        Ok(TokenKind::Symbol(Symbol::from("+"))),
        Ok(TokenKind::Number(14.0)),
        Ok(TokenKind::Number(25.5)),
        Ok(TokenKind::Number(333.0)),
        Ok(TokenKind::OpenParen),
        Ok(TokenKind::Symbol(Symbol::from("*"))),
        Ok(TokenKind::Number(2.0)),
        Ok(TokenKind::Number(5.0)),
        Err(LexerErrorKind::InvalidNumber("5.x".into())),
        Ok(TokenKind::CloseParen),
        Ok(TokenKind::CloseParen),
    ]);

    test!(negative_minus: "(- 1 -2 3)", [
        Ok(TokenKind::OpenParen),
        Ok(TokenKind::Symbol(Symbol::from("-"))),
        Ok(TokenKind::Number(1.0)),
        Ok(TokenKind::Number(-2.0)),
        Ok(TokenKind::Number(3.0)),
        Ok(TokenKind::CloseParen),
    ]);

    test!(line_comment: "; foo\n;; bar baz qux", [
        Ok(TokenKind::LineComment("foo".into())),
        Ok(TokenKind::LineComment("bar baz qux".into())),
    ]);

    test!(block_comment: "#| foo\nbar |#", [
        Ok(TokenKind::BlockComment("foo\nbar".into()))
    ]);

    test!(error_parse_numbers: "2 55 3.144 0.0001 1.1.1", [
        Ok(TokenKind::Number(2.0)),
        Ok(TokenKind::Number(55.0)),
        Ok(TokenKind::Number(3.144)),
        Ok(TokenKind::Number(0.0001)),
        Err(LexerErrorKind::InvalidNumber("1.1.1".into())),
    ]);

    test!(error_unclosed_string: "\"hiii", [
        Err(LexerErrorKind::UnclosedString),
    ]);

    test!(error_invalid_string: "\"hiii\"222", [
        Err(LexerErrorKind::InvalidString),
    ]);
}
|
19
onihime/src/lexer/symbol.rs
Normal file
19
onihime/src/lexer/symbol.rs
Normal file
@ -0,0 +1,19 @@
|
||||
/// A name that identifies a function or a variable.
///
/// Thin wrapper around the symbol's text.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Symbol(pub String);
|
||||
|
||||
impl Symbol {
|
||||
/// Create a new `Symbol` from a string.
|
||||
pub fn from<S>(s: S) -> Self
|
||||
where
|
||||
S: Into<String>,
|
||||
{
|
||||
Self(s.into())
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Symbol {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
56
onihime/src/lexer/token.rs
Normal file
56
onihime/src/lexer/token.rs
Normal file
@ -0,0 +1,56 @@
|
||||
use super::Symbol;
|
||||
use crate::span::Span;
|
||||
|
||||
/// The type of a [Token].
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub enum TokenKind {
|
||||
/// Opening parenthesis, e.g. '('
|
||||
OpenParen,
|
||||
/// Closing parenthesis, e.g. ')'
|
||||
CloseParen,
|
||||
/// Opening brace, e.g. '{'
|
||||
OpenBrace,
|
||||
/// Closing brace, e.g. '}'
|
||||
CloseBrace,
|
||||
/// Opening bracket, e.g. '['
|
||||
OpenBracket,
|
||||
/// Closing bracket, e.g. ']'
|
||||
CloseBracket,
|
||||
|
||||
/// Block comment, e.g. '#| ... |#'
|
||||
BlockComment(String),
|
||||
/// Line comment, e.g. '; ...'
|
||||
LineComment(String),
|
||||
|
||||
/// Boolean, e.g. 'true', 'false'
|
||||
Bool(bool),
|
||||
/// Character, e.g. 'c', '\n'
|
||||
Char(char),
|
||||
/// Number, e.g. '1', '2.0', '0.003'
|
||||
Number(f64),
|
||||
/// String, e.g. '"foo bar"'
|
||||
String(String),
|
||||
/// Keyword, e.g. ':baz'
|
||||
Keyword(Symbol),
|
||||
/// Symbol, e.g. 'qux', '+'
|
||||
Symbol(Symbol),
|
||||
/// Nil, e.g. 'nil'
|
||||
Nil,
|
||||
}
|
||||
|
||||
/// A token with a start and end location.
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub struct Token {
|
||||
/// The type of token.
|
||||
pub kind: TokenKind,
|
||||
/// The span in which the token occurs.
|
||||
pub span: Span,
|
||||
}
|
||||
|
||||
impl Token {
|
||||
/// Construct a new instance of `Token`.
|
||||
#[must_use]
|
||||
pub const fn new(kind: TokenKind, span: Span) -> Self {
|
||||
Self { kind, span }
|
||||
}
|
||||
}
|
@ -2,4 +2,5 @@
|
||||
|
||||
#![deny(missing_debug_implementations, missing_docs, rust_2018_idioms)]
|
||||
|
||||
mod lexer;
|
||||
mod span;
|
||||
|
Loading…
Reference in New Issue
Block a user