Compare commits

...

2 Commits

6 changed files with 670 additions and 0 deletions

View File

@@ -0,0 +1,52 @@
use crate::span::Span;
/// Errors that can occur during lexical analysis.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerErrorKind {
/// An invalid character literal was encountered.
InvalidChar,
/// An invalid escape sequence was encountered.
InvalidEscape(char),
/// An invalid numeric literal was encountered.
InvalidNumber(String),
/// An invalid string literal was encountered.
InvalidString,
/// An unclosed character literal was encountered.
UnclosedChar,
/// An unclosed string literal was encountered.
UnclosedString,
}
/// Lexer error, with a start and end location.
#[derive(Debug, Clone, PartialEq)]
pub struct LexerError {
/// The type of error encountered.
pub kind: LexerErrorKind,
/// The span in which the error occurred.
pub span: Span,
}
impl LexerError {
/// Construct a new instance of `LexerError`.
#[must_use]
pub const fn new(kind: LexerErrorKind, span: Span) -> Self {
Self { kind, span }
}
}
impl std::error::Error for LexerError {}
impl std::fmt::Display for LexerError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use LexerErrorKind::*;
match &self.kind {
InvalidChar => write!(f, "Invalid character literal"),
InvalidEscape(c) => write!(f, "Unknown escape sequence '\\{c}' in string"),
InvalidNumber(n) => write!(f, "`{n}` is not a valid numeric literal"),
InvalidString => write!(f, "Invalid string literal"),
UnclosedChar => write!(f, "Unclosed character literal"),
UnclosedString => write!(f, "Unclosed string literal"),
}
}
}
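
Below is a minimal usage sketch, not part of this commit, showing how one of these errors renders through the `Display` impl. It assumes the `Span` and `Source` types from onihime/src/span.rs later in this diff, is written as crate-internal code since the modules are private, and the `unclosed_string_report` name is illustrative.

use std::sync::Arc;

use crate::lexer::{LexerError, LexerErrorKind};
use crate::span::{Source, Span};

fn unclosed_string_report() {
    // An unterminated string literal: the span covers the opening quote.
    let source = Arc::new(Source::new(None, "\"abc".to_string()));
    let span = Span::new(0..1, source);
    let err = LexerError::new(LexerErrorKind::UnclosedString, span);
    assert_eq!(err.to_string(), "Unclosed string literal");
}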

399
onihime/src/lexer/mod.rs Normal file
View File

@@ -0,0 +1,399 @@
use std::{
str::{Chars, FromStr},
sync::Arc,
};
pub use self::{
error::{LexerError, LexerErrorKind},
symbol::Symbol,
token::{Token, TokenKind},
};
use crate::span::{Source, Span};
mod error;
mod symbol;
mod token;
/// A trait for checking if a character is a separator.
pub trait Separator {
/// Check if the character is a separator.
fn is_separator(&self) -> bool;
}
impl Separator for char {
fn is_separator(&self) -> bool {
self.is_ascii_whitespace() || matches!(self, '(' | ')' | '[' | ']' | '{' | '}' | ',')
}
}
/// A lexer, used by the parser.
#[derive(Debug)]
pub struct Lexer<'lexer> {
input: Chars<'lexer>,
byte: usize,
source: Arc<Source>,
}
impl<'lexer> Lexer<'lexer> {
/// Create a new lexer instance from a string.
#[must_use]
pub fn new(input: &'lexer str) -> Self {
let source = Arc::new(Source::new(None, input.to_string()));
Self {
input: input.chars(),
byte: 0,
source,
}
}
/// Set the name of the [Source] being lexically analyzed.
pub fn set_name(&mut self, name: String) {
if let Some(source) = Arc::get_mut(&mut self.source) {
source.set_name(name);
} else {
unimplemented!(); // FIXME: What should we do in this case?
}
}
/// The source being lexically analyzed.
#[must_use]
pub fn source(&self) -> Arc<Source> {
self.source.clone()
}
/// Get the current character.
#[must_use]
pub fn current(&self) -> Option<char> {
self.input.as_str().chars().next()
}
/// Get the unparsed input.
#[must_use]
pub fn get_unparsed(&self) -> &str {
self.input.as_str()
}
/// Get the current position of the lexer.
#[inline]
#[must_use]
pub(crate) fn span(&self) -> Span {
Span::new(self.byte..self.byte, self.source.clone())
}
/// Get the nth character ahead of the current position without advancing; `peek(0)` is the current character.
fn peek(&self, n: usize) -> Option<char> {
self.input.as_str().chars().nth(n)
}
/// Advance the lexer by one character.
fn advance(&mut self) -> Option<char> {
let c = self.input.next()?;
self.byte += c.len_utf8();
Some(c)
}
/// Read a word from the input until a separator is reached.
fn read_word(&mut self) -> String {
let mut word = String::new();
while let Some(c) = self.current() {
if c.is_separator() {
break;
}
word.push(c);
self.advance();
}
word
}
/// Parse a value from the input or return an error.
fn parse_or<T>(&mut self, err: impl Fn(String) -> LexerErrorKind) -> Result<T, LexerError>
where
T: FromStr,
{
let span = self.span();
let word = self.read_word();
word.parse()
.map_err(|_| LexerError::new(err(word), span.join(&self.span())))
}
/// Read the next token from the input.
pub fn read(&mut self) -> Result<Option<Token>, LexerError> {
let c = loop {
match self.current() {
Some(c) if c.is_ascii_whitespace() || c == ',' => {
self.advance();
}
Some(c) => break c,
None => return Ok(None),
}
};
let mut span = self.span();
let kind = match c {
'(' => {
self.advance();
TokenKind::OpenParen
}
')' => {
self.advance();
TokenKind::CloseParen
}
'{' => {
self.advance();
TokenKind::OpenBrace
}
'}' => {
self.advance();
TokenKind::CloseBrace
}
'[' => {
self.advance();
TokenKind::OpenBracket
}
']' => {
self.advance();
TokenKind::CloseBracket
}
'0'..='9' => TokenKind::Number(self.parse_or(LexerErrorKind::InvalidNumber)?),
'+' | '-' if matches!(self.peek(1), Some('0'..='9')) => {
TokenKind::Number(self.parse_or(LexerErrorKind::InvalidNumber)?)
}
';' => {
let mut comment = String::new();
while let Some(c) = self.advance() {
match c {
';' => continue,
'\n' => break,
c => {
comment.push(c);
}
}
}
TokenKind::LineComment(comment.trim().into())
}
'#' if matches!(self.peek(1), Some('|')) => {
self.advance(); // '#'
self.advance(); // '|'
let mut comment = String::new();
while let Some(c) = self.advance() {
match c {
'|' if matches!(self.peek(0), Some('#')) => {
// The '|' was already consumed by the loop above; only the '#' remains.
self.advance(); // '#'
break;
}
c => {
comment.push(c);
}
}
}
TokenKind::BlockComment(comment.trim().into())
}
':' => {
self.advance();
TokenKind::Keyword(Symbol(self.read_word()))
}
'"' => {
self.advance(); // '"'
let quote_span = span.clone().join(&self.span());
let mut string = String::new();
loop {
let ch_span = self.span();
string.push(match self.advance() {
Some('"') => break,
Some('\\') => match self.advance() {
Some(c @ ('"' | '\\')) => c,
Some('n') => '\n',
Some('e') => '\x1b',
Some(c) => {
return Err(LexerError::new(
LexerErrorKind::InvalidEscape(c),
ch_span.join(&self.span()),
))
}
None => {
return Err(LexerError::new(
LexerErrorKind::UnclosedString,
quote_span,
))
}
},
Some(c) => c,
None => {
return Err(LexerError::new(LexerErrorKind::UnclosedString, quote_span))
}
});
}
if self.current().is_some_and(|c| !c.is_separator()) {
self.read_word();
return Err(LexerError::new(
LexerErrorKind::InvalidString,
span.join(&self.span()),
));
}
TokenKind::String(string)
}
'\'' => {
self.advance(); // '\''
let c = match self.advance() {
Some('\\') => match self.advance() {
Some(c @ ('"' | '\\')) => c,
Some('n') => '\n',
Some('e') => '\x1b',
Some(c) => {
return Err(LexerError::new(
LexerErrorKind::InvalidEscape(c),
span.join(&self.span()),
));
}
None => {
return Err(LexerError::new(
LexerErrorKind::UnclosedChar,
span.join(&self.span()),
));
}
},
Some(c) => c,
None => {
return Err(LexerError::new(
LexerErrorKind::UnclosedChar,
span.join(&self.span()),
))
}
};
if self.advance() != Some('\'') {
self.read_word();
return Err(LexerError::new(
LexerErrorKind::InvalidChar,
span.join(&self.span()),
));
}
TokenKind::Char(c)
}
_ => {
let word = self.read_word();
match word.as_str() {
"true" => TokenKind::Bool(true),
"false" => TokenKind::Bool(false),
"nil" => TokenKind::Nil,
_ => TokenKind::Symbol(Symbol::from(word)),
}
}
};
span.extend(&self.span());
Ok(Some(Token::new(kind, span)))
}
}
impl Iterator for Lexer<'_> {
type Item = Result<Token, LexerError>;
fn next(&mut self) -> Option<Self::Item> {
self.read().transpose()
}
}
#[cfg(test)]
mod tests {
use super::*;
macro_rules! test {
( $name:ident: $input:literal, $tokens:expr ) => {
#[test]
fn $name() {
let mut lexer = Lexer::new($input);
for token in $tokens {
let x = lexer.next().map(|r| match r {
Ok(t) => Ok(t.kind),
Err(e) => Err(e.kind),
});
assert_eq!(x, Some(token));
}
assert_eq!(lexer.next(), None);
}
};
}
test!(lex: "(+ 14 25.5 333 (* 2 5))", [
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("+"))),
Ok(TokenKind::Number(14.0)),
Ok(TokenKind::Number(25.5)),
Ok(TokenKind::Number(333.0)),
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("*"))),
Ok(TokenKind::Number(2.0)),
Ok(TokenKind::Number(5.0)),
Ok(TokenKind::CloseParen),
Ok(TokenKind::CloseParen),
]);
test!(newline: "(+ 14 25.5 333\n(* 2 5 5.x))", [
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("+"))),
Ok(TokenKind::Number(14.0)),
Ok(TokenKind::Number(25.5)),
Ok(TokenKind::Number(333.0)),
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("*"))),
Ok(TokenKind::Number(2.0)),
Ok(TokenKind::Number(5.0)),
Err(LexerErrorKind::InvalidNumber("5.x".into())),
Ok(TokenKind::CloseParen),
Ok(TokenKind::CloseParen),
]);
test!(negative_minus: "(- 1 -2 3)", [
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("-"))),
Ok(TokenKind::Number(1.0)),
Ok(TokenKind::Number(-2.0)),
Ok(TokenKind::Number(3.0)),
Ok(TokenKind::CloseParen),
]);
test!(line_comment: "; foo\n;; bar baz qux", [
Ok(TokenKind::LineComment("foo".into())),
Ok(TokenKind::LineComment("bar baz qux".into())),
]);
test!(block_comment: "#| foo\nbar |#", [
Ok(TokenKind::BlockComment("foo\nbar".into()))
]);
test!(error_parse_numbers: "2 55 3.144 0.0001 1.1.1", [
Ok(TokenKind::Number(2.0)),
Ok(TokenKind::Number(55.0)),
Ok(TokenKind::Number(3.144)),
Ok(TokenKind::Number(0.0001)),
Err(LexerErrorKind::InvalidNumber("1.1.1".into())),
]);
test!(error_unclosed_string: "\"hiii", [
Err(LexerErrorKind::UnclosedString),
]);
test!(error_invalid_string: "\"hiii\"222", [
Err(LexerErrorKind::InvalidString),
]);
}
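
A brief usage sketch, not part of this commit, of driving the lexer through its `Iterator` implementation; `token_kinds` is an illustrative helper written as crate-internal code since the modules are private.

use crate::lexer::{Lexer, TokenKind};

fn token_kinds(input: &str) -> Result<Vec<TokenKind>, String> {
    // Each item is a `Result<Token, LexerError>`; collecting into a
    // `Result<Vec<_>, _>` stops at the first lexing error.
    Lexer::new(input)
        .map(|item| item.map(|token| token.kind).map_err(|e| e.to_string()))
        .collect()
}

For example, token_kinds("(+ 1 2)") yields OpenParen, Symbol("+"), Number(1.0), Number(2.0), and CloseParen, mirroring the `lex` test above.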

View File

@@ -0,0 +1,19 @@
/// A symbol used to identify a function or variable.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Symbol(pub String);
impl Symbol {
/// Create a new `Symbol` from a string.
pub fn from<S>(s: S) -> Self
where
S: Into<String>,
{
Self(s.into())
}
}
impl std::fmt::Display for Symbol {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}

View File

@@ -0,0 +1,56 @@
use super::Symbol;
use crate::span::Span;
/// The type of a [Token].
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
/// Opening parenthesis, e.g. '('
OpenParen,
/// Closing parenthesis, e.g. ')'
CloseParen,
/// Opening brace, e.g. '{'
OpenBrace,
/// Closing brace, e.g. '}'
CloseBrace,
/// Opening bracket, e.g. '['
OpenBracket,
/// Closing bracket, e.g. ']'
CloseBracket,
/// Block comment, e.g. '#| ... |#'
BlockComment(String),
/// Line comment, e.g. '; ...'
LineComment(String),
/// Boolean, e.g. 'true', 'false'
Bool(bool),
/// Character, e.g. 'c', '\n'
Char(char),
/// Number, e.g. '1', '2.0', '0.003'
Number(f64),
/// String, e.g. '"foo bar"'
String(String),
/// Keyword, e.g. ':baz'
Keyword(Symbol),
/// Symbol, e.g. 'qux', '+'
Symbol(Symbol),
/// Nil, e.g. 'nil'
Nil,
}
/// A token with a start and end location.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
/// The type of token.
pub kind: TokenKind,
/// The span in which the token occurs.
pub span: Span,
}
impl Token {
/// Construct a new instance of `Token`.
#[must_use]
pub const fn new(kind: TokenKind, span: Span) -> Self {
Self { kind, span }
}
}
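
As a small illustration, not part of this commit, of how a parser might branch on these kinds, a helper like the following could classify opening delimiters; `is_opening_delimiter` is a hypothetical name.

use crate::lexer::TokenKind;

fn is_opening_delimiter(kind: &TokenKind) -> bool {
    // OpenParen, OpenBrace, and OpenBracket all begin a nested form.
    matches!(
        kind,
        TokenKind::OpenParen | TokenKind::OpenBrace | TokenKind::OpenBracket
    )
}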

View File

@@ -1,3 +1,6 @@
//! Onihime programming language.
#![deny(missing_debug_implementations, missing_docs, rust_2018_idioms)]
mod lexer;
mod span;

141
onihime/src/span.rs Normal file
View File

@@ -0,0 +1,141 @@
use std::{cmp::Ordering, iter, ops::Range, sync::Arc};
/// A location within some source text.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct Location {
line: usize,
column: usize,
}
impl Location {
/// Construct a new instance of `Location`.
#[must_use]
pub const fn new(line: usize, column: usize) -> Self {
Self { line, column }
}
}
impl PartialOrd for Location {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
match self.line.partial_cmp(&other.line) {
Some(Ordering::Equal) => self.column.partial_cmp(&other.column),
ord => ord,
}
}
}
/// Some (optionally named) source text.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct Source {
name: Option<String>,
contents: String,
lines: Vec<usize>,
}
impl Source {
/// Construct a new instance of `Source`.
#[must_use]
pub fn new(name: Option<String>, contents: String) -> Self {
let lines = contents
.match_indices('\n')
.map(|(i, _)| i)
.chain(iter::once(contents.len()))
.collect();
Self {
name,
contents,
lines,
}
}
/// Get the name of the source.
#[must_use]
pub fn name(&self) -> Option<&str> {
self.name.as_deref()
}
/// Set the name of the source.
pub fn set_name(&mut self, name: String) {
self.name = Some(name);
}
/// Get the [Location] of the specified byte in the source.
#[must_use]
pub fn location(&self, byte: usize) -> Location {
let line = self.lines.partition_point(|&x| x < byte);
let start = line.checked_sub(1).map_or(0, |n| self.lines[n] + 1);
let column = self.contents[start..byte].chars().count();
Location::new(line, column)
}
/// Get the full contents of the source.
#[must_use]
pub fn contents(&self) -> &str {
&self.contents
}
/// Get the specified line from the source.
#[must_use]
pub fn get_line(&self, line: usize) -> &str {
let end = self.lines[line];
let start = line.checked_sub(1).map_or(0, |n| self.lines[n] + 1);
&self.contents[start..end]
}
}
/// A contiguous sequence of bytes within some source.
#[derive(Debug, Default, Clone)]
pub struct Span {
bytes: Range<usize>,
source: Arc<Source>,
}
impl Span {
/// Construct a new instance of `Span`.
#[must_use]
pub fn new(bytes: Range<usize>, source: Arc<Source>) -> Self {
Self { bytes, source }
}
/// Join two spans, creating a new span.
#[must_use]
pub fn join(self, other: &Self) -> Self {
debug_assert!(self.same_source(other));
Self::new(self.bytes.start..other.bytes.end, self.source)
}
/// Extend one span to include another.
pub fn extend(&mut self, other: &Self) {
debug_assert!(self.same_source(other));
self.bytes.end = other.bytes.end;
}
/// The start location of a span within some source.
#[must_use]
pub fn location(&self) -> Location {
self.source.location(self.bytes.start)
}
/// The end location of a span within some source.
#[must_use]
pub fn end_location(&self) -> Location {
self.source.location(self.bytes.end)
}
/// Do two spans share the same source?
#[must_use]
pub fn same_source(&self, other: &Self) -> bool {
Arc::ptr_eq(&self.source, &other.source)
}
}
impl PartialEq for Span {
fn eq(&self, other: &Self) -> bool {
self.same_source(other) && self.bytes == other.bytes
}
}
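
A short sketch, not part of this commit, of how `Source::location` and the `Span` helpers fit together; `spans_demo` is an illustrative name, and note that `Location` lines and columns are zero-indexed in this implementation.

use std::sync::Arc;

use crate::span::{Location, Source, Span};

fn spans_demo() {
    let source = Arc::new(Source::new(None, "abc\ndef".to_string()));
    // Byte 5 is the 'e' on the second line: line index 1, column index 1.
    assert_eq!(source.location(5), Location::new(1, 1));

    // Joining two spans over the same source yields one covering both.
    let open = Span::new(0..1, source.clone());
    let close = Span::new(6..7, source);
    let joined = open.join(&close);
    assert_eq!(joined.location(), Location::new(0, 0));
    assert_eq!(joined.end_location(), Location::new(1, 3));
}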