From de78b9840a1ee368eaa640de803a7d780613d930 Mon Sep 17 00:00:00 2001 From: Jesse Braham Date: Sun, 1 Dec 2024 15:03:22 +0100 Subject: [PATCH] Add the initial implementation of the parser --- onihime/src/lib.rs | 1 + onihime/src/parser/error.rs | 57 ++++++++++ onihime/src/parser/mod.rs | 212 ++++++++++++++++++++++++++++++++++++ onihime/src/parser/node.rs | 129 ++++++++++++++++++++++ onihime/src/span.rs | 4 + 5 files changed, 403 insertions(+) create mode 100644 onihime/src/parser/error.rs create mode 100644 onihime/src/parser/mod.rs create mode 100644 onihime/src/parser/node.rs diff --git a/onihime/src/lib.rs b/onihime/src/lib.rs index b5ef643..47d0387 100644 --- a/onihime/src/lib.rs +++ b/onihime/src/lib.rs @@ -3,4 +3,5 @@ #![deny(missing_debug_implementations, missing_docs, rust_2018_idioms)] mod lexer; +mod parser; mod span; diff --git a/onihime/src/parser/error.rs b/onihime/src/parser/error.rs new file mode 100644 index 0000000..068d3a1 --- /dev/null +++ b/onihime/src/parser/error.rs @@ -0,0 +1,57 @@ +use crate::{lexer::LexerError, span::Span}; + +/// Errors that can occur during parsing. +#[derive(Debug, Clone, PartialEq)] +pub enum ParserErrorKind { + /// An error which occurred during lexical analysis. + Lexer(LexerError), + /// An unexpected closing parenthesis/bracket was encountered. + UnexpectedCloseBracket, + /// Opening parenthesis/bracket does not have a matching closing + /// parenthesis/bracket. + UnclosedBracket, + /// An unmatched parenthesis/bracket was encountered. + UnmatchedBracket, + /// Unexpected parser state reached. + Unreachable, +} + +/// Parser error, with a start and end location. +#[derive(Debug, Clone, PartialEq)] +pub struct ParserError { + /// The type of error encountered. + pub kind: ParserErrorKind, + /// The span in which the error occurred. + pub span: Span, +} + +impl ParserError { + /// Construct a new instance of `ParserError`.
+ #[must_use] + pub const fn new(kind: ParserErrorKind, span: Span) -> Self { + Self { kind, span } + } +} + +impl std::error::Error for ParserError {} + +impl std::fmt::Display for ParserError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use ParserErrorKind::*; + + match &self.kind { + Lexer(err) => write!(f, "{err}"), + UnexpectedCloseBracket => write!(f, "Unexpected closing bracket"), + UnclosedBracket => write!(f, "Unclosed parenthesis"), + UnmatchedBracket => write!(f, "Unmatched bracket"), + Unreachable => write!(f, "Unexpected parsing state reached"), + } + } +} + +impl From for ParserError { + fn from(err: LexerError) -> Self { + let span = err.span.clone(); + Self::new(ParserErrorKind::Lexer(err), span) + } +} diff --git a/onihime/src/parser/mod.rs b/onihime/src/parser/mod.rs new file mode 100644 index 0000000..4de2106 --- /dev/null +++ b/onihime/src/parser/mod.rs @@ -0,0 +1,212 @@ +pub use self::{ + error::{ParserError, ParserErrorKind}, + node::{Node, NodeKind}, +}; +use crate::lexer::{Lexer, TokenKind}; + +mod error; +mod node; + +/// A parser for the AST. +#[derive(Debug)] +pub struct Parser<'parser> { + lexer: Lexer<'parser>, +} + +impl<'parser> Parser<'parser> { + /// Create a new parser instance from a string. + #[must_use] + pub fn new(input: &'parser str) -> Self { + Self { + lexer: Lexer::new(input), + } + } + + /// Set the name of the lexer's source. + pub fn set_name(&mut self, name: String) { + self.lexer.set_name(name); + } + + /// Parse the input string into an AST. + pub fn parse(&mut self) -> Result, ParserError> { + let mut parents = Vec::new(); + let mut cur_node = Node::new(NodeKind::List(Vec::new()), self.lexer.span()); + + while let Some(token) = self.lexer.read()? 
{ + match token.kind { + TokenKind::BlockComment(_) | TokenKind::LineComment(_) => {} + TokenKind::OpenParen => { + let child = Node::new(NodeKind::List(Vec::new()), token.span); + parents.push(cur_node); + cur_node = child; + } + TokenKind::CloseParen => { + let mut parent = parents.pop().ok_or_else(|| { + ParserError::new( + ParserErrorKind::UnexpectedCloseBracket, + token.span.clone(), + ) + })?; + + cur_node.span.extend(&token.span); + + if !matches!(cur_node.kind, NodeKind::List(_)) { + return Err(ParserError::new( + ParserErrorKind::UnmatchedBracket, + token.span, + )); + } + + parent.push_node(cur_node)?; + cur_node = parent; + } + TokenKind::OpenBracket => { + let child = Node::new(NodeKind::Vector(Vec::new()), token.span); + parents.push(cur_node); + cur_node = child; + } + TokenKind::CloseBracket => { + let mut parent = parents.pop().ok_or_else(|| { + ParserError::new( + ParserErrorKind::UnexpectedCloseBracket, + token.span.clone(), + ) + })?; + + cur_node.span.extend(&token.span); + + if !matches!(cur_node.kind, NodeKind::Vector(_)) { + return Err(ParserError::new( + ParserErrorKind::UnmatchedBracket, + token.span, + )); + } + + parent.push_node(cur_node)?; + cur_node = parent; + } + + _ => cur_node.push_node(Node::try_from(token)?)?, + } + } + + if !parents.is_empty() { + return Err(ParserError::new( + ParserErrorKind::UnclosedBracket, + cur_node.span, + )); + } + + if let NodeKind::List(body) = cur_node.kind { + Ok(body) + } else { + Err(ParserError::new( + ParserErrorKind::Unreachable, + cur_node.span, + )) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + lexer::{LexerError, LexerErrorKind, Symbol}, + span::Span, + }; + + macro_rules! 
test { + ( $name:ident: $input:literal, $src:ident => $ast:expr ) => { + #[test] + fn $name() { + let mut parser = Parser::new($input); + let $src = parser.lexer.source(); + assert_eq!(parser.parse(), $ast); + } + }; + } + + test!(parse_list: "(+ 1 2)", src => Ok(vec![ + Node::new( + NodeKind::List(vec![ + Node::new(NodeKind::Symbol(Symbol::from("+")), Span::new(1..2, src.clone())), + Node::new(NodeKind::Number(1.0), Span::new(3..4, src.clone())), + Node::new(NodeKind::Number(2.0), Span::new(5..6, src.clone())), + ]), + Span::new(0..7, src) + ) + ])); + + test!(parse_nested_list: "(+ 2.5 64 (* 2 3))", src => Ok(vec![Node::new( + NodeKind::List(vec![ + Node::new(NodeKind::Symbol(Symbol::from("+")), Span::new(1..2, src.clone())), + Node::new(NodeKind::Number(2.5), Span::new(3..6, src.clone())), + Node::new(NodeKind::Number(64.0), Span::new(7..9, src.clone())), + Node::new( + NodeKind::List(vec![ + Node::new(NodeKind::Symbol(Symbol::from("*")), Span::new(11..12, src.clone())), + Node::new(NodeKind::Number(2.0), Span::new(13..14, src.clone())), + Node::new(NodeKind::Number(3.0), Span::new(15..16, src.clone())), + ]), + Span::new(10..17, src.clone()) + ), + ]), + Span::new(0..18, src) + )])); + + test!(parse_multiple_expressions: "(/ 6 3 (+ 1 2)) (* 2 5)\n(- 10 5)", src => Ok(vec![ + Node::new( + NodeKind::List(vec![ + Node::new(NodeKind::Symbol(Symbol::from("/")), Span::new(1..2, src.clone())), + Node::new(NodeKind::Number(6.0), Span::new(3..4, src.clone())), + Node::new(NodeKind::Number(3.0), Span::new(5..6, src.clone())), + Node::new( + NodeKind::List(vec![ + Node::new(NodeKind::Symbol(Symbol::from("+")), Span::new(8..9, src.clone())), + Node::new(NodeKind::Number(1.0), Span::new(10..11, src.clone())), + Node::new(NodeKind::Number(2.0), Span::new(12..13, src.clone())), + ]), + Span::new(7..14, src.clone()) + ), + ]), + Span::new(0..15, src.clone()) + ), + Node::new( + NodeKind::List(vec![ + Node::new(NodeKind::Symbol(Symbol::from("*")), Span::new(17..18, 
src.clone())), + Node::new(NodeKind::Number(2.0), Span::new(19..20, src.clone())), + Node::new(NodeKind::Number(5.0), Span::new(21..22, src.clone())), + ]), + Span::new(16..23, src.clone()) + ), + Node::new( + NodeKind::List(vec![ + Node::new(NodeKind::Symbol(Symbol::from("-")), Span::new(25..26, src.clone())), + Node::new(NodeKind::Number(10.0), Span::new(27..29, src.clone())), + Node::new(NodeKind::Number(5.0), Span::new(30..31, src.clone())), + ]), + Span::new(24..32, src) + ), + ])); + + test!(parse_float: "(2.500000)", src => Ok(vec![Node::new( + NodeKind::List(vec![Node::new(NodeKind::Number(2.5), Span::new(1..9, src.clone()))]), + Span::new(0..10, src) + )])); + + test!(parse_empty: "", _src => Ok(vec![])); + + test!(error_invalid_number: "(+ 1.2.3)", src => Err(ParserError::new( + ParserErrorKind::Lexer(LexerError::new( + LexerErrorKind::InvalidNumber("1.2.3".into()), + Span::new(3..8, src.clone()) + )), + Span::new(3..8, src) + ))); + + test!(error_unexpected_close_paren: ")", src => Err(ParserError::new( + ParserErrorKind::UnexpectedCloseBracket, + Span::new(0..1, src) + ))); +} diff --git a/onihime/src/parser/node.rs b/onihime/src/parser/node.rs new file mode 100644 index 0000000..e04641b --- /dev/null +++ b/onihime/src/parser/node.rs @@ -0,0 +1,129 @@ +use super::error::{ParserError, ParserErrorKind}; +use crate::{ + lexer::{Symbol, Token, TokenKind}, + span::Span, +}; + +/// The type of a node in the AST. +#[derive(Debug, Clone, PartialEq)] +pub enum NodeKind { + /// Bool. + Bool(bool), + /// Character. + Char(char), + /// Keyword. + Keyword(Symbol), + /// Number. + Number(f64), + /// String. + String(String), + /// Symbol. + Symbol(Symbol), + /// Nil. + Nil, + + /// List. + List(Vec), + /// Vector. 
+ Vector(Vec), +} + +impl std::fmt::Display for NodeKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + NodeKind::Bool(_) => write!(f, "BOOL"), + NodeKind::Char(_) => write!(f, "CHAR"), + NodeKind::Keyword(_) => write!(f, "KEYWORD"), + NodeKind::Number(_) => write!(f, "NUMBER"), + NodeKind::String(_) => write!(f, "STRING"), + NodeKind::Symbol(_) => write!(f, "SYMBOL"), + NodeKind::Nil => write!(f, "NIL"), + NodeKind::List(_) => write!(f, "LIST"), + NodeKind::Vector(_) => write!(f, "VECTOR"), + } + } +} + +/// A node in the AST with a start and end location. +#[derive(Debug, Clone, PartialEq)] +pub struct Node { + /// The type of node. + pub kind: NodeKind, + /// The span in which the node occurs. + pub span: Span, +} + +impl Node { + /// Construct a new instance of `Node`. + #[must_use] + pub const fn new(kind: NodeKind, span: Span) -> Self { + Self { kind, span } + } + + /// Push a child node onto a list node. + pub fn push_node(&mut self, child: Self) -> Result<(), ParserError> { + match &mut self.kind { + NodeKind::List(c) | NodeKind::Vector(c) => { + c.push(child); + } + _ => return Err(ParserError::new(ParserErrorKind::Unreachable, child.span)), + } + + Ok(()) + } + + fn display(&self, indent: usize) -> String { + let mut text = format!( + "{}{}@{}..{}\n", + " ".repeat(indent), + self.kind, + self.span.bytes().start, + self.span.bytes().end + ); + + match &self.kind { + NodeKind::List(vec) | NodeKind::Vector(vec) => { + for node in vec { + text.push_str(&node.display(indent + 1)); + } + } + _ => {} + } + + text.trim_end().to_string() + } +} + +impl std::fmt::Display for Node { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.display(0)) + } +} + +impl TryFrom for Node { + type Error = ParserError; + + fn try_from(token: Token) -> Result { + let kind = match token.kind { + TokenKind::Bool(b) => NodeKind::Bool(b), + TokenKind::Char(c) => NodeKind::Char(c), + 
TokenKind::Number(n) => NodeKind::Number(n), + TokenKind::String(s) => NodeKind::String(s), + TokenKind::Keyword(k) => NodeKind::Keyword(k), + TokenKind::Symbol(s) => NodeKind::Symbol(s), + TokenKind::Nil => NodeKind::Nil, + TokenKind::OpenParen + | TokenKind::CloseParen + | TokenKind::OpenBrace + | TokenKind::CloseBrace + | TokenKind::OpenBracket + | TokenKind::CloseBracket + | TokenKind::LineComment(_) + | TokenKind::BlockComment(_) => { + return Err(ParserError::new(ParserErrorKind::Unreachable, token.span)) + } + }; + + Ok(Self::new(kind, token.span)) + } +} diff --git a/onihime/src/span.rs b/onihime/src/span.rs index 7290588..1c78a8b 100644 --- a/onihime/src/span.rs +++ b/onihime/src/span.rs @@ -132,6 +132,10 @@ impl Span { pub fn same_source(&self, other: &Self) -> bool { Arc::ptr_eq(&self.source, &other.source) } + + pub(crate) fn bytes(&self) -> &Range { + &self.bytes + } } impl PartialEq for Span {