From e5fafd03ba2e37a84bb0ad308d5a25661acc146a Mon Sep 17 00:00:00 2001 From: Jesse Braham Date: Sat, 28 Dec 2024 10:31:37 +0100 Subject: [PATCH] Make the lexer operate on graphemes instead of chars (still needs some work) --- onihime/Cargo.toml | 1 + onihime/src/lexer/error.rs | 2 +- onihime/src/lexer/mod.rs | 175 +++++++++++++++++++++---------------- onihime/src/lexer/token.rs | 2 +- onihime/src/parser/ast.rs | 2 +- onihime/src/parser/mod.rs | 6 +- 6 files changed, 108 insertions(+), 80 deletions(-) diff --git a/onihime/Cargo.toml b/onihime/Cargo.toml index 76c5369..24d013c 100644 --- a/onihime/Cargo.toml +++ b/onihime/Cargo.toml @@ -8,6 +8,7 @@ repository.workspace = true license.workspace = true [dependencies] +unicode-segmentation = "1.12.0" [lints.rust] unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tarpaulin_include)'] } diff --git a/onihime/src/lexer/error.rs b/onihime/src/lexer/error.rs index 6d8c9aa..e4ead76 100644 --- a/onihime/src/lexer/error.rs +++ b/onihime/src/lexer/error.rs @@ -4,7 +4,7 @@ use crate::span::Span; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum LexerErrorKind { /// An invalid escape sequence was encountered. - InvalidEscape(char), + InvalidEscape(String), /// An invalid numeric literal was encountered. InvalidNumber(String), /// An invalid string literal was encountered. diff --git a/onihime/src/lexer/mod.rs b/onihime/src/lexer/mod.rs index 32fcad9..3ec9781 100644 --- a/onihime/src/lexer/mod.rs +++ b/onihime/src/lexer/mod.rs @@ -1,4 +1,6 @@ -use std::{str::Chars, sync::Arc}; +use std::sync::Arc; + +use unicode_segmentation::{Graphemes, UnicodeSegmentation as _}; pub(crate) use self::{ error::{LexerError, LexerErrorKind}, @@ -11,18 +13,26 @@ mod error; mod symbol; mod token; -/// Determine if the current character is a separator, performing 1-character -/// lookahead as needed to handle multi-character separators. 
-fn is_separator(current: char, next: Option<char>) -> bool { - current.is_ascii_whitespace() - || matches!(current, '(' | ')' | '[' | ']' | '{' | '}' | ';') - || (current == '#' && next.is_some_and(|c| matches!(c, '|' | '{'))) +/// Determine if the current grapheme is an ASCII digit. +fn is_ascii_digit(current: &str) -> bool { + matches!( + current, + "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" + ) +} + +/// Determine if the current grapheme is a separator, performing lookahead as +/// needed to handle multi-character separators. +fn is_separator(current: &str, next: Option<&str>) -> bool { + current.trim_ascii().is_empty() + || matches!(current, "(" | ")" | "[" | "]" | "{" | "}" | ";") + || (current == "#" && next.is_some_and(|c| matches!(c, "|" | "{"))) } /// A lexer, used by the parser. #[derive(Debug)] pub(crate) struct Lexer<'lexer> { - input: Chars<'lexer>, + input: Graphemes<'lexer>, byte: usize, source: Arc<Source>, } @@ -34,7 +44,7 @@ impl<'lexer> Lexer<'lexer> { let source = Arc::new(Source::new(None, input.to_string())); Self { - input: input.chars(), + input: input.graphemes(true), byte: 0, source, } @@ -62,30 +72,30 @@ impl<'lexer> Lexer<'lexer> { /// Returns `true` when at the end of the input. #[must_use] pub(crate) fn eof(&self) -> bool { - self.peek(0).is_none() + self.current().is_none() } - /// Get the current character. + /// Get the current grapheme without advancing. #[must_use] - fn current(&self) -> Option<char> { - self.input.as_str().chars().next() + fn current(&self) -> Option<&str> { + self.input.clone().next() } - /// Get the nth character ahead of the current character without advancing. + /// Get the nth grapheme ahead of the current grapheme without advancing. #[must_use] - fn peek(&self, n: usize) -> Option<char> { - self.input.as_str().chars().nth(n) + fn peek(&self) -> Option<&str> { + self.input.clone().take(2).last() } - /// Advance the lexer by one character. - fn advance(&mut self) -> Option<char> { + /// Advance the lexer by one grapheme. 
+ fn advance(&mut self) -> Option<&str> { let c = self.input.next()?; - self.byte += c.len_utf8(); + self.byte += c.len(); Some(c) } - /// Advance the lexer by one character, and then return the specified + /// Advance the lexer by one grapheme, and then return the specified /// `TokenKind`: #[must_use] fn advance_and(&mut self, kind: TokenKind) -> TokenKind { @@ -98,11 +108,11 @@ impl<'lexer> Lexer<'lexer> { fn read_word(&mut self) -> String { let mut word = String::new(); while let Some(c) = self.current() { - if is_separator(c, self.peek(1)) { + if is_separator(c, self.peek()) { break; } - word.push(c); + word.push_str(c); self.advance(); } @@ -115,7 +125,7 @@ impl<'lexer> Lexer<'lexer> { // we have reached the end of input and no additional characters can be read: let c = loop { match self.current() { - Some(c) if c.is_ascii_whitespace() => { + Some(c) if c.trim_ascii().is_empty() => { self.advance(); } Some(c) => break c, @@ -126,27 +136,27 @@ impl<'lexer> Lexer<'lexer> { let mut span = self.span(); let kind = match c { - ';' => self.line_comment(), - '#' if self.peek(1) == Some('|') => self.block_comment(), + ";" => self.line_comment(), + "#" if self.peek() == Some("|") => self.block_comment(), - '(' => self.advance_and(TokenKind::OpenParen), - ')' => self.advance_and(TokenKind::CloseParen), - '{' => self.advance_and(TokenKind::OpenBrace), - '}' => self.advance_and(TokenKind::CloseBrace), - '[' => self.advance_and(TokenKind::OpenBracket), - ']' => self.advance_and(TokenKind::CloseBracket), - '#' if self.peek(1) == Some('{') => { + "(" => self.advance_and(TokenKind::OpenParen), + ")" => self.advance_and(TokenKind::CloseParen), + "{" => self.advance_and(TokenKind::OpenBrace), + "}" => self.advance_and(TokenKind::CloseBrace), + "[" => self.advance_and(TokenKind::OpenBracket), + "]" => self.advance_and(TokenKind::CloseBracket), + "#" if self.peek() == Some("{") => { self.advance(); // '#' self.advance(); // '{' TokenKind::OpenHashBrace } - '0' if 
matches!(self.peek(1), Some('b') | Some('o') | Some('x')) => { - let radix = match self.peek(1) { - Some('b') => 2, - Some('o') => 8, - Some('x') => 16, + "0" if matches!(self.peek(), Some("b") | Some("o") | Some("x")) => { + let radix = match self.peek() { + Some("b") => 2, + Some("o") => 8, + Some("x") => 16, _ => unreachable!(), }; @@ -155,15 +165,15 @@ impl<'lexer> Lexer<'lexer> { self.integer_literal(word, span, radix)? } - '0'..='9' => self.numeric_literal(span.clone())?, - '+' | '-' if matches!(self.peek(1), Some('0'..='9')) => { + c if is_ascii_digit(c) => self.numeric_literal(span.clone())?, + "+" | "-" if self.peek().is_some_and(|c| is_ascii_digit(c)) => { self.numeric_literal(span.clone())? } - '\'' => self.char_literal(span.clone())?, - '"' => self.string_literal(span.clone())?, + "'" => self.char_literal(span.clone())?, + "\"" => self.string_literal(span.clone())?, - ':' => { + ":" => { self.advance(); // ':' TokenKind::Keyword(Symbol::from(self.read_word())) @@ -189,18 +199,18 @@ impl<'lexer> Lexer<'lexer> { fn line_comment(&mut self) -> TokenKind { // Line comments may start with any number of semicolons, so consume however // many are present at the beginning of the comment: - while self.current().is_some_and(|c| c == ';') { + while self.current().is_some_and(|c| c == ";") { self.advance(); } // Line comments continue until a newline character is encountered: let mut comment = String::new(); while let Some(c) = self.advance() { - if c == '\n' { + if c == "\n" { break; } - comment.push(c); + comment.push_str(c); } TokenKind::LineComment(comment.trim().into()) @@ -213,16 +223,20 @@ impl<'lexer> Lexer<'lexer> { self.advance(); // '|' let mut comment = String::new(); + let mut pipe_found = false; + while let Some(c) = self.advance() { - if c == '|' && matches!(self.peek(0), Some('#')) { - self.advance(); // '#' + if pipe_found && c == "#" { break; } - comment.push(c); + comment.push_str(c); + pipe_found = c == "|"; } - 
TokenKind::BlockComment(comment.trim().into()) + let comment = comment.trim_end_matches('|').trim(); + + TokenKind::BlockComment(comment.into()) } fn float_literal(&self, word: String, span: Span) -> Result<TokenKind, LexerError> { @@ -269,13 +283,15 @@ impl<'lexer> Lexer<'lexer> { self.advance(); // '\'' let c = match self.advance() { - Some('\\') => match self.advance() { - Some(c @ ('"' | '\\')) => c, - Some('n') => '\n', - Some('r') => '\r', - Some('t') => '\t', - Some('e') => '\x1b', + Some("\\") => match self.advance() { + Some(c @ ("\"" | "\\")) => c, + Some("n") => "\n", + Some("r") => "\r", + Some("t") => "\t", + Some("e") => "\x1b", Some(c) => { + let c = c.to_string(); + self.read_word(); // Recover from the error return Err(LexerError::new( LexerErrorKind::InvalidEscape(c), @@ -298,7 +314,9 @@ impl<'lexer> Lexer<'lexer> { } }; - if self.advance() != Some('\'') { + let c = c.to_string(); + + if self.advance() != Some("'") { self.read_word(); // Recover from the error return Err(LexerError::new( LexerErrorKind::UnclosedChar, @@ -317,15 +335,17 @@ impl<'lexer> Lexer<'lexer> { loop { let ch_span = self.span(); - string.push(match self.advance() { - Some('"') => break, - Some('\\') => match self.advance() { - Some(c @ ('"' | '\\')) => c, - Some('n') => '\n', - Some('r') => '\r', - Some('t') => '\t', - Some('e') => '\x1b', + string.push_str(match self.advance() { + Some("\"") => break, + Some("\\") => match self.advance() { + Some(c @ ("\"" | "\\")) => c, + Some("n") => "\n", + Some("r") => "\r", + Some("t") => "\t", + Some("e") => "\x1b", Some(c) => { + let c = c.to_string(); + self.read_word(); // Recover from the error return Err(LexerError::new( LexerErrorKind::InvalidEscape(c), @@ -343,7 +363,7 @@ impl<'lexer> Lexer<'lexer> { if self .current() - .is_some_and(|c| !is_separator(c, self.peek(1))) + .is_some_and(|c| !is_separator(c, self.peek())) { self.read_word(); // Recover from the error return Err(LexerError::new( @@ -452,16 +472,20 @@ mod tests { ]); test!(char_literal: 
r"'x' '\n' '\r' '\t' '\e' '\\' '\q' 'b", [ - Ok(TokenKind::Char('x')), - Ok(TokenKind::Char('\n')), - Ok(TokenKind::Char('\r')), - Ok(TokenKind::Char('\t')), - Ok(TokenKind::Char('\x1b')), - Ok(TokenKind::Char('\\')), - Err(LexerErrorKind::InvalidEscape('q')), + Ok(TokenKind::Char("x".into())), + Ok(TokenKind::Char("\n".into())), + Ok(TokenKind::Char("\r".into())), + Ok(TokenKind::Char("\t".into())), + Ok(TokenKind::Char("\x1b".into())), + Ok(TokenKind::Char("\\".into())), + Err(LexerErrorKind::InvalidEscape("q".into())), Err(LexerErrorKind::UnclosedChar), ]); + test!(char_literal_with_unicode: "'y̆'", [ + Ok(TokenKind::Char("y̆".into())), + ]); + test!(error_unclosed_char_escape: r"'\", [ Err(LexerErrorKind::UnclosedChar), ]); @@ -482,7 +506,7 @@ mod tests { ]); test!(error_invalid_escape_string: "\"oh no \\p\"", [ - Err(LexerErrorKind::InvalidEscape('p')), + Err(LexerErrorKind::InvalidEscape("p".into())), ]); test!(error_unclosed_string: "\"hiii", [ @@ -513,11 +537,14 @@ mod tests { Ok(TokenKind::CloseParen), ]); - test!(unicode_symbol: "(かわいい 🐕 😻)", [ + test!(unicode_symbols: "(かわいい 🐕 😻 (föö))", [ Ok(TokenKind::OpenParen), Ok(TokenKind::Symbol(Symbol::from("かわいい"))), Ok(TokenKind::Symbol(Symbol::from("🐕"))), Ok(TokenKind::Symbol(Symbol::from("😻"))), + Ok(TokenKind::OpenParen), + Ok(TokenKind::Symbol(Symbol::from("föö"))), + Ok(TokenKind::CloseParen), Ok(TokenKind::CloseParen), ]); } diff --git a/onihime/src/lexer/token.rs b/onihime/src/lexer/token.rs index 5c8641e..9b55d5d 100644 --- a/onihime/src/lexer/token.rs +++ b/onihime/src/lexer/token.rs @@ -27,7 +27,7 @@ pub(crate) enum TokenKind { /// Boolean, e.g. `true`, `false` Bool(bool), /// Character, e.g. `'c'`, `'\n'` - Char(char), + Char(String), /// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003` Float(f64), /// Integer, e.g. 
`0`, `-1`, `+200` diff --git a/onihime/src/parser/ast.rs b/onihime/src/parser/ast.rs index 5c2f5bb..9168d00 100644 --- a/onihime/src/parser/ast.rs +++ b/onihime/src/parser/ast.rs @@ -117,7 +117,7 @@ pub(crate) enum Atom { /// Boolean, e.g. `true`, `false` Bool(bool), /// Character, e.g. `'c'`, `'\n'` - Char(char), + Char(String), /// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003` Float(f64), /// Integer, e.g. `0`, `-1`, `+200` diff --git a/onihime/src/parser/mod.rs b/onihime/src/parser/mod.rs index e8874b3..5b9ed9a 100644 --- a/onihime/src/parser/mod.rs +++ b/onihime/src/parser/mod.rs @@ -258,9 +258,9 @@ mod tests { test!(vector: "['a' 'b' 'c']", src => Ok(Ast::from(vec![ Node::new( Expr::Vector(vec![ - Node::new(Expr::Atom(Atom::Char('a')), Span::new(1..4, src.clone())), - Node::new(Expr::Atom(Atom::Char('b')), Span::new(5..8, src.clone())), - Node::new(Expr::Atom(Atom::Char('c')), Span::new(9..12, src.clone())), + Node::new(Expr::Atom(Atom::Char("a".into())), Span::new(1..4, src.clone())), + Node::new(Expr::Atom(Atom::Char("b".into())), Span::new(5..8, src.clone())), + Node::new(Expr::Atom(Atom::Char("c".into())), Span::new(9..12, src.clone())), ]), Span::new(0..13, src), )