Make the lexer operate on graphemes instead of chars (still needs some work)

This commit is contained in:
Jesse Braham 2024-12-28 10:31:37 +01:00
parent 8dcdd34b65
commit e5fafd03ba
6 changed files with 108 additions and 80 deletions

View File

@ -8,6 +8,7 @@ repository.workspace = true
license.workspace = true license.workspace = true
[dependencies] [dependencies]
unicode-segmentation = "1.12.0"
[lints.rust] [lints.rust]
unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tarpaulin_include)'] } unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tarpaulin_include)'] }

View File

@ -4,7 +4,7 @@ use crate::span::Span;
#[derive(Debug, Clone, PartialEq, Eq, Hash)] #[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LexerErrorKind { pub enum LexerErrorKind {
/// An invalid escape sequence was encountered. /// An invalid escape sequence was encountered.
InvalidEscape(char), InvalidEscape(String),
/// An invalid numeric literal was encountered. /// An invalid numeric literal was encountered.
InvalidNumber(String), InvalidNumber(String),
/// An invalid string literal was encountered. /// An invalid string literal was encountered.

View File

@ -1,4 +1,6 @@
use std::{str::Chars, sync::Arc}; use std::sync::Arc;
use unicode_segmentation::{Graphemes, UnicodeSegmentation as _};
pub(crate) use self::{ pub(crate) use self::{
error::{LexerError, LexerErrorKind}, error::{LexerError, LexerErrorKind},
@ -11,18 +13,26 @@ mod error;
mod symbol; mod symbol;
mod token; mod token;
/// Determine if the current character is a separator, performing 1-character /// Determine if the current grapheme is an ASCII digit.
/// lookahead as needed to handle multi-character separators. fn is_ascii_digit(current: &str) -> bool {
fn is_separator(current: char, next: Option<char>) -> bool { matches!(
current.is_ascii_whitespace() current,
|| matches!(current, '(' | ')' | '[' | ']' | '{' | '}' | ';') "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
|| (current == '#' && next.is_some_and(|c| matches!(c, '|' | '{'))) )
}
/// Determine if the current grapheme is a separator, performing lookahead as
/// needed to handle multi-character separators.
fn is_separator(current: &str, next: Option<&str>) -> bool {
current.trim_ascii().is_empty()
|| matches!(current, "(" | ")" | "[" | "]" | "{" | "}" | ";")
|| (current == "#" && next.is_some_and(|c| matches!(c, "|" | "{")))
} }
/// A lexer, used by the parser. /// A lexer, used by the parser.
#[derive(Debug)] #[derive(Debug)]
pub(crate) struct Lexer<'lexer> { pub(crate) struct Lexer<'lexer> {
input: Chars<'lexer>, input: Graphemes<'lexer>,
byte: usize, byte: usize,
source: Arc<Source>, source: Arc<Source>,
} }
@ -34,7 +44,7 @@ impl<'lexer> Lexer<'lexer> {
let source = Arc::new(Source::new(None, input.to_string())); let source = Arc::new(Source::new(None, input.to_string()));
Self { Self {
input: input.chars(), input: input.graphemes(true),
byte: 0, byte: 0,
source, source,
} }
@ -62,30 +72,30 @@ impl<'lexer> Lexer<'lexer> {
/// Returns `true` when at the end of the input. /// Returns `true` when at the end of the input.
#[must_use] #[must_use]
pub(crate) fn eof(&self) -> bool { pub(crate) fn eof(&self) -> bool {
self.peek(0).is_none() self.current().is_none()
} }
/// Get the current character. /// Get the current grapheme without advancing.
#[must_use] #[must_use]
fn current(&self) -> Option<char> { fn current(&self) -> Option<&str> {
self.input.as_str().chars().next() self.input.clone().next()
} }
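/// Get the nth character ahead of the current character without advancing. /// Get the grapheme following the current grapheme without advancing. NOTE(review): `take(2).last()` yields the current grapheme, not `None`, when it is the last one remaining.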
#[must_use] #[must_use]
fn peek(&self, n: usize) -> Option<char> { fn peek(&self) -> Option<&str> {
self.input.as_str().chars().nth(n) self.input.clone().take(2).last()
} }
/// Advance the lexer by one character. /// Advance the lexer by one grapheme.
fn advance(&mut self) -> Option<char> { fn advance(&mut self) -> Option<&str> {
let c = self.input.next()?; let c = self.input.next()?;
self.byte += c.len_utf8(); self.byte += c.len();
Some(c) Some(c)
} }
/// Advance the lexer by one character, and then return the specified /// Advance the lexer by one grapheme, and then return the specified
/// `TokenKind`: /// `TokenKind`:
#[must_use] #[must_use]
fn advance_and(&mut self, kind: TokenKind) -> TokenKind { fn advance_and(&mut self, kind: TokenKind) -> TokenKind {
@ -98,11 +108,11 @@ impl<'lexer> Lexer<'lexer> {
fn read_word(&mut self) -> String { fn read_word(&mut self) -> String {
let mut word = String::new(); let mut word = String::new();
while let Some(c) = self.current() { while let Some(c) = self.current() {
if is_separator(c, self.peek(1)) { if is_separator(c, self.peek()) {
break; break;
} }
word.push(c); word.push_str(c);
self.advance(); self.advance();
} }
@ -115,7 +125,7 @@ impl<'lexer> Lexer<'lexer> {
// we have reached the end of input and no additional characters can be read: // we have reached the end of input and no additional characters can be read:
let c = loop { let c = loop {
match self.current() { match self.current() {
Some(c) if c.is_ascii_whitespace() => { Some(c) if c.trim_ascii().is_empty() => {
self.advance(); self.advance();
} }
Some(c) => break c, Some(c) => break c,
@ -126,27 +136,27 @@ impl<'lexer> Lexer<'lexer> {
let mut span = self.span(); let mut span = self.span();
let kind = match c { let kind = match c {
';' => self.line_comment(), ";" => self.line_comment(),
'#' if self.peek(1) == Some('|') => self.block_comment(), "#" if self.peek() == Some("|") => self.block_comment(),
'(' => self.advance_and(TokenKind::OpenParen), "(" => self.advance_and(TokenKind::OpenParen),
')' => self.advance_and(TokenKind::CloseParen), ")" => self.advance_and(TokenKind::CloseParen),
'{' => self.advance_and(TokenKind::OpenBrace), "{" => self.advance_and(TokenKind::OpenBrace),
'}' => self.advance_and(TokenKind::CloseBrace), "}" => self.advance_and(TokenKind::CloseBrace),
'[' => self.advance_and(TokenKind::OpenBracket), "[" => self.advance_and(TokenKind::OpenBracket),
']' => self.advance_and(TokenKind::CloseBracket), "]" => self.advance_and(TokenKind::CloseBracket),
'#' if self.peek(1) == Some('{') => { "#" if self.peek() == Some("{") => {
self.advance(); // '#' self.advance(); // '#'
self.advance(); // '{' self.advance(); // '{'
TokenKind::OpenHashBrace TokenKind::OpenHashBrace
} }
'0' if matches!(self.peek(1), Some('b') | Some('o') | Some('x')) => { "0" if matches!(self.peek(), Some("b") | Some("o") | Some("x")) => {
let radix = match self.peek(1) { let radix = match self.peek() {
Some('b') => 2, Some("b") => 2,
Some('o') => 8, Some("o") => 8,
Some('x') => 16, Some("x") => 16,
_ => unreachable!(), _ => unreachable!(),
}; };
@ -155,15 +165,15 @@ impl<'lexer> Lexer<'lexer> {
self.integer_literal(word, span, radix)? self.integer_literal(word, span, radix)?
} }
'0'..='9' => self.numeric_literal(span.clone())?, c if is_ascii_digit(c) => self.numeric_literal(span.clone())?,
'+' | '-' if matches!(self.peek(1), Some('0'..='9')) => { "+" | "-" if self.peek().is_some_and(|c| is_ascii_digit(c)) => {
self.numeric_literal(span.clone())? self.numeric_literal(span.clone())?
} }
'\'' => self.char_literal(span.clone())?, "'" => self.char_literal(span.clone())?,
'"' => self.string_literal(span.clone())?, "\"" => self.string_literal(span.clone())?,
':' => { ":" => {
self.advance(); // ':' self.advance(); // ':'
TokenKind::Keyword(Symbol::from(self.read_word())) TokenKind::Keyword(Symbol::from(self.read_word()))
@ -189,18 +199,18 @@ impl<'lexer> Lexer<'lexer> {
fn line_comment(&mut self) -> TokenKind { fn line_comment(&mut self) -> TokenKind {
// Line comments may start with any number of semicolons, so consume however // Line comments may start with any number of semicolons, so consume however
// many are present at the beginning of the comment: // many are present at the beginning of the comment:
while self.current().is_some_and(|c| c == ';') { while self.current().is_some_and(|c| c == ";") {
self.advance(); self.advance();
} }
// Line comments continue until a newline character is encountered: // Line comments continue until a newline character is encountered:
let mut comment = String::new(); let mut comment = String::new();
while let Some(c) = self.advance() { while let Some(c) = self.advance() {
if c == '\n' { if c == "\n" {
break; break;
} }
comment.push(c); comment.push_str(c);
} }
TokenKind::LineComment(comment.trim().into()) TokenKind::LineComment(comment.trim().into())
@ -213,16 +223,20 @@ impl<'lexer> Lexer<'lexer> {
self.advance(); // '|' self.advance(); // '|'
let mut comment = String::new(); let mut comment = String::new();
let mut pipe_found = false;
while let Some(c) = self.advance() { while let Some(c) = self.advance() {
if c == '|' && matches!(self.peek(0), Some('#')) { if pipe_found && c == "#" {
self.advance(); // '#'
break; break;
} }
comment.push(c); comment.push_str(c);
pipe_found = c == "|";
} }
TokenKind::BlockComment(comment.trim().into()) let comment = comment.trim_end_matches('|').trim();
TokenKind::BlockComment(comment.into())
} }
fn float_literal(&self, word: String, span: Span) -> Result<TokenKind, LexerError> { fn float_literal(&self, word: String, span: Span) -> Result<TokenKind, LexerError> {
@ -269,13 +283,15 @@ impl<'lexer> Lexer<'lexer> {
self.advance(); // '\'' self.advance(); // '\''
let c = match self.advance() { let c = match self.advance() {
Some('\\') => match self.advance() { Some("\\") => match self.advance() {
Some(c @ ('"' | '\\')) => c, Some(c @ ("\"" | "\\")) => c,
Some('n') => '\n', Some("n") => "\n",
Some('r') => '\r', Some("r") => "\r",
Some('t') => '\t', Some("t") => "\t",
Some('e') => '\x1b', Some("e") => "\x1b",
Some(c) => { Some(c) => {
let c = c.to_string();
self.read_word(); // Recover from the error self.read_word(); // Recover from the error
return Err(LexerError::new( return Err(LexerError::new(
LexerErrorKind::InvalidEscape(c), LexerErrorKind::InvalidEscape(c),
@ -298,7 +314,9 @@ impl<'lexer> Lexer<'lexer> {
} }
}; };
if self.advance() != Some('\'') { let c = c.to_string();
if self.advance() != Some("'") {
self.read_word(); // Recover from the error self.read_word(); // Recover from the error
return Err(LexerError::new( return Err(LexerError::new(
LexerErrorKind::UnclosedChar, LexerErrorKind::UnclosedChar,
@ -317,15 +335,17 @@ impl<'lexer> Lexer<'lexer> {
loop { loop {
let ch_span = self.span(); let ch_span = self.span();
string.push(match self.advance() { string.push_str(match self.advance() {
Some('"') => break, Some("\"") => break,
Some('\\') => match self.advance() { Some("\\") => match self.advance() {
Some(c @ ('"' | '\\')) => c, Some(c @ ("\"" | "\\")) => c,
Some('n') => '\n', Some("n") => "\n",
Some('r') => '\r', Some("r") => "\r",
Some('t') => '\t', Some("t") => "\t",
Some('e') => '\x1b', Some("e") => "\x1b",
Some(c) => { Some(c) => {
let c = c.to_string();
self.read_word(); // Recover from the error self.read_word(); // Recover from the error
return Err(LexerError::new( return Err(LexerError::new(
LexerErrorKind::InvalidEscape(c), LexerErrorKind::InvalidEscape(c),
@ -343,7 +363,7 @@ impl<'lexer> Lexer<'lexer> {
if self if self
.current() .current()
.is_some_and(|c| !is_separator(c, self.peek(1))) .is_some_and(|c| !is_separator(c, self.peek()))
{ {
self.read_word(); // Recover from the error self.read_word(); // Recover from the error
return Err(LexerError::new( return Err(LexerError::new(
@ -452,16 +472,20 @@ mod tests {
]); ]);
test!(char_literal: r"'x' '\n' '\r' '\t' '\e' '\\' '\q' 'b", [ test!(char_literal: r"'x' '\n' '\r' '\t' '\e' '\\' '\q' 'b", [
Ok(TokenKind::Char('x')), Ok(TokenKind::Char("x".into())),
Ok(TokenKind::Char('\n')), Ok(TokenKind::Char("\n".into())),
Ok(TokenKind::Char('\r')), Ok(TokenKind::Char("\r".into())),
Ok(TokenKind::Char('\t')), Ok(TokenKind::Char("\t".into())),
Ok(TokenKind::Char('\x1b')), Ok(TokenKind::Char("\x1b".into())),
Ok(TokenKind::Char('\\')), Ok(TokenKind::Char("\\".into())),
Err(LexerErrorKind::InvalidEscape('q')), Err(LexerErrorKind::InvalidEscape("q".into())),
Err(LexerErrorKind::UnclosedChar), Err(LexerErrorKind::UnclosedChar),
]); ]);
// A character literal holding a multi-code-point grapheme cluster (`y` followed
// by a combining breve) should lex to a single `Char` token carrying the whole
// cluster, in the same way that `'x'` lexes to `Char("x")`.
test!(char_literal_with_unicode: "'y̆'", [
    Ok(TokenKind::Char("y̆".into())),
]);
test!(error_unclosed_char_escape: r"'\", [ test!(error_unclosed_char_escape: r"'\", [
Err(LexerErrorKind::UnclosedChar), Err(LexerErrorKind::UnclosedChar),
]); ]);
@ -482,7 +506,7 @@ mod tests {
]); ]);
test!(error_invalid_escape_string: "\"oh no \\p\"", [ test!(error_invalid_escape_string: "\"oh no \\p\"", [
Err(LexerErrorKind::InvalidEscape('p')), Err(LexerErrorKind::InvalidEscape("p".into())),
]); ]);
test!(error_unclosed_string: "\"hiii", [ test!(error_unclosed_string: "\"hiii", [
@ -513,11 +537,14 @@ mod tests {
Ok(TokenKind::CloseParen), Ok(TokenKind::CloseParen),
]); ]);
test!(unicode_symbol: "(かわいい 🐕 😻)", [ test!(unicode_symbols: "(かわいい 🐕 😻 (föö))", [
Ok(TokenKind::OpenParen), Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("かわいい"))), Ok(TokenKind::Symbol(Symbol::from("かわいい"))),
Ok(TokenKind::Symbol(Symbol::from("🐕"))), Ok(TokenKind::Symbol(Symbol::from("🐕"))),
Ok(TokenKind::Symbol(Symbol::from("😻"))), Ok(TokenKind::Symbol(Symbol::from("😻"))),
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("föö"))),
Ok(TokenKind::CloseParen),
Ok(TokenKind::CloseParen), Ok(TokenKind::CloseParen),
]); ]);
} }

View File

@ -27,7 +27,7 @@ pub(crate) enum TokenKind {
/// Boolean, e.g. `true`, `false` /// Boolean, e.g. `true`, `false`
Bool(bool), Bool(bool),
/// Character, e.g. `'c'`, `'\n'` /// Character, e.g. `'c'`, `'\n'`
Char(char), Char(String),
/// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003` /// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003`
Float(f64), Float(f64),
/// Integer, e.g. `0`, `-1`, `+200` /// Integer, e.g. `0`, `-1`, `+200`

View File

@ -117,7 +117,7 @@ pub(crate) enum Atom {
/// Boolean, e.g. `true`, `false` /// Boolean, e.g. `true`, `false`
Bool(bool), Bool(bool),
/// Character, e.g. `'c'`, `'\n'` /// Character, e.g. `'c'`, `'\n'`
Char(char), Char(String),
/// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003` /// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003`
Float(f64), Float(f64),
/// Integer, e.g. `0`, `-1`, `+200` /// Integer, e.g. `0`, `-1`, `+200`

View File

@ -258,9 +258,9 @@ mod tests {
test!(vector: "['a' 'b' 'c']", src => Ok(Ast::from(vec![ test!(vector: "['a' 'b' 'c']", src => Ok(Ast::from(vec![
Node::new( Node::new(
Expr::Vector(vec![ Expr::Vector(vec![
Node::new(Expr::Atom(Atom::Char('a')), Span::new(1..4, src.clone())), Node::new(Expr::Atom(Atom::Char("a".into())), Span::new(1..4, src.clone())),
Node::new(Expr::Atom(Atom::Char('b')), Span::new(5..8, src.clone())), Node::new(Expr::Atom(Atom::Char("b".into())), Span::new(5..8, src.clone())),
Node::new(Expr::Atom(Atom::Char('c')), Span::new(9..12, src.clone())), Node::new(Expr::Atom(Atom::Char("c".into())), Span::new(9..12, src.clone())),
]), ]),
Span::new(0..13, src), Span::new(0..13, src),
) )