Make the lexer operate on graphemes instead of chars (still needs some work)

This commit is contained in:
Jesse Braham 2024-12-28 10:31:37 +01:00
parent 8dcdd34b65
commit e5fafd03ba
6 changed files with 108 additions and 80 deletions

View File

@ -8,6 +8,7 @@ repository.workspace = true
license.workspace = true
[dependencies]
unicode-segmentation = "1.12.0"
[lints.rust]
unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tarpaulin_include)'] }

View File

@ -4,7 +4,7 @@ use crate::span::Span;
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LexerErrorKind {
/// An invalid escape sequence was encountered.
InvalidEscape(char),
InvalidEscape(String),
/// An invalid numeric literal was encountered.
InvalidNumber(String),
/// An invalid string literal was encountered.

View File

@ -1,4 +1,6 @@
use std::{str::Chars, sync::Arc};
use std::sync::Arc;
use unicode_segmentation::{Graphemes, UnicodeSegmentation as _};
pub(crate) use self::{
error::{LexerError, LexerErrorKind},
@ -11,18 +13,26 @@ mod error;
mod symbol;
mod token;
/// Determine if the current character is a separator, performing 1-character
/// lookahead as needed to handle multi-character separators.
fn is_separator(current: char, next: Option<char>) -> bool {
current.is_ascii_whitespace()
|| matches!(current, '(' | ')' | '[' | ']' | '{' | '}' | ';')
|| (current == '#' && next.is_some_and(|c| matches!(c, '|' | '{')))
/// Determine if the current grapheme is an ASCII digit.
///
/// A grapheme is a digit exactly when it is a single byte in `'0'..='9'`;
/// multi-byte or multi-character graphemes can never qualify.
fn is_ascii_digit(current: &str) -> bool {
    current.len() == 1 && current.as_bytes()[0].is_ascii_digit()
}
/// Determine if the current grapheme is a separator, performing lookahead as
/// needed to handle multi-character separators.
///
/// Separators are ASCII whitespace, the bracketing/semicolon punctuation
/// graphemes, and the two-grapheme sequences `#|` and `#{` (recognized by
/// peeking at `next`).
fn is_separator(current: &str, next: Option<&str>) -> bool {
    // A grapheme consisting solely of ASCII whitespace (including "\r\n",
    // which segments as a single grapheme) always separates tokens.
    if current.bytes().all(|b| b.is_ascii_whitespace()) {
        return true;
    }
    match current {
        "(" | ")" | "[" | "]" | "{" | "}" | ";" => true,
        // '#' only separates when it begins a block comment or hash-brace.
        "#" => matches!(next, Some("|") | Some("{")),
        _ => false,
    }
}
/// A lexer, used by the parser.
#[derive(Debug)]
pub(crate) struct Lexer<'lexer> {
input: Chars<'lexer>,
input: Graphemes<'lexer>,
byte: usize,
source: Arc<Source>,
}
@ -34,7 +44,7 @@ impl<'lexer> Lexer<'lexer> {
let source = Arc::new(Source::new(None, input.to_string()));
Self {
input: input.chars(),
input: input.graphemes(true),
byte: 0,
source,
}
@ -62,30 +72,30 @@ impl<'lexer> Lexer<'lexer> {
/// Returns `true` when at the end of the input.
#[must_use]
pub(crate) fn eof(&self) -> bool {
self.peek(0).is_none()
self.current().is_none()
}
/// Get the current character.
/// Get the current grapheme without advancing.
#[must_use]
fn current(&self) -> Option<char> {
self.input.as_str().chars().next()
fn current(&self) -> Option<&str> {
self.input.clone().next()
}
/// Get the nth character ahead of the current character without advancing.
/// Get the nth grapheme ahead of the current grapheme without advancing.
#[must_use]
fn peek(&self, n: usize) -> Option<char> {
self.input.as_str().chars().nth(n)
fn peek(&self) -> Option<&str> {
self.input.clone().take(2).last()
}
/// Advance the lexer by one character.
fn advance(&mut self) -> Option<char> {
/// Advance the lexer by one grapheme.
fn advance(&mut self) -> Option<&str> {
let c = self.input.next()?;
self.byte += c.len_utf8();
self.byte += c.len();
Some(c)
}
/// Advance the lexer by one character, and then return the specified
/// Advance the lexer by one grapheme, and then return the specified
/// `TokenKind`:
#[must_use]
fn advance_and(&mut self, kind: TokenKind) -> TokenKind {
@ -98,11 +108,11 @@ impl<'lexer> Lexer<'lexer> {
fn read_word(&mut self) -> String {
let mut word = String::new();
while let Some(c) = self.current() {
if is_separator(c, self.peek(1)) {
if is_separator(c, self.peek()) {
break;
}
word.push(c);
word.push_str(c);
self.advance();
}
@ -115,7 +125,7 @@ impl<'lexer> Lexer<'lexer> {
// we have reached the end of input and no additional characters can be read:
let c = loop {
match self.current() {
Some(c) if c.is_ascii_whitespace() => {
Some(c) if c.trim_ascii().is_empty() => {
self.advance();
}
Some(c) => break c,
@ -126,27 +136,27 @@ impl<'lexer> Lexer<'lexer> {
let mut span = self.span();
let kind = match c {
';' => self.line_comment(),
'#' if self.peek(1) == Some('|') => self.block_comment(),
";" => self.line_comment(),
"#" if self.peek() == Some("|") => self.block_comment(),
'(' => self.advance_and(TokenKind::OpenParen),
')' => self.advance_and(TokenKind::CloseParen),
'{' => self.advance_and(TokenKind::OpenBrace),
'}' => self.advance_and(TokenKind::CloseBrace),
'[' => self.advance_and(TokenKind::OpenBracket),
']' => self.advance_and(TokenKind::CloseBracket),
'#' if self.peek(1) == Some('{') => {
"(" => self.advance_and(TokenKind::OpenParen),
")" => self.advance_and(TokenKind::CloseParen),
"{" => self.advance_and(TokenKind::OpenBrace),
"}" => self.advance_and(TokenKind::CloseBrace),
"[" => self.advance_and(TokenKind::OpenBracket),
"]" => self.advance_and(TokenKind::CloseBracket),
"#" if self.peek() == Some("{") => {
self.advance(); // '#'
self.advance(); // '{'
TokenKind::OpenHashBrace
}
'0' if matches!(self.peek(1), Some('b') | Some('o') | Some('x')) => {
let radix = match self.peek(1) {
Some('b') => 2,
Some('o') => 8,
Some('x') => 16,
"0" if matches!(self.peek(), Some("b") | Some("o") | Some("x")) => {
let radix = match self.peek() {
Some("b") => 2,
Some("o") => 8,
Some("x") => 16,
_ => unreachable!(),
};
@ -155,15 +165,15 @@ impl<'lexer> Lexer<'lexer> {
self.integer_literal(word, span, radix)?
}
'0'..='9' => self.numeric_literal(span.clone())?,
'+' | '-' if matches!(self.peek(1), Some('0'..='9')) => {
c if is_ascii_digit(c) => self.numeric_literal(span.clone())?,
"+" | "-" if self.peek().is_some_and(|c| is_ascii_digit(c)) => {
self.numeric_literal(span.clone())?
}
'\'' => self.char_literal(span.clone())?,
'"' => self.string_literal(span.clone())?,
"'" => self.char_literal(span.clone())?,
"\"" => self.string_literal(span.clone())?,
':' => {
":" => {
self.advance(); // ':'
TokenKind::Keyword(Symbol::from(self.read_word()))
@ -189,18 +199,18 @@ impl<'lexer> Lexer<'lexer> {
fn line_comment(&mut self) -> TokenKind {
// Line comments may start with any number of semicolons, so consume however
// many are present at the beginning of the comment:
while self.current().is_some_and(|c| c == ';') {
while self.current().is_some_and(|c| c == ";") {
self.advance();
}
// Line comments continue until a newline character is encountered:
let mut comment = String::new();
while let Some(c) = self.advance() {
if c == '\n' {
if c == "\n" {
break;
}
comment.push(c);
comment.push_str(c);
}
TokenKind::LineComment(comment.trim().into())
@ -213,16 +223,20 @@ impl<'lexer> Lexer<'lexer> {
self.advance(); // '|'
let mut comment = String::new();
let mut pipe_found = false;
while let Some(c) = self.advance() {
if c == '|' && matches!(self.peek(0), Some('#')) {
self.advance(); // '#'
if pipe_found && c == "#" {
break;
}
comment.push(c);
comment.push_str(c);
pipe_found = c == "|";
}
TokenKind::BlockComment(comment.trim().into())
let comment = comment.trim_end_matches('|').trim();
TokenKind::BlockComment(comment.into())
}
fn float_literal(&self, word: String, span: Span) -> Result<TokenKind, LexerError> {
@ -269,13 +283,15 @@ impl<'lexer> Lexer<'lexer> {
self.advance(); // '\''
let c = match self.advance() {
Some('\\') => match self.advance() {
Some(c @ ('"' | '\\')) => c,
Some('n') => '\n',
Some('r') => '\r',
Some('t') => '\t',
Some('e') => '\x1b',
Some("\\") => match self.advance() {
Some(c @ ("\"" | "\\")) => c,
Some("n") => "\n",
Some("r") => "\r",
Some("t") => "\t",
Some("e") => "\x1b",
Some(c) => {
let c = c.to_string();
self.read_word(); // Recover from the error
return Err(LexerError::new(
LexerErrorKind::InvalidEscape(c),
@ -298,7 +314,9 @@ impl<'lexer> Lexer<'lexer> {
}
};
if self.advance() != Some('\'') {
let c = c.to_string();
if self.advance() != Some("'") {
self.read_word(); // Recover from the error
return Err(LexerError::new(
LexerErrorKind::UnclosedChar,
@ -317,15 +335,17 @@ impl<'lexer> Lexer<'lexer> {
loop {
let ch_span = self.span();
string.push(match self.advance() {
Some('"') => break,
Some('\\') => match self.advance() {
Some(c @ ('"' | '\\')) => c,
Some('n') => '\n',
Some('r') => '\r',
Some('t') => '\t',
Some('e') => '\x1b',
string.push_str(match self.advance() {
Some("\"") => break,
Some("\\") => match self.advance() {
Some(c @ ("\"" | "\\")) => c,
Some("n") => "\n",
Some("r") => "\r",
Some("t") => "\t",
Some("e") => "\x1b",
Some(c) => {
let c = c.to_string();
self.read_word(); // Recover from the error
return Err(LexerError::new(
LexerErrorKind::InvalidEscape(c),
@ -343,7 +363,7 @@ impl<'lexer> Lexer<'lexer> {
if self
.current()
.is_some_and(|c| !is_separator(c, self.peek(1)))
.is_some_and(|c| !is_separator(c, self.peek()))
{
self.read_word(); // Recover from the error
return Err(LexerError::new(
@ -452,16 +472,20 @@ mod tests {
]);
test!(char_literal: r"'x' '\n' '\r' '\t' '\e' '\\' '\q' 'b", [
Ok(TokenKind::Char('x')),
Ok(TokenKind::Char('\n')),
Ok(TokenKind::Char('\r')),
Ok(TokenKind::Char('\t')),
Ok(TokenKind::Char('\x1b')),
Ok(TokenKind::Char('\\')),
Err(LexerErrorKind::InvalidEscape('q')),
Ok(TokenKind::Char("x".into())),
Ok(TokenKind::Char("\n".into())),
Ok(TokenKind::Char("\r".into())),
Ok(TokenKind::Char("\t".into())),
Ok(TokenKind::Char("\x1b".into())),
Ok(TokenKind::Char("\\".into())),
Err(LexerErrorKind::InvalidEscape("q".into())),
Err(LexerErrorKind::UnclosedChar),
]);
test!(char_literal_with_unicode: "'y̆'", [
Ok(TokenKind::Char("y̆".into())),
]);
test!(error_unclosed_char_escape: r"'\", [
Err(LexerErrorKind::UnclosedChar),
]);
@ -482,7 +506,7 @@ mod tests {
]);
test!(error_invalid_escape_string: "\"oh no \\p\"", [
Err(LexerErrorKind::InvalidEscape('p')),
Err(LexerErrorKind::InvalidEscape("p".into())),
]);
test!(error_unclosed_string: "\"hiii", [
@ -513,11 +537,14 @@ mod tests {
Ok(TokenKind::CloseParen),
]);
test!(unicode_symbol: "(かわいい 🐕 😻)", [
test!(unicode_symbols: "(かわいい 🐕 😻 (föö))", [
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("かわいい"))),
Ok(TokenKind::Symbol(Symbol::from("🐕"))),
Ok(TokenKind::Symbol(Symbol::from("😻"))),
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("föö"))),
Ok(TokenKind::CloseParen),
Ok(TokenKind::CloseParen),
]);
}

View File

@ -27,7 +27,7 @@ pub(crate) enum TokenKind {
/// Boolean, e.g. `true`, `false`
Bool(bool),
/// Character, e.g. `'c'`, `'\n'`
Char(char),
Char(String),
/// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003`
Float(f64),
/// Integer, e.g. `0`, `-1`, `+200`

View File

@ -117,7 +117,7 @@ pub(crate) enum Atom {
/// Boolean, e.g. `true`, `false`
Bool(bool),
/// Character, e.g. `'c'`, `'\n'`
Char(char),
Char(String),
/// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003`
Float(f64),
/// Integer, e.g. `0`, `-1`, `+200`

View File

@ -258,9 +258,9 @@ mod tests {
test!(vector: "['a' 'b' 'c']", src => Ok(Ast::from(vec![
Node::new(
Expr::Vector(vec![
Node::new(Expr::Atom(Atom::Char('a')), Span::new(1..4, src.clone())),
Node::new(Expr::Atom(Atom::Char('b')), Span::new(5..8, src.clone())),
Node::new(Expr::Atom(Atom::Char('c')), Span::new(9..12, src.clone())),
Node::new(Expr::Atom(Atom::Char("a".into())), Span::new(1..4, src.clone())),
Node::new(Expr::Atom(Atom::Char("b".into())), Span::new(5..8, src.clone())),
Node::new(Expr::Atom(Atom::Char("c".into())), Span::new(9..12, src.clone())),
]),
Span::new(0..13, src),
)