Make the lexer operate on graphemes instead of chars (still needs some work)
This commit is contained in:
parent
8dcdd34b65
commit
e5fafd03ba
@ -8,6 +8,7 @@ repository.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
unicode-segmentation = "1.12.0"
|
||||
|
||||
[lints.rust]
|
||||
unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tarpaulin_include)'] }
|
||||
|
@ -4,7 +4,7 @@ use crate::span::Span;
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub enum LexerErrorKind {
|
||||
/// An invalid escape sequence was encountered.
|
||||
InvalidEscape(char),
|
||||
InvalidEscape(String),
|
||||
/// An invalid numeric literal was encountered.
|
||||
InvalidNumber(String),
|
||||
/// An invalid string literal was encountered.
|
||||
|
@ -1,4 +1,6 @@
|
||||
use std::{str::Chars, sync::Arc};
|
||||
use std::sync::Arc;
|
||||
|
||||
use unicode_segmentation::{Graphemes, UnicodeSegmentation as _};
|
||||
|
||||
pub(crate) use self::{
|
||||
error::{LexerError, LexerErrorKind},
|
||||
@ -11,18 +13,26 @@ mod error;
|
||||
mod symbol;
|
||||
mod token;
|
||||
|
||||
/// Determine if the current character is a separator, performing 1-character
|
||||
/// lookahead as needed to handle multi-character separators.
|
||||
fn is_separator(current: char, next: Option<char>) -> bool {
|
||||
current.is_ascii_whitespace()
|
||||
|| matches!(current, '(' | ')' | '[' | ']' | '{' | '}' | ';')
|
||||
|| (current == '#' && next.is_some_and(|c| matches!(c, '|' | '{')))
|
||||
/// Report whether `current` is exactly one ASCII decimal digit (`0` through
/// `9`).
///
/// The empty string, multi-digit strings and non-ASCII digits are all
/// rejected.
fn is_ascii_digit(current: &str) -> bool {
    // Only a single byte in the range `b'0'..=b'9'` qualifies.
    matches!(current.as_bytes(), [b'0'..=b'9'])
}
|
||||
|
||||
/// Decide whether the grapheme `current` ends a word.
///
/// A grapheme is a separator when it is made up solely of ASCII whitespace
/// (an empty string also counts), when it is one of `(`, `)`, `[`, `]`, `{`,
/// `}` or `;`, or when it is a `#` whose following grapheme `next` is `|` or
/// `{`.
fn is_separator(current: &str, next: Option<&str>) -> bool {
    if current.bytes().all(|byte| byte.is_ascii_whitespace()) {
        return true;
    }

    match current {
        "(" | ")" | "[" | "]" | "{" | "}" | ";" => true,
        // `#` only separates as the first half of `#|` or `#{`.
        "#" => matches!(next, Some("|" | "{")),
        _ => false,
    }
}
|
||||
|
||||
/// A lexer, used by the parser.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Lexer<'lexer> {
|
||||
input: Chars<'lexer>,
|
||||
input: Graphemes<'lexer>,
|
||||
byte: usize,
|
||||
source: Arc<Source>,
|
||||
}
|
||||
@ -34,7 +44,7 @@ impl<'lexer> Lexer<'lexer> {
|
||||
let source = Arc::new(Source::new(None, input.to_string()));
|
||||
|
||||
Self {
|
||||
input: input.chars(),
|
||||
input: input.graphemes(true),
|
||||
byte: 0,
|
||||
source,
|
||||
}
|
||||
@ -62,30 +72,30 @@ impl<'lexer> Lexer<'lexer> {
|
||||
/// Returns `true` when at the end of the input.
|
||||
#[must_use]
|
||||
pub(crate) fn eof(&self) -> bool {
|
||||
self.peek(0).is_none()
|
||||
self.current().is_none()
|
||||
}
|
||||
|
||||
/// Get the current character.
|
||||
/// Get the current grapheme without advancing.
|
||||
#[must_use]
|
||||
fn current(&self) -> Option<char> {
|
||||
self.input.as_str().chars().next()
|
||||
fn current(&self) -> Option<&str> {
|
||||
self.input.clone().next()
|
||||
}
|
||||
|
||||
/// Get the nth character ahead of the current character without advancing.
|
||||
/// Get the nth grapheme ahead of the current grapheme without advancing.
|
||||
#[must_use]
|
||||
fn peek(&self, n: usize) -> Option<char> {
|
||||
self.input.as_str().chars().nth(n)
|
||||
fn peek(&self) -> Option<&str> {
|
||||
self.input.clone().take(2).last()
|
||||
}
|
||||
|
||||
/// Advance the lexer by one character.
|
||||
fn advance(&mut self) -> Option<char> {
|
||||
/// Advance the lexer by one grapheme.
|
||||
fn advance(&mut self) -> Option<&str> {
|
||||
let c = self.input.next()?;
|
||||
self.byte += c.len_utf8();
|
||||
self.byte += c.len();
|
||||
|
||||
Some(c)
|
||||
}
|
||||
|
||||
/// Advance the lexer by one character, and then return the specified
|
||||
/// Advance the lexer by one grapheme, and then return the specified
|
||||
/// `TokenKind`:
|
||||
#[must_use]
|
||||
fn advance_and(&mut self, kind: TokenKind) -> TokenKind {
|
||||
@ -98,11 +108,11 @@ impl<'lexer> Lexer<'lexer> {
|
||||
fn read_word(&mut self) -> String {
|
||||
let mut word = String::new();
|
||||
while let Some(c) = self.current() {
|
||||
if is_separator(c, self.peek(1)) {
|
||||
if is_separator(c, self.peek()) {
|
||||
break;
|
||||
}
|
||||
|
||||
word.push(c);
|
||||
word.push_str(c);
|
||||
self.advance();
|
||||
}
|
||||
|
||||
@ -115,7 +125,7 @@ impl<'lexer> Lexer<'lexer> {
|
||||
// we have reached the end of input and no additional characters can be read:
|
||||
let c = loop {
|
||||
match self.current() {
|
||||
Some(c) if c.is_ascii_whitespace() => {
|
||||
Some(c) if c.trim_ascii().is_empty() => {
|
||||
self.advance();
|
||||
}
|
||||
Some(c) => break c,
|
||||
@ -126,27 +136,27 @@ impl<'lexer> Lexer<'lexer> {
|
||||
let mut span = self.span();
|
||||
|
||||
let kind = match c {
|
||||
';' => self.line_comment(),
|
||||
'#' if self.peek(1) == Some('|') => self.block_comment(),
|
||||
";" => self.line_comment(),
|
||||
"#" if self.peek() == Some("|") => self.block_comment(),
|
||||
|
||||
'(' => self.advance_and(TokenKind::OpenParen),
|
||||
')' => self.advance_and(TokenKind::CloseParen),
|
||||
'{' => self.advance_and(TokenKind::OpenBrace),
|
||||
'}' => self.advance_and(TokenKind::CloseBrace),
|
||||
'[' => self.advance_and(TokenKind::OpenBracket),
|
||||
']' => self.advance_and(TokenKind::CloseBracket),
|
||||
'#' if self.peek(1) == Some('{') => {
|
||||
"(" => self.advance_and(TokenKind::OpenParen),
|
||||
")" => self.advance_and(TokenKind::CloseParen),
|
||||
"{" => self.advance_and(TokenKind::OpenBrace),
|
||||
"}" => self.advance_and(TokenKind::CloseBrace),
|
||||
"[" => self.advance_and(TokenKind::OpenBracket),
|
||||
"]" => self.advance_and(TokenKind::CloseBracket),
|
||||
"#" if self.peek() == Some("{") => {
|
||||
self.advance(); // '#'
|
||||
self.advance(); // '{'
|
||||
|
||||
TokenKind::OpenHashBrace
|
||||
}
|
||||
|
||||
'0' if matches!(self.peek(1), Some('b') | Some('o') | Some('x')) => {
|
||||
let radix = match self.peek(1) {
|
||||
Some('b') => 2,
|
||||
Some('o') => 8,
|
||||
Some('x') => 16,
|
||||
"0" if matches!(self.peek(), Some("b") | Some("o") | Some("x")) => {
|
||||
let radix = match self.peek() {
|
||||
Some("b") => 2,
|
||||
Some("o") => 8,
|
||||
Some("x") => 16,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
@ -155,15 +165,15 @@ impl<'lexer> Lexer<'lexer> {
|
||||
|
||||
self.integer_literal(word, span, radix)?
|
||||
}
|
||||
'0'..='9' => self.numeric_literal(span.clone())?,
|
||||
'+' | '-' if matches!(self.peek(1), Some('0'..='9')) => {
|
||||
c if is_ascii_digit(c) => self.numeric_literal(span.clone())?,
|
||||
"+" | "-" if self.peek().is_some_and(|c| is_ascii_digit(c)) => {
|
||||
self.numeric_literal(span.clone())?
|
||||
}
|
||||
|
||||
'\'' => self.char_literal(span.clone())?,
|
||||
'"' => self.string_literal(span.clone())?,
|
||||
"'" => self.char_literal(span.clone())?,
|
||||
"\"" => self.string_literal(span.clone())?,
|
||||
|
||||
':' => {
|
||||
":" => {
|
||||
self.advance(); // ':'
|
||||
|
||||
TokenKind::Keyword(Symbol::from(self.read_word()))
|
||||
@ -189,18 +199,18 @@ impl<'lexer> Lexer<'lexer> {
|
||||
fn line_comment(&mut self) -> TokenKind {
|
||||
// Line comments may start with any number of semicolons, so consume however
|
||||
// many are present at the beginning of the comment:
|
||||
while self.current().is_some_and(|c| c == ';') {
|
||||
while self.current().is_some_and(|c| c == ";") {
|
||||
self.advance();
|
||||
}
|
||||
|
||||
// Line comments continue until a newline character is encountered:
|
||||
let mut comment = String::new();
|
||||
while let Some(c) = self.advance() {
|
||||
if c == '\n' {
|
||||
if c == "\n" {
|
||||
break;
|
||||
}
|
||||
|
||||
comment.push(c);
|
||||
comment.push_str(c);
|
||||
}
|
||||
|
||||
TokenKind::LineComment(comment.trim().into())
|
||||
@ -213,16 +223,20 @@ impl<'lexer> Lexer<'lexer> {
|
||||
self.advance(); // '|'
|
||||
|
||||
let mut comment = String::new();
|
||||
let mut pipe_found = false;
|
||||
|
||||
while let Some(c) = self.advance() {
|
||||
if c == '|' && matches!(self.peek(0), Some('#')) {
|
||||
self.advance(); // '#'
|
||||
if pipe_found && c == "#" {
|
||||
break;
|
||||
}
|
||||
|
||||
comment.push(c);
|
||||
comment.push_str(c);
|
||||
pipe_found = c == "|";
|
||||
}
|
||||
|
||||
TokenKind::BlockComment(comment.trim().into())
|
||||
let comment = comment.trim_end_matches('|').trim();
|
||||
|
||||
TokenKind::BlockComment(comment.into())
|
||||
}
|
||||
|
||||
fn float_literal(&self, word: String, span: Span) -> Result<TokenKind, LexerError> {
|
||||
@ -269,13 +283,15 @@ impl<'lexer> Lexer<'lexer> {
|
||||
self.advance(); // '\''
|
||||
|
||||
let c = match self.advance() {
|
||||
Some('\\') => match self.advance() {
|
||||
Some(c @ ('"' | '\\')) => c,
|
||||
Some('n') => '\n',
|
||||
Some('r') => '\r',
|
||||
Some('t') => '\t',
|
||||
Some('e') => '\x1b',
|
||||
Some("\\") => match self.advance() {
|
||||
Some(c @ ("\"" | "\\")) => c,
|
||||
Some("n") => "\n",
|
||||
Some("r") => "\r",
|
||||
Some("t") => "\t",
|
||||
Some("e") => "\x1b",
|
||||
Some(c) => {
|
||||
let c = c.to_string();
|
||||
|
||||
self.read_word(); // Recover from the error
|
||||
return Err(LexerError::new(
|
||||
LexerErrorKind::InvalidEscape(c),
|
||||
@ -298,7 +314,9 @@ impl<'lexer> Lexer<'lexer> {
|
||||
}
|
||||
};
|
||||
|
||||
if self.advance() != Some('\'') {
|
||||
let c = c.to_string();
|
||||
|
||||
if self.advance() != Some("'") {
|
||||
self.read_word(); // Recover from the error
|
||||
return Err(LexerError::new(
|
||||
LexerErrorKind::UnclosedChar,
|
||||
@ -317,15 +335,17 @@ impl<'lexer> Lexer<'lexer> {
|
||||
|
||||
loop {
|
||||
let ch_span = self.span();
|
||||
string.push(match self.advance() {
|
||||
Some('"') => break,
|
||||
Some('\\') => match self.advance() {
|
||||
Some(c @ ('"' | '\\')) => c,
|
||||
Some('n') => '\n',
|
||||
Some('r') => '\r',
|
||||
Some('t') => '\t',
|
||||
Some('e') => '\x1b',
|
||||
string.push_str(match self.advance() {
|
||||
Some("\"") => break,
|
||||
Some("\\") => match self.advance() {
|
||||
Some(c @ ("\"" | "\\")) => c,
|
||||
Some("n") => "\n",
|
||||
Some("r") => "\r",
|
||||
Some("t") => "\t",
|
||||
Some("e") => "\x1b",
|
||||
Some(c) => {
|
||||
let c = c.to_string();
|
||||
|
||||
self.read_word(); // Recover from the error
|
||||
return Err(LexerError::new(
|
||||
LexerErrorKind::InvalidEscape(c),
|
||||
@ -343,7 +363,7 @@ impl<'lexer> Lexer<'lexer> {
|
||||
|
||||
if self
|
||||
.current()
|
||||
.is_some_and(|c| !is_separator(c, self.peek(1)))
|
||||
.is_some_and(|c| !is_separator(c, self.peek()))
|
||||
{
|
||||
self.read_word(); // Recover from the error
|
||||
return Err(LexerError::new(
|
||||
@ -452,16 +472,20 @@ mod tests {
|
||||
]);
|
||||
|
||||
test!(char_literal: r"'x' '\n' '\r' '\t' '\e' '\\' '\q' 'b", [
|
||||
Ok(TokenKind::Char('x')),
|
||||
Ok(TokenKind::Char('\n')),
|
||||
Ok(TokenKind::Char('\r')),
|
||||
Ok(TokenKind::Char('\t')),
|
||||
Ok(TokenKind::Char('\x1b')),
|
||||
Ok(TokenKind::Char('\\')),
|
||||
Err(LexerErrorKind::InvalidEscape('q')),
|
||||
Ok(TokenKind::Char("x".into())),
|
||||
Ok(TokenKind::Char("\n".into())),
|
||||
Ok(TokenKind::Char("\r".into())),
|
||||
Ok(TokenKind::Char("\t".into())),
|
||||
Ok(TokenKind::Char("\x1b".into())),
|
||||
Ok(TokenKind::Char("\\".into())),
|
||||
Err(LexerErrorKind::InvalidEscape("q".into())),
|
||||
Err(LexerErrorKind::UnclosedChar),
|
||||
]);
|
||||
|
||||
test!(char_literal_with_unicode: "'y̆'", [
|
||||
Ok(TokenKind::Char("y̆".into())),
|
||||
]);
|
||||
|
||||
test!(error_unclosed_char_escape: r"'\", [
|
||||
Err(LexerErrorKind::UnclosedChar),
|
||||
]);
|
||||
@ -482,7 +506,7 @@ mod tests {
|
||||
]);
|
||||
|
||||
test!(error_invalid_escape_string: "\"oh no \\p\"", [
|
||||
Err(LexerErrorKind::InvalidEscape('p')),
|
||||
Err(LexerErrorKind::InvalidEscape("p".into())),
|
||||
]);
|
||||
|
||||
test!(error_unclosed_string: "\"hiii", [
|
||||
@ -513,11 +537,14 @@ mod tests {
|
||||
Ok(TokenKind::CloseParen),
|
||||
]);
|
||||
|
||||
test!(unicode_symbol: "(かわいい 🐕 😻)", [
|
||||
test!(unicode_symbols: "(かわいい 🐕 😻 (föö))", [
|
||||
Ok(TokenKind::OpenParen),
|
||||
Ok(TokenKind::Symbol(Symbol::from("かわいい"))),
|
||||
Ok(TokenKind::Symbol(Symbol::from("🐕"))),
|
||||
Ok(TokenKind::Symbol(Symbol::from("😻"))),
|
||||
Ok(TokenKind::OpenParen),
|
||||
Ok(TokenKind::Symbol(Symbol::from("föö"))),
|
||||
Ok(TokenKind::CloseParen),
|
||||
Ok(TokenKind::CloseParen),
|
||||
]);
|
||||
}
|
||||
|
@ -27,7 +27,7 @@ pub(crate) enum TokenKind {
|
||||
/// Boolean, e.g. `true`, `false`
|
||||
Bool(bool),
|
||||
/// Character, e.g. `'c'`, `'\n'`
|
||||
Char(char),
|
||||
Char(String),
|
||||
/// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003`
|
||||
Float(f64),
|
||||
/// Integer, e.g. `0`, `-1`, `+200`
|
||||
|
@ -117,7 +117,7 @@ pub(crate) enum Atom {
|
||||
/// Boolean, e.g. `true`, `false`
|
||||
Bool(bool),
|
||||
/// Character, e.g. `'c'`, `'\n'`
|
||||
Char(char),
|
||||
Char(String),
|
||||
/// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003`
|
||||
Float(f64),
|
||||
/// Integer, e.g. `0`, `-1`, `+200`
|
||||
|
@ -258,9 +258,9 @@ mod tests {
|
||||
test!(vector: "['a' 'b' 'c']", src => Ok(Ast::from(vec![
|
||||
Node::new(
|
||||
Expr::Vector(vec![
|
||||
Node::new(Expr::Atom(Atom::Char('a')), Span::new(1..4, src.clone())),
|
||||
Node::new(Expr::Atom(Atom::Char('b')), Span::new(5..8, src.clone())),
|
||||
Node::new(Expr::Atom(Atom::Char('c')), Span::new(9..12, src.clone())),
|
||||
Node::new(Expr::Atom(Atom::Char("a".into())), Span::new(1..4, src.clone())),
|
||||
Node::new(Expr::Atom(Atom::Char("b".into())), Span::new(5..8, src.clone())),
|
||||
Node::new(Expr::Atom(Atom::Char("c".into())), Span::new(9..12, src.clone())),
|
||||
]),
|
||||
Span::new(0..13, src),
|
||||
)
|
||||
|
Loading…
Reference in New Issue
Block a user