Make the lexer operate on graphemes instead of chars (still needs some work)
This commit is contained in:
parent
8dcdd34b65
commit
e5fafd03ba
@ -8,6 +8,7 @@ repository.workspace = true
|
|||||||
license.workspace = true
|
license.workspace = true
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
unicode-segmentation = "1.12.0"
|
||||||
|
|
||||||
[lints.rust]
|
[lints.rust]
|
||||||
unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tarpaulin_include)'] }
|
unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tarpaulin_include)'] }
|
||||||
|
@ -4,7 +4,7 @@ use crate::span::Span;
|
|||||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||||
pub enum LexerErrorKind {
|
pub enum LexerErrorKind {
|
||||||
/// An invalid escape sequence was encountered.
|
/// An invalid escape sequence was encountered.
|
||||||
InvalidEscape(char),
|
InvalidEscape(String),
|
||||||
/// An invalid numeric literal was encountered.
|
/// An invalid numeric literal was encountered.
|
||||||
InvalidNumber(String),
|
InvalidNumber(String),
|
||||||
/// An invalid string literal was encountered.
|
/// An invalid string literal was encountered.
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
use std::{str::Chars, sync::Arc};
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use unicode_segmentation::{Graphemes, UnicodeSegmentation as _};
|
||||||
|
|
||||||
pub(crate) use self::{
|
pub(crate) use self::{
|
||||||
error::{LexerError, LexerErrorKind},
|
error::{LexerError, LexerErrorKind},
|
||||||
@ -11,18 +13,26 @@ mod error;
|
|||||||
mod symbol;
|
mod symbol;
|
||||||
mod token;
|
mod token;
|
||||||
|
|
||||||
/// Determine if the current character is a separator, performing 1-character
|
/// Determine if the current grapheme is an ASCII digit.
|
||||||
/// lookahead as needed to handle multi-character separators.
|
fn is_ascii_digit(current: &str) -> bool {
|
||||||
fn is_separator(current: char, next: Option<char>) -> bool {
|
matches!(
|
||||||
current.is_ascii_whitespace()
|
current,
|
||||||
|| matches!(current, '(' | ')' | '[' | ']' | '{' | '}' | ';')
|
"0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
|
||||||
|| (current == '#' && next.is_some_and(|c| matches!(c, '|' | '{')))
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Determine if the current grapheme is a separator, performing lookahead as
|
||||||
|
/// needed to handle multi-character separators.
|
||||||
|
fn is_separator(current: &str, next: Option<&str>) -> bool {
|
||||||
|
current.trim_ascii().is_empty()
|
||||||
|
|| matches!(current, "(" | ")" | "[" | "]" | "{" | "}" | ";")
|
||||||
|
|| (current == "#" && next.is_some_and(|c| matches!(c, "|" | "{")))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A lexer, used by the parser.
|
/// A lexer, used by the parser.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub(crate) struct Lexer<'lexer> {
|
pub(crate) struct Lexer<'lexer> {
|
||||||
input: Chars<'lexer>,
|
input: Graphemes<'lexer>,
|
||||||
byte: usize,
|
byte: usize,
|
||||||
source: Arc<Source>,
|
source: Arc<Source>,
|
||||||
}
|
}
|
||||||
@ -34,7 +44,7 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
let source = Arc::new(Source::new(None, input.to_string()));
|
let source = Arc::new(Source::new(None, input.to_string()));
|
||||||
|
|
||||||
Self {
|
Self {
|
||||||
input: input.chars(),
|
input: input.graphemes(true),
|
||||||
byte: 0,
|
byte: 0,
|
||||||
source,
|
source,
|
||||||
}
|
}
|
||||||
@ -62,30 +72,30 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
/// Returns `true` when at the end of the input.
|
/// Returns `true` when at the end of the input.
|
||||||
#[must_use]
|
#[must_use]
|
||||||
pub(crate) fn eof(&self) -> bool {
|
pub(crate) fn eof(&self) -> bool {
|
||||||
self.peek(0).is_none()
|
self.current().is_none()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the current character.
|
/// Get the current grapheme without advancing.
|
||||||
#[must_use]
|
#[must_use]
|
||||||
fn current(&self) -> Option<char> {
|
fn current(&self) -> Option<&str> {
|
||||||
self.input.as_str().chars().next()
|
self.input.clone().next()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the nth character ahead of the current character without advancing.
|
/// Get the grapheme after the current one without advancing; if only one grapheme remains, that grapheme is returned.
|
||||||
#[must_use]
|
#[must_use]
|
||||||
fn peek(&self, n: usize) -> Option<char> {
|
fn peek(&self) -> Option<&str> {
|
||||||
self.input.as_str().chars().nth(n)
|
self.input.clone().take(2).last()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Advance the lexer by one character.
|
/// Advance the lexer by one grapheme.
|
||||||
fn advance(&mut self) -> Option<char> {
|
fn advance(&mut self) -> Option<&str> {
|
||||||
let c = self.input.next()?;
|
let c = self.input.next()?;
|
||||||
self.byte += c.len_utf8();
|
self.byte += c.len();
|
||||||
|
|
||||||
Some(c)
|
Some(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Advance the lexer by one character, and then return the specified
|
/// Advance the lexer by one grapheme, and then return the specified
|
||||||
/// `TokenKind`:
|
/// `TokenKind`:
|
||||||
#[must_use]
|
#[must_use]
|
||||||
fn advance_and(&mut self, kind: TokenKind) -> TokenKind {
|
fn advance_and(&mut self, kind: TokenKind) -> TokenKind {
|
||||||
@ -98,11 +108,11 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
fn read_word(&mut self) -> String {
|
fn read_word(&mut self) -> String {
|
||||||
let mut word = String::new();
|
let mut word = String::new();
|
||||||
while let Some(c) = self.current() {
|
while let Some(c) = self.current() {
|
||||||
if is_separator(c, self.peek(1)) {
|
if is_separator(c, self.peek()) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
word.push(c);
|
word.push_str(c);
|
||||||
self.advance();
|
self.advance();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -115,7 +125,7 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
// we have reached the end of input and no additional characters can be read:
|
// we have reached the end of input and no additional characters can be read:
|
||||||
let c = loop {
|
let c = loop {
|
||||||
match self.current() {
|
match self.current() {
|
||||||
Some(c) if c.is_ascii_whitespace() => {
|
Some(c) if c.trim_ascii().is_empty() => {
|
||||||
self.advance();
|
self.advance();
|
||||||
}
|
}
|
||||||
Some(c) => break c,
|
Some(c) => break c,
|
||||||
@ -126,27 +136,27 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
let mut span = self.span();
|
let mut span = self.span();
|
||||||
|
|
||||||
let kind = match c {
|
let kind = match c {
|
||||||
';' => self.line_comment(),
|
";" => self.line_comment(),
|
||||||
'#' if self.peek(1) == Some('|') => self.block_comment(),
|
"#" if self.peek() == Some("|") => self.block_comment(),
|
||||||
|
|
||||||
'(' => self.advance_and(TokenKind::OpenParen),
|
"(" => self.advance_and(TokenKind::OpenParen),
|
||||||
')' => self.advance_and(TokenKind::CloseParen),
|
")" => self.advance_and(TokenKind::CloseParen),
|
||||||
'{' => self.advance_and(TokenKind::OpenBrace),
|
"{" => self.advance_and(TokenKind::OpenBrace),
|
||||||
'}' => self.advance_and(TokenKind::CloseBrace),
|
"}" => self.advance_and(TokenKind::CloseBrace),
|
||||||
'[' => self.advance_and(TokenKind::OpenBracket),
|
"[" => self.advance_and(TokenKind::OpenBracket),
|
||||||
']' => self.advance_and(TokenKind::CloseBracket),
|
"]" => self.advance_and(TokenKind::CloseBracket),
|
||||||
'#' if self.peek(1) == Some('{') => {
|
"#" if self.peek() == Some("{") => {
|
||||||
self.advance(); // '#'
|
self.advance(); // '#'
|
||||||
self.advance(); // '{'
|
self.advance(); // '{'
|
||||||
|
|
||||||
TokenKind::OpenHashBrace
|
TokenKind::OpenHashBrace
|
||||||
}
|
}
|
||||||
|
|
||||||
'0' if matches!(self.peek(1), Some('b') | Some('o') | Some('x')) => {
|
"0" if matches!(self.peek(), Some("b") | Some("o") | Some("x")) => {
|
||||||
let radix = match self.peek(1) {
|
let radix = match self.peek() {
|
||||||
Some('b') => 2,
|
Some("b") => 2,
|
||||||
Some('o') => 8,
|
Some("o") => 8,
|
||||||
Some('x') => 16,
|
Some("x") => 16,
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -155,15 +165,15 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
|
|
||||||
self.integer_literal(word, span, radix)?
|
self.integer_literal(word, span, radix)?
|
||||||
}
|
}
|
||||||
'0'..='9' => self.numeric_literal(span.clone())?,
|
c if is_ascii_digit(c) => self.numeric_literal(span.clone())?,
|
||||||
'+' | '-' if matches!(self.peek(1), Some('0'..='9')) => {
|
"+" | "-" if self.peek().is_some_and(|c| is_ascii_digit(c)) => {
|
||||||
self.numeric_literal(span.clone())?
|
self.numeric_literal(span.clone())?
|
||||||
}
|
}
|
||||||
|
|
||||||
'\'' => self.char_literal(span.clone())?,
|
"'" => self.char_literal(span.clone())?,
|
||||||
'"' => self.string_literal(span.clone())?,
|
"\"" => self.string_literal(span.clone())?,
|
||||||
|
|
||||||
':' => {
|
":" => {
|
||||||
self.advance(); // ':'
|
self.advance(); // ':'
|
||||||
|
|
||||||
TokenKind::Keyword(Symbol::from(self.read_word()))
|
TokenKind::Keyword(Symbol::from(self.read_word()))
|
||||||
@ -189,18 +199,18 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
fn line_comment(&mut self) -> TokenKind {
|
fn line_comment(&mut self) -> TokenKind {
|
||||||
// Line comments may start with any number of semicolons, so consume however
|
// Line comments may start with any number of semicolons, so consume however
|
||||||
// many are present at the beginning of the comment:
|
// many are present at the beginning of the comment:
|
||||||
while self.current().is_some_and(|c| c == ';') {
|
while self.current().is_some_and(|c| c == ";") {
|
||||||
self.advance();
|
self.advance();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Line comments continue until a newline character is encountered:
|
// Line comments continue until a newline character is encountered:
|
||||||
let mut comment = String::new();
|
let mut comment = String::new();
|
||||||
while let Some(c) = self.advance() {
|
while let Some(c) = self.advance() {
|
||||||
if c == '\n' {
|
if c == "\n" {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
comment.push(c);
|
comment.push_str(c);
|
||||||
}
|
}
|
||||||
|
|
||||||
TokenKind::LineComment(comment.trim().into())
|
TokenKind::LineComment(comment.trim().into())
|
||||||
@ -213,16 +223,20 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
self.advance(); // '|'
|
self.advance(); // '|'
|
||||||
|
|
||||||
let mut comment = String::new();
|
let mut comment = String::new();
|
||||||
|
let mut pipe_found = false;
|
||||||
|
|
||||||
while let Some(c) = self.advance() {
|
while let Some(c) = self.advance() {
|
||||||
if c == '|' && matches!(self.peek(0), Some('#')) {
|
if pipe_found && c == "#" {
|
||||||
self.advance(); // '#'
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
comment.push(c);
|
comment.push_str(c);
|
||||||
|
pipe_found = c == "|";
|
||||||
}
|
}
|
||||||
|
|
||||||
TokenKind::BlockComment(comment.trim().into())
|
let comment = comment.trim_end_matches('|').trim();
|
||||||
|
|
||||||
|
TokenKind::BlockComment(comment.into())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn float_literal(&self, word: String, span: Span) -> Result<TokenKind, LexerError> {
|
fn float_literal(&self, word: String, span: Span) -> Result<TokenKind, LexerError> {
|
||||||
@ -269,13 +283,15 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
self.advance(); // '\''
|
self.advance(); // '\''
|
||||||
|
|
||||||
let c = match self.advance() {
|
let c = match self.advance() {
|
||||||
Some('\\') => match self.advance() {
|
Some("\\") => match self.advance() {
|
||||||
Some(c @ ('"' | '\\')) => c,
|
Some(c @ ("\"" | "\\")) => c,
|
||||||
Some('n') => '\n',
|
Some("n") => "\n",
|
||||||
Some('r') => '\r',
|
Some("r") => "\r",
|
||||||
Some('t') => '\t',
|
Some("t") => "\t",
|
||||||
Some('e') => '\x1b',
|
Some("e") => "\x1b",
|
||||||
Some(c) => {
|
Some(c) => {
|
||||||
|
let c = c.to_string();
|
||||||
|
|
||||||
self.read_word(); // Recover from the error
|
self.read_word(); // Recover from the error
|
||||||
return Err(LexerError::new(
|
return Err(LexerError::new(
|
||||||
LexerErrorKind::InvalidEscape(c),
|
LexerErrorKind::InvalidEscape(c),
|
||||||
@ -298,7 +314,9 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
if self.advance() != Some('\'') {
|
let c = c.to_string();
|
||||||
|
|
||||||
|
if self.advance() != Some("'") {
|
||||||
self.read_word(); // Recover from the error
|
self.read_word(); // Recover from the error
|
||||||
return Err(LexerError::new(
|
return Err(LexerError::new(
|
||||||
LexerErrorKind::UnclosedChar,
|
LexerErrorKind::UnclosedChar,
|
||||||
@ -317,15 +335,17 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
|
|
||||||
loop {
|
loop {
|
||||||
let ch_span = self.span();
|
let ch_span = self.span();
|
||||||
string.push(match self.advance() {
|
string.push_str(match self.advance() {
|
||||||
Some('"') => break,
|
Some("\"") => break,
|
||||||
Some('\\') => match self.advance() {
|
Some("\\") => match self.advance() {
|
||||||
Some(c @ ('"' | '\\')) => c,
|
Some(c @ ("\"" | "\\")) => c,
|
||||||
Some('n') => '\n',
|
Some("n") => "\n",
|
||||||
Some('r') => '\r',
|
Some("r") => "\r",
|
||||||
Some('t') => '\t',
|
Some("t") => "\t",
|
||||||
Some('e') => '\x1b',
|
Some("e") => "\x1b",
|
||||||
Some(c) => {
|
Some(c) => {
|
||||||
|
let c = c.to_string();
|
||||||
|
|
||||||
self.read_word(); // Recover from the error
|
self.read_word(); // Recover from the error
|
||||||
return Err(LexerError::new(
|
return Err(LexerError::new(
|
||||||
LexerErrorKind::InvalidEscape(c),
|
LexerErrorKind::InvalidEscape(c),
|
||||||
@ -343,7 +363,7 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
|
|
||||||
if self
|
if self
|
||||||
.current()
|
.current()
|
||||||
.is_some_and(|c| !is_separator(c, self.peek(1)))
|
.is_some_and(|c| !is_separator(c, self.peek()))
|
||||||
{
|
{
|
||||||
self.read_word(); // Recover from the error
|
self.read_word(); // Recover from the error
|
||||||
return Err(LexerError::new(
|
return Err(LexerError::new(
|
||||||
@ -452,16 +472,20 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
|
|
||||||
test!(char_literal: r"'x' '\n' '\r' '\t' '\e' '\\' '\q' 'b", [
|
test!(char_literal: r"'x' '\n' '\r' '\t' '\e' '\\' '\q' 'b", [
|
||||||
Ok(TokenKind::Char('x')),
|
Ok(TokenKind::Char("x".into())),
|
||||||
Ok(TokenKind::Char('\n')),
|
Ok(TokenKind::Char("\n".into())),
|
||||||
Ok(TokenKind::Char('\r')),
|
Ok(TokenKind::Char("\r".into())),
|
||||||
Ok(TokenKind::Char('\t')),
|
Ok(TokenKind::Char("\t".into())),
|
||||||
Ok(TokenKind::Char('\x1b')),
|
Ok(TokenKind::Char("\x1b".into())),
|
||||||
Ok(TokenKind::Char('\\')),
|
Ok(TokenKind::Char("\\".into())),
|
||||||
Err(LexerErrorKind::InvalidEscape('q')),
|
Err(LexerErrorKind::InvalidEscape("q".into())),
|
||||||
Err(LexerErrorKind::UnclosedChar),
|
Err(LexerErrorKind::UnclosedChar),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
test!(char_literal_with_unicode: "'y̆'", [
|
||||||
|
Ok(TokenKind::Char("y̆".into())),
|
||||||
|
]);
|
||||||
|
|
||||||
test!(error_unclosed_char_escape: r"'\", [
|
test!(error_unclosed_char_escape: r"'\", [
|
||||||
Err(LexerErrorKind::UnclosedChar),
|
Err(LexerErrorKind::UnclosedChar),
|
||||||
]);
|
]);
|
||||||
@ -482,7 +506,7 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
|
|
||||||
test!(error_invalid_escape_string: "\"oh no \\p\"", [
|
test!(error_invalid_escape_string: "\"oh no \\p\"", [
|
||||||
Err(LexerErrorKind::InvalidEscape('p')),
|
Err(LexerErrorKind::InvalidEscape("p".into())),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
test!(error_unclosed_string: "\"hiii", [
|
test!(error_unclosed_string: "\"hiii", [
|
||||||
@ -513,11 +537,14 @@ mod tests {
|
|||||||
Ok(TokenKind::CloseParen),
|
Ok(TokenKind::CloseParen),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
test!(unicode_symbol: "(かわいい 🐕 😻)", [
|
test!(unicode_symbols: "(かわいい 🐕 😻 (föö))", [
|
||||||
Ok(TokenKind::OpenParen),
|
Ok(TokenKind::OpenParen),
|
||||||
Ok(TokenKind::Symbol(Symbol::from("かわいい"))),
|
Ok(TokenKind::Symbol(Symbol::from("かわいい"))),
|
||||||
Ok(TokenKind::Symbol(Symbol::from("🐕"))),
|
Ok(TokenKind::Symbol(Symbol::from("🐕"))),
|
||||||
Ok(TokenKind::Symbol(Symbol::from("😻"))),
|
Ok(TokenKind::Symbol(Symbol::from("😻"))),
|
||||||
|
Ok(TokenKind::OpenParen),
|
||||||
|
Ok(TokenKind::Symbol(Symbol::from("föö"))),
|
||||||
|
Ok(TokenKind::CloseParen),
|
||||||
Ok(TokenKind::CloseParen),
|
Ok(TokenKind::CloseParen),
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
@ -27,7 +27,7 @@ pub(crate) enum TokenKind {
|
|||||||
/// Boolean, e.g. `true`, `false`
|
/// Boolean, e.g. `true`, `false`
|
||||||
Bool(bool),
|
Bool(bool),
|
||||||
/// Character, e.g. `'c'`, `'\n'`
|
/// Character, e.g. `'c'`, `'\n'`
|
||||||
Char(char),
|
Char(String),
|
||||||
/// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003`
|
/// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003`
|
||||||
Float(f64),
|
Float(f64),
|
||||||
/// Integer, e.g. `0`, `-1`, `+200`
|
/// Integer, e.g. `0`, `-1`, `+200`
|
||||||
|
@ -117,7 +117,7 @@ pub(crate) enum Atom {
|
|||||||
/// Boolean, e.g. `true`, `false`
|
/// Boolean, e.g. `true`, `false`
|
||||||
Bool(bool),
|
Bool(bool),
|
||||||
/// Character, e.g. `'c'`, `'\n'`
|
/// Character, e.g. `'c'`, `'\n'`
|
||||||
Char(char),
|
Char(String),
|
||||||
/// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003`
|
/// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003`
|
||||||
Float(f64),
|
Float(f64),
|
||||||
/// Integer, e.g. `0`, `-1`, `+200`
|
/// Integer, e.g. `0`, `-1`, `+200`
|
||||||
|
@ -258,9 +258,9 @@ mod tests {
|
|||||||
test!(vector: "['a' 'b' 'c']", src => Ok(Ast::from(vec![
|
test!(vector: "['a' 'b' 'c']", src => Ok(Ast::from(vec![
|
||||||
Node::new(
|
Node::new(
|
||||||
Expr::Vector(vec![
|
Expr::Vector(vec![
|
||||||
Node::new(Expr::Atom(Atom::Char('a')), Span::new(1..4, src.clone())),
|
Node::new(Expr::Atom(Atom::Char("a".into())), Span::new(1..4, src.clone())),
|
||||||
Node::new(Expr::Atom(Atom::Char('b')), Span::new(5..8, src.clone())),
|
Node::new(Expr::Atom(Atom::Char("b".into())), Span::new(5..8, src.clone())),
|
||||||
Node::new(Expr::Atom(Atom::Char('c')), Span::new(9..12, src.clone())),
|
Node::new(Expr::Atom(Atom::Char("c".into())), Span::new(9..12, src.clone())),
|
||||||
]),
|
]),
|
||||||
Span::new(0..13, src),
|
Span::new(0..13, src),
|
||||||
)
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user