More lexer refactoring; support named character literals (e.g. newline, tab).
This commit is contained in:
		
							parent
							
								
									2d95a58ce7
								
							
						
					
					
						commit
						ece6645e50
					
				| @ -108,6 +108,11 @@ impl<'a> Lexer<'a> { | ||||
|         self.chars.peek() | ||||
|     } | ||||
| 
 | ||||
|     #[inline] | ||||
|     fn peek_some_not_separator(&mut self) -> bool { | ||||
|         self.peek().is_some_and(|c| !is_separator(c)) | ||||
|     } | ||||
| 
 | ||||
|     fn advance(&mut self) -> Option<char> { | ||||
|         self.chars.next().inspect(|c| { | ||||
|             self.cursor += c.len_utf8(); | ||||
| @ -137,20 +142,29 @@ impl<'a> Lexer<'a> { | ||||
|     fn read_char(&mut self) -> Result<TokenKind, LexerError> { | ||||
|         // NOTE: We have already consumed the initial '\' when this function is invoked
 | ||||
| 
 | ||||
|         let c = if self.peek().is_some_and(|c| !is_separator(c)) { | ||||
|             self.advance().unwrap() // SAFETY: This will never panic
 | ||||
|         } else { | ||||
|         if self.peek().is_none() || self.peek().is_some_and(|c| is_separator(c)) { | ||||
|             return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span())); | ||||
|         }; | ||||
| 
 | ||||
|         match c { | ||||
|             'u' if self.peek().is_some_and(|c| !is_separator(c)) => self.complete_unicode_escape(), | ||||
|             'x' if self.peek().is_some_and(|c| !is_separator(c)) => self.complete_ascii_escape(), | ||||
|             _ if self.peek().is_some_and(|c| !is_separator(c)) => { | ||||
|                 Err(LexerError::new(LexerErrorKind::InvalidChar, self.span())) | ||||
|             } | ||||
|             _ => Ok(TokenKind::Char), | ||||
|         } | ||||
| 
 | ||||
|         match self.advance() { | ||||
|             Some('u') if self.peek_some_not_separator() => self.complete_unicode_escape(), | ||||
|             Some('x') if self.peek_some_not_separator() => self.complete_ascii_escape(), | ||||
|             _ => { | ||||
|                 if self.peek_some_not_separator() { | ||||
|                     // Consume the remainder of the literal:
 | ||||
|                     self.read_word(); | ||||
| 
 | ||||
|                     if !matches!( | ||||
|                         &self.slice()[1..], // Ignore the leading '\'
 | ||||
|                         b"backspace" | b"formfeed" | b"newline" | b"return" | b"space" | b"tab" | ||||
|                     ) { | ||||
|                         return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span())); | ||||
|             } | ||||
|                 } | ||||
| 
 | ||||
|                 Ok(TokenKind::Char) | ||||
|         } | ||||
|     } | ||||
|     } | ||||
| 
 | ||||
|     fn complete_ascii_escape(&mut self) -> Result<TokenKind, LexerError> { | ||||
| @ -174,7 +188,7 @@ impl<'a> Lexer<'a> { | ||||
| 
 | ||||
|         // We should be at the end of the literal now, i.e. next char should be a
 | ||||
|         // separator:
 | ||||
|         if self.peek().is_some_and(|c| !is_separator(c)) { | ||||
|         if self.peek_some_not_separator() { | ||||
|             self.read_word(); // Recover
 | ||||
|             return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span())); | ||||
|         } | ||||
| @ -187,7 +201,7 @@ impl<'a> Lexer<'a> { | ||||
| 
 | ||||
|         // Expect between 1 and 6 hexadecimal digits:
 | ||||
|         let mut count = 0; | ||||
|         while self.peek().is_some_and(|c| !is_separator(c)) && count < 6 { | ||||
|         while self.peek_some_not_separator() && count < 6 { | ||||
|             match self.advance() { | ||||
|                 Some(c) if c.is_ascii_hexdigit() => count += 1, | ||||
|                 _ => { | ||||
| @ -199,7 +213,7 @@ impl<'a> Lexer<'a> { | ||||
| 
 | ||||
|         // If no hexadecimal digits were found, or digits were found but we are still
 | ||||
|         // not at the end of the literal, then the literal is invalid:
 | ||||
|         if count == 0 || self.peek().is_some_and(|c| !is_separator(c)) { | ||||
|         if count == 0 || self.peek_some_not_separator() { | ||||
|             self.read_word(); // Recover
 | ||||
|             return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span())); | ||||
|         } | ||||
| @ -210,7 +224,7 @@ impl<'a> Lexer<'a> { | ||||
|     fn read_keyword(&mut self) -> Result<TokenKind, LexerError> { | ||||
|         // NOTE: We have already consumed the initial ':' when this function is invoked
 | ||||
| 
 | ||||
|         if self.peek().is_some_and(|c| !is_separator(c)) { | ||||
|         if self.peek_some_not_separator() { | ||||
|             self.read_word(); | ||||
|             Ok(TokenKind::Keyword) | ||||
|         } else { | ||||
| @ -229,7 +243,7 @@ impl<'a> Lexer<'a> { | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         while self.peek().is_some_and(|c| !is_separator(c)) { | ||||
|         while self.peek_some_not_separator() { | ||||
|             match self.advance() { | ||||
|                 Some(c) if c.is_ascii_digit() => {} | ||||
|                 Some('.') => return self.complete_decimal(), | ||||
| @ -250,23 +264,20 @@ impl<'a> Lexer<'a> { | ||||
|         self.advance(); // Base prefix (i.e. 'b'/'B', 'o'/'O', 'x'/'X')
 | ||||
| 
 | ||||
|         let mut digit_found = false; | ||||
|         while let Some(c) = self.peek() { | ||||
|             match c { | ||||
|                 _ if is_separator(c) => break, | ||||
|                 _ if c.is_digit(radix) => { | ||||
|                     self.advance(); | ||||
|                     digit_found = true; | ||||
|                 } | ||||
|         while self.peek_some_not_separator() { | ||||
|             match self.advance() { | ||||
|                 Some(c) if c.is_digit(radix) => digit_found = true, | ||||
|                 _ => { | ||||
|                     self.read_word(); // Recover
 | ||||
|                     return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span())); | ||||
|                 } | ||||
|             }; | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         if !digit_found { | ||||
|             self.read_word(); // Recover
 | ||||
|             return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span())); | ||||
|             return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()) | ||||
|                 .with_context(|| "no valid digits found following the integer base prefix")); | ||||
|         } | ||||
| 
 | ||||
|         Ok(TokenKind::Integer) | ||||
| @ -280,7 +291,7 @@ impl<'a> Lexer<'a> { | ||||
|         let mut exp_found = false; | ||||
|         let mut sign_found = false; | ||||
| 
 | ||||
|         while self.peek().is_some_and(|c| !is_separator(c)) { | ||||
|         while self.peek_some_not_separator() { | ||||
|             match self.advance() { | ||||
|                 Some(c) if c.is_ascii_digit() => digit_found = true, | ||||
|                 Some('e') | Some('E') if digit_found && !exp_found => exp_found = true, | ||||
| @ -303,7 +314,7 @@ impl<'a> Lexer<'a> { | ||||
|         let mut sign_found = false; | ||||
|         let mut digit_found = false; | ||||
| 
 | ||||
|         while self.peek().is_some_and(|c| !is_separator(c)) { | ||||
|         while self.peek_some_not_separator() { | ||||
|             match self.advance() { | ||||
|                 Some(c) if c.is_ascii_digit() => digit_found = true, | ||||
|                 Some('+') | Some('-') if !digit_found && !sign_found => sign_found = true, | ||||
| @ -317,7 +328,8 @@ impl<'a> Lexer<'a> { | ||||
| 
 | ||||
|         if !digit_found { | ||||
|             self.read_word(); // Recover
 | ||||
|             return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span())); | ||||
|             return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()) | ||||
|                 .with_context(|| "no digits found in the ratio's denominator")); | ||||
|         } | ||||
| 
 | ||||
|         Ok(TokenKind::Ratio) | ||||
| @ -467,7 +479,7 @@ mod tests { | ||||
|         (Ok(TokenKind::Comment), 3..4, ";"), | ||||
|     ]); | ||||
| 
 | ||||
|     test!(char: r"\a \? \7 \λ \\ \u \x" => [ | ||||
|     test!(char: r"\a \? \7 \λ \\ \u \x \newline" => [ | ||||
|         (Ok(TokenKind::Char), 0..2, r"\a"), | ||||
|         (Ok(TokenKind::Whitespace), 2..3, " "), | ||||
|         (Ok(TokenKind::Char), 3..5, r"\?"), | ||||
| @ -481,9 +493,11 @@ mod tests { | ||||
|         (Ok(TokenKind::Char), 16..18, r"\u"), | ||||
|         (Ok(TokenKind::Whitespace), 18..19, " "), | ||||
|         (Ok(TokenKind::Char), 19..21, r"\x"), | ||||
|         (Ok(TokenKind::Whitespace), 21..22, " "), | ||||
|         (Ok(TokenKind::Char), 22..30, r"\newline"), | ||||
|     ]); | ||||
| 
 | ||||
|     test!(err_invalid_char: r"\ \xF \x0 \x111 \uG \u2222222" => [ | ||||
|     test!(err_invalid_char: r"\ \xF \x0 \x111 \uG \u2222222 \foobar" => [ | ||||
|         (Err(LexerErrorKind::InvalidChar), 0..1, r"\"), | ||||
|         (Ok(TokenKind::Whitespace), 1..2, " "), | ||||
|         (Err(LexerErrorKind::InvalidChar), 2..5, r"\xF"), | ||||
| @ -495,6 +509,8 @@ mod tests { | ||||
|         (Err(LexerErrorKind::InvalidChar), 16..19, r"\uG"), | ||||
|         (Ok(TokenKind::Whitespace), 19..20, " "), | ||||
|         (Err(LexerErrorKind::InvalidChar), 20..29, r"\u2222222"), | ||||
|         (Ok(TokenKind::Whitespace), 29..30, " "), | ||||
|         (Err(LexerErrorKind::InvalidChar), 30..37, r"\foobar"), | ||||
|     ]); | ||||
| 
 | ||||
|     test!(err_invalid_integer: "0b012 0o8 0xFG 1N 0x" => [ | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user