More lexer refactoring; support named character literals such as newline and tab.
This commit is contained in:
		
							parent
							
								
									2d95a58ce7
								
							
						
					
					
						commit
						ece6645e50
					
				| @ -108,6 +108,11 @@ impl<'a> Lexer<'a> { | |||||||
|         self.chars.peek() |         self.chars.peek() | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     #[inline] | ||||||
|  |     fn peek_some_not_separator(&mut self) -> bool { | ||||||
|  |         self.peek().is_some_and(|c| !is_separator(c)) | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     fn advance(&mut self) -> Option<char> { |     fn advance(&mut self) -> Option<char> { | ||||||
|         self.chars.next().inspect(|c| { |         self.chars.next().inspect(|c| { | ||||||
|             self.cursor += c.len_utf8(); |             self.cursor += c.len_utf8(); | ||||||
| @ -137,19 +142,28 @@ impl<'a> Lexer<'a> { | |||||||
|     fn read_char(&mut self) -> Result<TokenKind, LexerError> { |     fn read_char(&mut self) -> Result<TokenKind, LexerError> { | ||||||
|         // NOTE: We have already consumed the initial '\' when this function is invoked
 |         // NOTE: We have already consumed the initial '\' when this function is invoked
 | ||||||
| 
 | 
 | ||||||
|         let c = if self.peek().is_some_and(|c| !is_separator(c)) { |         if self.peek().is_none() || self.peek().is_some_and(|c| is_separator(c)) { | ||||||
|             self.advance().unwrap() // SAFETY: This will never panic
 |  | ||||||
|         } else { |  | ||||||
|             return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span())); |             return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span())); | ||||||
|         }; |  | ||||||
| 
 |  | ||||||
|         match c { |  | ||||||
|             'u' if self.peek().is_some_and(|c| !is_separator(c)) => self.complete_unicode_escape(), |  | ||||||
|             'x' if self.peek().is_some_and(|c| !is_separator(c)) => self.complete_ascii_escape(), |  | ||||||
|             _ if self.peek().is_some_and(|c| !is_separator(c)) => { |  | ||||||
|                 Err(LexerError::new(LexerErrorKind::InvalidChar, self.span())) |  | ||||||
|         } |         } | ||||||
|             _ => Ok(TokenKind::Char), | 
 | ||||||
|  |         match self.advance() { | ||||||
|  |             Some('u') if self.peek_some_not_separator() => self.complete_unicode_escape(), | ||||||
|  |             Some('x') if self.peek_some_not_separator() => self.complete_ascii_escape(), | ||||||
|  |             _ => { | ||||||
|  |                 if self.peek_some_not_separator() { | ||||||
|  |                     // Consume the remainder of the literal:
 | ||||||
|  |                     self.read_word(); | ||||||
|  | 
 | ||||||
|  |                     if !matches!( | ||||||
|  |                         &self.slice()[1..], // Ignore the leading '\'
 | ||||||
|  |                         b"backspace" | b"formfeed" | b"newline" | b"return" | b"space" | b"tab" | ||||||
|  |                     ) { | ||||||
|  |                         return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span())); | ||||||
|  |             } | ||||||
|  |                 } | ||||||
|  | 
 | ||||||
|  |                 Ok(TokenKind::Char) | ||||||
|  |         } | ||||||
|     } |     } | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
| @ -174,7 +188,7 @@ impl<'a> Lexer<'a> { | |||||||
| 
 | 
 | ||||||
|         // We should be at the end of the literal now, i.e. next char should be a
 |         // We should be at the end of the literal now, i.e. next char should be a
 | ||||||
|         // separator:
 |         // separator:
 | ||||||
|         if self.peek().is_some_and(|c| !is_separator(c)) { |         if self.peek_some_not_separator() { | ||||||
|             self.read_word(); // Recover
 |             self.read_word(); // Recover
 | ||||||
|             return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span())); |             return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span())); | ||||||
|         } |         } | ||||||
| @ -187,7 +201,7 @@ impl<'a> Lexer<'a> { | |||||||
| 
 | 
 | ||||||
|         // Expect between 1 and 6 hexadecimal digits:
 |         // Expect between 1 and 6 hexadecimal digits:
 | ||||||
|         let mut count = 0; |         let mut count = 0; | ||||||
|         while self.peek().is_some_and(|c| !is_separator(c)) && count < 6 { |         while self.peek_some_not_separator() && count < 6 { | ||||||
|             match self.advance() { |             match self.advance() { | ||||||
|                 Some(c) if c.is_ascii_hexdigit() => count += 1, |                 Some(c) if c.is_ascii_hexdigit() => count += 1, | ||||||
|                 _ => { |                 _ => { | ||||||
| @ -199,7 +213,7 @@ impl<'a> Lexer<'a> { | |||||||
| 
 | 
 | ||||||
|         // If no hexadecimal digits were found, or digits were found but we are still
 |         // If no hexadecimal digits were found, or digits were found but we are still
 | ||||||
|         // not at the end of the literal, then the literal is invalid:
 |         // not at the end of the literal, then the literal is invalid:
 | ||||||
|         if count == 0 || self.peek().is_some_and(|c| !is_separator(c)) { |         if count == 0 || self.peek_some_not_separator() { | ||||||
|             self.read_word(); // Recover
 |             self.read_word(); // Recover
 | ||||||
|             return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span())); |             return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span())); | ||||||
|         } |         } | ||||||
| @ -210,7 +224,7 @@ impl<'a> Lexer<'a> { | |||||||
|     fn read_keyword(&mut self) -> Result<TokenKind, LexerError> { |     fn read_keyword(&mut self) -> Result<TokenKind, LexerError> { | ||||||
|         // NOTE: We have already consumed the initial ':' when this function is invoked
 |         // NOTE: We have already consumed the initial ':' when this function is invoked
 | ||||||
| 
 | 
 | ||||||
|         if self.peek().is_some_and(|c| !is_separator(c)) { |         if self.peek_some_not_separator() { | ||||||
|             self.read_word(); |             self.read_word(); | ||||||
|             Ok(TokenKind::Keyword) |             Ok(TokenKind::Keyword) | ||||||
|         } else { |         } else { | ||||||
| @ -229,7 +243,7 @@ impl<'a> Lexer<'a> { | |||||||
|             } |             } | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         while self.peek().is_some_and(|c| !is_separator(c)) { |         while self.peek_some_not_separator() { | ||||||
|             match self.advance() { |             match self.advance() { | ||||||
|                 Some(c) if c.is_ascii_digit() => {} |                 Some(c) if c.is_ascii_digit() => {} | ||||||
|                 Some('.') => return self.complete_decimal(), |                 Some('.') => return self.complete_decimal(), | ||||||
| @ -250,23 +264,20 @@ impl<'a> Lexer<'a> { | |||||||
|         self.advance(); // Base prefix (i.e. 'b'/'B', 'o'/'O', 'x'/'X')
 |         self.advance(); // Base prefix (i.e. 'b'/'B', 'o'/'O', 'x'/'X')
 | ||||||
| 
 | 
 | ||||||
|         let mut digit_found = false; |         let mut digit_found = false; | ||||||
|         while let Some(c) = self.peek() { |         while self.peek_some_not_separator() { | ||||||
|             match c { |             match self.advance() { | ||||||
|                 _ if is_separator(c) => break, |                 Some(c) if c.is_digit(radix) => digit_found = true, | ||||||
|                 _ if c.is_digit(radix) => { |  | ||||||
|                     self.advance(); |  | ||||||
|                     digit_found = true; |  | ||||||
|                 } |  | ||||||
|                 _ => { |                 _ => { | ||||||
|                     self.read_word(); // Recover
 |                     self.read_word(); // Recover
 | ||||||
|                     return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span())); |                     return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span())); | ||||||
|                 } |                 } | ||||||
|             }; |             } | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         if !digit_found { |         if !digit_found { | ||||||
|             self.read_word(); // Recover
 |             self.read_word(); // Recover
 | ||||||
|             return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span())); |             return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()) | ||||||
|  |                 .with_context(|| "no valid digits found following the integer base prefix")); | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         Ok(TokenKind::Integer) |         Ok(TokenKind::Integer) | ||||||
| @ -280,7 +291,7 @@ impl<'a> Lexer<'a> { | |||||||
|         let mut exp_found = false; |         let mut exp_found = false; | ||||||
|         let mut sign_found = false; |         let mut sign_found = false; | ||||||
| 
 | 
 | ||||||
|         while self.peek().is_some_and(|c| !is_separator(c)) { |         while self.peek_some_not_separator() { | ||||||
|             match self.advance() { |             match self.advance() { | ||||||
|                 Some(c) if c.is_ascii_digit() => digit_found = true, |                 Some(c) if c.is_ascii_digit() => digit_found = true, | ||||||
|                 Some('e') | Some('E') if digit_found && !exp_found => exp_found = true, |                 Some('e') | Some('E') if digit_found && !exp_found => exp_found = true, | ||||||
| @ -303,7 +314,7 @@ impl<'a> Lexer<'a> { | |||||||
|         let mut sign_found = false; |         let mut sign_found = false; | ||||||
|         let mut digit_found = false; |         let mut digit_found = false; | ||||||
| 
 | 
 | ||||||
|         while self.peek().is_some_and(|c| !is_separator(c)) { |         while self.peek_some_not_separator() { | ||||||
|             match self.advance() { |             match self.advance() { | ||||||
|                 Some(c) if c.is_ascii_digit() => digit_found = true, |                 Some(c) if c.is_ascii_digit() => digit_found = true, | ||||||
|                 Some('+') | Some('-') if !digit_found && !sign_found => sign_found = true, |                 Some('+') | Some('-') if !digit_found && !sign_found => sign_found = true, | ||||||
| @ -317,7 +328,8 @@ impl<'a> Lexer<'a> { | |||||||
| 
 | 
 | ||||||
|         if !digit_found { |         if !digit_found { | ||||||
|             self.read_word(); // Recover
 |             self.read_word(); // Recover
 | ||||||
|             return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span())); |             return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()) | ||||||
|  |                 .with_context(|| "no digits found in the ratio's denominator")); | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         Ok(TokenKind::Ratio) |         Ok(TokenKind::Ratio) | ||||||
| @ -467,7 +479,7 @@ mod tests { | |||||||
|         (Ok(TokenKind::Comment), 3..4, ";"), |         (Ok(TokenKind::Comment), 3..4, ";"), | ||||||
|     ]); |     ]); | ||||||
| 
 | 
 | ||||||
|     test!(char: r"\a \? \7 \λ \\ \u \x" => [ |     test!(char: r"\a \? \7 \λ \\ \u \x \newline" => [ | ||||||
|         (Ok(TokenKind::Char), 0..2, r"\a"), |         (Ok(TokenKind::Char), 0..2, r"\a"), | ||||||
|         (Ok(TokenKind::Whitespace), 2..3, " "), |         (Ok(TokenKind::Whitespace), 2..3, " "), | ||||||
|         (Ok(TokenKind::Char), 3..5, r"\?"), |         (Ok(TokenKind::Char), 3..5, r"\?"), | ||||||
| @ -481,9 +493,11 @@ mod tests { | |||||||
|         (Ok(TokenKind::Char), 16..18, r"\u"), |         (Ok(TokenKind::Char), 16..18, r"\u"), | ||||||
|         (Ok(TokenKind::Whitespace), 18..19, " "), |         (Ok(TokenKind::Whitespace), 18..19, " "), | ||||||
|         (Ok(TokenKind::Char), 19..21, r"\x"), |         (Ok(TokenKind::Char), 19..21, r"\x"), | ||||||
|  |         (Ok(TokenKind::Whitespace), 21..22, " "), | ||||||
|  |         (Ok(TokenKind::Char), 22..30, r"\newline"), | ||||||
|     ]); |     ]); | ||||||
| 
 | 
 | ||||||
|     test!(err_invalid_char: r"\ \xF \x0 \x111 \uG \u2222222" => [ |     test!(err_invalid_char: r"\ \xF \x0 \x111 \uG \u2222222 \foobar" => [ | ||||||
|         (Err(LexerErrorKind::InvalidChar), 0..1, r"\"), |         (Err(LexerErrorKind::InvalidChar), 0..1, r"\"), | ||||||
|         (Ok(TokenKind::Whitespace), 1..2, " "), |         (Ok(TokenKind::Whitespace), 1..2, " "), | ||||||
|         (Err(LexerErrorKind::InvalidChar), 2..5, r"\xF"), |         (Err(LexerErrorKind::InvalidChar), 2..5, r"\xF"), | ||||||
| @ -495,6 +509,8 @@ mod tests { | |||||||
|         (Err(LexerErrorKind::InvalidChar), 16..19, r"\uG"), |         (Err(LexerErrorKind::InvalidChar), 16..19, r"\uG"), | ||||||
|         (Ok(TokenKind::Whitespace), 19..20, " "), |         (Ok(TokenKind::Whitespace), 19..20, " "), | ||||||
|         (Err(LexerErrorKind::InvalidChar), 20..29, r"\u2222222"), |         (Err(LexerErrorKind::InvalidChar), 20..29, r"\u2222222"), | ||||||
|  |         (Ok(TokenKind::Whitespace), 29..30, " "), | ||||||
|  |         (Err(LexerErrorKind::InvalidChar), 30..37, r"\foobar"), | ||||||
|     ]); |     ]); | ||||||
| 
 | 
 | ||||||
|     test!(err_invalid_integer: "0b012 0o8 0xFG 1N 0x" => [ |     test!(err_invalid_integer: "0b012 0o8 0xFG 1N 0x" => [ | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user