wright/lexer/identifier.rs

//! Implementation related to parsing keywords and identifiers.

use super::{Lexer, token::Token, token::TokenTy};
use crate::source_tracking::fragment::Fragment;
use std::str::Chars;
use unicode_ident::{is_xid_continue, is_xid_start};

/// Try to match a fragment, already recognized as an identifier or keyword, against the
/// known keywords, returning [TokenTy::Identifier] if it matches none of them.
fn identifier_or_keyword(fragment: Fragment) -> TokenTy {
    use TokenTy::*;

    match fragment.as_str() {
        "record" => KwRecord,
        "type" => KwType,
        "enum" => KwEnum,
        "union" => KwUnion,
        "func" => KwFunc,
        "pure" => KwPure,
        "repr" => KwRepr,
        "impl" => KwImpl,
        "constraint" => KwConstraint,
        "references" => KwReferences,
        "trait" => KwTrait,
        "const" => KwConst,
        "where" => KwWhere,

        "use" => KwUse,
        "as" => KwAs,
        "mod" => KwMod,
        "pub" => KwPub,

        "if" => KwIf,
        "else" => KwElse,
        "match" => KwMatch,

        "for" => KwFor,
        "in" => KwIn,
        "while" => KwWhile,
        "loop" => KwLoop,

        "let" => KwLet,
        "var" => KwVar,

        "true" => KwTrue,
        "false" => KwFalse,

        "bool" => KwBool,
        "u8" => KwU8,
        "i8" => KwI8,
        "u16" => KwU16,
        "i16" => KwI16,
        "u32" => KwU32,
        "i32" => KwI32,
        "f32" => KwF32,
        "u64" => KwU64,
        "i64" => KwI64,
        "f64" => KwF64,
        "char" => KwChar,

        "_" => Underscore,

        _ => Identifier,
    }
}

/// Attempt to consume a keyword/[identifier](TokenTy::Identifier)/[underscore](TokenTy::Underscore) from the lexer.
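///
/// # Example
///
/// A minimal sketch of expected usage (marked `ignore` since `Lexer::new_test`
/// appears to be a test-only constructor in this module's tests):
///
/// ```ignore
/// let mut lexer = Lexer::new_test("while");
/// let token = try_consume_keyword_or_identifier(&mut lexer).unwrap();
/// assert_eq!(token.variant, TokenTy::KwWhile);
/// assert_eq!(token.fragment.as_str(), "while");
/// ```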
pub fn try_consume_keyword_or_identifier(lexer: &mut Lexer) -> Option<Token> {
    // Get a character iterator that we can pull from.
    let mut chars: Chars = lexer.remaining.chars();
    // Get the next character from the iterator and consider it the first char of any potential match.
    // Make sure it's a valid identifier start (which covers the start of every keyword) or an underscore.
    // If there is no next character, or it fails both predicates, return None.
    let next: char = chars.next().filter(|c| is_xid_start(*c) || *c == '_')?;
    // Store/track the number of bytes consumed so far.
    let mut bytes_consumed: usize = next.len_utf8();

    // Take the remaining identifier characters and add their byte lengths to the total.
    bytes_consumed += chars
        .take_while(|c| is_xid_continue(*c))
        .map(char::len_utf8)
        .sum::<usize>();

    // Split off the token fragment from the new remaining fragment.
    // VALIDITY: The character iterator guarantees that we land on a valid character boundary within
    // the bounds of the fragment.
    let (token_fragment, new_remaining): (Fragment, Fragment) =
        lexer.remaining.split_at_unchecked(bytes_consumed);

    // Get the variant of token to produce.
    let variant: TokenTy = identifier_or_keyword(token_fragment.clone());

    // Update the lexer's remaining fragment.
    lexer.remaining = new_remaining;

    // Return the token.
    Some(Token {
        variant,
        fragment: token_fragment,
    })
}

#[cfg(test)]
mod tests {
    use super::{Lexer, TokenTy};

    #[test]
    fn identifiers_and_keywords() {
        let mut lexer = Lexer::new_test("const TEST");

        assert_eq!(lexer.next_token().unwrap().variant, TokenTy::KwConst);
        assert_eq!(lexer.next_token().unwrap().variant, TokenTy::Whitespace);
        assert_eq!(lexer.next_token().unwrap().variant, TokenTy::Identifier);
    }
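
    // A sketch of additional coverage, assuming the same `Lexer::new_test` and
    // `next_token` API exercised by the test above.
    #[test]
    fn underscore_and_keyword_prefix() {
        // A lone underscore should lex as the dedicated underscore token, not an identifier.
        let mut lexer = Lexer::new_test("_");
        assert_eq!(lexer.next_token().unwrap().variant, TokenTy::Underscore);

        // An identifier that merely starts with a keyword ("constant" starts with "const")
        // should lex as a single identifier token, never as a keyword.
        let mut lexer = Lexer::new_test("constant");
        assert_eq!(lexer.next_token().unwrap().variant, TokenTy::Identifier);
    }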
}