// wright/lexer/trivial.rs

1//! Trivial tokens and their implementation.
2
3use super::{
4    token::{Token, TokenTy},
5    Lexer,
6};
7
/// Trivial tokens that are two ASCII characters and can be matched directly
/// against the input source code.
///
/// Each entry pairs an exact two-byte pattern with the [TokenTy] it produces.
/// All patterns are distinct, so at most one entry can match any given input;
/// the order of this table therefore does not affect lexing results.
pub const TWO_ASCII_TRIVIAL_TOKENS: &[(&[u8; 2], TokenTy)] = &[
    (b"->", TokenTy::SingleArrow),
    (b"-=", TokenTy::MinusEq),
    (b"=>", TokenTy::DoubleArrow),
    (b"==", TokenTy::EqEq),
    (b"&&", TokenTy::AndAnd),
    (b"||", TokenTy::OrOr),
    (b"<<", TokenTy::LtLt),
    (b">>", TokenTy::GtGt),
    (b"::", TokenTy::ColonColon),
    (b"|=", TokenTy::OrEq),
    (b"&=", TokenTy::AndEq),
    (b":=", TokenTy::ColonEq),
    (b">=", TokenTy::GtEq),
    (b"<=", TokenTy::LtEq),
    (b"!=", TokenTy::BangEq),
    (b"%=", TokenTy::ModEq),
    (b"^=", TokenTy::XorEq),
    (b"*=", TokenTy::StarEq),
    (b"+=", TokenTy::PlusEq),
    (b"/=", TokenTy::DivEq),
];
32
/// Single ASCII character trivial tokens that can be matched directly against
/// the source code.
///
/// Each entry pairs one ASCII byte with the [TokenTy] it produces. Every byte
/// is distinct, so at most one entry can match and table order is irrelevant.
/// Two-byte tokens must be tried before this table so that e.g. `+=` does not
/// lex as `+` followed by `=`.
pub const SINGLE_ASCII_CHAR_TRIVIAL_TOKENS: &[(u8, TokenTy)] = &[
    (b'(', TokenTy::LeftParen),
    (b')', TokenTy::RightParen),
    (b'[', TokenTy::LeftBracket),
    (b']', TokenTy::RightBracket),
    (b'{', TokenTy::LeftCurly),
    (b'}', TokenTy::RightCurly),
    (b'@', TokenTy::At),
    (b';', TokenTy::Semi),
    (b'?', TokenTy::Question),
    (b',', TokenTy::Comma),
    (b'#', TokenTy::Hash),
    (b'$', TokenTy::Dollar),
    (b'>', TokenTy::Gt),
    (b'<', TokenTy::Lt),
    (b'-', TokenTy::Minus),
    (b':', TokenTy::Colon),
    (b'!', TokenTy::Bang),
    (b'=', TokenTy::Eq),
    (b'&', TokenTy::And),
    (b'|', TokenTy::Or),
    (b'/', TokenTy::Div),
    (b'+', TokenTy::Plus),
    (b'^', TokenTy::Xor),
    (b'*', TokenTy::Star),
    (b'%', TokenTy::Mod),
];
62
63/// Attempt to consume a "trivial" token from the start of the [Lexer]'s [Lexer::remaining] fragment.
64///
65/// Leave the lexer unmodified if one is not available.
66pub fn try_consume_trivial_token(lexer: &mut Lexer) -> Option<Token> {
67    // Get the number of bytes remaining, since we need at least 1 to parse anything.
68    let bytes_remaining: usize = lexer.bytes_remaining();
69
70    // No token if there are no bytes of source left.
71    if bytes_remaining == 0 {
72        return None;
73    }
74
75    // Attempt to match any two-byte ASCII trivial tokens.
76    // This must be done before single-ascii byte tokens since matching is greedy.
77    if bytes_remaining >= 2 {
78        // Get the first two bytes of the remaining fragment.
79        // SAFETY: We just checked length.
80        let bytes: &[u8] = unsafe { lexer.remaining.as_str().as_bytes().get_unchecked(0..2) };
81
82        // Match against each possible token pattern.
83        for (pattern, kind) in TWO_ASCII_TRIVIAL_TOKENS {
84            if bytes == *pattern {
85                // We have already done bounds checking, and this cannot be a character
86                // boundary since we just matched against ASCII characters.
87                return Some(lexer.split_token_unchecked(2, *kind));
88            }
89        }
90    }
91
92    // Do the same for single byte patterns.
93    // SAFETY: We checked that the number of bytes remaining is not 0 above.
94    let byte: &u8 = unsafe { lexer.remaining.as_str().as_bytes().get_unchecked(0) };
95
96    for (pattern, kind) in SINGLE_ASCII_CHAR_TRIVIAL_TOKENS {
97        if byte == pattern {
98            // If we matched, then the first byte is ASCII, and therefore we don't have to worry
99            // about bounds or unicode boundaries.
100            return Some(lexer.split_token_unchecked(1, *kind));
101        }
102    }
103
104    // If nothing else has matched, there is no trivial token available.
105    None
106}
107
#[cfg(test)]
mod tests {
    use super::{Lexer, TokenTy};

    /// Greedy matching: "+" alone lexes as [TokenTy::Plus], while "+=" lexes
    /// as the two-byte [TokenTy::PlusEq] rather than Plus followed by Eq.
    #[test]
    fn plus_and_plus_eq_tokens() {
        let mut plus_lexer = Lexer::new_test("+");
        let mut plus_eq_lexer = Lexer::new_test("+=");

        let plus_token = plus_lexer.next_token().unwrap();
        let plus_eq_token = plus_eq_lexer.next_token().unwrap();

        // Each lexer should have consumed its entire input.
        assert_eq!(plus_lexer.bytes_remaining(), 0);
        assert_eq!(plus_eq_lexer.bytes_remaining(), 0);
        assert_eq!(plus_token.variant, TokenTy::Plus);
        assert_eq!(plus_eq_token.variant, TokenTy::PlusEq);
    }

    /// "+1" should yield a one-byte Plus token and leave the "1" unconsumed.
    #[test]
    fn plus_one_token() {
        let mut lexer = Lexer::new_test("+1");
        let token = lexer.next_token().unwrap();

        assert_eq!(lexer.bytes_remaining(), 1);
        assert_eq!(token.variant, TokenTy::Plus);
        assert_eq!(token.fragment.len(), 1);
    }
}