wright/lexer/
trivial.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
//! Trivial tokens and their implementation.

use super::{
    token::{Token, TokenTy},
    Lexer,
};

/// Trivial tokens that are two ASCII characters and can be matched directly
/// against the input source code.
///
/// Each entry pairs the exact two-byte pattern with the [TokenTy] produced when the
/// remaining source starts with that pattern. [try_consume_trivial_token] checks this
/// table before [SINGLE_ASCII_CHAR_TRIVIAL_TOKENS], so a two-character token such as
/// `+=` is not split into two single-character tokens.
///
/// No pattern appears twice, so at most one entry can match a given pair of bytes.
pub const TWO_ASCII_TRIVIAL_TOKENS: &[(&[u8; 2], TokenTy)] = &[
    (b"->", TokenTy::SingleArrow),
    (b"-=", TokenTy::MinusEq),
    (b"=>", TokenTy::DoubleArrow),
    (b"==", TokenTy::EqEq),
    (b"&&", TokenTy::AndAnd),
    (b"||", TokenTy::OrOr),
    (b"<<", TokenTy::LtLt),
    (b">>", TokenTy::GtGt),
    (b"::", TokenTy::ColonColon),
    (b"|=", TokenTy::OrEq),
    (b"&=", TokenTy::AndEq),
    (b":=", TokenTy::ColonEq),
    (b">=", TokenTy::GtEq),
    (b"<=", TokenTy::LtEq),
    (b"!=", TokenTy::BangEq),
    (b"%=", TokenTy::ModEq),
    (b"^=", TokenTy::XorEq),
    (b"*=", TokenTy::StarEq),
    (b"+=", TokenTy::PlusEq),
    (b"/=", TokenTy::DivEq),
];

/// Single ASCII character trivial tokens that can be matched directly against
/// the source code.
///
/// Each entry pairs one ASCII byte with the [TokenTy] produced when the remaining
/// source starts with that byte. [try_consume_trivial_token] consults this table only
/// after no entry of [TWO_ASCII_TRIVIAL_TOKENS] matched, so bytes that also begin a
/// two-character token (for example `-`, `=` or `<`) yield their single-character
/// token only when the longer form is not present.
///
/// No byte appears twice, so at most one entry can match a given byte.
pub const SINGLE_ASCII_CHAR_TRIVIAL_TOKENS: &[(u8, TokenTy)] = &[
    (b'(', TokenTy::LeftParen),
    (b')', TokenTy::RightParen),
    (b'[', TokenTy::LeftBracket),
    (b']', TokenTy::RightBracket),
    (b'{', TokenTy::LeftCurly),
    (b'}', TokenTy::RightCurly),
    (b'@', TokenTy::At),
    (b';', TokenTy::Semi),
    (b'?', TokenTy::Question),
    (b',', TokenTy::Comma),
    (b'#', TokenTy::Hash),
    (b'$', TokenTy::Dollar),
    (b'>', TokenTy::Gt),
    (b'<', TokenTy::Lt),
    (b'-', TokenTy::Minus),
    (b':', TokenTy::Colon),
    (b'!', TokenTy::Bang),
    (b'=', TokenTy::Eq),
    (b'&', TokenTy::And),
    (b'|', TokenTy::Or),
    (b'/', TokenTy::Div),
    (b'+', TokenTy::Plus),
    (b'^', TokenTy::Xor),
    (b'*', TokenTy::Star),
    (b'%', TokenTy::Mod),
];

/// Attempt to consume a "trivial" token from the start of the [Lexer]'s [Lexer::remaining] fragment.
///
/// Two-byte patterns from [TWO_ASCII_TRIVIAL_TOKENS] are tried first, then single-byte
/// patterns from [SINGLE_ASCII_CHAR_TRIVIAL_TOKENS], so the longest available trivial token
/// wins (`+=` is lexed as one token rather than `+` followed by `=`).
///
/// Returns the consumed [Token] on a match. Returns [None] and leaves the lexer unmodified if
/// the remaining source is empty or does not start with a trivial token.
pub fn try_consume_trivial_token(lexer: &mut Lexer) -> Option<Token> {
    // View the unconsumed source as raw bytes. Every trivial token is pure ASCII, so comparing
    // bytes is sufficient and no UTF-8 decoding is needed.
    let source: &[u8] = lexer.remaining.as_str().as_bytes();

    // Copy the leading bytes out of the slice so that no borrow of the lexer is held when a
    // token is split off below. An empty source has no first byte, so there is no token.
    let first: u8 = *source.first()?;
    let second: Option<u8> = source.get(1).copied();

    // Attempt to match any two-byte ASCII trivial tokens.
    // This must be done before single-byte tokens since matching is greedy.
    if let Some(second) = second {
        let pair: [u8; 2] = [first, second];

        let two_byte_match = TWO_ASCII_TRIVIAL_TOKENS
            .iter()
            .find(|(pattern, _)| **pattern == pair);

        if let Some((_, kind)) = two_byte_match {
            // SAFETY: The first two bytes of the remaining fragment exist and equal an ASCII
            // pattern. ASCII bytes are always complete one-byte UTF-8 characters, so offset 2
            // is in bounds and lies on a character boundary.
            return Some(unsafe { lexer.split_token_unchecked(2, *kind) });
        }
    }

    // Fall back to single-byte patterns; if none matches there is no trivial token available.
    let (_, kind) = SINGLE_ASCII_CHAR_TRIVIAL_TOKENS
        .iter()
        .find(|(pattern, _)| *pattern == first)?;

    // SAFETY: The first byte of the remaining fragment exists and equals an ASCII pattern, so
    // offset 1 is in bounds and lies on a character boundary.
    Some(unsafe { lexer.split_token_unchecked(1, *kind) })
}

#[cfg(test)]
mod tests {
    use super::{Lexer, TokenTy};

    /// A lone `+` lexes as [TokenTy::Plus] and `+=` lexes as a single [TokenTy::PlusEq];
    /// in both cases the whole input is consumed by the first token.
    #[test]
    fn plus_and_plus_eq_tokens() {
        let mut plus_lexer = Lexer::new_test("+");
        let mut plus_eq_lexer = Lexer::new_test("+=");

        let lexed_plus = plus_lexer.next_token().unwrap();
        let lexed_plus_eq = plus_eq_lexer.next_token().unwrap();

        // Check each (lexer, token, expected kind) case with the same pair of assertions.
        let cases = [
            (&plus_lexer, &lexed_plus, TokenTy::Plus),
            (&plus_eq_lexer, &lexed_plus_eq, TokenTy::PlusEq),
        ];

        for (lexer, token, expected) in cases {
            assert_eq!(lexer.bytes_remaining(), 0);
            assert_eq!(token.variant, expected);
        }
    }

    /// With input `+1`, only the `+` is consumed as a one-byte [TokenTy::Plus] token and the
    /// trailing `1` is left in the lexer.
    #[test]
    fn plus_one_token() {
        let mut lexer = Lexer::new_test("+1");
        let first_token = lexer.next_token().unwrap();

        assert_eq!(first_token.variant, TokenTy::Plus);
        assert_eq!(first_token.fragment.len(), 1);
        assert_eq!(lexer.bytes_remaining(), 1);
    }
}