//! First-pass lexer that runs on the source code and returns a series of tokens with their associated [Fragment]s.
//!
//! Note that whitespace is emitted as its own token, and ordinary comments are skipped entirely (documentation
//! and unterminated comments are emitted as tokens); everything else is matched against the patterns defined for tokens.
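//!
//! # Example
//!
//! A minimal sketch of driving the lexer to exhaustion (how the [SourceRef] is obtained will depend
//! on the rest of the crate):
//!
//! ```ignore
//! // Assumes `source` is a `SourceRef` from the source tracking machinery.
//! let mut lexer = Lexer::new(source);
//!
//! // `next_token` returns `None` once the remaining fragment is empty.
//! while let Some(token) = lexer.next_token() {
//!     println!("{:?}: {:?}", token.variant, token.fragment.as_str());
//! }
//! ```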

use self::comments::{try_match_block_comment, try_match_single_line_comment};
use self::integer_literal::try_consume_integer_literal;
use self::quoted::try_consume_quoted_literal;
use crate::source_tracking::fragment::Fragment;
use crate::source_tracking::SourceRef;
use std::str::Chars;
use token::{Token, TokenTy};

pub mod comments;
pub mod identifier;
pub mod integer_literal;
pub mod quoted;
pub mod token;
pub mod trivial;

/// The lexical analyser for wright. This produces a series of tokens that make up the larger elements of the language.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// The remaining source code that has not been processed and returned as a token from the iterator yet.
    pub remaining: Fragment,
}

impl Lexer {
    /// Get the number of bytes remaining that we need to transform into tokens.
    pub const fn bytes_remaining(&self) -> usize {
        self.remaining.len()
    }

    /// Construct a new [Lexer] over a given source reference.
    pub fn new(source: SourceRef) -> Self {
        Lexer {
            remaining: source.as_fragment(),
        }
    }

    /// Available in test cases; creates a new [Lexer] over a given static [str]ing.
    ///
    /// The instantiated [Source] in this [Lexer] has its name set to [FileName::None].
    ///
    /// This function is limited to this crate because `#[cfg(test)]` items are not available
    /// externally; however, it should be relatively easy to reproduce.
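    ///
    /// A sketch of the equivalent construction outside of test code (mirroring the body below):
    ///
    /// ```ignore
    /// use std::sync::Arc;
    ///
    /// let source: &'static str = "let x = 0;";
    ///
    /// let lexer = Lexer {
    ///     remaining: Fragment {
    ///         source: Arc::new(Source::new_from_static_str(FileName::None, source)),
    ///         range: 0..source.len(),
    ///     },
    /// };
    /// ```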
    ///
    /// [Source]: crate::source_tracking::source::Source
    /// [FileName::None]: crate::source_tracking::filename::FileName::None
    #[cfg(test)]
    pub(crate) fn new_test(source: &'static str) -> Self {
        use crate::source_tracking::{filename::FileName, source::Source};
        use std::sync::Arc;

        Lexer {
            remaining: Fragment {
                source: Arc::new(Source::new_from_static_str(FileName::None, source)),
                range: 0..source.len(),
            },
        }
    }

    /// Make a token by splitting a given number of bytes off of [Lexer::remaining]
    /// and labeling them with the given kind.
    ///
    /// # Panics
    /// - Panics if the number of bytes lands out of bounds or in the middle of a character.
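    ///
    /// A sketch (the byte count and token kind here are illustrative):
    ///
    /// ```ignore
    /// // Split the first two bytes of `remaining` off as a whitespace token.
    /// let token = lexer.split_token(2, TokenTy::Whitespace);
    /// assert_eq!(token.fragment.len(), 2);
    /// ```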
    fn split_token(&mut self, bytes: usize, kind: TokenTy) -> Token {
        let (token_fragment, new_remaining_fragment) = self.remaining.split_at(bytes);
        self.remaining = new_remaining_fragment;

        Token {
            variant: kind,
            fragment: token_fragment,
        }
    }

    /// Unsafe version of [Lexer::split_token].
    ///
    /// # Safety
    /// - This function matches the safety guarantees of [Fragment::split_at_unchecked].
    unsafe fn split_token_unchecked(&mut self, bytes: usize, kind: TokenTy) -> Token {
        let (token_fragment, new_remaining_fragment) = self.remaining.split_at_unchecked(bytes);
        self.remaining = new_remaining_fragment;

        Token {
            variant: kind,
            fragment: token_fragment,
        }
    }

    /// "Fork" this lexer, creating a new [`Lexer`] at the same position as this one that can be used for
    /// failable parsing. This can be compared to the original lexer it was forked from using [Lexer::offset_from].
    pub fn fork(&self) -> Self {
        self.clone()
    }

    /// Get the number of bytes between the origin's [remaining](Lexer::remaining) and
    /// this [Lexer]'s [remaining](Lexer::remaining) using [`Fragment::offset_from`].
    ///
    /// # Panics
    /// - This function panics under the same conditions as [`Fragment::offset_from`].
    /// - Generally the best way to avoid panics is to only call this function on
    ///     [Lexer]s created using [Lexer::fork] on the `origin` lexer.
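    ///
    /// A sketch of the intended fork-then-measure pattern (the pattern string is illustrative):
    ///
    /// ```ignore
    /// let mut forked = lexer.fork();
    ///
    /// // Speculatively consume from the fork; the original lexer is untouched.
    /// if forked.consume("0x") {
    ///     // The fork has advanced two bytes past the `lexer` it was forked from.
    ///     assert_eq!(forked.offset_from(&lexer), 2);
    /// }
    /// ```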
    pub fn offset_from(&self, origin: &Self) -> usize {
        self.remaining.offset_from(&origin.remaining)
    }

    /// Check if a pattern matches at the start of the [Lexer::remaining] [Fragment].
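    ///
    /// A sketch (the fragment contents here are hypothetical):
    ///
    /// ```ignore
    /// // Remaining fragment starts with "fn main() { ... }".
    /// assert!(lexer.matches("fn"));
    /// assert!(!lexer.matches("struct"));
    /// ```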
    pub fn matches(&self, pattern: &str) -> bool {
        self.remaining.as_str().starts_with(pattern)
    }

    /// If the remaining fragment starts with the given `pattern`, strip it from the remaining fragment and return
    /// true. Otherwise return false.
    fn consume(&mut self, pattern: &str) -> bool {
        if self.matches(pattern) {
            // SAFETY: We just checked that the pattern matches, so advancing by its length
            // stays in-bounds and lands on a char boundary.
            unsafe { self.remaining.advance_by_unchecked(pattern.len()) };
            true
        } else {
            false
        }
    }

    /// Remove a character from the start of the `remaining` [`Fragment`], returning the
    /// consumed character if one was available.
    fn consume_any(&mut self) -> Option<char> {
        // Make a character iterator.
        let mut chars: Chars = self.remaining.chars();

        if let Some(c) = chars.next() {
            // Consumed a char, update the remaining fragment of this lexer.
            let char_bytes: usize = c.len_utf8();
            // SAFETY: advancing by the UTF-8 length of a char just pulled from this fragment's
            // `Chars` iterator always lands on a char boundary and cannot exceed the slice length.
            unsafe { self.advance_unchecked(char_bytes) };
            // Return the character.
            Some(c)
        } else {
            // No characters available, return nothing.
            None
        }
    }

    /// Advance this lexer by the specified number of bytes.
    ///
    /// # Panics
    /// - If the lexer is not on a unicode character boundary after advancing.
    /// - If the number of bytes is greater than the length of the [remaining](Lexer::remaining) fragment.
    fn advance(&mut self, bytes: usize) {
        if bytes > self.remaining.len() {
            panic!("Cannot advance past end of lexer fragment");
        }

        if !self.remaining.as_str().is_char_boundary(bytes) {
            panic!("Advancing {bytes} bytes does not land on a character boundary");
        }

        self.remaining.range.start += bytes;
    }

    /// Unsafe version of [Lexer::advance].
    /// Advances this lexer by the specified number of bytes.
    ///
    /// # Safety
    /// - This lexer will be left in an invalid/undefined state if the number of bytes is greater than the length
    ///     of the [Lexer::remaining] fragment.
    /// - This lexer will be left in an invalid/undefined state if after advancing, the next byte in the
    ///     [Lexer::remaining] fragment is not the start of a unicode code point.
    unsafe fn advance_unchecked(&mut self, bytes: usize) {
        self.remaining.range.start += bytes;
    }

    /// Get the next token from the lexer, if there is one. Returns [None] once the remaining fragment is empty.
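    ///
    /// Whitespace is emitted as its own token rather than silently skipped. A sketch using the
    /// test-only constructor (so not compiled as a doctest):
    ///
    /// ```ignore
    /// let mut lexer = Lexer::new_test("  foo");
    ///
    /// // The leading whitespace becomes its own token.
    /// let ws = lexer.next_token().unwrap();
    /// assert_eq!(ws.variant, TokenTy::Whitespace);
    /// assert_eq!(ws.fragment.as_str(), "  ");
    /// ```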
    pub fn next_token(&mut self) -> Option<Token> {
        // If the remaining input is empty, there is no token.
        if self.remaining.is_empty() {
            return None;
        }

        // If there is whitespace, it becomes its own token.
        // Use a little unsafe here since this check is done every time and needs to be fast.
        {
            let remaining_str = self.remaining.as_str();
            let trimmed = remaining_str.trim_start().as_ptr();

            // Calculate the delta by pointer offset.
            // SAFETY: `trimmed` points into the same string slice as `remaining_str` (`trim_start`
            // returns a suffix of its input), so the requirements of `pointer::offset_from` are satisfied.
            let delta = unsafe { trimmed.offset_from(remaining_str.as_ptr()) };

            if delta > 0 {
                // SAFETY: `trim_start` returns a suffix of the original string, so `delta` bytes is
                // in-bounds and lands on a char boundary; `delta > 0` was checked just above.
                return unsafe {
                    Some(self.split_token_unchecked(delta as usize, TokenTy::Whitespace))
                };
            }
        }

        // Attempt to match a single-line comment, then a block comment.
        for comment_match_fn in [try_match_single_line_comment, try_match_block_comment] {
            // Attempt to match a comment using the given function. If it produced a token variant
            // (such as a documentation or unterminated comment), split off and return that token.
            // If a comment matched but should be ignored, advance past it and restart tokenization.
            match (comment_match_fn)(self) {
                // A comment was parsed, consume and return it.
                (bytes, Some(comment_variant)) => {
                    // Split the token.
                    let token: Token = self.split_token(bytes, comment_variant);
                    // Return it.
                    return Some(token);
                }

                // A comment matched but produced no token; advance the lexer past it and restart this function.
                (bytes @ 1.., None) => {
                    self.advance(bytes);
                    return self.next_token();
                }

                // There was no comment, keep trying to match other tokens.
                (0, None) => {}
            }
        }

        // Handle a trivial token if there is one.
        if let Some(token) = trivial::try_consume_trivial_token(self) {
            return Some(token);
        }

        // Next attempt to match a keyword or identifier.
        if let Some(token) = identifier::try_consume_keyword_or_identifier(self) {
            return Some(token);
        }

        // Next attempt to parse an integer literal.
        if let Some(integer_lit) = try_consume_integer_literal(self) {
            return Some(integer_lit);
        }

        // Next attempt to parse a quoted literal.
        if let Some(quoted_lit) = try_consume_quoted_literal(self) {
            return Some(quoted_lit);
        }

        // If we haven't matched at this point, produce a token marked as "Unknown".
        // The unsafe is fine -- we know from above that there are remaining characters.
        let unknown_char = unsafe { self.remaining.chars().next().unwrap_unchecked() };
        Some(self.split_token(unknown_char.len_utf8(), TokenTy::Unknown))
    }
}