// wright/lexer.rs

//! First-pass lexer that runs over the source code and returns a series of tokens with their associated [Fragment]s.
//!
//! Note that non-documentation comments are stripped out, while whitespace is returned as its own
//! [TokenTy::Whitespace] token rather than being discarded.
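//!
//! A minimal sketch of driving the lexer (illustrative only; assumes a [SourceRef] is already
//! in hand from the source tracking machinery):
//!
//! ```ignore
//! let mut lexer = Lexer::new(source_ref);
//!
//! while let Some(token) = lexer.next_token() {
//!     println!("{:?}: {:?}", token.variant, token.fragment.as_str());
//! }
//! ```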

use self::comments::{try_match_block_comment, try_match_single_line_comment};
use self::integer_literal::try_consume_integer_literal;
use self::quoted::try_consume_quoted_literal;
use crate::source_tracking::fragment::Fragment;
use crate::source_tracking::SourceRef;
use std::str::Chars;
use token::{Token, TokenTy};

pub mod comments;
pub mod identifier;
pub mod integer_literal;
pub mod quoted;
pub mod token;
pub mod trivial;

/// The lexical analyser for wright. This produces a series of tokens that make up the larger elements of the language.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// The remaining source code that has not been processed and returned as a token from the iterator yet.
    pub remaining: Fragment,
}

impl Lexer {
    /// Get the number of bytes remaining that we need to transform into tokens.
    pub const fn bytes_remaining(&self) -> usize {
        self.remaining.len()
    }

    /// Construct a new [Lexer] over a given source reference.
    pub fn new(source: SourceRef) -> Self {
        Lexer {
            remaining: source.as_fragment(),
        }
    }

    /// Available in test cases; creates a new [Lexer] over a given static [str].
    ///
    /// The instantiated [Source] in this [Lexer] has its name set to [FileName::None].
    ///
    /// This function is limited to this crate because `#[cfg(test)]` items are not available
    /// externally; however, it is relatively easy to reproduce.
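    ///
    /// A sketch of an equivalent construction outside this crate (illustrative only; it simply
    /// mirrors the body of this function):
    ///
    /// ```ignore
    /// use std::sync::Arc;
    ///
    /// let lexer = Lexer {
    ///     remaining: Fragment {
    ///         source: Arc::new(Source::new_from_static_str(FileName::None, source)),
    ///         range: 0..source.len(),
    ///     },
    /// };
    /// ```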
    ///
    /// [Source]: crate::source_tracking::source::Source
    /// [FileName::None]: crate::source_tracking::filename::FileName::None
    #[cfg(test)]
    pub(crate) fn new_test(source: &'static str) -> Self {
        use crate::source_tracking::{filename::FileName, source::Source};
        use std::sync::Arc;

        Lexer {
            remaining: Fragment {
                source: Arc::new(Source::new_from_static_str(FileName::None, source)),
                range: 0..source.len(),
            },
        }
    }

    /// Make a token by splitting a given number of bytes off of [Lexer::remaining]
    /// and labeling them with the given kind.
    ///
    /// # Panics
    /// - Panics if the number of bytes lands out of bounds or in the middle of a character.
    fn split_token(&mut self, bytes: usize, kind: TokenTy) -> Token {
        let (token_fragment, new_remaining_fragment) = self.remaining.split_at(bytes);
        self.remaining = new_remaining_fragment;

        Token {
            variant: kind,
            fragment: token_fragment,
        }
    }

    /// Unchecked version of [Lexer::split_token].
    ///
    /// # Panics
    /// - This function has the same potential to cause logic bugs and panics as [Fragment::split_at_unchecked].
    fn split_token_unchecked(&mut self, bytes: usize, kind: TokenTy) -> Token {
        let (token_fragment, new_remaining_fragment) = self.remaining.split_at_unchecked(bytes);
        self.remaining = new_remaining_fragment;

        Token {
            variant: kind,
            fragment: token_fragment,
        }
    }

    /// "Fork" this lexer, creating a new [`Lexer`] at the same position as this one that can be used for
    /// fallible parsing. The fork can be compared to the original lexer it came from using [Lexer::offset_from].
    pub fn fork(&self) -> Self {
        self.clone()
    }

    /// Get the number of bytes between the origin's [remaining](Lexer::remaining) and
    /// this [Lexer]'s [remaining](Lexer::remaining) using [`Fragment::offset_from`].
    ///
    /// # Panics
    /// - This function panics under the same conditions as [`Fragment::offset_from`].
    /// - Generally the best way to avoid panics is to only call this function on
    ///     [Lexer]s created using [Lexer::fork] on the `origin` lexer.
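    ///
    /// A sketch of the intended fork-and-measure pattern (illustrative only):
    ///
    /// ```ignore
    /// let mut fork = lexer.fork();
    ///
    /// // Speculatively consume from the fork, then measure how far it advanced.
    /// if fork.consume("0x") {
    ///     let bytes_consumed = fork.offset_from(&lexer);
    ///     assert_eq!(bytes_consumed, 2);
    /// }
    /// ```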
    pub fn offset_from(&self, origin: &Self) -> usize {
        self.remaining.offset_from(&origin.remaining)
    }

    /// Check if a pattern matches at the start of the [Lexer::remaining] [Fragment].
    pub fn matches(&self, pattern: &str) -> bool {
        self.remaining.as_str().starts_with(pattern)
    }

    /// If the remaining fragment starts with the given `pattern`, strip it from the remaining fragment and return
    /// `true`. Otherwise return `false`.
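    ///
    /// A hypothetical illustration of the intended use (e.g. in a comment matcher):
    ///
    /// ```ignore
    /// if lexer.consume("//") {
    ///     // ...consume the rest of the line...
    /// }
    /// ```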
    fn consume(&mut self, pattern: &str) -> bool {
        if self.matches(pattern) {
            // SOUNDNESS: We just checked that the pattern matches.
            self.remaining.advance_by_unchecked(pattern.len());
            true
        } else {
            false
        }
    }

    /// Remove a character from the start of the `remaining` [`Fragment`], returning the consumed
    /// character if one was available.
    fn consume_any(&mut self) -> Option<char> {
        // Make a character iterator.
        let mut chars: Chars = self.remaining.chars();

        if let Some(c) = chars.next() {
            // Consumed a char, update the remaining fragment of this lexer.
            let char_bytes: usize = c.len_utf8();
            // SAFETY: We know this advance lands on a char boundary and does not exceed the length
            // of the slice, since we just pulled the char from a `Chars` iterator over the fragment.
            unsafe { self.advance_unchecked(char_bytes) };
            // Return the character.
            Some(c)
        } else {
            // No characters available, return nothing.
            None
        }
    }

    /// Advance this lexer by the specified number of bytes.
    ///
    /// # Panics
    /// - If the lexer is not on a unicode character boundary after advancing.
    /// - If the number of bytes is greater than the length of the [remaining](Lexer::remaining) fragment.
    fn advance(&mut self, bytes: usize) {
        if bytes > self.remaining.len() {
            panic!("Cannot advance past end of lexer fragment");
        }

        if !self.remaining.as_str().is_char_boundary(bytes) {
            panic!("Advancing {bytes} bytes does not land on a character boundary");
        }

        self.remaining.range.start += bytes;
    }

    /// Unsafe version of [Lexer::advance].
    /// Advances this lexer by the specified number of bytes.
    ///
    /// # Safety
    /// - This lexer will be left in an invalid/undefined state if the number of bytes is greater than the length
    ///     of the [Lexer::remaining] fragment.
    /// - This lexer will be left in an invalid/undefined state if after advancing, the next byte in the
    ///     [Lexer::remaining] fragment is not the start of a unicode code point.
    unsafe fn advance_unchecked(&mut self, bytes: usize) {
        self.remaining.range.start += bytes;
    }

    /// Get the next token from the lexer.
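    ///
    /// Returns [None] once the remaining input is empty. A sketch of typical use
    /// (illustrative only, using the test-only constructor above):
    ///
    /// ```ignore
    /// let mut lexer = Lexer::new_test("12 +");
    ///
    /// while let Some(token) = lexer.next_token() {
    ///     println!("{:?}", token.variant);
    /// }
    /// ```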
    pub fn next_token(&mut self) -> Option<Token> {
        // If the remaining input is empty, there is no token.
        if self.remaining.is_empty() {
            return None;
        }

        // If there is whitespace, it becomes its own token.
        // Use a little unsafe here since this check is done every time and needs to be fast.
        {
            let remaining_str = self.remaining.as_str();
            let trimmed = remaining_str.trim_start().as_ptr();

            // Calculate the delta by pointer offset.
            // SAFETY: Both pointers derive from the same `str`, so the requirements of
            // `pointer::offset_from` are satisfied.
            let delta = unsafe { trimmed.offset_from(remaining_str.as_ptr()) };

            if delta > 0 {
                // SOUNDNESS: `trim_start` strips whole characters, so `delta` is in bounds and
                // lands on a char boundary; we just checked that it is positive.
                return Some(self.split_token_unchecked(delta as usize, TokenTy::Whitespace));
            }
        }

        // Attempt to match a single-line comment, then a block comment.
        for comment_match_fn in [try_match_single_line_comment, try_match_block_comment] {
            // Each match function returns the number of bytes matched and, if the comment should
            // become a token (documentation or unterminated comments), its variant.
            match (comment_match_fn)(self) {
                // A comment that produces a token was matched -- split it off and return it.
                (bytes, Some(comment_variant)) => {
                    let token: Token = self.split_token(bytes, comment_variant);
                    return Some(token);
                }

                // A comment was matched but produces no token -- skip past it and start over.
                (bytes @ 1.., None) => {
                    self.advance(bytes);
                    return self.next_token();
                }

                // No comment matched -- keep trying to match other tokens.
                (0, None) => {}
            }
        }

        // Handle a trivial token if there is one.
        if let Some(token) = trivial::try_consume_trivial_token(self) {
            return Some(token);
        }

        // Next attempt to match a keyword or identifier.
        if let Some(token) = identifier::try_consume_keyword_or_identifier(self) {
            return Some(token);
        }

        // Next attempt to parse an integer literal.
        if let Some(integer_lit) = try_consume_integer_literal(self) {
            return Some(integer_lit);
        }

        // Next attempt to parse a quoted literal.
        if let Some(quoted_lit) = try_consume_quoted_literal(self) {
            return Some(quoted_lit);
        }

        // If we haven't matched at this point, produce a token marked as "Unknown".
        // SAFETY: We know from the emptiness check at the top of this function that there is at
        // least one remaining character.
        let unknown_char = unsafe { self.remaining.chars().next().unwrap_unchecked() };
        Some(self.split_token(unknown_char.len_utf8(), TokenTy::Unknown))
    }
}
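
// A minimal sketch of tests exercising `next_token` (illustrative only; the module's real test
// suite is assumed to live elsewhere, and only the `TokenTy` variants used above are relied on).
#[cfg(test)]
mod sketch_tests {
    use super::token::TokenTy;
    use super::Lexer;

    #[test]
    fn whitespace_becomes_a_token() {
        let mut lexer = Lexer::new_test("  a");

        // Leading whitespace is returned as its own token rather than being discarded.
        let first = lexer.next_token().unwrap();
        assert!(matches!(first.variant, TokenTy::Whitespace));
        assert_eq!(first.fragment.as_str(), "  ");
    }

    #[test]
    fn empty_input_yields_no_tokens() {
        let mut lexer = Lexer::new_test("");
        assert!(lexer.next_token().is_none());
    }
}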