wright/lexer.rs
//! First pass lexer that gets run on the source code and returns a series of tokens with their associated [Fragment]s.
//!
//! Note that plain comments are stripped out (only documentation and unterminated comments produce tokens),
//! while whitespace is returned as its own token rather than being discarded.
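//!
//! A rough sketch of the intended usage, assuming `source_ref` is a [SourceRef] for the code being
//! tokenized; `do_something_with` is a stand-in for whatever the caller does with each token, not a real
//! function, so the block is marked `ignore`:
//!
//! ```ignore
//! let mut lexer = Lexer::new(source_ref);
//!
//! // Pull tokens until the lexer is exhausted. Each token carries the [Fragment] it was produced from.
//! while let Some(token) = lexer.next_token() {
//!     do_something_with(token.variant, token.fragment);
//! }
//! ```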

use self::comments::{try_match_block_comment, try_match_single_line_comment};
use self::integer_literal::try_consume_integer_literal;
use self::quoted::try_consume_quoted_literal;
use self::token::{Token, TokenTy};
use crate::source_tracking::fragment::Fragment;
use crate::source_tracking::SourceRef;
use std::str::Chars;

pub mod comments;
pub mod identifier;
pub mod integer_literal;
pub mod quoted;
pub mod token;
pub mod trivial;

/// The lexical analyser for wright. This produces a series of tokens that make up the larger elements of the language.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// The remaining source code that has not been processed and returned as a token from the iterator yet.
    pub remaining: Fragment,
}

impl Lexer {
    /// Get the number of bytes remaining that we need to transform into tokens.
    pub const fn bytes_remaining(&self) -> usize {
        self.remaining.len()
    }

    /// Construct a new [Lexer] over a given source reference.
    pub fn new(source: SourceRef) -> Self {
        Lexer {
            remaining: source.as_fragment(),
        }
    }

    /// Available in test cases, creates a new [Lexer] over a given static [str]ing.
    ///
    /// The instantiated [Source] in this [Lexer] has its name set to [FileName::None].
    ///
    /// This function is limited to this crate because `#[cfg(test)]` items are not available
    /// externally; however, it should be relatively easy to reproduce.
    ///
    /// [Source]: crate::source_tracking::source::Source
    /// [FileName::None]: crate::source_tracking::filename::FileName::None
    #[cfg(test)]
    pub(crate) fn new_test(source: &'static str) -> Self {
        use crate::source_tracking::{filename::FileName, source::Source};
        use std::sync::Arc;

        Lexer {
            remaining: Fragment {
                source: Arc::new(Source::new_from_static_str(FileName::None, source)),
                range: 0..source.len(),
            },
        }
    }

    /// Make a token by splitting a given number of bytes off of [Lexer::remaining]
    /// and labeling them with the given kind.
    ///
    /// # Panics
    /// - Panics if the number of bytes lands out of bounds or in the middle of a character.
    fn split_token(&mut self, bytes: usize, kind: TokenTy) -> Token {
        let (token_fragment, new_remaining_fragment) = self.remaining.split_at(bytes);
        self.remaining = new_remaining_fragment;

        Token {
            variant: kind,
            fragment: token_fragment,
        }
    }

    /// Unchecked version of [Lexer::split_token].
    ///
    /// # Panics
    /// - This function has the same potential to cause logic bugs and panics as [Fragment::split_at_unchecked].
    fn split_token_unchecked(&mut self, bytes: usize, kind: TokenTy) -> Token {
        let (token_fragment, new_remaining_fragment) = self.remaining.split_at_unchecked(bytes);
        self.remaining = new_remaining_fragment;

        Token {
            variant: kind,
            fragment: token_fragment,
        }
    }

    /// "Fork" this lexer, creating a new [`Lexer`] at the same position as this one that can be used for
    /// fallible parsing. This can be compared to the original lexer it was forked from using [Lexer::offset_from].
    pub fn fork(&self) -> Self {
        self.clone()
    }

    /// Get the number of bytes between the origin's [remaining](Lexer::remaining) and
    /// this [Lexer]'s [remaining](Lexer::remaining) using [`Fragment::offset_from`].
    ///
    /// # Panics
    /// - This function panics under the same conditions as [`Fragment::offset_from`].
    /// - Generally the best way to avoid panics is to only call this function on
    ///   [Lexer]s created using [Lexer::fork] on the `origin` lexer.
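    ///
    /// # Examples
    ///
    /// A minimal sketch of the fork-then-measure pattern (marked `ignore` because `Lexer::new_test` is
    /// only available inside this crate's tests):
    ///
    /// ```ignore
    /// let lexer = Lexer::new_test("example");
    /// let mut forked = lexer.fork();
    ///
    /// // Consume one token on the fork; the original lexer is untouched.
    /// let token = forked.next_token().unwrap();
    ///
    /// // The fork is now exactly as far ahead of its origin as that token's fragment is long.
    /// assert_eq!(forked.offset_from(&lexer), token.fragment.len());
    /// ```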
    pub fn offset_from(&self, origin: &Self) -> usize {
        self.remaining.offset_from(&origin.remaining)
    }

    /// Check if a pattern matches at the start of the [Lexer::remaining] [Fragment].
    pub fn matches(&self, pattern: &str) -> bool {
        self.remaining.as_str().starts_with(pattern)
    }

    /// If the remaining fragment starts with the given `pattern`, strip it from the remaining fragment and return
    /// `true`. Otherwise return `false`.
    fn consume(&mut self, pattern: &str) -> bool {
        if self.matches(pattern) {
            // SOUNDNESS: We just checked that the pattern matches.
            self.remaining.advance_by_unchecked(pattern.len());
            true
        } else {
            false
        }
    }

    /// Remove a character from the start of the `remaining` [`Fragment`], returning the character
    /// consumed if one was available.
    fn consume_any(&mut self) -> Option<char> {
        // Make a character iterator.
        let mut chars: Chars = self.remaining.chars();

        if let Some(c) = chars.next() {
            // Consumed a char, update the remaining fragment of this lexer.
            let char_bytes: usize = c.len_utf8();
            // SAFETY: Advancing by the length of a char taken from a `Chars` iterator always lands on a
            // character boundary and never exceeds the length of the slice.
            unsafe { self.advance_unchecked(char_bytes) };
            // Return the character.
            Some(c)
        } else {
            // No characters available, return nothing.
            None
        }
    }

    /// Advance this lexer by the specified number of bytes.
    ///
    /// # Panics
    /// - If the lexer is not on a unicode character boundary after advancing.
    /// - If the number of bytes is greater than the length of the [remaining](Lexer::remaining) fragment.
    fn advance(&mut self, bytes: usize) {
        if bytes > self.remaining.len() {
            panic!("Cannot advance past end of lexer fragment");
        }

        if !self.remaining.as_str().is_char_boundary(bytes) {
            panic!("Advancing {bytes} bytes does not land on a character boundary");
        }

        self.remaining.range.start += bytes;
    }

    /// Unsafe version of [Lexer::advance].
    /// Advances this lexer by the specified number of bytes.
    ///
    /// # Safety
    /// - This lexer will be left in an invalid/undefined state if the number of bytes is greater than the length
    ///   of the [Lexer::remaining] fragment.
    /// - This lexer will be left in an invalid/undefined state if after advancing, the next byte in the
    ///   [Lexer::remaining] fragment is not the start of a unicode code point.
    unsafe fn advance_unchecked(&mut self, bytes: usize) {
        self.remaining.range.start += bytes;
    }

    /// Get the next token from the lexer.
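    ///
    /// Returns [None] once the [remaining](Lexer::remaining) fragment is empty.
    ///
    /// # Examples
    ///
    /// A small sketch of the whitespace handling implemented below (marked `ignore` because
    /// `Lexer::new_test` is only available inside this crate's tests):
    ///
    /// ```ignore
    /// let mut lexer = Lexer::new_test("   ");
    ///
    /// // Leading whitespace is returned as its own token rather than being silently skipped.
    /// let token = lexer.next_token().unwrap();
    /// assert!(matches!(token.variant, TokenTy::Whitespace));
    ///
    /// // Once the remaining fragment is empty, no further tokens are produced.
    /// assert!(lexer.next_token().is_none());
    /// ```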
    pub fn next_token(&mut self) -> Option<Token> {
        // If the remaining input is empty, there is no token.
        if self.remaining.is_empty() {
            return None;
        }

        // If there is whitespace, it becomes its own token.
        // Use a little unsafe here since this check is done every time and needs to be fast.
        {
            let remaining_str = self.remaining.as_str();
            let trimmed = remaining_str.trim_start().as_ptr();

            // Calculate the delta by pointer offset.
            // SAFETY: In this case, all the requirements of pointer::offset_from are satisfied.
            let delta = unsafe { trimmed.offset_from(remaining_str.as_ptr()) };

            if delta > 0 {
                // trim_start should always return a valid string, and delta is just checked to be > 0.
                return Some(self.split_token_unchecked(delta as usize, TokenTy::Whitespace));
            }
        }

        // Attempt to parse a single line comment and then attempt a multi-line comment.
        for comment_match_fn in [try_match_single_line_comment, try_match_block_comment] {
            // Attempt to parse a comment using the given match function. Return it if it's documentation or unterminated.
            // Get a new token and return that if there was a comment and it was ignored successfully.
            match (comment_match_fn)(self) {
                // A comment was parsed, consume and return it.
                (bytes, Some(comment_variant)) => {
                    // Split the token.
                    let token: Token = self.split_token(bytes, comment_variant);
                    // Return it.
                    return Some(token);
                }

                // There was a comment, advance the lexer and ignore it. Re-start this function.
                (bytes @ 1.., None) => {
                    self.advance(bytes);
                    return self.next_token();
                }

                // There was no comment, keep trying to match other tokens.
                (0, None) => {}
            }
        }

        // Handle a trivial token if there is one.
        if let Some(token) = trivial::try_consume_trivial_token(self) {
            return Some(token);
        }

        // Next attempt to match a keyword or identifier.
        if let Some(token) = identifier::try_consume_keyword_or_identifier(self) {
            return Some(token);
        }

        // Next attempt to parse an integer literal.
        if let Some(integer_lit) = try_consume_integer_literal(self) {
            return Some(integer_lit);
        }

        // Next attempt to parse a quoted literal.
        if let Some(quoted_lit) = try_consume_quoted_literal(self) {
            return Some(quoted_lit);
        }

        // If we haven't matched at this point, produce a token marked as "Unknown".
        // The unsafe is fine -- we know from above that there are remaining characters.
        let unknown_char = unsafe { self.remaining.chars().next().unwrap_unchecked() };
        Some(self.split_token(unknown_char.len_utf8(), TokenTy::Unknown))
    }
}
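
// What follows is a small usage sketch rather than part of the lexer itself: the module and test names
// are illustrative only, and the assertions rely solely on behavior visible above in this file (every
// byte of comment-free input ends up in exactly one returned token fragment).
#[cfg(test)]
mod usage_sketch {
    use super::Lexer;

    #[test]
    fn tokens_cover_comment_free_input() {
        let source = "foo   bar";
        let mut lexer = Lexer::new_test(source);
        let mut reconstructed = String::new();

        // Pull tokens until the lexer is exhausted, gluing their fragments back together.
        while let Some(token) = lexer.next_token() {
            reconstructed.push_str(token.fragment.as_str());
        }

        // With no comments to discard, the token fragments should cover the input exactly.
        assert_eq!(reconstructed, source);
        assert_eq!(lexer.bytes_remaining(), 0);
    }
}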