wright/lexer/
comments.rs

1//! Implementation of comment token lexing.
2
3use super::{Lexer, token::TokenTy};
4
5/// The pattern that begins any single line comments (including doc comments).
6pub const SINGLE_LINE_COMMENT_PREFIX: &str = "//";
7
8/// The pattern that starts any multi-line comments (including doc comments).
9pub const MULTI_LINE_COMMENT_START: &str = "/*";
10
11/// The pattern that ends any multi-line comments (including doc comments).
12pub const MULTI_LINE_COMMENT_END: &str = "*/";
13
14/// Attempt to match a sinlgle line comment from the start of the [Lexer::remaining] fragment.
15/// Return a [usize] and optionally a [TokenTy]. The [usize] indicates how many bytes were in the comment.
16/// The [TokenTy] (if it's not [None]) should be either [TokenTy::InnerDocComment] or [TokenTy::OuterDocComment].
17///
18/// If the [TokenTy] is not [None], the lexer should consume the specified number of bytes (by the [usize]) and
19/// Produce a token with the [variant](super::token::Token::variant) from this function.
20///
21/// Generally I'm trying to follow the [rust comment spec] here.
22///
23/// [rust comment spec]: https://doc.rust-lang.org/reference/comments.html
24pub fn try_match_single_line_comment(lexer: &Lexer) -> (usize, Option<TokenTy>) {
25    // Fork the lexer so we can do all the parsing on the fork without worrying about modifying the original
26    // unnecessarily.
27    let mut fork: Lexer = lexer.fork();
28
29    // Try to consume the single line comment prefix from the fork.
30    if fork.consume(SINGLE_LINE_COMMENT_PREFIX) {
31        // We consumed it successfully, read through a newline or the end of the forked lexer if we get there.
32
33        // First determine if this is a doc comment of some kind.
34        let is_inner_doc: bool = fork.matches("/") && !fork.matches("//");
35        let is_outer_doc: bool = fork.matches("!");
36
37        // The consume until a newline, carraige return, or the end of the source fragment.
38        while !fork.remaining.is_empty() && !fork.matches("\r") && !fork.matches("\n") {
39            fork.consume_any();
40        }
41
42        // Determine the kind of token to produce (if any).
43        let variant: Option<TokenTy> = match (is_inner_doc, is_outer_doc) {
44            (true, false) => Some(TokenTy::InnerDocComment),
45            (false, true) => Some(TokenTy::OuterDocComment),
46            (false, false) => None,
47            (true, true) => unreachable!(
48                "It is impossible for the `remaining` fragment to start with an `!` and a `/` simultaneously."
49            ),
50        };
51
52        // Return the number of bytes consumed and the type of token to
53        // produce if any.
54        return (fork.offset_from(lexer), variant);
55    }
56
57    // If the single line comment prefix was not immediately available, there is no comment.
58    (0, None)
59}
60
61/// Attempt to match a block comment from the start of the [Lexer::remaining] fragment.
62/// Return a [usize] and optionally a [TokenTy]. The [usize] indicates how many bytes were in the comment.
63/// The [TokenTy] (if it's not [None]) should be [TokenTy::InnerBlockDocComment], [TokenTy::OuterBlockDocComment], or
64/// [TokenTy::UnterminatedBlockComment].
65///
66/// If the [TokenTy] is not [None], the lexer should consume the specified number of bytes (by the [usize]) and
67/// Produce a token with the [variant](super::token::Token::variant) from this function.
68pub fn try_match_block_comment(lexer: &Lexer) -> (usize, Option<TokenTy>) {
69    // Handle corner cases here so we don't have to below.
70    // These are both considered empty non-documenting comments.
71    if lexer.matches("/***/") {
72        return (5, None);
73    }
74
75    if lexer.matches("/**/") {
76        return (4, None);
77    }
78
79    // Make a fork of the lexer to avoid modifying this lexer if we fail to parse.
80    let mut fork: Lexer = lexer.fork();
81
82    // Try to parse the start of a multi-line comment.
83    if fork.consume(MULTI_LINE_COMMENT_START) {
84        // Check if this is a doc comment.
85        let is_outer_doc: bool = fork.matches("!");
86        // Use this to indicate that more than one following asterix is not a doc comment.
87        let is_inner_doc: bool = fork.matches("*") && !fork.matches("**");
88
89        // Consume until we see the end of the doc comment. If we run out of characters, consider the
90        // comment unterminated.
91        while !fork.matches(MULTI_LINE_COMMENT_END) {
92            // Handle nested comments here:
93            if fork.matches(MULTI_LINE_COMMENT_START) {
94                // Discard the output -- don't care about doc comments in other comments.
95                let (nested_comment_bytes, _) = try_match_block_comment(&fork);
96
97                // SAFETY: the return from this function should never be on a char boundary or out of bounds.
98                // This is because the return value is always either 0 or calculated using `offset_from`.
99                unsafe { fork.advance_unchecked(nested_comment_bytes) };
100
101                // Restart the loop to keep consuming this comment.
102                continue;
103            }
104
105            // Handle unterminated comments here.
106            if fork.remaining.is_empty() {
107                // If we have not hit a "*/" before the end of the input, return an unterminated block comment.
108                let bytes_consumed: usize = fork.offset_from(lexer);
109                return (bytes_consumed, Some(TokenTy::UnterminatedBlockComment));
110            }
111
112            // If there's still input, and not a nested comment, consume it.
113            fork.consume_any();
114        }
115
116        // If we get here, the comment was terminated. Consume the terminating characters, and return.
117        // Use debug assert here to make sure that the comment is actually terminated.
118        let consumed_comment_terminator: bool = fork.consume(MULTI_LINE_COMMENT_END);
119        debug_assert!(consumed_comment_terminator, "comment is actually terminated");
120
121        // Determine the kind of token to produce (if any).
122        let variant: Option<TokenTy> = match (is_inner_doc, is_outer_doc) {
123            (true, false) => Some(TokenTy::InnerBlockDocComment),
124            (false, true) => Some(TokenTy::OuterBlockDocComment),
125            (false, false) => None,
126            (true, true) => {
127                unreachable!("Lexer should not match multiple comment types at once.")
128            }
129        };
130
131        return (fork.offset_from(lexer), variant);
132    }
133
134    (0, None)
135}
136
#[cfg(test)]
mod tests {
    use super::Lexer;

    /// A plain (non-documenting) single line comment should yield no token and be
    /// consumed entirely by the lexer.
    #[test]
    fn ignored_single_line_comment() {
        let mut lex = Lexer::new_test("// test comment ");

        // No token should be produced for a non-doc comment.
        let token = lex.next_token();
        assert!(token.is_none());

        // The whole input should have been consumed.
        assert_eq!(lex.remaining.len(), 0);
    }
}
147}