wright/lexer/comments.rs

//! Implementation of comment token lexing.

use super::{token::TokenTy, Lexer};

/// The pattern that begins any single line comment (including doc comments).
pub const SINGLE_LINE_COMMENT_PREFIX: &str = "//";

/// The pattern that starts any multi-line comment (including doc comments).
pub const MULTI_LINE_COMMENT_START: &str = "/*";

/// The pattern that ends any multi-line comment (including doc comments).
pub const MULTI_LINE_COMMENT_END: &str = "*/";

/// Attempt to match a single line comment from the start of the [Lexer::remaining] fragment.
/// Return a [usize] and optionally a [TokenTy]. The [usize] indicates how many bytes were in the comment.
/// The [TokenTy] (if it's not [None]) should be either [TokenTy::InnerDocComment] or [TokenTy::OuterDocComment].
///
/// If the [TokenTy] is not [None], the lexer should consume the number of bytes specified by the [usize] and
/// produce a token with the [variant](super::token::Token::variant) returned from this function.
///
/// Generally I'm trying to follow the [rust comment spec] here.
///
/// [rust comment spec]: https://doc.rust-lang.org/reference/comments.html
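///
/// Below is a rough sketch of the expected behavior. It is marked `ignore` because it is
/// illustrative only; it assumes `Lexer::new_test` builds a lexer over a string literal (as in
/// the tests at the bottom of this file) and that [TokenTy] implements `PartialEq`.
///
/// ```ignore
/// let lexer = Lexer::new_test("//! module docs");
/// let (bytes, variant) = try_match_single_line_comment(&lexer);
/// // The entire 15-byte comment is matched and recognized as an inner doc comment.
/// assert_eq!(bytes, 15);
/// assert_eq!(variant, Some(TokenTy::InnerDocComment));
/// ```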
pub fn try_match_single_line_comment(lexer: &Lexer) -> (usize, Option<TokenTy>) {
    // Fork the lexer so we can do all the parsing on the fork without worrying about modifying the original
    // unnecessarily.
    let mut fork: Lexer = lexer.fork();

    // Try to consume the single line comment prefix from the fork.
    if fork.consume(SINGLE_LINE_COMMENT_PREFIX) {
        // We consumed it successfully; read until a newline or the end of the forked lexer's input.

        // First determine if this is a doc comment of some kind. Per the rust comment spec,
        // `//!` starts an inner doc comment and `///` starts an outer doc comment, while four
        // or more slashes are just a normal comment.
        let is_inner_doc: bool = fork.matches("!");
        let is_outer_doc: bool = fork.matches("/") && !fork.matches("//");

        // Then consume until a newline, carriage return, or the end of the source fragment.
        while !fork.remaining.is_empty() && !fork.matches("\r") && !fork.matches("\n") {
            fork.consume_any();
        }

        // Determine the kind of token to produce (if any).
        let variant: Option<TokenTy> = match (is_inner_doc, is_outer_doc) {
            (true, false) => Some(TokenTy::InnerDocComment),
            (false, true) => Some(TokenTy::OuterDocComment),
            (false, false) => None,
            (true, true) => unreachable!("It is impossible for the `remaining` fragment to start with an `!` and a `/` simultaneously.")
        };

        // Return the number of bytes consumed and the type of token to
        // produce if any.
        return (fork.offset_from(lexer), variant);
    }

    // If the single line comment prefix was not immediately available, there is no comment.
    (0, None)
}

/// Attempt to match a block comment from the start of the [Lexer::remaining] fragment.
/// Return a [usize] and optionally a [TokenTy]. The [usize] indicates how many bytes were in the comment.
/// The [TokenTy] (if it's not [None]) should be [TokenTy::InnerBlockDocComment], [TokenTy::OuterBlockDocComment], or
/// [TokenTy::UnterminatedBlockComment].
///
/// If the [TokenTy] is not [None], the lexer should consume the number of bytes specified by the [usize] and
/// produce a token with the [variant](super::token::Token::variant) returned from this function.
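///
/// A rough sketch of the expected behavior, under the same assumptions as the example on
/// [try_match_single_line_comment] (illustrative only, hence `ignore`):
///
/// ```ignore
/// let lexer = Lexer::new_test("/** outer docs */ rest");
/// let (bytes, variant) = try_match_block_comment(&lexer);
/// // Only the 17-byte comment is matched, not the trailing ` rest`.
/// assert_eq!(bytes, 17);
/// assert_eq!(variant, Some(TokenTy::OuterBlockDocComment));
/// ```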
pub fn try_match_block_comment(lexer: &Lexer) -> (usize, Option<TokenTy>) {
    // Handle corner cases here so we don't have to below.
    // These are both considered empty non-documenting comments.
    if lexer.matches("/***/") {
        return (5, None);
    }

    if lexer.matches("/**/") {
        return (4, None);
    }

    // Make a fork of the lexer to avoid modifying this lexer if we fail to parse.
    let mut fork: Lexer = lexer.fork();

    // Try to parse the start of a multi-line comment.
    if fork.consume(MULTI_LINE_COMMENT_START) {
        // Check if this is a doc comment. Per the rust comment spec, `/*!` starts an inner
        // block doc comment and `/**` starts an outer block doc comment.
        let is_inner_doc: bool = fork.matches("!");
        // More than one asterisk after the opening `/*` (e.g. `/***`) does not start a doc comment.
        let is_outer_doc: bool = fork.matches("*") && !fork.matches("**");

        // Consume until we see the end of the comment. If we run out of characters, consider the
        // comment unterminated.
        while !fork.matches(MULTI_LINE_COMMENT_END) {
            // Handle nested comments here:
            if fork.matches(MULTI_LINE_COMMENT_START) {
                // Discard the variant -- we don't care about doc comments inside other comments.
                let (nested_comment_bytes, _) = try_match_block_comment(&fork);

                // SAFETY: the number of bytes returned by this function never splits a UTF-8
                // character and never goes out of bounds, because it is always either 0 or
                // calculated using `offset_from`.
                unsafe { fork.advance_unchecked(nested_comment_bytes) };

                // Restart the loop to keep consuming this comment.
                continue;
            }

            // Handle unterminated comments here.
            if fork.remaining.is_empty() {
                // If we have not hit a "*/" before the end of the input, return an unterminated block comment.
                let bytes_consumed: usize = fork.offset_from(lexer);
                return (bytes_consumed, Some(TokenTy::UnterminatedBlockComment));
            }

            // If there's still input, and not a nested comment, consume it.
            fork.consume_any();
        }

        // If we get here, the comment was terminated. Consume the terminating characters, and return.
        // Use debug assert here to make sure that the comment is actually terminated.
        let consumed_comment_terminator: bool = fork.consume(MULTI_LINE_COMMENT_END);
        debug_assert!(consumed_comment_terminator, "should have consumed the block comment terminator");

        // Determine the kind of token to produce (if any).
        let variant: Option<TokenTy> = match (is_inner_doc, is_outer_doc) {
            (true, false) => Some(TokenTy::InnerBlockDocComment),
            (false, true) => Some(TokenTy::OuterBlockDocComment),
            (false, false) => None,
            (true, true) => {
                unreachable!("Lexer should not match multiple comment types at once.")
            }
        };

        return (fork.offset_from(lexer), variant);
    }

    (0, None)
}

#[cfg(test)]
mod tests {
    use super::Lexer;

    #[test]
    fn ignored_single_line_comment() {
        let mut lexer = Lexer::new_test("// test comment ");
        assert!(lexer.next_token().is_none());
        assert_eq!(lexer.remaining.len(), 0);
    }
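
    // The tests below are a sketch of additional coverage. They assume `Lexer::new_test` and the
    // `TokenTy` variants behave as referenced above, and that `TokenTy` implements `PartialEq`
    // and `Debug` (required by `assert_eq!`).
    use super::{try_match_block_comment, try_match_single_line_comment, TokenTy};

    #[test]
    fn single_line_doc_comments() {
        // `//!` is an inner doc comment, `///` is an outer doc comment.
        let inner = Lexer::new_test("//! inner");
        assert_eq!(try_match_single_line_comment(&inner), (9, Some(TokenTy::InnerDocComment)));

        let outer = Lexer::new_test("/// outer");
        assert_eq!(try_match_single_line_comment(&outer), (9, Some(TokenTy::OuterDocComment)));

        // Four or more slashes are a plain (non-doc) comment, so no token variant is produced.
        let plain = Lexer::new_test("//// plain");
        assert_eq!(try_match_single_line_comment(&plain), (10, None));
    }

    #[test]
    fn nested_block_comment() {
        // The inner `/* inner */` should be consumed as part of the outer comment.
        let lexer = Lexer::new_test("/* outer /* inner */ still outer */");
        let (bytes, variant) = try_match_block_comment(&lexer);
        assert_eq!(bytes, lexer.remaining.len());
        assert_eq!(variant, None);
    }

    #[test]
    fn unterminated_block_comment() {
        let lexer = Lexer::new_test("/* never closed");
        let (bytes, variant) = try_match_block_comment(&lexer);
        assert_eq!(bytes, 15);
        assert_eq!(variant, Some(TokenTy::UnterminatedBlockComment));
    }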
}