wright/source_tracking/
immutable_string.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
//! Structure and implementation relating to the representation of source files (as immutable strings) throughout
//! the Wright compiler and tooling.

use std::{str::CharIndices, sync::Arc};

#[cfg(feature = "file_memmap")]
use fs4::fs_std::FileExt;

#[cfg(feature = "file_memmap")]
use memmap2::Mmap;

#[cfg(feature = "file_memmap")]
use std::{fs::File, io};

/// An immutable string that either
/// - references a source string in memory using a `'static` reference,
/// - owns a source string in memory, or
/// - owns a locked and memory mapped file from the disk (only when the `file_memmap` feature is enabled).
///
/// This uses an [Arc] internally to make cloning cheap: a clone shares the same underlying
/// representation instead of copying the string data.
#[derive(Debug, Clone)]
pub struct ImmutableString {
    /// Wrap the internal enum representation. This is to avoid exposing the API for a user to construct an
    /// [ImmutableStringInner] without satisfying certain invariants.
    inner: Arc<ImmutableStringInner>,
}

impl ImmutableString {
    /// Build an [ImmutableString] by placing the given inner representation behind a fresh [Arc].
    #[inline]
    fn from_inner(inner: ImmutableStringInner) -> Self {
        let inner = Arc::new(inner);
        Self { inner }
    }

    /// Construct an [ImmutableString] backed by a memory mapped file. This takes ownership of both the
    /// [File] (assumed to already be locked with [fs4]) and the [Mmap] mapping that file to memory.
    ///
    /// The caller is required to make sure that the memory mapped by the given [Mmap] is valid UTF-8
    /// (for example by checking it with [std::str::from_utf8]) before calling this function.
    #[cfg(feature = "file_memmap")]
    pub(super) fn new_locked_file(file: File, mem_map: Mmap) -> Self {
        let locked = ImmutableStringInner::LockedFile {
            locked_file: file,
            mem_map,
        };

        Self::from_inner(locked)
    }

    /// Construct an [ImmutableString] that takes ownership of a heap allocated string.
    pub(super) fn new_owned(boxed_str: Box<str>) -> Self {
        let owned = ImmutableStringInner::Owned(boxed_str);
        Self::from_inner(owned)
    }

    /// Construct an [ImmutableString] that points directly at a `'static` string.
    pub(super) fn new_static(str_ref: &'static str) -> Self {
        let borrowed = ImmutableStringInner::Static(str_ref);
        Self::from_inner(borrowed)
    }

    /// Get the byte indices into this [ImmutableString] at which each line starts.
    ///
    /// The first character of a non-empty string always starts a line, as does every character that
    /// directly follows a `\n`. An empty string has no lines, and a `\n` at the very end of the string
    /// does not start an additional (empty) line.
    pub fn line_starts(&self) -> Vec<usize> {
        // Walk the characters of the string together with their byte offsets.
        let chars_with_offsets: CharIndices = self.as_str().char_indices();
        let mut starts: Vec<usize> = Vec::new();
        // Starts out `true` so that the very first character is recorded as the start of a line.
        let mut at_line_start: bool = true;

        for (offset, character) in chars_with_offsets {
            if at_line_start {
                starts.push(offset);
            }

            // Whatever follows a newline character begins the next line.
            at_line_start = character == '\n';
        }

        starts
    }

    /// Borrow this [ImmutableString] as a [str].
    /// This is a convenience wrapper around [AsRef::as_ref].
    pub fn as_str(&self) -> &str {
        <Self as AsRef<str>>::as_ref(self)
    }

    /// Get the number of bytes in this [ImmutableString].
    /// See [str::len].
    pub fn len(&self) -> usize {
        let text: &str = self.as_str();
        text.len()
    }

    /// Check whether this [ImmutableString] contains no bytes at all.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.as_str().is_empty()
    }
}

impl AsRef<str> for ImmutableString {
    fn as_ref(&self) -> &str {
        (*self.inner).as_ref()
    }
}

/// The internal enum representation of the immutable string.
///
/// Values of this type are wrapped by [ImmutableString], which keeps this enum private so that it can
/// only be built through constructors that uphold its invariants.
#[derive(Debug)]
enum ImmutableStringInner {
    /// An immutable reference to an existing static string.
    Static(&'static str),

    /// An owned immutable string.
    Owned(Box<str>),

    /// A locked, memory mapped file from the disk.
    /// This variant only exists when the `file_memmap` feature is enabled.
    #[cfg(feature = "file_memmap")]
    LockedFile {
        /// The locked file that gets unlocked when this struct is dropped.
        locked_file: File,

        /// The memory mapped file.
        ///
        /// # Safety
        /// - Undefined behavior occurs if the file on disk is modified while memory mapped. Always lock the
        ///     file (in this crate's case, using [fs4]) before creating this [Mmap] for it.
        ///     See [Mmap] for more details.
        /// - This struct assumes that any memory-mapped files have their UTF-8 validity checked by the caller.
        ///     Specifically, the [ImmutableString::as_ref] method relies on [std::str::from_utf8_unchecked],
        ///     so if you do not ensure the [Mmap] is valid UTF-8, you will run into undefined behavior.
        mem_map: Mmap,
    },
}

/// Implement [Drop] so that files from disk are unlocked again when their representation is dropped.
#[cfg(feature = "file_memmap")]
impl Drop for ImmutableStringInner {
    fn drop(&mut self) {
        // Only the file-backed variant holds a lock; the owned and static variants need no clean-up here.
        if let ImmutableStringInner::LockedFile { locked_file, .. } = self {
            let failure: Option<io::Error> = locked_file.unlock().err();

            // An error cannot be returned from `drop`, so it is reported on stderr and otherwise ignored.
            if let Some(io_err) = failure {
                eprintln!("{}", io_err);
            }
        }
    }
}

impl AsRef<str> for ImmutableStringInner {
    fn as_ref(&self) -> &str {
        match self {
            ImmutableStringInner::Static(str) => str,
            ImmutableStringInner::Owned(str) => str,

            #[cfg(feature = "file_memmap")]
            ImmutableStringInner::LockedFile { mem_map, .. } => {
                // Get a direct reference to the data that is in the memory map.
                let raw_data: &[u8] = mem_map.as_ref();
                // SAFETY: UTF-8 validity is checked when the file is added to the file map, or by the API consumer.
                unsafe { std::str::from_utf8_unchecked(raw_data) }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::ImmutableString;

    /// Compute the line starts of the given static source string.
    fn starts_of(source: &'static str) -> Vec<usize> {
        ImmutableString::new_static(source).line_starts()
    }

    /// Lines are separated by `\n`, and an empty line in the middle still gets its own start index.
    #[test]
    fn test_line_starts() {
        let v: Vec<usize> = starts_of("a\n\nb\nc");

        assert_eq!(v.as_slice(), &[0, 2, 3, 5]);
    }

    /// An empty source has no characters, and therefore no line starts at all.
    #[test]
    fn test_line_starts_empty() {
        assert!(starts_of("").is_empty());
    }

    /// A newline at the very end of the source does not start another line.
    #[test]
    fn test_line_starts_trailing_newline() {
        assert_eq!(starts_of("a\n"), vec![0]);
        assert_eq!(starts_of("a\nb\n"), vec![0, 2]);
    }

    /// Newline characters themselves start a line when they come first or follow another newline.
    #[test]
    fn test_line_starts_only_newlines() {
        assert_eq!(starts_of("\n"), vec![0]);
        assert_eq!(starts_of("\n\n"), vec![0, 1]);
    }

    /// Line starts are byte indices, so characters that take multiple bytes shift the following starts.
    #[test]
    fn test_line_starts_multibyte() {
        // U+00E9 takes two bytes in UTF-8, so the newline is at byte 2 and the next line starts at byte 3.
        assert_eq!(starts_of("\u{e9}\nb"), vec![0, 3]);
    }
}