anstream/adapter/
strip.rs

1use anstyle_parse::state::state_change;
2use anstyle_parse::state::Action;
3use anstyle_parse::state::State;
4
5/// Strip ANSI escapes from a `&str`, returning the printable content
6///
7/// This can be used to take output from a program that includes escape sequences and write it
8/// somewhere that does not easily support them, such as a log file.
9///
10/// For non-contiguous data, see [`StripStr`].
11///
12/// # Example
13///
14/// ```rust
15/// use std::io::Write as _;
16///
17/// let styled_text = "\x1b[32mfoo\x1b[m bar";
18/// let plain_str = anstream::adapter::strip_str(&styled_text).to_string();
19/// assert_eq!(plain_str, "foo bar");
20/// ```
21#[inline]
22pub fn strip_str(data: &str) -> StrippedStr<'_> {
23    StrippedStr::new(data)
24}
25
26/// See [`strip_str`]
27#[derive(Default, Clone, Debug, PartialEq, Eq)]
28pub struct StrippedStr<'s> {
29    bytes: &'s [u8],
30    state: State,
31}
32
33impl<'s> StrippedStr<'s> {
34    #[inline]
35    fn new(data: &'s str) -> Self {
36        Self {
37            bytes: data.as_bytes(),
38            state: State::Ground,
39        }
40    }
41
42    /// Create a [`String`] of the printable content
43    #[inline]
44    #[allow(clippy::inherent_to_string_shadow_display)] // Single-allocation implementation
45    pub fn to_string(&self) -> String {
46        use std::fmt::Write as _;
47        let mut stripped = String::with_capacity(self.bytes.len());
48        let _ = write!(&mut stripped, "{self}");
49        stripped
50    }
51}
52
53impl<'s> std::fmt::Display for StrippedStr<'s> {
54    /// **Note:** this does *not* exhaust the [`Iterator`]
55    #[inline]
56    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
57        let iter = Self {
58            bytes: self.bytes,
59            state: self.state,
60        };
61        for printable in iter {
62            printable.fmt(f)?;
63        }
64        Ok(())
65    }
66}
67
68impl<'s> Iterator for StrippedStr<'s> {
69    type Item = &'s str;
70
71    #[inline]
72    fn next(&mut self) -> Option<Self::Item> {
73        next_str(&mut self.bytes, &mut self.state)
74    }
75}
76
77/// Incrementally strip non-contiguous data
78#[derive(Default, Clone, Debug, PartialEq, Eq)]
79pub struct StripStr {
80    state: State,
81}
82
83impl StripStr {
84    /// Initial state
85    pub fn new() -> Self {
86        Default::default()
87    }
88
89    /// Strip the next segment of data
90    pub fn strip_next<'s>(&'s mut self, data: &'s str) -> StripStrIter<'s> {
91        StripStrIter {
92            bytes: data.as_bytes(),
93            state: &mut self.state,
94        }
95    }
96}
97
98/// See [`StripStr`]
99#[derive(Debug, PartialEq, Eq)]
100pub struct StripStrIter<'s> {
101    bytes: &'s [u8],
102    state: &'s mut State,
103}
104
105impl<'s> Iterator for StripStrIter<'s> {
106    type Item = &'s str;
107
108    #[inline]
109    fn next(&mut self) -> Option<Self::Item> {
110        next_str(&mut self.bytes, self.state)
111    }
112}
113
114#[inline]
115fn next_str<'s>(bytes: &mut &'s [u8], state: &mut State) -> Option<&'s str> {
116    let offset = bytes.iter().copied().position(|b| {
117        let (next_state, action) = state_change(*state, b);
118        if next_state != State::Anywhere {
119            *state = next_state;
120        }
121        is_printable_bytes(action, b)
122    });
123    let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
124    *bytes = next;
125    *state = State::Ground;
126
127    let offset = bytes.iter().copied().position(|b| {
128        let (_next_state, action) = state_change(State::Ground, b);
129        !(is_printable_bytes(action, b) || is_utf8_continuation(b))
130    });
131    let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
132    *bytes = next;
133    if printable.is_empty() {
134        None
135    } else {
136        let printable = unsafe {
137            from_utf8_unchecked(
138                printable,
139                "`bytes` was validated as UTF-8, the parser preserves UTF-8 continuations",
140            )
141        };
142        Some(printable)
143    }
144}
145
/// Reinterpret `bytes` as a `&str`, validating only when debug assertions are enabled
///
/// With debug assertions the bytes are checked and a violation panics with
/// `safety_justification` as the message; otherwise no check is performed.
///
/// # Safety
///
/// `bytes` must be valid UTF-8.
#[inline]
unsafe fn from_utf8_unchecked<'b>(bytes: &'b [u8], safety_justification: &'static str) -> &'b str {
    if cfg!(debug_assertions) {
        // Catch problems more quickly when testing
        return std::str::from_utf8(bytes).expect(safety_justification);
    }
    // SAFETY: the caller guarantees that `bytes` is valid UTF-8
    unsafe { std::str::from_utf8_unchecked(bytes) }
}
157
/// Report whether `b` is a UTF-8 continuation byte (`0x80..=0xbf`)
#[inline]
fn is_utf8_continuation(b: u8) -> bool {
    // Continuation bytes are exactly those with the bit pattern `10xx_xxxx`
    b & 0b1100_0000 == 0b1000_0000
}
162
163/// Strip ANSI escapes from bytes, returning the printable content
164///
165/// This can be used to take output from a program that includes escape sequences and write it
166/// somewhere that does not easily support them, such as a log file.
167///
168/// # Example
169///
170/// ```rust
171/// use std::io::Write as _;
172///
173/// let styled_text = "\x1b[32mfoo\x1b[m bar";
174/// let plain_str = anstream::adapter::strip_bytes(styled_text.as_bytes()).into_vec();
175/// assert_eq!(plain_str.as_slice(), &b"foo bar"[..]);
176/// ```
177#[inline]
178pub fn strip_bytes(data: &[u8]) -> StrippedBytes<'_> {
179    StrippedBytes::new(data)
180}
181
182/// See [`strip_bytes`]
183#[derive(Default, Clone, Debug, PartialEq, Eq)]
184pub struct StrippedBytes<'s> {
185    bytes: &'s [u8],
186    state: State,
187    utf8parser: Utf8Parser,
188}
189
190impl<'s> StrippedBytes<'s> {
191    /// See [`strip_bytes`]
192    #[inline]
193    pub fn new(bytes: &'s [u8]) -> Self {
194        Self {
195            bytes,
196            state: State::Ground,
197            utf8parser: Default::default(),
198        }
199    }
200
201    /// Strip the next slice of bytes
202    ///
203    /// Used when the content is in several non-contiguous slices
204    ///
205    /// # Panic
206    ///
207    /// May panic if it is not exhausted / empty
208    #[inline]
209    pub fn extend(&mut self, bytes: &'s [u8]) {
210        debug_assert!(
211            self.is_empty(),
212            "current bytes must be processed to ensure we end at the right state"
213        );
214        self.bytes = bytes;
215    }
216
217    /// Report the bytes has been exhausted
218    #[inline]
219    pub fn is_empty(&self) -> bool {
220        self.bytes.is_empty()
221    }
222
223    /// Create a [`Vec`] of the printable content
224    #[inline]
225    pub fn into_vec(self) -> Vec<u8> {
226        let mut stripped = Vec::with_capacity(self.bytes.len());
227        for printable in self {
228            stripped.extend(printable);
229        }
230        stripped
231    }
232}
233
234impl<'s> Iterator for StrippedBytes<'s> {
235    type Item = &'s [u8];
236
237    #[inline]
238    fn next(&mut self) -> Option<Self::Item> {
239        next_bytes(&mut self.bytes, &mut self.state, &mut self.utf8parser)
240    }
241}
242
243/// Incrementally strip non-contiguous data
244#[derive(Default, Clone, Debug, PartialEq, Eq)]
245pub struct StripBytes {
246    state: State,
247    utf8parser: Utf8Parser,
248}
249
250impl StripBytes {
251    /// Initial state
252    pub fn new() -> Self {
253        Default::default()
254    }
255
256    /// Strip the next segment of data
257    pub fn strip_next<'s>(&'s mut self, bytes: &'s [u8]) -> StripBytesIter<'s> {
258        StripBytesIter {
259            bytes,
260            state: &mut self.state,
261            utf8parser: &mut self.utf8parser,
262        }
263    }
264}
265
266/// See [`StripBytes`]
267#[derive(Debug, PartialEq, Eq)]
268pub struct StripBytesIter<'s> {
269    bytes: &'s [u8],
270    state: &'s mut State,
271    utf8parser: &'s mut Utf8Parser,
272}
273
274impl<'s> Iterator for StripBytesIter<'s> {
275    type Item = &'s [u8];
276
277    #[inline]
278    fn next(&mut self) -> Option<Self::Item> {
279        next_bytes(&mut self.bytes, self.state, self.utf8parser)
280    }
281}
282
283#[inline]
284fn next_bytes<'s>(
285    bytes: &mut &'s [u8],
286    state: &mut State,
287    utf8parser: &mut Utf8Parser,
288) -> Option<&'s [u8]> {
289    let offset = bytes.iter().copied().position(|b| {
290        if *state == State::Utf8 {
291            true
292        } else {
293            let (next_state, action) = state_change(*state, b);
294            if next_state != State::Anywhere {
295                *state = next_state;
296            }
297            is_printable_bytes(action, b)
298        }
299    });
300    let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
301    *bytes = next;
302
303    let offset = bytes.iter().copied().position(|b| {
304        if *state == State::Utf8 {
305            if utf8parser.add(b) {
306                *state = State::Ground;
307            }
308            false
309        } else {
310            let (next_state, action) = state_change(State::Ground, b);
311            if next_state != State::Anywhere {
312                *state = next_state;
313            }
314            if *state == State::Utf8 {
315                utf8parser.add(b);
316                false
317            } else {
318                !is_printable_bytes(action, b)
319            }
320        }
321    });
322    let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
323    *bytes = next;
324    if printable.is_empty() {
325        None
326    } else {
327        Some(printable)
328    }
329}
330
331#[derive(Default, Clone, Debug, PartialEq, Eq)]
332pub(crate) struct Utf8Parser {
333    utf8_parser: utf8parse::Parser,
334}
335
336impl Utf8Parser {
337    fn add(&mut self, byte: u8) -> bool {
338        let mut b = false;
339        let mut receiver = VtUtf8Receiver(&mut b);
340        self.utf8_parser.advance(&mut receiver, byte);
341        b
342    }
343}
344
345struct VtUtf8Receiver<'a>(&'a mut bool);
346
347impl<'a> utf8parse::Receiver for VtUtf8Receiver<'a> {
348    fn codepoint(&mut self, _: char) {
349        *self.0 = true;
350    }
351
352    fn invalid_sequence(&mut self) {
353        *self.0 = true;
354    }
355}
356
357#[inline]
358fn is_printable_bytes(action: Action, byte: u8) -> bool {
359    // VT320 considered 0x7f to be `Print`able but we expect to be working in UTF-8 systems and not
360    // ISO Latin-1, making it DEL and non-printable
361    const DEL: u8 = 0x7f;
362
363    // Continuations aren't included as they may also be control codes, requiring more context
364    (action == Action::Print && byte != DEL)
365        || action == Action::BeginUtf8
366        || (action == Action::Execute && byte.is_ascii_whitespace())
367}
368
#[cfg(test)]
mod test {
    use super::*;
    use proptest::prelude::*;

    /// Model based off full parser
    ///
    /// Keeps every printed character plus any executed byte that is ASCII whitespace.
    fn parser_strip(bytes: &[u8]) -> String {
        #[derive(Default)]
        struct Strip(String);
        impl Strip {
            fn with_capacity(capacity: usize) -> Self {
                Self(String::with_capacity(capacity))
            }
        }
        impl anstyle_parse::Perform for Strip {
            fn print(&mut self, c: char) {
                self.0.push(c);
            }

            fn execute(&mut self, byte: u8) {
                if byte.is_ascii_whitespace() {
                    self.0.push(byte as char);
                }
            }
        }

        let mut stripped = Strip::with_capacity(bytes.len());
        let mut parser = anstyle_parse::Parser::<anstyle_parse::DefaultCharAccumulator>::new();
        for byte in bytes {
            parser.advance(&mut stripped, *byte);
        }
        stripped.0
    }

    /// Model verifying incremental parsing
    ///
    /// Feeds the input to a single [`StripStr`] one `char` at a time.
    fn strip_char(s: &str) -> String {
        let mut result = String::new();
        let mut state = StripStr::new();
        for (start, c) in s.char_indices() {
            let current = &s[start..start + c.len_utf8()];
            result.extend(state.strip_next(current));
        }
        result
    }

    /// Model verifying incremental parsing
    ///
    /// Feeds the input to a single [`StripBytes`] one byte at a time.
    fn strip_byte(s: &[u8]) -> Vec<u8> {
        let mut result = Vec::new();
        let mut state = StripBytes::default();
        for current in s.chunks(1) {
            for printable in state.strip_next(current) {
                result.extend(printable);
            }
        }
        result
    }

    #[test]
    fn test_strip_bytes_multibyte() {
        let bytes = [240, 145, 141, 139];
        let expected = parser_strip(&bytes);
        let actual = String::from_utf8(strip_bytes(&bytes).into_vec()).unwrap();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_byte_multibyte() {
        let bytes = [240, 145, 141, 139];
        let expected = parser_strip(&bytes);
        let actual = String::from_utf8(strip_byte(&bytes)).unwrap();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_str_del() {
        let input = std::str::from_utf8(&[0x7f]).unwrap();
        let expected = "";
        let actual = strip_str(input).to_string();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_byte_del() {
        let bytes = [0x7f];
        let expected = "";
        let actual = String::from_utf8(strip_byte(&bytes)).unwrap();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_str_handles_broken_sequence() {
        // valid utf8: \xc3\xb6 then \x1b then \xf0\x9f\x98\x80
        let s = "ö\x1b😀hello😀goodbye";
        let mut it = strip_str(s);
        assert_eq!("ö", it.next().unwrap());
        assert_eq!("ello😀goodbye", it.next().unwrap());
    }

    proptest! {
        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_str_no_escapes(s in "\\PC*") {
            let expected = parser_strip(s.as_bytes());
            let actual = strip_str(&s).to_string();
            assert_eq!(expected, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_char_no_escapes(s in "\\PC*") {
            let expected = parser_strip(s.as_bytes());
            let actual = strip_char(&s);
            assert_eq!(expected, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_bytes_no_escapes(s in "\\PC*") {
            let expected = parser_strip(s.as_bytes());
            let actual = String::from_utf8(strip_bytes(s.as_bytes()).into_vec()).unwrap();
            assert_eq!(expected, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_byte_no_escapes(s in "\\PC*") {
            let expected = parser_strip(s.as_bytes());
            let actual = String::from_utf8(strip_byte(s.as_bytes())).unwrap();
            assert_eq!(expected, actual);
        }
    }
}