relay_pii/
attachments.rs

1use regex::bytes::RegexBuilder as BytesRegexBuilder;
2use regex::{Match, Regex};
3use relay_event_schema::processor::{FieldAttrs, Pii, ProcessingState, ValueType};
4use smallvec::SmallVec;
5use std::borrow::Cow;
6use std::iter::FusedIterator;
7use utf16string::{LittleEndian, WStr};
8
9use crate::compiledconfig::RuleRef;
10use crate::regexes::{get_regex_for_rule_type, ReplaceBehavior};
11use crate::{transform, utils, CompiledPiiConfig, JsonScrubError, JsonScrubVisitor, Redaction};
12
/// Minimum number of characters a decoded string must have to be considered for scrubbing.
///
/// Text is discovered by decoding stretches of a binary blob.  Decoded runs with fewer
/// characters than this are skipped and never matched against scrubbing rules.
const MIN_STRING_LEN: usize = 5;
19
20fn apply_regex_to_utf8_bytes(
21    data: &mut [u8],
22    rule: &RuleRef,
23    regex: &Regex,
24    replace_behavior: &ReplaceBehavior,
25) -> SmallVec<[(usize, usize); 1]> {
26    let mut matches = SmallVec::<[(usize, usize); 1]>::new();
27
28    let regex = match BytesRegexBuilder::new(regex.as_str())
29        // https://github.com/rust-lang/regex/issues/697
30        .unicode(false)
31        .multi_line(false)
32        .dot_matches_new_line(true)
33        .build()
34    {
35        Ok(x) => x,
36        Err(e) => {
37            // XXX: This is not going to fly long-term
38            // Idea: Disable unicode support for regexes entirely, that drastically increases the
39            // likelihood this conversion will never fail.
40
41            // If we see this error in production, it means we need to add more regex validation
42            // to `validate_pii_config` (which is called sentry-side).
43            relay_log::error!(
44                error = &e as &dyn std::error::Error,
45                pattern = regex.as_str(),
46                "Regex failed to compile in non-unicode mode",
47            );
48            return matches;
49        }
50    };
51
52    for captures in regex.captures_iter(data) {
53        for (idx, group) in captures.iter().enumerate() {
54            if let Some(group) = group {
55                if group.start() == group.end() {
56                    continue;
57                }
58
59                match replace_behavior {
60                    ReplaceBehavior::Groups(ref replace_groups) => {
61                        if replace_groups.contains(&(idx as u8)) {
62                            matches.push((group.start(), group.end()));
63                        }
64                    }
65                    ReplaceBehavior::Value => {
66                        matches.push((0, data.len()));
67                        break;
68                    }
69                }
70            }
71        }
72    }
73
74    for (start, end) in matches.iter() {
75        data[*start..*end].apply_redaction(&rule.redaction);
76    }
77    matches
78}
79
80fn apply_regex_to_utf16le_bytes(
81    data: &mut [u8],
82    rule: &RuleRef,
83    regex: &Regex,
84    replace_behavior: &ReplaceBehavior,
85) -> bool {
86    let mut changed = false;
87    for segment in WStrSegmentIter::new(data) {
88        match replace_behavior {
89            ReplaceBehavior::Value => {
90                for re_match in regex.find_iter(&segment.decoded) {
91                    changed = true;
92                    let match_wstr = get_wstr_match(&segment.decoded, re_match, segment.encoded);
93                    match_wstr.apply_redaction(&rule.redaction);
94                }
95            }
96            ReplaceBehavior::Groups(ref replace_groups) => {
97                for captures in regex.captures_iter(&segment.decoded) {
98                    for group_idx in replace_groups.iter() {
99                        if let Some(re_match) = captures.get(*group_idx as usize) {
100                            changed = true;
101                            let match_wstr =
102                                get_wstr_match(&segment.decoded, re_match, segment.encoded);
103                            match_wstr.apply_redaction(&rule.redaction);
104                        }
105                    }
106                }
107            }
108        }
109    }
110    changed
111}
112
113/// Extract the matching encoded slice from the encoded string.
114fn get_wstr_match<'a>(
115    all_text: &str,
116    re_match: Match,
117    all_encoded: &'a mut WStr<LittleEndian>,
118) -> &'a mut WStr<LittleEndian> {
119    let mut encoded_start = 0;
120    let mut encoded_end = all_encoded.len();
121
122    let offsets_iter = all_text.char_indices().zip(all_encoded.char_indices());
123    for ((text_offset, _text_char), (encoded_offset, _encoded_char)) in offsets_iter {
124        if text_offset == re_match.start() {
125            encoded_start = encoded_offset;
126        }
127        if text_offset == re_match.end() {
128            encoded_end = encoded_offset;
129            break;
130        }
131    }
132    &mut all_encoded[encoded_start..encoded_end]
133}
134
135/// Traits to modify the strings in ways we need.
136trait StringMods: AsRef<[u8]> {
137    /// Replace this string's contents by repeating the given character into it.
138    ///
139    /// # Panics
140    ///
141    /// The `fill_char` has to encode to the smallest encoding unit, otherwise this will
142    /// panic.  Using an ASCII replacement character is usually safe in most encodings.
143    fn fill_content(&mut self, fill_char: char);
144
145    /// Replace this string's contents with the given replacement string.
146    ///
147    /// If the replacement string encodes to a shorter byte-slice than the current string
148    /// any remaining space will be filled with the padding character.
149    ///
150    /// If the replacement string encodes to a longer byte-slice than the current string the
151    /// replacement string is truncated.  If this does not align with a character boundary
152    /// in the replacement string it is further trucated to the previous character boundary
153    /// and the remainder is filled with the padding char.
154    ///
155    /// # Panics
156    ///
157    /// The `padding` character has to encode to the smallest encoding unit, otherwise this
158    /// will panic.  Using an ASCII padding character is usually safe in most encodings.
159    fn swap_content(&mut self, replacement: &str, padding: char);
160
161    /// Apply a PII scrubbing redaction to this string slice.
162    fn apply_redaction(&mut self, redaction: &Redaction) {
163        const PADDING: char = '*';
164        const MASK: char = '*';
165
166        match redaction {
167            Redaction::Default | Redaction::Remove => {
168                self.fill_content(PADDING);
169            }
170            Redaction::Mask => {
171                self.fill_content(MASK);
172            }
173            Redaction::Hash => {
174                let hashed = utils::hash_value(self.as_ref());
175                self.swap_content(&hashed, PADDING);
176            }
177            Redaction::Replace(ref replace) => {
178                self.swap_content(replace.text.as_str(), PADDING);
179            }
180            Redaction::Other => relay_log::warn!("Incoming redaction is not supported"),
181        }
182    }
183}
184
185impl StringMods for WStr<LittleEndian> {
186    fn fill_content(&mut self, fill_char: char) {
187        // If fill_char is too wide, fill_char.encode_utf16() will panic, fulfilling the
188        // trait's contract that we must panic if fill_char is too wide.
189        let mut buf = [0u16; 1];
190        let fill_u16 = fill_char.encode_utf16(&mut buf[..]);
191        let fill_buf = fill_u16[0].to_le_bytes();
192
193        unsafe {
194            let chunks = self
195                .as_bytes_mut()
196                .chunks_exact_mut(std::mem::size_of::<u16>());
197            for chunk in chunks {
198                chunk.copy_from_slice(&fill_buf);
199            }
200        }
201    }
202
203    fn swap_content(&mut self, replacement: &str, padding: char) {
204        // If the padding char is too wide, padding.encode_utf16() will panic, fulfilling
205        // the trait's contract that we must panic in this case.
206        let len = self.len();
207
208        let mut buf = [0u16; 1];
209        padding.encode_utf16(&mut buf[..]);
210        let fill_buf = buf[0].to_le_bytes();
211
212        let mut offset = 0;
213        for code in replacement.encode_utf16() {
214            let char_len = if 0xD800 & code == 0xD800 {
215                std::mem::size_of::<u16>() * 2 // leading surrogate
216            } else {
217                std::mem::size_of::<u16>()
218            };
219            if (len - offset) < char_len {
220                break; // Not enough space for this char
221            }
222            unsafe {
223                let target = &mut self.as_bytes_mut()[offset..offset + std::mem::size_of::<u16>()];
224                target.copy_from_slice(&code.to_le_bytes());
225            }
226            offset += std::mem::size_of::<u16>();
227        }
228
229        unsafe {
230            let remainder_bytes = &mut self.as_bytes_mut()[offset..];
231            let chunks = remainder_bytes.chunks_exact_mut(std::mem::size_of::<u16>());
232            for chunk in chunks {
233                chunk.copy_from_slice(&fill_buf);
234            }
235        }
236    }
237}
238
239impl StringMods for [u8] {
240    fn fill_content(&mut self, fill_char: char) {
241        // If fill_char is too wide, fill_char.encode_utf16() will panic, fulfilling the
242        // trait's contract that we must panic if fill_char is too wide.
243        let mut buf = [0u8; 1];
244        fill_char.encode_utf8(&mut buf[..]);
245        for byte in self {
246            *byte = buf[0];
247        }
248    }
249
250    fn swap_content(&mut self, replacement: &str, padding: char) {
251        // If the padding char is too wide, padding.encode_utf16() will panic, fulfilling
252        // the trait's contract that we must panic in this case.
253        let mut buf = [0u8; 1];
254        padding.encode_utf8(&mut buf[..]);
255
256        let cutoff = replacement.len().min(self.len());
257        let (left, right) = self.split_at_mut(cutoff);
258        left.copy_from_slice(&replacement.as_bytes()[..cutoff]);
259
260        for byte in right {
261            *byte = buf[0];
262        }
263    }
264}
265
/// Iterates over the text segments embedded in a block of binary data.
///
/// The iterator scans the data for stretches that are valid little-endian UTF-16 and
/// yields each of them as a segment carrying both the encoded and the decoded text.
struct WStrSegmentIter<'a> {
    /// The complete block of data being scanned.
    data: &'a mut [u8],
    /// Byte offset into `data` at which the next scan starts.
    offset: usize,
}

impl<'a> WStrSegmentIter<'a> {
    /// Creates an iterator that starts scanning at the beginning of `data`.
    fn new(data: &'a mut [u8]) -> Self {
        WStrSegmentIter { data, offset: 0 }
    }
}
281
282impl<'a> Iterator for WStrSegmentIter<'a> {
283    type Item = WStrSegment<'a>;
284
285    fn next(&mut self) -> Option<Self::Item> {
286        loop {
287            if self.offset >= self.data.len() {
288                return None;
289            }
290
291            let slice = match WStr::from_utf16le_mut(&mut self.data[self.offset..]) {
292                Ok(wstr) => {
293                    self.offset += wstr.len();
294                    unsafe { wstr.as_bytes_mut() }
295                }
296                Err(err) => {
297                    let start = self.offset;
298                    let end = start + err.valid_up_to();
299                    match err.error_len() {
300                        Some(len) => self.offset += err.valid_up_to() + len,
301                        None => self.offset = self.data.len(),
302                    }
303                    &mut self.data[start..end]
304                }
305            };
306
307            // We are handing out multiple mutable slices from the same mutable slice.  This
308            // is safe because we know they are not overlapping.  However the compiler
309            // doesn't know this so we need to transmute the lifetimes of the slices we
310            // return with std::slice::from_raw_parts_mut().
311            let ptr = slice.as_mut_ptr();
312            let len = slice.len();
313            let encoded = unsafe {
314                WStr::from_utf16le_unchecked_mut(std::slice::from_raw_parts_mut(ptr, len))
315            };
316
317            if encoded.chars().take(MIN_STRING_LEN).count() < MIN_STRING_LEN {
318                continue;
319            }
320            let decoded = encoded.to_utf8();
321            return Some(WStrSegment { encoded, decoded });
322        }
323    }
324}
325
326impl FusedIterator for WStrSegmentIter<'_> {}
327
328/// An encoded string segment in a larger data block.
329///
330/// The slice of data will contain the entire block which will be valid according to the
331/// encoding.  This will be a unique sub-slice of the data in [`WStrSegmentIter`] as the
332/// iterator will not yield overlapping slices.
333///
334/// While the `data` field is mutable, after mutating this the string in `decoded` will no
335/// longer match.
336struct WStrSegment<'a> {
337    /// The raw bytes of this segment.
338    encoded: &'a mut WStr<LittleEndian>,
339    /// The decoded string of this segment.
340    decoded: String,
341}
342
343/// A PII processor for attachment files.
344pub struct PiiAttachmentsProcessor<'a> {
345    compiled_config: &'a CompiledPiiConfig,
346    root_state: ProcessingState<'static>,
347}
348
/// Which encodings to scrub for `scrub_bytes`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ScrubEncodings {
    /// Scrub UTF-8.
    Utf8,

    /// Scrub UTF-16LE (little endian).
    Utf16Le,

    /// Attempt to scrub in all available encodings.
    All,
}
360
361impl<'a> PiiAttachmentsProcessor<'a> {
362    /// Creates a new `PiiAttachmentsProcessor` from the given PII config.
363    pub fn new(compiled_config: &'a CompiledPiiConfig) -> Self {
364        // this constructor needs to be cheap... a new PiiProcessor is created for each event. Move
365        // any init logic into CompiledPiiConfig::new.
366
367        let root_state =
368            ProcessingState::root().enter_static("", None, Some(ValueType::Attachments));
369
370        PiiAttachmentsProcessor {
371            compiled_config,
372            root_state,
373        }
374    }
375
376    /// Returns the processing state for the file with the given name.
377    pub(crate) fn state<'s>(
378        &'s self,
379        filename: &'s str,
380        value_type: ValueType,
381    ) -> ProcessingState<'s> {
382        self.root_state.enter_borrowed(
383            filename,
384            Some(Cow::Owned(FieldAttrs::new().pii(Pii::True))),
385            Some(value_type),
386        )
387    }
388
389    /// Applies PII rules to a plain buffer.
390    ///
391    /// Returns `true`, if the buffer was modified.
392    pub(crate) fn scrub_bytes(
393        &self,
394        data: &mut [u8],
395        state: &ProcessingState<'_>,
396        encodings: ScrubEncodings,
397    ) -> bool {
398        let pii = state.attrs().pii;
399        if pii == Pii::False {
400            return false;
401        }
402
403        let mut changed = false;
404
405        for (selector, rules) in &self.compiled_config.applications {
406            if selector.matches_path(&state.path()) {
407                for rule in rules {
408                    // Note:
409                    //
410                    // - We ignore pattern_type and just treat every regex like a value regex (i.e.
411                    //   redactPair becomes pattern rule). Very unlikely anybody would want that
412                    //   behavior (e.g.  "Remove passwords on **" would remove a file called
413                    //   "passwords.txt", but also "author.txt").  Just use selectors!
414                    //
415                    // - We impose severe restrictions on how redaction methods work, as we must
416                    //   not change the lengths of attachments.
417                    for (_pattern_type, regex, replace_behavior) in
418                        get_regex_for_rule_type(&rule.ty)
419                    {
420                        match encodings {
421                            ScrubEncodings::Utf8 => {
422                                let matches =
423                                    apply_regex_to_utf8_bytes(data, rule, regex, &replace_behavior);
424                                changed |= !(matches.is_empty());
425                            }
426                            ScrubEncodings::Utf16Le => {
427                                changed |= apply_regex_to_utf16le_bytes(
428                                    data,
429                                    rule,
430                                    regex,
431                                    &replace_behavior,
432                                );
433                            }
434                            ScrubEncodings::All => {
435                                let matches =
436                                    apply_regex_to_utf8_bytes(data, rule, regex, &replace_behavior);
437                                changed |= !(matches.is_empty());
438
439                                // Only scrub regions with the UTF-16 scrubber if they haven't been
440                                // scrubbed yet.
441                                let unscrubbed_ranges = matches
442                                    .into_iter()
443                                    .chain(std::iter::once((data.len(), 0)))
444                                    .scan((0usize, 0usize), |previous, current| {
445                                        let start = if previous.1 % 2 == 0 {
446                                            previous.1
447                                        } else {
448                                            previous.1 + 1
449                                        };
450                                        let item = (start, current.0);
451                                        *previous = current;
452                                        Some(item)
453                                    })
454                                    .filter(|(start, end)| end > start);
455                                for (start, end) in unscrubbed_ranges {
456                                    changed |= apply_regex_to_utf16le_bytes(
457                                        &mut data[start..end],
458                                        rule,
459                                        regex,
460                                        &replace_behavior,
461                                    );
462                                }
463                            }
464                        }
465                    }
466                }
467            }
468        }
469
470        changed
471    }
472
473    /// Applies PII scrubbing rules to a plain attachment.
474    ///
475    /// Returns `true`, if the attachment was modified.
476    pub fn scrub_attachment(&self, filename: &str, data: &mut [u8]) -> bool {
477        let state = self.state(filename, ValueType::Binary);
478        self.scrub_bytes(data, &state, ScrubEncodings::All)
479    }
480
481    /// Scrub a filepath, preserving the basename.
482    pub fn scrub_utf8_filepath(&self, path: &mut str, state: &ProcessingState<'_>) -> bool {
483        if let Some(index) = path.rfind(['/', '\\']) {
484            let data = unsafe { &mut path.as_bytes_mut()[..index] };
485            self.scrub_bytes(data, state, ScrubEncodings::Utf8)
486        } else {
487            false
488        }
489    }
490
491    /// Scrub a filepath, preserving the basename.
492    pub fn scrub_utf16_filepath(
493        &self,
494        path: &mut WStr<LittleEndian>,
495        state: &ProcessingState<'_>,
496    ) -> bool {
497        let index =
498            path.char_indices().rev().find_map(
499                |(i, c)| {
500                    if c == '/' || c == '\\' {
501                        Some(i)
502                    } else {
503                        None
504                    }
505                },
506            );
507
508        if let Some(index) = index {
509            let data = unsafe { &mut path.as_bytes_mut()[..index] };
510            self.scrub_bytes(data, state, ScrubEncodings::Utf16Le)
511        } else {
512            false
513        }
514    }
515
516    /// Applies PII rules to the given JSON.
517    ///
518    /// This function will perform PII scrubbing using `serde_transcode`, which means that it
519    /// does not have to read the entire document in memory but will rather perform in on a
520    /// per-item basis using a streaming approach.
521    ///
522    /// Returns a scrubbed copy of the JSON document.
523    pub fn scrub_json(&self, payload: &[u8]) -> Result<Vec<u8>, JsonScrubError> {
524        let output = Vec::new();
525
526        let visitor = JsonScrubVisitor::new(self.compiled_config);
527
528        let mut deserializer_inner = serde_json::Deserializer::from_slice(payload);
529        let deserializer = transform::Deserializer::new(&mut deserializer_inner, visitor);
530
531        let mut serializer = serde_json::Serializer::new(output);
532        serde_transcode::transcode(deserializer, &mut serializer)
533            .map_err(|_| JsonScrubError::TranscodeFailed)?;
534        Ok(serializer.into_inner())
535    }
536}
537
538#[cfg(test)]
539mod tests {
540    use itertools::Itertools;
541
542    use super::*;
543    use crate::PiiConfig;
544
545    enum AttachmentBytesTestCase<'a> {
546        Builtin {
547            selector: &'a str,
548            rule: &'a str,
549            filename: &'a str,
550            value_type: ValueType,
551            input: &'a [u8],
552            output: &'a [u8],
553            changed: bool,
554        },
555        Regex {
556            selector: &'a str,
557            regex: &'a str,
558            filename: &'a str,
559            value_type: ValueType,
560            input: &'a [u8],
561            output: &'a [u8],
562            changed: bool,
563        },
564    }
565
566    impl AttachmentBytesTestCase<'_> {
567        fn run(self) {
568            let (config, filename, value_type, input, expected, changed) = match self {
569                AttachmentBytesTestCase::Builtin {
570                    selector,
571                    rule,
572                    filename,
573                    value_type,
574                    input,
575                    output,
576                    changed,
577                } => {
578                    let config = serde_json::from_value::<PiiConfig>(serde_json::json!(
579                        {
580                            "applications": {
581                                selector: [rule]
582                            }
583                        }
584                    ))
585                    .unwrap();
586                    (config, filename, value_type, input, output, changed)
587                }
588                AttachmentBytesTestCase::Regex {
589                    selector,
590                    regex,
591                    filename,
592                    value_type,
593                    input,
594                    output,
595                    changed,
596                } => {
597                    let config = serde_json::from_value::<PiiConfig>(serde_json::json!(
598                        {
599                            "rules": {
600                                "custom": {
601                                    "type": "pattern",
602                                    "pattern": regex,
603                                    "redaction": {
604                                      "method": "remove"
605                                    }
606                                }
607                            },
608                            "applications": {
609                                selector: ["custom"]
610                            }
611                        }
612                    ))
613                    .unwrap();
614                    (config, filename, value_type, input, output, changed)
615                }
616            };
617
618            let mut actual = input.to_owned();
619            let processor = PiiAttachmentsProcessor::new(config.compiled());
620            let state = processor.state(filename, value_type);
621            let has_changed = processor.scrub_bytes(&mut actual, &state, ScrubEncodings::All);
622
623            assert!(
624                actual == expected,
625                "`actual == expected` in line {}:\n{}\n{}",
626                line!(),
627                pretty_hex::pretty_hex(&actual),
628                pretty_hex::pretty_hex(&expected),
629            );
630
631            assert_eq!(changed, has_changed);
632        }
633    }
634
635    fn utf16le(s: &str) -> Vec<u8> {
636        s.encode_utf16()
637            .map(|u| u.to_le_bytes())
638            .collect::<Vec<[u8; 2]>>()
639            .iter()
640            .flatten()
641            .copied()
642            .collect()
643    }
644
645    #[test]
646    fn test_ip_replace_padding() {
647        AttachmentBytesTestCase::Builtin {
648            selector: "$binary",
649            rule: "@ip",
650            filename: "foo.txt",
651            value_type: ValueType::Binary,
652            input: b"before 127.0.0.1 after",
653            output: b"before [ip]***** after",
654            changed: true,
655        }
656        .run();
657    }
658
659    #[test]
660    fn test_ip_replace_padding_utf16() {
661        AttachmentBytesTestCase::Builtin {
662            selector: "$binary",
663            rule: "@ip",
664            filename: "foo.txt",
665            value_type: ValueType::Binary,
666            input: utf16le("before 127.0.0.1 after").as_slice(),
667            output: utf16le("before [ip]***** after").as_slice(),
668            changed: true,
669        }
670        .run();
671    }
672
673    #[test]
674    fn test_ip_hash_trunchating() {
675        AttachmentBytesTestCase::Builtin {
676            selector: "$binary",
677            rule: "@ip:hash",
678            filename: "foo.txt",
679            value_type: ValueType::Binary,
680            input: b"before 127.0.0.1 after",
681            output: b"before AE12FE3B5 after",
682            changed: true,
683        }
684        .run();
685    }
686
687    #[test]
688    fn test_ip_hash_trunchating_utf16() {
689        AttachmentBytesTestCase::Builtin {
690            selector: "$binary",
691            rule: "@ip:hash",
692            filename: "foo.txt",
693            value_type: ValueType::Binary,
694            input: utf16le("before 127.0.0.1 after").as_slice(),
695            output: utf16le("before 3FA8F5A46 after").as_slice(),
696            changed: true,
697        }
698        .run();
699    }
700
701    #[test]
702    fn test_ip_masking() {
703        AttachmentBytesTestCase::Builtin {
704            selector: "$binary",
705            rule: "@ip:mask",
706            filename: "foo.txt",
707            value_type: ValueType::Binary,
708            input: b"before 127.0.0.1 after",
709            output: b"before ********* after",
710            changed: true,
711        }
712        .run();
713    }
714
715    #[test]
716    fn test_ip_masking_utf16() {
717        AttachmentBytesTestCase::Builtin {
718            selector: "$binary",
719            rule: "@ip:mask",
720            filename: "foo.txt",
721            value_type: ValueType::Binary,
722            input: utf16le("before 127.0.0.1 after").as_slice(),
723            output: utf16le("before ********* after").as_slice(),
724            changed: true,
725        }
726        .run();
727    }
728
729    #[test]
730    fn test_ip_removing() {
731        AttachmentBytesTestCase::Builtin {
732            selector: "$binary",
733            rule: "@ip:remove",
734            filename: "foo.txt",
735            value_type: ValueType::Binary,
736            input: b"before 127.0.0.1 after",
737            output: b"before ********* after",
738            changed: true,
739        }
740        .run();
741    }
742
743    #[test]
744    fn test_ip_removing_utf16() {
745        AttachmentBytesTestCase::Builtin {
746            selector: "$binary",
747            rule: "@ip:remove",
748            filename: "foo.txt",
749            value_type: ValueType::Binary,
750            input: utf16le("before 127.0.0.1 after").as_slice(),
751            output: utf16le("before ********* after").as_slice(),
752            changed: true,
753        }
754        .run();
755    }
756
757    #[test]
758    fn test_selectors() {
759        for wrong_selector in &[
760            "$string",
761            "$number",
762            "$attachments.* && $string",
763            "$attachments",
764            "** && !$binary",
765        ] {
766            AttachmentBytesTestCase::Builtin {
767                selector: wrong_selector,
768                rule: "@ip:mask",
769                filename: "foo.txt",
770                value_type: ValueType::Binary,
771                input: b"before 127.0.0.1 after",
772                output: b"before 127.0.0.1 after",
773                changed: false,
774            }
775            .run();
776        }
777    }
778
779    #[test]
780    fn test_all_the_bytes() {
781        AttachmentBytesTestCase::Builtin {
782            selector: "$binary",
783            rule: "@anything:remove",
784            filename: "foo.txt",
785            value_type: ValueType::Binary,
786            input: (0..255u8).collect::<Vec<_>>().as_slice(),
787            output: &[b'*'; 255],
788            changed: true,
789        }
790        .run();
791    }
792
793    #[test]
794    fn test_bytes_regexes() {
795        // Test that specifically bytes patterns that are not valid UTF-8 can be matched against.
796        //
797        // From https://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
798        let samples: &[&[u8]] = &[
799            b"\xc3\x28",                 // Invalid 2 Octet Sequence
800            b"\xa0\xa1",                 // Invalid Sequence Identifier
801            b"\xe2\x28\xa1",             // Invalid 3 Octet Sequence (in 2nd Octet)
802            b"\xe2\x82\x28",             // Invalid 3 Octet Sequence (in 3rd Octet)
803            b"\xf0\x28\x8c\xbc",         // Invalid 4 Octet Sequence (in 2nd Octet)
804            b"\xf0\x90\x28\xbc",         // Invalid 4 Octet Sequence (in 3rd Octet)
805            b"\xf0\x28\x8c\x28",         // Invalid 4 Octet Sequence (in 4th Octet)
806            b"\xf8\xa1\xa1\xa1\xa1",     // Valid 5 Octet Sequence (but not Unicode!)
807            b"\xfc\xa1\xa1\xa1\xa1\xa1", // Valid 6 Octet Sequence (but not Unicode!)
808        ];
809
810        for bytes in samples {
811            assert!(String::from_utf8(bytes.to_vec()).is_err());
812
813            AttachmentBytesTestCase::Regex {
814                selector: "$binary",
815                regex: &bytes.iter().map(|x| format!("\\x{x:02x}")).join(""),
816                filename: "foo.txt",
817                value_type: ValueType::Binary,
818                input: bytes,
819                output: &vec![b'*'; bytes.len()],
820                changed: true,
821            }
822            .run()
823        }
824    }
825
826    #[test]
827    fn test_segments_all_data() {
828        let mut data = Vec::from(&b"h\x00e\x00l\x00l\x00o\x00"[..]);
829        let mut iter = WStrSegmentIter::new(&mut data[..]);
830
831        let segment = iter.next().unwrap();
832        assert_eq!(segment.decoded, "hello");
833        assert_eq!(segment.encoded.as_bytes(), b"h\x00e\x00l\x00l\x00o\x00");
834
835        assert!(iter.next().is_none());
836    }
837
838    #[test]
839    fn test_segments_middle_2_byte_aligned() {
840        let mut data = Vec::from(&b"\xd8\xd8\xd8\xd8h\x00e\x00l\x00l\x00o\x00\xd8\xd8"[..]);
841        let mut iter = WStrSegmentIter::new(&mut data[..]);
842
843        let segment = iter.next().unwrap();
844        assert_eq!(segment.decoded, "hello");
845        assert_eq!(segment.encoded.as_bytes(), b"h\x00e\x00l\x00l\x00o\x00");
846
847        assert!(iter.next().is_none());
848    }
849
850    #[test]
851    fn test_segments_middle_2_byte_aligned_mutation() {
852        let mut data = Vec::from(&b"\xd8\xd8\xd8\xd8h\x00e\x00l\x00l\x00o\x00\xd8\xd8"[..]);
853        let mut iter = WStrSegmentIter::new(&mut data[..]);
854
855        let segment = iter.next().unwrap();
856        unsafe {
857            segment
858                .encoded
859                .as_bytes_mut()
860                .copy_from_slice(&b"w\x00o\x00r\x00l\x00d\x00"[..]);
861        }
862
863        assert!(iter.next().is_none());
864
865        assert_eq!(data, b"\xd8\xd8\xd8\xd8w\x00o\x00r\x00l\x00d\x00\xd8\xd8");
866    }
867
868    #[test]
869    fn test_segments_middle_unaligned() {
870        let mut data = Vec::from(&b"\xd8\xd8\xd8h\x00e\x00l\x00l\x00o\x00\xd8\xd8"[..]);
871        let mut iter = WStrSegmentIter::new(&mut data);
872
873        // Off-by-one is devastating, nearly everything is valid unicode.
874        let segment = iter.next().unwrap();
875        assert_eq!(segment.decoded, "棘攀氀氀漀");
876
877        assert!(iter.next().is_none());
878    }
879
880    #[test]
881    fn test_segments_end_aligned() {
882        let mut data = Vec::from(&b"\xd8\xd8h\x00e\x00l\x00l\x00o\x00"[..]);
883        let mut iter = WStrSegmentIter::new(&mut data);
884
885        let segment = iter.next().unwrap();
886        assert_eq!(segment.decoded, "hello");
887
888        assert!(iter.next().is_none());
889    }
890
891    #[test]
892    fn test_segments_garbage() {
893        let mut data = Vec::from(&b"\xd8\xd8"[..]);
894        let mut iter = WStrSegmentIter::new(&mut data);
895
896        assert!(iter.next().is_none());
897    }
898
899    #[test]
900    fn test_segments_too_short() {
901        let mut data = Vec::from(&b"\xd8\xd8y\x00o\x00\xd8\xd8h\x00e\x00l\x00l\x00o\x00"[..]);
902        let mut iter = WStrSegmentIter::new(&mut data);
903
904        let segment = iter.next().unwrap();
905        assert_eq!(segment.decoded, "hello");
906
907        assert!(iter.next().is_none());
908    }
909
910    #[test]
911    fn test_segments_multiple() {
912        let mut data =
913            Vec::from(&b"\xd8\xd8h\x00e\x00l\x00l\x00o\x00\xd8\xd8w\x00o\x00r\x00l\x00d\x00"[..]);
914
915        let mut iter = WStrSegmentIter::new(&mut data);
916
917        let segment = iter.next().unwrap();
918        assert_eq!(segment.decoded, "hello");
919
920        let segment = iter.next().unwrap();
921        assert_eq!(segment.decoded, "world");
922
923        assert!(iter.next().is_none());
924    }
925
926    #[test]
927    fn test_fill_content_wstr() {
928        let mut b = Vec::from(&b"h\x00e\x00l\x00l\x00o\x00"[..]);
929        let s = WStr::from_utf16le_mut(b.as_mut_slice()).unwrap();
930        s.fill_content('x');
931        assert_eq!(b.as_slice(), b"x\x00x\x00x\x00x\x00x\x00");
932    }
933
934    #[test]
935    #[should_panic]
936    fn test_fill_content_wstr_panic() {
937        let mut b = Vec::from(&b"h\x00e\x00y\x00"[..]);
938        let s = WStr::from_utf16le_mut(b.as_mut_slice()).unwrap();
939        s.fill_content('\u{10000}');
940    }
941
942    #[test]
943    fn test_swap_content_wstr() {
944        // Exact same size
945        let mut b = Vec::from(&b"h\x00e\x00l\x00l\x00o\x00"[..]);
946        let s = WStr::from_utf16le_mut(b.as_mut_slice()).unwrap();
947        s.swap_content("world", 'x');
948        assert_eq!(b.as_slice(), b"w\x00o\x00r\x00l\x00d\x00");
949
950        // Shorter, padding fits
951        let mut b = Vec::from(&b"h\x00e\x00l\x00l\x00o\x00"[..]);
952        let s = WStr::from_utf16le_mut(b.as_mut_slice()).unwrap();
953        s.swap_content("hey", 'x');
954        assert_eq!(b.as_slice(), b"h\x00e\x00y\x00x\x00x\x00");
955
956        // Longer, truncated fits
957        let mut b = Vec::from(&b"h\x00e\x00y\x00"[..]);
958        let s = WStr::from_utf16le_mut(b.as_mut_slice()).unwrap();
959        s.swap_content("world", 'x');
960        assert_eq!(b.as_slice(), b"w\x00o\x00r\x00");
961
962        // Longer, truncated + padding
963        let mut b = Vec::from(&b"h\x00e\x00y\x00"[..]);
964        let s = WStr::from_utf16le_mut(b.as_mut_slice()).unwrap();
965        s.swap_content("yo\u{10000}", 'x');
966        assert_eq!(b.as_slice(), b"y\x00o\x00x\x00");
967    }
968
969    #[test]
970    #[should_panic]
971    fn test_swap_content_wstr_panic() {
972        let mut b = Vec::from(&b"h\x00e\x00y\x00"[..]);
973        let s = WStr::from_utf16le_mut(b.as_mut_slice()).unwrap();
974        s.swap_content("yo", '\u{10000}');
975    }
976
977    #[test]
978    #[allow(clippy::trivial_regex)]
979    fn test_get_wstr_match() {
980        let s = "hello there";
981        let mut b = Vec::from(&b"h\x00e\x00l\x00l\x00o\x00 \x00t\x00h\x00e\x00r\x00e\x00"[..]);
982        let w = WStr::from_utf16le_mut(b.as_mut_slice()).unwrap();
983
984        // Partial match
985        let re = Regex::new("hello").unwrap();
986        let re_match = re.find(s).unwrap();
987        let m = get_wstr_match(s, re_match, w);
988        assert_eq!(m.as_bytes(), b"h\x00e\x00l\x00l\x00o\x00");
989
990        // Full match
991        let re = Regex::new(".*").unwrap();
992        let re_match = re.find(s).unwrap();
993        let m = get_wstr_match(s, re_match, w);
994        assert_eq!(
995            m.as_bytes(),
996            b"h\x00e\x00l\x00l\x00o\x00 \x00t\x00h\x00e\x00r\x00e\x00"
997        );
998    }
999}