relay_pii/
attachments.rs

1use regex::bytes::RegexBuilder as BytesRegexBuilder;
2use regex::{Match, Regex};
3use relay_event_schema::processor::{FieldAttrs, Pii, ProcessingState, ValueType};
4use smallvec::SmallVec;
5use std::borrow::Cow;
6use std::iter::FusedIterator;
7use utf16string::{LittleEndian, WStr};
8
9use crate::compiledconfig::RuleRef;
10use crate::regexes::{ReplaceBehavior, get_regex_for_rule_type};
11use crate::{CompiledPiiConfig, JsonScrubError, JsonScrubVisitor, Redaction, transform, utils};
12
/// The minimum length a string needs to be in a binary blob.
///
/// This module extracts encoded strings from within binary blobs.  This constant specifies
/// the minimum length, counted in characters, that such a string must have before it is
/// accepted for matching scrubbing selectors.
const MIN_STRING_LEN: usize = 5;
19
20fn apply_regex_to_utf8_bytes(
21    data: &mut [u8],
22    rule: &RuleRef,
23    regex: &Regex,
24    replace_behavior: &ReplaceBehavior,
25) -> SmallVec<[(usize, usize); 1]> {
26    let mut matches = SmallVec::<[(usize, usize); 1]>::new();
27
28    let regex = match BytesRegexBuilder::new(regex.as_str())
29        // https://github.com/rust-lang/regex/issues/697
30        .unicode(false)
31        .multi_line(false)
32        .dot_matches_new_line(true)
33        .build()
34    {
35        Ok(x) => x,
36        Err(e) => {
37            // XXX: This is not going to fly long-term
38            // Idea: Disable unicode support for regexes entirely, that drastically increases the
39            // likelihood this conversion will never fail.
40
41            // If we see this error in production, it means we need to add more regex validation
42            // to `validate_pii_config` (which is called sentry-side).
43            relay_log::error!(
44                error = &e as &dyn std::error::Error,
45                pattern = regex.as_str(),
46                "Regex failed to compile in non-unicode mode",
47            );
48            return matches;
49        }
50    };
51
52    for captures in regex.captures_iter(data) {
53        for (idx, group) in captures.iter().enumerate() {
54            if let Some(group) = group {
55                if group.start() == group.end() {
56                    continue;
57                }
58
59                match replace_behavior {
60                    ReplaceBehavior::Groups(replace_groups) => {
61                        if replace_groups.contains(&(idx as u8)) {
62                            matches.push((group.start(), group.end()));
63                        }
64                    }
65                    ReplaceBehavior::Value => {
66                        matches.push((0, data.len()));
67                        break;
68                    }
69                }
70            }
71        }
72    }
73
74    for (start, end) in matches.iter() {
75        data[*start..*end].apply_redaction(&rule.redaction);
76    }
77    matches
78}
79
80fn apply_regex_to_utf16le_bytes(
81    data: &mut [u8],
82    rule: &RuleRef,
83    regex: &Regex,
84    replace_behavior: &ReplaceBehavior,
85) -> bool {
86    let mut changed = false;
87    for segment in WStrSegmentIter::new(data) {
88        match replace_behavior {
89            ReplaceBehavior::Value => {
90                for re_match in regex.find_iter(&segment.decoded) {
91                    changed = true;
92                    let match_wstr = get_wstr_match(&segment.decoded, re_match, segment.encoded);
93                    match_wstr.apply_redaction(&rule.redaction);
94                }
95            }
96            ReplaceBehavior::Groups(replace_groups) => {
97                for captures in regex.captures_iter(&segment.decoded) {
98                    for group_idx in replace_groups.iter() {
99                        if let Some(re_match) = captures.get(*group_idx as usize) {
100                            changed = true;
101                            let match_wstr =
102                                get_wstr_match(&segment.decoded, re_match, segment.encoded);
103                            match_wstr.apply_redaction(&rule.redaction);
104                        }
105                    }
106                }
107            }
108        }
109    }
110    changed
111}
112
113/// Extract the matching encoded slice from the encoded string.
114fn get_wstr_match<'a>(
115    all_text: &str,
116    re_match: Match,
117    all_encoded: &'a mut WStr<LittleEndian>,
118) -> &'a mut WStr<LittleEndian> {
119    let mut encoded_start = 0;
120    let mut encoded_end = all_encoded.len();
121
122    let offsets_iter = all_text.char_indices().zip(all_encoded.char_indices());
123    for ((text_offset, _text_char), (encoded_offset, _encoded_char)) in offsets_iter {
124        if text_offset == re_match.start() {
125            encoded_start = encoded_offset;
126        }
127        if text_offset == re_match.end() {
128            encoded_end = encoded_offset;
129            break;
130        }
131    }
132    &mut all_encoded[encoded_start..encoded_end]
133}
134
135/// Traits to modify the strings in ways we need.
136trait StringMods: AsRef<[u8]> {
137    /// Replace this string's contents by repeating the given character into it.
138    ///
139    /// # Panics
140    ///
141    /// The `fill_char` has to encode to the smallest encoding unit, otherwise this will
142    /// panic.  Using an ASCII replacement character is usually safe in most encodings.
143    fn fill_content(&mut self, fill_char: char);
144
145    /// Replace this string's contents with the given replacement string.
146    ///
147    /// If the replacement string encodes to a shorter byte-slice than the current string
148    /// any remaining space will be filled with the padding character.
149    ///
150    /// If the replacement string encodes to a longer byte-slice than the current string the
151    /// replacement string is truncated.  If this does not align with a character boundary
152    /// in the replacement string it is further trucated to the previous character boundary
153    /// and the remainder is filled with the padding char.
154    ///
155    /// # Panics
156    ///
157    /// The `padding` character has to encode to the smallest encoding unit, otherwise this
158    /// will panic.  Using an ASCII padding character is usually safe in most encodings.
159    fn swap_content(&mut self, replacement: &str, padding: char);
160
161    /// Apply a PII scrubbing redaction to this string slice.
162    fn apply_redaction(&mut self, redaction: &Redaction) {
163        const PADDING: char = '*';
164        const MASK: char = '*';
165
166        match redaction {
167            Redaction::Default | Redaction::Remove => {
168                self.fill_content(PADDING);
169            }
170            Redaction::Mask => {
171                self.fill_content(MASK);
172            }
173            Redaction::Hash => {
174                let hashed = utils::hash_value(self.as_ref());
175                self.swap_content(&hashed, PADDING);
176            }
177            Redaction::Replace(replace) => {
178                self.swap_content(replace.text.as_str(), PADDING);
179            }
180            Redaction::Other => relay_log::warn!("Incoming redaction is not supported"),
181        }
182    }
183}
184
185impl StringMods for WStr<LittleEndian> {
186    fn fill_content(&mut self, fill_char: char) {
187        // If fill_char is too wide, fill_char.encode_utf16() will panic, fulfilling the
188        // trait's contract that we must panic if fill_char is too wide.
189        let mut buf = [0u16; 1];
190        let fill_u16 = fill_char.encode_utf16(&mut buf[..]);
191        let fill_buf = fill_u16[0].to_le_bytes();
192
193        unsafe {
194            let chunks = self
195                .as_bytes_mut()
196                .chunks_exact_mut(std::mem::size_of::<u16>());
197            for chunk in chunks {
198                chunk.copy_from_slice(&fill_buf);
199            }
200        }
201    }
202
203    fn swap_content(&mut self, replacement: &str, padding: char) {
204        // If the padding char is too wide, padding.encode_utf16() will panic, fulfilling
205        // the trait's contract that we must panic in this case.
206        let len = self.len();
207
208        let mut buf = [0u16; 1];
209        padding.encode_utf16(&mut buf[..]);
210        let fill_buf = buf[0].to_le_bytes();
211
212        let mut offset = 0;
213        for code in replacement.encode_utf16() {
214            let char_len = if 0xD800 & code == 0xD800 {
215                std::mem::size_of::<u16>() * 2 // leading surrogate
216            } else {
217                std::mem::size_of::<u16>()
218            };
219            if (len - offset) < char_len {
220                break; // Not enough space for this char
221            }
222            unsafe {
223                let target = &mut self.as_bytes_mut()[offset..offset + std::mem::size_of::<u16>()];
224                target.copy_from_slice(&code.to_le_bytes());
225            }
226            offset += std::mem::size_of::<u16>();
227        }
228
229        unsafe {
230            let remainder_bytes = &mut self.as_bytes_mut()[offset..];
231            let chunks = remainder_bytes.chunks_exact_mut(std::mem::size_of::<u16>());
232            for chunk in chunks {
233                chunk.copy_from_slice(&fill_buf);
234            }
235        }
236    }
237}
238
239impl StringMods for [u8] {
240    fn fill_content(&mut self, fill_char: char) {
241        // If fill_char is too wide, fill_char.encode_utf16() will panic, fulfilling the
242        // trait's contract that we must panic if fill_char is too wide.
243        let mut buf = [0u8; 1];
244        fill_char.encode_utf8(&mut buf[..]);
245        for byte in self {
246            *byte = buf[0];
247        }
248    }
249
250    fn swap_content(&mut self, replacement: &str, padding: char) {
251        // If the padding char is too wide, padding.encode_utf16() will panic, fulfilling
252        // the trait's contract that we must panic in this case.
253        let mut buf = [0u8; 1];
254        padding.encode_utf8(&mut buf[..]);
255
256        let cutoff = replacement.len().min(self.len());
257        let (left, right) = self.split_at_mut(cutoff);
258        left.copy_from_slice(&replacement.as_bytes()[..cutoff]);
259
260        for byte in right {
261            *byte = buf[0];
262        }
263    }
264}
265
266/// An iterator over segments of text in binary data.
267///
268/// This iterator will look for blocks of UTF-16 encoded text with little-endian byte order
269/// in a block of binary data and yield those slices as segments with both the decoded and
270/// encoded text.
271struct WStrSegmentIter<'a> {
272    data: &'a mut [u8],
273    offset: usize,
274}
275
276impl<'a> WStrSegmentIter<'a> {
277    fn new(data: &'a mut [u8]) -> Self {
278        Self { data, offset: 0 }
279    }
280}
281
282impl<'a> Iterator for WStrSegmentIter<'a> {
283    type Item = WStrSegment<'a>;
284
285    fn next(&mut self) -> Option<Self::Item> {
286        loop {
287            if self.offset >= self.data.len() {
288                return None;
289            }
290
291            let slice = match WStr::from_utf16le_mut(&mut self.data[self.offset..]) {
292                Ok(wstr) => {
293                    self.offset += wstr.len();
294                    unsafe { wstr.as_bytes_mut() }
295                }
296                Err(err) => {
297                    let start = self.offset;
298                    let end = start + err.valid_up_to();
299                    match err.error_len() {
300                        Some(len) => self.offset += err.valid_up_to() + len,
301                        None => self.offset = self.data.len(),
302                    }
303                    &mut self.data[start..end]
304                }
305            };
306
307            // We are handing out multiple mutable slices from the same mutable slice.  This
308            // is safe because we know they are not overlapping.  However the compiler
309            // doesn't know this so we need to transmute the lifetimes of the slices we
310            // return with std::slice::from_raw_parts_mut().
311            let ptr = slice.as_mut_ptr();
312            let len = slice.len();
313            let encoded = unsafe {
314                WStr::from_utf16le_unchecked_mut(std::slice::from_raw_parts_mut(ptr, len))
315            };
316
317            if encoded.chars().take(MIN_STRING_LEN).count() < MIN_STRING_LEN {
318                continue;
319            }
320            let decoded = encoded.to_utf8();
321            return Some(WStrSegment { encoded, decoded });
322        }
323    }
324}
325
326impl FusedIterator for WStrSegmentIter<'_> {}
327
/// An encoded string segment in a larger data block.
///
/// The slice of data will contain the entire block which will be valid according to the
/// encoding.  This will be a unique sub-slice of the data in [`WStrSegmentIter`] as the
/// iterator will not yield overlapping slices.
///
/// While the `encoded` field is mutable, the string in `decoded` is not updated when it is
/// mutated and will no longer match afterwards.
struct WStrSegment<'a> {
    /// The raw bytes of this segment.
    encoded: &'a mut WStr<LittleEndian>,
    /// The decoded string of this segment.
    decoded: String,
}
342
/// A PII processor for attachment files.
pub struct PiiAttachmentsProcessor<'a> {
    /// The compiled PII config providing the selectors and rules to apply.
    compiled_config: &'a CompiledPiiConfig,
    /// The attachments root state; per-file processing states are entered from it.
    root_state: ProcessingState<'static>,
}
348
/// Which encodings to scrub for `scrub_bytes`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ScrubEncodings {
    /// Scrub UTF-8.
    Utf8,

    /// Scrub UTF-16LE (little endian).
    Utf16Le,

    /// Attempt to scrub in all available encodings.
    All,
}
360
361impl<'a> PiiAttachmentsProcessor<'a> {
362    /// Creates a new `PiiAttachmentsProcessor` from the given PII config.
363    pub fn new(compiled_config: &'a CompiledPiiConfig) -> Self {
364        // this constructor needs to be cheap... a new PiiProcessor is created for each event. Move
365        // any init logic into CompiledPiiConfig::new.
366
367        let root_state =
368            ProcessingState::root().enter_static("", None, Some(ValueType::Attachments));
369
370        PiiAttachmentsProcessor {
371            compiled_config,
372            root_state,
373        }
374    }
375
376    /// Returns the processing state for the file with the given name.
377    pub(crate) fn state<'s>(
378        &'s self,
379        filename: &'s str,
380        value_type: ValueType,
381    ) -> ProcessingState<'s> {
382        self.root_state.enter_borrowed(
383            filename,
384            Some(Cow::Owned(FieldAttrs::new().pii(Pii::True))),
385            Some(value_type),
386        )
387    }
388
389    /// Applies PII rules to a plain buffer.
390    ///
391    /// Returns `true`, if the buffer was modified.
392    pub(crate) fn scrub_bytes(
393        &self,
394        data: &mut [u8],
395        state: &ProcessingState<'_>,
396        encodings: ScrubEncodings,
397    ) -> bool {
398        let pii = state.attrs().pii;
399        if pii == Pii::False {
400            return false;
401        }
402
403        let mut changed = false;
404
405        for (selector, rules) in &self.compiled_config.applications {
406            if selector.matches_path(&state.path()) {
407                for rule in rules {
408                    // Note:
409                    //
410                    // - We ignore pattern_type and just treat every regex like a value regex (i.e.
411                    //   redactPair becomes pattern rule). Very unlikely anybody would want that
412                    //   behavior (e.g.  "Remove passwords on **" would remove a file called
413                    //   "passwords.txt", but also "author.txt").  Just use selectors!
414                    //
415                    // - We impose severe restrictions on how redaction methods work, as we must
416                    //   not change the lengths of attachments.
417                    for (_pattern_type, regex, replace_behavior) in
418                        get_regex_for_rule_type(&rule.ty)
419                    {
420                        match encodings {
421                            ScrubEncodings::Utf8 => {
422                                let matches =
423                                    apply_regex_to_utf8_bytes(data, rule, regex, &replace_behavior);
424                                changed |= !(matches.is_empty());
425                            }
426                            ScrubEncodings::Utf16Le => {
427                                changed |= apply_regex_to_utf16le_bytes(
428                                    data,
429                                    rule,
430                                    regex,
431                                    &replace_behavior,
432                                );
433                            }
434                            ScrubEncodings::All => {
435                                let matches =
436                                    apply_regex_to_utf8_bytes(data, rule, regex, &replace_behavior);
437                                changed |= !(matches.is_empty());
438
439                                // Only scrub regions with the UTF-16 scrubber if they haven't been
440                                // scrubbed yet.
441                                let unscrubbed_ranges = matches
442                                    .into_iter()
443                                    .chain(std::iter::once((data.len(), 0)))
444                                    .scan((0usize, 0usize), |previous, current| {
445                                        let start = if previous.1 % 2 == 0 {
446                                            previous.1
447                                        } else {
448                                            previous.1 + 1
449                                        };
450                                        let item = (start, current.0);
451                                        *previous = current;
452                                        Some(item)
453                                    })
454                                    .filter(|(start, end)| end > start);
455                                for (start, end) in unscrubbed_ranges {
456                                    changed |= apply_regex_to_utf16le_bytes(
457                                        &mut data[start..end],
458                                        rule,
459                                        regex,
460                                        &replace_behavior,
461                                    );
462                                }
463                            }
464                        }
465                    }
466                }
467            }
468        }
469
470        changed
471    }
472
473    /// Applies PII scrubbing rules to a plain attachment.
474    ///
475    /// Returns `true`, if the attachment was modified.
476    pub fn scrub_attachment(&self, filename: &str, data: &mut [u8]) -> bool {
477        let state = self.state(filename, ValueType::Binary);
478        self.scrub_bytes(data, &state, ScrubEncodings::All)
479    }
480
481    /// Scrub a filepath, preserving the basename.
482    pub fn scrub_utf8_filepath(&self, path: &mut str, state: &ProcessingState<'_>) -> bool {
483        if let Some(index) = path.rfind(['/', '\\']) {
484            let data = unsafe { &mut path.as_bytes_mut()[..index] };
485            self.scrub_bytes(data, state, ScrubEncodings::Utf8)
486        } else {
487            false
488        }
489    }
490
491    /// Scrub a filepath, preserving the basename.
492    pub fn scrub_utf16_filepath(
493        &self,
494        path: &mut WStr<LittleEndian>,
495        state: &ProcessingState<'_>,
496    ) -> bool {
497        let index = path
498            .char_indices()
499            .rev()
500            .find_map(|(i, c)| if c == '/' || c == '\\' { Some(i) } else { None });
501
502        if let Some(index) = index {
503            let data = unsafe { &mut path.as_bytes_mut()[..index] };
504            self.scrub_bytes(data, state, ScrubEncodings::Utf16Le)
505        } else {
506            false
507        }
508    }
509
510    /// Applies PII rules to the given JSON.
511    ///
512    /// This function will perform PII scrubbing using `serde_transcode`, which means that it
513    /// does not have to read the entire document in memory but will rather perform in on a
514    /// per-item basis using a streaming approach.
515    ///
516    /// Returns a scrubbed copy of the JSON document.
517    pub fn scrub_json(&self, payload: &[u8]) -> Result<Vec<u8>, JsonScrubError> {
518        let output = Vec::new();
519
520        let visitor = JsonScrubVisitor::new(self.compiled_config);
521
522        let mut deserializer_inner = serde_json::Deserializer::from_slice(payload);
523        let deserializer = transform::Deserializer::new(&mut deserializer_inner, visitor);
524
525        let mut serializer = serde_json::Serializer::new(output);
526        serde_transcode::transcode(deserializer, &mut serializer)
527            .map_err(|_| JsonScrubError::TranscodeFailed)?;
528        Ok(serializer.into_inner())
529    }
530}
531
532#[cfg(test)]
533mod tests {
534    use itertools::Itertools;
535
536    use super::*;
537    use crate::PiiConfig;
538
539    enum AttachmentBytesTestCase<'a> {
540        Builtin {
541            selector: &'a str,
542            rule: &'a str,
543            filename: &'a str,
544            value_type: ValueType,
545            input: &'a [u8],
546            output: &'a [u8],
547            changed: bool,
548        },
549        Regex {
550            selector: &'a str,
551            regex: &'a str,
552            filename: &'a str,
553            value_type: ValueType,
554            input: &'a [u8],
555            output: &'a [u8],
556            changed: bool,
557        },
558    }
559
560    impl AttachmentBytesTestCase<'_> {
561        fn run(self) {
562            let (config, filename, value_type, input, expected, changed) = match self {
563                AttachmentBytesTestCase::Builtin {
564                    selector,
565                    rule,
566                    filename,
567                    value_type,
568                    input,
569                    output,
570                    changed,
571                } => {
572                    let config = serde_json::from_value::<PiiConfig>(serde_json::json!(
573                        {
574                            "applications": {
575                                selector: [rule]
576                            }
577                        }
578                    ))
579                    .unwrap();
580                    (config, filename, value_type, input, output, changed)
581                }
582                AttachmentBytesTestCase::Regex {
583                    selector,
584                    regex,
585                    filename,
586                    value_type,
587                    input,
588                    output,
589                    changed,
590                } => {
591                    let config = serde_json::from_value::<PiiConfig>(serde_json::json!(
592                        {
593                            "rules": {
594                                "custom": {
595                                    "type": "pattern",
596                                    "pattern": regex,
597                                    "redaction": {
598                                      "method": "remove"
599                                    }
600                                }
601                            },
602                            "applications": {
603                                selector: ["custom"]
604                            }
605                        }
606                    ))
607                    .unwrap();
608                    (config, filename, value_type, input, output, changed)
609                }
610            };
611
612            let mut actual = input.to_owned();
613            let processor = PiiAttachmentsProcessor::new(config.compiled());
614            let state = processor.state(filename, value_type);
615            let has_changed = processor.scrub_bytes(&mut actual, &state, ScrubEncodings::All);
616
617            assert!(
618                actual == expected,
619                "`actual == expected` in line {}:\n{}\n{}",
620                line!(),
621                pretty_hex::pretty_hex(&actual),
622                pretty_hex::pretty_hex(&expected),
623            );
624
625            assert_eq!(changed, has_changed);
626        }
627    }
628
629    fn utf16le(s: &str) -> Vec<u8> {
630        s.encode_utf16()
631            .map(|u| u.to_le_bytes())
632            .collect::<Vec<[u8; 2]>>()
633            .iter()
634            .flatten()
635            .copied()
636            .collect()
637    }
638
639    #[test]
640    fn test_ip_replace_padding() {
641        AttachmentBytesTestCase::Builtin {
642            selector: "$binary",
643            rule: "@ip",
644            filename: "foo.txt",
645            value_type: ValueType::Binary,
646            input: b"before 127.0.0.1 after",
647            output: b"before [ip]***** after",
648            changed: true,
649        }
650        .run();
651    }
652
653    #[test]
654    fn test_ip_replace_padding_utf16() {
655        AttachmentBytesTestCase::Builtin {
656            selector: "$binary",
657            rule: "@ip",
658            filename: "foo.txt",
659            value_type: ValueType::Binary,
660            input: utf16le("before 127.0.0.1 after").as_slice(),
661            output: utf16le("before [ip]***** after").as_slice(),
662            changed: true,
663        }
664        .run();
665    }
666
667    #[test]
668    fn test_ip_hash_trunchating() {
669        AttachmentBytesTestCase::Builtin {
670            selector: "$binary",
671            rule: "@ip:hash",
672            filename: "foo.txt",
673            value_type: ValueType::Binary,
674            input: b"before 127.0.0.1 after",
675            output: b"before AE12FE3B5 after",
676            changed: true,
677        }
678        .run();
679    }
680
681    #[test]
682    fn test_ip_hash_trunchating_utf16() {
683        AttachmentBytesTestCase::Builtin {
684            selector: "$binary",
685            rule: "@ip:hash",
686            filename: "foo.txt",
687            value_type: ValueType::Binary,
688            input: utf16le("before 127.0.0.1 after").as_slice(),
689            output: utf16le("before 3FA8F5A46 after").as_slice(),
690            changed: true,
691        }
692        .run();
693    }
694
695    #[test]
696    fn test_ip_masking() {
697        AttachmentBytesTestCase::Builtin {
698            selector: "$binary",
699            rule: "@ip:mask",
700            filename: "foo.txt",
701            value_type: ValueType::Binary,
702            input: b"before 127.0.0.1 after",
703            output: b"before ********* after",
704            changed: true,
705        }
706        .run();
707    }
708
709    #[test]
710    fn test_ip_masking_utf16() {
711        AttachmentBytesTestCase::Builtin {
712            selector: "$binary",
713            rule: "@ip:mask",
714            filename: "foo.txt",
715            value_type: ValueType::Binary,
716            input: utf16le("before 127.0.0.1 after").as_slice(),
717            output: utf16le("before ********* after").as_slice(),
718            changed: true,
719        }
720        .run();
721    }
722
723    #[test]
724    fn test_ip_removing() {
725        AttachmentBytesTestCase::Builtin {
726            selector: "$binary",
727            rule: "@ip:remove",
728            filename: "foo.txt",
729            value_type: ValueType::Binary,
730            input: b"before 127.0.0.1 after",
731            output: b"before ********* after",
732            changed: true,
733        }
734        .run();
735    }
736
737    #[test]
738    fn test_ip_removing_utf16() {
739        AttachmentBytesTestCase::Builtin {
740            selector: "$binary",
741            rule: "@ip:remove",
742            filename: "foo.txt",
743            value_type: ValueType::Binary,
744            input: utf16le("before 127.0.0.1 after").as_slice(),
745            output: utf16le("before ********* after").as_slice(),
746            changed: true,
747        }
748        .run();
749    }
750
751    #[test]
752    fn test_selectors() {
753        for wrong_selector in &[
754            "$string",
755            "$number",
756            "$attachments.* && $string",
757            "$attachments",
758            "** && !$binary",
759        ] {
760            AttachmentBytesTestCase::Builtin {
761                selector: wrong_selector,
762                rule: "@ip:mask",
763                filename: "foo.txt",
764                value_type: ValueType::Binary,
765                input: b"before 127.0.0.1 after",
766                output: b"before 127.0.0.1 after",
767                changed: false,
768            }
769            .run();
770        }
771    }
772
773    #[test]
774    fn test_all_the_bytes() {
775        AttachmentBytesTestCase::Builtin {
776            selector: "$binary",
777            rule: "@anything:remove",
778            filename: "foo.txt",
779            value_type: ValueType::Binary,
780            input: (0..255u8).collect::<Vec<_>>().as_slice(),
781            output: &[b'*'; 255],
782            changed: true,
783        }
784        .run();
785    }
786
787    #[test]
788    fn test_bytes_regexes() {
789        // Test that specifically bytes patterns that are not valid UTF-8 can be matched against.
790        //
791        // From https://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
792        let samples: &[&[u8]] = &[
793            b"\xc3\x28",                 // Invalid 2 Octet Sequence
794            b"\xa0\xa1",                 // Invalid Sequence Identifier
795            b"\xe2\x28\xa1",             // Invalid 3 Octet Sequence (in 2nd Octet)
796            b"\xe2\x82\x28",             // Invalid 3 Octet Sequence (in 3rd Octet)
797            b"\xf0\x28\x8c\xbc",         // Invalid 4 Octet Sequence (in 2nd Octet)
798            b"\xf0\x90\x28\xbc",         // Invalid 4 Octet Sequence (in 3rd Octet)
799            b"\xf0\x28\x8c\x28",         // Invalid 4 Octet Sequence (in 4th Octet)
800            b"\xf8\xa1\xa1\xa1\xa1",     // Valid 5 Octet Sequence (but not Unicode!)
801            b"\xfc\xa1\xa1\xa1\xa1\xa1", // Valid 6 Octet Sequence (but not Unicode!)
802        ];
803
804        for bytes in samples {
805            assert!(String::from_utf8(bytes.to_vec()).is_err());
806
807            AttachmentBytesTestCase::Regex {
808                selector: "$binary",
809                regex: &bytes.iter().map(|x| format!("\\x{x:02x}")).join(""),
810                filename: "foo.txt",
811                value_type: ValueType::Binary,
812                input: bytes,
813                output: &vec![b'*'; bytes.len()],
814                changed: true,
815            }
816            .run()
817        }
818    }
819
820    #[test]
821    fn test_segments_all_data() {
822        let mut data = Vec::from(&b"h\x00e\x00l\x00l\x00o\x00"[..]);
823        let mut iter = WStrSegmentIter::new(&mut data[..]);
824
825        let segment = iter.next().unwrap();
826        assert_eq!(segment.decoded, "hello");
827        assert_eq!(segment.encoded.as_bytes(), b"h\x00e\x00l\x00l\x00o\x00");
828
829        assert!(iter.next().is_none());
830    }
831
/// Text that sits on an even offset between undecodable bytes is extracted on its own.
#[test]
fn test_segments_middle_2_byte_aligned() {
    // `\xd8\xd8` reads as the code unit 0xD8D8, a high surrogate that is not valid UTF-16
    // unless a low surrogate follows it.
    let mut buf = b"\xd8\xd8\xd8\xd8h\x00e\x00l\x00l\x00o\x00\xd8\xd8".to_vec();
    let mut segments = WStrSegmentIter::new(buf.as_mut_slice());

    // Only the text is reported; the surrounding bytes are not part of the segment.
    let found = segments.next().unwrap();
    assert_eq!(found.decoded, "hello");
    assert_eq!(found.encoded.as_bytes(), b"h\x00e\x00l\x00l\x00o\x00");

    assert!(segments.next().is_none());
}
843
/// Bytes written through a segment's `encoded` view end up in the original buffer.
#[test]
fn test_segments_middle_2_byte_aligned_mutation() {
    let mut buf = b"\xd8\xd8\xd8\xd8h\x00e\x00l\x00l\x00o\x00\xd8\xd8".to_vec();
    let mut segments = WStrSegmentIter::new(buf.as_mut_slice());

    let found = segments.next().unwrap();
    let world: &[u8] = b"w\x00o\x00r\x00l\x00d\x00";
    // SAFETY: the bytes written below are well-formed UTF-16LE ("world") and exactly as
    // long as the segment, so the view still holds valid UTF-16 once the borrow ends.
    let raw = unsafe { found.encoded.as_bytes_mut() };
    raw.copy_from_slice(world);

    assert!(segments.next().is_none());

    // Only the ten text bytes changed; the surrounding bytes are untouched.
    let expected: &[u8] = b"\xd8\xd8\xd8\xd8w\x00o\x00r\x00l\x00d\x00\xd8\xd8";
    assert_eq!(buf, expected);
}
861
/// Text that starts on an odd offset is decoded with its bytes paired up the wrong way.
#[test]
fn test_segments_middle_unaligned() {
    // Three leading bytes (instead of an even count) push "hello" onto an odd offset.
    let mut buf = b"\xd8\xd8\xd8h\x00e\x00l\x00l\x00o\x00\xd8\xd8".to_vec();
    let mut segments = WStrSegmentIter::new(&mut buf[..]);

    // Shifted by one byte, almost any pair of bytes still forms a valid code unit, so the
    // iterator reports a string — just not the one that was written.
    let shifted = segments.next().unwrap();
    assert_eq!(shifted.decoded, "棘攀氀氀漀");

    assert!(segments.next().is_none());
}
873
/// Text that runs up to the very end of the buffer is still reported.
#[test]
fn test_segments_end_aligned() {
    // One undecodable code unit up front, then text with nothing after it.
    let mut buf = b"\xd8\xd8h\x00e\x00l\x00l\x00o\x00".to_vec();
    let mut segments = WStrSegmentIter::new(&mut buf[..]);

    let tail = segments.next().unwrap();
    assert_eq!(tail.decoded, "hello");

    assert!(segments.next().is_none());
}
884
/// A buffer holding nothing but an unpaired surrogate yields no segments at all.
#[test]
fn test_segments_garbage() {
    let mut buf = b"\xd8\xd8".to_vec();
    let mut segments = WStrSegmentIter::new(&mut buf[..]);

    assert!(segments.next().is_none());
}
892
/// A two-character run ("yo") is not reported; only the five-character run after it is.
#[test]
fn test_segments_too_short() {
    // Layout: invalid unit, "yo", invalid unit, "hello".
    // NOTE(review): "yo" is presumably dropped for being shorter than `MIN_STRING_LEN`.
    let mut buf = b"\xd8\xd8y\x00o\x00\xd8\xd8h\x00e\x00l\x00l\x00o\x00".to_vec();
    let mut segments = WStrSegmentIter::new(&mut buf[..]);

    let first = segments.next().unwrap();
    assert_eq!(first.decoded, "hello");

    assert!(segments.next().is_none());
}
903
/// Two runs of text separated by an undecodable code unit come out as two segments, in order.
#[test]
fn test_segments_multiple() {
    let mut buf =
        b"\xd8\xd8h\x00e\x00l\x00l\x00o\x00\xd8\xd8w\x00o\x00r\x00l\x00d\x00".to_vec();
    let mut segments = WStrSegmentIter::new(&mut buf[..]);

    // Segments must appear in the same order as they do in the buffer.
    for expected in ["hello", "world"] {
        let segment = segments.next().unwrap();
        assert_eq!(segment.decoded, expected);
    }

    assert!(segments.next().is_none());
}
919
/// `fill_content` overwrites every character of the string with the given fill character.
#[test]
fn test_fill_content_wstr() {
    let mut buf = b"h\x00e\x00l\x00l\x00o\x00".to_vec();
    let text = WStr::from_utf16le_mut(&mut buf[..]).unwrap();
    text.fill_content('x');

    // Same byte length as before, now five times 'x' in UTF-16LE.
    let expected: &[u8] = b"x\x00x\x00x\x00x\x00x\x00";
    assert_eq!(buf.as_slice(), expected);
}
927
/// `fill_content` is expected to panic for a fill character outside the BMP.
#[test]
#[should_panic]
fn test_fill_content_wstr_panic() {
    let mut buf = b"h\x00e\x00y\x00".to_vec();
    let text = WStr::from_utf16le_mut(&mut buf[..]).unwrap();

    // U+10000 needs a surrogate pair (two code units) in UTF-16.
    text.fill_content('\u{10000}');
}
935
/// `swap_content` writes the replacement into the existing bytes without changing their
/// length, padding with `'x'` or truncating as needed.
#[test]
fn test_swap_content_wstr() {
    // Each case: (initial UTF-16LE bytes, replacement text, expected bytes after the swap).
    let cases: [(&[u8], &str, &[u8]); 4] = [
        // Exact same size
        (b"h\x00e\x00l\x00l\x00o\x00", "world", b"w\x00o\x00r\x00l\x00d\x00"),
        // Shorter, padding fits
        (b"h\x00e\x00l\x00l\x00o\x00", "hey", b"h\x00e\x00y\x00x\x00x\x00"),
        // Longer, truncated fits
        (b"h\x00e\x00y\x00", "world", b"w\x00o\x00r\x00"),
        // Longer, truncated + padding: the one remaining code unit cannot hold U+10000,
        // which needs a surrogate pair, so it is padded instead.
        (b"h\x00e\x00y\x00", "yo\u{10000}", b"y\x00o\x00x\x00"),
    ];

    for (initial, replacement, expected) in cases {
        let mut buf = initial.to_vec();
        let text = WStr::from_utf16le_mut(buf.as_mut_slice()).unwrap();
        text.swap_content(replacement, 'x');
        assert_eq!(buf.as_slice(), expected);
    }
}
962
/// `swap_content` is expected to panic when the padding character lies outside the BMP.
#[test]
#[should_panic]
fn test_swap_content_wstr_panic() {
    let mut buf = b"h\x00e\x00y\x00".to_vec();
    let text = WStr::from_utf16le_mut(&mut buf[..]).unwrap();

    // "yo" is shorter than "hey", so one slot has to be padded — with a character that
    // needs a surrogate pair in UTF-16.
    text.swap_content("yo", '\u{10000}');
}
970
/// `get_wstr_match` maps a regex match on the decoded text back to the corresponding
/// slice of the UTF-16LE encoded string.
#[test]
// The patterns are deliberately plain literals; the regexes themselves are not under test.
#[allow(clippy::trivial_regex)]
fn test_get_wstr_match() {
    let text = "hello there";
    let utf16: &[u8] = b"h\x00e\x00l\x00l\x00o\x00 \x00t\x00h\x00e\x00r\x00e\x00";
    let mut buf = utf16.to_vec();
    let encoded = WStr::from_utf16le_mut(&mut buf[..]).unwrap();

    // Partial match: only the first word is selected.
    let partial = Regex::new("hello").unwrap().find(text).unwrap();
    let slice = get_wstr_match(text, partial, encoded);
    assert_eq!(slice.as_bytes(), b"h\x00e\x00l\x00l\x00o\x00");

    // Full match: the slice spans the whole encoded string.
    let full = Regex::new(".*").unwrap().find(text).unwrap();
    let slice = get_wstr_match(text, full, encoded);
    assert_eq!(slice.as_bytes(), utf16);
}
993}