Skip to main content

relay_pii/
regexes.rs

1use std::sync::LazyLock;
2
3use regex::Regex;
4use smallvec::{SmallVec, smallvec};
5
6use crate::config::RuleType;
7
/// Selects which part of a key/value pair a regex is matched against.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum PatternType {
    /// Match the pattern against the key and the value.
    KeyValue,
    /// Match the pattern against the key only; the replacement is applied to the value.
    Key,
    /// Match the pattern against the value only.
    Value,
}
17
18/// What to do with regex matches once found.
19#[derive(Clone, Debug, Eq, PartialEq)]
20pub enum ReplaceBehavior {
21    /// Replace the entire string value (or just more than the match, depending on context).
22    Value,
23
24    /// Replace the following specific regex groups.
25    Groups(SmallVec<[u8; 1]>),
26}
27
28impl ReplaceBehavior {
29    /// Replace the entire string value (or just more than the match, depending on context).
30    pub fn replace_value() -> Self {
31        ReplaceBehavior::Value
32    }
33
34    /// Replace the entire match, equivalent to `ReplaceBehavior::Groups([0])`.
35    pub fn replace_match() -> Self {
36        ReplaceBehavior::replace_group(0)
37    }
38
39    /// Replace the following singular regex group.
40    pub fn replace_group(g: u8) -> Self {
41        ReplaceBehavior::Groups(smallvec![g])
42    }
43
44    /// Replace the following specific regex groups.
45    pub fn replace_groups(gs: SmallVec<[u8; 1]>) -> Self {
46        ReplaceBehavior::Groups(gs)
47    }
48}
49
50/// Return a list of regexes to apply for the given rule type.
51pub fn get_regex_for_rule_type(
52    ty: &RuleType,
53) -> SmallVec<[(PatternType, &Regex, ReplaceBehavior); 2]> {
54    let v = PatternType::Value;
55    let k = PatternType::Key;
56    let kv = PatternType::KeyValue;
57
58    match ty {
59        RuleType::RedactPair(redact_pair) => {
60            if let Ok(pattern) = redact_pair.key_pattern.compiled() {
61                smallvec![(kv, pattern, ReplaceBehavior::replace_value())]
62            } else {
63                smallvec![]
64            }
65        }
66        RuleType::Bearer => {
67            smallvec![(v, &*BEARER_TOKEN_REGEX, ReplaceBehavior::replace_match())]
68        }
69        RuleType::Password => {
70            smallvec![
71                // Bearer token was moved to its own regest and type out of the passwords, but we
72                // still keep it here for backwards compatibility.
73                (v, &*BEARER_TOKEN_REGEX, ReplaceBehavior::replace_match()),
74                (k, &*TOKEN_KEY_REGEX, ReplaceBehavior::replace_value()),
75                (kv, &*PASSWORD_KEY_REGEX, ReplaceBehavior::replace_value()),
76            ]
77        }
78        RuleType::Anything => smallvec![(v, &*ANYTHING_REGEX, ReplaceBehavior::replace_match())],
79        RuleType::Pattern(r) => {
80            let replace_behavior = match r.replace_groups {
81                Some(ref groups) => {
82                    ReplaceBehavior::replace_groups(groups.iter().copied().collect())
83                }
84                None => ReplaceBehavior::replace_match(),
85            };
86            if let Ok(pattern) = r.pattern.compiled() {
87                smallvec![(v, pattern, replace_behavior)]
88            } else {
89                smallvec![]
90            }
91        }
92
93        RuleType::Imei => smallvec![(v, &*IMEI_REGEX, ReplaceBehavior::replace_match())],
94        RuleType::Mac => smallvec![(v, &*MAC_REGEX, ReplaceBehavior::replace_match())],
95        RuleType::Uuid => smallvec![(v, &*UUID_REGEX, ReplaceBehavior::replace_match())],
96        RuleType::Email => smallvec![(v, &*EMAIL_REGEX, ReplaceBehavior::replace_match())],
97        RuleType::Iban => smallvec![(v, &*IBAN_REGEX, ReplaceBehavior::replace_match())],
98        RuleType::Ip => smallvec![
99            (v, &*IPV4_REGEX, ReplaceBehavior::replace_match()),
100            (v, &*IPV6_REGEX, ReplaceBehavior::replace_group(1)),
101        ],
102        RuleType::Creditcard => {
103            smallvec![(v, &*CREDITCARD_REGEX, ReplaceBehavior::replace_match())]
104        }
105        RuleType::Pemkey => smallvec![(v, &*PEM_KEY_REGEX, ReplaceBehavior::replace_group(1))],
106        RuleType::UrlAuth => smallvec![(v, &*URL_AUTH_REGEX, ReplaceBehavior::replace_group(1))],
107        RuleType::UsSsn => smallvec![(v, &*US_SSN_REGEX, ReplaceBehavior::replace_match())],
108        RuleType::Userpath => smallvec![(v, &*PATH_REGEX, ReplaceBehavior::replace_group(1))],
109
110        // These ought to have been resolved in CompiledConfig
111        RuleType::Alias(_) | RuleType::Multiple(_) | RuleType::Unknown(_) => smallvec![],
112    }
113}
114
// Pattern fragments shared by the IPv4 and IPv6 regexes. Every arm expands to a string literal
// (or a `concat!` of literals) so the fragments can be combined at compile time.
#[rustfmt::skip]
macro_rules! ip {
    // One IPv6 segment: one to four hex digits.
    (v6s) => { "[0-9a-fA-F]{1,4}" };
    // One IPv4 segment: a decimal number from 0 to 255, wrapped in a capture group.
    (v4s) => { "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)" };
    // A complete IPv4 address: four segments separated by literal dots.
    (v4a) => {
        concat!(ip!(v4s), "\\.", ip!(v4s), "\\.", ip!(v4s), "\\.", ip!(v4s))
    };
}
121
/// Declares a lazily compiled regex static called `$name` from the pattern `$rule`.
///
/// The static lives in a module of the same name, which also holds a test checking that the
/// pattern can be built as a `regex::bytes` regex with Unicode support turned off. The static
/// is then imported into the invoking scope. Compilation uses `unwrap`, so an invalid pattern
/// panics on first access.
macro_rules! regex {
    ($name:ident, $rule:expr) => {
        // The module reuses the static's upper-case name, hence the lint override.
        #[allow(non_snake_case)]
        mod $name {
            use super::*;

            pub static $name: LazyLock<Regex> = LazyLock::new(|| Regex::new($rule).unwrap());

            #[test]
            fn supports_byte_mode() {
                let byte_regex = regex::bytes::RegexBuilder::new($name.as_str())
                    .unicode(false)
                    .multi_line(false)
                    .dot_matches_new_line(true)
                    .build();
                assert!(byte_regex.is_ok());
            }
        }
        use $name::$name;
    };
}
144
/// Regex with the pattern `.*`, returned for [`RuleType::Anything`].
///
/// Unlike the other regexes in this file it is declared directly as a `pub static` rather than
/// through the `regex!` macro.
pub static ANYTHING_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(".*").unwrap());
146
// Matches a word-bounded digit sequence shaped like an IMEI: runs of 2, 6, 6 and 1-2 digits,
// each of the first three optionally followed by `-`. The whole number is capture group 1.
// Verbose mode (`x`) makes the whitespace inside the pattern insignificant.
regex!(
    IMEI_REGEX,
    r"(?x)
        \b
            (\d{2}-?
                \d{6}-?
                \d{6}-?
                \d{1,2})
        \b
    "
);
158
// Matches a word-bounded MAC address: six pairs of hex digits separated by `:` or `-`.
regex!(
    MAC_REGEX,
    r"(?x)
        \b([[:xdigit:]]{2}[:-]){5}[[:xdigit:]]{2}\b
    "
);
165
// Matches a word-bounded, case-insensitive sequence of 8, 4, 4, 4 and 12 characters from
// `[a-z0-9]`, with an optional `-` between the runs.
//
// NOTE(review): the character class accepts every letter `a`-`z`, not only the hex digits
// `a`-`f` — confirm that this is intended.
regex!(
    UUID_REGEX,
    r"(?ix)
        \b
        [a-z0-9]{8}-?
        [a-z0-9]{4}-?
        [a-z0-9]{4}-?
        [a-z0-9]{4}-?
        [a-z0-9]{12}
        \b
    "
);
178
// Matches an email address: a local part made of letters, digits and the listed punctuation,
// an `@`, one or more dot-separated domain labels, and a final label of at least two letters.
// `#` is escaped in the local-part class because it would otherwise start a comment in verbose
// mode (`x`).
regex!(
    EMAIL_REGEX,
    r"(?x)
        \b
            [a-zA-Z0-9.!\#$%&'*+/=?^_`{|}~-]+
            @
            [a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,}
        \b
    "
);
189
// Matches a word-bounded IBAN: one of the listed two-letter country codes (upper case only),
// two digits, and then 11 to 29 letters or digits. The country code is capture group 1.
regex!(
    IBAN_REGEX,
    r"(?x)
        \b
        (AT|AD|AE|AL|AZ|BA|BE|BG|BH|BR|BY|CH|CR|CY|CZ|DE|DK|DO|EE|EG|ES|FI|FO|FR|GB|GE|GI|GL|GR|GT|HR|HU|IE|IL|IQ|IS|IT|JO|KW|KZ|LB|LC|LI|LT|LU|LV|LY|MC|MD|ME|MK|MR|MT|MU|NL|NO|PK|PL|PS|PT|QA|RO|RU|RS|SA|SC|SE|SI|SK|SM|ST|SV|TL|TN|TR|UA|VA|VG|XK|DZ|AO|BJ|BF|BI|CV|CM|CF|TD|KM|CG|CI|DJ|GQ|GA|GW|HN|IR|MG|ML|MA|MZ|NI|NE|SN|TG)\d{2}[a-zA-Z0-9]{11,29}
        \b
    "
);
198
// Matches a word-bounded IPv4 address, built from the `ip!(v4a)` fragment (four dot-separated
// segments in the range 0-255).
regex!(IPV4_REGEX, concat!("\\b", ip!(v4a), "\\b"));
200
201regex!(
202    IPV6_REGEX,
203    concat!(
204        "(?i)(?:[\\s]|[[:punct:]]|^)(",
205        "(",
206        ip!(v6s),
207        ":){7}",
208        ip!(v6s),
209        "|",
210        "(",
211        ip!(v6s),
212        ":){1,7}:|",
213        "(",
214        ip!(v6s),
215        ":){1,6}::",
216        ip!(v6s),
217        "|",
218        "(",
219        ip!(v6s),
220        ":){1,5}:(:",
221        ip!(v6s),
222        "){1,2}|",
223        "(",
224        ip!(v6s),
225        ":){1,4}:(:",
226        ip!(v6s),
227        "){1,3}|",
228        "(",
229        ip!(v6s),
230        ":){1,3}:(:",
231        ip!(v6s),
232        "){1,4}|",
233        "(",
234        ip!(v6s),
235        ":){1,2}:(:",
236        ip!(v6s),
237        "){1,5}|",
238        ip!(v6s),
239        ":((:",
240        ip!(v6s),
241        "){1,6})|",
242        ":((:",
243        ip!(v6s),
244        "){1,7}|:)|",
245        "fe80:(:",
246        ip!(v6s),
247        "){0,4}%[0-9a-zA-Z]{1,}",
248        "::(ffff(:0{1,4}){0,1}:){0,1}",
249        ip!(v4a),
250        "|",
251        "(",
252        ip!(v6s),
253        ":){1,4}:",
254        ip!(v4a),
255        ")([\\s]|[[:punct:]]|$)",
256    )
257);
258
// Based on http://www.richardsramblings.com/regex/credit-card-numbers/
// Reformatted with inline comments and with support for dashes or whitespace between digits.
//
// Why so complicated? Because credit card numbers are variable length and we do not want to
// strip any number that just happens to have the same length.
//
// The pattern is a vendor prefix of three or four digits followed by exactly twelve more
// digits, each optionally preceded by a `-` or a whitespace character. Capture group 1 is the
// complete number.
regex!(
    CREDITCARD_REGEX,
    r#"(?x)
    \b(
        (?:  # vendor specific prefixes
                3[47]\d      # amex (no 13-digit version) (length: 15)
            | 4\d{3}       # visa (16-digit version only)
            | 5[1-5]\d\d   # mastercard
            | 65\d\d       # discover network (subset)
            | 6011         # discover network (subset)
        )

        # "wildcard" remainder (allowing dashes in every position because of variable length)
        ([-\s]?\d){12}
    )\b
    "#
);
281
282regex!(
283    PATH_REGEX,
284    r"(?ix)
285        (?:
286            (?:
287                \b(?:[a-zA-Z]:[\\/])?
288                (?:users|home|documents and settings|[^/\\]+[/\\]profiles)[\\/]
289            ) | (?:
290                /(?:home|users)/
291            )
292        )
293        (
294            [^/\\\r\n]+
295        )
296    "
297);
298
// Matches a PEM block whose header and footer read `-----BEGIN ... PRIVATE KEY-----` /
// `-----END ... PRIVATE KEY-----` (or `PUBLIC KEY`). Capture group 1 is the content between
// the two markers, which is the group the `Pemkey` rule replaces; the markers themselves and
// the optional line break next to each of them are matched outside the group.
//
// The `s` flag lets `.` match line breaks so the content may span several lines, and `.+?` is
// lazy so the match ends at the first footer. Spaces are escaped as `\ ` because verbose mode
// (`x`) ignores unescaped whitespace.
regex!(
    PEM_KEY_REGEX,
    r"(?sx)
        (?:
            -----
            BEGIN[A-Z\ ]+(?:PRIVATE|PUBLIC)\ KEY
            -----
            [\t\ ]*\r?\n?
        )
        (.+?)
        (?:
            \r?\n?
            -----
            END[A-Z\ ]+(?:PRIVATE|PUBLIC)\ KEY
            -----
        )
    "
);
317
// Matches credentials embedded in a URL: an optional `scheme:`, then `//`, then `user` or
// `user:password`, then `@`. Capture group 1 is the `user[:password]` part, which is the group
// the `UrlAuth` rule replaces.
regex!(
    URL_AUTH_REGEX,
    r"(?x)
        \b(?:
            (?:[a-z0-9+-]+:)?//
            ([a-zA-Z0-9%_.-]+(?::[a-zA-Z0-9%_.-]+)?)
        )@
    "
);
327
// Matches a word-bounded US social security number written as `NNN-NN-NNNN`; the dashes are
// required. The whole number is capture group 1.
regex!(
    US_SSN_REGEX,
    r"(?x)
        \b(
            [0-9]{3}-
            [0-9]{2}-
            [0-9]{4}
        )\b
    "
);
338
// Matches `Bearer` (case-insensitive) followed by whitespace and a run of non-whitespace
// characters. Group 1 is the `Bearer` prefix including the whitespace, group 2 is the token.
regex!(BEARER_TOKEN_REGEX, r"(?i)\b(Bearer\s+)([^\s]+)");
340
// Matches the text `token` anywhere in the input, case-insensitively. The `Password` rule
// applies it to keys only.
regex!(TOKEN_KEY_REGEX, r"(?i)(token)");
342
// Matches names that suggest a secret, case-insensitively. Most alternatives match anywhere in
// the input; `otp` and `two-factor`/`two_factor` only match when they are the entire input, and
// `token` only matches when followed (after optional non-whitespace) by `:` or `=`.
regex!(
    PASSWORD_KEY_REGEX,
    r"(?i)(password|secret|passwd|api[-_]key|apikey|auth|credentials|mysql_pwd|privatekey|private[-_]key|token[^\s]*[:=]|^otp$|^two[-_]factor$)"
);
347
#[cfg(test)]
mod tests {
    use super::*;

    /// The user path pattern must keep matching when it is rebuilt as a byte regex.
    #[test]
    fn test_userpath_utf8_bytes() {
        // This mimics `apply_regex_to_utf8_bytes`, which is used in minidump scrubbing.
        // Ideally we would not compile a regex on the fly for every minidump
        // (either add another lazy static or remove the distinction entirely).
        let mut builder = regex::bytes::RegexBuilder::new(PATH_REGEX.as_str());
        builder
            .unicode(false)
            .multi_line(false)
            .dot_matches_new_line(true);
        let byte_regex = builder.build().unwrap();

        assert!(byte_regex.is_match(br"C:\\Users\jane\somefile"));
    }
}
365}