relay_pii/
regexes.rs

1use std::sync::LazyLock;
2
3use regex::Regex;
4use smallvec::{SmallVec, smallvec};
5
6use crate::config::RuleType;
7
/// Selects which part of a key/value entry a rule's regex is matched against.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PatternType {
    /// The pattern is matched on the key and the value.
    KeyValue,
    /// The pattern is matched on the value alone.
    Value,
}
15
16/// What to do with regex matches once found.
17#[derive(Clone, Debug, Eq, PartialEq)]
18pub enum ReplaceBehavior {
19    /// Replace the entire string value (or just more than the match, depending on context).
20    Value,
21
22    /// Replace the following specific regex groups.
23    Groups(SmallVec<[u8; 1]>),
24}
25
26impl ReplaceBehavior {
27    /// Replace the entire string value (or just more than the match, depending on context).
28    pub fn replace_value() -> Self {
29        ReplaceBehavior::Value
30    }
31
32    /// Replace the entire match, equivalent to `ReplaceBehavior::Groups([0])`.
33    pub fn replace_match() -> Self {
34        ReplaceBehavior::replace_group(0)
35    }
36
37    /// Replace the following singular regex group.
38    pub fn replace_group(g: u8) -> Self {
39        ReplaceBehavior::Groups(smallvec![g])
40    }
41
42    /// Replace the following specific regex groups.
43    pub fn replace_groups(gs: SmallVec<[u8; 1]>) -> Self {
44        ReplaceBehavior::Groups(gs)
45    }
46}
47
48/// Return a list of regexes to apply for the given rule type.
49pub fn get_regex_for_rule_type(
50    ty: &RuleType,
51) -> SmallVec<[(PatternType, &Regex, ReplaceBehavior); 2]> {
52    let v = PatternType::Value;
53    let kv = PatternType::KeyValue;
54
55    match ty {
56        RuleType::RedactPair(redact_pair) => {
57            if let Ok(pattern) = redact_pair.key_pattern.compiled() {
58                smallvec![(kv, pattern, ReplaceBehavior::replace_value())]
59            } else {
60                smallvec![]
61            }
62        }
63        RuleType::Bearer => {
64            smallvec![(v, &*BEARER_TOKEN_REGEX, ReplaceBehavior::replace_match())]
65        }
66        RuleType::Password => {
67            smallvec![
68                // Bearer token was moved to its own regest and type out of the passwords, but we
69                // still keep it here for backwards compatibility.
70                (v, &*BEARER_TOKEN_REGEX, ReplaceBehavior::replace_match()),
71                (kv, &*PASSWORD_KEY_REGEX, ReplaceBehavior::replace_value()),
72            ]
73        }
74        RuleType::Anything => smallvec![(v, &*ANYTHING_REGEX, ReplaceBehavior::replace_match())],
75        RuleType::Pattern(r) => {
76            let replace_behavior = match r.replace_groups {
77                Some(ref groups) => {
78                    ReplaceBehavior::replace_groups(groups.iter().copied().collect())
79                }
80                None => ReplaceBehavior::replace_match(),
81            };
82            if let Ok(pattern) = r.pattern.compiled() {
83                smallvec![(v, pattern, replace_behavior)]
84            } else {
85                smallvec![]
86            }
87        }
88
89        RuleType::Imei => smallvec![(v, &*IMEI_REGEX, ReplaceBehavior::replace_match())],
90        RuleType::Mac => smallvec![(v, &*MAC_REGEX, ReplaceBehavior::replace_match())],
91        RuleType::Uuid => smallvec![(v, &*UUID_REGEX, ReplaceBehavior::replace_match())],
92        RuleType::Email => smallvec![(v, &*EMAIL_REGEX, ReplaceBehavior::replace_match())],
93        RuleType::Iban => smallvec![(v, &*IBAN_REGEX, ReplaceBehavior::replace_match())],
94        RuleType::Ip => smallvec![
95            (v, &*IPV4_REGEX, ReplaceBehavior::replace_match()),
96            (v, &*IPV6_REGEX, ReplaceBehavior::replace_group(1)),
97        ],
98        RuleType::Creditcard => {
99            smallvec![(v, &*CREDITCARD_REGEX, ReplaceBehavior::replace_match())]
100        }
101        RuleType::Pemkey => smallvec![(v, &*PEM_KEY_REGEX, ReplaceBehavior::replace_group(1))],
102        RuleType::UrlAuth => smallvec![(v, &*URL_AUTH_REGEX, ReplaceBehavior::replace_group(1))],
103        RuleType::UsSsn => smallvec![(v, &*US_SSN_REGEX, ReplaceBehavior::replace_match())],
104        RuleType::Userpath => smallvec![(v, &*PATH_REGEX, ReplaceBehavior::replace_group(1))],
105
106        // These ought to have been resolved in CompiledConfig
107        RuleType::Alias(_) | RuleType::Multiple(_) | RuleType::Unknown(_) => smallvec![],
108    }
109}
110
// Regex source fragments for IP address patterns, selected by keyword. Every arm expands
// to a string literal (or a `concat!` of literals), so the fragments can be embedded in
// further `concat!` calls.
//
// - `v4s`: one IPv4 octet in decimal, 0-255, with leading zeros permitted up to three
//   digits. The fragment is wrapped in a capturing group.
// - `v4a`: a full dotted-quad address, i.e. four `v4s` fragments joined by literal dots.
// - `v6s`: one IPv6 segment, 1-4 hexadecimal digits.
#[rustfmt::skip]
macro_rules! ip {
    (v4s) => { "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)" };
    (v4a) => { concat!(ip!(v4s), "\\.", ip!(v4s), "\\.", ip!(v4s), "\\.", ip!(v4s)) };
    (v6s) => { "[0-9a-fA-F]{1,4}" };
}
117
/// Declares a lazily compiled [`Regex`] static called `$name` from the pattern `$pattern`.
///
/// The static is placed in a private module of the same name and imported back into the
/// surrounding scope. The module also carries a generated unit test which checks that the
/// pattern still builds as a byte regex with Unicode support switched off.
///
/// Compilation happens on first access; an invalid pattern panics at that point.
macro_rules! regex {
    ($name:ident, $pattern:expr) => {
        #[allow(non_snake_case)]
        mod $name {
            use super::*;

            pub static $name: LazyLock<Regex> = LazyLock::new(|| Regex::new($pattern).unwrap());

            #[test]
            fn supports_byte_mode() {
                let mut builder = regex::bytes::RegexBuilder::new($name.as_str());
                builder
                    .unicode(false)
                    .multi_line(false)
                    .dot_matches_new_line(true);

                assert!(builder.build().is_ok());
            }
        }
        use $name::$name;
    };
}
140
/// Regex behind `RuleType::Anything`: matches any run of characters, including the empty
/// one.
///
/// No flags are set, so with the `regex` crate's defaults `.` does not match `\n` and a
/// single match never extends across a line break.
///
/// Unlike the other patterns in this file it is declared directly rather than through
/// `regex!`, so no byte-mode test is generated for it.
pub static ANYTHING_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(".*").unwrap());
142
// Matches a run of 15 or 16 digits written as groups of 2, 6, 6 and 1-2 digits, where each
// of the first three groups may be followed by a single dash. The run must sit between
// word boundaries. Verbose mode (`x`) is on, so the whitespace in the pattern is ignored.
regex!(
    IMEI_REGEX,
    r"(?x)
        \b
            (\d{2}-?
                \d{6}-?
                \d{6}-?
                \d{1,2})
        \b
    "
);
154
// Matches six pairs of hexadecimal digits separated by `:` or `-`, between word
// boundaries. Each separator is matched independently, so mixed separators are accepted.
regex!(
    MAC_REGEX,
    r"(?x)
        \b([[:xdigit:]]{2}[:-]){5}[[:xdigit:]]{2}\b
    "
);
161
// Matches, case-insensitively, 32 alphanumeric characters in the 8-4-4-4-12 grouping, with
// an optional dash after each of the first four groups, between word boundaries.
//
// NOTE(review): the character class is `[a-z0-9]`, not hex-only, so any 32-character
// alphanumeric token matches as well - presumably deliberate over-matching; confirm.
regex!(
    UUID_REGEX,
    r"(?ix)
        \b
        [a-z0-9]{8}-?
        [a-z0-9]{4}-?
        [a-z0-9]{4}-?
        [a-z0-9]{4}-?
        [a-z0-9]{12}
        \b
    "
);
174
// Matches `local@domain`: a local part made of ASCII letters, digits and the listed
// punctuation, an `@`, and a domain of one or more dot-separated labels consisting of
// letters, digits and dashes. The match has to start and end on a word boundary.
//
// `#` is escaped inside the character class because verbose mode (`x`) would otherwise
// treat it as the start of a comment.
regex!(
    EMAIL_REGEX,
    r"(?x)
        \b
            [a-zA-Z0-9.!\#$%&'*+/=?^_`{|}~-]+
            @
            [a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*
        \b
    "
);
185
// Matches one of the listed two-letter country codes, followed by two digits and 11-29
// ASCII letters or digits, between word boundaries. The `i` flag is not set, so the
// country code must be upper case; the remainder accepts either case.
regex!(
    IBAN_REGEX,
    r"(?x)
        \b
        (AT|AD|AE|AL|AZ|BA|BE|BG|BH|BR|BY|CH|CR|CY|CZ|DE|DK|DO|EE|EG|ES|FI|FO|FR|GB|GE|GI|GL|GR|GT|HR|HU|IE|IL|IQ|IS|IT|JO|KW|KZ|LB|LC|LI|LT|LU|LV|LY|MC|MD|ME|MK|MR|MT|MU|NL|NO|PK|PL|PS|PT|QA|RO|RU|RS|SA|SC|SE|SI|SK|SM|ST|SV|TL|TN|TR|UA|VA|VG|XK|DZ|AO|BJ|BF|BI|CV|CM|CF|TD|KM|CG|CI|DJ|GQ|GA|GW|HN|IR|MG|ML|MA|MZ|NI|NE|SN|TG)\d{2}[a-zA-Z0-9]{11,29}
        \b
    "
);
194
// Matches a dotted-quad IPv4 address (`ip!(v4a)`: four octets in the range 0-255) between
// word boundaries. Each octet is a capture group of its own; `get_regex_for_rule_type`
// replaces the whole match.
regex!(IPV4_REGEX, concat!("\\b", ip!(v4a), "\\b"));
196
197regex!(
198    IPV6_REGEX,
199    concat!(
200        "(?i)(?:[\\s]|[[:punct:]]|^)(",
201        "(",
202        ip!(v6s),
203        ":){7}",
204        ip!(v6s),
205        "|",
206        "(",
207        ip!(v6s),
208        ":){1,7}:|",
209        "(",
210        ip!(v6s),
211        ":){1,6}::",
212        ip!(v6s),
213        "|",
214        "(",
215        ip!(v6s),
216        ":){1,5}:(:",
217        ip!(v6s),
218        "){1,2}|",
219        "(",
220        ip!(v6s),
221        ":){1,4}:(:",
222        ip!(v6s),
223        "){1,3}|",
224        "(",
225        ip!(v6s),
226        ":){1,3}:(:",
227        ip!(v6s),
228        "){1,4}|",
229        "(",
230        ip!(v6s),
231        ":){1,2}:(:",
232        ip!(v6s),
233        "){1,5}|",
234        ip!(v6s),
235        ":((:",
236        ip!(v6s),
237        "){1,6})|",
238        ":((:",
239        ip!(v6s),
240        "){1,7}|:)|",
241        "fe80:(:",
242        ip!(v6s),
243        "){0,4}%[0-9a-zA-Z]{1,}",
244        "::(ffff(:0{1,4}){0,1}:){0,1}",
245        ip!(v4a),
246        "|",
247        "(",
248        ip!(v6s),
249        ":){1,4}:",
250        ip!(v4a),
251        ")([\\s]|[[:punct:]]|$)",
252    )
253);
254
// Matches credit card numbers of selected vendors, between word boundaries.
//
// Based on http://www.richardsramblings.com/regex/credit-card-numbers/, re-formatted with
// comments and with support for dashes.
//
// The pattern is a vendor-specific prefix (3 digits for amex, 4 digits for the others)
// followed by exactly 12 more digits, each of which may be preceded by one dash or
// whitespace character. It is this involved because credit card numbers vary in length
// and we do not want to strip every number that merely happens to have the same length.
regex!(
    CREDITCARD_REGEX,
    r#"(?x)
    \b(
        (?:  # vendor specific prefixes
                3[47]\d      # amex (no 13-digit version) (length: 15)
            | 4\d{3}       # visa (16-digit version only)
            | 5[1-5]\d\d   # mastercard
            | 65\d\d       # discover network (subset)
            | 6011         # discover network (subset)
        )

        # "wildcard" remainder (allowing dashes in every position because of variable length)
        ([-\s]?\d){12}
    )\b
    "#
);
277
278regex!(
279    PATH_REGEX,
280    r"(?ix)
281        (?:
282            (?:
283                \b(?:[a-zA-Z]:[\\/])?
284                (?:users|home|documents and settings|[^/\\]+[/\\]profiles)[\\/]
285            ) | (?:
286                /(?:home|users)/
287            )
288        )
289        (
290            [^/\\\r\n]+
291        )
292    "
293);
294
// Matches a PEM-style key block: a `-----BEGIN ... PRIVATE KEY-----` or
// `-----BEGIN ... PUBLIC KEY-----` header, the key body, and the corresponding `END` line.
//
// - `[A-Z\ ]+` requires at least one character (at minimum the separating space) between
//   `BEGIN`/`END` and `PRIVATE`/`PUBLIC`.
// - Capture group 1 is the body. It is matched lazily, and the `s` flag lets `.` span line
//   breaks. `get_regex_for_rule_type` replaces only this group, so the header and footer
//   lines are kept.
// - Verbose mode (`x`) is on, hence literal spaces are written as `\ `.
regex!(
    PEM_KEY_REGEX,
    r"(?sx)
        (?:
            -----
            BEGIN[A-Z\ ]+(?:PRIVATE|PUBLIC)\ KEY
            -----
            [\t\ ]*\r?\n?
        )
        (.+?)
        (?:
            \r?\n?
            -----
            END[A-Z\ ]+(?:PRIVATE|PUBLIC)\ KEY
            -----
        )
    "
);
313
// Matches the credentials part of a URL: an optional lower-case scheme and `//`, then
// `user` or `user:password` (capture group 1), followed by `@`.
//
// `get_regex_for_rule_type` replaces only group 1, so the scheme, the `//` and the `@`
// are kept.
regex!(
    URL_AUTH_REGEX,
    r"(?x)
        \b(?:
            (?:[a-z0-9+-]+:)?//
            ([a-zA-Z0-9%_.-]+(?::[a-zA-Z0-9%_.-]+)?)
        )@
    "
);
323
// Matches digits in the `NNN-NN-NNNN` layout (3, 2 and 4 ASCII digits joined by dashes),
// between word boundaries. The dashes are mandatory.
regex!(
    US_SSN_REGEX,
    r"(?x)
        \b(
            [0-9]{3}-
            [0-9]{2}-
            [0-9]{4}
        )\b
    "
);
334
// Matches, case-insensitively, the word `Bearer` plus following whitespace (capture
// group 1) and the token itself, a run of non-whitespace characters (capture group 2).
// `get_regex_for_rule_type` replaces the whole match, i.e. the keyword together with the
// token.
regex!(BEARER_TOKEN_REGEX, r"(?i)\b(Bearer\s+)([^\s]+)");
336
// Matches, case-insensitively, any of the listed credential-related words. The pattern is
// not anchored, so it also matches when a word is only a substring of the input (for
// example `auth` inside `author`).
//
// `get_regex_for_rule_type` registers it as a key/value pattern that replaces the whole
// value.
regex!(
    PASSWORD_KEY_REGEX,
    r"(?i)(password|secret|passwd|api_key|apikey|auth|credentials|mysql_pwd|privatekey|private_key|token)"
);
341
#[cfg(test)]
mod tests {
    use super::*;

    /// The user-path pattern must keep matching when it is compiled as a byte regex.
    #[test]
    fn test_userpath_utf8_bytes() {
        // This mimics `apply_regex_to_utf8_bytes`, which is used in minidump scrubbing.
        // Ideally a regex would not be compiled on the fly for every minidump: either add
        // another lazy static, or drop the distinction between the two modes entirely.
        let mut builder = regex::bytes::RegexBuilder::new(PATH_REGEX.as_str());
        builder
            .unicode(false)
            .multi_line(false)
            .dot_matches_new_line(true);
        let byte_regex = builder.build().unwrap();

        let haystack: &[u8] = br"C:\\Users\jane\somefile";
        assert!(byte_regex.is_match(haystack));
    }
}
359}