relay_pii/
regexes.rs

1use std::sync::LazyLock;
2
3use regex::Regex;
4use smallvec::{SmallVec, smallvec};
5
6use crate::config::RuleType;
7
/// Selects which parts of a key/value pair a regex is matched against.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PatternType {
    /// The pattern is matched against both the key and the value.
    KeyValue,
    /// The pattern is matched against the value only.
    Value,
}
15
16/// What to do with regex matches once found.
17#[derive(Clone, Debug, Eq, PartialEq)]
18pub enum ReplaceBehavior {
19    /// Replace the entire string value (or just more than the match, depending on context).
20    Value,
21
22    /// Replace the following specific regex groups.
23    Groups(SmallVec<[u8; 1]>),
24}
25
26impl ReplaceBehavior {
27    /// Replace the entire string value (or just more than the match, depending on context).
28    pub fn replace_value() -> Self {
29        ReplaceBehavior::Value
30    }
31
32    /// Replace the entire match, equivalent to `ReplaceBehavior::Groups([0])`.
33    pub fn replace_match() -> Self {
34        ReplaceBehavior::replace_group(0)
35    }
36
37    /// Replace the following singular regex group.
38    pub fn replace_group(g: u8) -> Self {
39        ReplaceBehavior::Groups(smallvec![g])
40    }
41
42    /// Replace the following specific regex groups.
43    pub fn replace_groups(gs: SmallVec<[u8; 1]>) -> Self {
44        ReplaceBehavior::Groups(gs)
45    }
46}
47
48/// Return a list of regexes to apply for the given rule type.
49pub fn get_regex_for_rule_type(
50    ty: &RuleType,
51) -> SmallVec<[(PatternType, &Regex, ReplaceBehavior); 2]> {
52    let v = PatternType::Value;
53    let kv = PatternType::KeyValue;
54
55    match ty {
56        RuleType::RedactPair(redact_pair) => {
57            if let Ok(pattern) = redact_pair.key_pattern.compiled() {
58                smallvec![(kv, pattern, ReplaceBehavior::replace_value())]
59            } else {
60                smallvec![]
61            }
62        }
63        RuleType::Password => {
64            smallvec![(kv, &*PASSWORD_KEY_REGEX, ReplaceBehavior::replace_value())]
65        }
66        RuleType::Anything => smallvec![(v, &*ANYTHING_REGEX, ReplaceBehavior::replace_match())],
67        RuleType::Pattern(r) => {
68            let replace_behavior = match r.replace_groups {
69                Some(ref groups) => {
70                    ReplaceBehavior::replace_groups(groups.iter().copied().collect())
71                }
72                None => ReplaceBehavior::replace_match(),
73            };
74            if let Ok(pattern) = r.pattern.compiled() {
75                smallvec![(v, pattern, replace_behavior)]
76            } else {
77                smallvec![]
78            }
79        }
80
81        RuleType::Imei => smallvec![(v, &*IMEI_REGEX, ReplaceBehavior::replace_match())],
82        RuleType::Mac => smallvec![(v, &*MAC_REGEX, ReplaceBehavior::replace_match())],
83        RuleType::Uuid => smallvec![(v, &*UUID_REGEX, ReplaceBehavior::replace_match())],
84        RuleType::Email => smallvec![(v, &*EMAIL_REGEX, ReplaceBehavior::replace_match())],
85        RuleType::Iban => smallvec![(v, &*IBAN_REGEX, ReplaceBehavior::replace_match())],
86        RuleType::Ip => smallvec![
87            (v, &*IPV4_REGEX, ReplaceBehavior::replace_match()),
88            (v, &*IPV6_REGEX, ReplaceBehavior::replace_group(1)),
89        ],
90        RuleType::Creditcard => {
91            smallvec![(v, &*CREDITCARD_REGEX, ReplaceBehavior::replace_match())]
92        }
93        RuleType::Pemkey => smallvec![(v, &*PEM_KEY_REGEX, ReplaceBehavior::replace_group(1))],
94        RuleType::UrlAuth => smallvec![(v, &*URL_AUTH_REGEX, ReplaceBehavior::replace_group(1))],
95        RuleType::UsSsn => smallvec![(v, &*US_SSN_REGEX, ReplaceBehavior::replace_match())],
96        RuleType::Userpath => smallvec![(v, &*PATH_REGEX, ReplaceBehavior::replace_group(1))],
97
98        // These ought to have been resolved in CompiledConfig
99        RuleType::Alias(_) | RuleType::Multiple(_) | RuleType::Unknown(_) => smallvec![],
100    }
101}
102
// Compile-time building blocks for the IP address patterns:
//
// - `v4s`: a single IPv4 octet (`0` to `255`), wrapped in a capture group.
// - `v4a`: a full IPv4 address, i.e. four `v4s` octets joined by literal dots.
// - `v6s`: a single IPv6 group of one to four hex digits.
#[rustfmt::skip]
macro_rules! ip {
    (v4s) => { r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)" };
    (v4a) => { concat!(ip!(v4s), r"\.", ip!(v4s), r"\.", ip!(v4s), r"\.", ip!(v4s)) };
    (v6s) => { r"[0-9a-fA-F]{1,4}" };
}
109
/// Declares a lazily compiled static [`Regex`] named `$name` from the pattern `$rule`.
///
/// The static is placed in a private module of the same name and imported back into the
/// invoking scope. Alongside it, a `supports_byte_mode` test is generated that checks the very
/// same pattern also builds as a `regex::bytes` regex with Unicode support turned off.
macro_rules! regex {
    ($name:ident, $rule:expr) => {
        #[allow(non_snake_case)]
        mod $name {
            use super::*;

            pub static $name: LazyLock<Regex> = LazyLock::new(|| Regex::new($rule).unwrap());

            #[test]
            fn supports_byte_mode() {
                let mut builder = regex::bytes::RegexBuilder::new($name.as_str());
                builder
                    .unicode(false)
                    .multi_line(false)
                    .dot_matches_new_line(true);

                assert!(builder.build().is_ok());
            }
        }
        use $name::$name;
    };
}
132
133pub static ANYTHING_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(".*").unwrap());
134
// IMEI-like numbers: digit groups of 2, 6, 6 and 1-2 digits, where the first three groups may
// each be followed by a dash. The whole number must sit between word boundaries.
regex!(
    IMEI_REGEX,
    r"(?x)
        \b
            (\d{2}-?
                \d{6}-?
                \d{6}-?
                \d{1,2})
        \b
    "
);
146
// MAC addresses: six pairs of hex digits separated by `:` or `-`, between word boundaries.
regex!(
    MAC_REGEX,
    r"(?x)
        \b([[:xdigit:]]{2}[:-]){5}[[:xdigit:]]{2}\b
    "
);
153
// UUID-shaped strings (case-insensitive): groups of 8, 4, 4, 4 and 12 characters with an
// optional dash between groups. Note that the character class is `[a-z0-9]`, so it is not
// restricted to hex digits.
regex!(
    UUID_REGEX,
    r"(?ix)
        \b
        [a-z0-9]{8}-?
        [a-z0-9]{4}-?
        [a-z0-9]{4}-?
        [a-z0-9]{4}-?
        [a-z0-9]{12}
        \b
    "
);
166
// Email addresses: a local part of letters, digits and the listed punctuation, an `@`, and a
// domain of one or more dot-separated labels made of letters, digits and dashes.
// `\#` is escaped because an unescaped `#` starts a comment under the `x` flag.
regex!(
    EMAIL_REGEX,
    r"(?x)
        \b
            [a-zA-Z0-9.!\#$%&'*+/=?^_`{|}~-]+
            @
            [a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*
        \b
    "
);
177
// IBANs: one of the listed two-letter country codes (uppercase only, the pattern is
// case-sensitive), two digits, then 11 to 29 alphanumeric characters.
regex!(
    IBAN_REGEX,
    r"(?x)
        \b
        (AT|AD|AE|AL|AZ|BA|BE|BG|BH|BR|BY|CH|CR|CY|CZ|DE|DK|DO|EE|EG|ES|FI|FO|FR|GB|GE|GI|GL|GR|GT|HR|HU|IE|IL|IQ|IS|IT|JO|KW|KZ|LB|LC|LI|LT|LU|LV|LY|MC|MD|ME|MK|MR|MT|MU|NL|NO|PK|PL|PS|PT|QA|RO|RU|RS|SA|SC|SE|SI|SK|SM|ST|SV|TL|TN|TR|UA|VA|VG|XK|DZ|AO|BJ|BF|BI|CV|CM|CF|TD|KM|CG|CI|DJ|GQ|GA|GW|HN|IR|MG|ML|MA|MZ|NI|NE|SN|TG)\d{2}[a-zA-Z0-9]{11,29}
        \b
    "
);
186
// IPv4 addresses: four dot-separated octets (see `ip!(v4a)`) between word boundaries.
regex!(IPV4_REGEX, concat!("\\b", ip!(v4a), "\\b"));
188
189regex!(
190    IPV6_REGEX,
191    concat!(
192        "(?i)(?:[\\s]|[[:punct:]]|^)(",
193        "(",
194        ip!(v6s),
195        ":){7}",
196        ip!(v6s),
197        "|",
198        "(",
199        ip!(v6s),
200        ":){1,7}:|",
201        "(",
202        ip!(v6s),
203        ":){1,6}::",
204        ip!(v6s),
205        "|",
206        "(",
207        ip!(v6s),
208        ":){1,5}:(:",
209        ip!(v6s),
210        "){1,2}|",
211        "(",
212        ip!(v6s),
213        ":){1,4}:(:",
214        ip!(v6s),
215        "){1,3}|",
216        "(",
217        ip!(v6s),
218        ":){1,3}:(:",
219        ip!(v6s),
220        "){1,4}|",
221        "(",
222        ip!(v6s),
223        ":){1,2}:(:",
224        ip!(v6s),
225        "){1,5}|",
226        ip!(v6s),
227        ":((:",
228        ip!(v6s),
229        "){1,6})|",
230        ":((:",
231        ip!(v6s),
232        "){1,7}|:)|",
233        "fe80:(:",
234        ip!(v6s),
235        "){0,4}%[0-9a-zA-Z]{1,}",
236        "::(ffff(:0{1,4}){0,1}:){0,1}",
237        ip!(v4a),
238        "|",
239        "(",
240        ip!(v6s),
241        ":){1,4}:",
242        ip!(v4a),
243        ")([\\s]|[[:punct:]]|$)",
244    )
245);
246
// http://www.richardsramblings.com/regex/credit-card-numbers/
// Re-formatted with comments and dashes support
//
// Why so complicated? Because credit card numbers are variable length and we do not want to
// strip any number that just happens to have the same length.
//
// The pattern is a vendor-specific prefix followed by twelve more digits, each of which may be
// preceded by a single dash or whitespace character.
regex!(
    CREDITCARD_REGEX,
    r#"(?x)
    \b(
        (?:  # vendor specific prefixes
                3[47]\d      # amex (no 13-digit version) (length: 15)
            | 4\d{3}       # visa (16-digit version only)
            | 5[1-5]\d\d   # mastercard
            | 65\d\d       # discover network (subset)
            | 6011         # discover network (subset)
        )

        # "wildcard" remainder (allowing dashes in every position because of variable length)
        ([-\s]?\d){12}
    )\b
    "#
);
269
270regex!(
271    PATH_REGEX,
272    r"(?ix)
273        (?:
274            (?:
275                \b(?:[a-zA-Z]:[\\/])?
276                (?:users|home|documents and settings|[^/\\]+[/\\]profiles)[\\/]
277            ) | (?:
278                /(?:home|users)/
279            )
280        )
281        (
282            [^/\\\r\n]+
283        )
284    "
285);
286
// PEM-encoded keys: a `-----BEGIN ... PRIVATE KEY-----` or `-----BEGIN ... PUBLIC KEY-----`
// header, the key body, and the corresponding `-----END ...-----` footer.
//
// Capture group 1 is the body between header and footer; it is matched lazily so it stops at the
// first footer. The `s` flag lets `.` match line breaks, and spaces are escaped as `\ ` because
// the `x` flag ignores unescaped whitespace.
regex!(
    PEM_KEY_REGEX,
    r"(?sx)
        (?:
            -----
            BEGIN[A-Z\ ]+(?:PRIVATE|PUBLIC)\ KEY
            -----
            [\t\ ]*\r?\n?
        )
        (.+?)
        (?:
            \r?\n?
            -----
            END[A-Z\ ]+(?:PRIVATE|PUBLIC)\ KEY
            -----
        )
    "
);
305
// Credentials in URLs: an optional scheme, `//`, then `user` or `user:password` followed by `@`.
// Capture group 1 is the credentials part only, without the scheme, slashes or the `@`.
regex!(
    URL_AUTH_REGEX,
    r"(?x)
        \b(?:
            (?:[a-z0-9+-]+:)?//
            ([a-zA-Z0-9%_.-]+(?::[a-zA-Z0-9%_.-]+)?)
        )@
    "
);
315
// US social security numbers in dashed form: 3 digits, 2 digits and 4 digits separated by
// dashes, between word boundaries.
regex!(
    US_SSN_REGEX,
    r"(?x)
        \b(
            [0-9]{3}-
            [0-9]{2}-
            [0-9]{4}
        )\b
    "
);
326
// Case-insensitive list of substrings that mark a key as holding a secret. It is not anchored,
// so it matches anywhere within a key. `get_regex_for_rule_type` returns it as a key/value
// pattern with value replacement for `RuleType::Password`.
regex!(
    PASSWORD_KEY_REGEX,
    r"(?i)(password|secret|passwd|api_key|apikey|auth|credentials|mysql_pwd|privatekey|private_key|token|bearer)"
);
331
#[cfg(test)]
mod tests {
    use super::*;

    /// The user path pattern must also match when compiled as a non-Unicode byte regex.
    #[test]
    fn test_userpath_utf8_bytes() {
        // This mimics `apply_regex_to_utf8_bytes`, which is used in minidump scrubbing.
        // Ideally we would not compile a regex on the fly for every minidump
        // (either add another lazy static or remove the distinction entirely).
        let mut builder = regex::bytes::RegexBuilder::new(PATH_REGEX.as_str());
        builder
            .unicode(false)
            .multi_line(false)
            .dot_matches_new_line(true);
        let byte_regex = builder.build().unwrap();

        let haystack: &[u8] = br"C:\\Users\jane\somefile";
        assert!(byte_regex.is_match(haystack));
    }
}