relay_pii/
config.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
use std::borrow::Cow;
use std::collections::{BTreeMap, BTreeSet};
use std::sync::OnceLock;

use regex::{Regex, RegexBuilder};
use serde::{Deserialize, Deserializer, Serialize, Serializer};

use crate::{CompiledPiiConfig, Redaction, SelectorSpec};

/// Size limit handed to `RegexBuilder::size_limit` when a [`LazyPattern`] is compiled.
///
/// The value is 262,144 bytes (256 KiB). The `regex` crate treats this as an approximate limit
/// on the size of the compiled regex; building a pattern that exceeds it returns an error.
const COMPILED_PATTERN_MAX_SIZE: usize = 262_144;

/// Returns `true` when `flag` is `false`, i.e. when it holds the default value of `bool`.
///
/// The argument is taken by reference because this function is used as a serde
/// `skip_serializing_if` predicate, which is invoked with a reference to the field.
#[allow(clippy::trivially_copy_pass_by_ref)]
pub(crate) fn is_flag_default(flag: &bool) -> bool {
    let &is_set = flag;
    !is_set
}

/// An error returned when parsing [`PiiConfig`].
///
/// The `Display` output of each variant is the message given in its `#[error(...)]` attribute;
/// the underlying cause is exposed through [`std::error::Error::source`].
#[derive(Clone, Debug, thiserror::Error)]
pub enum PiiConfigError {
    /// A match pattern in a PII rule config could not be parsed.
    ///
    /// Wraps the [`regex::Error`] reported by the regex builder as the error source.
    #[error("could not parse pattern")]
    RegexError(#[source] regex::Error),
}

/// Wrapper for the regex and the raw pattern string.
///
/// The regex is compiled lazily, the first time it is used. The outcome of that compilation
/// (either the regex or the error) is cached and reused on subsequent calls.
#[derive(Debug, Clone)]
pub struct LazyPattern {
    /// The uncompiled pattern source. This is the only part that is serialized.
    raw: Cow<'static, str>,
    /// Whether the regex is built with case-insensitive matching.
    case_insensitive: bool,
    /// Cache holding the compilation result once the pattern has been used.
    pattern: OnceLock<Result<Regex, PiiConfigError>>,
}

impl PartialEq for LazyPattern {
    /// Compares two patterns by the inputs that determine the compiled regex.
    ///
    /// Two patterns are equal when both the raw pattern string and the case-insensitivity flag
    /// match exactly. The cached compilation result is ignored because it is derived from those
    /// two values.
    ///
    /// The raw strings are deliberately compared case-sensitively: regex syntax is case
    /// significant (for example `\d` matches digits while `\D` matches non-digits), so folding
    /// case before comparing would treat semantically different patterns as equal.
    fn eq(&self, other: &Self) -> bool {
        // Check the cheap flag first; the string comparison does not allocate.
        self.case_insensitive == other.case_insensitive && self.raw == other.raw
    }
}

impl LazyPattern {
    /// Create a new [`LazyPattern`] from a raw string.
    pub fn new<S>(raw: S) -> Self
    where
        Cow<'static, str>: From<S>,
    {
        Self {
            raw: raw.into(),
            case_insensitive: false,
            pattern: OnceLock::new(),
        }
    }

    /// Change the case sensativity settings for the underlying regex.
    ///
    /// It's possible to set the case sensativity on already compiled [`LazyPattern`], which will
    /// be recompiled (re-built) once it's used again.
    pub fn case_insensitive(mut self, value: bool) -> Self {
        self.case_insensitive = value;
        self.pattern.take();
        self
    }

    /// Compiles the regex from the internal raw string.
    pub fn compiled(&self) -> Result<&Regex, &PiiConfigError> {
        self.pattern
            .get_or_init(|| {
                let regex_result = RegexBuilder::new(&self.raw)
                    .size_limit(COMPILED_PATTERN_MAX_SIZE)
                    .case_insensitive(self.case_insensitive)
                    .build()
                    .map_err(PiiConfigError::RegexError);

                if let Err(ref error) = regex_result {
                    relay_log::error!(
                        error = error as &dyn std::error::Error,
                        "unable to compile pattern into regex"
                    );
                }
                regex_result
            })
            .as_ref()
    }
}

impl From<&'static str> for LazyPattern {
    fn from(pattern: &'static str) -> LazyPattern {
        LazyPattern::new(pattern)
    }
}

impl Serialize for LazyPattern {
    /// Serializes the pattern as its raw source string.
    ///
    /// Neither the case-insensitivity flag nor the cached regex is written.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let raw: &str = &self.raw;
        serializer.serialize_str(raw)
    }
}

impl<'de> Deserialize<'de> for LazyPattern {
    /// Reads a string and wraps it in a fresh pattern.
    ///
    /// The string is not validated here; an invalid pattern only surfaces once the regex is
    /// compiled.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        String::deserialize(deserializer).map(Self::new)
    }
}

/// Serde default for `PatternRule::replace_groups`: the set containing only group `0`.
///
/// The value is wrapped in `Some` because the field it initializes is an `Option`.
#[allow(clippy::unnecessary_wraps)]
fn replace_groups_default() -> Option<BTreeSet<u8>> {
    Some(BTreeSet::from([0]))
}

/// A rule that matches a regex pattern.
///
/// Field names are (de)serialized in camelCase, so `replace_groups` appears as `replaceGroups`.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct PatternRule {
    /// The regular expression to apply.
    pub pattern: LazyPattern,
    /// The match group indices to replace.
    ///
    /// If the field is missing during deserialization, it defaults to the set containing only
    /// group `0` (in the `regex` crate, group 0 is the entire match).
    #[serde(default = "replace_groups_default")]
    pub replace_groups: Option<BTreeSet<u8>>,
}

/// A rule that dispatches to multiple other rules.
///
/// Field names are (de)serialized in camelCase, so `hide_inner` appears as `hideInner`.
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct MultipleRule {
    /// A reference to other rules to apply
    pub rules: Vec<String>,
    /// When set to true, the outer rule is reported.
    ///
    /// Defaults to `false` when absent and is omitted from serialized output while `false`.
    #[serde(default, skip_serializing_if = "is_flag_default")]
    pub hide_inner: bool,
}

/// An alias for another rule.
///
/// Field names are (de)serialized in camelCase, so `hide_inner` appears as `hideInner`.
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct AliasRule {
    /// A reference to another rule to apply.
    pub rule: String,
    /// When set to true, the outer rule is reported.
    ///
    /// Defaults to `false` when absent and is omitted from serialized output while `false`.
    #[serde(default, skip_serializing_if = "is_flag_default")]
    pub hide_inner: bool,
}

/// A pair redaction rule.
///
/// Field names are (de)serialized in camelCase, so `key_pattern` appears as `keyPattern`.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct RedactPairRule {
    /// A pattern to match for keys.
    pub key_pattern: LazyPattern,
}

/// Supported scrubbing rules.
///
/// The enum is internally tagged for serde: the variant is selected by a `type` field whose
/// value is the variant name in snake_case (for example `url_auth` or `us_ssn`). Variants that
/// carry a struct contribute that struct's fields next to the `type` field.
#[derive(Deserialize, Serialize, Debug, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum RuleType {
    /// Matches any value.
    Anything,
    /// Applies a regular expression.
    Pattern(PatternRule),
    /// Matches an IMEI or IMEISV
    Imei,
    /// Matches a mac address
    Mac,
    /// Matches a UUID
    Uuid,
    /// Matches an email
    Email,
    /// Matches any IP address
    Ip,
    /// Matches a creditcard number
    Creditcard,
    /// Matches an IBAN
    Iban,
    /// Sanitizes a path from user data
    Userpath,
    /// A PEM encoded key
    Pemkey,
    /// Auth info from URLs
    UrlAuth,
    /// US SSN.
    UsSsn,
    /// Keys that look like passwords
    Password,
    /// When a regex matches a key, a value is removed
    ///
    /// In addition to `redact_pair`, the tag `redactPair` is accepted when deserializing.
    #[serde(alias = "redactPair")]
    RedactPair(RedactPairRule),
    /// Applies multiple rules.
    Multiple(MultipleRule),
    /// Applies another rule.  Works like a single multiple.
    Alias(AliasRule),
    /// Unknown ruletype for forward compatibility
    Unknown(String),
}

/// A single rule configuration.
///
/// Combines what to match (`ty`) with how to redact the match (`redaction`).
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct RuleSpec {
    /// The matching rule to apply on fields.
    ///
    /// Flattened for serde: the rule type's `type` tag and fields sit directly on the rule
    /// object rather than under a nested `ty` key.
    #[serde(flatten)]
    pub ty: RuleType,

    /// The redaction to apply on matched fields.
    ///
    /// Falls back to `Redaction::default()` when absent during deserialization.
    #[serde(default)]
    pub redaction: Redaction,
}

/// Configuration for rule parameters.
///
/// Field names are (de)serialized in camelCase, so `hash_key` appears as `hashKey`.
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct Vars {
    /// The default secret key for hashing operations.
    ///
    /// Defaults to `None` when absent and is omitted from serialized output while `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub hash_key: Option<String>,
}

impl Vars {
    /// Returns `true` when no variable is set.
    fn is_empty(&self) -> bool {
        // Destructure without `..` so that adding a field forces this check to be revisited.
        let Self { hash_key } = self;
        hash_key.is_none()
    }
}

/// A set of named rule configurations.
///
/// Every serialized field is optional on input (falls back to its `Default`) and is left out of
/// the output when empty.
#[derive(Serialize, Deserialize, Debug, Default, Clone)]
pub struct PiiConfig {
    /// A map of custom PII rules.
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub rules: BTreeMap<String, RuleSpec>,

    /// Parameters for PII rules.
    #[serde(default, skip_serializing_if = "Vars::is_empty")]
    pub vars: Vars,

    /// Mapping of selectors to rules.
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub applications: BTreeMap<SelectorSpec, Vec<String>>,

    /// PII config derived from datascrubbing settings.
    ///
    /// Cached because the conversion process is expensive.
    ///
    /// Never serialized or deserialized; a deserialized config starts with an empty cache.
    #[serde(skip)]
    pub(super) compiled: OnceLock<CompiledPiiConfig>,
}

impl PartialEq for PiiConfig {
    fn eq(&self, other: &PiiConfig) -> bool {
        // This is written in this way such that people will not forget to update this PartialEq
        // impl when they add more fields.
        let PiiConfig {
            rules,
            vars,
            applications,
            compiled: _compiled,
        } = &self;

        rules == &other.rules && vars == &other.vars && applications == &other.applications
    }
}

impl PiiConfig {
    /// Returns a representation of this `PiiConfig` that is more (CPU-)efficient for processing.
    ///
    /// The first call builds that representation, which can be computationally expensive. The
    /// result is stored in an internal cache and returned directly on every later call.
    pub fn compiled(&self) -> &CompiledPiiConfig {
        let build = || self.compiled_uncached();
        self.compiled.get_or_init(build)
    }

    /// Builds the compiled representation from scratch on every call.
    ///
    /// Unlike [`compiled`](Self::compiled), this neither reads nor fills the internal cache.
    #[inline]
    pub fn compiled_uncached(&self) -> CompiledPiiConfig {
        let config = self;
        CompiledPiiConfig::new(config)
    }
}