document_pii/
pii_finder.rs

1//! This module takes the types and paths from 'item_collector' module, and will recursively find
2//! all the fields with the specified PII-value, and save those fields with the full path.
3//!
4//! E.g. `MyStruct.sub_struct.mystery_field.bitcoin_wallet_key`, meaning you can find the full path
5//! from the top-level type all the way down to wherever the field with the correct PII-value resides.
6
7use std::collections::{BTreeMap, BTreeSet, HashMap};
8
9use anyhow::anyhow;
10use proc_macro2::TokenTree;
11use syn::visit::Visit;
12use syn::{Attribute, Field, ItemEnum, ItemStruct, Meta, Path, Type, TypePath};
13
14use crate::EnumOrStruct;
15
/// One hop on the path to a PII-field: a type paired with the name of one of its fields.
#[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TypeAndField {
    /// Name of the type that declares the field. Meant to be the full path of the type, e.g.
    /// `relay_common::protocol::Event` rather than just `Event`.
    ///
    /// NOTE(review): the visitor in this file fills this from the bare identifier of the
    /// visited struct or enum — confirm whether the qualified form is still intended.
    pub qualified_type_name: String,
    /// Name of the field inside that type.
    pub field_ident: String,
}
23
24#[derive(Ord, PartialOrd, PartialEq, Eq, Hash, Clone, Debug, Default)]
25pub struct FieldsWithAttribute {
26    pub type_and_fields: Vec<TypeAndField>,
27    pub attributes: BTreeMap<String, Option<String>>,
28}
29
30impl FieldsWithAttribute {
31    pub fn has_attribute(&self, key: &str, expected_values: Option<&Vec<String>>) -> bool {
32        let actual_value = match self.attributes.get(key) {
33            Some(value) => value,
34            None => return false,
35        };
36
37        match (expected_values, actual_value) {
38            (None, None) => true,
39            (Some(expected_values), Some(actual_value)) => expected_values
40                .iter()
41                .any(|expected_value| expected_value == actual_value),
42            (_, _) => false,
43        }
44    }
45}
46
47fn get_type_paths_from_type(ty: &Type, type_paths: &mut Vec<TypePath>) {
48    match ty {
49        Type::Path(path) => type_paths.push(path.clone()),
50        Type::Reference(reference) => get_type_paths_from_type(&reference.elem, type_paths),
51        Type::Array(arr) => get_type_paths_from_type(&arr.elem, type_paths),
52        Type::BareFn(bare_fn) => bare_fn
53            .inputs
54            .iter()
55            .for_each(|ty| get_type_paths_from_type(&ty.ty, type_paths)),
56        Type::Group(group) => get_type_paths_from_type(&group.elem, type_paths),
57        Type::Paren(paren) => get_type_paths_from_type(&paren.elem, type_paths),
58        Type::Ptr(ptr) => get_type_paths_from_type(&ptr.elem, type_paths),
59        Type::Slice(slice) => get_type_paths_from_type(&slice.elem, type_paths),
60        Type::Tuple(tuple) => tuple
61            .elems
62            .iter()
63            .for_each(|ty| get_type_paths_from_type(ty, type_paths)),
64        Type::Verbatim(_)
65        | Type::TraitObject(_)
66        | Type::ImplTrait(_)
67        | Type::Infer(_)
68        | Type::Macro(_)
69        | Type::Never(_) => {}
70        _ => {}
71    }
72}
73
/// This is the visitor that actually generates the pii_types. It walks struct and enum
/// definitions and records every field that carries `metastructure` attributes, together with
/// the path of fields leading to it.
///
/// It has a lot of associated data because the methods of the `Visit` trait from the syn crate
/// have fixed signatures, so the data cannot be passed as arguments.
/// The 'pii_types' field can be regarded as the output.
pub struct PiiFinder<'a> {
    /// Module path of a type is the full path up to the type itself.
    ///
    /// Example: `relay_event_schema::protocol::Event` -> `relay_event_schema::protocol`
    pub module_path: String,
    /// Identifier of the struct or enum whose fields are currently being visited.
    pub current_type: String,
    /// The known struct and enum definitions; looked up with the paths found in `scoped_paths`.
    pub all_types: &'a HashMap<String, EnumOrStruct>,
    /// Looked up by module path: the full paths of rust types either defined in the module or
    /// brought in to scope with a use-statement.
    pub scoped_paths: &'a BTreeMap<String, BTreeSet<String>>,
    /// The fields entered so far on the way to the field currently being visited; a field is
    /// pushed when its visit starts and popped when it ends.
    pub current_path: Vec<TypeAndField>,
    /// Every attributed field found so far, each with a copy of the path leading to it.
    pub pii_types: BTreeSet<FieldsWithAttribute>, // output
}
89
90impl<'a> PiiFinder<'a> {
91    pub fn new(
92        path: &str,
93        all_types: &'a HashMap<String, EnumOrStruct>,
94        scoped_paths: &'a BTreeMap<String, BTreeSet<String>>,
95    ) -> anyhow::Result<Self> {
96        let module_path = path
97            .rsplit_once("::")
98            .ok_or_else(|| anyhow!("invalid module path: {}", path))?
99            .0
100            .to_owned();
101
102        Ok(Self {
103            module_path,
104            current_type: String::new(),
105            all_types,
106            scoped_paths,
107            current_path: vec![],
108            pii_types: BTreeSet::new(),
109        })
110    }
111
112    fn visit_type_path(&mut self, path: &TypePath) {
113        let scoped_paths = self.scoped_paths.get(&self.module_path).unwrap().clone();
114
115        let mut field_types = BTreeSet::new();
116        get_field_types(&path.path, &mut field_types);
117
118        let use_paths = get_matching_scoped_paths(&field_types, &scoped_paths);
119        for use_path in use_paths {
120            if let Some(enum_or_struct) = self.all_types.get(use_path).cloned() {
121                // Theses values will be changed when recursing, so we save them here so when we
122                // return to this function after the match statement, we can set them back.
123                let current_type = self.current_type.clone();
124                let module_path = self.module_path.clone();
125                use_path
126                    .rsplit_once("::")
127                    .unwrap()
128                    .0
129                    .clone_into(&mut self.module_path);
130
131                match enum_or_struct {
132                    EnumOrStruct::Struct(itemstruct) => self.visit_item_struct(&itemstruct),
133                    EnumOrStruct::Enum(itemenum) => self.visit_item_enum(&itemenum),
134                }
135
136                self.module_path = module_path;
137                self.current_type = current_type;
138            }
139        }
140    }
141
142    fn visit_field_types(&mut self, ty: &Type) {
143        let mut type_paths = vec![];
144        get_type_paths_from_type(ty, &mut type_paths);
145
146        for path in type_paths {
147            self.visit_type_path(&path);
148        }
149    }
150
151    /// Checks if the type we are on has already been visited, this is to avoid infinite recursion.
152    fn is_current_type_already_visited(&self) -> bool {
153        self.current_path
154            .iter()
155            .any(|ty| ty.qualified_type_name == self.current_type)
156    }
157}
158
159impl<'ast> Visit<'ast> for PiiFinder<'_> {
160    fn visit_item_struct(&mut self, node: &'ast ItemStruct) {
161        self.current_type = node.ident.to_string();
162        if !self.is_current_type_already_visited() {
163            for field in node.fields.iter() {
164                self.visit_field(field);
165            }
166        }
167    }
168
169    fn visit_item_enum(&mut self, node: &'ast ItemEnum) {
170        self.current_type = node.ident.to_string();
171        if !self.is_current_type_already_visited() {
172            for variant in node.variants.iter() {
173                for field in variant.fields.iter() {
174                    self.visit_field(field);
175                }
176            }
177        }
178    }
179
180    fn visit_field(&mut self, node: &'ast Field) {
181        // Every time we visit a field, we have to append the field to the current_path, it gets
182        // popped in the end of this function. This is done so that we can store the full path
183        // whenever the field matches a correct PII value.
184        self.current_path.push(TypeAndField {
185            qualified_type_name: self.current_type.clone(),
186            field_ident: node
187                .clone()
188                .ident
189                .map(|x| x.to_string())
190                .unwrap_or_else(|| "{{Unnamed}}".to_string()),
191        });
192
193        let mut all_attributes = BTreeMap::new();
194        for attr in &node.attrs {
195            if let Some(mut attributes) = get_attributes(attr, "metastructure") {
196                all_attributes.append(&mut attributes);
197            }
198        }
199
200        if !all_attributes.is_empty() {
201            self.pii_types.insert(FieldsWithAttribute {
202                type_and_fields: self.current_path.clone(),
203                attributes: all_attributes,
204            });
205        }
206
207        // Recursively diving into the types of the field to look for more PII-fields.
208        self.visit_field_types(&node.ty);
209
210        self.current_path.pop();
211    }
212}
213
/// Picks the scoped paths whose last `::`-separated segment equals one of the given type names.
///
/// Both the type names and the last segments are compared with surrounding whitespace
/// trimmed. The matches are returned in the iteration order of `scoped_paths`.
fn get_matching_scoped_paths<'a>(
    field_types: &'a BTreeSet<String>,
    scoped_paths: &'a BTreeSet<String>,
) -> Vec<&'a String> {
    let wanted_names: Vec<&str> = field_types.iter().map(|name| name.trim()).collect();

    let mut matching = Vec::new();
    for candidate in scoped_paths {
        // `split` always yields at least one item, so there is always a last segment.
        let last_segment = candidate.split("::").last().map(str::trim);
        if wanted_names
            .iter()
            .any(|wanted| Some(*wanted) == last_segment)
        {
            matching.push(candidate);
        }
    }
    matching
}
229
230/// This function extracts the type names from a complex type and stores them in a BTreeSet.
231/// It's designed to handle nested generic types, such as `Foo<Bar<Baz>>`, and return ["Foo", "Bar", "Baz"].
232fn get_field_types(path: &Path, segments: &mut BTreeSet<String>) {
233    // Iterating over path segments allows us to handle complex, possibly nested types
234    let mut path_iter = path.segments.iter();
235    if let Some(first_segment) = path_iter.next() {
236        let mut ident = first_segment.ident.to_string();
237
238        // Recursion on AngleBracketed args is necessary for nested generic types
239        if let syn::PathArguments::AngleBracketed(angle_bracketed) = &first_segment.arguments {
240            for generic_arg in angle_bracketed.args.iter() {
241                if let syn::GenericArgument::Type(Type::Path(path)) = generic_arg {
242                    get_field_types(&path.path, segments);
243                }
244            }
245        }
246
247        // Namespace resolution: if a second segment exists, it's part of the first type's namespace
248        if let Some(second_segment) = path_iter.next() {
249            ident.push_str("::");
250            ident.push_str(&second_segment.ident.to_string());
251        }
252        segments.insert(ident);
253    }
254}
255
256/// Collects all the attributes from a given field.
257fn get_attributes(attr: &Attribute, ident: &str) -> Option<BTreeMap<String, Option<String>>> {
258    let meta_list = match &attr.meta {
259        Meta::List(meta_list) => meta_list,
260        _ => return None,
261    };
262
263    // Checks name of attribute, E.g. 'metastructure'
264    if !meta_list.path.is_ident(ident) {
265        return None;
266    }
267
268    let mut attributes = BTreeMap::<String, Option<String>>::new();
269
270    let mut ident = String::new();
271    let mut literal = None;
272    for token in meta_list.tokens.clone().into_iter() {
273        match token {
274            TokenTree::Ident(new_ident) => {
275                if !ident.is_empty() {
276                    attributes.insert(ident.clone(), literal.clone());
277                }
278                ident = new_ident.to_string();
279                literal = None;
280            }
281            TokenTree::Literal(lit) => {
282                let mut as_string = lit.to_string();
283
284                // remove quotes
285                as_string.remove(0);
286                as_string.pop();
287
288                literal = Some(as_string);
289            }
290            TokenTree::Group(_) | TokenTree::Punct(_) => {}
291        }
292    }
293
294    if !ident.is_empty() {
295        attributes.insert(ident, literal);
296    }
297
298    Some(attributes)
299}