document_pii/
item_collector.rs

1//! Contains the helper types and functions which help to iterate over all the given rust files,
2//! collect all of the full path names and the actual AST-node for the types defined in those paths.
3//! This is later needed for finding PII fields recursively.
4
5use std::collections::{BTreeMap, BTreeSet, HashMap};
6use std::fs::{self, DirEntry};
7use std::io::BufRead;
8use std::path::{Path, PathBuf};
9
10use anyhow::anyhow;
11use syn::punctuated::Punctuated;
12use syn::visit::Visit;
13use syn::{ItemEnum, ItemStruct, UseTree};
14
15use crate::EnumOrStruct;
16use crate::pii_finder::{FieldsWithAttribute, PiiFinder};
17
/// The collected types of the visited files together with the paths that are in scope per module.
///
/// Used as the starting point for looking up PII fields, either for a single type or for every
/// collected type.
pub struct TypesAndScopedPaths {
    /// Maps the path name of an item to its actual AST node.
    pub all_types: HashMap<String, EnumOrStruct>,
    /// Maps the paths in scope to different modules. For use in constructing the full path
    /// of an item from its type name.
    pub scoped_paths: BTreeMap<String, BTreeSet<String>>,
}
25
26impl TypesAndScopedPaths {
27    pub fn find_pii_fields(
28        &self,
29        type_path: Option<&str>,
30        pii_values: &Vec<String>,
31    ) -> anyhow::Result<BTreeSet<FieldsWithAttribute>> {
32        let fields = match type_path {
33            // If user provides path to an item, find PII_fields under this item in particular.
34            Some(path) => self.find_pii_fields_of_type(path),
35            // If no item is provided, find PII fields of all types in crate/workspace.
36            None => self.find_pii_fields_of_all_types(),
37        }?;
38
39        Ok(fields
40            .into_iter()
41            .filter(|pii| {
42                pii.has_attribute("pii", Some(pii_values))
43                    && (pii.has_attribute("retain", Some(&vec!["true".to_owned()]))
44                        || !pii.has_attribute("additional_properties", None))
45            })
46            .collect())
47    }
48
49    /// Finds all the PII fields recursively of a given type.
50    fn find_pii_fields_of_type(
51        &self,
52        type_path: &str,
53    ) -> anyhow::Result<BTreeSet<FieldsWithAttribute>> {
54        let mut visitor = PiiFinder::new(type_path, &self.all_types, &self.scoped_paths)?;
55
56        let value = &self
57            .all_types
58            .get(type_path)
59            .ok_or_else(|| anyhow!("Unable to find item with following path: {}", type_path))?;
60
61        match value {
62            EnumOrStruct::Struct(itemstruct) => visitor.visit_item_struct(itemstruct),
63            EnumOrStruct::Enum(itemenum) => visitor.visit_item_enum(itemenum),
64        };
65        Ok(visitor.pii_types)
66    }
67
68    /// Finds all the PII fields recursively of all the types in the rust crate/workspace.
69    fn find_pii_fields_of_all_types(&self) -> anyhow::Result<BTreeSet<FieldsWithAttribute>> {
70        let mut pii_types = BTreeSet::new();
71
72        for type_path in self.all_types.keys() {
73            pii_types.extend(self.find_pii_fields_of_type(type_path)?);
74        }
75
76        Ok(pii_types)
77    }
78}
79
/// The types and use statements items collected from the rust files.
#[derive(Default)]
pub struct AstItemCollector {
    /// Module path of the file that is currently being visited. It is overwritten for every
    /// file before that file's syntax tree is walked.
    module_path: String,
    /// Maps from the full path of a type to its AST node.
    all_types: HashMap<String, EnumOrStruct>,
    /// Maps from a module_path to all the types that are in the module's scope.
    scoped_paths: BTreeMap<String, BTreeSet<String>>,
}
89
90impl AstItemCollector {
91    fn insert_scoped_paths(&mut self, use_statements: Vec<String>) {
92        self.scoped_paths
93            .entry(self.module_path.clone())
94            .or_default()
95            .extend(use_statements);
96    }
97
98    /// Gets both a mapping of the full path to a type and its actual AST node, and also the
99    /// use_statements in its module, which is needed to fetch the types that it referes to in its
100    /// fields.
101    pub fn collect(paths: &[PathBuf]) -> anyhow::Result<TypesAndScopedPaths> {
102        let mut visitor = Self::default();
103
104        visitor.visit_files(paths)?;
105
106        Ok(TypesAndScopedPaths {
107            all_types: visitor.all_types,
108            scoped_paths: visitor.scoped_paths,
109        })
110    }
111
112    fn visit_files(&mut self, paths: &[PathBuf]) -> anyhow::Result<()> {
113        for path in paths {
114            self.module_path = module_name_from_file(path)?;
115
116            let syntax_tree: syn::File = {
117                let file_content = fs::read_to_string(path.as_path())?;
118                syn::parse_file(&file_content)?
119            };
120
121            self.visit_file(&syntax_tree);
122        }
123        Ok(())
124    }
125}
126
127impl<'ast> Visit<'ast> for AstItemCollector {
128    fn visit_item_struct(&mut self, node: &'ast ItemStruct) {
129        let struct_name = format!("{}::{}", self.module_path, node.ident);
130        self.insert_scoped_paths(vec![struct_name.clone()]);
131        self.all_types
132            .insert(struct_name, EnumOrStruct::Struct(node.clone()));
133    }
134
135    fn visit_item_enum(&mut self, node: &'ast ItemEnum) {
136        let enum_name = format!("{}::{}", self.module_path, node.ident);
137        self.insert_scoped_paths(vec![enum_name.clone()]);
138        self.all_types
139            .insert(enum_name, EnumOrStruct::Enum(node.clone()));
140    }
141
142    fn visit_item_use(&mut self, i: &'ast syn::ItemUse) {
143        let use_statements = usetree_to_paths(&i.tree, &self.module_path)
144            .iter()
145            .filter(|s| s.contains("relay"))
146            .cloned()
147            .collect();
148
149        self.insert_scoped_paths(use_statements);
150    }
151}
152
/// Turns a flattened use-path into an absolute, whitespace-free path.
///
/// * All spaces are removed and `-` is replaced by `_`.
/// * A leading `crate::` is replaced by `<crate_root>::`.
/// * Every leading `super::` moves one module up from `module_path`; the remainder of the path
///   is appended to the resulting ancestor module, joined by `::`. If there is no ancestor left,
///   the remainder is returned on its own.
///
/// The `crate::`/`super::` qualifiers are only recognized at the start of the path, so a segment
/// that merely ends in `crate` (e.g. `my_crate::Foo`) is left untouched.
fn normalize_type_path(path: String, crate_root: &str, module_path: &str) -> String {
    let path = path.replace(' ', "").replace('-', "_");

    if let Some(rest) = path.strip_prefix("crate::") {
        return format!("{crate_root}::{rest}");
    }

    if path.starts_with("super::") {
        let mut ancestors: Vec<&str> = module_path.split("::").collect();
        let mut rest = path.as_str();

        // Each `super::` climbs exactly one level.
        while let Some(stripped) = rest.strip_prefix("super::") {
            ancestors.pop();
            rest = stripped;
        }

        if ancestors.is_empty() {
            return rest.to_owned();
        }
        // The separator between the ancestor module and the remainder must be re-inserted,
        // because stripping `super::` also removed the `::`.
        return format!("{}::{}", ancestors.join("::"), rest);
    }

    path
}
169
170/// First flattens the UseTree and then normalizing the paths.
171fn usetree_to_paths(use_tree: &UseTree, module_path: &str) -> Vec<String> {
172    let crate_root = module_path.split_once("::").map_or(module_path, |s| s.0);
173    let paths = flatten_use_tree(
174        syn::Path {
175            leading_colon: None,
176            segments: Punctuated::new(),
177        },
178        use_tree,
179    );
180
181    paths
182        .into_iter()
183        .map(|path| normalize_type_path(path, crate_root, module_path))
184        .collect()
185}
186
187/// Flattens a usetree.
188///
189/// For example: `use protocol::{Foo, Bar, Baz}` into `[protocol::Foo, protocol::Bar,
190/// protocol::Baz]`.
191fn flatten_use_tree(mut leading_path: syn::Path, use_tree: &UseTree) -> Vec<String> {
192    match use_tree {
193        UseTree::Path(use_path) => {
194            leading_path.segments.push(use_path.ident.clone().into());
195            flatten_use_tree(leading_path, &use_path.tree)
196        }
197        UseTree::Name(use_name) => {
198            leading_path.segments.push(use_name.ident.clone().into());
199            vec![quote::quote!(#leading_path).to_string()]
200        }
201        UseTree::Group(use_group) => {
202            let mut paths = Vec::new();
203            for item in &use_group.items {
204                paths.extend(flatten_use_tree(leading_path.clone(), item));
205            }
206            paths
207        }
208
209        UseTree::Rename(use_rename) => {
210            leading_path.segments.push(use_rename.rename.clone().into());
211            vec![quote::quote!(#leading_path).to_string()]
212        }
213        // Currently this script can't handle glob imports, which, we shouldn't use anyway.
214        UseTree::Glob(_) => vec![quote::quote!(#leading_path).to_string()],
215    }
216}
217
218fn crate_name_from_file(file_path: &Path) -> anyhow::Result<String> {
219    // We know the crate_name is located like home/foo/bar/crate_name/src/...
220    // We therefore first find the index of the '/' to the left of src, then we find the index
221    // of the '/' to the left of that, and the crate_name will be whats between those indexes.
222    let file_str = file_path.to_string_lossy();
223
224    let src_index = file_str
225        .find("/src/")
226        .or_else(|| file_str.find("\\src\\"))
227        .ok_or_else(|| {
228            anyhow!(
229                "Invalid file path (missing '/src/' or '\\src\\'): {}",
230                file_path.display()
231            )
232        })?;
233
234    let back_index = file_str[..src_index]
235        .rfind('/')
236        .or_else(|| file_str[..src_index].rfind('\\'))
237        .ok_or_else(|| {
238            anyhow!(
239                "Invalid file path (missing separator before '/src/' or '\\src\\'): {}",
240                file_path.display()
241            )
242        })?
243        + 1;
244
245    Ok(file_str
246        .split_at(src_index)
247        .0
248        .split_at(back_index)
249        .1
250        .to_string())
251}
252
253fn add_file_stem_to_module_path(
254    file_path: &Path,
255    module_path: &mut Vec<String>,
256) -> anyhow::Result<()> {
257    let file_stem = file_path
258        .file_stem()
259        .ok_or_else(|| {
260            anyhow!(
261                "Invalid file path (unable to find file stem): {}",
262                file_path.display()
263            )
264        })?
265        .to_string_lossy()
266        .into_owned();
267
268    module_path.push(file_stem);
269    Ok(())
270}
271
272/// Takes in the path to a Rust file and returns the path as you'd refer to it in a use-statement.
273///
274/// e.g. `"relay/relay-event_schema/src/protocol/types.rs"` -> `"relay_event_schema::protocol"`.
275fn module_name_from_file(file_path: &Path) -> anyhow::Result<String> {
276    let mut module_path = file_path
277        .parent()
278        .ok_or_else(|| {
279            anyhow!(
280                "Invalid file path (unable to find parent directory): {}",
281                file_path.display()
282            )
283        })?
284        .components()
285        .map(|part| part.as_os_str().to_string_lossy().into_owned())
286        .filter(|part| part != "src")
287        .collect::<Vec<String>>();
288
289    if is_file_module(file_path)? {
290        add_file_stem_to_module_path(file_path, &mut module_path)?;
291    }
292
293    let crate_name = crate_name_from_file(file_path).unwrap();
294
295    // Removes all the folders before the crate name, and concatenates to a string.
296    Ok(module_path
297        .iter()
298        .position(|s| s == &crate_name)
299        .map(|index| &module_path[index..])
300        .ok_or_else(|| anyhow!("Couldn't find crate name {}.", crate_name))?
301        .join("::")
302        .replace('-', "_"))
303}
304
305fn is_file_declared_from_mod_file(parent_dir: &Path, file_stem: &str) -> anyhow::Result<bool> {
306    let mod_rs_path = parent_dir.join("mod.rs");
307    if !mod_rs_path.exists() {
308        return Ok(false);
309    }
310    // If "mod.rs" exists, we need to check if it declares the file in question as a module.
311    // The declaration line would start with "pub mod" and contain the file stem.
312    let mod_rs_file: fs::File = fs::File::open(mod_rs_path)?;
313    let reader = std::io::BufReader::new(mod_rs_file);
314
315    for line in reader.lines() {
316        let line = line?;
317        if line.trim().starts_with("pub mod") && line.contains(file_stem) {
318            return Ok(true);
319        }
320    }
321    Ok(false)
322}
323
324fn is_file_declared_from_other_file(
325    entry: &DirEntry,
326    file_stem: &str,
327    file_path: &Path,
328) -> anyhow::Result<bool> {
329    let path = entry.path();
330
331    if path.is_file() && path.extension().is_some_and(|ext| ext == "rs") && path != *file_path {
332        // Read the file and search for the same declaration pattern: "pub mod" and file stem.
333        let file = fs::File::open(path)?;
334        let reader = std::io::BufReader::new(file);
335
336        for line in reader.lines() {
337            let line = line?;
338            if line.trim().starts_with("pub mod") && line.contains(file_stem) {
339                return Ok(true);
340            }
341        }
342    }
343    Ok(false)
344}
345
346// Checks if a file is a Rust module.
347fn is_file_module(file_path: &Path) -> anyhow::Result<bool> {
348    let parent_dir = file_path
349        .parent()
350        .ok_or_else(|| anyhow!("Invalid file path: {}", file_path.display()))?;
351    let file_stem = file_path
352        .file_stem()
353        .ok_or_else(|| anyhow!("Invalid file path: {}", file_path.display()))?
354        .to_string_lossy();
355
356    if is_file_declared_from_mod_file(parent_dir, &file_stem)? {
357        return Ok(true);
358    }
359
360    for entry in fs::read_dir(parent_dir)? {
361        let entry = entry?;
362        if is_file_declared_from_other_file(&entry, &file_stem, file_path)? {
363            return Ok(true);
364        }
365    }
366
367    Ok(false)
368}