// relay_filter/web_crawlers.rs

//! Filters events coming from user agents known to be web crawlers.

use once_cell::sync::Lazy;
use regex::Regex;

use crate::{FilterConfig, FilterStatKey, Filterable};

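// `(?ix)` turns on case-insensitive matching (`i`) and verbose mode (`x`):
// whitespace inside the pattern is ignored and `#` starts a comment that runs
// to the end of the line.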
static WEB_CRAWLERS: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"(?ix)
        Mediapartners-Google|
        AdsBot-Google|
        Googlebot|
        FeedFetcher-Google|
        BingBot|                    # Bing search
        BingPreview|
        Baiduspider|                # Baidu search
        Slurp|                      # Yahoo
        Sogou|                      # Sogou
        facebook|                   # facebook
        ia_archiver|                # Alexa
        bots?[/\s\);]|              # Generic bot
        spider[/\s\);]|             # Generic spider
        Slack|                      # Slack - see https://api.slack.com/robots
        Calypso\sAppCrawler|        # Google indexing bot
        pingdom|                    # Pingdom
        lyticsbot|                  # Lytics
        AWS\sSecurity\sScanner|     # AWS Security Scanner causing DisallowedHost errors in Django, see
                                    # https://forums.aws.amazon.com/thread.jspa?messageID=932404
                                    # and https://github.com/getsentry/sentry-python/issues/641
        HubSpot\sCrawler|           # HubSpot web crawler (web-crawlers@hubspot.com)
        Bytespider|                 # Bytedance
        Better\sUptime|             # Better Uptime
        Cloudflare-Healthchecks|    # Cloudflare Health Checks
        GTmetrix|                   # GTmetrix
        BrightEdgeOnCrawl|          # BrightEdge - see https://www.brightedge.com/news/press-releases/brightedge-acquires-oncrawl-future-proof-web-30-strategies
        ELB-HealthChecker|          # AWS Elastic Load Balancing Health Checks
        naver.me/spd|               # Yeti/1.1 - naver.me
        ClaudeBot|                  # Anthropic - see https://support.anthropic.com/en/articles/10023637-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler
        CCBot|                      # CCBot - see https://commoncrawl.org/ccbot
        OAI-SearchBot|              # OpenAI - see https://platform.openai.com/docs/bots
        GPTBot                      # OpenAI - see https://platform.openai.com/docs/bots
    ",
    )
    .expect("Invalid web crawlers filter Regex")
});

static ALLOWED_WEB_CRAWLERS: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"(?ix)
        Slackbot\s1\.\d+|            # Slack - see https://api.slack.com/robots
        SentryUptimeBot              # Sentry Uptime Checker - see https://docs.sentry.io/product/alerts/uptime-monitoring/
    ",
    )
    .expect("Invalid allowed web crawlers filter Regex")
});

/// Returns `true` if the given user agent belongs to a known web crawler and
/// is not on the allowlist. A missing user agent never matches.
fn matches(user_agent: Option<&str>) -> bool {
    if let Some(user_agent) = user_agent {
        WEB_CRAWLERS.is_match(user_agent) && !ALLOWED_WEB_CRAWLERS.is_match(user_agent)
    } else {
        false
    }
}

/// Filters events originating from a known web crawler.
pub fn should_filter<F: Filterable>(item: &F, config: &FilterConfig) -> Result<(), FilterStatKey> {
    if !config.is_enabled {
        return Ok(());
    }

    if matches(item.user_agent()) {
        return Err(FilterStatKey::WebCrawlers);
    }

    Ok(())
}
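
// Usage sketch: for any `item` implementing `Filterable` whose user agent is
// reported as "Googlebot", enabling the filter should yield
// `Err(FilterStatKey::WebCrawlers)`:
//
//     should_filter(&item, &FilterConfig { is_enabled: true })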

#[cfg(test)]
mod tests {
    use super::*;
    use crate::testutils;

    #[test]
    fn test_filter_when_disabled() {
        let evt = testutils::get_event_with_user_agent("Googlebot");
        let filter_result = should_filter(&evt, &FilterConfig { is_enabled: false });
        assert_eq!(
            filter_result,
            Ok(()),
            "Event was filtered although the filter is disabled"
        );
    }
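
    // Additional sanity check for the `None` branch of `matches`: an event
    // without a user agent is never flagged as a web crawler.
    #[test]
    fn test_dont_filter_when_user_agent_is_missing() {
        assert!(!matches(None), "Filtered an event with no user agent");
    }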

    #[test]
    fn test_filter_banned_user_agents() {
        let user_agents = [
            "Mediapartners-Google",
            "AdsBot-Google",
            "Googlebot",
            "FeedFetcher-Google",
            "BingBot",
            "BingPreview",
            "Baiduspider",
            "Slurp",
            "Sogou",
            "facebook",
            "ia_archiver",
            "bots ",
            "bots;",
            "bots)",
            "spider ",
            "spider;",
            "spider)",
            "Calypso AppCrawler",
            "pingdom",
            "lyticsbot",
            "AWS Security Scanner",
            "Mozilla/5.0 (Linux; Android 6.0.1; Calypso AppCrawler Build/MMB30Y; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.124 Mobile Safari/537.36",
            "Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)",
            "Slack-ImgProxy 0.19 (+https://api.slack.com/robots)",
            "Twitterbot/1.0",
            "FeedFetcher-Google; (+http://www.google.com/feedfetcher.html)",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "AdsBot-Google (+http://www.google.com/adsbot.html)",
            "Mozilla/5.0 (compatible; HubSpot Crawler; web-crawlers@hubspot.com)",
            "Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; Bytespider; spider-feedback@bytedance.com)",
            "Better Uptime Bot Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
            "Mozilla/5.0 (compatible;Cloudflare-Healthchecks/1.0;+https://www.cloudflare.com/; healthcheck-id: 0d1ca23e292c8c14)",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 GTmetrix",
            "Mozilla/5.0 (compatible; BrightEdgeOnCrawl/1.0; +http://www.oncrawl.com)",
            "ELB-HealthChecker/2.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko; compatible; Yeti/1.1; +https://naver.me/spd) Chrome/127.0.0.0 Safari/537.36",
            "Mozilla/5.0; ClaudeBot",
            "Mozilla/5.0; CCBot",
            "; OAI-SearchBot/1.0; +https://openai.com/searchbot",
            "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot",
        ];

        for banned_user_agent in &user_agents {
            let event = testutils::get_event_with_user_agent(banned_user_agent);
            let filter_result = should_filter(&event, &FilterConfig { is_enabled: true });
            assert_ne!(
                filter_result,
                Ok(()),
                "Failed to filter web crawler with user agent '{banned_user_agent}'"
            );
        }
    }

    #[test]
    fn test_dont_filter_normal_user_agents() {
        let normal_user_agents = [
            "some user agent",
            "IE",
            "ie",
            "opera",
            "safari",
            "APIs-Google (+https://developers.google.com/webmasters/APIs-Google.html)",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
            "Slackbot 1.0(+https://api.slack.com/robots)",
            "SentryUptimeBot/1.0 (+http://docs.sentry.io/product/alerts/uptime-monitoring/)",
            "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot",
        ];
        for user_agent in &normal_user_agents {
            let event = testutils::get_event_with_user_agent(user_agent);
            let filter_result = should_filter(&event, &FilterConfig { is_enabled: true });
            assert_eq!(
                filter_result,
                Ok(()),
                "Filtered benign user agent '{user_agent}'"
            );
        }
    }
}
176}