relay_filter/web_crawlers.rs

//! Filters events coming from user agents known to be web crawlers.
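//!
//! A user agent is filtered when it matches `WEB_CRAWLERS` and does not also
//! match `ALLOWED_WEB_CRAWLERS`.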

use std::sync::LazyLock;

use regex::Regex;

use crate::{FilterConfig, FilterStatKey, Filterable};

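// The `(?ix)` prefix below enables case-insensitive matching (`i`) and
// verbose mode (`x`), in which unescaped whitespace in the pattern is ignored
// and `#` starts a comment, which is what allows the inline annotations.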
static WEB_CRAWLERS: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"(?ix)
        Mediapartners-Google|
        AdsBot-Google|
        Googlebot|
        FeedFetcher-Google|
        Storebot-Google|
        BingBot|                    # Bing search
        BingPreview|
        Baiduspider|                # Baidu search
        Slurp|                      # Yahoo
        Sogou|                      # Sogou
        facebook|                   # facebook
        meta-|                      # meta/facebook
        ia_archiver|                # Alexa
        bots?([/\s\);]|$)|          # Generic bot
        spider([/\s\);]|$)|         # Generic spider
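                                    # The group after bot/spider requires a separator (`/`, whitespace,
                                    # `)`, `;`) or end of string, so words like "botanical" do not match.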
        Slack|                      # Slack - see https://api.slack.com/robots
        Calypso\sAppCrawler|        # Google indexing bot
        pingdom|                    # Pingdom
        lyticsbot|                  # Lytics
        AWS\sSecurity\sScanner|     # AWS Security Scanner causing DisallowedHost errors in Django, see
                                    # https://forums.aws.amazon.com/thread.jspa?messageID=932404
                                    # and https://github.com/getsentry/sentry-python/issues/641
        HubSpot\sCrawler|           # HubSpot web crawler (web-crawlers@hubspot.com)
        Bytespider|                 # Bytedance
        Better\sUptime|             # Better Uptime
        Cloudflare-Healthchecks|    # Cloudflare Health Checks
        GTmetrix|                   # GTmetrix
        BrightEdgeOnCrawl|          # BrightEdge - see https://www.brightedge.com/news/press-releases/brightedge-acquires-oncrawl-future-proof-web-30-strategies
        ELB-HealthChecker|          # AWS Elastic Load Balancing Health Checks
        naver.me/spd|               # Yeti/1.1 - naver.me
        ClaudeBot|                  # Anthropic - see https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler
        CCBot|                      # CCBot - see https://commoncrawl.org/ccbot
        OAI-SearchBot|              # OpenAI - see https://platform.openai.com/docs/bots
        GPTBot|                     # OpenAI - see https://platform.openai.com/docs/bots
        PerplexityBot|              # Perplexity - see https://docs.perplexity.ai/guides/bots
        Applebot|                   # Apple - see https://support.apple.com/en-us/119829
        DuckDuckBot                 # DuckDuckGo - see https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
    "
    )
    .expect("Invalid web crawlers filter Regex")
});

static ALLOWED_WEB_CRAWLERS: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"(?ix)
        Slackbot\s1\.\d+|            # Slack - see https://api.slack.com/robots
        SentryUptimeBot|             # Uptime Checker https://docs.sentry.io/product/alerts/uptime-monitoring/
        ChatGPT-User                 # ChatGPT user-prompted requests
    ",
    )
    .expect("Invalid allowed web crawlers filter Regex")
});

/// Checks if a user agent matches a known web crawler that is not explicitly
/// allowed.
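///
/// For example, `Slackbot 1.0 (+https://api.slack.com/robots)` matches the
/// `Slack` entry in `WEB_CRAWLERS` but is rescued by the `Slackbot\s1\.\d+`
/// allowlist entry, so it is not filtered.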
fn matches(user_agent: &str) -> bool {
    WEB_CRAWLERS.is_match(user_agent) && !ALLOWED_WEB_CRAWLERS.is_match(user_agent)
}

/// Filters events originating from a known web crawler.
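///
/// Returns `Ok(())` if the event should be kept, and
/// `Err(FilterStatKey::WebCrawlers)` if it matches a known web crawler and
/// should be dropped.
///
/// # Example
///
/// A minimal sketch; `event` (anything implementing `Filterable`) and
/// `config` are assumed to be supplied by the caller:
///
/// ```ignore
/// if let Err(stat_key) = should_filter(&event, &config) {
///     // Drop the event and count it under `stat_key`.
/// }
/// ```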
pub fn should_filter<F: Filterable>(item: &F, config: &FilterConfig) -> Result<(), FilterStatKey> {
    if !config.is_enabled {
        return Ok(());
    }

    let user_agent = item.user_agent();
    let family = user_agent.parsed.as_ref().map(|ua| ua.family.as_ref());

    // Use the raw user agent if it is available, as it is higher quality. For example, some user
    // agents may be parsed as `Other` while the raw user agent would be filtered.
    //
    // Fall back to the parsed user agent, since in some circumstances only that is available.
    if user_agent.raw.or(family).is_some_and(matches) {
        return Err(FilterStatKey::WebCrawlers);
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{UserAgent, testutils};

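    // A minimal `Filterable` implementation so tests can construct a
    // `UserAgent` directly instead of building a full event.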
    #[derive(Debug)]
    struct TestFilterable<'a>(UserAgent<'a>);

    impl<'a> Filterable for TestFilterable<'a> {
        fn user_agent(&self) -> UserAgent<'_> {
            self.0.clone()
        }
    }

    #[test]
    fn test_filter_when_disabled() {
        let evt = testutils::get_event_with_user_agent("Googlebot");
        let filter_result = should_filter(&evt, &FilterConfig { is_enabled: false });
        assert_eq!(
            filter_result,
            Ok(()),
            "Event was filtered although the filter is disabled"
        );
    }

    #[test]
    fn test_filter_banned_user_agents() {
        let user_agents = [
            "Mediapartners-Google",
            "AdsBot-Google",
            "Googlebot",
            "FeedFetcher-Google",
            "Storebot-Google",
            "Mozilla/5.0 (X11; Linux x86_64; Storebot-Google/1.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
            "BingBot",
            "BingPreview",
            "Baiduspider",
            "Slurp",
            "Sogou",
            "facebook",
            "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)",
            "facebookcatalog/1.0",
            "meta-externalagent/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler)",
            "meta-externalfetcher/1.1",
            "ia_archiver",
            "bots ",
            "bots;",
            "bots)",
            "spider ",
            "spider;",
            "spider)",
            "Calypso AppCrawler",
            "pingdom",
            "lyticsbot",
            "AWS Security Scanner",
            "Mozilla/5.0 (Linux; Android 6.0.1; Calypso AppCrawler Build/MMB30Y; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.124 Mobile Safari/537.36",
            "Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)",
            "Slack-ImgProxy 0.19 (+https://api.slack.com/robots)",
            "Twitterbot/1.0",
            "FeedFetcher-Google; (+http://www.google.com/feedfetcher.html)",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "AdsBot-Google (+http://www.google.com/adsbot.html)",
            "Mozilla/5.0 (compatible; HubSpot Crawler; web-crawlers@hubspot.com)",
            "Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; Bytespider; spider-feedback@bytedance.com)",
            "Better Uptime Bot Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
            "Mozilla/5.0 (compatible;Cloudflare-Healthchecks/1.0;+https://www.cloudflare.com/; healthcheck-id: 0d1ca23e292c8c14)",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 GTmetrix",
            "Mozilla/5.0 (compatible; BrightEdgeOnCrawl/1.0; +http://www.oncrawl.com)",
            "ELB-HealthChecker/2.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko; compatible; Yeti/1.1; +https://naver.me/spd) Chrome/127.0.0.0 Safari/537.36",
            "Mozilla/5.0; ClaudeBot",
            "Mozilla/5.0; CCBot",
            "; OAI-SearchBot/1.0; +https://openai.com/searchbot",
            "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot",
            "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; PerplexityBot/1.0; +https://perplexity.ai/perplexitybot)",
            "Mozilla/5.0 (Device; OS_version) AppleWebKit/WebKit_version (KHTML, like Gecko)Version/Safari_version [Mobile/Mobile_version] Safari/WebKit_version (Applebot/Applebot_version; +http://www.apple.com/go/applebot)",
            "DuckDuckBot/1.1; (+http://duckduckgo.com/duckduckbot.html)",
        ];

        for banned_user_agent in &user_agents {
            let event = testutils::get_event_with_user_agent(banned_user_agent);
            let filter_result = should_filter(&event, &FilterConfig { is_enabled: true });
            assert_ne!(
                filter_result,
                Ok(()),
                "Failed to filter web crawler with user agent '{banned_user_agent}'"
            );
        }
    }
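
    // A minimal additional check (a sketch using the same `testutils` helper
    // as above): the `(?i)` flag should make matching case-insensitive.
    #[test]
    fn test_filter_is_case_insensitive() {
        let event = testutils::get_event_with_user_agent("googlebot");
        let filter_result = should_filter(&event, &FilterConfig { is_enabled: true });
        assert_ne!(filter_result, Ok(()));
    }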

    #[test]
    fn test_dont_filter_normal_user_agents() {
        let normal_user_agents = [
            "some user agent",
            "IE",
            "ie",
            "opera",
            "safari",
            "APIs-Google (+https://developers.google.com/webmasters/APIs-Google.html)",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
            "Slackbot 1.0(+https://api.slack.com/robots)",
            "SentryUptimeBot/1.0 (+http://docs.sentry.io/product/alerts/uptime-monitoring/)",
            "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot",
        ];
        for user_agent in &normal_user_agents {
            let event = testutils::get_event_with_user_agent(user_agent);
            let filter_result = should_filter(&event, &FilterConfig { is_enabled: true });
            assert_eq!(
                filter_result,
                Ok(()),
                "Benign user agent '{user_agent}' was filtered"
            );
        }
    }

    #[test]
    fn test_filter_parsed_ua_only() {
        let ua = UserAgent {
            raw: None,
            parsed: Some(relay_ua::UserAgent {
                family: "Twitterbot".into(),
                ..Default::default()
            }),
        };

        let filter_result = should_filter(&TestFilterable(ua), &FilterConfig { is_enabled: true });
        assert_ne!(filter_result, Ok(()));
    }

    #[test]
    fn test_filter_parsed_ua_does_not_filter_default() {
        let ua = UserAgent {
            raw: None,
            // This may happen if the raw user agent cannot be parsed or is not
            // available; in that case, the filter must not accidentally drop the event.
            parsed: Some(Default::default()),
        };

        let filter_result = should_filter(&TestFilterable(ua), &FilterConfig { is_enabled: true });
        assert_eq!(filter_result, Ok(()));
    }
}