1use std::sync::LazyLock;
4
5use regex::Regex;
6
7use crate::{FilterConfig, FilterStatKey, Filterable};
8
/// Regex matching user agents of known web crawlers, search-engine bots,
/// scanners, and automated health checks whose events should be filtered.
///
/// Compiled lazily on first use. The `(?ix)` flags make matching
/// case-insensitive (`i`) and enable verbose mode (`x`), which is what allows
/// the whitespace layout and the inline `#` comments inside the pattern.
/// Alternatives are unanchored substring matches unless they carry an explicit
/// boundary (e.g. the generic `bots?`/`spider` entries).
static WEB_CRAWLERS: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"(?ix)
        Mediapartners-Google|
        AdsBot-Google|
        Googlebot|
        FeedFetcher-Google|
        Storebot-Google|
        BingBot| # Bing search
        BingPreview|
        Baiduspider| # Baidu search
        Slurp| # Yahoo
        Sogou| # Sogou
        facebook| # facebook
        meta-| # meta/facebook
        ia_archiver| # Alexa
        bots?([/\s\);]|$)| # Generic bot
        spider([/\s\);]|$)| # Generic spider
        Slack| # Slack - see https://api.slack.com/robots
        Calypso\sAppCrawler| # Google indexing bot
        pingdom| # Pingdom
        lyticsbot| # Lytics
        AWS\sSecurity\sScanner| # AWS Security Scanner causing DisallowedHost errors in Django, see
        # https://forums.aws.amazon.com/thread.jspa?messageID=932404
        # and https://github.com/getsentry/sentry-python/issues/641
        HubSpot\sCrawler| # HubSpot web crawler (web-crawlers@hubspot.com)
        Bytespider| # Bytedance
        Better\sUptime| # Better Uptime
        Cloudflare-Healthchecks| # Cloudflare Health Checks
        GTmetrix| # GTmetrix
        BrightEdgeOnCrawl| # BrightEdge - see https://www.brightedge.com/news/press-releases/brightedge-acquires-oncrawl-future-proof-web-30-strategies
        ELB-HealthChecker| # AWS Elastic Load Balancing Health Checks
        naver.me/spd| # Yeti/1.1 - naver.me
        ClaudeBot| # Anthropic - see https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler
        CCBot| # CCBot - see https://commoncrawl.org/ccbot
        OAI-SearchBot| # OpenAI - see https://platform.openai.com/docs/bots
        GPTBot| # OpenAI - see https://platform.openai.com/docs/bots
        PerplexityBot| # Perplexity - see https://docs.perplexity.ai/guides/bots
        Applebot| # Apple - see https://support.apple.com/en-us/119829
        DuckDuckBot # DuckDuckGo - see https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
        "
    )
    // Pattern is a compile-time constant; failure here is a programming bug.
    .expect("Invalid web crawlers filter Regex")
});
53
/// Regex matching user agents that also match [`WEB_CRAWLERS`] but should
/// NOT be filtered (an allowlist of exceptions checked after the blocklist).
///
/// Uses the same `(?ix)` case-insensitive verbose mode as [`WEB_CRAWLERS`].
static ALLOWED_WEB_CRAWLERS: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"(?ix)
        Slackbot\s1\.\d+| # Slack - see https://api.slack.com/robots
        SentryUptimeBot| # Uptime Checker https://docs.sentry.io/product/alerts/uptime-monitoring/
        ChatGPT-User # ChatGPT user prompted requests
        ",
    )
    // Pattern is a compile-time constant; failure here is a programming bug.
    .expect("Invalid allowed web crawlers filter Regex")
});
64
65fn matches(user_agent: &str) -> bool {
67 WEB_CRAWLERS.is_match(user_agent) && !ALLOWED_WEB_CRAWLERS.is_match(user_agent)
68}
69
70pub fn should_filter<F: Filterable>(item: &F, config: &FilterConfig) -> Result<(), FilterStatKey> {
72 if !config.is_enabled {
73 return Ok(());
74 }
75
76 let user_agent = item.user_agent();
77 let family = user_agent.parsed.as_ref().map(|ua| ua.family.as_ref());
78
79 if user_agent.raw.or(family).is_some_and(matches) {
84 return Err(FilterStatKey::WebCrawlers);
85 }
86
87 Ok(())
88}
89
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{UserAgent, testutils};

    // Minimal `Filterable` wrapper around a hand-built `UserAgent`, letting
    // tests control the raw/parsed user-agent combination directly.
    #[derive(Debug)]
    struct TestFilterable<'a>(UserAgent<'a>);

    impl<'a> Filterable for TestFilterable<'a> {
        fn user_agent(&self) -> UserAgent<'_> {
            self.0.clone()
        }
    }

    // A disabled filter must pass everything through, even a known crawler.
    #[test]
    fn test_filter_when_disabled() {
        let evt = testutils::get_event_with_user_agent("Googlebot");
        let filter_result = should_filter(&evt, &FilterConfig { is_enabled: false });
        assert_eq!(
            filter_result,
            Ok(()),
            "Event filtered although filter should have been disabled"
        )
    }

    // Every user agent below must be rejected: one entry (or more) per
    // alternative in the WEB_CRAWLERS pattern, including full real-world
    // user-agent strings where the crawler token is embedded mid-string.
    #[test]
    fn test_filter_banned_user_agents() {
        let user_agents = [
            "Mediapartners-Google",
            "AdsBot-Google",
            "Googlebot",
            "FeedFetcher-Google",
            "Storebot-Google",
            "Mozilla/5.0 (X11; Linux x86_64; Storebot-Google/1.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
            "BingBot",
            "BingPreview",
            "Baiduspider",
            "Slurp",
            "Sogou",
            "facebook",
            "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)",
            "facebookcatalog/1.0",
            "meta-externalagent/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler)",
            "meta-externalfetcher/1.1",
            "ia_archiver",
            // Generic `bots?`/`spider` alternatives require a boundary
            // character after the token — space, semicolon, or paren.
            "bots ",
            "bots;",
            "bots)",
            "spider ",
            "spider;",
            "spider)",
            "Calypso AppCrawler",
            "pingdom",
            "lyticsbot",
            "AWS Security Scanner",
            "Mozilla/5.0 (Linux; Android 6.0.1; Calypso AppCrawler Build/MMB30Y; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.124 Mobile Safari/537.36",
            // These Slack agents match the `Slack` alternative but NOT the
            // allowlisted `Slackbot\s1\.\d+` form, so they are filtered.
            "Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)",
            "Slack-ImgProxy 0.19 (+https://api.slack.com/robots)",
            "Twitterbot/1.0",
            "FeedFetcher-Google; (+http://www.google.com/feedfetcher.html)",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "AdsBot-Google (+http://www.google.com/adsbot.html)",
            "Mozilla/5.0 (compatible; HubSpot Crawler; web-crawlers@hubspot.com)",
            "Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; Bytespider; spider-feedback@bytedance.com)",
            "Better Uptime Bot Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
            "Mozilla/5.0 (compatible;Cloudflare-Healthchecks/1.0;+https://www.cloudflare.com/; healthcheck-id: 0d1ca23e292c8c14)",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 GTmetrix",
            "Mozilla/5.0 (compatible; BrightEdgeOnCrawl/1.0; +http://www.oncrawl.com)",
            "ELB-HealthChecker/2.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko; compatible; Yeti/1.1; +https://naver.me/spd) Chrome/127.0.0.0 Safari/537.36",
            "Mozilla/5.0; ClaudeBot",
            "Mozilla/5.0; CCBot",
            "; OAI-SearchBot/1.0; +https://openai.com/searchbot",
            "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot",
            "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; PerplexityBot/1.0; +https://perplexity.ai/perplexitybot)",
            "Mozilla/5.0 (Device; OS_version) AppleWebKit/WebKit_version (KHTML, like Gecko)Version/Safari_version [Mobile/Mobile_version] Safari/WebKit_version (Applebot/Applebot_version; +http://www.apple.com/go/applebot)",
            "DuckDuckBot/1.1; (+http://duckduckgo.com/duckduckbot.html)",
        ];

        for banned_user_agent in &user_agents {
            let event = testutils::get_event_with_user_agent(banned_user_agent);
            let filter_result = should_filter(&event, &FilterConfig { is_enabled: true });
            assert_ne!(
                filter_result,
                Ok(()),
                "Failed to filter web crawler with user agent '{banned_user_agent}'"
            );
        }
    }

    // Benign browsers and explicitly allowlisted agents must pass through.
    #[test]
    fn test_dont_filter_normal_user_agents() {
        let normal_user_agents = [
            "some user agent",
            "IE",
            "ie",
            "opera",
            "safari",
            "APIs-Google (+https://developers.google.com/webmasters/APIs-Google.html)",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
            // Allowlisted despite matching the blocklist's broader patterns:
            "Slackbot 1.0(+https://api.slack.com/robots)",
            "SentryUptimeBot/1.0 (+http://docs.sentry.io/product/alerts/uptime-monitoring/)",
            "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot",
        ];
        for user_agent in &normal_user_agents {
            let event = testutils::get_event_with_user_agent(user_agent);
            let filter_result = should_filter(&event, &FilterConfig { is_enabled: true });
            assert_eq!(
                filter_result,
                Ok(()),
                "Failed benign user agent '{user_agent}'"
            );
        }
    }

    // With no raw string, the parsed UA family alone must still be matched.
    #[test]
    fn test_filter_parsed_ua_only() {
        let ua = UserAgent {
            raw: None,
            parsed: Some(relay_ua::UserAgent {
                family: "Twitterbot".into(),
                ..Default::default()
            }),
        };

        let filter_result = should_filter(&TestFilterable(ua), &FilterConfig { is_enabled: true });
        assert_ne!(filter_result, Ok(()));
    }

    // A default (empty/unknown) parsed UA must not be treated as a crawler.
    #[test]
    fn test_filter_parsed_ua_does_not_filter_default() {
        let ua = UserAgent {
            raw: None,
            parsed: Some(Default::default()),
        };

        let filter_result = should_filter(&TestFilterable(ua), &FilterConfig { is_enabled: true });
        assert_eq!(filter_result, Ok(()));
    }
}