use once_cell::sync::Lazy;
use regex::Regex;

use crate::{FilterConfig, FilterStatKey, Filterable};
7
/// Matches user agents of known web crawlers, bots and automated health checkers
/// whose events should be filtered out.
///
/// The pattern uses the `(?ix)` flags: `i` for case-insensitive matching and `x`
/// for verbose mode, in which literal whitespace is ignored and `#` starts a
/// comment running to the end of the line. Compiled lazily on first use.
static WEB_CRAWLERS: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"(?ix)
        Mediapartners-Google|
        AdsBot-Google|
        Googlebot|
        FeedFetcher-Google|
        BingBot| # Bing search
        BingPreview|
        Baiduspider| # Baidu search
        Slurp| # Yahoo
        Sogou| # Sogou
        facebook| # facebook
        ia_archiver| # Alexa
        bots?[/\s\);]| # Generic bot
        spider[/\s\);]| # Generic spider
        Slack| # Slack - see https://api.slack.com/robots
        Calypso\sAppCrawler| # Google indexing bot
        pingdom| # Pingdom
        lyticsbot| # Lytics
        AWS\sSecurity\sScanner| # AWS Security Scanner causing DisallowedHost errors in Django, see
        # https://forums.aws.amazon.com/thread.jspa?messageID=932404
        # and https://github.com/getsentry/sentry-python/issues/641
        HubSpot\sCrawler| # HubSpot web crawler (web-crawlers@hubspot.com)
        Bytespider| # Bytedance
        Better\sUptime| # Better Uptime
        Cloudflare-Healthchecks| # Cloudflare Health Checks
        GTmetrix| # GTmetrix
        BrightEdgeOnCrawl| # BrightEdge - see https://www.brightedge.com/news/press-releases/brightedge-acquires-oncrawl-future-proof-web-30-strategies
        ELB-HealthChecker| # AWS Elastic Load Balancing Health Checks
        naver.me/spd| # Yeti/1.1 - naver.me
        ClaudeBot| # Anthropic - see https://support.anthropic.com/en/articles/10023637-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler
        CCBot| # CCBot - see https://commoncrawl.org/ccbot
        OAI-SearchBot| # OpenAI - see https://platform.openai.com/docs/bots
        GPTBot # OpenAI - see https://platform.openai.com/docs/bots
        "
    )
    // The pattern is a compile-time constant, so failure here is a bug.
    .expect("Invalid web crawlers filter Regex")
});
47
/// Matches user agents of crawlers that are explicitly allowed and must never
/// be filtered, even when the general crawler pattern also matches them
/// (e.g. `Slackbot 1.x` would otherwise be caught by the generic `Slack` rule).
///
/// Same `(?ix)` flags as the deny-list pattern: case-insensitive, verbose mode.
static ALLOWED_WEB_CRAWLERS: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"(?ix)
        Slackbot\s1\.\d+| # Slack - see https://api.slack.com/robots
        SentryUptimeBot # Uptime Checker https://docs.sentry.io/product/alerts/uptime-monitoring/
        ",
    )
    // The pattern is a compile-time constant, so failure here is a bug.
    .expect("Invalid allowed web crawlers filter Regex")
});
57
58fn matches(user_agent: Option<&str>) -> bool {
60 if let Some(user_agent) = user_agent {
61 WEB_CRAWLERS.is_match(user_agent) && !ALLOWED_WEB_CRAWLERS.is_match(user_agent)
62 } else {
63 false
64 }
65}
66
67pub fn should_filter<F: Filterable>(item: &F, config: &FilterConfig) -> Result<(), FilterStatKey> {
69 if !config.is_enabled {
70 return Ok(());
71 }
72
73 if matches(item.user_agent()) {
74 return Err(FilterStatKey::WebCrawlers);
75 }
76
77 Ok(())
78}
79
#[cfg(test)]
mod tests {
    use super::*;
    use crate::testutils;

    // With the filter disabled, even a well-known crawler UA must pass through.
    #[test]
    fn test_filter_when_disabled() {
        let evt = testutils::get_event_with_user_agent("Googlebot");
        let filter_result = should_filter(&evt, &FilterConfig { is_enabled: false });
        assert_eq!(
            filter_result,
            Ok(()),
            "Event filtered although filter should have been disabled"
        )
    }

    // Every user agent below — bare tokens and full real-world UA strings —
    // must be rejected when the filter is enabled.
    #[test]
    fn test_filter_banned_user_agents() {
        let user_agents = [
            "Mediapartners-Google",
            "AdsBot-Google",
            "Googlebot",
            "FeedFetcher-Google",
            "BingBot",
            "BingPreview",
            "Baiduspider",
            "Slurp",
            "Sogou",
            "facebook",
            "ia_archiver",
            // Generic bot/spider rules require a delimiter after the word.
            "bots ",
            "bots;",
            "bots)",
            "spider ",
            "spider;",
            "spider)",
            "Calypso AppCrawler",
            "pingdom",
            "lyticsbot",
            "AWS Security Scanner",
            "Mozilla/5.0 (Linux; Android 6.0.1; Calypso AppCrawler Build/MMB30Y; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.124 Mobile Safari/537.36",
            "Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)",
            "Slack-ImgProxy 0.19 (+https://api.slack.com/robots)",
            "Twitterbot/1.0",
            "FeedFetcher-Google; (+http://www.google.com/feedfetcher.html)",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "AdsBot-Google (+http://www.google.com/adsbot.html)",
            "Mozilla/5.0 (compatible; HubSpot Crawler; web-crawlers@hubspot.com)",
            "Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; Bytespider; spider-feedback@bytedance.com)",
            "Better Uptime Bot Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
            "Mozilla/5.0 (compatible;Cloudflare-Healthchecks/1.0;+https://www.cloudflare.com/; healthcheck-id: 0d1ca23e292c8c14)",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 GTmetrix",
            "Mozilla/5.0 (compatible; BrightEdgeOnCrawl/1.0; +http://www.oncrawl.com)",
            "ELB-HealthChecker/2.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko; compatible; Yeti/1.1; +https://naver.me/spd) Chrome/127.0.0.0 Safari/537.36",
            "Mozilla/5.0; ClaudeBot",
            "Mozilla/5.0; CCBot",
            "; OAI-SearchBot/1.0; +https://openai.com/searchbot",
            "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot",
        ];

        for banned_user_agent in &user_agents {
            let event = testutils::get_event_with_user_agent(banned_user_agent);
            let filter_result = should_filter(&event, &FilterConfig { is_enabled: true });
            assert_ne!(
                filter_result,
                Ok(()),
                "Failed to filter web crawler with user agent '{banned_user_agent}'"
            );
        }
    }

    // Benign user agents — including allow-listed bots such as Slackbot 1.x
    // and SentryUptimeBot — must never be filtered even with the filter on.
    #[test]
    fn test_dont_filter_normal_user_agents() {
        let normal_user_agents = [
            "some user agent",
            "IE",
            "ie",
            "opera",
            "safari",
            "APIs-Google (+https://developers.google.com/webmasters/APIs-Google.html)",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
            "Slackbot 1.0(+https://api.slack.com/robots)",
            "SentryUptimeBot/1.0 (+http://docs.sentry.io/product/alerts/uptime-monitoring/)",
            "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot",
        ];
        for user_agent in &normal_user_agents {
            let event = testutils::get_event_with_user_agent(user_agent);
            let filter_result = should_filter(&event, &FilterConfig { is_enabled: true });
            assert_eq!(
                filter_result,
                Ok(()),
                "Failed benign user agent '{user_agent}'"
            );
        }
    }
}