relay_filter/
web_crawlers.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
//! Filters events coming from user agents known to be web crawlers.

use once_cell::sync::Lazy;
use regex::Regex;

use crate::{FilterConfig, FilterStatKey, Filterable};

static WEB_CRAWLERS: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"(?ix)
        Mediapartners-Google|
        AdsBot-Google|
        Googlebot|
        FeedFetcher-Google|
        BingBot|                    # Bing search
        BingPreview|
        Baiduspider|                # Baidu search
        Slurp|                      # Yahoo
        Sogou|                      # Sogou
        facebook|                   # facebook
        ia_archiver|                # Alexa
        bots?[/\s\);]|              # Generic bot
        spider[/\s\);]|             # Generic spider
        Slack|                      # Slack - see https://api.slack.com/robots
        Calypso\sAppCrawler|        # Google indexing bot
        pingdom|                    # Pingdom
        lyticsbot|                  # Lytics
        AWS\sSecurity\sScanner|     # AWS Security Scanner causing DisallowedHost errors in Django, see
                                    # https://forums.aws.amazon.com/thread.jspa?messageID=932404
                                    # and https://github.com/getsentry/sentry-python/issues/641
        HubSpot\sCrawler|           # HubSpot web crawler (web-crawlers@hubspot.com)
        Bytespider|                 # Bytedance
        Better\sUptime|             # Better Uptime
        Cloudflare-Healthchecks|    # Cloudflare Health Checks
        GTmetrix|                   # GTmetrix
        BrightEdgeOnCrawl|          # BrightEdge - see https://www.brightedge.com/news/press-releases/brightedge-acquires-oncrawl-future-proof-web-30-strategies
        ELB-HealthChecker           # AWS Elastic Load Balancing Health Checks
    "
    )
    .expect("Invalid web crawlers filter Regex")
});

static ALLOWED_WEB_CRAWLERS: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"(?ix)
        Slackbot\s1\.\d+|            # Slack - see https://api.slack.com/robots
        SentryUptimeBot              # Uptime Checker https://docs.sentry.io/product/alerts/uptime-monitoring/
    ",
    )
    .expect("Invalid allowed web crawlers filter Regex")
});

/// Checks if the event originates from a known web crawler.
fn matches(user_agent: Option<&str>) -> bool {
    if let Some(user_agent) = user_agent {
        WEB_CRAWLERS.is_match(user_agent) && !ALLOWED_WEB_CRAWLERS.is_match(user_agent)
    } else {
        false
    }
}

/// Filters events originating from a known web crawler.
pub fn should_filter<F: Filterable>(item: &F, config: &FilterConfig) -> Result<(), FilterStatKey> {
    if !config.is_enabled {
        return Ok(());
    }

    if matches(item.user_agent()) {
        return Err(FilterStatKey::WebCrawlers);
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::testutils;

    #[test]
    fn test_filter_when_disabled() {
        let evt = testutils::get_event_with_user_agent("Googlebot");
        let filter_result = should_filter(&evt, &FilterConfig { is_enabled: false });
        assert_eq!(
            filter_result,
            Ok(()),
            "Event filtered although filter should have been disabled"
        )
    }

    #[test]
    fn test_filter_banned_user_agents() {
        let user_agents = [
            "Mediapartners-Google",
            "AdsBot-Google",
            "Googlebot",
            "FeedFetcher-Google",
            "BingBot",
            "BingPreview",
            "Baiduspider",
            "Slurp",
            "Sogou",
            "facebook",
            "ia_archiver",
            "bots ",
            "bots;",
            "bots)",
            "spider ",
            "spider;",
            "spider)",
            "Calypso AppCrawler",
            "pingdom",
            "lyticsbot",
            "AWS Security Scanner",
            "Mozilla/5.0 (Linux; Android 6.0.1; Calypso AppCrawler Build/MMB30Y; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.124 Mobile Safari/537.36",
            "Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)",
            "Slack-ImgProxy 0.19 (+https://api.slack.com/robots)",
            "Twitterbot/1.0",
            "FeedFetcher-Google; (+http://www.google.com/feedfetcher.html)",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "AdsBot-Google (+http://www.google.com/adsbot.html)",
            "Mozilla/5.0 (compatible; HubSpot Crawler; web-crawlers@hubspot.com)",
            "Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; Bytespider; spider-feedback@bytedance.com)",
            "Better Uptime Bot Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
            "Mozilla/5.0 (compatible;Cloudflare-Healthchecks/1.0;+https://www.cloudflare.com/; healthcheck-id: 0d1ca23e292c8c14)",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 GTmetrix",
            "Mozilla/5.0 (compatible; BrightEdgeOnCrawl/1.0; +http://www.oncrawl.com)",
            "ELB-HealthChecker/2.0",
        ];

        for banned_user_agent in &user_agents {
            let event = testutils::get_event_with_user_agent(banned_user_agent);
            let filter_result = should_filter(&event, &FilterConfig { is_enabled: true });
            assert_ne!(
                filter_result,
                Ok(()),
                "Failed to filter web crawler with user agent '{banned_user_agent}'"
            );
        }
    }

    #[test]
    fn test_dont_filter_normal_user_agents() {
        let normal_user_agents = [
            "some user agent",
            "IE",
            "ie",
            "opera",
            "safari",
            "APIs-Google (+https://developers.google.com/webmasters/APIs-Google.html)",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
            "Slackbot 1.0(+https://api.slack.com/robots)",
            "SentryUptimeBot/1.0 (+http://docs.sentry.io/product/alerts/uptime-monitoring/)",
        ];
        for user_agent in &normal_user_agents {
            let event = testutils::get_event_with_user_agent(user_agent);
            let filter_result = should_filter(&event, &FilterConfig { is_enabled: true });
            assert_eq!(
                filter_result,
                Ok(()),
                "Failed benign user agent '{user_agent}'"
            );
        }
    }
}