import io
import os
import threading
import unittest
import urllib.robotparser
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer


class BaseRobotTest:
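    """Mixin that parses robots_txt and checks can_fetch() against it.

    good lists URLs the test agent must be allowed to fetch, bad lists
    URLs that must be disallowed; entries may also be (agent, url)
    tuples to test a specific user agent.
    """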
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []
    site_maps = None

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))

    def test_site_maps(self):
        self.assertEqual(self.parser.site_maps(), self.site_maps)


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class SitemapTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']
    site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
                 'http://www.google.com/hostednews/sitemap_index.xml']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class BaseRequestRateTest(BaseRobotTest):
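    """Mixin that also checks crawl_delay() and request_rate().

    Subclasses may override request_rate (a RequestRate instance) and
    crawl_delay; the default of None means the parser is expected to
    return None for the test agent.
    """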
    request_rate = None
    crawl_delay = None

    def test_request_rate(self):
        parser = self.parser
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)

                parsed_request_rate = parser.request_rate(agent)
                self.assertEqual(parsed_request_rate, self.request_rate)
                if self.request_rate is not None:
                    self.assertIsInstance(
                        parsed_request_rate,
                        urllib.robotparser.RequestRate
                    )
                    self.assertEqual(
                        parsed_request_rate.requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        parsed_request_rate.seconds,
                        self.request_rate.seconds
                    )


class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = ''
    good = ['/foo']


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = urllib.robotparser.RequestRate(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # The order of the User-agent groups matters: the first matching one
    # wins.  Note that this file is problematic because "Googlebot" is a
    # substring of "Googlebot-Mobile".
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong. You need
    # to specify the URLs from more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = urllib.robotparser.RequestRate(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class StringFormattingTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
    """

    expected_output = """\
User-agent: cybermapper
Disallow: /some/path

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/\
"""

    def test_string_formatting(self):
        self.assertEqual(str(self.parser), self.expected_output)


class RobotHandler(BaseHTTPRequestHandler):
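    """Respond to every request, including /robots.txt, with a 403 error."""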

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


class PasswordProtectedSiteTestCase(unittest.TestCase):
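    """Serving robots.txt with a 403 should cause can_fetch() to deny access."""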

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


class NetworkTestCase(unittest.TestCase):
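    """Tests that fetch the live robots.txt hosted at www.pythontest.net.

    Requires the 'network' test resource.
    """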

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))


if __name__ == '__main__':
    unittest.main()