1import io 2import os 3import threading 4import unittest 5import urllib.robotparser 6from test import support 7from http.server import BaseHTTPRequestHandler, HTTPServer 8 9 10class BaseRobotTest: 11 robots_txt = '' 12 agent = 'test_robotparser' 13 good = [] 14 bad = [] 15 site_maps = None 16 17 def setUp(self): 18 lines = io.StringIO(self.robots_txt).readlines() 19 self.parser = urllib.robotparser.RobotFileParser() 20 self.parser.parse(lines) 21 22 def get_agent_and_url(self, url): 23 if isinstance(url, tuple): 24 agent, url = url 25 return agent, url 26 return self.agent, url 27 28 def test_good_urls(self): 29 for url in self.good: 30 agent, url = self.get_agent_and_url(url) 31 with self.subTest(url=url, agent=agent): 32 self.assertTrue(self.parser.can_fetch(agent, url)) 33 34 def test_bad_urls(self): 35 for url in self.bad: 36 agent, url = self.get_agent_and_url(url) 37 with self.subTest(url=url, agent=agent): 38 self.assertFalse(self.parser.can_fetch(agent, url)) 39 40 def test_site_maps(self): 41 self.assertEqual(self.parser.site_maps(), self.site_maps) 42 43 44class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase): 45 robots_txt = """\ 46User-agent: * 47Disallow: /cyberworld/map/ # This is an infinite virtual URL space 48Disallow: /tmp/ # these will soon disappear 49Disallow: /foo.html 50 """ 51 good = ['/', '/test.html'] 52 bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html'] 53 54 55class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase): 56 robots_txt = """\ 57# robots.txt for http://www.example.com/ 58 59User-agent: * 60Crawl-delay: 1 61Request-rate: 3/15 62Disallow: /cyberworld/map/ # This is an infinite virtual URL space 63 64# Cybermapper knows where to go. 65User-agent: cybermapper 66Disallow: 67 """ 68 good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')] 69 bad = ['/cyberworld/map/index.html'] 70 71 72class SitemapTest(BaseRobotTest, unittest.TestCase): 73 robots_txt = """\ 74# robots.txt for http://www.example.com/ 75 76User-agent: * 77Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml 78Sitemap: http://www.google.com/hostednews/sitemap_index.xml 79Request-rate: 3/15 80Disallow: /cyberworld/map/ # This is an infinite virtual URL space 81 82 """ 83 good = ['/', '/test.html'] 84 bad = ['/cyberworld/map/index.html'] 85 site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml', 86 'http://www.google.com/hostednews/sitemap_index.xml'] 87 88 89class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase): 90 robots_txt = """\ 91# go away 92User-agent: * 93Disallow: / 94 """ 95 good = [] 96 bad = ['/cyberworld/map/index.html', '/', '/tmp/'] 97 98 99class BaseRequestRateTest(BaseRobotTest): 100 request_rate = None 101 crawl_delay = None 102 103 def test_request_rate(self): 104 parser = self.parser 105 for url in self.good + self.bad: 106 agent, url = self.get_agent_and_url(url) 107 with self.subTest(url=url, agent=agent): 108 self.assertEqual(parser.crawl_delay(agent), self.crawl_delay) 109 110 parsed_request_rate = parser.request_rate(agent) 111 self.assertEqual(parsed_request_rate, self.request_rate) 112 if self.request_rate is not None: 113 self.assertIsInstance( 114 parsed_request_rate, 115 urllib.robotparser.RequestRate 116 ) 117 self.assertEqual( 118 parsed_request_rate.requests, 119 self.request_rate.requests 120 ) 121 self.assertEqual( 122 parsed_request_rate.seconds, 123 self.request_rate.seconds 124 ) 125 126 127class EmptyFileTest(BaseRequestRateTest, unittest.TestCase): 128 robots_txt = '' 129 good = ['/foo'] 130 131 132class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase): 133 robots_txt = """\ 134User-agent: figtree 135Crawl-delay: 3 136Request-rate: 9/30 137Disallow: /tmp 138Disallow: /a%3cd.html 139Disallow: /a%2fb.html 140Disallow: /%7ejoe/index.html 141 """ 142 agent = 'figtree' 143 request_rate = urllib.robotparser.RequestRate(9, 30) 144 crawl_delay = 3 145 good = [('figtree', '/foo.html')] 146 bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', 147 '/a%2fb.html', '/~joe/index.html'] 148 149 150class DifferentAgentTest(CrawlDelayAndRequestRateTest): 151 agent = 'FigTree Robot libwww-perl/5.04' 152 153 154class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase): 155 robots_txt = """\ 156User-agent: * 157Disallow: /tmp/ 158Disallow: /a%3Cd.html 159Disallow: /a/b.html 160Disallow: /%7ejoe/index.html 161Crawl-delay: 3 162Request-rate: 9/banana 163 """ 164 good = ['/tmp'] 165 bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html', 166 '/%7Ejoe/index.html'] 167 crawl_delay = 3 168 169 170class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase): 171 # From bug report #523041 172 robots_txt = """\ 173User-Agent: * 174Disallow: /. 175Crawl-delay: pears 176 """ 177 good = ['/foo.html'] 178 # bug report says "/" should be denied, but that is not in the RFC 179 bad = [] 180 181 182class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase): 183 # also test that Allow and Diasallow works well with each other 184 robots_txt = """\ 185User-agent: Googlebot 186Allow: /folder1/myfile.html 187Disallow: /folder1/ 188Request-rate: whale/banana 189 """ 190 agent = 'Googlebot' 191 good = ['/folder1/myfile.html'] 192 bad = ['/folder1/anotherfile.html'] 193 194 195class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase): 196 # the order of User-agent should be correct. note 197 # that this file is incorrect because "Googlebot" is a 198 # substring of "Googlebot-Mobile" 199 robots_txt = """\ 200User-agent: Googlebot 201Disallow: / 202 203User-agent: Googlebot-Mobile 204Allow: / 205 """ 206 agent = 'Googlebot' 207 bad = ['/something.jpg'] 208 209 210class UserAgentGoogleMobileTest(UserAgentOrderingTest): 211 agent = 'Googlebot-Mobile' 212 213 214class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase): 215 # Google also got the order wrong. You need 216 # to specify the URLs from more specific to more general 217 robots_txt = """\ 218User-agent: Googlebot 219Allow: /folder1/myfile.html 220Disallow: /folder1/ 221 """ 222 agent = 'googlebot' 223 good = ['/folder1/myfile.html'] 224 bad = ['/folder1/anotherfile.html'] 225 226 227class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase): 228 # see issue #6325 for details 229 robots_txt = """\ 230User-agent: * 231Disallow: /some/path?name=value 232 """ 233 good = ['/some/path'] 234 bad = ['/some/path?name=value'] 235 236 237class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase): 238 # obey first * entry (#4108) 239 robots_txt = """\ 240User-agent: * 241Disallow: /some/path 242 243User-agent: * 244Disallow: /another/path 245 """ 246 good = ['/another/path'] 247 bad = ['/some/path'] 248 249 250class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase): 251 # normalize the URL first (#17403) 252 robots_txt = """\ 253User-agent: * 254Allow: /some/path? 255Disallow: /another/path? 256 """ 257 good = ['/some/path?'] 258 bad = ['/another/path?'] 259 260 261class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase): 262 robots_txt = """\ 263User-agent: * 264Crawl-delay: 1 265Request-rate: 3/15 266Disallow: /cyberworld/map/ 267 """ 268 request_rate = urllib.robotparser.RequestRate(3, 15) 269 crawl_delay = 1 270 good = ['/', '/test.html'] 271 bad = ['/cyberworld/map/index.html'] 272 273 274class StringFormattingTest(BaseRobotTest, unittest.TestCase): 275 robots_txt = """\ 276User-agent: * 277Crawl-delay: 1 278Request-rate: 3/15 279Disallow: /cyberworld/map/ # This is an infinite virtual URL space 280 281# Cybermapper knows where to go. 282User-agent: cybermapper 283Disallow: /some/path 284 """ 285 286 expected_output = """\ 287User-agent: cybermapper 288Disallow: /some/path 289 290User-agent: * 291Crawl-delay: 1 292Request-rate: 3/15 293Disallow: /cyberworld/map/\ 294""" 295 296 def test_string_formatting(self): 297 self.assertEqual(str(self.parser), self.expected_output) 298 299 300class RobotHandler(BaseHTTPRequestHandler): 301 302 def do_GET(self): 303 self.send_error(403, "Forbidden access") 304 305 def log_message(self, format, *args): 306 pass 307 308 309class PasswordProtectedSiteTestCase(unittest.TestCase): 310 311 def setUp(self): 312 self.server = HTTPServer((support.HOST, 0), RobotHandler) 313 314 self.t = threading.Thread( 315 name='HTTPServer serving', 316 target=self.server.serve_forever, 317 # Short poll interval to make the test finish quickly. 318 # Time between requests is short enough that we won't wake 319 # up spuriously too many times. 320 kwargs={'poll_interval':0.01}) 321 self.t.daemon = True # In case this function raises. 322 self.t.start() 323 324 def tearDown(self): 325 self.server.shutdown() 326 self.t.join() 327 self.server.server_close() 328 329 @support.reap_threads 330 def testPasswordProtectedSite(self): 331 addr = self.server.server_address 332 url = 'http://' + support.HOST + ':' + str(addr[1]) 333 robots_url = url + "/robots.txt" 334 parser = urllib.robotparser.RobotFileParser() 335 parser.set_url(url) 336 parser.read() 337 self.assertFalse(parser.can_fetch("*", robots_url)) 338 339 340class NetworkTestCase(unittest.TestCase): 341 342 base_url = 'http://www.pythontest.net/' 343 robots_txt = '{}elsewhere/robots.txt'.format(base_url) 344 345 @classmethod 346 def setUpClass(cls): 347 support.requires('network') 348 with support.transient_internet(cls.base_url): 349 cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt) 350 cls.parser.read() 351 352 def url(self, path): 353 return '{}{}{}'.format( 354 self.base_url, path, '/' if not os.path.splitext(path)[1] else '' 355 ) 356 357 def test_basic(self): 358 self.assertFalse(self.parser.disallow_all) 359 self.assertFalse(self.parser.allow_all) 360 self.assertGreater(self.parser.mtime(), 0) 361 self.assertFalse(self.parser.crawl_delay('*')) 362 self.assertFalse(self.parser.request_rate('*')) 363 364 def test_can_fetch(self): 365 self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere'))) 366 self.assertFalse(self.parser.can_fetch('Nutch', self.base_url)) 367 self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian'))) 368 self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats'))) 369 self.assertFalse(self.parser.can_fetch('*', self.url('webstats'))) 370 self.assertTrue(self.parser.can_fetch('*', self.base_url)) 371 372 def test_read_404(self): 373 parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt')) 374 parser.read() 375 self.assertTrue(parser.allow_all) 376 self.assertFalse(parser.disallow_all) 377 self.assertEqual(parser.mtime(), 0) 378 self.assertIsNone(parser.crawl_delay('*')) 379 self.assertIsNone(parser.request_rate('*')) 380 381if __name__=='__main__': 382 unittest.main() 383