@@ -16,6 +16,14 @@ class BaseRobotTest:
1616 bad = []
1717 site_maps = None
1818
19+ def __init_subclass__ (cls ):
20+ super ().__init_subclass__ ()
21+ # Remove tests that do nothing.
22+ if not cls .good :
23+ cls .test_good_urls = None
24+ if not cls .bad :
25+ cls .test_bad_urls = None
26+
1927 def setUp (self ):
2028 lines = io .StringIO (self .robots_txt ).readlines ()
2129 self .parser = urllib .robotparser .RobotFileParser ()
@@ -231,9 +239,16 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
231239 robots_txt = """\
232240 User-agent: *
233241Disallow: /some/path?name=value
242+ Disallow: /another/path?
243+ Disallow: /yet/one/path?name=value&more
234244 """
235- good = ['/some/path' ]
236- bad = ['/some/path?name=value' ]
245+ good = ['/some/path' , '/some/path?' ,
246+ '/some/path%3Fname=value' , '/some/path?name%3Dvalue' ,
247+ '/another/path' , '/another/path%3F' ,
248+ '/yet/one/path?name=value%26more' ]
 249+ bad = ['/some/path?name=value' ,
 250+ '/another/path?' , '/another/path?name=value' ,
 251+ '/yet/one/path?name=value&more' ]
237252
238253
239254class UseFirstUserAgentWildcardTest (BaseRobotTest , unittest .TestCase ):
@@ -249,15 +264,79 @@ class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
249264 bad = ['/some/path' ]
250265
251266
252- class EmptyQueryStringTest (BaseRobotTest , unittest .TestCase ):
253- # normalize the URL first (#17403)
267+ class PercentEncodingTest (BaseRobotTest , unittest .TestCase ):
254268 robots_txt = """\
255269 User-agent: *
256- Allow: /some/path?
257- Disallow: /another/path?
258- """
259- good = ['/some/path?' ]
260- bad = ['/another/path?' ]
270+ Disallow: /a1/Z-._~ # unreserved characters
271+ Disallow: /a2/%5A%2D%2E%5F%7E # percent-encoded unreserved characters
272+ Disallow: /u1/%F0%9F%90%8D # percent-encoded ASCII Unicode character
273+ Disallow: /u2/%f0%9f%90%8d
274+ Disallow: /u3/\U0001f40d # raw non-ASCII Unicode character
275+ Disallow: /v1/%F0 # percent-encoded non-ASCII octet
276+ Disallow: /v2/%f0
277+ Disallow: /v3/\udcf0 # raw non-ASCII octet
278+ Disallow: /p1%xy # raw percent
279+ Disallow: /p2%
280+ Disallow: /p3%25xy # percent-encoded percent
281+ Disallow: /p4%2525xy # double percent-encoded percent
282+ Disallow: /john%20smith # space
283+ Disallow: /john doe
284+ Disallow: /trailingspace%20
285+ Disallow: /question%3Fq=v # not query
286+ Disallow: /hash%23f # not fragment
287+ Disallow: /dollar%24
288+ Disallow: /asterisk%2A
289+ Disallow: /sub/dir
290+ Disallow: /slash%2F
291+ Disallow: /query/question?q=%3F
292+ Disallow: /query/raw/question?q=?
293+ Disallow: /query/eq?q%3Dv
294+ Disallow: /query/amp?q=v%26a
295+ """
296+ good = [
297+ '/u1/%F0' , '/u1/%f0' ,
298+ '/u2/%F0' , '/u2/%f0' ,
299+ '/u3/%F0' , '/u3/%f0' ,
300+ '/p1%2525xy' , '/p2%f0' , '/p3%2525xy' , '/p4%xy' , '/p4%25xy' ,
301+ '/question?q=v' ,
302+ '/dollar' , '/asterisk' ,
303+ '/query/eq?q=v' ,
304+ '/query/amp?q=v&a' ,
305+ ]
306+ bad = [
307+ '/a1/Z-._~' , '/a1/%5A%2D%2E%5F%7E' ,
308+ '/a2/Z-._~' , '/a2/%5A%2D%2E%5F%7E' ,
309+ '/u1/%F0%9F%90%8D' , '/u1/%f0%9f%90%8d' , '/u1/\U0001f40d ' ,
310+ '/u2/%F0%9F%90%8D' , '/u2/%f0%9f%90%8d' , '/u2/\U0001f40d ' ,
311+ '/u3/%F0%9F%90%8D' , '/u3/%f0%9f%90%8d' , '/u3/\U0001f40d ' ,
312+ '/v1/%F0' , '/v1/%f0' , '/v1/\udcf0 ' , '/v1/\U0001f40d ' ,
313+ '/v2/%F0' , '/v2/%f0' , '/v2/\udcf0 ' , '/v2/\U0001f40d ' ,
314+ '/v3/%F0' , '/v3/%f0' , '/v3/\udcf0 ' , '/v3/\U0001f40d ' ,
315+ '/p1%xy' , '/p1%25xy' ,
316+ '/p2%' , '/p2%25' , '/p2%2525' , '/p2%xy' ,
317+ '/p3%xy' , '/p3%25xy' ,
318+ '/p4%2525xy' ,
319+ '/john%20smith' , '/john smith' ,
320+ '/john%20doe' , '/john doe' ,
321+ '/trailingspace%20' , '/trailingspace ' ,
322+ '/question%3Fq=v' ,
323+ '/hash#f' , '/hash%23f' ,
324+ '/dollar$' , '/dollar%24' ,
325+ '/asterisk*' , '/asterisk%2A' ,
326+ '/sub/dir' , '/sub%2Fdir' ,
327+ '/slash%2F' , '/slash/' ,
328+ '/query/question?q=?' , '/query/question?q=%3F' ,
329+ '/query/raw/question?q=?' , '/query/raw/question?q=%3F' ,
330+ '/query/eq?q%3Dv' ,
331+ '/query/amp?q=v%26a' ,
332+ ]
333+ # other reserved characters
334+ for c in ":/#[]@!$&'()*+,;=" :
335+ robots_txt += f'Disallow: /raw{ c } \n Disallow: /pc%{ ord (c ):02X} \n '
336+ bad .append (f'/raw{ c } ' )
337+ bad .append (f'/raw%{ ord (c ):02X} ' )
338+ bad .append (f'/pc{ c } ' )
339+ bad .append (f'/pc%{ ord (c ):02X} ' )
261340
262341
263342class DefaultEntryTest (BaseRequestRateTest , unittest .TestCase ):
@@ -299,26 +378,17 @@ def test_string_formatting(self):
299378 self .assertEqual (str (self .parser ), self .expected_output )
300379
301380
302- class RobotHandler (BaseHTTPRequestHandler ):
303-
304- def do_GET (self ):
305- self .send_error (403 , "Forbidden access" )
306-
307- def log_message (self , format , * args ):
308- pass
309-
310-
311381@unittest .skipUnless (
312382 support .has_socket_support ,
313383 "Socket server requires working socket."
314384)
315- class PasswordProtectedSiteTestCase ( unittest . TestCase ) :
385+ class BaseLocalNetworkTestCase :
316386
317387 def setUp (self ):
318388 # clear _opener global variable
319389 self .addCleanup (urllib .request .urlcleanup )
320390
321- self .server = HTTPServer ((socket_helper .HOST , 0 ), RobotHandler )
391+ self .server = HTTPServer ((socket_helper .HOST , 0 ), self . RobotHandler )
322392
323393 self .t = threading .Thread (
324394 name = 'HTTPServer serving' ,
@@ -335,6 +405,57 @@ def tearDown(self):
335405 self .t .join ()
336406 self .server .server_close ()
337407
408+
409+ SAMPLE_ROBOTS_TXT = b'''\
410+ User-agent: test_robotparser
411+ Disallow: /utf8/\xf0 \x9f \x90 \x8d
412+ Disallow: /non-utf8/\xf0
413+ Disallow: //[spam]/path
414+ '''
415+
416+
417+ class LocalNetworkTestCase (BaseLocalNetworkTestCase , unittest .TestCase ):
418+ class RobotHandler (BaseHTTPRequestHandler ):
419+
420+ def do_GET (self ):
421+ self .send_response (200 )
422+ self .end_headers ()
423+ self .wfile .write (SAMPLE_ROBOTS_TXT )
424+
425+ def log_message (self , format , * args ):
426+ pass
427+
428+ @threading_helper .reap_threads
429+ def testRead (self ):
430+ # Test that reading a weird robots.txt doesn't fail.
431+ addr = self .server .server_address
432+ url = f'http://{ socket_helper .HOST } :{ addr [1 ]} '
433+ robots_url = url + '/robots.txt'
434+ parser = urllib .robotparser .RobotFileParser ()
435+ parser .set_url (robots_url )
436+ parser .read ()
437+ # And it can even interpret the weird paths in some reasonable way.
438+ agent = 'test_robotparser'
439+ self .assertTrue (parser .can_fetch (agent , robots_url ))
440+ self .assertTrue (parser .can_fetch (agent , url + '/utf8/' ))
441+ self .assertFalse (parser .can_fetch (agent , url + '/utf8/\U0001f40d ' ))
442+ self .assertFalse (parser .can_fetch (agent , url + '/utf8/%F0%9F%90%8D' ))
443+ self .assertFalse (parser .can_fetch (agent , url + '/utf8/\U0001f40d ' ))
444+ self .assertTrue (parser .can_fetch (agent , url + '/non-utf8/' ))
445+ self .assertFalse (parser .can_fetch (agent , url + '/non-utf8/%F0' ))
446+ self .assertFalse (parser .can_fetch (agent , url + '/non-utf8/\U0001f40d ' ))
447+ self .assertFalse (parser .can_fetch (agent , url + '/%2F[spam]/path' ))
448+
449+
450+ class PasswordProtectedSiteTestCase (BaseLocalNetworkTestCase , unittest .TestCase ):
451+ class RobotHandler (BaseHTTPRequestHandler ):
452+
453+ def do_GET (self ):
454+ self .send_error (403 , "Forbidden access" )
455+
456+ def log_message (self , format , * args ):
457+ pass
458+
338459 @threading_helper .reap_threads
339460 def testPasswordProtectedSite (self ):
340461 addr = self .server .server_address
0 commit comments