@@ -16,6 +16,14 @@ class BaseRobotTest:
     bad = []
     site_maps = None
 
+    def __init_subclass__(cls):
+        super().__init_subclass__()
+        # Remove tests that do nothing.
+        if not cls.good:
+            cls.test_good_urls = None
+        if not cls.bad:
+            cls.test_bad_urls = None
+
     def setUp(self):
         lines = io.StringIO(self.robots_txt).readlines()
         self.parser = urllib.robotparser.RobotFileParser()
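
Note on the __init_subclass__ hook added above: unittest's loader only collects test* attributes that are callable, so shadowing an inherited test method with None cleanly drops it from subclasses whose good/bad lists are empty. A minimal standalone sketch of that mechanism (not part of the patch; the class names here are made up):

import unittest

class _Base:
    good = []

    def __init_subclass__(cls):
        super().__init_subclass__()
        # Shadow the inherited method with a non-callable when there is
        # nothing to check; the loader then skips it entirely.
        if not cls.good:
            cls.test_good_urls = None

    def test_good_urls(self):
        for url in self.good:
            pass  # placeholder for the real assertions

class _NoGoodURLs(_Base, unittest.TestCase):
    pass

# The shadowed test no longer appears among the collected test names.
print(unittest.TestLoader().getTestCaseNames(_NoGoodURLs))  # -> []
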
@@ -249,15 +257,77 @@ class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     bad = ['/some/path']
 
 
-class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
-    # normalize the URL first (#17403)
+class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 User-agent: *
-Allow: /some/path?
-Disallow: /another/path?
-"""
-    good = ['/some/path?']
-    bad = ['/another/path?']
+Disallow: /a1/Z-._~  # unreserved characters
+Disallow: /a2/%5A%2D%2E%5F%7E  # percent-encoded unreserved characters
+Disallow: /u1/%F0%9F%90%8D  # percent-encoded non-ASCII Unicode character
+Disallow: /u2/%f0%9f%90%8d
+Disallow: /u3/\U0001f40d  # raw non-ASCII Unicode character
+Disallow: /v1/%F0  # percent-encoded non-ASCII octet
+Disallow: /v2/%f0
+Disallow: /v3/\udcf0  # raw non-ASCII octet
+Disallow: /p1%xy  # raw percent
+Disallow: /p2%
+Disallow: /p3%25xy  # percent-encoded percent
+Disallow: /p4%2525xy  # double percent-encoded percent
+Disallow: /john%20smith  # space
+Disallow: /john doe
+Disallow: /trailingspace%20
+Disallow: /query?q=v  # query
+Disallow: /query2?q=%3F
+Disallow: /query3?q=?
+Disallow: /emptyquery?
+Disallow: /question%3Fq=v  # not query
+Disallow: /hash%23f  # not fragment
+Disallow: /dollar%24
+Disallow: /asterisk%2A
+Disallow: /sub/dir
+Disallow: /slash%2F
+"""
+    good = [
+        '/u1/%F0', '/u1/%f0',
+        '/u2/%F0', '/u2/%f0',
+        '/u3/%F0', '/u3/%f0',
+        '/p1%2525xy', '/p2%f0', '/p3%2525xy', '/p4%xy', '/p4%25xy',
+        '/query%3Fq=v', '/question?q=v',
+        '/emptyquery',
+        '/dollar', '/asterisk',
+    ]
+    bad = [
+        '/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
+        '/a2/Z-._~', '/a2/%5A%2D%2E%5F%7E',
+        '/u1/%F0%9F%90%8D', '/u1/%f0%9f%90%8d', '/u1/\U0001f40d',
+        '/u2/%F0%9F%90%8D', '/u2/%f0%9f%90%8d', '/u2/\U0001f40d',
+        '/u3/%F0%9F%90%8D', '/u3/%f0%9f%90%8d', '/u3/\U0001f40d',
+        '/v1/%F0', '/v1/%f0', '/v1/\udcf0', '/v1/\U0001f40d',
+        '/v2/%F0', '/v2/%f0', '/v2/\udcf0', '/v2/\U0001f40d',
+        '/v3/%F0', '/v3/%f0', '/v3/\udcf0', '/v3/\U0001f40d',
+        '/p1%xy', '/p1%25xy',
+        '/p2%', '/p2%25', '/p2%2525', '/p2%xy',
+        '/p3%xy', '/p3%25xy',
+        '/p4%2525xy',
+        '/john%20smith', '/john smith',
+        '/john%20doe', '/john doe',
+        '/trailingspace%20', '/trailingspace ',
+        '/query?q=v', '/question%3Fq=v',
+        '/query2?q=?', '/query2?q=%3F',
+        '/query3?q=?', '/query3?q=%3F',
+        '/emptyquery?', '/emptyquery?q=v',
+        '/hash#f', '/hash%23f',
+        '/dollar$', '/dollar%24',
+        '/asterisk*', '/asterisk%2A',
+        '/sub/dir', '/sub%2Fdir',
+        '/slash%2F', '/slash/',
+    ]
+    # other reserved characters
+    for c in ":/#[]@!$&'()*+,;=":
+        robots_txt += f'Disallow: /raw{c}\nDisallow: /pc%{ord(c):02X}\n'
+        bad.append(f'/raw{c}')
+        bad.append(f'/raw%{ord(c):02X}')
+        bad.append(f'/pc{c}')
+        bad.append(f'/pc%{ord(c):02X}')
 
 
 class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
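
The equivalences exercised by PercentEncodingTest above follow the usual URL rules: percent-encoded unreserved characters are interchangeable with their literal spellings, hex digits in percent-escapes are case-insensitive, and a raw non-ASCII character corresponds to the percent-encoding of its UTF-8 bytes. A standalone illustration using only urllib.parse (not part of the patch):

from urllib.parse import quote, unquote

# Percent-encoded unreserved characters decode to their literal spelling,
# which is why /a1/Z-._~ and /a2/%5A%2D%2E%5F%7E are expected to cover the
# same requests.
assert unquote('%5A%2D%2E%5F%7E') == 'Z-._~'

# A raw non-ASCII character (U+1F40D) encodes to the percent-encoded UTF-8
# octets used in the /u1/ rule.
assert quote('\U0001f40d') == '%F0%9F%90%8D'

# Hex digits are case-insensitive on decoding, hence the %F0%9F%90%8D /
# %f0%9f%90%8d pairs in the good/bad lists.
assert unquote('%f0%9f%90%8d') == unquote('%F0%9F%90%8D') == '\U0001f40d'
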
@@ -299,26 +369,17 @@ def test_string_formatting(self):
         self.assertEqual(str(self.parser), self.expected_output)
 
 
-class RobotHandler(BaseHTTPRequestHandler):
-
-    def do_GET(self):
-        self.send_error(403, "Forbidden access")
-
-    def log_message(self, format, *args):
-        pass
-
-
 @unittest.skipUnless(
     support.has_socket_support,
     "Socket server requires working socket."
 )
-class PasswordProtectedSiteTestCase(unittest.TestCase):
+class BaseLocalNetworkTestCase:
 
     def setUp(self):
         # clear _opener global variable
         self.addCleanup(urllib.request.urlcleanup)
 
-        self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)
+        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
 
         self.t = threading.Thread(
             name='HTTPServer serving',
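
The only functional change in the hunk above is that the handler is now resolved through the instance (self.RobotHandler), so each concrete test case can nest its own handler class while reusing this shared setUp()/tearDown(). A tiny standalone sketch of that lookup pattern (made-up names, not part of the patch):

class _BaseCase:
    def handler(self):
        # Resolved on type(self) at runtime, i.e. on the concrete subclass.
        return self.Handler

class _AllowCase(_BaseCase):
    class Handler:
        kind = 'serves robots.txt'

class _DenyCase(_BaseCase):
    class Handler:
        kind = 'answers 403'

print(_AllowCase().handler().kind)  # serves robots.txt
print(_DenyCase().handler().kind)   # answers 403
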
@@ -335,6 +396,57 @@ def tearDown(self):
         self.t.join()
         self.server.server_close()
 
+
+SAMPLE_ROBOTS_TXT = b'''\
+User-agent: test_robotparser
+Disallow: /utf8/\xf0\x9f\x90\x8d
+Disallow: /non-utf8/\xf0
+Disallow: //[spam]/path
+'''
+
+
+class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+    class RobotHandler(BaseHTTPRequestHandler):
+
+        def do_GET(self):
+            self.send_response(200)
+            self.end_headers()
+            self.wfile.write(SAMPLE_ROBOTS_TXT)
+
+        def log_message(self, format, *args):
+            pass
+
+    @threading_helper.reap_threads
+    def testRead(self):
+        # Test that reading a weird robots.txt doesn't fail.
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + '/robots.txt'
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(robots_url)
+        parser.read()
+        # And it can even interpret the weird paths in some reasonable way.
+        agent = 'test_robotparser'
+        self.assertTrue(parser.can_fetch(agent, robots_url))
+        self.assertTrue(parser.can_fetch(agent, url + '/utf8/'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/%F0%9F%90%8D'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
+        self.assertTrue(parser.can_fetch(agent, url + '/non-utf8/'))
+        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/%F0'))
+        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/\U0001f40d'))
+        self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
+
+
+class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+    class RobotHandler(BaseHTTPRequestHandler):
+
+        def do_GET(self):
+            self.send_error(403, "Forbidden access")
+
+        def log_message(self, format, *args):
+            pass
+
     @threading_helper.reap_threads
     def testPasswordProtectedSite(self):
         addr = self.server.server_address
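
For reference, PasswordProtectedSiteTestCase (both before and after this refactor) exercises the long-standing RobotFileParser behaviour that a 401/403 answer for robots.txt is taken to mean "assume everything is disallowed", so can_fetch() returns False for any URL afterwards. A standalone sketch of that behaviour without a real server, using a mocked urlopen and a made-up host name (not part of the patch):

import io
import unittest.mock
import urllib.error
import urllib.robotparser

def _deny(url, *args, **kwargs):
    # Simulate a password-protected site: every request is answered with 403.
    raise urllib.error.HTTPError(url, 403, 'Forbidden', None, io.BytesIO())

parser = urllib.robotparser.RobotFileParser()
parser.set_url('http://example.invalid/robots.txt')
with unittest.mock.patch('urllib.request.urlopen', _deny):
    parser.read()

# A 401/403 on robots.txt disallows everything for every agent.
print(parser.can_fetch('*', 'http://example.invalid/anything'))  # False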