 from pymysql import Error


-def decode_page(page_bytes, charsets=('utf-8', )):
+# Decode the page using the specified charsets (not every site sets its charset to utf-8)
+def decode_page(page_bytes, charsets=('utf-8',)):
     page_html = None
     for charset in charsets:
         try:
@@ -20,7 +21,8 @@ def decode_page(page_bytes, charsets=('utf-8', )):
     return page_html


-def get_page_html(seed_url, *, retry_times=3, charsets=('utf-8', )):
+# Fetch the HTML of a page (retry the specified number of times, implemented with recursion)
+def get_page_html(seed_url, *, retry_times=3, charsets=('utf-8',)):
     page_html = None
     try:
         page_html = decode_page(urlopen(seed_url).read(), charsets)
@@ -32,32 +34,38 @@ def get_page_html(seed_url, *, retry_times=3, charsets=('utf-8', )):
     return page_html


+# Extract the needed parts from the page (usually links, which can be specified with a regular expression)
 def get_matched_parts(page_html, pattern_str, pattern_ignore_case=re.I):
     pattern_regex = re.compile(pattern_str, pattern_ignore_case)
     return pattern_regex.findall(page_html) if page_html else []


-def start_crawl(seed_url, match_pattern):
+# Start the crawler and persist the collected data
+def start_crawl(seed_url, match_pattern, *, max_depth=-1):
     conn = pymysql.connect(host='localhost', port=3306,
                            database='crawler', user='root',
                            password='123456', charset='utf8')
     try:
         with conn.cursor() as cursor:
             url_list = [seed_url]
+            visited_url_list = {seed_url: 0}
             while url_list:
                 current_url = url_list.pop(0)
-                page_html = get_page_html(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
-                links_list = get_matched_parts(page_html, match_pattern)
-                url_list += links_list
-                param_list = []
-                for link in links_list:
-                    page_html = get_page_html(link, charsets=('utf-8', 'gbk', 'gb2312'))
-                    headings = get_matched_parts(page_html, r'<h1>(.*)<span')
-                    if headings:
-                        param_list.append((headings[0], link))
-                cursor.executemany('insert into tb_result values (default, %s, %s)',
-                                   param_list)
-                conn.commit()
+                depth = visited_url_list[current_url]
+                if depth != max_depth:
+                    page_html = get_page_html(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
+                    links_list = get_matched_parts(page_html, match_pattern)
+                    param_list = []
+                    for link in links_list:
+                        if link not in visited_url_list:
+                            visited_url_list[link] = depth + 1
+                            page_html = get_page_html(link, charsets=('utf-8', 'gbk', 'gb2312'))
+                            headings = get_matched_parts(page_html, r'<h1>(.*)<span')
+                            if headings:
+                                param_list.append((headings[0], link))
+                    cursor.executemany('insert into tb_result values (default, %s, %s)',
+                                       param_list)
+                    conn.commit()
     except Error:
         pass
         # logging.error('SQL:', error)
@@ -67,8 +75,9 @@ def start_crawl(seed_url, match_pattern):

 def main():
     ssl._create_default_https_context = ssl._create_unverified_context
-    start_crawl('http://sports.sohu.com/nba_a.shtml',
-                r'<a[^>]+test=a\s[^>]*href=["\'](.*?)["\']')
+    start_crawl('http://sports.sohu.com/nba_a.shtml',
+                r'<a[^>]+test=a\s[^>]*href=["\'](.*?)["\']',
+                max_depth=2)


 if __name__ == '__main__':
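
For reference, a minimal sketch of the tb_result table that start_crawl writes into. Only the shape is implied by the insert statement above (an auto-increment id filled by DEFAULT, plus the heading and the link); the column names and sizes below are assumptions.

import pymysql


# Hypothetical schema for tb_result in the 'crawler' database; the column names
# (title, url) and varchar sizes are assumptions, only the id/heading/link shape
# comes from the INSERT used in start_crawl.
def create_result_table():
    conn = pymysql.connect(host='localhost', port=3306,
                           database='crawler', user='root',
                           password='123456', charset='utf8')
    try:
        with conn.cursor() as cursor:
            cursor.execute('''
                create table if not exists tb_result
                (
                    id int unsigned auto_increment primary key,
                    title varchar(1000) not null,
                    url varchar(1000) not null
                ) default charset=utf8
            ''')
        conn.commit()
    finally:
        conn.close()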
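
The max_depth change records each URL's depth in visited_url_list and stops expanding once the limit is reached, but the new loop never pushes discovered links back onto url_list, so only the seed page and its direct links are ever fetched. Below is a minimal, self-contained sketch of the depth-limited breadth-first loop the change appears to be aiming for; fetch_links is a hypothetical stand-in for get_page_html plus get_matched_parts, and enqueueing the newly found links is an assumption about the intended behaviour.

# Depth-limited, de-duplicated breadth-first crawl (a sketch, not the project's code).
# fetch_links(url) is assumed to return the list of links extracted from that page.
def crawl(seed_url, fetch_links, *, max_depth=-1):
    url_list = [seed_url]              # FIFO queue of URLs waiting to be processed
    visited_url_list = {seed_url: 0}   # URL -> depth at which it was discovered
    while url_list:
        current_url = url_list.pop(0)
        depth = visited_url_list[current_url]
        if depth == max_depth:         # with the default -1 the crawl is unbounded
            continue
        for link in fetch_links(current_url):
            if link not in visited_url_list:
                visited_url_list[link] = depth + 1
                url_list.append(link)  # enqueue so this page's own links are crawled too
    return visited_url_list

With max_depth=2, as passed in main(), the seed sits at depth 0, its links at depth 1 are expanded, and URLs discovered at depth 2 are recorded but not expanded further.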