 
 import re
 import pymysql
+import ssl
 
+from pymysql import Error
 
-def get_page_code(start_url, *, retry_times=3, charsets=('utf-8', )):
+
+def decode_page(page_bytes, charsets=('utf-8', )):
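+    # try the candidate charsets in order and stop at the first successful decode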
+    page_html = None
+    for charset in charsets:
+        try:
+            page_html = page_bytes.decode(charset)
+            break
+        except UnicodeDecodeError:
+            pass
+            # logging.error('Decode:', error)
+    return page_html
+
+
+def get_page_html(seed_url, *, retry_times=3, charsets=('utf-8', )):
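+    # fetch the page bytes and decode them, retrying up to retry_times on URLError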
+    page_html = None
     try:
-        for charset in charsets:
-            try:
-                html = urlopen(start_url).read().decode(charset)
-                break
-            except UnicodeDecodeError:
-                html = None
-    except URLError as ex:
-        print('Error:', ex)
-        return get_page_code(start_url, retry_times=retry_times - 1, charsets=charsets) if \
-            retry_times > 0 else None
-    return html
+        page_html = decode_page(urlopen(seed_url).read(), charsets)
+    except URLError:
+        # logging.error('URL:', error)
+        if retry_times > 0:
+            return get_page_html(seed_url, retry_times=retry_times - 1,
+                                 charsets=charsets)
+    return page_html
+
+
+def get_matched_parts(page_html, pattern_str, pattern_ignore_case=re.I):
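+    # collect every regex match from the page, or an empty list if the page could not be fetched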
+    pattern_regex = re.compile(pattern_str, pattern_ignore_case)
+    return pattern_regex.findall(page_html) if page_html else []
+
+
+def start_crawl(seed_url, match_pattern):
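+    # breadth-first crawl from the seed URL: list pages yield links, detail pages yield <h1> headings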
+    conn = pymysql.connect(host='localhost', port=3306,
+                           database='crawler', user='root',
+                           password='123456', charset='utf8')
+    try:
+        with conn.cursor() as cursor:
+            url_list = [seed_url]
+            while url_list:
+                current_url = url_list.pop(0)
+                page_html = get_page_html(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
+                links_list = get_matched_parts(page_html, match_pattern)
+                url_list += links_list
+                param_list = []
+                for link in links_list:
+                    page_html = get_page_html(link, charsets=('utf-8', 'gbk', 'gb2312'))
+                    headings = get_matched_parts(page_html, r'<h1>(.*)<span')
+                    if headings:
+                        param_list.append((headings[0], link))
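+                # batch-insert the (heading, link) pairs; tb_result is assumed to have an auto-increment id plus title and url columns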
+                cursor.executemany('insert into tb_result values (default, %s, %s)',
+                                   param_list)
+                conn.commit()
+    except Error:
+        pass
+        # logging.error('SQL:', error)
+    finally:
+        conn.close()
 
 
 def main():
-    url_list = ['http://sports.sohu.com/nba_a.shtml']
-    visited_list = set({})
-    while len(url_list) > 0:
-        current_url = url_list.pop(0)
-        visited_list.add(current_url)
-        print(current_url)
-        html = get_page_code(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
-        if html:
-            link_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
-            link_list = re.findall(link_regex, html)
-            url_list += link_list
-            conn = pymysql.connect(host='localhost', port=3306,
-                                   db='crawler', user='root',
-                                   passwd='123456', charset='utf8')
-            try:
-                for link in link_list:
-                    if link not in visited_list:
-                        visited_list.add(link)
-                        print(link)
-                        html = get_page_code(link, charsets=('utf-8', 'gbk', 'gb2312'))
-                        if html:
-                            title_regex = re.compile(r'<h1>(.*)<span', re.IGNORECASE)
-                            match_list = title_regex.findall(html)
-                            if len(match_list) > 0:
-                                title = match_list[0]
-                                with conn.cursor() as cursor:
-                                    cursor.execute('insert into tb_result (rtitle, rurl) values (%s, %s)',
-                                                   (title, link))
-                                    conn.commit()
-            finally:
-                conn.close()
-    print('执行完成!')
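+    # globally disable HTTPS certificate verification (a quick workaround, not suitable for production)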
+    ssl._create_default_https_context = ssl._create_unverified_context
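+    # the link pattern only matches anchors that carry a test=a attribute before href (as on the Sohu list pages)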
+    start_crawl('http://sports.sohu.com/nba_a.shtml',
+                r'<a[^>]+test=a\s[^>]*href=["\'](.*?)["\']')
 
 
 if __name__ == '__main__':
     main()
-