 
 import re
 import pymysql
+import ssl
 
+from pymysql import Error
 
-def get_page_code(start_url, *, retry_times=3, charsets=('utf-8', )):
+
+def decode_page(page_bytes, charsets=('utf-8', )):
+    page_html = None
+    for charset in charsets:
+        try:
+            page_html = page_bytes.decode(charset)
+            break
+        except UnicodeDecodeError:
+            pass
+            # logging.error('Decode:', error)
+    return page_html
+
+
+def get_page_html(seed_url, *, retry_times=3, charsets=('utf-8', )):
+    page_html = None
     try:
-        for charset in charsets:
-            try:
-                html = urlopen(start_url).read().decode(charset)
-                break
-            except UnicodeDecodeError:
-                html = None
-    except URLError as ex:
-        print('Error:', ex)
-        return get_page_code(start_url, retry_times=retry_times - 1, charsets=charsets) if \
-            retry_times > 0 else None
-    return html
+        page_html = decode_page(urlopen(seed_url).read(), charsets)
+    except URLError:
+        # logging.error('URL:', error)
+        if retry_times > 0:
+            return get_page_html(seed_url, retry_times=retry_times - 1,
+                                 charsets=charsets)
+    return page_html
+
+
+def get_matched_parts(page_html, pattern_str, pattern_ignore_case=re.I):
+    pattern_regex = re.compile(pattern_str, pattern_ignore_case)
+    return pattern_regex.findall(page_html) if page_html else []
+
+
+def start_crawl(seed_url, match_pattern):
+    conn = pymysql.connect(host='localhost', port=3306,
+                           database='crawler', user='root',
+                           password='123456', charset='utf8')
+    try:
+        with conn.cursor() as cursor:
+            url_list = [seed_url]
+            while url_list:
+                current_url = url_list.pop(0)
+                page_html = get_page_html(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
+                links_list = get_matched_parts(page_html, match_pattern)
+                url_list += links_list
+                param_list = []
+                for link in links_list:
+                    page_html = get_page_html(link, charsets=('utf-8', 'gbk', 'gb2312'))
+                    headings = get_matched_parts(page_html, r'<h1>(.*)<span')
+                    if headings:
+                        param_list.append((headings[0], link))
+                cursor.executemany('insert into tb_result values (default, %s, %s)',
+                                   param_list)
+                conn.commit()
+    except Error:
+        pass
+        # logging.error('SQL:', error)
+    finally:
+        conn.close()
 
 
 def main():
-    url_list = ['http://sports.sohu.com/nba_a.shtml']
-    visited_list = set({})
-    while len(url_list) > 0:
-        current_url = url_list.pop(0)
-        visited_list.add(current_url)
-        print(current_url)
-        html = get_page_code(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
-        if html:
-            link_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
-            link_list = re.findall(link_regex, html)
-            url_list += link_list
-            conn = pymysql.connect(host='localhost', port=3306,
-                                   db='crawler', user='root',
-                                   passwd='123456', charset='utf8')
-            try:
-                for link in link_list:
-                    if link not in visited_list:
-                        visited_list.add(link)
-                        print(link)
-                        html = get_page_code(link, charsets=('utf-8', 'gbk', 'gb2312'))
-                        if html:
-                            title_regex = re.compile(r'<h1>(.*)<span', re.IGNORECASE)
-                            match_list = title_regex.findall(html)
-                            if len(match_list) > 0:
-                                title = match_list[0]
-                                with conn.cursor() as cursor:
-                                    cursor.execute('insert into tb_result (rtitle, rurl) values (%s, %s)',
-                                                   (title, link))
-                                conn.commit()
-            finally:
-                conn.close()
-    print('执行完成!')
+    ssl._create_default_https_context = ssl._create_unverified_context
+    start_crawl('http://sports.sohu.com/nba_a.shtml',
+                r'<a[^>]+test=a\s[^>]*href=["\'](.*?)["\']')
 
 
 if __name__ == '__main__':
     main()
-
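The rewritten start_crawl writes rows with "insert into tb_result values (default, %s, %s)", so it assumes a three-column tb_result table in the crawler database whose first column MySQL can fill on its own (typically an auto-increment primary key). The schema itself is not part of this change; the snippet below is only a sketch of a compatible table, reusing the rtitle/rurl column names from the old version of the script and an assumed rid key.

# Hypothetical setup script -- the real schema is not shown in this commit.
# rtitle/rurl follow the previous version of the crawler; rid is an assumed
# auto-increment key so that "values (default, %s, %s)" has a column to fill.
import pymysql

conn = pymysql.connect(host='localhost', port=3306, database='crawler',
                       user='root', password='123456', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute("""
            create table if not exists tb_result (
                rid int unsigned auto_increment primary key,
                rtitle varchar(200) not null,
                rurl varchar(1024) not null
            ) default charset=utf8
        """)
    conn.commit()
finally:
    conn.close()

With a table of this shape, the default placeholder in the INSERT lets MySQL generate the key while executemany supplies only the (title, link) pairs collected from each page.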