
Commit 452b6f1

committed
Updated the day-1 crawler code
1 parent 402e056 commit 452b6f1

3 files changed: +137 lines, -0 lines


Day66-75/code/example01.py

+60
@@ -0,0 +1,60 @@
from urllib.error import URLError
from urllib.request import urlopen

import re
import pymysql


def get_page_code(start_url, *, retry_times=3, charsets=('utf-8', )):
    """Download a page and decode it, trying each candidate charset in turn."""
    html = None
    try:
        # Fetch the raw bytes once, then try the charsets until one decodes cleanly
        page_bytes = urlopen(start_url).read()
        for charset in charsets:
            try:
                html = page_bytes.decode(charset)
                break
            except UnicodeDecodeError:
                html = None
    except URLError as ex:
        # On a network error, retry up to retry_times before giving up
        print('Error:', ex)
        return get_page_code(start_url, retry_times=retry_times - 1, charsets=charsets) if \
            retry_times > 0 else None
    return html


def main():
    url_list = ['http://sports.sohu.com/nba_a.shtml']
    visited_list = set()
    while len(url_list) > 0:
        current_url = url_list.pop(0)
        visited_list.add(current_url)
        print(current_url)
        html = get_page_code(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
        if html:
            # Collect every hyperlink on the page and queue it for crawling
            link_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
            link_list = re.findall(link_regex, html)
            url_list += link_list
            conn = pymysql.connect(host='localhost', port=3306,
                                   db='crawler', user='root',
                                   passwd='123456', charset='utf8')
            try:
                for link in link_list:
                    if link not in visited_list:
                        visited_list.add(link)
                        print(link)
                        html = get_page_code(link, charsets=('utf-8', 'gbk', 'gb2312'))
                        if html:
                            # Pull the article title out of the <h1> tag and store it with the URL
                            title_regex = re.compile(r'<h1>(.*)<span', re.IGNORECASE)
                            match_list = title_regex.findall(html)
                            if len(match_list) > 0:
                                title = match_list[0]
                                with conn.cursor() as cursor:
                                    cursor.execute('insert into tb_result (rtitle, rurl) values (%s, %s)',
                                                   (title, link))
                                conn.commit()
            finally:
                conn.close()
    print('Execution completed!')


if __name__ == '__main__':
    main()
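
Note: example01.py inserts into a tb_result table in a crawler database, but the commit does not include the schema. A minimal sketch of one possible layout, created through pymysql (only the rtitle and rurl columns appear in the commit; the rid key and the column widths are assumptions):

import pymysql

# Hypothetical schema for the tb_result table used by example01.py;
# column widths and the rid primary key are assumed, not part of the commit.
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='123456', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute('create database if not exists crawler default character set utf8')
        cursor.execute('use crawler')
        cursor.execute('create table if not exists tb_result ('
                       'rid int auto_increment primary key, '
                       'rtitle varchar(200) not null, '
                       'rurl varchar(1000) not null)')
    conn.commit()
finally:
    conn.close()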

Day66-75/code/example02.py

+50
@@ -0,0 +1,50 @@
from bs4 import BeautifulSoup

import re


def main():
    # Sample document used to demonstrate the BeautifulSoup lookup APIs
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>首页</title>
    </head>
    <body>
        <h1>Hello, world!</h1>
        <p>Good!!!</p>
        <hr>
        <div>
            <h2>这是一个例子程序</h2>
            <p>静夜思</p>
            <p class="foo">床前明月光</p>
            <p id="bar">疑似地上霜</p>
            <p class="foo">举头望明月</p>
            <div><a href="http://www.baidu.com"><p>低头思故乡</p></a></div>
        </div>
        <a class="foo" href="http://www.qq.com">腾讯网</a>
        <img src="./img/pretty-girl.png" alt="美女">
        <img src="./img/hellokitty.png" alt="凯蒂猫">
        <img src="./static/img/pretty-girl.png" alt="美女">
        <goup>Hello, Goup!</goup>
    </body>
    </html>
    """
    # resp = requests.get('http://sports.sohu.com/nba_a.shtml')
    # html = resp.content.decode('gbk')
    soup = BeautifulSoup(html, 'lxml')
    # The <title> element (similar to document.title in JavaScript)
    print(soup.title)
    # JavaScript: document.body.h1
    # JavaScript: document.forms[0]
    print(soup.body.h1)
    # Tags whose name ends with 'p' (<p> and <goup>)
    print(soup.find_all(re.compile(r'p$')))
    # <img> tags whose src attribute matches the regular expression
    print(soup.find_all('img', {'src': re.compile(r'\./img/\w+\.png')}))
    # Tags that carry exactly two attributes
    print(soup.find_all(lambda x: len(x.attrs) == 2))
    # <p> tags with class "foo"
    print(soup.find_all('p', {'class': 'foo'}))
    # CSS selector: every <a> element that has an href attribute
    for elem in soup.select('a[href]'):
        print(elem.attrs['href'])


if __name__ == '__main__':
    main()
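
For reference, several of the find_all lookups in example02.py can also be expressed with CSS selector syntax through BeautifulSoup's select and select_one methods. A short self-contained sketch (the pared-down markup below is an assumption, not part of the commit):

from bs4 import BeautifulSoup

# A trimmed stand-in for the sample document in example02.py (assumed markup)
html = '<div><p class="foo">one</p><p id="bar">two</p>' \
       '<a class="foo" href="http://www.qq.com">qq</a></div>'
soup = BeautifulSoup(html, 'lxml')
print(soup.select('p.foo'))       # same result as soup.find_all('p', {'class': 'foo'})
print(soup.select('p#bar'))       # the <p> element whose id is "bar"
print(soup.select_one('a[href]').attrs['href'])  # href of the first <a> carrying an href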

Day66-75/code/example03.py

+27
@@ -0,0 +1,27 @@
from bs4 import BeautifulSoup

import requests

import re


def main():
    # Fetch the page with the get function from the third-party requests library
    resp = requests.get('http://sports.sohu.com/nba_a.shtml')
    # Decode the response body (bytes) - some Sohu pages use the GBK encoding
    html = resp.content.decode('gbk')
    # Create a BeautifulSoup object to parse the page (comparable to the DOM in JavaScript)
    bs = BeautifulSoup(html, 'lxml')
    # Find elements with CSS selector syntax and process them in a loop
    # for elem in bs.find_all(lambda x: 'test' in x.attrs):
    for elem in bs.select('a[test]'):
        # Read the element's attribute values through the attrs property (a dict)
        link_url = elem.attrs['href']
        resp = requests.get(link_url)
        bs_sub = BeautifulSoup(resp.text, 'lxml')
        # Post-process the extracted text with a regular expression
        print(re.sub(r'[\r\n]', '', bs_sub.find('h1').text))


if __name__ == '__main__':
    main()
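
example03.py requests each extracted href as-is and assumes every linked page has an <h1>; a relative link, a failed request, or a page without an <h1> would make the loop raise. A more defensive variant, as a sketch (the urljoin call, timeout, and error handling are additions, not part of the commit):

from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests
import re

base_url = 'http://sports.sohu.com/nba_a.shtml'
resp = requests.get(base_url)
bs = BeautifulSoup(resp.content.decode('gbk'), 'lxml')
for elem in bs.select('a[test]'):
    # Resolve possibly relative hrefs against the page URL before requesting them
    link_url = urljoin(base_url, elem.attrs['href'])
    try:
        sub_resp = requests.get(link_url, timeout=10)
        sub_resp.raise_for_status()
    except requests.RequestException as ex:
        print('Error:', ex)
        continue
    title_tag = BeautifulSoup(sub_resp.text, 'lxml').find('h1')
    if title_tag:
        print(re.sub(r'[\r\n]', '', title_tag.text))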
