
Commit b872556

committed
Updated the crawler code for Day 3
1 parent 8adc031 · commit b872556

File tree

4 files changed: +109 -2 lines changed


Day66-75/02.数据采集和解析.md

+1 -1

@@ -87,7 +87,7 @@
 
 > Note: for more details, see the BeautifulSoup [official documentation]().
 
-### Example - Getting question links from Zhihu Explore
+### Worked example - Getting question links from Zhihu Explore
 
 ```Python
 from urllib.parse import urljoin

Day66-75/03.存储数据.md

+58
@@ -197,5 +197,63 @@ b'admin'
 
 
 
+### Worked example - Caching links and page source from Zhihu Explore
+
+```Python
+
+from hashlib import sha1
+from urllib.parse import urljoin
+
+import pickle
+import re
+import requests
+import zlib
+
+from bs4 import BeautifulSoup
+from redis import Redis
+
+
+def main():
+    # Specify the seed page
+    base_url = 'https://www.zhihu.com/'
+    seed_url = urljoin(base_url, 'explore')
+    # Create the Redis client
+    client = Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
+    # Set a user agent (otherwise the request will be rejected)
+    headers = {'user-agent': 'Baiduspider'}
+    # Send a GET request through the requests module with the user agent above
+    resp = requests.get(seed_url, headers=headers)
+    # Create a BeautifulSoup object, using lxml as the parser
+    soup = BeautifulSoup(resp.text, 'lxml')
+    href_regex = re.compile(r'^/question')
+    # Find all <a> tags whose href attribute starts with /question
+    for a_tag in soup.find_all('a', {'href': href_regex}):
+        # Get the href attribute of the <a> tag and assemble the full URL
+        href = a_tag.attrs['href']
+        full_url = urljoin(base_url, href)
+        # Turn the URL into a SHA1 digest (fixed length, more compact)
+        hasher = sha1()
+        hasher.update(full_url.encode('utf-8'))
+        field_key = hasher.hexdigest()
+        # If the Redis hash at key 'zhihu' has no entry for this digest, fetch the page and cache it
+        if not client.hexists('zhihu', field_key):
+            html_page = requests.get(full_url, headers=headers).text
+            # Serialize and compress the page
+            zipped_page = zlib.compress(pickle.dumps(html_page))
+            # Store the URL digest and the corresponding page source in the hash
+            client.hset('zhihu', field_key, zipped_page)
+    # Show how many pages have been cached in total
+    print('Total %d question pages found.' % client.hlen('zhihu'))
+
+
+if __name__ == '__main__':
+    main()
+
+```
+
 
 
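Not part of the commit itself, but a natural companion to the example above: since each page is stored in the 'zhihu' hash as pickle-serialized, zlib-compressed bytes, reading a cached page back just reverses those two steps. A minimal sketch, assuming the same placeholder Redis connection settings used above; `load_cached_pages` is a hypothetical helper name, not something from the repository:

```Python
import pickle
import zlib

from redis import Redis


def load_cached_pages():
    # Same placeholder connection settings as the crawler above
    client = Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
    # Every field in the 'zhihu' hash is the SHA1 digest of a question URL
    for field_key in client.hkeys('zhihu'):
        zipped_page = client.hget('zhihu', field_key)
        # Reverse the caching steps: decompress first, then unpickle the HTML string
        html_page = pickle.loads(zlib.decompress(zipped_page))
        print(field_key.decode('utf-8'), len(html_page))


if __name__ == '__main__':
    load_cached_pages()
```

The recovered `html_page` is the original page source, so it can be fed straight back into BeautifulSoup for offline parsing.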

Day66-75/code/example05.py

+1 -1

@@ -50,7 +50,7 @@ def get_matched_parts(page_html, pattern_str, pattern_ignore_case=re.I):
 
 # Start running the crawler
 def start_crawl(seed_url, match_pattern, *, max_depth=-1):
-    client = redis.Redis(host='120.77.222.217', port=11223, password='1qaz2wsx')
+    client = redis.Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
     charsets = ('utf-8', 'gbk', 'gb2312')
     logging.info('[Redis ping]', client.ping())
     url_list = [seed_url]

Day66-75/code/example06.py

+49
@@ -0,0 +1,49 @@
+
+from hashlib import sha1
+from urllib.parse import urljoin
+
+import pickle
+import re
+import requests
+import zlib
+
+from bs4 import BeautifulSoup
+from redis import Redis
+
+
+def main():
+    # Specify the seed page
+    base_url = 'https://www.zhihu.com/'
+    seed_url = urljoin(base_url, 'explore')
+    # Create the Redis client
+    client = Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
+    # Set a user agent
+    headers = {'user-agent': 'Baiduspider'}
+    # Send a GET request through the requests module with the user agent above
+    resp = requests.get(seed_url, headers=headers)
+    # Create a BeautifulSoup object, using lxml as the parser
+    soup = BeautifulSoup(resp.text, 'lxml')
+    href_regex = re.compile(r'^/question')
+    # Find all <a> tags whose href attribute starts with /question
+    for a_tag in soup.find_all('a', {'href': href_regex}):
+        # Get the href attribute of the <a> tag and assemble the full URL
+        href = a_tag.attrs['href']
+        full_url = urljoin(base_url, href)
+        # Turn the URL into a SHA1 digest (fixed length, more compact)
+        hasher = sha1()
+        hasher.update(full_url.encode('utf-8'))
+        field_key = hasher.hexdigest()
+        # If the Redis hash at key 'zhihu' has no entry for this digest, fetch the page and cache it
+        if not client.hexists('zhihu', field_key):
+            html_page = requests.get(full_url, headers=headers).text
+            # Serialize and compress the page
+            zipped_page = zlib.compress(pickle.dumps(html_page))
+            # Store the URL digest and the corresponding page source in the hash
+            client.hset('zhihu', field_key, zipped_page)
+    # Show how many pages have been cached in total
+    print('Total %d question pages found.' % client.hlen('zhihu'))
+
+
+if __name__ == '__main__':
+    main()
+
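A brief aside on the design (an observation, not something stated in the commit): hashing each full URL with SHA1 produces a fixed 40-character hex digest, so `hexists` against the 'zhihu' hash works as a cheap "already cached?" membership check regardless of URL length. A hypothetical helper that isolates that check, assuming the same hash key and digest scheme as above:

```Python
from hashlib import sha1


def is_cached(client, url):
    """Return True if the page for this URL is already cached in the 'zhihu' hash."""
    # Derive the field key exactly the way the crawler does
    field_key = sha1(url.encode('utf-8')).hexdigest()
    return client.hexists('zhihu', field_key)
```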
