File tree: 1 file changed, +5 -3 lines changed
1 file changed, +5 -3 lines changed
Original file line number | Diff line number | Diff line change
@@ -17,20 +17,22 @@ def main():
17
17
seed_url = urljoin (base_url , 'explore' )
18
18
# 创建Redis客户端
19
19
client = Redis (host = '1.2.3.4' , port = 6379 , password = '1qaz2wsx' )
20
- # 设置用户代理
20
+ # 设置用户代理(否则访问会被拒绝)
21
21
headers = {'user-agent' : 'Baiduspider' }
22
22
# 通过requests模块发送GET请求并指定用户代理
23
23
resp = requests .get (seed_url , headers = headers )
24
24
# 创建BeautifulSoup对象并指定使用lxml作为解析器
25
25
soup = BeautifulSoup (resp .text , 'lxml' )
26
26
href_regex = re .compile (r'^/question' )
27
+ # 将URL处理成SHA1摘要(长度固定更简短)
28
+ hasher_proto = sha1 ()
27
29
# 查找所有href属性以/question打头的a标签
28
30
for a_tag in soup .find_all ('a' , {'href' : href_regex }):
29
31
# 获取a标签的href属性值并组装完整的URL
30
32
href = a_tag .attrs ['href' ]
31
33
full_url = urljoin (base_url , href )
32
- # 将URL处理成SHA1摘要(长度固定更简短)
33
- hasher = sha1 ()
34
+ # 传入URL生成SHA1摘要
35
+ hasher = hasher_proto . copy ()
34
36
hasher .update (full_url .encode ('utf-8' ))
35
37
field_key = hasher .hexdigest ()
36
38
# 如果Redis的键'zhihu'对应的hash数据类型中没有URL的摘要就访问页面并缓存
You can’t perform that action at this time.
0 commit comments