Commit b4b53b4

Author: xingag
Commit message: Scrape the image and text data from 百思不得姐 with multiple threads and save it to a local csv file
1 parent: 86f3583 · commit: b4b53b4

File tree: 2 files changed (+216 lines, −2 lines)


README.md

Lines changed: 11 additions & 2 deletions
@@ -1,13 +1,22 @@
 # spider_python
+## Basic spiders
+
 * [Scrape the latest movie data from 电影天堂 - xpath](./spiders/spider_dytt.py)
 
 * [Scrape Tencent job posting data - xpath](./spiders/spider_tencent_recruit.py)
 
-
-
 * [Scrape nationwide weather data from 中国天气网 and draw a pie chart - bs4](./spiders/spider_china_weather.py)
+
 * [Scrape data from 古诗词网 - re](./spiders/spider_gushiwen.py)
+
 * [Scrape joke data from 糗事百科 - re](./spiders/spider_qiu_shi_bai_ke.py)
+
+
+
+
+## Multi-threaded spiders
+
 * [Download meme images from 斗图吧 with multiple threads - xpath + threading](./spiders/spider_dou_tu_la.py)
+* [Scrape the text and image data from 百思不得姐 with multiple threads and write it to csv](./spiders/spider_bai_si_bu_de_jie.py)
spiders/spider_bai_si_bu_de_jie.py

Lines changed: 205 additions & 0 deletions
@@ -0,0 +1,205 @@
#!/usr/bin/env python
# encoding: utf-8

"""
@version: v1.0
@author: xag
@license: Apache Licence

@site: http://www.xingag.top
@software: PyCharm
@file: spider_bai_si_bu_de_jie.py
@time: 2018/9/25 19:58
@description: scrape the text and images of the 百思不得姐 site with multiple threads and save the results to a csv file
"""

import requests
from lxml import etree
import threading
from queue import Queue
import time
import csv
from urllib import request
import fileutils  # local helper module, not part of this commit; see the sketch below

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Referer': 'http://www.budejie.com/hot/1'
}


class BSSpider(threading.Thread):
    """
    Producer: scrape the data of one page at a time.
    """

    def __init__(self, page_queue, joke_queue, name, *args, **kwargs):
        super(BSSpider, self).__init__(*args, **kwargs)

        # 1. Initialize state
        self.page_queue = page_queue
        self.joke_queue = joke_queue
        self.name = name

    def run(self):
        while True:
            # 2. Exit the loop once the page queue is empty
            if self.page_queue.empty():
                print(self.name + ' finished~')
                break

            # 3. Take a page url off the queue
            page_url = self.page_queue.get()
            self.spider_page(page_url)

            # 6. Sleep for 0.5 seconds
            time.sleep(0.5)

    def spider_page(self, page_url):
        """
        Scrape a single page.
        :param page_url: url of the page
        :return:
        """
        response = requests.get(page_url, headers=HEADERS)
        text_raw = response.text
        html_element = etree.HTML(text_raw)

        # 4. Parse the page with xpath
        div_elements = html_element.xpath('//div[@class="j-r-list"]')

        for div_element in div_elements:
            duan_zi_elements = div_element.xpath('./ul/li')
            for duan_zi_element in duan_zi_elements:
                # [data] username
                username = duan_zi_element.xpath('.//a[@class="u-user-name"]/text()')[0]

                # [data] publish time of the joke
                pubtime = duan_zi_element.xpath('.//span/text()')[0]

                desc_element = duan_zi_element.xpath('.//div[@class="j-r-list-c-desc"]')[0]
                # [data] text content of the joke
                content = desc_element.xpath('./a/text()')[0]

                img_div_element = duan_zi_element.xpath('.//div[@class="j-r-list-c-img"]')[0]
                img = img_div_element.xpath('.//img/@data-original')[0]
                alt = img_div_element.xpath('.//img/@alt')[0]

                # 5. Put the parsed data into the queue as a tuple
                self.joke_queue.put((username, content, img, alt, pubtime))


class BSWriter(threading.Thread):
    """
    Consumer: write the text data to the csv file.
    """

    def __init__(self, page_queue, joke_queue, writer, gLock, name, *args, **kwargs):
        super(BSWriter, self).__init__(*args, **kwargs)

        # 1. Initialize state
        self.page_queue = page_queue
        self.joke_queue = joke_queue
        self.writer = writer
        self.gLock = gLock
        self.name = name

    def run(self):
        while True:
            if self.joke_queue.empty() and self.page_queue.empty():
                print(self.name + ' finished~')
                break

            # 2. Take one record off the joke_queue
            joke_info = self.joke_queue.get(timeout=40)
            username, content, img, alt, pubtime = joke_info

            # 3. Acquire the lock
            self.gLock.acquire()

            # 4. Write the record to the csv file
            self.writer.writerow((username, content, img, alt, pubtime))

            # 5. Release the lock
            self.gLock.release()

            print('wrote one record')


class BSDownImg(threading.Thread):
    """
    Consumer: download the images.
    """

    def __init__(self, page_queue, joke_queue, gLock, name, *args, **kwargs):
        super(BSDownImg, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.joke_queue = joke_queue
        self.gLock = gLock
        self.name = name

    def run(self):
        while True:
            if self.joke_queue.empty() and self.page_queue.empty():
                print(self.name + ' finished~')
                break
            username, content, img, alt, pubtime = self.joke_queue.get(timeout=40)

            # Acquire the lock and download the image
            self.gLock.acquire()
            file_name = alt + fileutils.get_file_suffix(img)
            request.urlretrieve(img, './imgs/%s' % file_name)
            self.gLock.release()

            print('downloaded one image')


def spider():
    """
    Scrape the first 10 pages of the 百思不得姐 site.
    :return:
    """

    # 1. Build the queues [shared between producers and consumers]
    page_queue = Queue(20)
    joke_queue = Queue(200)

    # 2. Lock object
    gLock = threading.Lock()

    # 3. Open the csv file for writing
    fp = open('jokes.csv', 'a', newline='', encoding='utf-8')
    writer = csv.writer(fp)

    # 4. Write the csv header row
    writer.writerow(['username', 'content', 'img', 'alt', 'pubtime'])

    # 5. Put the urls of the first 10 pages into the queue
    for page_num in range(1, 11):
        page_url = 'http://www.budejie.com/hot/%d' % page_num
        page_queue.put(page_url)

    # 6. Start 5 producers to do the scraping
    for x in range(1, 6):
        t = BSSpider(page_queue, joke_queue, name='producer-%d' % x)
        t.start()

    # 7. Start 20 consumers to write records to the csv file
    for x in range(1, 21):
        t = BSWriter(page_queue, joke_queue, writer, gLock, name='consumer-text-%d' % x)
        t.start()

    # 8. Start 50 consumers to download the images
    for x in range(1, 51):
        t = BSDownImg(page_queue, joke_queue, gLock, name='consumer-image-%d' % x)
        t.start()


if __name__ == '__main__':
    spider()
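
Note: `fileutils` is a local helper module that is not included in this commit; only its `get_file_suffix` function is used. A minimal sketch of what it presumably provides, assuming it returns a url's file extension including the leading dot (the behavior is inferred, not confirmed by the source):

# fileutils.py - hypothetical sketch; the real module is not part of this commit
import os
from urllib.parse import urlparse


def get_file_suffix(url):
    """Return the file extension of a url, including the leading dot (e.g. '.jpg')."""
    path = urlparse(url).path  # drop any query string before splitting off the extension
    return os.path.splitext(path)[1]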

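One caveat in the threading design above: each consumer checks `empty()` and then calls `get(timeout=40)` as two separate steps, so with 70 consumer threads a queue can drain between the check and the get, leaving a thread blocked until the timeout raises queue.Empty; the jokes.csv file handle is also never closed. A common alternative, sketched here under the assumption that producers finish on their own, is to shut consumers down with sentinel values and join the threads before closing the file:

# Sentinel-based shutdown - a minimal sketch, not the committed implementation
import csv
import threading
from queue import Queue

SENTINEL = None  # one sentinel per consumer signals shutdown


def writer_worker(joke_queue, writer, lock):
    while True:
        item = joke_queue.get()  # blocking get: no empty-check race
        if item is SENTINEL:
            break
        with lock:  # serialize writes; csv writers are not documented as thread-safe
            writer.writerow(item)


def run(produce_all, num_consumers=4):
    joke_queue = Queue()
    lock = threading.Lock()
    with open('jokes.csv', 'a', newline='', encoding='utf-8') as fp:
        writer = csv.writer(fp)
        workers = [threading.Thread(target=writer_worker, args=(joke_queue, writer, lock))
                   for _ in range(num_consumers)]
        for w in workers:
            w.start()
        produce_all(joke_queue)  # caller fills the queue (e.g. the BSSpider logic)
        for _ in workers:
            joke_queue.put(SENTINEL)  # one sentinel per consumer
        for w in workers:
            w.join()  # the file is closed only after every write has finished

Because every `get()` blocks until an item or a sentinel arrives, no timeout tuning is needed, and the `with open(...)` block guarantees the csv file is flushed and closed once all workers have joined.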