Commit 53030d2

Author: liuhao (committed)
Commit message: localchange
1 parent f24954a commit 53030d2

File tree: 1 file changed, +177 −183 lines

Comics/Comics/spiders/comics.py

Lines changed: 177 additions & 183 deletions
@@ -1,191 +1,185 @@
-#coding:utf-8
+# coding:utf-8
 
 import scrapy
 import os
 import urllib
 import zlib
 from bs4 import BeautifulSoup
 
-class Comics(scrapy.Spider):
-
-    name = "comics"
-    # allowed_domains = ['http://www.xeall.com']
-    # start_urls = ['http://www.xeall.com/shenshi/p20.html']
-
-    def start_requests(self):
-        # urls = ['http://www.xeall.com/shenshi/6458.html']
-        urls = ['http://www.xeall.com/shenshi']
-        for url in urls:
-            yield scrapy.Request(url=url, callback=self.parse)
-            # yield scrapy.Request(url=url, callback=self.comics_parse)
-
-    def parse(self, response):
-        # HTML source returned by the request
-        content = response.body;
-        if not content:
-            self.log('parse body error.')
-            return
-
-        # Parse the nodes with BeautifulSoup
-        soup = BeautifulSoup(content, "html5lib")
-
-        # Tag that contains the comics list
-        listcon_tag = soup.find('ul', class_='listcon')
-        if len(listcon_tag) < 1:
-            self.log('extract comics list error')
-            return
-
-        # The <a> tag of each comic in the list
-        com_a_list = listcon_tag.find_all('a', attrs={'href': True})
-        if len(com_a_list) < 1:
-            self.log('Can not find <a> that contain href attribute.')
-            return
-
-        # Collect the URLs of all comics in the list
-        comics_url_list = []
-        base = 'http://www.xeall.com'
-        for tag_a in com_a_list:
-            url = base + tag_a['href']
-            comics_url_list.append(url)
-
-        print('\n>>>>>>>>>>>>>>>>>>> current page comics list <<<<<<<<<<<<<<<<<<<<')
-        print(comics_url_list)
-
-        # Process every comic on the current page
-        for url in comics_url_list:
-            print('>>>>>>>> parse comics:' + url)
-            yield scrapy.Request(url=url, callback=self.comics_parse)
-
-        # # Only crawl the current page
-        # return
-
-        # Page-selection bar below the comics list
-        page_tag = soup.find('ul', class_='pagelist')
-        if len(page_tag) < 1:
-            self.log('extract page list error')
-            return
-
-        # Get the URL of the next page
-        page_a_list = page_tag.find_all('a', attrs={'href': True})
-        if len(page_a_list) < 2:
-            self.log('extract page tag a error.')
-            return
-
-        # Use the select control to decide whether this is the last page
-        select_tag = soup.find('select', attrs={'name': 'sldd'})
-        option_list = select_tag.find_all('option')
-
-        # If the last option tag has the selected attribute, this is the last page
-        last_option = option_list[-1]
-        current_option = select_tag.find('option' ,attrs={'selected': True})
-
-        is_last = (last_option.string == current_option.string)
-        if not is_last:
-            # The last link is "last page"; the second to last is "next page"
-            next_page = 'http://www.xeall.com/shenshi/' + page_a_list[-2]['href']
-            if next_page is not None:
-                print('\n------ parse next page --------')
-                print(next_page)
-                yield scrapy.Request(next_page, callback=self.parse)
-            pass
-        else:
-            print('========= Last page ==========')
-
-
-    def comics_parse(self, response):
-        # Extract the data of each comic
-        content = response.body;
-        if not content:
-            self.log('parse comics body error.')
-            return;
-
-        # Note the BeautifulSoup parser argument: don't use 'html.parser', since some pages may need lxml
-        soup = BeautifulSoup(content, "html5lib")
-
-        # Page-selection control tag
-        page_list_tag = soup.find('ul', class_='pagelist')
-
-        # Current page number
-        current_li = page_list_tag.find('li', class_='thisclass')
-        page_num = current_li.a.string
-        self.log('current page = ' + page_num)
-
-        # Tag that shows the current page's image
-        li_tag = soup.find('li', id='imgshow')
-        img_tag = li_tag.find('img')
-
-        # URL of the current image
-        img_url = img_tag['src']
-        self.log('img url: ' + img_url)
-
-        # Comic title
-        # title = soup.title.string
-        title = img_tag['alt']
-
-        # Save the image locally
-        self.save_img(page_num, title, img_url)
-
-        # URL of the next page's image; when the next-page tag's href is '#', this is the comic's last page
-        a_tag_list = page_list_tag.find_all('a')
-        next_page = a_tag_list[-1]['href']
-        if next_page == '#':
-            self.log('parse comics:' + title + 'finished.')
-        else:
-            next_page = 'http://www.xeall.com/shenshi/' + next_page
-            yield scrapy.Request(next_page, callback=self.comics_parse)
-
-    def save_img(self, img_mun, title, img_url):
-        # Save the image locally
-        self.log('saving pic: ' + img_url)
-
-        # Folder where the comics are saved
-        document = '/Users/moshuqi/Desktop/cartoon'
-
-        # Each comic's folder is named after its title
-        comics_path = document + '/' + title
-        exists = os.path.exists(comics_path)
-        if not exists:
-            self.log('create document: ' + title)
-            os.makedirs(comics_path)
-
-        # Each image is named after its page number
-        pic_name = comics_path + '/' + img_mun + '.jpg'
-
-        # Skip the download if the image already exists locally
-        exists = os.path.exists(pic_name)
-        if exists:
-            self.log('pic exists: ' + pic_name)
-            return
-
-        try:
-            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
-            headers = { 'User-Agent' : user_agent }
-
-            req = urllib.request.Request(img_url, headers=headers)
-            response = urllib.request.urlopen(req, timeout=30)
-
-            # Data returned by the request
-            data = response.read()
-
-            # Decompress the response first if it is compressed
-            if response.info().get('Content-Encoding') == 'gzip':
-                data = zlib.decompress(data, 16 + zlib.MAX_WBITS)
-
-            # Write the image to disk
-            fp = open(pic_name, "wb")
-            fp.write(data)
-            fp.close
-
-            self.log('save image finished:' + pic_name)
-
-            # urllib.request.urlretrieve(img_url, pic_name)
-        except Exception as e:
-            self.log('save image error.')
-            self.log(e)
-
-
 
-
-
-
-
+class Comics(scrapy.Spider):
+    name = "comics"
+
+    # allowed_domains = ['http://www.xeall.com']
+    # start_urls = ['http://www.xeall.com/shenshi/p20.html']
+
+    def start_requests(self):
+        # urls = ['http://www.xeall.com/shenshi/6458.html']
+        urls = ['http://www.xeall.com/shenshi']
+        for url in urls:
+            yield scrapy.Request(url=url, callback=self.parse)
+            # yield scrapy.Request(url=url, callback=self.comics_parse)
+
+    def parse(self, response):
+        # HTML source returned by the request
+        content = response.body;
+        if not content:
+            self.log('parse body error.')
+            return
+
+        # Parse the nodes with BeautifulSoup
+        soup = BeautifulSoup(content, "html5lib")
+
+        # Tag that contains the comics list
+        listcon_tag = soup.find('ul', class_='listcon')
+        if len(listcon_tag) < 1:
+            self.log('extract comics list error')
+            return
+
+        # The <a> tag of each comic in the list
+        com_a_list = listcon_tag.find_all('a', attrs={'href': True})
+        if len(com_a_list) < 1:
+            self.log('Can not find <a> that contain href attribute.')
+            return
+
+        # Collect the URLs of all comics in the list
+        comics_url_list = []
+        base = 'http://www.xeall.com'
+        for tag_a in com_a_list:
+            url = base + tag_a['href']
+            comics_url_list.append(url)
+
+        print('\n>>>>>>>>>>>>>>>>>>> current page comics list <<<<<<<<<<<<<<<<<<<<')
+        print(comics_url_list)
+
+        # Process every comic on the current page
+        for url in comics_url_list:
+            print('>>>>>>>> parse comics:' + url)
+            yield scrapy.Request(url=url, callback=self.comics_parse)
+
+        # # Only crawl the current page
+        # return
+
+        # Page-selection bar below the comics list
+        page_tag = soup.find('ul', class_='pagelist')
+        if len(page_tag) < 1:
+            self.log('extract page list error')
+            return
+
+        # Get the URL of the next page
+        page_a_list = page_tag.find_all('a', attrs={'href': True})
+        if len(page_a_list) < 2:
+            self.log('extract page tag a error.')
+            return
+
+        # Use the select control to decide whether this is the last page
+        select_tag = soup.find('select', attrs={'name': 'sldd'})
+        option_list = select_tag.find_all('option')
+
+        # If the last option tag has the selected attribute, this is the last page
+        last_option = option_list[-1]
+        current_option = select_tag.find('option', attrs={'selected': True})
+
+        is_last = (last_option.string == current_option.string)
+        if not is_last:
+            # The last link is "last page"; the second to last is "next page"
+            next_page = 'http://www.xeall.com/shenshi/' + page_a_list[-2]['href']
+            if next_page is not None:
+                print('\n------ parse next page --------')
+                print(next_page)
+                yield scrapy.Request(next_page, callback=self.parse)
+            pass
+        else:
+            print('========= Last page ==========')
+
+    def comics_parse(self, response):
+        # Extract the data of each comic
+        content = response.body;
+        if not content:
+            self.log('parse comics body error.')
+            return;
+
+        # Note the BeautifulSoup parser argument: don't use 'html.parser', since some pages may need lxml
+        soup = BeautifulSoup(content, "html5lib")
+
+        # Page-selection control tag
+        page_list_tag = soup.find('ul', class_='pagelist')
+
+        # Current page number
+        current_li = page_list_tag.find('li', class_='thisclass')
+        page_num = current_li.a.string
+        self.log('current page = ' + page_num)
+
+        # Tag that shows the current page's image
+        li_tag = soup.find('li', id='imgshow')
+        img_tag = li_tag.find('img')
+
+        # URL of the current image
+        img_url = img_tag['src']
+        self.log('img url: ' + img_url)
+
+        # Comic title
+        # title = soup.title.string
+        title = img_tag['alt']
+
+        # Save the image locally
+        self.save_img(page_num, title, img_url)
+
+        # URL of the next page's image; when the next-page tag's href is '#', this is the comic's last page
+        a_tag_list = page_list_tag.find_all('a')
+        next_page = a_tag_list[-1]['href']
+        if next_page == '#':
+            self.log('parse comics:' + title + 'finished.')
+        else:
+            next_page = 'http://www.xeall.com/shenshi/' + next_page
+            yield scrapy.Request(next_page, callback=self.comics_parse)
+
+    def save_img(self, img_mun, title, img_url):
+        # Save the image locally
+        self.log('saving pic: ' + img_url)
+
+        # Folder where the comics are saved
+        # document = '/Users/moshuqi/Desktop/cartoon'
+        document = 'cartoon'
+
+        # Each comic's folder is named after its title
+        comics_path = document + '/' + title
+        exists = os.path.exists(comics_path)
+        if not exists:
+            self.log('create document: ' + title)
+            os.makedirs(comics_path)
+
+        # Each image is named after its page number
+        pic_name = comics_path + '/' + img_mun + '.jpg'
+
+        # Skip the download if the image already exists locally
+        exists = os.path.exists(pic_name)
+        if exists:
+            self.log('pic exists: ' + pic_name)
+            return
+
+        try:
+            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
+            headers = {'User-Agent': user_agent}
+
+            req = urllib.request.Request(img_url, headers=headers)
+            response = urllib.request.urlopen(req, timeout=180)
+
+            # Data returned by the request
+            data = response.read()
+
+            # Decompress the response first if it is compressed
+            if response.info().get('Content-Encoding') == 'gzip':
+                data = zlib.decompress(data, 16 + zlib.MAX_WBITS)
+
+            # Write the image to disk
+            fp = open(pic_name, "wb")
+            fp.write(data)
+            fp.close
+
+            self.log('save image finished:' + pic_name)
+
+            # urllib.request.urlretrieve(img_url, pic_name)
+        except Exception as e:
+            self.log('save image error.')
+            self.log(e)
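
Note (not part of the commit): the substantive changes above are in save_img() — the download folder becomes the relative path 'cartoon' instead of the absolute /Users/moshuqi/Desktop/cartoon, and the urlopen timeout grows from 30 to 180 seconds — plus small spacing fixes to the option lookup and the headers dict; the remaining +/− churn appears to be whitespace-only. Two pre-existing issues are left untouched: `import urllib` on its own does not guarantee that `urllib.request` is usable under Python 3 unless something else has already imported it, and `fp.close` without parentheses never actually closes the file. Below is a minimal, self-contained sketch of the download step with those two points addressed; it assumes Python 3, and the function name download_image is hypothetical rather than part of the spider.

# Not part of the commit: a standalone sketch of the download step in
# save_img(), assuming Python 3. The explicit urllib.request import and the
# with-blocks are the changes; the User-Agent string, the gzip handling and
# the 180-second timeout mirror the committed code.
import os
import urllib.request
import zlib


def download_image(img_url, pic_name, timeout=180):
    # Skip the download if the file already exists locally
    if os.path.exists(pic_name):
        return

    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    req = urllib.request.Request(img_url, headers=headers)

    with urllib.request.urlopen(req, timeout=timeout) as response:
        data = response.read()
        # Decompress first if the server returned gzip-encoded data
        if response.info().get('Content-Encoding') == 'gzip':
            data = zlib.decompress(data, 16 + zlib.MAX_WBITS)

    # The with-block closes the file even on error; fp.close without
    # parentheses, as in the committed code, never closes it.
    with open(pic_name, 'wb') as fp:
        fp.write(data)

Given the standard Scrapy layout implied by Comics/Comics/spiders/comics.py, the spider should be runnable with `scrapy crawl comics` from the project root, in which case the relative 'cartoon' directory introduced by this commit is created under whatever working directory the crawl starts from.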

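A related observation, also outside this commit: BeautifulSoup's find() returns None when nothing matches, so guards such as `if len(listcon_tag) < 1:` in parse() raise a TypeError on a missing tag instead of logging the intended message; the page_tag check has the same failure mode. A small self-contained illustration (the HTML snippet is made up):

# Not part of the commit: why the len(...) guards can fail on a missing tag.
from bs4 import BeautifulSoup

html = '<html><body><p>no comic list here</p></body></html>'   # made-up page
soup = BeautifulSoup(html, 'html5lib')

listcon_tag = soup.find('ul', class_='listcon')   # nothing matches -> None
# len(listcon_tag) would raise: TypeError: object of type 'NoneType' has no len()
if not listcon_tag:
    print('extract comics list error')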