
Commit c5e8d22

Committed: 爬取漫画demo (comic-scraping demo)

1 parent cb9ffb1 commit c5e8d22

File tree

4 files changed: +278 −0 lines changed

.DS_Store

4 KB
Binary file not shown.

Cartoon/Cartoon.py

Lines changed: 162 additions & 0 deletions
@@ -0,0 +1,162 @@
#coding:utf-8

import urllib2
import re
import zlib
import os

class Cartoon:
    def __init__(self, url):
        self.base_url = "http://www.xeall.com/shenshi"
        self.url = url

        content = self.get_content(self.url)
        if not content:
            print "Cartoon init failed."
            return

        self.title = self.get_title(content)
        self.page_url_arr = self.get_page_url_arr(content)

        # Whether to check for an existing local copy before downloading each image
        self.need_check_pic = False

    def get_content(self, url):
        # Fetch the page
        try:
            request = urllib2.Request(url)
            response = urllib2.urlopen(request, timeout=20)

            # The response is gzip-compressed; 16 + MAX_WBITS tells zlib to expect a gzip header
            decompressed_data = zlib.decompress(response.read(), 16 + zlib.MAX_WBITS)

            # The page is encoded as gb2312
            content = decompressed_data.decode('gb2312', 'ignore')
            return content
        except Exception, e:
            print e
            print "open url: " + url + " failed."
            return None

    def get_title(self, content):
        # Extract the comic title from the keywords meta tag
        pattern = re.compile('name="keywords".*?content="(.*?)".*?/', re.S)
        result = re.search(pattern, content)

        if result:
            title = result.group(1)
            print "title: " + title
            return title
        else:
            print "Failed to get the title."
            return None

    def get_page_url_arr(self, content):
        # Return the list of URLs, one per page of the comic
        pattern = re.compile('class="pagelist">(.*?)</ul>', re.S)
        result = re.search(pattern, content)
        page_list = result.group(1)

        pattern = re.compile('<a href=\'(.*?)\'>.*?</a>', re.S)
        items = re.findall(pattern, page_list)

        arr = []
        for item in items:
            page_url = self.base_url + "/" + item
            arr.append(page_url)

        # The pagelist also contains "previous" and "next" navigation links; from the
        # page layout they sit at the start and the end, so drop them to avoid duplicates
        arr.pop(0)
        arr.pop(0)
        arr.pop(len(arr) - 1)
        print arr
        print self.title + " total pages: " + str(len(arr))
        return arr

    def get_pic_url(self, page_url):
        # Extract the image URL from one page
        content = self.get_content(page_url)
        if not content:
            return None

        pattern = re.compile('<img alt.*?src="(.*?)".*?/>', re.S)
        result = re.search(pattern, content)

        if result:
            return result.group(1)
        else:
            print "Failed to get the image URL."
            print "url: " + page_url
            return None

    def save(self, path):
        dir_path = path + "/" + self.title
        self.create_dir_path(dir_path)

        # Skip comics that have already been fully downloaded
        files = os.listdir(dir_path)
        if len(files) >= len(self.page_url_arr):
            print self.title + " has been downloaded."
            return

        # Image requests occasionally time out, which can leave a comic with a few
        # missing pages while most images already exist locally. If more than half
        # the pages are present, assume that case and only request the missing ones.
        if len(files) >= (len(self.page_url_arr) / 2):
            print "Checking for an existing local copy before each download."
            self.need_check_pic = True

        for i in range(0, len(self.page_url_arr)):
            page_url = self.page_url_arr[i]
            pic_url = self.get_pic_url(page_url)
            if pic_url is None:
                continue

            pic_path = dir_path + "/" + str(i + 1) + ".jpg"
            if self.need_check_pic and os.path.exists(pic_path):
                print "pic: " + pic_url + " exists."
                continue

            self.save_pic(pic_url, pic_path)

        print self.title + " fetch finished."

    def create_dir_path(self, path):
        # Create a directory named after the comic
        if not os.path.exists(path):
            print "Creating directory."
            os.makedirs(path)
        else:
            print "Directory already exists."

    def save_pic(self, pic_url, path):
        # Save one image into the target directory
        req = urllib2.Request(pic_url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36')

        try:
            print "save pic url: " + pic_url
            resp = urllib2.urlopen(req, timeout=20)
            data = resp.read()

            fp = open(path, "wb")
            fp.write(data)
            fp.close()
            print "save pic finished."
        except Exception, e:
            print e
            print "save pic: " + pic_url + " failed."
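A note on get_content above: the site serves its pages gzip-compressed, and passing 16 + zlib.MAX_WBITS as the wbits argument tells zlib to expect a gzip wrapper rather than a bare zlib stream. Below is a minimal standalone sketch of just that step, in the same Python 2 style. It assumes, as the class does, that the server always gzips its responses; the URL is the listing URL from Main.py and everything else is illustrative.

#coding:utf-8
import urllib2
import zlib

# Fetch one gzip-compressed page and decode it from gb2312
# (assumes the server gzips responses, as the scraper relies on)
response = urllib2.urlopen("http://www.xeall.com/shenshi", timeout=20)
raw = response.read()
# 16 + zlib.MAX_WBITS: accept a gzip header instead of a raw zlib stream
html = zlib.decompress(raw, 16 + zlib.MAX_WBITS).decode('gb2312', 'ignore')
print html[:200]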

Cartoon/Gentleman.py

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
#coding:utf-8

import urllib2
import re
import zlib
import os
from Cartoon import *

class Gentleman:
    def __init__(self, url, path):
        if not os.path.exists(path):
            print "Invalid save path."
            exit(0)

        self.base_url = url
        self.path = path
        content = self.get_content(url)

        self.page_url_arr = self.get_page_url_arr(content)

    def get_content(self, url):
        # Fetch the page
        try:
            request = urllib2.Request(url)
            response = urllib2.urlopen(request, timeout=20)

            # The response is gzip-compressed; 16 + MAX_WBITS tells zlib to expect a gzip header
            decompressed_data = zlib.decompress(response.read(), 16 + zlib.MAX_WBITS)

            # The page is encoded as gb2312
            content = decompressed_data.decode('gb2312', 'ignore')
            return content
        except Exception, e:
            print e
            print "Failed to open url: " + url
            return None

    def get_page_url_arr(self, content):
        # Each listing page shows a batch of comic covers; return the URLs of all listing pages

        # The page count and per-page URLs live in a <select> control, so grab its contents first
        pattern = re.compile('name=\'sldd\'.*?>(.*?)</select>', re.S)
        result = re.search(pattern, content)
        option_list = result.group(1)

        # Then pull the URL of each listing page out of the <option> entries
        pattern = re.compile('value=\'(.*?)\'.*?</option>', re.S)
        items = re.findall(pattern, option_list)

        arr = []
        for item in items:
            page_url = self.base_url + '/' + item
            arr.append(page_url)

        print "total pages: " + str(len(arr))
        return arr

    def get_cartoon_arr(self, url):
        # Collect the comics listed on one page
        content = self.get_content(url)
        if not content:
            print "Failed to fetch the page."
            return None

        # First grab the block that contains the comic entries
        pattern = re.compile('class="piclist listcon".*?>(.*?)</ul>', re.S)
        result = re.search(pattern, content)
        cartoon_list = result.group(1)

        # Then pull out each comic's URL
        pattern = re.compile('href="/shenshi/(.*?)".*?class="pic show"', re.S)
        items = re.findall(pattern, cartoon_list)

        arr = []
        for item in items:
            page_url = self.base_url + '/' + item
            arr.append(page_url)

        return arr

    def hentai(self):
        # Walk every listing page
        for i in range(0, len(self.page_url_arr)):
            # Get the URLs of the comics on this page
            cartoon_arr = self.get_cartoon_arr(self.page_url_arr[i])
            if not cartoon_arr:
                # Skip listing pages that failed to load
                continue
            print "page " + str(i + 1) + ":"
            print cartoon_arr
            for j in range(0, len(cartoon_arr)):
                cartoon = Cartoon(cartoon_arr[j])
                cartoon.save(self.path)
            print "======= page " + str(i + 1) + " fetch finished ======="

Cartoon/Main.py

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
#coding:utf-8

from Cartoon import *
from Gentleman import *

# cartoon = Cartoon("http://www.xeall.com/shenshi/6895.html")
# cartoon.save("/Users/moshuqi/Desktop/test")

# http://www.xeall.com/ribenmanhua/
url = "http://www.xeall.com/shenshi"

# enter your path
save_path = ""

gentleman = Gentleman(url, save_path)
gentleman.hentai()
