#coding:utf-8

import urllib2
import re
import zlib
import os
import urllib

class Cartoon:
    def __init__(self, url):
        self.base_url = "http://www.xeall.com/shenshi"
        self.url = url

        content = self.get_content(self.url)
        if not content:
            print "Cartoon init failed."
            return

        self.title = self.get_title(content)
        self.page_url_arr = self.get_page_url_arr(content)

        # Whether to check for an already-downloaded local copy
        # before fetching each image
        self.need_check_pic = False

    def get_content(self, url):
        # Fetch the page
        try:
            request = urllib2.Request(url)
            response = urllib2.urlopen(request, timeout=20)

            # The site serves gzip-compressed pages; 16 + MAX_WBITS tells
            # zlib to expect a gzip header and trailer
            decompressed_data = zlib.decompress(response.read(), 16 + zlib.MAX_WBITS)

            # The pages are encoded as gb2312
            content = decompressed_data.decode('gb2312', 'ignore')
            # print content
            return content
        except Exception, e:
            print e
            print "open url: " + url + " failed."
            return None

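    # A minimal hardening sketch (an assumption, not part of the original):
    # if the server ever returns an uncompressed body, zlib.decompress above
    # raises zlib.error; a tolerant variant could fall back to the raw bytes:
    #
    #   data = response.read()
    #   try:
    #       data = zlib.decompress(data, 16 + zlib.MAX_WBITS)
    #   except zlib.error:
    #       pass  # body was not gzip-compressed
    #   content = data.decode('gb2312', 'ignore')
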
    def get_title(self, content):
        # Extract the comic title from the keywords meta tag
        pattern = re.compile('name="keywords".*?content="(.*?)".*?/', re.S)
        result = re.search(pattern, content)

        if result:
            title = result.group(1)
            print "title: " + title
            return title
        else:
            print "Failed to get title."
            return None

    def get_page_url_arr(self, content):
        # Collect the url of every page of the comic
        pattern = re.compile('class="pagelist">(.*?)</ul>', re.S)
        result = re.search(pattern, content)
        if not result:
            print "Failed to get page list."
            return []
        page_list = result.group(1)

        pattern = re.compile('<a href=\'(.*?)\'>.*?</a>', re.S)
        items = re.findall(pattern, page_list)

        arr = []
        for item in items:
            page_url = self.base_url + "/" + item
            arr.append(page_url)
            # print item

        # The pagelist also contains "previous page" and "next page" links;
        # per the page markup they sit at the start and the end, so strip
        # those entries to avoid duplicates
        arr.pop(0)
        arr.pop(0)
        arr.pop()
        print arr
        print self.title + " total pages: " + str(len(arr))
        return arr

    def get_pic_url(self, page_url):
        # Extract the image url from a single page
        content = self.get_content(page_url)
        if not content:
            return None

        pattern = re.compile('<img alt.*?src="(.*?)".*?/>', re.S)
        result = re.search(pattern, content)

        if result:
            pic = result.group(1)
            # print "Picture url: " + pic
            return pic
        else:
            print "Failed to get picture url."
            print "url: " + page_url
            return None

    def save(self, path):
        dir_path = path + "/" + self.title
        self.create_dir_path(dir_path)

        # Skip the comic if it has already been downloaded in full
        existing = os.listdir(dir_path)
        if len(existing) >= len(self.page_url_arr):
            print self.title + " has been downloaded."
            return

        # Image requests occasionally time out, which leaves a comic with a
        # few missing pages even though most images are already on disk.
        # If more than half the pages exist locally, assume this is such a
        # partial download and only request the images that are missing.
        if len(existing) >= (len(self.page_url_arr) / 2):
            print "Checking for a local copy before each download."
            self.need_check_pic = True

        for i in range(0, len(self.page_url_arr)):
            page_url = self.page_url_arr[i]
            pic_url = self.get_pic_url(page_url)
            if pic_url is None:
                continue

            pic_path = dir_path + "/" + str(i + 1) + ".jpg"
            if self.need_check_pic:
                if os.path.exists(pic_path):
                    print "pic: " + pic_url + " exists."
                    continue

            self.save_pic(pic_url, pic_path)

        print self.title + " fetch finished."

    def create_dir_path(self, path):
        # Create a directory named after the comic
        if not os.path.exists(path):
            print "Creating directory"
            os.makedirs(path)
        else:
            print "Directory already exists"

    def save_pic(self, pic_url, path):
        # Download the image and save it into the target directory
        req = urllib2.Request(pic_url)
        # Present a desktop-browser User-Agent; some sites reject the
        # default urllib2 identifier
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36')

        try:
            print "save pic url:" + pic_url
            resp = urllib2.urlopen(req, timeout=20)
            data = resp.read()
            # print data

            fp = open(path, "wb")
            fp.write(data)
            fp.close()
            print "save pic finished."
        except Exception, e:
            print e
            print "save pic: " + pic_url + " failed."

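# Minimal usage sketch (an assumption, not part of the original file): the
# comic url below is hypothetical; pass the first page of any comic hosted
# under http://www.xeall.com/shenshi, then save into the working directory.
if __name__ == '__main__':
    cartoon = Cartoon("http://www.xeall.com/shenshi/example.html")
    # __init__ returns early on failure, so title only exists on success
    if getattr(cartoon, 'title', None):
        cartoon.save(".")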