
Commit 124488b

committed
Added the crawler code and updated some of the documentation
1 parent fe9e187 commit 124488b


166 files changed, +393 -0 lines changed


Day66-75/code/image360/image360/__init__.py

Whitespace-only changes.
Day66-75/code/image360/image360/items.py

@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class GoodsItem(scrapy.Item):
    # Fields for goods scraped from Taobao search results.
    price = scrapy.Field()
    deal = scrapy.Field()
    title = scrapy.Field()


class BeautyItem(scrapy.Item):
    # Fields for images scraped from image.so.com.
    title = scrapy.Field()
    tag = scrapy.Field()
    width = scrapy.Field()
    height = scrapy.Field()
    url = scrapy.Field()
Day66-75/code/image360/image360/middlewares.py

@@ -0,0 +1,137 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from scrapy.http import HtmlResponse

from selenium import webdriver
from selenium.common.exceptions import TimeoutException


class Image360SpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class Image360DownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class TaobaoDownloaderMiddleWare(object):
    # Fetches Taobao search pages with a Selenium-driven Chrome browser so that
    # JavaScript-rendered content is present in the response handed to the spider.

    def __init__(self, timeout=None):
        self.timeout = timeout
        self.browser = webdriver.Chrome()
        self.browser.set_window_size(1000, 600)
        self.browser.set_page_load_timeout(self.timeout)

    def __del__(self):
        self.browser.close()

    def process_request(self, request, spider):
        try:
            self.browser.get(request.url)
            return HtmlResponse(url=request.url, body=self.browser.page_source,
                                request=request, encoding='utf-8', status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        return cls(timeout=10)
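
Note: TaobaoDownloaderMiddleWare above relies on set_page_load_timeout alone to bound how long Chrome spends loading a page. A minimal sketch of an alternative that waits explicitly for the rendered result list follows; it is not part of this commit, and the CSS selector '#mainsrp-itemlist .items .item' and the class name ExplicitWaitDownloaderMiddleware are assumptions for illustration only.

# Sketch only (assumed variant, not part of this commit).
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class ExplicitWaitDownloaderMiddleware(object):

    def __init__(self, timeout=10):
        self.browser = webdriver.Chrome()
        self.browser.set_window_size(1000, 600)
        # Wait up to `timeout` seconds for a specific element instead of the whole page load.
        self.wait = WebDriverWait(self.browser, timeout)

    def process_request(self, request, spider):
        try:
            self.browser.get(request.url)
            # Assumed selector for the rendered goods list; adjust to the actual page structure.
            self.wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
            return HtmlResponse(url=request.url, body=self.browser.page_source,
                                request=request, encoding='utf-8', status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)
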
Day66-75/code/image360/image360/pipelines.py

@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import logging

from pymongo import MongoClient
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


logger = logging.getLogger('SaveImagePipeline')


class SaveImagePipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        yield Request(url=item['url'])

    def item_completed(self, results, item, info):
        logger.debug('Image download finished!')
        if not results[0][0]:
            raise DropItem('Download failed')
        return item

    def file_path(self, request, response=None, info=None):
        return request.url.split('/')[-1]


class SaveToMongoPipeline(object):

    def __init__(self, mongo_url, db_name):
        self.mongo_url = mongo_url
        self.db_name = db_name
        self.client = None
        self.db = None

    def process_item(self, item, spider):
        # Persistence is not implemented yet; items pass through unchanged.
        return item

    def open_spider(self, spider):
        self.client = MongoClient(self.mongo_url)
        self.db = self.client[self.db_name]

    def close_spider(self, spider):
        self.client.close()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.get('MONGO_URL'),
                   crawler.settings.get('MONGO_DB'))
Day66-75/code/image360/image360/settings.py

@@ -0,0 +1,100 @@
# -*- coding: utf-8 -*-

# Scrapy settings for image360 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'image360'

SPIDER_MODULES = ['image360.spiders']
NEWSPIDER_MODULE = 'image360.spiders'

MONGO_URL = 'mongodb://120.77.222.217:27017'
MONGO_DB = 'image360'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 2

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
RANDOMIZE_DOWNLOAD_DELAY = True
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'image360.middlewares.Image360SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'image360.middlewares.Image360DownloaderMiddleware': 543,
    'image360.middlewares.TaobaoDownloaderMiddleWare': 500,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

IMAGES_STORE = './resources/'

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'image360.pipelines.SaveImagePipeline': 300,
#     'image360.pipelines.SaveToMongoPipeline': 301,
# }

LOG_LEVEL = 'DEBUG'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
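
Note: with these settings, the spiders can be started from the project directory with the usual "scrapy crawl image" or "scrapy crawl taobao" commands. A minimal sketch of running them programmatically follows; the launcher file name run.py is just an example, and the script assumes it is executed from the project root so that get_project_settings() can find scrapy.cfg.

# run.py -- example launcher (assumption: executed from the Scrapy project root).
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl('image')   # the image.so.com spider added in this commit
    process.start()          # blocks until the crawl finishes
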
Day66-75/code/image360/image360/spiders/__init__.py

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Day66-75/code/image360/image360/spiders/image.py

@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
from json import loads
from urllib.parse import urlencode

import scrapy

from image360.items import BeautyItem


class ImageSpider(scrapy.Spider):
    name = 'image'
    allowed_domains = ['image.so.com']

    def start_requests(self):
        base_url = 'http://image.so.com/zj?'
        param = {'ch': 'beauty', 'listtype': 'new', 'temp': 1}
        for page in range(10):
            # 'sn' is the paging offset, advanced in steps of 30 records.
            param['sn'] = page * 30
            full_url = base_url + urlencode(param)
            yield scrapy.Request(url=full_url)

    def parse(self, response):
        # The endpoint returns JSON; each element of 'list' describes one image.
        model_dict = loads(response.text)
        for elem in model_dict['list']:
            item = BeautyItem()
            item['title'] = elem['group_title']
            item['tag'] = elem['tag']
            item['width'] = elem['cover_width']
            item['height'] = elem['cover_height']
            item['url'] = elem['qhimg_url']
            yield item
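
Note: ImageSpider above pages through the image.so.com JSON endpoint by increasing the sn offset in steps of 30. A quick illustration of the URLs that start_requests builds (the query-string order simply follows dict insertion order and is not significant):

# Illustration only: the first two URLs generated by start_requests.
from urllib.parse import urlencode

base_url = 'http://image.so.com/zj?'
param = {'ch': 'beauty', 'listtype': 'new', 'temp': 1}
for page in range(2):
    param['sn'] = page * 30
    print(base_url + urlencode(param))
# http://image.so.com/zj?ch=beauty&listtype=new&temp=1&sn=0
# http://image.so.com/zj?ch=beauty&listtype=new&temp=1&sn=30
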
Day66-75/code/image360/image360/spiders/taobao.py

@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
from urllib.parse import urlencode

import scrapy

from image360.items import GoodsItem


class TaobaoSpider(scrapy.Spider):
    name = 'taobao'
    allowed_domains = ['www.taobao.com']

    def start_requests(self):
        base_url = 'https://s.taobao.com/search?'
        params = {}
        for keyword in ['ipad', 'iphone', '小米手机']:
            params['q'] = keyword
            for page in range(10):
                # 's' is the result offset; each search page holds 44 items.
                params['s'] = page * 44
                full_url = base_url + urlencode(params)
                yield scrapy.Request(url=full_url, callback=self.parse)

    def parse(self, response):
        # Relies on TaobaoDownloaderMiddleWare to return the browser-rendered page.
        goods_list = response.xpath('//*[@id="mainsrp-itemlist"]/div/div/div[1]')
        for goods in goods_list:
            item = GoodsItem()
            item['price'] = goods.xpath('div[5]/div[2]/div[1]/div[1]/strong/text()').extract_first()
            item['deal'] = goods.xpath('div[5]/div[2]/div[1]/div[2]/text()').extract_first()
            item['title'] = goods.xpath('div[6]/div[2]/div[2]/a/text()').extract_first()
            yield item
