Skip to content

Commit e6a502e

Browse files
committed: bump version to 0.0.11
1 parent 235f134 commit e6a502e

File tree

5 files changed

+35
-9
lines changed

5 files changed

+35
-9
lines changed

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,16 @@
11
# Gerapy Pyppeteer Changelog
22

3+
## 0.0.11 (2020-08-05)
4+
5+
### Bug Fixes
6+
7+
* Fix bug about `asyncio` in Python 3.8 on Windows [https://github.com/Gerapy/GerapyPyppeteer/issues/5](https://github.com/Gerapy/GerapyPyppeteer/issues/5)
8+
* Fix bug of setting cookies [https://github.com/Gerapy/GerapyPyppeteer/issues/11](https://github.com/Gerapy/GerapyPyppeteer/issues/11)
9+
10+
### Features
11+
12+
* Add settings of `GERAPY_ENABLE_REQUEST_INTERCEPTION` [https://github.com/Gerapy/GerapyPyppeteer/issues/6](https://github.com/Gerapy/GerapyPyppeteer/issues/6)
13+
314
## 0.0.10 (2020-08-01)
415

516
### Features

example/example/spiders/movie.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@ def start_requests(self):
2222
for page in range(1, self.max_page + 1):
2323
url = f'{self.base_url}/page/{page}'
2424
logger.debug('start url %s', url)
25-
yield PyppeteerRequest(url, callback=self.parse_index, priority=10, wait_for='.item', pretend=True)
25+
cookies = {
26+
'name': 'germey'
27+
}
28+
yield PyppeteerRequest(url, callback=self.parse_index, priority=10, wait_for='.item', pretend=True, cookies=cookies)
2629

2730
def parse_index(self, response):
2831
"""

gerapy_pyppeteer/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
VERSION = (0, 0, '10')
1+
VERSION = (0, 0, '11')
22

33
version = __version__ = '.'.join(map(str, VERSION))

gerapy_pyppeteer/downloadermiddlewares.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import sys
22
import asyncio
33
from io import BytesIO
4-
54
from pyppeteer.errors import PageError, TimeoutError
65
from scrapy.http import HtmlResponse
76
import twisted.internet
@@ -11,6 +10,10 @@
1110
from pyppeteer import launch
1211
from gerapy_pyppeteer.pretend import SCRIPTS as PRETEND_SCRIPTS
1312
from gerapy_pyppeteer.settings import *
13+
import urllib.parse
14+
15+
if sys.platform == 'win32':
16+
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
1417

1518
reactor = AsyncioSelectorReactor(asyncio.get_event_loop())
1619

@@ -117,6 +120,8 @@ def from_crawler(cls, crawler):
117120
cls.screenshot = settings.get('GERAPY_PYPPETEER_SCREENSHOT', GERAPY_PYPPETEER_SCREENSHOT)
118121
cls.pretend = settings.get('GERAPY_PYPPETEER_PRETEND', GERAPY_PYPPETEER_PRETEND)
119122
cls.sleep = settings.get('GERAPY_PYPPETEER_SLEEP', GERAPY_PYPPETEER_SLEEP)
123+
cls.enable_request_interception = settings.getbool('GERAPY_ENABLE_REQUEST_INTERCEPTION',
124+
GERAPY_ENABLE_REQUEST_INTERCEPTION)
120125
cls.retry_enabled = settings.getbool('RETRY_ENABLED')
121126
cls.max_retry_times = settings.getint('RETRY_TIMES')
122127
cls.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES'))
@@ -198,16 +203,20 @@ async def _process_request(self, request, spider):
198203
await page.evaluateOnNewDocument(script)
199204

200205
# set cookies
206+
parse_result = urllib.parse.urlsplit(request.url)
207+
domain = parse_result.hostname
208+
_cookies = []
201209
if isinstance(request.cookies, dict):
202-
await page.setCookie(*[
203-
{'name': k, 'value': v}
204-
for k, v in request.cookies.items()
205-
])
210+
_cookies = [{'name': k, 'value': v, 'domain': domain}
211+
for k, v in request.cookies.items()]
206212
else:
207-
await page.setCookie(request.cookies)
213+
for _cookie in _cookies:
214+
if isinstance(_cookie, dict) and 'domain' not in _cookie.keys():
215+
_cookie['domain'] = domain
216+
await page.setCookie(*_cookies)
208217

209218
# the headers must be set using request interception
210-
await page.setRequestInterception(True)
219+
await page.setRequestInterception(self.enable_request_interception)
211220

212221
@page.on('request')
213222
async def _handle_interception(pu_request):

gerapy_pyppeteer/settings.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,6 @@
3838
GERAPY_PYPPETEER_IGNORE_RESOURCE_TYPES = []
3939
GERAPY_PYPPETEER_SCREENSHOT = None
4040
GERAPY_PYPPETEER_SLEEP = 1
41+
GERAPY_ENABLE_REQUEST_INTERCEPTION = True
42+
43+

0 commit comments

Comments
 (0)