|
| 1 | +import re |
| 2 | +import time |
| 3 | + |
| 4 | +import requests |
| 5 | +from pyquery import PyQuery as pq |
| 6 | + |
| 7 | +from .BaseFetcher import BaseFetcher |
| 8 | + |
class XiaoShuFetcher(BaseFetcher):
    """
    Fetcher for the daily proxy lists published at http://www.xsdaili.cn/

    Code contributed by [Zealot666](https://github.com/Zealot666)
    """

    _BASE_URL = "http://www.xsdaili.cn"
    _HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }

    def __init__(self):
        super().__init__()
        # Number of fetch() calls so far; selects which index page is crawled.
        self.index = 0

    @staticmethod
    def _parse_proxy_line(line):
        """
        Parse one text line of the form ``ip:port@...`` into a proxy tuple.

        Returns ``("http", ip, port)`` with ``port`` as an int, or ``None``
        when the line does not contain a usable ``ip:port`` pair (blank
        lines, headers, non-numeric or out-of-range ports).
        """
        address = line.split("@", 1)[0].strip()
        ip, sep, port = address.partition(":")
        ip = ip.strip()
        port = port.strip()
        if not sep or not ip or not port.isdecimal():
            return None
        port_number = int(port)
        if not 0 < port_number <= 65535:
            return None
        return ("http", ip, port_number)

    def fetch(self):
        """
        Run one crawl and return a list of (protocol, ip, port) tuples.

        ``protocol`` is the protocol name (always "http" here) and ``port``
        is an int. Each call advances to the next index page, cycling every
        10 calls.
        Example return value: [('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)]

        Raises whatever ``requests.get`` raises if the index page itself
        cannot be fetched; failures on individual detail pages are skipped
        so proxies collected from the other pages are still returned.
        """
        self.index += 1
        # NOTE(review): this yields page 0 on every 10th call; confirm that
        # /dayProxy/0.html exists on the site or shift the range to 1..10.
        page = self.index % 10

        response = requests.get(
            self._BASE_URL + "/dayProxy/" + str(page) + ".html",
            headers=self._HEADERS,
            timeout=10,
        )

        # Collect the links to the per-day detail pages listed on the index.
        urls = set()
        for anchor in pq(response.text)("a").items():
            href = anchor.attr("href")
            # Anchors without an href yield None; skip them explicitly.
            if href and "/dayProxy/ip" in href:
                urls.add(self._BASE_URL + href)

        proxies = set()
        for url in urls:
            try:
                response = requests.get(url, headers=self._HEADERS, timeout=8)
            except requests.RequestException:
                # Best effort: one unreachable detail page must not discard
                # the proxies already gathered from the others.
                continue
            for block in pq(response.text)(".cont").items():
                for line in block.text().split("\n"):
                    proxy = self._parse_proxy_line(line)
                    if proxy is not None:
                        proxies.add(proxy)

        return list(proxies)
0 commit comments