Skip to content

Commit d493c69

Browse files
committed
新增开心代理,西拉代理,小舒代理
1 parent cbc7aa0 commit d493c69

File tree

6 files changed

+160
-1
lines changed

6 files changed

+160
-1
lines changed

fetchers/KaiXinFetcher.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import re
2+
import time
3+
4+
import requests
5+
from pyquery import PyQuery as pq
6+
7+
from .BaseFetcher import BaseFetcher
8+
9+
class KaiXinFetcher(BaseFetcher):
    """
    Fetches free HTTP proxies from http://www.kxdaili.com/dailiip.html

    Code contributed by [Zealot666](https://github.com/Zealot666)
    """

    def fetch(self):
        """
        Run one crawl and return a list of (protocol, ip, port) tuples;
        protocol is currently always 'http'.

        Example return value: [('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)]
        """
        # Pages 1-10 of both listing categories on kxdaili.com.
        urls = [f'http://www.kxdaili.com/dailiip/{category}/{page}.html'
                for category in (1, 2) for page in range(1, 11)]

        proxies = []
        ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$')
        port_regex = re.compile(r'^\d+$')

        for url in urls:
            # Best-effort: one unreachable/slow page must not abort the
            # whole crawl (previously any request error propagated up and
            # discarded every proxy already collected).
            try:
                html = requests.get(url, timeout=10).text
            except requests.RequestException:
                continue
            doc = pq(html)
            for row in doc('tr').items():
                tds = list(row('td').items())
                if len(tds) >= 2:
                    ip = tds[0].text().strip()
                    port = tds[1].text().strip()
                    # Validating both cells also skips header/malformed rows.
                    if ip_regex.match(ip) and port_regex.match(port):
                        proxies.append(('http', ip, int(port)))

        # Deduplicate entries repeated across pages/categories.
        return list(set(proxies))

fetchers/XiLaFetcher.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import re
2+
import time
3+
4+
import requests
5+
from pyquery import PyQuery as pq
6+
7+
from .BaseFetcher import BaseFetcher
8+
9+
class XiLaFetcher(BaseFetcher):
    """
    Fetches free HTTP proxies from http://www.xiladaili.com/gaoni/

    Code contributed by [Zealot666](https://github.com/Zealot666)
    """

    def __init__(self):
        super().__init__()
        # Rotating page offset so successive fetch() calls cover
        # different slices of the site's paginated lists.
        self.index = 0

    def fetch(self):
        """
        Run one crawl and return a list of (protocol, ip, port) tuples;
        protocol is currently always 'http'.

        Example return value: [('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)]
        """
        self.index += 1
        new_index = self.index % 30

        urls = [f'http://www.xiladaili.com/gaoni/{page}/'
                for page in range(new_index, new_index + 11)]
        urls += [f'http://www.xiladaili.com/http/{page}/'
                 for page in range(new_index, new_index + 11)]

        proxies = []
        ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$')
        port_regex = re.compile(r'^\d+$')

        for url in urls:
            time.sleep(1)  # be polite; avoid hammering the site
            # Best-effort: a single failing page must not abort the crawl.
            try:
                html = requests.get(url, timeout=10).text
            except requests.RequestException:
                continue
            doc = pq(html)
            for row in doc('tr').items():
                tds = list(row('td').items())
                if len(tds) >= 2:
                    # First cell holds "ip:port". partition() never raises,
                    # unlike the previous split(":")[1], which crashed the
                    # whole fetch on any cell without a colon.
                    ip, sep, port = tds[0].text().strip().partition(':')
                    if sep and ip_regex.match(ip) and port_regex.match(port):
                        proxies.append(('http', ip, int(port)))

        # Deduplicate entries repeated across pages.
        return list(set(proxies))

fetchers/XiaoShuFetcher.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import re
2+
import time
3+
4+
import requests
5+
from pyquery import PyQuery as pq
6+
7+
from .BaseFetcher import BaseFetcher
8+
9+
class XiaoShuFetcher(BaseFetcher):
    """
    Fetches free HTTP proxies from http://www.xsdaili.cn/

    Code contributed by [Zealot666](https://github.com/Zealot666)
    """

    def __init__(self):
        super().__init__()
        # Rotating page offset so successive fetch() calls cover
        # different daily-proxy index pages.
        self.index = 0

    def fetch(self):
        """
        Run one crawl and return a list of (protocol, ip, port) tuples;
        protocol is currently always 'http'.

        Example return value: [('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)]
        """
        self.index += 1
        new_index = self.index % 10

        urls = set()
        proxies = []
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
        }
        ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$')
        port_regex = re.compile(r'^\d+$')

        # NOTE: this range spans exactly one page (new_index itself),
        # matching the original behavior.
        for page in range(new_index, new_index + 1):
            try:
                response = requests.get("http://www.xsdaili.cn/dayProxy/" + str(page) + ".html",
                                        headers=headers, timeout=10)
            except requests.RequestException:
                continue
            for item in pq(response.text)('a').items():
                # attr() returns None for anchors without href; check
                # explicitly instead of swallowing a broad Exception.
                href = item.attr("href")
                if href and "/dayProxy/ip" in href:
                    urls.add("http://www.xsdaili.cn" + href)

        for url in urls:
            # Best-effort: one failing detail page must not abort the crawl.
            try:
                response = requests.get(url, headers=headers, timeout=8)
            except requests.RequestException:
                continue
            doc = pq(response.text)
            for item in doc(".cont").items():
                for line in item.text().split("\n"):
                    # Lines appear to look like "ip:port@protocol..." —
                    # TODO confirm against a live page. partition() never
                    # raises, unlike the previous split(':')[1], which
                    # crashed on blank/malformed lines.
                    head = line.partition('@')[0]
                    ip, sep, port = head.partition(':')
                    ip, port = ip.strip(), port.strip()
                    if sep and ip_regex.match(ip) and port_regex.match(port):
                        # int(port) for consistency with the other fetchers;
                        # it also makes the set() dedup below reliable
                        # (previously the port stayed a string here).
                        proxies.append(("http", ip, int(port)))

        return list(set(proxies))

fetchers/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
from .JiangxianliFetcher import JiangxianliFetcher
1212
from .IHuanFetcher import IHuanFetcher
1313
from .IP89Fetcher import IP89Fetcher
14+
from .KaiXinFetcher import KaiXinFetcher
15+
from .XiLaFetcher import XiLaFetcher
16+
from .XiaoShuFetcher import XiaoShuFetcher
1417

1518
fetchers = [
1619
Fetcher(name='uu-proxy.com', fetcher=UUFetcher),
@@ -21,4 +24,7 @@
2124
Fetcher(name='ip.jiangxianli.com', fetcher=JiangxianliFetcher),
2225
Fetcher(name='ip.ihuan.me', fetcher=IHuanFetcher),
2326
Fetcher(name='www.89ip.cn', fetcher=IP89Fetcher),
27+
Fetcher(name='www.kxdaili.com', fetcher=KaiXinFetcher),
28+
Fetcher(name='www.xiladaili.com', fetcher=XiLaFetcher),
29+
Fetcher(name='www.xsdaili.cn', fetcher=XiaoShuFetcher),
2430
]

test.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# encoding : utf-8
2+
3+
import requests
4+
5+
def main():
    """Smoke-test a single hard-coded proxy by fetching baidu.com through it.

    Prints the fetched page body on success; raises requests exceptions on
    connection failure or timeout.
    """
    proxy_uri = 'http://223.10.82.66:8118'
    # proxy_uri = 'http://localhost:8118'
    if not proxy_uri:
        print(u'暂时没有可用代理')
        return
    print(u'获取到的代理是:' + proxy_uri)

    # Route both schemes through the same proxy endpoint.
    proxies = { 'http': proxy_uri, 'https': proxy_uri }
    # headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    # timeout so a dead proxy fails fast instead of hanging indefinitely
    html = requests.get('https://www.baidu.com', proxies=proxies, timeout=10).text
    print(html)

if __name__ == '__main__':
    main()

test/testFetcher.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
def run():
88
proxies_cnt = dict()
99
for item in fetchers:
10-
if item.name != 'uu-proxy.com': continue # 这行表示只测试特定的爬取器
10+
if item.name != 'www.xsdaili.cn': continue # 这行表示只测试特定的爬取器
1111

1212
print('='*10, 'RUNNING ' + item.name, '='*10)
1313
fetcher = item.fetcher() # 实例化爬取器

0 commit comments

Comments
 (0)