新增开心代理，西拉代理，小舒代理

OxOOo · OxOOo · commit d493c69cac43 · 2021-07-24T22:48:51.000+08:00
diff --git a/fetchers/KaiXinFetcher.py b/fetchers/KaiXinFetcher.py
@@ -0,0 +1,44 @@
+import re
+import time
+
+import requests
+from pyquery import PyQuery as pq
+
+from .BaseFetcher import BaseFetcher
+
+class KaiXinFetcher(BaseFetcher):
+    """
+    Fetcher for the proxy lists served under http://www.kxdaili.com/dailiip.html
+    Code contributed by [Zealot666](https://github.com/Zealot666)
+    """
+
+    def fetch(self):
+        """
+        Run one crawl and return a list of (protocol, ip, port) tuples, where
+        protocol is the protocol name (always 'http' here) and port is an int.
+        Example: [('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)]
+
+        Duplicates are removed; the order of the returned list is unspecified.
+        Errors raised by requests.get (e.g. timeouts) are not caught here.
+        """
+        # Pages 1-10 of listing 1 followed by pages 1-10 of listing 2.
+        page_urls = [f'http://www.kxdaili.com/dailiip/1/{page}.html' for page in range(1, 11)]
+        page_urls += [f'http://www.kxdaili.com/dailiip/2/{page}.html' for page in range(1, 11)]
+
+        ip_pattern = re.compile(r'^\d+\.\d+\.\d+\.\d+$')
+        port_pattern = re.compile(r'^\d+$')
+        found = set()
+
+        for page_url in page_urls:
+            document = pq(requests.get(page_url, timeout=10).text)
+            for row in document('tr').items():
+                cells = list(row('td').items())
+                if len(cells) < 2:
+                    continue  # a row needs at least two <td> cells to hold ip and port
+                ip = cells[0].text().strip()
+                port = cells[1].text().strip()
+                # Keep only rows that look like a dotted-quad address plus a numeric port.
+                if ip_pattern.match(ip) and port_pattern.match(port):
+                    found.add(('http', ip, int(port)))
+
+        return list(found)
diff --git a/fetchers/XiLaFetcher.py b/fetchers/XiLaFetcher.py
@@ -0,0 +1,59 @@
+import re
+import time
+
+import requests
+from pyquery import PyQuery as pq
+
+from .BaseFetcher import BaseFetcher
+
+class XiLaFetcher(BaseFetcher):
+    """
+    Fetcher for the proxy lists at http://www.xiladaili.com/gaoni/
+    Code contributed by [Zealot666](https://github.com/Zealot666)
+    """
+    def __init__(self):
+        super().__init__()
+        # Counts fetch() calls; used to pick which window of pages to crawl.
+        self.index = 0
+
+    def fetch(self):
+        """
+        Run one crawl and return a list of (protocol, ip, port) tuples, where
+        protocol is the protocol name (always 'http' here) and port is an int.
+        Example: [('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)]
+
+        Each call crawls 11 consecutive pages of the /gaoni/ and /http/ lists,
+        starting at page (call count % 30), sleeping 1 second before each request.
+        Duplicates are removed; the order of the returned list is unspecified.
+        Errors raised by requests.get (e.g. timeouts) are not caught here.
+        """
+        self.index += 1
+        # NOTE(review): this is 0 on every 30th call, so page 0 gets requested;
+        # confirm the site serves that page or switch to 1-based numbering.
+        new_index = self.index % 30
+
+        urls = [f'http://www.xiladaili.com/gaoni/{page}/' for page in range(new_index, new_index + 11)]
+        urls += [f'http://www.xiladaili.com/http/{page}/' for page in range(new_index, new_index + 11)]
+
+        proxies = set()
+        ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$')
+        port_regex = re.compile(r'^\d+$')
+
+        for url in urls:
+            time.sleep(1)  # pause between consecutive requests
+            html = requests.get(url, timeout=10).text
+            for line in pq(html)('tr').items():
+                tds = list(line('td').items())
+                if len(tds) < 2:
+                    continue
+                # The first cell is expected to read "ip:port". Cells without a
+                # colon are skipped instead of raising IndexError, which used to
+                # abort the whole crawl and discard everything collected so far.
+                parts = tds[0].text().strip().split(':')
+                if len(parts) < 2:
+                    continue
+                ip, port = parts[0], parts[1]
+                if ip_regex.match(ip) and port_regex.match(port):
+                    proxies.add(('http', ip, int(port)))
+
+        return list(proxies)
diff --git a/fetchers/XiaoShuFetcher.py b/fetchers/XiaoShuFetcher.py
@@ -0,0 +1,69 @@
+import re
+import time
+
+import requests
+from pyquery import PyQuery as pq
+
+from .BaseFetcher import BaseFetcher
+
+class XiaoShuFetcher(BaseFetcher):
+    """
+    Fetcher for the daily proxy lists at http://www.xsdaili.cn/
+    Code contributed by [Zealot666](https://github.com/Zealot666)
+    """
+    def __init__(self):
+        super().__init__()
+        # Counts fetch() calls; used to pick which index page to crawl.
+        self.index = 0
+
+    def fetch(self):
+        """
+        Run one crawl and return a list of (protocol, ip, port) tuples, where
+        protocol is the protocol name (always 'http' here) and port is an int.
+        Example: [('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)]
+
+        Each call reads index page (call count % 10), follows every link on it
+        whose href contains "/dayProxy/ip", and parses "ip:port@..." lines from
+        the ".cont" elements of those pages. Lines that are not a dotted-quad
+        address plus a numeric port are ignored.
+        Duplicates are removed; the order of the returned list is unspecified.
+        Errors raised by requests.get (e.g. timeouts) are not caught here.
+        """
+        self.index += 1
+        # NOTE(review): this is 0 on every 10th call, so page 0 gets requested;
+        # confirm the site serves that page or switch to 1-based numbering.
+        new_index = self.index % 10
+
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
+        }
+        ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$')
+        port_regex = re.compile(r'^\d+$')
+
+        # Step 1: collect the links to the daily proxy pages from the index page.
+        urls = set()
+        response = requests.get("http://www.xsdaili.cn/dayProxy/" + str(new_index) + ".html", headers=headers, timeout=10)
+        for item in pq(response.text)('a').items():
+            href = item.attr("href")
+            # attr() yields None when the anchor has no href; test for that
+            # explicitly rather than catching the resulting TypeError.
+            if href and "/dayProxy/ip" in href:
+                urls.add("http://www.xsdaili.cn" + href)
+
+        # Step 2: parse every daily page that was linked from the index page.
+        proxies = set()
+        for url in urls:
+            response = requests.get(url, headers=headers, timeout=8)
+            for item in pq(response.text)(".cont").items():
+                for line in item.text().split("\n"):
+                    # Text after '@' is dropped; what precedes it should be "ip:port".
+                    parts = line.split('@')[0].split(':')
+                    if len(parts) < 2:
+                        continue  # no colon: previously an IndexError that aborted the crawl
+                    ip, port = parts[0].strip(), parts[1].strip()
+                    # Validate and convert the port to int so the result matches the
+                    # documented (protocol, ip, port) shape; it used to be a raw string.
+                    if ip_regex.match(ip) and port_regex.match(port):
+                        proxies.add(("http", ip, int(port)))
+
+        return list(proxies)
diff --git a/fetchers/__init__.py b/fetchers/__init__.py
@@ -11,6 +11,9 @@
 from .JiangxianliFetcher import JiangxianliFetcher
 from .IHuanFetcher import IHuanFetcher
 from .IP89Fetcher import IP89Fetcher
+from .KaiXinFetcher import KaiXinFetcher
+from .XiLaFetcher import XiLaFetcher
+from .XiaoShuFetcher import XiaoShuFetcher
 
 fetchers = [
     Fetcher(name='uu-proxy.com', fetcher=UUFetcher),
@@ -21,4 +24,7 @@
     Fetcher(name='ip.jiangxianli.com', fetcher=JiangxianliFetcher),
     Fetcher(name='ip.ihuan.me', fetcher=IHuanFetcher),
     Fetcher(name='www.89ip.cn', fetcher=IP89Fetcher),
+    Fetcher(name='www.kxdaili.com', fetcher=KaiXinFetcher),
+    Fetcher(name='www.xiladaili.com', fetcher=XiLaFetcher),
+    Fetcher(name='www.xsdaili.cn', fetcher=XiaoShuFetcher),
 ]
diff --git a/test.py b/test.py
@@ -0,0 +1,24 @@
+# encoding : utf-8
+
+import requests
+
+def main():
+    """
+    Manual smoke test: fetch https://www.baidu.com through a hard-coded proxy
+    and print the response body.
+    """
+    proxy_uri = 'http://223.10.82.66:8118'
+    # proxy_uri = 'http://localhost:8118'
+    if not proxy_uri:
+        print(u'暂时没有可用代理')
+        return
+    print(u'获取到的代理是：' + proxy_uri)
+
+    proxies = { 'http': proxy_uri, 'https': proxy_uri }
+    # requests has no default timeout, so an unresponsive proxy would block
+    # this script indefinitely without an explicit one.
+    html = requests.get('https://www.baidu.com', proxies=proxies, timeout=10).text
+    print(html)
+
+if __name__ == '__main__':
+    main()
diff --git a/test/testFetcher.py b/test/testFetcher.py
@@ -7,7 +7,7 @@
 def run():
     proxies_cnt = dict()
     for item in fetchers:
-        if item.name != 'uu-proxy.com': continue # 这行表示只测试特定的爬取器
+        if item.name != 'www.xsdaili.cn': continue # 这行表示只测试特定的爬取器
 
         print('='*10, 'RUNNING ' + item.name, '='*10)
         fetcher = item.fetcher() # 实例化爬取器