@@ -3,7 +3,7 @@
 from enum import Enum, unique
 from hashlib import sha1
 from random import random
-from threading import Thread, current_thread
+from threading import Thread, current_thread, local
 from time import sleep
 from urllib.parse import urlparse
 
@@ -79,13 +79,16 @@ def parse(self, html_page, *, domain='m.sohu.com'):
         path = parser.path
         query = '?' + parser.query if parser.query else ''
         full_url = f'{scheme}://{netloc}{path}{query}'
+        redis_client = thread_local.redis_client
         if not redis_client.sismember('visited_urls', full_url):
             redis_client.rpush('m_sohu_task', full_url)
 
     def extract(self, html_page):
         pass
 
     def store(self, data_dict):
+        # redis_client = thread_local.redis_client
+        # mongo_db = thread_local.mongo_db
         pass
 
 
@@ -96,6 +99,10 @@ def __init__(self, name, spider):
         self.spider = spider
 
     def run(self):
+        redis_client = redis.Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
+        mongo_client = pymongo.MongoClient(host='1.2.3.4', port=27017)
+        thread_local.redis_client = redis_client
+        thread_local.mongo_db = mongo_client.msohu
         while True:
             current_url = redis_client.lpop('m_sohu_task')
             while not current_url:
@@ -109,6 +116,7 @@ def run(self):
             hasher = hasher_proto.copy()
             hasher.update(current_url.encode('utf-8'))
             doc_id = hasher.hexdigest()
+            sohu_data_coll = mongo_client.msohu.webpages
             if not sohu_data_coll.find_one({'_id': doc_id}):
                 sohu_data_coll.insert_one({
                     '_id': doc_id,
@@ -124,17 +132,15 @@ def is_any_alive(spider_threads):
                 for spider_thread in spider_threads])
 
 
-redis_client = redis.Redis(host='1.2.3.4',
-                           port=6379, password='1qaz2wsx')
-mongo_client = pymongo.MongoClient(host='120.77.222.217', port=27017)
-db = mongo_client.msohu
-sohu_data_coll = db.webpages
+thread_local = local()
 hasher_proto = sha1()
 
 
 def main():
+    redis_client = redis.Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
     if not redis_client.exists('m_sohu_task'):
         redis_client.rpush('m_sohu_task', 'http://m.sohu.com/')
+
     spider_threads = [SpiderThread('thread-%d' % i, Spider())
                       for i in range(10)]
     for spider_thread in spider_threads:
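The heart of this change is `threading.local()`: each `SpiderThread` builds its own Redis and Mongo clients inside `run()` and parks them on `thread_local`, so worker threads never share a live connection object while code elsewhere (like `Spider.parse`) can still reach "its" client through the same global name. A minimal, self-contained sketch of that pattern using only the standard library; `FakeClient` and `get_client` are hypothetical stand-ins for the `redis.Redis`/`pymongo.MongoClient` objects in the diff:

```python
import threading

# One shared local() object; each thread sees only its own attributes on it.
thread_local = threading.local()


class FakeClient:
    """Stand-in for redis.Redis / pymongo.MongoClient in this sketch."""

    def __init__(self):
        # Record which thread built this client to make the isolation visible.
        self.owner = threading.current_thread().name


def get_client():
    # Lazily create one client per thread on first access.
    if not hasattr(thread_local, 'client'):
        thread_local.client = FakeClient()
    return thread_local.client


def worker():
    client = get_client()
    # Every thread prints its own name twice: it never sees another
    # thread's client, even though thread_local itself is shared.
    print(threading.current_thread().name, '->', client.owner)


threads = [threading.Thread(target=worker, name=f'thread-{i}') for i in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```

Creating the client lazily on first access, as `get_client` does here, is a common variant of what the commit does eagerly at the top of `run()`; either way, each thread only ever touches the attributes it set on the `local()` object itself.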
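The second pattern the diff rewires per thread is deduplication by content address: the SHA-1 hex digest of the URL becomes the MongoDB `_id`, and a `find_one` guard skips re-inserting pages already stored. A rough sketch of that logic under an obvious simplification: a plain dict (`fake_coll`, hypothetical) stands in for the `msohu.webpages` collection.

```python
from hashlib import sha1

# One prototype hasher, copied per URL, mirroring hasher_proto in the script.
hasher_proto = sha1()

fake_coll = {}  # stand-in for mongo_client.msohu.webpages in this sketch


def store_page(url, page):
    # Derive a stable document id from the URL alone.
    hasher = hasher_proto.copy()
    hasher.update(url.encode('utf-8'))
    doc_id = hasher.hexdigest()
    # Insert only when unseen, like the find_one/insert_one pair in the diff.
    if doc_id not in fake_coll:
        fake_coll[doc_id] = {'_id': doc_id, 'url': url, 'page': page}


store_page('http://m.sohu.com/', '<html>...</html>')
store_page('http://m.sohu.com/', '<html>...</html>')  # second call is a no-op
print(len(fake_coll), 'document(s) stored')
```

Because the id is a pure function of the URL, any thread that crawls the same page computes the same `_id`, so the check works across threads without extra coordination.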