1+ #!/usr/bin/env python
2+ # -*- encoding: utf-8 -*-
3+ # Created on 2016-11-02 09:27:35
4+ # Project: reo
5+
6+ from pyspider .libs .base_handler import *
7+ import pymongo
8+
class Handler(BaseHandler):
    """pyspider handler that scrapes site showcases from reeoo.com into MongoDB.

    Flow: ``on_start`` seeds the front page, ``index_page`` follows every
    thumbnail link, ``detail_page`` extracts one record per showcased site,
    and ``on_result`` writes that record to the local MongoDB instance.
    """

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the front page (re-run every 24 hours)."""
        self.crawl('http://www.reeoo.com', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue a detail-page crawl for each thumbnail on the index page.

        :param response: fetched index page; ``response.doc`` is queried
            with CSS selectors.
        """
        for each in response.doc('div[class="thumb"]').items():
            detail_url = each('a').attr.href
            print(detail_url)
            if not detail_url:
                # Thumbnail without an anchor/href: there is no page to
                # follow, so do not hand a missing URL to crawl().
                continue
            self.crawl(detail_url, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract one showcase record from a detail page.

        :param response: fetched detail page.
        :returns: dict with ``title``, ``tags``, ``description``,
            ``image_url_list`` and ``website_url``.
        """
        header = response.doc('body > article > section > header')
        title = header('h1').text()

        # Every link inside the header is treated as a tag label.
        tags = [each.text() for each in header.items('a')]

        content = response.doc('div[id="post_content"]')
        description = content('blockquote > p').text()

        # href of the first anchor inside the post content.
        website_url = content('a').attr.href

        # The image address is read from the data-src attribute, not src.
        image_url_list = [
            each.attr('data-src') for each in content.items('img[data-src]')
        ]

        return {
            "title": title,
            "tags": tags,
            "description": description,
            "image_url_list": image_url_list,
            "website_url": website_url,
        }

    def on_result(self, result):
        """Persist a scraped record into the ``website`` MongoDB collection.

        Falsy results (callbacks that return nothing) are ignored.

        NOTE(review): this override does not call ``BaseHandler.on_result``;
        confirm that bypassing pyspider's own result handling is intended.

        :param result: dict produced by ``detail_page``.
        """
        if not result:
            return

        print('------------------')
        print(result)
        print('------------------')

        # Build a separate document so the insert (which adds an ``_id``
        # key to the dict it is given) does not mutate ``result``.
        data = {
            'title': result['title'],
            'tags': result['tags'],
            'description': result['description'],
            'website_url': result['website_url'],
            'image_url_list': result['image_url_list'],
        }

        client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        try:
            # NOTE(review): 'pyspyspider_projectdb' looks like a typo for
            # 'pyspider_projectdb'; left as-is because existing data may
            # already live under this name -- confirm before renaming.
            coll = client['pyspyspider_projectdb']['website']
            # insert_one replaces the legacy Collection.insert call.
            data_id = coll.insert_one(data).inserted_id
        finally:
            # One client is opened per result; close it so connections do
            # not accumulate over the life of the crawl.
            client.close()

        print(data_id)
74+
75+
0 commit comments