Skip to content

Commit 93e2ff2

Browse files
committed
pyspider demo
1 parent c22ef18 commit 93e2ff2

File tree

1 file changed

+75
-0
lines changed

1 file changed

+75
-0
lines changed

Pysp/pysp.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/usr/bin/env python
2+
# -*- encoding: utf-8 -*-
3+
# Created on 2016-11-02 09:27:35
4+
# Project: reo
5+
6+
from pyspider.libs.base_handler import *
7+
import pymongo
8+
9+
class Handler(BaseHandler):
    """pyspider handler that crawls reeoo.com and stores each entry in MongoDB.

    Flow: ``on_start`` seeds the front page, ``index_page`` follows every
    thumbnail link, ``detail_page`` extracts the fields of one entry and
    ``on_result`` writes the extracted record to a local MongoDB instance.
    """

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the site's front page (scheduled every 24 hours)."""
        self.crawl('http://www.reeoo.com', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue the detail page behind every thumbnail on the index page.

        Returns nothing, so ``on_result`` ignores this page's (empty) result.
        """
        for each in response.doc('div[class="thumb"]').items():
            detail_url = each('a').attr.href
            # A thumbnail without a link yields None; there is nothing to crawl.
            if not detail_url:
                continue
            print (detail_url)
            self.crawl(detail_url, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract one entry's fields from its detail page.

        Returns a dict with the keys ``title``, ``tags``, ``description``,
        ``image_url_list`` and ``website_url``; ``on_result`` persists it.
        """
        header = response.doc('body > article > section > header')
        title = header('h1').text()

        # Every link inside the header is treated as a tag label.
        tags = [each.text() for each in header.items('a')]

        content = response.doc('div[id="post_content"]')
        description = content('blockquote > p').text()

        # href of the first link in the post body (None when there is none).
        website_url = content('a').attr.href

        # Image URLs are read from the data-src attribute, not src.
        image_url_list = [
            each.attr('data-src') for each in content.items('img[data-src]')
        ]

        return {
            "title": title,
            "tags": tags,
            "description": description,
            "image_url_list": image_url_list,
            "website_url": website_url,
        }

    def on_result(self, result):
        """Persist one ``detail_page`` result into MongoDB.

        Empty results (e.g. from callbacks that return nothing) are ignored.
        Raises ``KeyError`` if ``result`` lacks one of the expected keys.

        NOTE(review): this override does not call ``BaseHandler.on_result``;
        confirm that bypassing the base implementation is intended.
        """
        if not result:
            return

        print ('------------------')
        print (result)
        print ('------------------')

        data = {
            'title': result['title'],
            'tags': result['tags'],
            'description': result['description'],
            'website_url': result['website_url'],
            'image_url_list': result['image_url_list'],
        }

        client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        try:
            # NOTE(review): 'pyspyspider_projectdb' may be a typo for
            # 'pyspider_projectdb'; kept as-is so existing data stays reachable.
            coll = client['pyspyspider_projectdb']['website']
            # insert_one replaces the legacy Collection.insert (removed in
            # PyMongo 4); inserted_id is the same _id insert() used to return.
            data_id = coll.insert_one(data).inserted_id
        finally:
            # A client is created per result, so release its sockets and
            # background threads instead of leaking them.
            client.close()

        print (data_id)
74+
75+

0 commit comments

Comments
 (0)