Skip to content

Commit e1f8540

Browse files
committed
initial commit of code to scrape pastebin
1 parent 45d20ae commit e1f8540

File tree

4 files changed

+179
-0
lines changed

4 files changed

+179
-0
lines changed

pastebin-mirror/__main__.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
from scraper import PastebinComScraper
2+
from storage import SQLite3Storage
3+
import time
4+
import sys
5+
import os
6+
7+
8+
def usage():
    """Print the one-line command synopsis to standard output."""
    synopsis = "pastebin-mirror [sqlite-file]"
    print(synopsis)
10+
11+
12+
if __name__ == '__main__':
    # The SQLite database path is the single required positional argument.
    if len(sys.argv) < 2:
        usage()
        sys.exit(1)

    pastebin_sqlite_database = sys.argv[1]

    # A directory cannot be opened as a database file, so reject it up front.
    if os.path.isdir(pastebin_sqlite_database):
        usage()
        sys.exit(1)

    scraper = PastebinComScraper()
    # Open the database named on the command line; without this argument the
    # storage class silently falls back to its own default location.
    storage = SQLite3Storage(pastebin_sqlite_database)

    storage.initialize_tables()

    while True:
        recent_pastes = scraper.get_recent_pastes()

        for paste in recent_pastes:
            key = paste['key']

            # Keyword arguments keep each field in its matching column:
            # save_paste_reference declares `size` before `timestamp`, so
            # passing date/size positionally would swap the two values.
            storage.save_paste_reference(
                key,
                timestamp=paste['date'],
                size=paste['size'],
                expires=paste['expire'],
                title=paste['title'],
                syntax=paste['syntax'],
                user=paste['user'],
            )

            if not storage.has_paste_content(key):
                print("Fetching paste content for %s" % key)

                content = scraper.get_paste_content(key)

                # get_paste_content returns None on a failed request. Storing
                # that would mark the paste as fetched and prevent any retry,
                # so leave it missing and try again on a later pass.
                if content is None:
                    print("Could not fetch paste content for %s" % key)
                    continue

                storage.save_paste_content(key, content)

        # NOTE(review): the nesting of this sleep could not be recovered from
        # the flattened source; it is assumed to pace the polling loop (once
        # per listing request) rather than each paste - confirm.
        time.sleep(1)

pastebin-mirror/scraper.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/usr/bin/env python3
2+
3+
import requests
4+
5+
6+
class PastebinComScraper:
    """Small HTTP client for the pastebin.com scraping endpoints.

    Args:
        session: Optional object exposing a ``requests``-compatible
            ``get(url, params=..., timeout=...)`` method (for example a
            ``requests.Session``). When omitted, the ``requests`` module's
            top-level ``get`` is used.
        timeout: Optional timeout forwarded to every request. ``None``
            (the default) waits indefinitely.
    """

    def __init__(self, session=None, timeout=None):
        self.__RAW_URL__ = 'https://pastebin.com/raw/'
        self.__ITEM_URL__ = 'https://pastebin.com/api_scrape_item.php'
        self.__METADATA_URL__ = 'https://pastebin.com/api_scrape_item_meta.php'
        self.__LIST_URL__ = 'https://pastebin.com/api_scraping.php'
        self.__ERROR_TEXT__ = 'Error, we cannot find this paste.'

        # The requests module itself exposes get(), so it doubles as the
        # default transport when no session is supplied.
        self._http = requests if session is None else session
        self._timeout = timeout

    def get_paste_content(self, key):
        """Return the raw body (bytes) of paste ``key``, or None on failure."""
        result = self._http.get(self.__RAW_URL__ + key, timeout=self._timeout)

        if not result.ok:
            return None

        return result.content

    def get_paste_metadata(self, key):
        """Return the metadata record for paste ``key``.

        Returns None when the request fails or when the response body is
        the "cannot find this paste" error text.
        """
        paste = self._http.get(
            self.__METADATA_URL__,
            params={'i': key},
            timeout=self._timeout,
        )

        if not paste.ok:
            return None

        if paste.text == self.__ERROR_TEXT__:
            return None

        # The endpoint answers with a JSON list; the first element is used.
        return paste.json()[0]

    def get_recent_pastes(self, limit=250):
        """Return the decoded JSON listing of recent pastes.

        Args:
            limit: Requested number of entries; values above 250 are
                clamped to 250.

        Returns:
            The decoded JSON response, or an empty list when the request
            fails.
        """
        # The query key must be the literal string 'limit'. Using the
        # variable as the dict key would send e.g. "250=250" instead.
        paste_list = self._http.get(
            self.__LIST_URL__,
            params={'limit': min(250, limit)},
            timeout=self._timeout,
        )

        if not paste_list.ok:
            return []

        return paste_list.json()

pastebin-mirror/storage.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import sqlite3
2+
3+
class SQLite3Storage:
    """Store paste metadata and raw paste content in an SQLite database.

    Args:
        location: Path handed to ``sqlite3.connect``; defaults to
            ``'pastebin.db'``.
    """

    def __init__(self, location='pastebin.db'):
        self.connection = sqlite3.connect(location)

    def initialize_tables(self):
        """Create the ``paste`` and ``paste_content`` tables if missing."""
        # The connection context manager commits on success and rolls back
        # if either statement raises.
        with self.connection:
            self.connection.execute(
                '''
                CREATE TABLE IF NOT EXISTS paste (
                    paste_key CHAR(8) PRIMARY KEY,
                    timestamp TIMESTAMP,
                    size INT,
                    expires TIMESTAMP,
                    title TEXT,
                    syntax TEXT,
                    user TEXT NULL
                );
                '''
            )

            self.connection.execute(
                '''
                CREATE TABLE IF NOT EXISTS paste_content (
                    paste_key CHAR(8) PRIMARY KEY,
                    raw_content TEXT
                );
                '''
            )

    def has_paste_content(self, key):
        """Return True when a ``paste_content`` row exists for ``key``."""
        row = self.connection.execute(
            'SELECT 1 FROM paste_content WHERE paste_key = ? LIMIT 1',
            (key,),
        ).fetchone()

        return row is not None

    def save_paste_reference(self, key, size, timestamp, expires, title, syntax, user):
        """Insert or replace the metadata row for paste ``key``.

        Note that ``size`` comes before ``timestamp`` in this signature,
        while the table stores ``timestamp`` before ``size``. Callers are
        safest passing these fields by keyword.
        """
        with self.connection:
            self.connection.execute(
                '''
                INSERT OR REPLACE INTO paste
                    (paste_key, timestamp, size, expires, title, syntax, user)
                VALUES
                    (?, ?, ?, ?, ?, ?, ?)
                ''',
                (
                    key,
                    timestamp,
                    size,
                    expires,
                    title,
                    syntax,
                    user,
                )
            )

    def save_paste_content(self, key, content):
        """Insert or replace the stored content for paste ``key``.

        The write is committed immediately, so it does not depend on a
        later call to another method to become durable.
        """
        with self.connection:
            self.connection.execute(
                '''
                INSERT OR REPLACE INTO paste_content
                    (paste_key, raw_content)
                VALUES
                    (?, ?)
                ''',
                (
                    key,
                    content,
                )
            )

setup.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from setuptools import setup
2+
3+
# Distribution metadata for the pastebin-mirror project.
setup(
    name='pastebin-mirror',
    version='0.0.1',
    description='Mirror Pastebin to an SQLite DB',
    url='http://github.com/imnotjames/pastebin-mirror',
    author='James Ward',
    # NOTE(review): this address looks like an obfuscated placeholder rather
    # than a real e-mail address - confirm against the upstream project.
    author_email='[email protected]',
    license='MIT',
    packages=['pastebin-mirror'],
    # Runtime dependency used for the HTTP requests.
    install_requires=['requests'],
    zip_safe=False,
)

0 commit comments

Comments
 (0)