
Commit 59482e7

getting
1 parent bccd339 commit 59482e7

2 files changed: +269 -0 lines changed


code-python3/2to3.md

Lines changed: 46 additions & 0 deletions
# Updating the code from Python 2 to Python 3

1. The first and most obvious difference is that in Python 3 `print` takes parentheses.
This means that every

```
print "stuff", 1
```

had to be replaced with

```
print("stuff", 1)
```

This is mostly tedious.

2. <a href="https://www.python.org/dev/peps/pep-3113/">PEP-3113</a> eliminates
tuple unpacking in function parameters. In particular, that means that code like

```
key=lambda (a, b): b
```

has to be replaced with

```
key=lambda pair: pair[1]
```

3. In Python 3, laziness is the order of the day. In particular, `dict`-like
objects no longer have `.iteritems()` methods, so those all have to be replaced
with `.items()`.
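
For example, a Python 2 loop like

```
for key, value in counts.iteritems():
```

has to become

```
for key, value in counts.items():
```

(where `counts` is just a stand-in name for any `dict`).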

4. Binary mode for CSVs. In Python 2 you would open CSV files in binary mode to
make sure you dealt properly with Windows line endings:

```
f = open("some.csv", "rb")
```

In Python 3 you open them in text mode and pass `newline=''` so that the `csv` module can handle line endings itself:

```
f = open("some.csv", 'r', encoding='utf8', newline='')
```
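
The same applies when writing CSVs in Python 3 (as `getting_data.py` below does), for example:

```
f = open("some.csv", 'w', encoding='utf8', newline='')
```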

code-python3/getting_data.py

Lines changed: 223 additions & 0 deletions
from collections import Counter
import math, random, csv, json, re

from bs4 import BeautifulSoup
import requests

######
#
# BOOKS ABOUT DATA
#
######

def is_video(td):
    """it's a video if it has exactly one pricelabel, and if
    the stripped text inside that pricelabel starts with 'Video'"""
    pricelabels = td('span', 'pricelabel')
    return (len(pricelabels) == 1 and
            pricelabels[0].text.strip().startswith("Video"))

def book_info(td):
    """given a BeautifulSoup <td> Tag representing a book,
    extract the book's details and return a dict"""

    title = td.find("div", "thumbheader").a.text
    by_author = td.find('div', 'AuthorName').text
    authors = [x.strip() for x in re.sub("^By ", "", by_author).split(",")]
    isbn_link = td.find("div", "thumbheader").a.get("href")
    isbn = re.match(r"/product/(.*)\.do", isbn_link).groups()[0]
    date = td.find("span", "directorydate").text.strip()

    return {
        "title" : title,
        "authors" : authors,
        "isbn" : isbn,
        "date" : date
    }

from time import sleep

def scrape(num_pages=31):
    base_url = "http://shop.oreilly.com/category/browse-subjects/" + \
               "data.do?sortby=publicationDate&page="

    books = []

    for page_num in range(1, num_pages + 1):
        print("souping page", page_num)
        url = base_url + str(page_num)
        soup = BeautifulSoup(requests.get(url).text, 'html5lib')

        for td in soup('td', 'thumbtext'):
            if not is_video(td):
                books.append(book_info(td))

        # now be a good citizen and respect the robots.txt!
        sleep(30)

    return books

def get_year(book):
    """book["date"] looks like 'November 2014' so we need to
    split on the space and then take the second piece"""
    return int(book["date"].split()[1])

def plot_years(plt, books):
    # 2014 is the last complete year of data (when I ran this)
    year_counts = Counter(get_year(book) for book in books
                          if get_year(book) <= 2014)

    years = sorted(year_counts)
    book_counts = [year_counts[year] for year in years]
    plt.bar([x - 0.5 for x in years], book_counts)
    plt.xlabel("year")
    plt.ylabel("# of data books")
    plt.title("Data is Big!")
    plt.show()
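
# example usage, assuming matplotlib is installed:
#
#   import matplotlib.pyplot as plt
#   plot_years(plt, scrape())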

##
#
# APIs
#
##

endpoint = "https://api.github.com/users/joelgrus/repos"

repos = json.loads(requests.get(endpoint).text)

from dateutil.parser import parse

dates = [parse(repo["created_at"]) for repo in repos]
month_counts = Counter(date.month for date in dates)
weekday_counts = Counter(date.weekday() for date in dates)

####
#
# Twitter
#
####

from twython import Twython

# fill these in if you want to use the code
CONSUMER_KEY = ""
CONSUMER_SECRET = ""
ACCESS_TOKEN = ""
ACCESS_TOKEN_SECRET = ""

def call_twitter_search_api():

    twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET)

    # search for tweets containing the phrase "data science"
    for status in twitter.search(q='"data science"')["statuses"]:
        # in Python 3 these are already unicode strings, no need to encode them
        user = status["user"]["screen_name"]
        text = status["text"]
        print(user, ":", text)
        print()

from twython import TwythonStreamer

# appending data to a global variable is pretty poor form
# but it makes the example much simpler
tweets = []

class MyStreamer(TwythonStreamer):
    """our own subclass of TwythonStreamer that specifies
    how to interact with the stream"""

    def on_success(self, data):
        """what do we do when twitter sends us data?
        here data will be a Python object representing a tweet"""

        # only want to collect English-language tweets
        if data['lang'] == 'en':
            tweets.append(data)

        # stop when we've collected enough
        if len(tweets) >= 1000:
            self.disconnect()

    def on_error(self, status_code, data):
        print(status_code, data)
        self.disconnect()

def call_twitter_streaming_api():
    stream = MyStreamer(CONSUMER_KEY, CONSUMER_SECRET,
                        ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

    # starts consuming public statuses that contain the keyword 'data'
    stream.statuses.filter(track='data')
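
# example usage (requires the credentials above to be filled in):
#
#   call_twitter_streaming_api()   # blocks until 1000 tweets have been collected
#   print(len(tweets))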


if __name__ == "__main__":

    def process(date, symbol, price):
        print(date, symbol, price)

    print("tab delimited stock prices:")

    with open('tab_delimited_stock_prices.txt', 'r', encoding='utf8', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        # reader = csv.reader(codecs.iterdecode(f, 'utf-8'), delimiter='\t')
        for row in reader:
            date = row[0]
            symbol = row[1]
            closing_price = float(row[2])
            process(date, symbol, closing_price)

    print()

    print("colon delimited stock prices:")

    with open('colon_delimited_stock_prices.txt', 'r', encoding='utf8', newline='') as f:
        reader = csv.DictReader(f, delimiter=':')
        # reader = csv.DictReader(codecs.iterdecode(f, 'utf-8'), delimiter=':')
        for row in reader:
            date = row["date"]
            symbol = row["symbol"]
            closing_price = float(row["closing_price"])
            process(date, symbol, closing_price)

    print()

    print("writing out comma_delimited_stock_prices.txt")

    today_prices = { 'AAPL' : 90.91, 'MSFT' : 41.68, 'FB' : 64.5 }

    with open('comma_delimited_stock_prices.txt', 'w', encoding='utf8', newline='') as f:
        writer = csv.writer(f, delimiter=',')
        for stock, price in today_prices.items():
            writer.writerow([stock, price])

    print("BeautifulSoup")
    html = requests.get("http://www.example.com").text
    soup = BeautifulSoup(html, 'html5lib')
    print(soup)
    print()

    print("parsing json")

    serialized = """{ "title" : "Data Science Book",
                      "author" : "Joel Grus",
                      "publicationYear" : 2014,
                      "topics" : [ "data", "science", "data science"] }"""

    # parse the JSON to create a Python object
    deserialized = json.loads(serialized)
    if "data science" in deserialized["topics"]:
        print(deserialized)

    print()

    print("GitHub API")
    print("dates", dates)
    print("month_counts", month_counts)
    print("weekday_counts", weekday_counts)

    last_5_repositories = sorted(repos,
                                 key=lambda r: r["created_at"],
                                 reverse=True)[:5]

    print("last five languages", [repo["language"]
                                  for repo in last_5_repositories])
