from collections import Counter
import math, random, csv, json, re

from bs4 import BeautifulSoup
import requests

######
#
# BOOKS ABOUT DATA
#
######

def is_video(td):
    """it's a video if it has exactly one pricelabel, and if
    the stripped text inside that pricelabel starts with 'Video'"""
    pricelabels = td('span', 'pricelabel')
    return (len(pricelabels) == 1 and
            pricelabels[0].text.strip().startswith("Video"))

def book_info(td):
    """given a BeautifulSoup <td> Tag representing a book,
    extract the book's details and return a dict"""

    title = td.find("div", "thumbheader").a.text
    by_author = td.find('div', 'AuthorName').text
    authors = [x.strip() for x in re.sub("^By ", "", by_author).split(",")]
    isbn_link = td.find("div", "thumbheader").a.get("href")
    isbn = re.match(r"/product/(.*)\.do", isbn_link).groups()[0]
    date = td.find("span", "directorydate").text.strip()

    return {
        "title" : title,
        "authors" : authors,
        "isbn" : isbn,
        "date" : date
    }

from time import sleep

def scrape(num_pages=31):
    base_url = "http://shop.oreilly.com/category/browse-subjects/" + \
               "data.do?sortby=publicationDate&page="

    books = []

    for page_num in range(1, num_pages + 1):
        print("souping page", page_num)
        url = base_url + str(page_num)
        soup = BeautifulSoup(requests.get(url).text, 'html5lib')

        for td in soup('td', 'thumbtext'):
            if not is_video(td):
                books.append(book_info(td))

        # now be a good citizen and respect the robots.txt!
        sleep(30)

    return books

def get_year(book):
    """book["date"] looks like 'November 2014' so we need to
    split on the space and then take the second piece"""
    return int(book["date"].split()[1])

def plot_years(plt, books):
    # 2014 is the last complete year of data (when I ran this)
    year_counts = Counter(get_year(book) for book in books
                          if get_year(book) <= 2014)

    years = sorted(year_counts)
    book_counts = [year_counts[year] for year in years]
    plt.bar([x - 0.5 for x in years], book_counts)
    plt.xlabel("year")
    plt.ylabel("# of data books")
    plt.title("Data is Big!")
    plt.show()
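
# A minimal usage sketch (an illustration, not from the original file); it
# assumes matplotlib is installed and that scrape() has already been run:
#
#   import matplotlib.pyplot as plt
#   books = scrape(num_pages=31)
#   plot_years(plt, books)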

##
#
# APIs
#
##

endpoint = "https://api.github.com/users/joelgrus/repos"

repos = json.loads(requests.get(endpoint).text)

from dateutil.parser import parse

dates = [parse(repo["created_at"]) for repo in repos]
month_counts = Counter(date.month for date in dates)
weekday_counts = Counter(date.weekday() for date in dates)
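
# An illustrative extra (not in the original file): the same Counter pattern
# can tally repositories by primary language; note that "language" may be
# None for repos without a detected language.
language_counts = Counter(repo["language"] for repo in repos)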

####
#
# Twitter
#
####

from twython import Twython

# fill these in if you want to use the code
CONSUMER_KEY = ""
CONSUMER_SECRET = ""
ACCESS_TOKEN = ""
ACCESS_TOKEN_SECRET = ""

def call_twitter_search_api():

    twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET)

    # search for tweets containing the phrase "data science"
    for status in twitter.search(q='"data science"')["statuses"]:
        user = status["user"]["screen_name"]
        text = status["text"]
        print(user, ":", text)
        print()

from twython import TwythonStreamer

# appending data to a global variable is pretty poor form
# but it makes the example much simpler
tweets = []

class MyStreamer(TwythonStreamer):
    """our own subclass of TwythonStreamer that specifies
    how to interact with the stream"""

    def on_success(self, data):
        """what do we do when twitter sends us data?
        here data will be a Python object representing a tweet"""

        # only want to collect English-language tweets
        if data['lang'] == 'en':
            tweets.append(data)

        # stop when we've collected enough
        if len(tweets) >= 1000:
            self.disconnect()

    def on_error(self, status_code, data):
        print(status_code, data)
        self.disconnect()

def call_twitter_streaming_api():
    stream = MyStreamer(CONSUMER_KEY, CONSUMER_SECRET,
                        ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

    # starts consuming public statuses that contain the keyword 'data'
    stream.statuses.filter(track='data')
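
# An illustrative sketch (an assumption about the tweet JSON layout, not part
# of the original file): after call_twitter_streaming_api() has filled
# `tweets`, the most common hashtags could be tallied like this:
#
#   top_hashtags = Counter(hashtag["text"].lower()
#                          for tweet in tweets
#                          for hashtag in tweet["entities"]["hashtags"])
#   print(top_hashtags.most_common(5))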


if __name__ == "__main__":

    def process(date, symbol, price):
        print(date, symbol, price)

    print("tab delimited stock prices:")

    with open('tab_delimited_stock_prices.txt', 'r', encoding='utf8', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        # reader = csv.reader(codecs.iterdecode(f, 'utf-8'), delimiter='\t')
        for row in reader:
            date = row[0]
            symbol = row[1]
            closing_price = float(row[2])
            process(date, symbol, closing_price)

    print()

    print("colon delimited stock prices:")

    with open('colon_delimited_stock_prices.txt', 'r', encoding='utf8', newline='') as f:
        reader = csv.DictReader(f, delimiter=':')
        # reader = csv.DictReader(codecs.iterdecode(f, 'utf-8'), delimiter=':')
        for row in reader:
            date = row["date"]
            symbol = row["symbol"]
            closing_price = float(row["closing_price"])
            process(date, symbol, closing_price)

    print()

    print("writing out comma_delimited_stock_prices.txt")

    today_prices = { 'AAPL' : 90.91, 'MSFT' : 41.68, 'FB' : 64.5 }

    with open('comma_delimited_stock_prices.txt', 'w', encoding='utf8', newline='') as f:
        writer = csv.writer(f, delimiter=',')
        for stock, price in today_prices.items():
            writer.writerow([stock, price])
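
    # Expected contents of the file (an illustrative note, not in the original):
    # one "symbol,price" line per entry, e.g.
    #   AAPL,90.91
    #   MSFT,41.68
    #   FB,64.5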

    print("BeautifulSoup")
    html = requests.get("http://www.example.com").text
    soup = BeautifulSoup(html, 'html5lib')
    print(soup)
    print()

    print("parsing json")

    serialized = """{ "title" : "Data Science Book",
                      "author" : "Joel Grus",
                      "publicationYear" : 2014,
                      "topics" : [ "data", "science", "data science"] }"""

    # parse the JSON to create a Python object
    deserialized = json.loads(serialized)
    if "data science" in deserialized["topics"]:
        print(deserialized)
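
    # Going the other way is symmetric (a small illustrative addition, not in
    # the original): json.dumps turns the Python object back into a JSON string.
    reserialized = json.dumps(deserialized)
    print(reserialized)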

    print()

    print("GitHub API")
    print("dates", dates)
    print("month_counts", month_counts)
    print("weekday_counts", weekday_counts)

    last_5_repositories = sorted(repos,
                                 key=lambda r: r["created_at"],
                                 reverse=True)[:5]

    print("last five languages", [repo["language"]
                                  for repo in last_5_repositories])