This document reproduces getting_data.py from Joel Grus's data-science-from-scratch repository. The code scrapes book listings from the O'Reilly website, calls the GitHub and Twitter APIs, parses JSON, and reads and writes CSV files with different delimiters.


Source: https://raw.githubusercontent.com/joelgrus/data-science-from-scratch/master/code/getting_data.py

from __future__ import division

from collections import Counter
import math, random, csv, json, re   # re is used by book_info below
from bs4 import BeautifulSoup
import requests

######
#
# BOOKS ABOUT DATA
#
######

def is_video(td):
    """it's a video if it has exactly one pricelabel, and if
    the stripped text inside that pricelabel starts with 'Video'"""
    pricelabels = td('span', 'pricelabel')
    return (len(pricelabels) == 1 and
            pricelabels[0].text.strip().startswith("Video"))

def book_info(td):
    """given a BeautifulSoup <td> Tag representing a book,
    extract the book's details and return a dict"""

    title = td.find("div", "thumbheader").a.text
    by_author = td.find('div', 'AuthorName').text
    authors = [x.strip() for x in re.sub("^By ", "", by_author).split(",")]
    isbn_link = td.find("div", "thumbheader").a.get("href")
    isbn = re.match("/product/(.*)\.do", isbn_link).groups()[0]
    date = td.find("span", "directorydate").text.strip()

    return {
        "title"   : title,
        "authors" : authors,
        "isbn"    : isbn,
        "date"    : date
    }
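
# a tiny self-contained check of book_info (my addition, not in the original
# file; the markup mimics the O'Reilly structure the scraper expects, parsed
# with html.parser so the bare <td> isn't discarded):
_sample_td = BeautifulSoup(
    '<td class="thumbtext">'
    '<div class="thumbheader"><a href="/product/123.do">Sample Book</a></div>'
    '<div class="AuthorName">By Jane Doe, John Roe</div>'
    '<span class="directorydate">November 2014</span>'
    '</td>', 'html.parser').td
# book_info(_sample_td) should return
#   {"title": "Sample Book", "authors": ["Jane Doe", "John Roe"],
#    "isbn": "123", "date": "November 2014"}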

from time import sleep

def scrape(num_pages=31):
    base_url = "http://shop.oreilly.com/category/browse-subjects/" + \
               "data.do?sortby=publicationDate&page="

    books = []

    for page_num in range(1, num_pages + 1):
        print "souping page", page_num
        url = base_url + str(page_num)
        soup = BeautifulSoup(requests.get(url).text, 'html5lib')

        for td in soup('td', 'thumbtext'):
            if not is_video(td):
                books.append(book_info(td))

        # now be a good citizen and respect the robots.txt!
        sleep(30)

    return books

def get_year(book):
    """book["date"] looks like 'November 2014' so we need to
    split on the space and then take the second piece"""
    return int(book["date"].split()[1])

def plot_years(plt, books):
    # 2014 is the last complete year of data (when I ran this)
    year_counts = Counter(get_year(book) for book in books
                          if get_year(book) <= 2014)

    years = sorted(year_counts)
    book_counts = [year_counts[year] for year in years]
    plt.bar([x - 0.5 for x in years], book_counts)
    plt.xlabel("year")
    plt.ylabel("# of data books")
    plt.title("Data is Big!")
    plt.show()
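
# a minimal usage sketch for the two functions above (my addition, not in
# the original file; assumes matplotlib is installed and that the O'Reilly
# page still matches the CSS classes book_info expects):
def plot_scraped_years():
    import matplotlib.pyplot as plt
    books = scrape(num_pages=2)   # a couple of pages is enough for a smoke test
    plot_years(plt, books)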

##
#
# APIs
#
##

# an unauthenticated call to the GitHub API (no credentials needed,
# though unauthenticated requests are rate-limited)
endpoint = "https://api.github.com/users/joelgrus/repos"

repos = json.loads(requests.get(endpoint).text)

from dateutil.parser import parse

dates = [parse(repo["created_at"]) for repo in repos]

month_counts = Counter(date.month for date in dates)
weekday_counts = Counter(date.weekday() for date in dates)
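
# a quick sanity check of the parsing above (my addition, not in the
# original file): dateutil handles GitHub's ISO-8601 timestamps directly
assert parse("2013-07-05T02:02:28Z").weekday() == 4   # that date was a Friday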

####
#
# Twitter
#
####

from twython import Twython

# fill these in if you want to use the code
CONSUMER_KEY = ""
CONSUMER_SECRET = ""
ACCESS_TOKEN = ""
ACCESS_TOKEN_SECRET = ""

def call_twitter_search_api():

    twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET)

    # search for tweets containing the phrase "data science"
    for status in twitter.search(q='"data science"')["statuses"]:
        user = status["user"]["screen_name"].encode('utf-8')
        text = status["text"].encode('utf-8')
        print user, ":", text
        print

from twython import TwythonStreamer

# appending data to a global variable is pretty poor form
# but it makes the example much simpler
tweets = []

class MyStreamer(TwythonStreamer):
    """our own subclass of TwythonStreamer that specifies
    how to interact with the stream"""

    def on_success(self, data):
        """what do we do when twitter sends us data?
        here data will be a Python object representing a tweet"""

        # only want to collect English-language tweets
        if data['lang'] == 'en':
            tweets.append(data)

        # stop when we've collected enough
        if len(tweets) >= 1000:
            self.disconnect()

    def on_error(self, status_code, data):
        print status_code, data
        self.disconnect()

def call_twitter_streaming_api():
    stream = MyStreamer(CONSUMER_KEY, CONSUMER_SECRET,
                        ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

    # starts consuming public statuses that contain the keyword 'data'
    stream.statuses.filter(track='data')
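
# a minimal sketch of what you might do with the collected tweets (my
# addition, not in the original file; assumes the credentials above are
# filled in and that each tweet dict carries the standard "entities" field):
def top_streamed_hashtags(n=5):
    call_twitter_streaming_api()   # blocks until 1000 tweets have arrived
    counts = Counter(hashtag["text"].lower()
                     for tweet in tweets
                     for hashtag in tweet["entities"]["hashtags"])
    print counts.most_common(n)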

if __name__ == "__main__":

    def process(date, symbol, price):
        print date, symbol, price
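
    # the two file reads below expect sample data files from the book's
    # repo; a minimal sketch to create them if you don't have them (my
    # addition, reusing prices that appear in today_prices further down):
    with open('tab_delimited_stock_prices.txt', 'wb') as f:
        f.write("6/20/2014\tAAPL\t90.91\n6/20/2014\tMSFT\t41.68\n")
    with open('colon_delimited_stock_prices.txt', 'wb') as f:
        f.write("date:symbol:closing_price\n6/20/2014:AAPL:90.91\n")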

print "tab delimited stock prices:"

with open('tab_delimited_stock_prices.txt', 'rb') as f:


reader = csv.reader(f, delimiter='\t')
for row in reader:
date = row[0]
symbol = row[1]
closing_price = float(row[2])
process(date, symbol, closing_price)

print

print "colon delimited stock prices:"

with open('colon_delimited_stock_prices.txt', 'rb') as f:


reader = csv.DictReader(f, delimiter=':')
for row in reader:
date = row["date"]
symbol = row["symbol"]
closing_price = float(row["closing_price"])
process(date, symbol, closing_price)

print

print "writing out comma_delimited_stock_prices.txt"

today_prices = { 'AAPL' : 90.91, 'MSFT' : 41.68, 'FB' : 64.5 }

with open('comma_delimited_stock_prices.txt','wb') as f:
writer = csv.writer(f, delimiter=',')
for stock, price in today_prices.items():
writer.writerow([stock, price])
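
    # csv.writer also quotes fields that contain the delimiter, which naive
    # string joining would get wrong (my addition, a one-line illustration):
    #
    #   csv.writer(f).writerow(["TEST", "1,5"])   # written as: TEST,"1,5"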

print "BeautifulSoup"
html = requests.get("http://www.example.com").text
soup = BeautifulSoup(html)
print soup
print

print "parsing json"

serialized = """{ "title" : "Data Science Book",


"author" : "Joel Grus",
"publicationYear" : 2014,
"topics" : [ "data", "science", "data science"] }"""

# parse the JSON to create a Python object


deserialized = json.loads(serialized)
if "data science" in deserialized["topics"]:

https://raw.githubusercontent.com/joelgrus/data-science-from-scratch/master/code/getting_data.py 3/4
9/26/2017 https://raw.githubusercontent.com/joelgrus/data-science-from-scratch/master/code/getting_data.py

print deserialized
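
    # serialization goes the other way with json.dumps (my addition, not in
    # the original file):
    #
    #   json.dumps(deserialized)   # back to a JSON string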

    print

    print "GitHub API"
    print "dates", dates
    print "month_counts", month_counts
    print "weekday_counts", weekday_counts

    last_5_repositories = sorted(repos,
                                 key=lambda r: r["created_at"],
                                 reverse=True)[:5]

    print "last five languages", [repo["language"]
                                  for repo in last_5_repositories]

