Skip to content

Commit 452855e

Browse files
committed
latest edits
1 parent 95e7bda commit 452855e

18 files changed

+20370
-249
lines changed

code/clustering.py

Lines changed: 79 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,11 @@ def plot_squared_clustering_errors(plt):
6363
# using clustering to recolor an image
6464
#
6565

66-
def recolor_image(input_file, k):
66+
def recolor_image(input_file, k=5):
6767

6868
img = mpimg.imread(path_to_png_file)
6969
pixels = [pixel for row in img for pixel in row]
70-
clusterer = KMeans(5)
70+
clusterer = KMeans(k)
7171
clusterer.train(pixels) # this might take a while
7272

7373
def recolor(pixel):
@@ -85,91 +85,80 @@ def recolor(pixel):
8585
# hierarchical clustering
8686
#
8787

88+
def is_leaf(cluster):
    """a leaf cluster is a 1-tuple, so it is recognized by its length"""
    size = len(cluster)
    return size == 1
91+
92+
def get_children(cluster):
    """returns the children stored in the second element of a merged
    cluster; raises TypeError if this is a leaf cluster, which has none"""
    if not is_leaf(cluster):
        return cluster[1]
    raise TypeError("a leaf cluster has no children")
99+
100+
def get_values(cluster):
    """returns the cluster itself if it's a leaf (it is already a 1-tuple
    holding its value), otherwise a flat list of every value found in the
    leaf clusters underneath it"""
    if is_leaf(cluster):
        return cluster

    # walk each child in order and collect the values below it
    values = []
    for child in get_children(cluster):
        values.extend(get_values(child))
    return values
109+
88110
def cluster_distance(cluster1, cluster2, distance_agg=min):
89111
"""finds the aggregate distance between elements of cluster1
90112
and elements of cluster2"""
91-
return distance_agg(distance(input_i, input_j)
92-
for input_i in cluster1.members()
93-
for input_j in cluster2.members())
94-
95-
class LeafCluster:
96-
"""stores a single input
97-
it has 'infinite depth' so that we never try to split it"""
98-
99-
def __init__(self, value):
100-
self.value = value
101-
self.depth = float('inf')
102-
103-
def __repr__(self):
104-
return str(self.value)
105-
106-
def members(self):
107-
"""a LeafCluster has only one member"""
108-
return [self.value]
109-
110-
class MergedCluster:
111-
"""a new cluster that's the result of 'merging' two clusters"""
112-
113-
def __init__(self, branches, depth):
114-
self.branches = branches
115-
self.depth = depth
116-
117-
def __repr__(self):
118-
"""show as {(depth) child1, child2}"""
119-
return ("{(" + str(self.depth) + ") " +
120-
", ".join(str(b) for b in self.branches) + " }")
121-
122-
def members(self):
123-
"""recursively get members by looking for members of branches"""
124-
return [member
125-
for cluster in self.branches
126-
for member in cluster.members()]
127-
128-
129-
class BottomUpClusterer:
130-
131-
def __init__(self, distance_agg=min):
132-
self.agg = distance_agg
133-
self.clusters = None
134-
135-
def train(self, inputs):
136-
# start with each input its own cluster
137-
self.clusters = [LeafCluster(input) for input in inputs]
138-
139-
while len(self.clusters) > 1:
140-
141-
# find the two closest clusters
142-
c1, c2 = min([(cluster1, cluster2)
143-
for cluster1 in self.clusters
144-
for cluster2 in self.clusters
145-
if cluster1 != cluster2],
146-
key=lambda (c1, c2): cluster_distance(c1, c2,
147-
self.agg))
148-
149-
merged_cluster = MergedCluster([c1, c2], len(self.clusters))
150-
151-
self.clusters = [c for c in self.clusters
152-
if c not in [c1, c2]]
153-
154-
self.clusters.append(merged_cluster)
155-
156-
def get_clusters(self, num_clusters):
157-
"""extract num_clusters clusters from the hierachy"""
158-
159-
clusters = self.clusters[:] # create a copy so we can modify it
160-
while len(clusters) < num_clusters:
161-
# choose the least deep cluster
162-
next_cluster = min(clusters, key=lambda c: c.depth)
163-
# remove it from the list
164-
clusters = [c for c in clusters if c != next_cluster]
165-
# and add its children
166-
clusters.extend(next_cluster.branches)
113+
return distance_agg([distance(input1, input2)
114+
for input1 in get_values(cluster1)
115+
for input2 in get_values(cluster2)])
116+
117+
def get_merge_order(cluster):
    """returns the merge_order stored as the first element of a merged
    cluster's 2-tuple; a leaf was never merged, so it gets infinity"""
    return float('inf') if is_leaf(cluster) else cluster[0]
122+
123+
def bottom_up_cluster(inputs, distance_agg=min):
    """repeatedly merges the two closest clusters until one remains

    inputs:       the values to cluster; each starts as a leaf 1-tuple
    distance_agg: passed through to cluster_distance to aggregate the
                  pairwise distances between two clusters (default min)

    returns the single remaining cluster: a leaf 1-tuple if there was one
    input, otherwise a 2-tuple (merge_order, [child1, child2]).
    raises IndexError if inputs is empty (there is no cluster to return)."""
    # start with every input a leaf cluster / 1-tuple
    clusters = [(input_value,) for input_value in inputs]

    # as long as we have more than one cluster left...
    while len(clusters) > 1:
        # consider every pair of clusters at distinct positions
        candidate_pairs = [(cluster1, cluster2)
                           for i, cluster1 in enumerate(clusters)
                           for cluster2 in clusters[:i]]

        # find the two closest clusters (a one-argument lambda, because
        # tuple-unpacking parameters are not valid syntax in Python 3)
        c1, c2 = min(candidate_pairs,
                     key=lambda pair: cluster_distance(pair[0], pair[1],
                                                       distance_agg))

        # remove exactly those two clusters; compare by identity so that
        # other clusters that merely hold equal values are not dropped too
        clusters = [c for c in clusters if c is not c1 and c is not c2]

        # merge them, using merge_order = # of clusters left
        merged_cluster = (len(clusters), [c1, c2])

        # and add their merge
        clusters.append(merged_cluster)

    # when there's only one cluster left, return it
    return clusters[0]
172146

147+
def generate_clusters(base_cluster, num_clusters):
    """unmerges base_cluster, most recent merge first, until the list
    holds at least num_clusters clusters, and returns that list"""
    clusters = [base_cluster]

    while len(clusters) < num_clusters:
        # the lowest merge_order marks the cluster that was merged last
        last_merged = min(clusters, key=get_merge_order)

        # swap it for its children (i.e., unmerge it)
        remaining = [c for c in clusters if c != last_merged]
        remaining.extend(get_children(last_merged))
        clusters = remaining

    return clusters
173162

174163
if __name__ == "__main__":
175164

@@ -198,11 +187,16 @@ def get_clusters(self, num_clusters):
198187

199188
print "bottom up hierarchical clustering"
200189

201-
buc = BottomUpClusterer() # or BottomUpClusterer(max) if you like
202-
buc.train(inputs)
203-
print buc.clusters[0]
190+
base_cluster = bottom_up_cluster(inputs)
191+
print base_cluster
192+
193+
print
194+
print "three clusters, min:"
195+
for cluster in generate_clusters(base_cluster, 3):
196+
print get_values(cluster)
204197

205198
print
206-
print "three clusters:"
207-
for cluster in buc.get_clusters(3):
208-
print cluster
199+
print "three clusters, max:"
200+
base_cluster = bottom_up_cluster(inputs, max)
201+
for cluster in generate_clusters(base_cluster, 3):
202+
print get_values(cluster)

code/databases.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ def insert(self, row_values):
1717
row_dict = dict(zip(self.columns, row_values))
1818
self.rows.append(row_dict)
1919

20+
def update(self, updates, predicate):
    """apply updates to every row matching predicate

    updates:   dict mapping column name -> new value
    predicate: function taking a row dict and returning a truthy value
               for the rows that should be changed

    rows are modified in place; nothing is returned"""
    for row in self.rows:
        if predicate(row):
            # dict.update works on Python 2 and 3 (iteritems is 2-only)
            row.update(updates)
25+
2026
def delete(self, predicate=lambda row: True):
2127
"""delete all rows matching predicate
2228
or all rows if no predicate supplied"""
@@ -240,7 +246,7 @@ def count_interests(rows):
240246
# SUBQUERIES
241247

242248
likes_sql_user_ids = user_interests \
243-
.where(lambda row: row["interest"] == "SQL")
249+
.where(lambda row: row["interest"] == "SQL") \
244250
.select(keep_columns=['user_id'])
245251

246252
likes_sql_user_ids.group_by(group_by_columns=[],

code/decision_trees.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@ def class_probabilities(labels):
1313
for count in Counter(labels).values()]
1414

1515
def data_entropy(labeled_data):
    """entropy of the labels in labeled_data, a collection of
    (input, label) pairs; only the label of each pair is used"""
    labels = [label for _, label in labeled_data]
    return entropy(class_probabilities(labels))
1819

1920
def partition_entropy(subsets):
2021
"""find the entropy from this partition of data into subsets"""
@@ -51,11 +52,14 @@ def classify(tree, input):
5152

5253
# otherwise find the correct subtree
5354
attribute, subtree_dict = tree
54-
value = input[attribute]
55-
subtree = subtree_dict[value]
5655

57-
# and use it to classify the input
58-
return classify(subtree, input)
56+
subtree_key = input.get(attribute) # None if input is missing attribute
57+
58+
if subtree_key not in subtree_dict: # if no subtree for key,
59+
subtree_key = None # we'll use the None subtree
60+
61+
subtree = subtree_dict[subtree_key] # choose the appropriate subtree
62+
return classify(subtree, input) # and use it to classify the input
5963

6064
def build_tree_id3(inputs, split_candidates=None):
6165

@@ -90,6 +94,8 @@ def build_tree_id3(inputs, split_candidates=None):
9094
subtrees = { attribute : build_tree_id3(subset, new_candidates)
9195
for attribute, subset in partitions.iteritems() }
9296

97+
subtrees[None] = num_trues > num_falses # default case
98+
9399
return (best_attribute, subtrees)
94100

95101
def forest_classify(trees, input):

code/getting_data.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,23 @@
1111
#
1212
######
1313

14+
def is_video(td):
    """a td is a video when it holds exactly one 'pricelabel' span
    whose text, once stripped, begins with 'Video'"""
    pricelabels = td('span', 'pricelabel')

    # zero or several price labels: not a video
    if len(pricelabels) != 1:
        return False

    label_text = pricelabels[0].text.strip()
    return label_text.startswith("Video")
20+
1421
def book_info(td):
1522
"""given a BeautifulSoup <td> Tag representing a book,
1623
extract the book's details and return a dict"""
1724

18-
title = td.find("div", "thumbheader").a.string
19-
by_author = td.find('div', 'AuthorName').string
25+
title = td.find("div", "thumbheader").a.text
26+
by_author = td.find('div', 'AuthorName').text
2027
authors = [x.strip() for x in re.sub("^By ", "", by_author).split(",")]
2128
isbn_link = td.find("div", "thumbheader").a.get("href")
2229
isbn = re.match("/product/(.*)\.do", isbn_link).groups()[0]
23-
date = td.find("span", "directorydate").string.strip()
30+
date = td.find("span", "directorydate").text.strip()
2431

2532
return {
2633
"title" : title,
@@ -31,7 +38,7 @@ def book_info(td):
3138

3239
from time import sleep
3340

34-
def scrape(num_pages=28):
41+
def scrape(num_pages=31):
3542
base_url = "http://shop.oreilly.com/category/browse-subjects/" + \
3643
"data.do?sortby=publicationDate&page="
3744

@@ -40,27 +47,33 @@ def scrape(num_pages=28):
4047
for page_num in range(1, num_pages + 1):
4148
print "souping page", page_num
4249
url = base_url + str(page_num)
43-
soup = BeautifulSoup(requests.get(url).text)
50+
soup = BeautifulSoup(requests.get(url).text, 'html5lib')
4451

45-
for td in soup('td'):
46-
if td.find('div', 'AuthorName') and not td.td:
52+
for td in soup('td', 'thumbtext'):
53+
if not is_video(td):
4754
books.append(book_info(td))
4855

4956
# now be a good citizen and respect the robots.txt!
5057
sleep(30)
5158

52-
def year(book):
59+
return books
60+
61+
def get_year(book):
    """book["date"] looks like 'November 2014', so split it on
    whitespace and convert the second piece to an int"""
    date_pieces = book["date"].split()
    return int(date_pieces[1])
5665

57-
5866
def plot_years(plt, books, max_year=2014):
    """draw a bar chart of the number of books published per year

    plt:      the plotting module/object to draw with (bar, xlabel,
              ylabel, title and show are called on it)
    books:    book dicts whose "date" can be parsed by get_year
    max_year: books from later years are left out; the default, 2014,
              is the last complete year of data (when I ran this)"""
    # parse each book's year once, then count the years we keep
    year_counts = Counter(year
                          for year in (get_year(book) for book in books)
                          if year <= max_year)

    years = sorted(year_counts)
    # iterate over `years` (the original referenced an undefined `x`)
    book_counts = [year_counts[year] for year in years]
    plt.bar([year - 0.5 for year in years], book_counts)
    plt.xlabel("year")
    plt.ylabel("# of data books")
    plt.title("Data is Big!")
    plt.show()
6578

6679
##

0 commit comments

Comments
 (0)