obinsc
diff --git a/‎code/clustering.py
Lines changed: 79 additions & 85 deletions b/‎code/clustering.py
Lines changed: 79 additions & 85 deletions
diff --git a/‎code/databases.py
Lines changed: 7 additions & 1 deletion b/‎code/databases.py
Lines changed: 7 additions & 1 deletion
diff --git a/‎code/decision_trees.py
Lines changed: 12 additions & 6 deletions b/‎code/decision_trees.py
Lines changed: 12 additions & 6 deletions
diff --git a/‎code/getting_data.py
Lines changed: 22 additions & 9 deletions b/‎code/getting_data.py
Lines changed: 22 additions & 9 deletions
@@ -63,11 +63,11 @@ def plot_squared_clustering_errors(plt):
 # using clustering to recolor an image
 #
 
-def recolor_image(input_file, k):
+def recolor_image(input_file, k=5):
 
     img = mpimg.imread(path_to_png_file)
     pixels = [pixel for row in img for pixel in row]
-    clusterer = KMeans(5)
+    clusterer = KMeans(k)
     clusterer.train(pixels) # this might take a while    
 
     def recolor(pixel):
@@ -85,91 +85,80 @@ def recolor(pixel):
 # hierarchical clustering
 #
 
+def is_leaf(cluster):
+    """a cluster is a leaf if it has length 1; returns True/False"""
+    return len(cluster) == 1
+
+def get_children(cluster):
+    """hand back the two children stored in a merged cluster;
+    a leaf cluster has no children, so asking for them is a TypeError"""
+    if not is_leaf(cluster):
+        return cluster[1]
+
+    raise TypeError("a leaf cluster has no children")
+
+def get_values(cluster):
+    """a leaf cluster is returned as-is (it is the 1-tuple of its value);
+    a merged cluster gives the list of values in all the leaves below it"""
+    if is_leaf(cluster):
+        return cluster
+    values = []
+    for child in get_children(cluster):
+        values.extend(get_values(child))
+    return values
+
 def cluster_distance(cluster1, cluster2, distance_agg=min):
     """finds the aggregate distance between elements of cluster1
     and elements of cluster2"""
-    return distance_agg(distance(input_i, input_j)
-                        for input_i in cluster1.members()
-                        for input_j in cluster2.members())
-
-class LeafCluster:
-    """stores a single input
-    it has 'infinite depth' so that we never try to split it"""
-
-    def __init__(self, value):
-        self.value = value
-        self.depth = float('inf')
-        
-    def __repr__(self):
-        return str(self.value)
-        
-    def members(self):
-        """a LeafCluster has only one member"""
-        return [self.value]
-                
-class MergedCluster:
-    """a new cluster that's the result of 'merging' two clusters"""
-
-    def __init__(self, branches, depth):
-        self.branches = branches
-        self.depth = depth
-
-    def __repr__(self):
-        """show as {(depth) child1, child2}"""
-        return ("{(" + str(self.depth) + ") " +
-                ", ".join(str(b) for b in self.branches) + " }")
-        
-    def members(self):
-        """recursively get members by looking for members of branches"""
-        return [member
-                for cluster in self.branches
-                for member in cluster.members()]
-
-
-class BottomUpClusterer:
-
-    def __init__(self, distance_agg=min):
-        self.agg = distance_agg
-        self.clusters = None
-        
-    def train(self, inputs):
-        # start with each input its own cluster
-        self.clusters = [LeafCluster(input) for input in inputs]
-
-        while len(self.clusters) > 1:
-                    
-            # find the two closest clusters
-            c1, c2 = min([(cluster1, cluster2)
-                          for cluster1 in self.clusters
-                          for cluster2 in self.clusters
-                          if cluster1 != cluster2],
-                         key=lambda (c1, c2): cluster_distance(c1, c2, 
-                                                               self.agg))
-
-            merged_cluster = MergedCluster([c1, c2], len(self.clusters))
-                                            
-            self.clusters = [c for c in self.clusters
-                             if c not in [c1, c2]]
-                              
-            self.clusters.append(merged_cluster)
-            
-    def get_clusters(self, num_clusters):
-        """extract num_clusters clusters from the hierachy"""
-        
-        clusters = self.clusters[:] # create a copy so we can modify it
-        while len(clusters) < num_clusters:
-            # choose the least deep cluster
-            next_cluster = min(clusters, key=lambda c: c.depth)
-            # remove it from the list
-            clusters = [c for c in clusters if c != next_cluster]
-            # and add its children
-            clusters.extend(next_cluster.branches)
+    return distance_agg([distance(input1, input2)
+                        for input1 in get_values(cluster1)
+                        for input2 in get_values(cluster2)])
+
+def get_merge_order(cluster):
+    """merge_order of a merged cluster (its first element); inf for a leaf"""
+    if not is_leaf(cluster):
+        return cluster[0]
+    return float('inf')
+
+def bottom_up_cluster(inputs, distance_agg=min):
+    """merge the closest pair of clusters until one remains; returns it"""
+    # start with every input a leaf cluster / 1-tuple
+    clusters = [(input,) for input in inputs]
+    # as long as we have more than one cluster left...
+    while len(clusters) > 1:
+        # find the two closest clusters
+        c1, c2 = min([(cluster1, cluster2)
+                     for i, cluster1 in enumerate(clusters)
+                     for cluster2 in clusters[:i]],
+                     key=lambda (x, y): cluster_distance(x, y, distance_agg))
 
-        return clusters
+        # remove those two objects (by identity, so equal duplicates are kept)
+        clusters = [c for c in clusters if c is not c1 and c is not c2]
 
+        # merge them, using merge_order = # of clusters left
+        merged_cluster = (len(clusters), [c1, c2])
 
+        # and add their merge
+        clusters.append(merged_cluster)
 
+    # when there's only one cluster left, return it
+    return clusters[0]
 
+def generate_clusters(base_cluster, num_clusters):
+    """unmerge base_cluster, most recent merge first, until the list
+    holds num_clusters clusters; returns that list"""
+    result = [base_cluster]
+
+    while len(result) < num_clusters:
+        # the smallest merge_order marks the last-merged cluster
+        latest = min(result, key=get_merge_order)
+        # keep everything that isn't it ...
+        remaining = [c for c in result if c != latest]
+        # ... and put its children in its place (i.e., unmerge it)
+        result = remaining + list(get_children(latest))
+
+    # once we have enough clusters...
+    return result
 
 if __name__ == "__main__":
 
@@ -198,11 +187,16 @@ def get_clusters(self, num_clusters):
 
     print "bottom up hierarchical clustering"
 
-    buc = BottomUpClusterer() # or BottomUpClusterer(max) if you like
-    buc.train(inputs)
-    print buc.clusters[0]
+    base_cluster = bottom_up_cluster(inputs)
+    print base_cluster
+
+    print
+    print "three clusters, min:"
+    for cluster in generate_clusters(base_cluster, 3):
+        print get_values(cluster)
 
     print
-    print "three clusters:"
-    for cluster in buc.get_clusters(3):
-        print cluster
+    print "three clusters, max:"
+    base_cluster = bottom_up_cluster(inputs, max)
+    for cluster in generate_clusters(base_cluster, 3):
+        print get_values(cluster)
@@ -17,6 +17,12 @@ def insert(self, row_values):
         row_dict = dict(zip(self.columns, row_values))
         self.rows.append(row_dict)
 
+    def update(self, updates, predicate):
+        """apply the column -> value pairs in updates to each matching row"""
+        for row in self.rows:
+            if predicate(row):
+                row.update(updates)
+
     def delete(self, predicate=lambda row: True):
         """delete all rows matching predicate
         or all rows if no predicate supplied"""
@@ -240,7 +246,7 @@ def count_interests(rows):
     # SUBQUERIES
 
     likes_sql_user_ids = user_interests \
-        .where(lambda row: row["interest"] == "SQL")
+        .where(lambda row: row["interest"] == "SQL") \
         .select(keep_columns=['user_id'])
 
     likes_sql_user_ids.group_by(group_by_columns=[],
 
@@ -13,8 +13,9 @@ def class_probabilities(labels):
             for count in Counter(labels).values()]
 
 def data_entropy(labeled_data):        
-    return entropy(class_probabilities([label
-                                        for _, label in labeled_data]))
+    """entropy of the labels found in the (_, label) pairs of labeled_data"""
+    labels = [label for _, label in labeled_data]
+    return entropy(class_probabilities(labels))
 
 def partition_entropy(subsets):
     """find the entropy from this partition of data into subsets"""
@@ -51,11 +52,14 @@ def classify(tree, input):
 
     # otherwise find the correct subtree
     attribute, subtree_dict = tree
-    value = input[attribute]
-    subtree = subtree_dict[value]
 
-    # and use it to classify the input
-    return classify(subtree, input)
+    subtree_key = input.get(attribute)  # None if input is missing attribute
+
+    if subtree_key not in subtree_dict: # if no subtree for key,
+        subtree_key = None              # we'll use the None subtree
+    
+    subtree = subtree_dict[subtree_key] # choose the appropriate subtree
+    return classify(subtree, input)     # and use it to classify the input
 
 def build_tree_id3(inputs, split_candidates=None):
 
@@ -90,6 +94,8 @@ def build_tree_id3(inputs, split_candidates=None):
     subtrees = { attribute : build_tree_id3(subset, new_candidates)
                  for attribute, subset in partitions.iteritems() }
 
+    subtrees[None] = num_trues > num_falses # default case
+
     return (best_attribute, subtrees)
 
 def forest_classify(trees, input):
 
@@ -11,16 +11,23 @@
 #
 ######
 
+def is_video(td):
+    """a video is a td with exactly one pricelabel span whose text,
+    once stripped, starts with 'Video'"""
+    labels = td('span', 'pricelabel')
+    only_one = len(labels) == 1
+    return only_one and labels[0].text.strip().startswith("Video")
+
 def book_info(td):
     """given a BeautifulSoup <td> Tag representing a book,
     extract the book's details and return a dict"""
 
-    title = td.find("div", "thumbheader").a.string
-    by_author = td.find('div', 'AuthorName').string
+    title = td.find("div", "thumbheader").a.text
+    by_author = td.find('div', 'AuthorName').text
     authors = [x.strip() for x in re.sub("^By ", "", by_author).split(",")]
     isbn_link = td.find("div", "thumbheader").a.get("href")
     isbn = re.match("/product/(.*)\.do", isbn_link).groups()[0]
-    date = td.find("span", "directorydate").string.strip()
+    date = td.find("span", "directorydate").text.strip()
 
     return {
         "title" : title,
@@ -31,7 +38,7 @@ def book_info(td):
 
 from time import sleep
 
-def scrape(num_pages=28):
+def scrape(num_pages=31):
     base_url = "http://shop.oreilly.com/category/browse-subjects/" + \
            "data.do?sortby=publicationDate&page="
 
@@ -40,27 +47,33 @@ def scrape(num_pages=28):
     for page_num in range(1, num_pages + 1):
         print "souping page", page_num
         url = base_url + str(page_num)
-        soup = BeautifulSoup(requests.get(url).text)
+        soup = BeautifulSoup(requests.get(url).text, 'html5lib')
 
-        for td in soup('td'):
-            if td.find('div', 'AuthorName') and not td.td:
+        for td in soup('td', 'thumbtext'):
+            if not is_video(td):
                 books.append(book_info(td))
 
         # now be a good citizen and respect the robots.txt!
         sleep(30)
 
-def year(book):
+    return books
+
+def get_year(book):
     """book["date"] looks like 'November 2014' so we need to 
     split on the space and then take the second piece"""
     return int(book["date"].split()[1])
 
-
 def plot_years(plt, books):
+    """bar-chart the number of books per year (years through 2014 only)"""
+    # 2014 is the last complete year of data (when I ran this)
+    year_counts = Counter(get_year(book) for book in books
+                          if get_year(book) <= 2014)
     years = sorted(year_counts)
-    book_counts = [year_counts[year] for year in x]
+    book_counts = [year_counts[year] for year in years]
     plt.bar([x - 0.5 for x in years], book_counts)
     plt.xlabel("year")
     plt.ylabel("# of data books")
+    plt.title("Data is Big!")
     plt.show()
 
 ##