obinsc
diff --git a/‎__init__.py b/‎__init__.py
diff --git a/‎charts.py b/‎charts.py
diff --git a/‎colon_delimited_stock_prices.txt
Lines changed: 4 additions & 0 deletions b/‎colon_delimited_stock_prices.txt
Lines changed: 4 additions & 0 deletions
diff --git a/‎comma_delimited_stock_prices.csv
Lines changed: 6 additions & 0 deletions b/‎comma_delimited_stock_prices.csv
Lines changed: 6 additions & 0 deletions
diff --git a/‎comma_delimited_stock_prices.txt
Lines changed: 3 additions & 0 deletions b/‎comma_delimited_stock_prices.txt
Lines changed: 3 additions & 0 deletions
diff --git a/‎decision_trees.py
Lines changed: 149 additions & 0 deletions b/‎decision_trees.py
Lines changed: 149 additions & 0 deletions
diff --git a/‎egrep.py
Lines changed: 15 additions & 0 deletions b/‎egrep.py
Lines changed: 15 additions & 0 deletions
diff --git a/‎enemble_methods.py
Lines changed: 94 additions & 0 deletions b/‎enemble_methods.py
Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,4 @@
+date:symbol:closing_price
+6/20/2014:AAPL:90.91
+6/20/2014:MSFT:41.68
+6/20/2014:FB:64.5
@@ -0,0 +1,6 @@
+6/20/2014,AAPL,90.91
+6/20/2014,MSFT,41.68
+6/20/3014,FB,64.5
+6/19/2014,AAPL,91.86
+6/19/2014,MSFT,n/a
+6/19/2014,FB,64.34
@@ -0,0 +1,3 @@
+AAPL,90.91
+FB,64.5
+MSFT,41.68
@@ -0,0 +1,149 @@
+from __future__ import division
+from collections import Counter, defaultdict
+from functools import partial
+import math, random
+
+def entropy(class_probabilities):
+    """Return the base-2 entropy of a list of class probabilities.
+
+    Zero probabilities are skipped, so they add nothing to the total
+    and math.log is never called on 0.
+    """
+    terms = [-prob * math.log(prob, 2)
+             for prob in class_probabilities
+             if prob]
+    return sum(terms)
+
+def class_probabilities(labels):
+    """Return the fraction of labels that falls in each distinct class.
+
+    One fraction is produced per distinct label, in the iteration
+    order of Counter(labels).
+    """
+    label_counts = Counter(labels)
+    n = len(labels)
+    fractions = []
+    for occurrences in label_counts.values():
+        fractions.append(occurrences / n)
+    return fractions
+
+def data_entropy(labeled_data):
+    """Return the entropy of the labels in a list of (item, label) pairs."""
+    labels = [pair_label for _, pair_label in labeled_data]
+    return entropy(class_probabilities(labels))
+
+def partition_entropy(subsets):
+    """Return the entropy of a partition of data into subsets.
+
+    Each subset's entropy is weighted by the share of all items that
+    the subset holds. subsets is traversed twice.
+    """
+    total_count = sum(map(len, subsets))
+    weighted = [data_entropy(part) * len(part) / total_count
+                for part in subsets]
+    return sum(weighted)
+
+def group_by(items, key_fn):
+    """Bucket items by key_fn(item).
+
+    Returns a defaultdict(list) mapping each key to the items that
+    produced it, in their original order.
+    """
+    buckets = defaultdict(list)
+    for element in items:
+        buckets[key_fn(element)].append(element)
+    return buckets
+    
+def partition_by(inputs, attribute):
+    """Group the inputs by their value for the given attribute.
+
+    Each input is a pair (attribute_dict, label); the result maps each
+    value of attribute_dict[attribute] to the list of pairs holding it.
+    """
+    def attribute_value(pair):
+        return pair[0][attribute]
+
+    return group_by(inputs, attribute_value)
+
+def partition_entropy_by(inputs, attribute):
+    """Return the entropy of the partition of inputs induced by attribute."""
+    subsets = partition_by(inputs, attribute).values()
+    return partition_entropy(subsets)
+
+def classify(tree, input):
+    """Classify the input using the given decision tree.
+
+    A tree is either a boolean leaf or a pair (attribute, subtree_dict).
+    When input lacks the attribute, or holds a value that has no
+    subtree, the subtree stored under the None key is used;
+    build_tree_id3 stores a majority-vote leaf there. A KeyError is
+    raised only if the tree has no None branch either.
+    """
+
+    # if this is a leaf node, return its value
+    if tree in [True, False]:
+        return tree
+
+    # otherwise find the correct subtree
+    attribute, subtree_dict = tree
+    value = input.get(attribute)      # None if the attribute is missing
+
+    if value not in subtree_dict:     # unseen value: take the fallback
+        value = None
+
+    subtree = subtree_dict[value]
+
+    # and use it to classify the input
+    return classify(subtree, input)
+
+def build_tree_id3(inputs, split_candidates=None):
+    """Build a decision tree from (attribute_dict, label) pairs with ID3.
+
+    inputs           -- non-empty list of (attribute_dict, label) pairs
+    split_candidates -- attributes still available for splitting;
+                        defaults to the keys of the first input
+
+    Returns True or False for a leaf, otherwise a pair
+    (best_attribute, subtrees) where subtrees maps every observed value
+    of best_attribute to a subtree. subtrees also maps None to a
+    majority-vote leaf, which classify uses for missing or unseen
+    values.
+    """
+
+    # if this is our first pass,
+    # all keys of the first input are split candidates
+    if split_candidates is None:
+        split_candidates = list(inputs[0][0].keys())
+
+    # count Trues and Falses in the inputs
+    num_inputs = len(inputs)
+    num_trues = len([label for item, label in inputs if label])
+    num_falses = num_inputs - num_trues
+
+    if num_trues == 0:                  # if only Falses are left
+        return False                    # return a "False" leaf
+
+    if num_falses == 0:                 # if only Trues are left
+        return True                     # return a "True" leaf
+
+    if not split_candidates:            # if no split candidates left
+        return num_trues >= num_falses  # return the majority leaf
+
+    # otherwise, split on the attribute with the lowest partition entropy
+    best_attribute = min(split_candidates,
+        key=partial(partition_entropy_by, inputs))
+
+    partitions = partition_by(inputs, best_attribute)
+    new_candidates = [a for a in split_candidates
+                      if a != best_attribute]
+
+    # recursively build the subtrees
+    # (items() rather than iteritems() so this also runs on Python 3)
+    subtrees = { value : build_tree_id3(subset, new_candidates)
+                 for value, subset in partitions.items() }
+
+    # fallback branch for inputs whose value is missing or was never seen
+    subtrees[None] = num_trues > num_falses
+
+    return (best_attribute, subtrees)
+
+def forest_classify(trees, input):
+    """Classify input with every tree and return the most common vote."""
+    tally = Counter(classify(tree, input) for tree in trees)
+    winner, _ = tally.most_common(1)[0]
+    return winner
+
+
+if __name__ == "__main__":
+
+    # labeled examples: (attribute_dict, label) pairs
+    inputs = [
+        ({'level':'Senior','lang':'Java','tweets':'no','phd':'no'},   False),
+        ({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'},  False),
+        ({'level':'Mid','lang':'Python','tweets':'no','phd':'no'},     True),
+        ({'level':'Junior','lang':'Python','tweets':'no','phd':'no'},  True),
+        ({'level':'Junior','lang':'R','tweets':'yes','phd':'no'},      True),
+        ({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'},    False),
+        ({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'},        True),
+        ({'level':'Senior','lang':'Python','tweets':'no','phd':'no'}, False),
+        ({'level':'Senior','lang':'R','tweets':'yes','phd':'no'},      True),
+        ({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'}, True),
+        ({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'},True),
+        ({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'},    True),
+        ({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'},      True),
+        ({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'},False)
+    ]
+
+    # partition entropy for each attribute over all inputs
+    for attribute in ['level','lang','tweets','phd']:
+        print attribute, partition_entropy_by(inputs, attribute)
+    print
+
+    # the same comparison restricted to the "Senior" inputs
+    senior_inputs = [pair for pair in inputs
+                     if pair[0]["level"] == "Senior"]
+
+    for attribute in ['lang', 'tweets', 'phd']:
+        print attribute, partition_entropy_by(senior_inputs, attribute)
+    print
+
+    print "building the tree"
+    tree = build_tree_id3(inputs)
+    print tree
+
+    # two fully specified inputs
+    junior_java_no_phd = { "level" : "Junior",
+                           "lang" : "Java",
+                           "tweets" : "yes",
+                           "phd" : "no" }
+    print "Junior / Java / tweets / no phd", classify(tree, junior_java_no_phd)
+
+    junior_java_phd = { "level" : "Junior",
+                        "lang" : "Java",
+                        "tweets" : "yes",
+                        "phd" : "yes" }
+    print "Junior / Java / tweets / phd", classify(tree, junior_java_phd)
+
+    # inputs with a level absent from the data ("Intern") and with
+    # every attribute except "level" missing
+    print "Intern", classify(tree, { "level" : "Intern" } )
+    print "Senior", classify(tree, { "level" : "Senior" } )
+
@@ -0,0 +1,15 @@
+# egrep.py
+import sys, re
+
+if __name__ == "__main__":
+
+    # sys.argv is the list of command-line arguments:
+    # sys.argv[0] is the name of the program itself and
+    # sys.argv[1] is the regex specified at the command line
+    regex = sys.argv[1]
+
+    # copy to stdout every stdin line that the regex matches somewhere
+    matching_lines = (line for line in sys.stdin
+                      if re.search(regex, line))
+    sys.stdout.writelines(matching_lines)
@@ -0,0 +1,94 @@
+from __future__ import division
+from collections import Counter
+from functools import partial
+import math
+import random
+
+# NOTE(review): plt is used at the bottom of this file but is never
+# imported; presumably matplotlib.pyplot -- confirm and add the import.
+
+num_variables = 100
+num_points = 10000
+
+# num_points rows, each holding num_variables values from random.random()
+data = []
+for _ in range(num_points):
+    data.append([random.random() for _ in range(num_variables)])
+
+def output(row):
+    """Return 1 if the average of row exceeds 0.5, otherwise 0.
+
+    The average divides by num_variables rather than by len(row).
+    """
+    if sum(row) / num_variables > 0.5:
+        return 1
+    return 0
+
+# the outcome computed for every row of data, in the same order
+outcomes = [output(row) for row in data]
+
+def predictor_using(i):
+    """Return a predictor that looks only at position i of a row.
+
+    The returned function yields 1 when row[i] > 0.5 and 0 otherwise.
+    """
+    def prediction(row):
+        if row[i] > 0.5:
+            return 1
+        return 0
+    return prediction
+
+# one single-variable predictor for each of the num_variables positions
+weak_learners = [predictor_using(position)
+                 for position in range(num_variables)]
+
+def majority_vote(votes):
+    """Return the most common element of votes.
+
+    Raises IndexError when votes is empty.
+    """
+    tally = Counter(votes)
+    winner, _ = tally.most_common(1)[0]
+    return winner
+
+def majority_predictor(row, predictors=weak_learners):
+    """Return the majority vote of the predictors applied to row.
+
+    predictors defaults to the full weak_learners list.
+    """
+    votes = [predict(row) for predict in predictors]
+    return majority_vote(votes)
+
+def majority_subpredictor(row, n):
+    """Return the majority vote on row of n randomly chosen weak learners.
+
+    A fresh random.sample of weak_learners is drawn on every call.
+    """
+    chosen = random.sample(weak_learners, n)
+    votes = [predict(row) for predict in chosen]
+    return majority_vote(votes)
+
+
+def classify(predictor):
+    """Score predictor against outcomes for every row of data.
+
+    Returns a Counter with the number of true positives ("tp"),
+    false negatives ("fn"), false positives ("fp") and true
+    negatives ("tn").
+    """
+    tally = Counter()
+    for row, actual in zip(data, outcomes):
+        predicted = predictor(row)
+        if actual:
+            bucket = "tp" if predicted else "fn"
+        else:
+            bucket = "fp" if predicted else "tn"
+        tally[bucket] += 1
+    return tally
+
+def precision_and_recall(counts):
+    """Return (precision, recall) computed from confusion-matrix counts.
+
+    counts maps "tp", "fp" and "fn" to their totals.
+    precision = tp / (tp + fp) and recall = tp / (tp + fn).
+    Raises ZeroDivisionError when either denominator is 0.
+    """
+    true_positives = counts["tp"]
+    # float() keeps integer counts from truncating the ratios to 0 or 1
+    # under Python 2, whether or not true division is enabled
+    precision = true_positives / float(true_positives + counts["fp"])
+    recall = true_positives / float(true_positives + counts["fn"])
+    return precision, recall
+
+# precision and recall of every single-variable predictor on its own
+for i in range(num_variables):
+    c = classify(predictor_using(i))
+    precision, recall = precision_and_recall(c)
+    print i, precision, recall
+
+# precision and recall of the majority vote over all weak learners
+ensemble = classify(majority_predictor)
+precision, recall = precision_and_recall(ensemble)
+print "ensemble", precision, recall
+
+# precision and recall of the majority vote over the first n weak
+# learners, for n from 5 through 99
+for n in range(5,100):
+    predictor = partial(majority_predictor,predictors=weak_learners[:n])
+    ensemble = classify(predictor)
+    precision, recall = precision_and_recall(ensemble)
+    print n, precision, recall
+
+
+def f(*args, **kwargs):
+    """Print the positional-argument tuple, then the keyword-argument dict."""
+    for captured in (args, kwargs):
+        print captured
+
+
+
+
+def B(alpha, beta):
+    """Return gamma(alpha + beta) / (gamma(alpha) * gamma(beta)).
+
+    beta_pdf multiplies by this value as its normalizing factor.
+    """
+    return math.gamma(alpha + beta) / math.gamma(alpha) / math.gamma(beta)
+
+def beta_pdf(x, alpha=1, beta=1):
+    """Return the Beta(alpha, beta) density at x.
+
+    Returns 0 for x outside [0, 1], where the density has no mass.
+    At the endpoints the power terms can still raise
+    ZeroDivisionError when alpha < 1 (x == 0) or beta < 1 (x == 1).
+    """
+    # outside the support the power terms would raise or turn complex
+    if x < 0 or x > 1:
+        return 0
+    return x ** (alpha - 1) * (1 - x) ** (beta - 1) * B(alpha, beta)
+
+# evaluation points 0.01, 0.02, ..., 0.99; the float divisor keeps the
+# division from truncating every point to 0 under Python 2
+xs = [i / 100.0 for i in range(1,100)]
+
+alpha = .1
+beta = .1
+ys = [beta_pdf(x, alpha, beta) for x in xs]
+# NOTE(review): plt is not imported anywhere in this file; presumably
+# matplotlib.pyplot -- confirm and add the import
+plt.plot(xs, ys)
+plt.show()
+
+def choose(n, k):
+    """Return n! / ((n - k)! * k!) using exact integer arithmetic.
+
+    math.factorial raises ValueError when k > n or an argument is
+    negative.
+    """
+    ordered_picks = math.factorial(n) // math.factorial(n - k)
+    return ordered_picks // math.factorial(k)
+
+
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+AAPL,90.91`
	`2`	`+FB,64.5`
	`3`	`+MSFT,41.68`