
commit bddc46f
parent b22bdcf

    ewfwew

27 files changed, 2812 insertions(+), 0 deletions(-)

__init__.py

Whitespace-only changes.

charts.py

Whitespace-only changes.

colon_delimited_stock_prices.txt

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
date:symbol:closing_price
6/20/2014:AAPL:90.91
6/20/2014:MSFT:41.68
6/20/2014:FB:64.5
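As context (not part of the commit), a minimal sketch of reading this colon-delimited file with the standard library's csv module, assuming Python 2 to match the rest of the repo; the field names come from the header line:

import csv

with open("colon_delimited_stock_prices.txt", "rb") as f:
    reader = csv.DictReader(f, delimiter=':')
    for row in reader:
        date = row["date"]
        symbol = row["symbol"]
        closing_price = float(row["closing_price"])
        print date, symbol, closing_price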

comma_delimited_stock_prices.csv

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
6/20/2014,AAPL,90.91
6/20/2014,MSFT,41.68
6/20/3014,FB,64.5
6/19/2014,AAPL,91.86
6/19/2014,MSFT,n/a
6/19/2014,FB,64.34
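Note the two suspicious rows: an impossible date (6/20/3014) and an n/a price, presumably intentional bad data. As context (not part of the commit), a hedged sketch of parsing that flags rows failing basic sanity checks; parse_price and the year-range check are illustrative choices, again assuming Python 2:

import csv

def parse_price(s):
    """return the price as a float, or None if it doesn't parse"""
    try:
        return float(s)
    except ValueError:
        return None

with open("comma_delimited_stock_prices.csv", "rb") as f:
    for date, symbol, price_str in csv.reader(f):
        price = parse_price(price_str)
        year = int(date.split("/")[-1])
        if price is None or not 1900 <= year <= 2100:
            print "skipping bad row:", date, symbol, price_str
        else:
            print date, symbol, price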

comma_delimited_stock_prices.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
AAPL,90.91
FB,64.5
MSFT,41.68
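Since this version has no header row, positional unpacking is the natural way to read it; another minimal sketch under the same Python 2 assumption:

import csv

with open("comma_delimited_stock_prices.txt", "rb") as f:
    for symbol, price in csv.reader(f):
        print symbol, float(price)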

decision_trees.py

Lines changed: 149 additions & 0 deletions
@@ -0,0 +1,149 @@
from __future__ import division
from collections import Counter, defaultdict
from functools import partial
import math, random

def entropy(class_probabilities):
    """given a list of class probabilities, compute the entropy"""
    return sum(-p * math.log(p, 2) for p in class_probabilities if p)

def class_probabilities(labels):
    total_count = len(labels)
    return [count / total_count
            for count in Counter(labels).values()]

def data_entropy(labeled_data):
    return entropy(class_probabilities([label
                                        for _, label in labeled_data]))

def partition_entropy(subsets):
    """find the entropy from this partition of data into subsets"""
    total_count = sum(len(subset) for subset in subsets)

    return sum(data_entropy(subset) * len(subset) / total_count
               for subset in subsets)

def group_by(items, key_fn):
    """returns a defaultdict(list), where each input item
    is in the list whose key is key_fn(item)"""
    groups = defaultdict(list)
    for item in items:
        key = key_fn(item)
        groups[key].append(item)
    return groups

def partition_by(inputs, attribute):
    """returns a dict of inputs partitioned by the attribute;
    each input is a pair (attribute_dict, label)"""
    return group_by(inputs, lambda x: x[0][attribute])

def partition_entropy_by(inputs, attribute):
    """computes the entropy corresponding to the given partition"""
    partitions = partition_by(inputs, attribute)
    return partition_entropy(partitions.values())

def classify(tree, input):
    """classify the input using the given decision tree"""

    # if this is a leaf node, return its value
    if tree in [True, False]:
        return tree

    # otherwise the tree is an attribute to split on
    # and a dict whose keys are values of that attribute
    attribute, subtree_dict = tree

    subtree_key = input.get(attribute)   # None if input is missing attribute

    if subtree_key not in subtree_dict:  # if no subtree for key,
        subtree_key = None               # use the None (default) subtree

    subtree = subtree_dict[subtree_key]  # choose the appropriate subtree

    # and use it to classify the input
    return classify(subtree, input)

def build_tree_id3(inputs, split_candidates=None):

    # if this is our first pass,
    # all keys of the first input are split candidates
    if split_candidates is None:
        split_candidates = inputs[0][0].keys()

    # count Trues and Falses in the inputs
    num_inputs = len(inputs)
    num_trues = len([label for item, label in inputs if label])
    num_falses = num_inputs - num_trues

    if num_trues == 0:                  # if only Falses are left
        return False                    # return a "False" leaf

    if num_falses == 0:                 # if only Trues are left
        return True                     # return a "True" leaf

    if not split_candidates:            # if no split candidates left
        return num_trues >= num_falses  # return the majority leaf

    # otherwise, split on the best attribute
    best_attribute = min(split_candidates,
                         key=partial(partition_entropy_by, inputs))

    partitions = partition_by(inputs, best_attribute)
    new_candidates = [a for a in split_candidates
                      if a != best_attribute]

    # recursively build the subtrees
    subtrees = { attribute_value : build_tree_id3(subset, new_candidates)
                 for attribute_value, subset in partitions.iteritems() }

    subtrees[None] = num_trues > num_falses  # default case for unseen values

    return (best_attribute, subtrees)

def forest_classify(trees, input):
    votes = [classify(tree, input) for tree in trees]
    vote_counts = Counter(votes)
    return vote_counts.most_common(1)[0][0]


if __name__ == "__main__":

    inputs = [
        ({'level':'Senior','lang':'Java','tweets':'no','phd':'no'},    False),
        ({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'},   False),
        ({'level':'Mid','lang':'Python','tweets':'no','phd':'no'},      True),
        ({'level':'Junior','lang':'Python','tweets':'no','phd':'no'},   True),
        ({'level':'Junior','lang':'R','tweets':'yes','phd':'no'},       True),
        ({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'},     False),
        ({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'},         True),
        ({'level':'Senior','lang':'Python','tweets':'no','phd':'no'},  False),
        ({'level':'Senior','lang':'R','tweets':'yes','phd':'no'},       True),
        ({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'},  True),
        ({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'}, True),
        ({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'},     True),
        ({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'},       True),
        ({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'}, False)
    ]

    for key in ['level','lang','tweets','phd']:
        print key, partition_entropy_by(inputs, key)
    print

    senior_inputs = [(input, label)
                     for input, label in inputs
                     if input["level"] == "Senior"]

    for key in ['lang', 'tweets', 'phd']:
        print key, partition_entropy_by(senior_inputs, key)
    print

    print "building the tree"
    tree = build_tree_id3(inputs)
    print tree

    print "Junior / Java / tweets / no phd", classify(tree,
        { "level" : "Junior",
          "lang" : "Java",
          "tweets" : "yes",
          "phd" : "no"} )

    print "Junior / Java / tweets / phd", classify(tree,
        { "level" : "Junior",
          "lang" : "Java",
          "tweets" : "yes",
          "phd" : "yes"} )

    print "Intern", classify(tree, { "level" : "Intern" } )
    print "Senior", classify(tree, { "level" : "Senior" } )
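For reference (an illustration, not output captured from this commit): build_tree_id3 represents an interior node as an (attribute, subtree_dict) pair and a leaf as a bare True/False, with the None key holding the majority default for unseen attribute values. On the hiring data above, the fitted tree should look roughly like:

('level',
 {'Junior': ('phd', {'no': True, 'yes': False, None: True}),
  'Mid': True,
  'Senior': ('tweets', {'no': False, 'yes': True, None: False}),
  None: True})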

egrep.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
# egrep.py
import sys, re

if __name__ == "__main__":

    # sys.argv is the list of command-line arguments
    # sys.argv[0] is the name of the program itself
    # sys.argv[1] will be the regex specified at the command line
    regex = sys.argv[1]

    # for every line passed into the script
    for line in sys.stdin:
        # if it matches the regex, write it to stdout
        if re.search(regex, line):
            sys.stdout.write(line)
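A usage sketch (the input filename is hypothetical): pipe text through the script and pass the regex as the first command-line argument, e.g.

    cat some_file.txt | python egrep.py "[0-9]"

prints only the lines of some_file.txt that contain a digit.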

ensemble_methods.py

Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
from __future__ import division   # so that int / int gives a float
from collections import Counter
from functools import partial
import math, random
import matplotlib.pyplot as plt

num_variables = 100
num_points = 10000

data = [[random.random() for _ in range(num_variables)]
        for _ in range(num_points)]

def output(row):
    average = sum(row) / num_variables
    return 1 if average > 0.5 else 0

outcomes = map(output, data)

def predictor_using(i):
    """returns a weak learner that votes 1 whenever variable i > 0.5"""
    def prediction(row):
        return 1 if row[i] > 0.5 else 0
    return prediction

weak_learners = map(predictor_using, range(num_variables))

def majority_vote(votes):
    c = Counter(votes)
    return c.most_common(1)[0][0]

def majority_predictor(row, predictors=weak_learners):
    return majority_vote(predictor(row)
                         for predictor in predictors)

def majority_subpredictor(row, n):
    subpredictors = random.sample(weak_learners, n)
    return majority_vote(predictor(row)
                         for predictor in subpredictors)


def classify(predictor):
    """count true/false positives/negatives for predictor over the data"""
    results = Counter()
    for x, y in zip(data, outcomes):
        prediction = predictor(x)
        if y and prediction:
            results["tp"] += 1
        elif y:
            results["fn"] += 1
        elif prediction:
            results["fp"] += 1
        else:
            results["tn"] += 1
    return results

def precision_and_recall(counts):
    precision = counts["tp"] / (counts["tp"] + counts["fp"])
    recall = counts["tp"] / (counts["tp"] + counts["fn"])
    return precision, recall

for i in range(num_variables):
    c = classify(predictor_using(i))
    precision, recall = precision_and_recall(c)
    print i, precision, recall

ensemble = classify(majority_predictor)
precision, recall = precision_and_recall(ensemble)
print "ensemble", precision, recall

for n in range(5, 100):
    predictor = partial(majority_predictor, predictors=weak_learners[:n])
    ensemble = classify(predictor)
    precision, recall = precision_and_recall(ensemble)
    print n, precision, recall


def f(*args, **kwargs):
    print args
    print kwargs


def B(alpha, beta):
    # note: this is the *reciprocal* of the usual Beta function B(a, b),
    # which is why beta_pdf multiplies by it rather than dividing
    return math.gamma(alpha + beta) / math.gamma(alpha) / math.gamma(beta)

def beta_pdf(x, alpha=1, beta=1):
    return x ** (alpha - 1) * (1 - x) ** (beta - 1) * B(alpha, beta)

xs = [i / 100 for i in range(1, 100)]

alpha = .1
beta = .1
ys = [beta_pdf(x, alpha, beta) for x in xs]
plt.plot(xs, ys)
plt.show()

def choose(n, k):
    return math.factorial(n) // math.factorial(n - k) // math.factorial(k)
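majority_subpredictor is defined above but never exercised in this commit; as context, a hedged sketch of how it might be evaluated, mirroring the full-ensemble loop (the sample sizes are arbitrary):

for n in (5, 10, 25, 50):
    predictor = partial(majority_subpredictor, n=n)
    ensemble = classify(predictor)
    precision, recall = precision_and_recall(ensemble)
    print "subsample", n, precision, recall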
