Skip to content

Commit ad5058b

Browse files
committed
stats
1 parent 84b4799 commit ad5058b

File tree

1 file changed

+191
-0
lines changed

1 file changed

+191
-0
lines changed
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
from probability import normal_cdf, inverse_normal_cdf
2+
import math, random
3+
4+
def normal_approximation_to_binomial(n, p):
5+
"""finds mu and sigma corresponding to a Binomial(n, p)"""
6+
mu = p * n
7+
sigma = math.sqrt(p * (1 - p) * n)
8+
return mu, sigma
9+
10+
#####
11+
#
12+
# probabilities a normal lies in an interval
13+
#
14+
######
15+
16+
# the normal cdf _is_ the probability the variable is below a threshold
17+
normal_probability_below = normal_cdf
18+
19+
# it's above the threshold if it's not below the threshold
20+
def normal_probability_above(lo, mu=0, sigma=1):
21+
return 1 - normal_cdf(lo, mu, sigma)
22+
23+
# it's between if it's less than hi, but not less than lo
24+
def normal_probability_between(lo, hi, mu=0, sigma=1):
25+
return normal_cdf(hi, mu, sigma) - normal_cdf(lo, mu, sigma)
26+
27+
# it's outside if it's not between
28+
def normal_probability_outside(lo, hi, mu=0, sigma=1):
29+
return 1 - normal_probability_between(lo, hi, mu, sigma)
30+
31+
######
32+
#
33+
# normal bounds
34+
#
35+
######
36+
37+
38+
def normal_upper_bound(probability, mu=0, sigma=1):
39+
"""returns the z for which P(Z <= z) = probability"""
40+
return inverse_normal_cdf(probability, mu, sigma)
41+
42+
def normal_lower_bound(probability, mu=0, sigma=1):
43+
"""returns the z for which P(Z >= z) = probability"""
44+
return inverse_normal_cdf(1 - probability, mu, sigma)
45+
46+
def normal_two_sided_bounds(probability, mu=0, sigma=1):
47+
"""returns the symmetric (about the mean) bounds
48+
that contain the specified probability"""
49+
tail_probability = (1 - probability) / 2
50+
51+
# upper bound should have tail_probability above it
52+
upper_bound = normal_lower_bound(tail_probability, mu, sigma)
53+
54+
# lower bound should have tail_probability below it
55+
lower_bound = normal_upper_bound(tail_probability, mu, sigma)
56+
57+
return lower_bound, upper_bound
58+
59+
def two_sided_p_value(x, mu=0, sigma=1):
60+
if x >= mu:
61+
# if x is greater than the mean, the tail is above x
62+
return 2 * normal_probability_above(x, mu, sigma)
63+
else:
64+
# if x is less than the mean, the tail is below x
65+
return 2 * normal_probability_below(x, mu, sigma)
66+
67+
def count_extreme_values():
68+
extreme_value_count = 0
69+
for _ in range(100000):
70+
num_heads = sum(1 if random.random() < 0.5 else 0 # count # of heads
71+
for _ in range(1000)) # in 1000 flips
72+
if num_heads >= 530 or num_heads <= 470: # and count how often
73+
extreme_value_count += 1 # the # is 'extreme'
74+
75+
return extreme_value_count / 100000
76+
77+
upper_p_value = normal_probability_above
78+
lower_p_value = normal_probability_below
79+
80+
##
81+
#
82+
# P-hacking
83+
#
84+
##
85+
86+
def run_experiment():
87+
"""flip a fair coin 1000 times, True = heads, False = tails"""
88+
return [random.random() < 0.5 for _ in range(1000)]
89+
90+
def reject_fairness(experiment):
91+
"""using the 5% significance levels"""
92+
num_heads = len([flip for flip in experiment if flip])
93+
return num_heads < 469 or num_heads > 531
94+
95+
96+
##
97+
#
98+
# running an A/B test
99+
#
100+
##
101+
102+
def estimated_parameters(N, n):
103+
p = n / N
104+
sigma = math.sqrt(p * (1 - p) / N)
105+
return p, sigma
106+
107+
def a_b_test_statistic(N_A, n_A, N_B, n_B):
108+
p_A, sigma_A = estimated_parameters(N_A, n_A)
109+
p_B, sigma_B = estimated_parameters(N_B, n_B)
110+
return (p_B - p_A) / math.sqrt(sigma_A ** 2 + sigma_B ** 2)
111+
112+
##
113+
#
114+
# Bayesian Inference
115+
#
116+
##
117+
118+
def B(alpha, beta):
119+
"""a normalizing constant so that the total probability is 1"""
120+
return math.gamma(alpha) * math.gamma(beta) / math.gamma(alpha + beta)
121+
122+
def beta_pdf(x, alpha, beta):
123+
if x < 0 or x > 1: # no weight outside of [0, 1]
124+
return 0
125+
return x ** (alpha - 1) * (1 - x) ** (beta - 1) / B(alpha, beta)
126+
127+
128+
if __name__ == "__main__":
129+
130+
mu_0, sigma_0 = normal_approximation_to_binomial(1000, 0.5)
131+
print("mu_0", mu_0)
132+
print("sigma_0", sigma_0)
133+
print("normal_two_sided_bounds(0.95, mu_0, sigma_0)", normal_two_sided_bounds(0.95, mu_0, sigma_0))
134+
print()
135+
print("power of a test")
136+
137+
print("95% bounds based on assumption p is 0.5")
138+
139+
lo, hi = normal_two_sided_bounds(0.95, mu_0, sigma_0)
140+
print("lo", lo)
141+
print("hi", hi)
142+
143+
print("actual mu and sigma based on p = 0.55")
144+
mu_1, sigma_1 = normal_approximation_to_binomial(1000, 0.55)
145+
print("mu_1", mu_1)
146+
print("sigma_1", sigma_1)
147+
148+
# a type 2 error means we fail to reject the null hypothesis
149+
# which will happen when X is still in our original interval
150+
type_2_probability = normal_probability_between(lo, hi, mu_1, sigma_1)
151+
power = 1 - type_2_probability # 0.887
152+
153+
print("type 2 probability", type_2_probability)
154+
print("power", power)
155+
print
156+
157+
print("one-sided test")
158+
hi = normal_upper_bound(0.95, mu_0, sigma_0)
159+
print("hi", hi) # is 526 (< 531, since we need more probability in the upper tail)
160+
type_2_probability = normal_probability_below(hi, mu_1, sigma_1)
161+
power = 1 - type_2_probability # = 0.936
162+
print("type 2 probability", type_2_probability)
163+
print("power", power)
164+
print()
165+
166+
print("two_sided_p_value(529.5, mu_0, sigma_0)", two_sided_p_value(529.5, mu_0, sigma_0))
167+
168+
print("two_sided_p_value(531.5, mu_0, sigma_0)", two_sided_p_value(531.5, mu_0, sigma_0))
169+
170+
print("upper_p_value(525, mu_0, sigma_0)", upper_p_value(525, mu_0, sigma_0))
171+
print("upper_p_value(527, mu_0, sigma_0)", upper_p_value(527, mu_0, sigma_0))
172+
print()
173+
174+
print("P-hacking")
175+
176+
random.seed(0)
177+
experiments = [run_experiment() for _ in range(1000)]
178+
num_rejections = len([experiment
179+
for experiment in experiments
180+
if reject_fairness(experiment)])
181+
182+
print(num_rejections, "rejections out of 1000")
183+
print()
184+
185+
print("A/B testing")
186+
z = a_b_test_statistic(1000, 200, 1000, 180)
187+
print("a_b_test_statistic(1000, 200, 1000, 180)", z)
188+
print("p-value", two_sided_p_value(z))
189+
z = a_b_test_statistic(1000, 200, 1000, 150)
190+
print("a_b_test_statistic(1000, 200, 1000, 150)", z)
191+
print("p-value", two_sided_p_value(z))

0 commit comments

Comments
 (0)