Skip to content

Commit c3ee2a1

Browse files
committed
Commented Naive Bayes
1 parent 463162d commit c3ee2a1

File tree

1 file changed

+105
-7
lines changed

1 file changed

+105
-7
lines changed

NaiveBayes.py

Lines changed: 105 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,47 @@
1+
# Naive Bayes Assignment
2+
# Oscar Kosar-Kosarewicz
3+
# opk18
4+
# 11/20/2020
5+
6+
from sys import argv
17
import re
28
import numpy as np
39

410

11+
def main(train_path, test_path):
    """
    Train a Naive Bayes classifier on one file and report results on both files.

    Prints one summary line for the training set followed by one for the test set.

    :param train_path: path of the training data file
    :param test_path: path of the test data file
    :return: None
    """
    paths = (train_path, test_path)

    # First pass: attribute and line counts are needed to size the arrays.
    scans = [scan_file(path) for path in paths]
    # Both sets must share one width, so use the larger attribute count.
    num_attributes = max(attributes for attributes, _ in scans)

    # Second pass: load labels and attribute matrices.
    (train_labels, train_x), (test_labels, test_x) = [
        read_data(path, num_attributes, lines)
        for path, (_, lines) in zip(paths, scans)
    ]

    class_weights = train(train_x, train_labels)

    # Predict for both sets before printing anything.
    predictions = [predict(x, class_weights) for x in (train_x, test_x)]
    for true_labels, predicted in zip((train_labels, test_labels), predictions):
        print_summary(true_labels, predicted)
37+
38+
539
def scan_file(filepath):
40+
"""
41+
Scan the file to find the number of lines and attributes
42+
:param filepath: input filepath
43+
:return: number of attributes, number of lines
44+
"""
645
max_index = 0
746
num_lines = 0
847
with open(filepath) as f:
@@ -14,6 +53,13 @@ def scan_file(filepath):
1453

1554

1655
def read_data(filepath, num_attributes, num_lines):
56+
"""
57+
Read data from file
58+
:param filepath: input file
59+
:param num_attributes: number of attributes
60+
:param num_lines: number of lines
61+
:return: numpy ndarray with data
62+
"""
1763
labels = np.ndarray(num_lines, dtype=np.int)
1864
attributes = np.zeros((num_lines, num_attributes), dtype=np.int)
1965
with open(filepath) as f:
@@ -24,12 +70,64 @@ def read_data(filepath, num_attributes, num_lines):
2470
return labels, attributes
2571

2672

27-
train_path = 'data/NaiveBayes/breast_cancer.train.txt'
28-
test_path = 'data/NaiveBayes/breast_cancer.test.txt'
73+
def train(data, labels):
    """
    Trains a Naive Bayes classifier using Laplace (add-one) smoothing.

    For every class and attribute the estimate for a value is
    ``(count + 1) / (rows_in_class + distinct_values)``, where
    ``distinct_values`` is the number of different values the attribute takes
    anywhere in the training data. Every such value gets an entry, including
    values never seen together with the class, so the probabilities for one
    attribute sum to 1 and no training value has zero probability.

    :param data: training data, 2 dimensional array of shape rows x attributes
    :param labels: training labels, one per row of data
    :return: A 2 dimensional list containing dictionaries of probabilities for each
             attribute for each class. shape is num_classes x num_attributes.
             Classes are in the sorted order produced by np.unique(labels).
    """
    data = np.asarray(data)
    labels = np.asarray(labels)

    # Domain of each attribute, taken over all classes, so that values absent
    # from one class still receive smoothed (non-zero) probability there.
    attribute_values = [np.unique(data[:, i]) for i in range(data.shape[1])]

    class_weights = []
    # loop over classes/labels
    for label in np.unique(labels):
        class_rows = data[labels == label]
        num_rows = class_rows.shape[0]
        attribute_weights = []
        # loop over attributes
        for i, all_values in enumerate(attribute_values):
            # counts of each value of this attribute within the class
            values, counts = np.unique(class_rows[:, i], return_counts=True)
            seen = dict(zip(values, counts))
            # add-one smoothing: one pseudo-count per possible value
            denominator = num_rows + len(all_values)
            attribute_weights.append(
                {value: (seen.get(value, 0) + 1) / denominator for value in all_values}
            )
        class_weights.append(attribute_weights)
    return class_weights
95+
96+
97+
def predict(data, class_weights):
    """
    Use the class weights of the Naive Bayes classifier to predict labels.

    Each class is scored by the sum of the log-probabilities of the row's
    attribute values; working in log space keeps rows with many attributes
    from underflowing to 0 for every class. A value with no entry for a class
    counts as probability 0, which rules that class out. No class prior is
    applied; only the per-attribute probabilities are used.

    :param data: data for prediction, 2 dimensional array of shape rows x attributes
    :param class_weights: A 2 dimensional list containing dictionaries of probabilities for each
                          attribute for each class. shape is num_classes x num_attributes.
    :return: predicted labels: the index of the best scoring class, with
             index 0 reported as -1 (other indices are returned unchanged)
    """
    data = np.asarray(data)
    log_scores = np.empty((data.shape[0], len(class_weights)))
    # log(0) = -inf is the intended score for an impossible class, so the
    # divide-by-zero warning it triggers is silenced.
    with np.errstate(divide='ignore'):
        for i, row in enumerate(data):
            for r, label_weights in enumerate(class_weights):
                likelihoods = [label_weights[j].get(value, 0) for j, value in enumerate(row)]
                log_scores[i, r] = np.sum(np.log(likelihoods))
    result = np.argmax(log_scores, axis=1)
    # NOTE(review): presumably the labels are -1/+1, so class index 0 is the
    # -1 class and index 1 is already the +1 label - confirm against the data.
    result[result == 0] = -1
    return result
113+
114+
115+
def print_summary(true_y, predicted_y):
    """
    Print the confusion counts of the predictions on a single line.

    The line has the form
    ``<true positives> <false negatives> <false positives> <true negatives>``
    where 1 is the positive label and -1 the negative label.

    :param true_y: True labels from dataset
    :param predicted_y: predicted labels from model
    :return: None
    """
    correct = true_y == predicted_y
    wrong = true_y != predicted_y

    # Split the predictions by outcome, then count each predicted label.
    hits = predicted_y[correct]
    misses = predicted_y[wrong]

    tp = np.count_nonzero(hits == 1)
    tn = np.count_nonzero(hits == -1)
    fp = np.count_nonzero(misses == 1)
    fn = np.count_nonzero(misses == -1)

    print(f'{tp} {fn} {fp} {tn}')
29130

30-
train_attributes, train_lines = scan_file(train_path)
31-
test_attributes, test_lines = scan_file(test_path)
32-
num_attributes = max(train_attributes, test_attributes)
33131

34-
labels_train, attributes_train = read_data(train_path, num_attributes, train_lines)
35-
labels_test, attributes_test = read_data(test_path, num_attributes, test_lines)
132+
if __name__ == '__main__':
    # Expect the training file followed by the test file; fail with a usage
    # message instead of an IndexError traceback when either is missing.
    if len(argv) < 3:
        raise SystemExit(f'usage: {argv[0]} <train_file> <test_file>')
    main(argv[1], argv[2])

0 commit comments

Comments
 (0)