1
+ # Naive Bayes Assignment
2
+ # Oscar Kosar-Kosarewicz
3
+ # opk18
4
+ # 11/20/2020
5
+
6
+ from sys import argv
1
7
import re
2
8
import numpy as np
3
9
4
10
11
def main(train_path, test_path):
    """
    Train a Naive Bayes classifier on one file and report results for both files.

    A summary line is printed for the training data first, then for the test data.

    :param train_path: path of the training data file
    :param test_path: path of the test data file
    """
    # First pass over each file: number of attributes and number of lines.
    train_attributes, train_lines = scan_file(train_path)
    test_attributes, test_lines = scan_file(test_path)

    # Both matrices must have the same width, so size them by the larger file.
    num_attributes = max(train_attributes, test_attributes)

    # Second pass: load labels and attribute matrices.
    train_labels, train_x = read_data(train_path, num_attributes, train_lines)
    test_labels, test_x = read_data(test_path, num_attributes, test_lines)

    # Fit the model on the training data only.
    class_weights = train(train_x, train_labels)

    # Predict for both datasets before printing anything.
    predictions = [predict(samples, class_weights) for samples in (train_x, test_x)]

    for true_labels, predicted in zip((train_labels, test_labels), predictions):
        print_summary(true_labels, predicted)
37
+
38
+
5
39
def scan_file (filepath ):
40
+ """
41
+ Scan the file to find the number of lines and attributes
42
+ :param filepath: input filepath
43
+ :return: number of attributes, number of lines
44
+ """
6
45
max_index = 0
7
46
num_lines = 0
8
47
with open (filepath ) as f :
@@ -14,6 +53,13 @@ def scan_file(filepath):
14
53
15
54
16
55
def read_data (filepath , num_attributes , num_lines ):
56
+ """
57
+ Read data from file
58
+ :param filepath: input file
59
+ :param num_attributes: number of attributes
60
+ :param num_lines: number of lines
61
+ :return: numpy ndarray with data
62
+ """
17
63
labels = np .ndarray (num_lines , dtype = np .int )
18
64
attributes = np .zeros ((num_lines , num_attributes ), dtype = np .int )
19
65
with open (filepath ) as f :
@@ -24,12 +70,64 @@ def read_data(filepath, num_attributes, num_lines):
24
70
return labels , attributes
25
71
26
72
27
- train_path = 'data/NaiveBayes/breast_cancer.train.txt'
28
- test_path = 'data/NaiveBayes/breast_cancer.test.txt'
73
def train(data, labels):
    """
    Trains a Naive Bayes classifier using Laplace (add-one) smoothing.

    For every class and attribute, each value observed for that attribute
    anywhere in the training data gets the probability
    ``(count_in_class + 1) / (rows_in_class + number_of_observed_values)``.
    Values that never occur together with a class therefore still receive a
    small non-zero probability instead of being absent from the dictionary.

    NOTE: only the per-attribute likelihoods are returned; class priors are
    not part of the returned structure.

    :param data: training data, 2 dimensional numpy array (rows x attributes)
    :param labels: training labels, one per row of data
    :return: A 2 dimensional list containing dictionaries of probabilities for each
        attribute for each class. shape is num_classes x num_attributes.
        Classes are ordered as returned by np.unique(labels).
    """
    # The smoothing domain of an attribute is every value seen in any class.
    domains = [np.unique(data[:, i]) for i in range(data.shape[1])]

    class_weights = []
    # loop over classes/labels
    for label in np.unique(labels):
        class_rows = data[labels == label]
        class_size = class_rows.shape[0]
        attribute_weights = []
        # loop over attributes
        for i, domain in enumerate(domains):
            values, counts = np.unique(class_rows[:, i], return_counts=True)
            seen = dict(zip(values, counts))
            # Add-one smoothing: one pseudo-count per possible value.
            denominator = class_size + len(domain)
            attribute_weights.append(
                {value: (seen.get(value, 0) + 1) / denominator for value in domain}
            )
        class_weights.append(attribute_weights)
    return class_weights
95
+
96
+
97
def predict(data, class_weights):
    """
    Use the class weights of the Naive Bayes classifier to predict labels.

    Scores are accumulated as sums of log-probabilities rather than as a raw
    product, so rows with many attributes do not underflow to 0 for every
    class. A value missing from a class's dictionary has probability 0
    (log-score -inf) for that class.

    :param data: data for prediction, 2 dimensional numpy array (rows x attributes)
    :param class_weights: A 2 dimensional list containing dictionaries of probabilities for each
        attribute for each class. shape is num_classes x num_attributes.
    :return: numpy array with one entry per row: the index of the best scoring
        class, with index 0 reported as -1. Ties resolve to the lowest class index.
    """
    log_scores = np.empty((data.shape[0], len(class_weights)))
    # log(0) is intentionally -inf here; silence the divide warning it triggers.
    with np.errstate(divide='ignore'):
        for i, row in enumerate(data):
            for r, label_weights in enumerate(class_weights):
                likelihoods = [label_weights[j].get(value, 0) for j, value in enumerate(row)]
                log_scores[i, r] = np.sum(np.log(likelihoods))
    result = np.argmax(log_scores, axis=1)
    # Class index 0 is mapped to the label -1; index 1 already equals label 1.
    # NOTE(review): this presumes a two-class problem with labels -1 and 1 -- confirm.
    result[result == 0] = -1
    return result
113
+
114
+
115
def print_summary(true_y, predicted_y):
    """
    Print descriptive summary of Naive Bayes performance

    Prints one line with four space separated counts, in this order:
    true positives, false negatives, false positives, true negatives.
    A prediction of 1 counts as positive and a prediction of -1 as negative.

    :param true_y: True labels from dataset (array-like)
    :param predicted_y: predicted labels from model (array-like)
    :return: None
    """
    # Coerce to arrays so plain lists are compared element-wise as well.
    true_y = np.asarray(true_y)
    predicted_y = np.asarray(predicted_y)

    correct = true_y == predicted_y
    predicted_positive = predicted_y == 1
    predicted_negative = predicted_y == -1

    true_positives = np.count_nonzero(correct & predicted_positive)
    true_negatives = np.count_nonzero(correct & predicted_negative)
    false_positives = np.count_nonzero(~correct & predicted_positive)
    false_negatives = np.count_nonzero(~correct & predicted_negative)
    print(f'{true_positives} {false_negatives} {false_positives} {true_negatives} ')
29
130
30
- train_attributes , train_lines = scan_file (train_path )
31
- test_attributes , test_lines = scan_file (test_path )
32
- num_attributes = max (train_attributes , test_attributes )
33
131
34
- labels_train , attributes_train = read_data ( train_path , num_attributes , train_lines )
35
- labels_test , attributes_test = read_data ( test_path , num_attributes , test_lines )
132
if __name__ == '__main__':
    # Two positional arguments are required: training file, then test file.
    if len(argv) < 3:
        raise SystemExit(f'usage: {argv[0]} <train_file> <test_file>')
    main(argv[1], argv[2])
0 commit comments