Skip to content

Commit 8ce5225

Browse files
committed
Adjusted the rest of the database files.
1 parent 3d2f1dd commit 8ce5225

File tree

6 files changed

+40
-31
lines changed

6 files changed

+40
-31
lines changed

decision_trees/datasets/boston_house_prices_raw.py

+7-6
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
from typing import Tuple
12
import numpy as np
23
from sklearn import datasets
34
from sklearn.utils import shuffle
@@ -10,7 +11,7 @@ def __init__(self, number_of_train_samples: int, number_of_test_samples: int):
1011
self._number_of_train_samples = number_of_train_samples
1112
self._number_of_test_samples = number_of_test_samples
1213

13-
def _load_data(self):
14+
def load_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
1415
boston = datasets.load_boston()
1516
# print(boston.data.shape)
1617
# print(boston.target.shape)
@@ -24,12 +25,15 @@ def _load_data(self):
2425
test_data = boston.data[self._number_of_train_samples:self._number_of_train_samples+self._number_of_test_samples]
2526
test_target = boston.target[self._number_of_train_samples:self._number_of_train_samples+self._number_of_test_samples]
2627

28+
# TODO(MF): insert normalisation routine here
29+
2730
return train_data, train_target, test_data, test_target
2831

2932
@staticmethod
3033
def _normalise(data: np.ndarray):
3134
# in case of MNIST data it is possible to just divide each data by maximum value
3235
# each feature is in range 0-255
36+
# TODO(MF): add normalisation
3337
data = data / 255
3438

3539
return data
@@ -56,13 +60,10 @@ def test_boston_raw():
5660
if __name__ == "__main__":
5761
d = BostonRaw(400, 100)
5862

59-
train_data, train_target, test_data, test_target = d._load_data()
63+
train_data, train_target, test_data, test_target = d.load_data()
6064

6165
print(f"train_data.shape: {train_data.shape}")
6266

63-
# train_data = d._normalise(train_data)
64-
# test_data = d._normalise(test_data)
65-
6667
from decision_trees import dataset_tester
6768

6869
# dataset_tester.perform_experiment(train_data[:60000], train_target[:60000],
@@ -83,4 +84,4 @@ def test_boston_raw():
8384
# dataset_tester.test_dataset(40,
8485
# train_data, train_target, test_data, test_target,
8586
# dataset_tester.ClassifierType.random_forest_regressor,
86-
# )
87+
# )

decision_trees/datasets/digits_raw.py

+5-2
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
from typing import Tuple
12
import matplotlib.pyplot as plt
23
import numpy as np
34
from sklearn import datasets
@@ -56,7 +57,7 @@ def __init__(self, number_of_train_samples: int, number_of_test_samples: int):
5657
self._number_of_train_samples = number_of_train_samples
5758
self._number_of_test_samples = number_of_test_samples
5859

59-
def _load_data(self):
60+
def load_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
6061
digits = datasets.load_digits()
6162
# print(digits.data.shape)
6263
# print(digits.target.shape)
@@ -66,6 +67,8 @@ def _load_data(self):
6667
data = digits.data.reshape((len(digits.data), -1))
6768
# print(len(data))
6869

70+
data = self._normalise(data)
71+
6972
train_data = data[:self._number_of_train_samples]
7073
train_target = digits.target[:self._number_of_train_samples]
7174
test_data = data[
@@ -108,5 +111,5 @@ def test_digits_raw():
108111

109112

110113
if __name__ == "__main__":
111-
#sample_from_scikit()
114+
# sample_from_scikit()
112115
test_digits_raw()

decision_trees/datasets/emg_raw.py

+7-8
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
1+
from typing import Tuple, List
12
import csv
23
import os
3-
from typing import Tuple, List
4-
54
import numpy as np
65

76
from decision_trees.datasets.dataset_base import DatasetBase
@@ -50,7 +49,7 @@ def _load_files(self, files_paths: List[str], is_output: bool) -> np.ndarray:
5049

5150
return data_array
5251

53-
def _load_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
52+
def load_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
5453
input_train_files = []
5554
output_train_files = []
5655
input_test_files = []
@@ -71,6 +70,9 @@ def _load_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
7170
input_test_data = self._load_files(input_test_files, is_output=False)
7271
output_test_data = self._load_files(output_test_files, is_output=True)
7372

73+
input_train_data = self._normalise(input_train_data)
74+
input_test_data = self._normalise(input_test_data)
75+
7476
return input_train_data, output_train_data, input_test_data, output_test_data
7577

7678
def _normalise(self, data: np.ndarray):
@@ -83,16 +85,13 @@ def _normalise(self, data: np.ndarray):
8385
if __name__ == "__main__":
8486
d = EMGRaw("./../../data/EMG/")
8587

86-
train_data, train_target, test_data, test_target = d._load_data()
88+
train_data, train_target, test_data, test_target = d.load_data()
8789

8890
print(f"train_data.shape: {train_data.shape}")
8991
print(f"test_data.shape: {test_data.shape}")
9092
print(f"np.unique(train_target): {np.unique(train_target)}")
9193
print(f"np.unique(test_target): {np.unique(test_target)}")
9294

93-
train_data = d._normalise(train_data)
94-
test_data = d._normalise(test_data)
95-
9695
from decision_trees import dataset_tester
9796

9897
dataset_tester.perform_gridsearch(train_data[:19000], train_target[:19000],
@@ -106,4 +105,4 @@ def _normalise(self, data: np.ndarray):
106105
# train_data[:19000], train_target[:19000],
107106
# test_data[:10000], test_target[:10000],
108107
# dataset_tester.ClassifierType.RANDOM_FOREST,
109-
# )
108+
# )

decision_trees/datasets/fashion_mnist_raw.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import numpy as np
2-
from sklearn.utils import shuffle
32
from typing import Tuple
43

54
from decision_trees.datasets.dataset_base import DatasetBase
@@ -13,13 +12,13 @@ class FashionMnistRaw(DatasetBase):
1312
def __init__(self):
1413
...
1514

16-
def _load_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
15+
def load_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
1716
X_train, y_train = load_mnist('./../../submodules/fashion-mnist/data/fashion', kind='train')
1817
X_test, y_test = load_mnist('./../../submodules/fashion-mnist/data/fashion/', kind='t10k')
1918

20-
train_data = X_train
19+
train_data = self._normalise(X_train)
2120
train_target = y_train
22-
test_data = X_test
21+
test_data = self._normalise(X_test)
2322
test_target = y_test
2423

2524
return train_data, train_target, test_data, test_target
@@ -48,17 +47,14 @@ def test_mnist_raw():
4847
assert True
4948

5049

51-
if __name__ == "__main__":
50+
def main():
5251
d = FashionMnistRaw()
5352

54-
train_data, train_target, test_data, test_target = d._load_data()
53+
train_data, train_target, test_data, test_target = d.load_data()
5554

5655
print(f"train_data.shape: {train_data.shape}")
5756
print(f"np.unique(test_target): {np.unique(test_target)}")
5857

59-
train_data = d._normalise(train_data)
60-
test_data = d._normalise(test_data)
61-
6258
from decision_trees import dataset_tester
6359

6460
dataset_tester.perform_gridsearch(train_data[:60000], train_target[:60000],
@@ -72,3 +68,7 @@ def test_mnist_raw():
7268
# train_data[:60000], train_target[:60000], test_data[:10000], test_target[:10000],
7369
# dataset_tester.ClassifierType.DECISION_TREE,
7470
# )
71+
72+
73+
if __name__ == "__main__":
74+
main()

decision_trees/datasets/inria_hog.py

+3-2
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
1+
from typing import Tuple
12
import pickle
2-
33
import numpy as np
44

55
from decision_trees.datasets.dataset_base import DatasetBase
@@ -13,7 +13,7 @@ def __init__(self, data_filename: str, nr_pos_train: int, nr_pos_test: int, nr_n
1313
self._nr_neg_train = nr_neg_train
1414
self._nr_neg_test = nr_neg_test
1515

16-
def _load_data(self):
16+
def load_data(self)-> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
1717
# prepare the training data
1818
with open("..\\data\\positive_train_" + self._data_filename + ".pickle", "rb") as f:
1919
train_data_positive = pickle.load(f)
@@ -69,5 +69,6 @@ def test_inria_hog():
6969

7070
assert True
7171

72+
7273
if __name__ == "__main__":
7374
test_inria_hog()

decision_trees/datasets/mnist_raw.py

+9-4
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
from typing import Tuple
12
import numpy as np
23
from sklearn import datasets
34
from sklearn.utils import shuffle
@@ -10,7 +11,7 @@ def __init__(self, number_of_train_samples: int, number_of_test_samples: int):
1011
self._number_of_train_samples = number_of_train_samples
1112
self._number_of_test_samples = number_of_test_samples
1213

13-
def _load_data(self):
14+
def load_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
1415
mnist = datasets.fetch_mldata('MNIST original', data_home=".//data//MNIST//")
1516
# print(mnist.data.shape)
1617
# print(mnist.target.shape)
@@ -19,10 +20,14 @@ def _load_data(self):
1920
# it is necessary to shuffle the data as all 0's are at the front and all 9's are at the end
2021
mnist.data, mnist.target = shuffle(mnist.data, mnist.target)
2122

22-
train_data = mnist.data[:self._number_of_train_samples]
23+
train_data = self._normalise(mnist.data[:self._number_of_train_samples])
2324
train_target = mnist.target[:self._number_of_train_samples]
24-
test_data = mnist.data[self._number_of_train_samples:self._number_of_train_samples+self._number_of_test_samples]
25-
test_target = mnist.target[self._number_of_train_samples:self._number_of_train_samples+self._number_of_test_samples]
25+
test_data = self._normalise(
26+
mnist.data[self._number_of_train_samples:self._number_of_train_samples+self._number_of_test_samples]
27+
)
28+
test_target = mnist.target[
29+
self._number_of_train_samples:self._number_of_train_samples+self._number_of_test_samples
30+
]
2631

2732
return train_data, train_target, test_data, test_target
2833

0 commit comments

Comments (0)