Skip to content

Commit 20db7be

Browse files
committed
Added new dataset
1 parent 00f20ab commit 20db7be

File tree

1 file changed

+98
-0
lines changed

1 file changed

+98
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
from typing import Tuple
2+
import numpy as np
3+
import os
4+
import cv2
5+
import random
6+
import pandas
7+
8+
from decision_trees.datasets.dataset_base import DatasetBase
9+
10+
11+
class AerialCactusRaw(DatasetBase):
    """Aerial Cactus Identification dataset loaded from the raw Kaggle files.

    Reads image labels from ``<path>/train.csv`` and grayscale images from
    ``<path>/train``, shuffles them with a fixed seed for reproducibility,
    and splits them into train/test subsets of the requested sizes.
    """

    def __init__(
            self, path: str,
            number_of_train_samples: int, number_of_test_samples: int
    ):
        """Load and split the dataset.

        :param path: root directory containing ``train.csv`` and ``train/``.
        :param number_of_train_samples: number of images used for training.
        :param number_of_test_samples: number of images used for testing.
        """
        # Fixed seed so the shuffle (and thus the train/test split) is
        # reproducible across runs.
        random.seed(42)
        self._path = path
        self._number_of_train_samples = number_of_train_samples
        self._number_of_test_samples = number_of_test_samples

        # NOTE(MF): np.genfromtxt does not work here, as it yields the file
        # names as byte strings that can not be easily used as dict keys.
        # Map file name -> label flag (as stored in train.csv).
        # `.as_matrix()` was removed in pandas 1.0; `.to_numpy()` is the
        # supported replacement with identical semantics here.
        labels = dict(pandas.read_csv(os.path.join(self._path, 'train.csv')).to_numpy())
        # print(len(labels))

        # Collect every .jpg file below <path>/train (recursively).
        files = []
        for root, _, names in os.walk(os.path.join(self._path, 'train')):
            for file in names:
                if '.jpg' in file:
                    files.append(os.path.join(root, file))

        print(len(files))
        # Load each image as grayscale, keyed by its base file name so it
        # can be matched against the labels dict.
        data = {}
        for f in files:
            data[os.path.basename(f)] = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
        print(len(data))

        keys = list(data.keys())
        random.shuffle(keys)

        self._train_data = []
        self._train_target = []
        self._test_data = []
        self._test_target = []
        for key in keys[:self._number_of_train_samples]:
            # Images have to be flattened (HxW image -> H*W vector) and
            # normalised before being fed to the classifier.
            d = self._normalise(data[key].flatten())
            self._train_data.append(d)
            self._train_target.append(labels[key])
        # BUGFIX: the original slice ended at
        # number_of_train_samples + number_of_train_samples, so the test set
        # size ignored the requested number_of_test_samples.
        test_end = self._number_of_train_samples + self._number_of_test_samples
        for key in keys[self._number_of_train_samples:test_end]:
            d = self._normalise(data[key].flatten())
            self._test_data.append(d)
            self._test_target.append(labels[key])

    def load_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Return (train_data, train_target, test_data, test_target) arrays."""
        return np.asarray(self._train_data), np.asarray(self._train_target), \
            np.asarray(self._test_data), np.asarray(self._test_target)

    @staticmethod
    def _normalise(data: np.ndarray) -> np.ndarray:
        """Scale 8-bit grayscale pixel values (0-255) into [0, 1)."""
        # Dividing by 256 (not 255) keeps the result strictly below 1.0;
        # this matches the original behaviour.
        data = data / 256

        return data
81+
82+
def main():
    """Build the aerial-cactus dataset and evaluate it as a classifier."""
    dataset = AerialCactusRaw(
        './../../data/datasets/aerial-cactus-identification/', 15000, 250
    )

    # Run the classifier test for tree depths 1 through 8, emitting the
    # VHDL artifacts into the shared output directory.
    for depth in range(1, 9):
        dataset.test_as_classifier(depth, './../../data/vhdl/')


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)