Skip to content
This repository was archived by the owner on Aug 17, 2018. It is now read-only.

Commit c4cc2e6

Browse files
Removed global variables from script.
Also, added error handling and changed output file format.
1 parent 307d534 commit c4cc2e6

File tree

1 file changed

+47
-28
lines changed

1 file changed

+47
-28
lines changed

data/generate_mfcc.m

Lines changed: 47 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,41 +2,60 @@
22
%containing MFCCs for the three languages English, German and Italian.
33
%this file assumes VoiceBox is in your Octave/Matlab's path.
44

5-
en_endpoint = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/MFCC/8kHz_16bit/MFCC_0_D/'
6-
de_endpoint = 'http://www.repository.voxforge1.org/downloads/de/Trunk/Audio/MFCC/8kHz_16bit/MFCC_0_D/'
7-
it_endpoint = 'http://www.repository.voxforge1.org/downloads/it/Trunk/Audio/MFCC/8kHz_16bit/MFCC_0_D/'
5+
en_endpoint = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/MFCC/8kHz_16bit/MFCC_0_D/';
6+
de_endpoint = 'http://www.repository.voxforge1.org/downloads/de/Trunk/Audio/MFCC/8kHz_16bit/MFCC_0_D/';
7+
it_endpoint = 'http://www.repository.voxforge1.org/downloads/it/Trunk/Audio/MFCC/8kHz_16bit/MFCC_0_D/';
8+
endpoint = en_endpoint;
9+
10+
flist = urlread(endpoint);
11+
12+
[s,e] = regexp(flist, ">([a-zA-Z0-9]*-[a-zA-Z0-9]*)+\.tgz<");
13+
%truncate the amount of data to be crawled
14+
s = s(1:1500);
15+
e = e(1:1500);
816

9-
list = urlread(en_endpoint);
10-
[s,e] = regexp(list, ">([a-zA-Z0-9]*-[a-zA-Z0-9]*)+\.tgz<");
1117
confirm_recursive_rmdir(0)
1218
filename = 'en_de_it.mat';
13-
%TODO: remove crapy hardcoded stuff
14-
data = zeros(1,26);
1519

16-
for i=1:size(s,2)
20+
function data = fetch_data(flist, endpoint, anfang, ende, id)
21+
%print(int2str(id));
1722
%at each step fetch a file from the corpus
18-
currfile = list(s(i) + 1: e(i) - 1);
19-
mkdir temp;
20-
urlwrite(strcat(en_endpoint, currfile), strcat("./temp/", currfile));
23+
currfile = flist(anfang + 1: ende - 1);
24+
currdir = strcat("temp", int2str(id));
25+
26+
mkdir(currdir);cd(currdir);
27+
data = zeros(26, 1);
28+
status = urlwrite(strcat(endpoint, currfile), currfile);
2129

2230
read_size = 0;
23-
%Unzip the mfc files to temp dir and add them to the dataset.
24-
%TODO: only working in Linux.
25-
26-
cd temp; untar(currfile); cd(currfile(1:end-4)); cd mfc;
27-
mfcs = ls("*.mfc");
28-
for j=1:size(mfcs,1)
29-
[d,fp,dt,tc,t]=readhtk(strtrim(mfcs(j, :)));
30-
%check if this file contains mfccs.
31-
if dt!=6
32-
continue
33-
else
34-
read_size = read_size + size(d, 1);
35-
data = [data; d];
31+
%Unzip the mfc files to temp dir and add them to the dataset.
32+
%TODO: this may only work on Linux.
33+
untar(currfile); cd(currfile(1:end-4)); cd mfc;
34+
mfcs = ls("*.mfc");
35+
for j=1:size(mfcs,1)
36+
[d,fp,dt,tc,t]=readhtk(strtrim(mfcs(j, :)));
37+
%check if this file contains mfccs.
38+
if dt!=6
39+
continue
40+
else
41+
%read_size = read_size + size(d, 1);
42+
data = [data, d'];
3643
end
3744
end
38-
cd ../../..
39-
rmdir("./temp/", "s");
45+
cd ../../..
46+
rmdir(currdir, "s");
47+
end
48+
%Error handler: returns the placeholder output used when fetch_data fails.
49+
function retcode = eh(error)
50+
a = error
51+
retcode = zeros(26,1).+255;
4052
end
41-
read_size
42-
save filename data
53+
54+
55+
mfccs = pararrayfun(numWorkers = 30,
56+
@(anfang, ende, id)fetch_data(flist, endpoint, anfang, ende, id),
57+
s, e, 1:size(s,2),
58+
"ErrorHandler" , @eh);
59+
60+
read_size = size(mfccs)
61+
save("-mat4-binary", filename, "mfccs");  % save() expects variable NAMES as strings, not the values themselves

0 commit comments

Comments
 (0)