22% containing MFCCs for the three languages english, deutsch and italian.
33% this file assumes VoiceBox is in your Octave/Matlab's path.
44
5- en_endpoint = ' http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/MFCC/8kHz_16bit/MFCC_0_D/'
6- de_endpoint = ' http://www.repository.voxforge1.org/downloads/de/Trunk/Audio/MFCC/8kHz_16bit/MFCC_0_D/'
7- it_endpoint = ' http://www.repository.voxforge1.org/downloads/it/Trunk/Audio/MFCC/8kHz_16bit/MFCC_0_D/'
5+ en_endpoint = ' http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/MFCC/8kHz_16bit/MFCC_0_D/' ;
6+ de_endpoint = ' http://www.repository.voxforge1.org/downloads/de/Trunk/Audio/MFCC/8kHz_16bit/MFCC_0_D/' ;
7+ it_endpoint = ' http://www.repository.voxforge1.org/downloads/it/Trunk/Audio/MFCC/8kHz_16bit/MFCC_0_D/' ;
8+ endpoint = en_endpoint ;
9+
10+ flist = urlread(endpoint );
11+
12+ [s ,e ] = regexp(flist , " >([a-zA-Z0-9]*-[a-zA-Z0-9]*)+\.tgz<" );
13+ % truncate the amount of data to be crawled
14+ s = s(1 : 1500 );
15+ e = e(1 : 1500 );
816
9- list = urlread(en_endpoint );
10- [s ,e ] = regexp(list , " >([a-zA-Z0-9]*-[a-zA-Z0-9]*)+\.tgz<" );
1117confirm_recursive_rmdir(0 )
1218filename = ' en_de_it.mat' ;
13- % TODO: remove crapy hardcoded stuff
14- data = zeros(1 ,26 );
1519
16- for i= 1 : size(s ,2 )
20+ function data = fetch_data(flist , endpoint , anfang , ende , id )
21+ % print(int2str(id));
1722 % at each step fetch a file from the corpus
18- currfile = list(s(i ) + 1 : e(i ) - 1 );
19- mkdir temp ;
20- urlwrite(strcat(en_endpoint , currfile ), strcat(" ./temp/" , currfile ));
23+ currfile = flist(anfang + 1 : ende - 1 );
24+ currdir = strcat(" temp" , int2str(id ));
25+
26+ mkdir(currdir );cd(currdir );
27+ data = zeros(26 , 1 );
28+ status = urlwrite(strcat(endpoint , currfile ), currfile );
2129
2230 read_size = 0 ;
23- % Unzip the mfc files to temp dir and add them to the dataset.
24- % TODO: only working in Linux.
25-
26- cd temp ; untar(currfile ); cd(currfile(1 : end - 4 )); cd mfc ;
27- mfcs = ls(" *.mfc" );
28- for j= 1 : size(mfcs ,1 )
29- [d ,fp ,dt ,tc ,t ]=readhtk(strtrim(mfcs(j , : )));
30- % check if this file contains mfccs.
31- if dt !=6
32- continue
33- else
34- read_size = read_size + size(d , 1 );
35- data = [data ; d ];
31+ % Unzip the mfc files to temp dir and add them to the dataset.
32+ % TODO: only working in Linux?.
33+ untar(currfile ); cd(currfile(1 : end - 4 )); cd mfc ;
34+ mfcs = ls(" *.mfc" );
35+ for j= 1 : size(mfcs ,1 )
36+ [d ,fp ,dt ,tc ,t ]=readhtk(strtrim(mfcs(j , : )));
37+ % check if this file contains mfccs.
38+ if dt !=6
39+ continue
40+ else
41+ % read_size = read_size + size(d, 1);
42+ data = [data , d ' ];
3643 end
3744 end
38- cd ../../..
39- rmdir(" ./temp/" , " s" );
45+ cd ../../..
46+ rmdir(currdir , " s" );
47+ end
48+ % here goes what to put in the output when the function fails.
49+ function retcode = eh(error )
50+ a = error
51+ retcode = zeros(26 ,1 ).+255 ;
4052end
41- read_size
42- save filename data
53+
54+
55+ mfccs = pararrayfun(numWorkers = 30 ,
56+ @(anfang , ende , id )fetch_data(flist , endpoint , anfang , ende , id ),
57+ s , e , 1 : size(s ,2 ),
58+ " ErrorHandler" , @eh );
59+
60+ read_size = size(mfccs )
61+ save(" -mat4-binary" ,filename , mfccs );
0 commit comments