|
| 1 | +% This script crawls the MFCC data from VoxForge to generate the en_de_it.mat file |
| 2 | +% containing MFCCs for the three languages English, German and Italian. |
| 3 | +% It assumes that VoiceBox is on your Octave/Matlab path. |
| 4 | + |
| 5 | +% Directory listings of the VoxForge MFCC archives, one endpoint per language. |
| 6 | +en_endpoint = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/MFCC/8kHz_16bit/MFCC_0_D/'; |
| 7 | +de_endpoint = 'http://www.repository.voxforge1.org/downloads/de/Trunk/Audio/MFCC/8kHz_16bit/MFCC_0_D/'; |
| 8 | +it_endpoint = 'http://www.repository.voxforge1.org/downloads/it/Trunk/Audio/MFCC/8kHz_16bit/MFCC_0_D/'; |
| 9 | + |
| 10 | +% Corpus crawled by this run (one language per run) and the maximum number |
| 11 | +% of archives to download from it. |
| 12 | +endpoint = it_endpoint; |
| 13 | +limit = 1500; |
| 14 | + |
| 15 | +% Fetch the listing page of the selected corpus as text. |
| 16 | +flist = urlread(endpoint); |
| 17 | + |
| 18 | +% Find every ">name.tgz<" link text; s and e hold the start and end index of |
| 19 | +% each match, including the enclosing '>' and '<'. The pattern is single-quoted |
| 20 | +% so that "\." stays a literal dot instead of being unescaped by Octave. |
| 21 | +[s,e] = regexp(flist, '>([a-zA-Z0-9]*-[a-zA-Z0-9]*)+\.tgz<'); |
| 22 | + |
| 23 | +%truncate the amount of data to be crawled; min() copes with short listings |
| 24 | +n_files = min(limit, numel(s)); |
| 25 | +s = s(1:n_files); |
| 26 | +e = e(1:n_files); |
| 27 | + |
| 28 | +% Allow rmdir(..., "s") to remove the temporary directories without prompting. |
| 29 | +confirm_recursive_rmdir(0) |
| 30 | +% Output file for the collected MFCCs of the selected language. |
| 31 | +filename = "it.mat"; |
| 32 | + |
| 33 | +function data = fetch_data(flist, endpoint, anfang, ende, id) |
| 34 | +  %Download one archive listed in flist and return its MFCC frames as the columns |
| 35 | +  %of a 26-row matrix (column 1 is the all-zero seed). anfang/ende index the '>' and |
| 36 | +  %'<' around the archive name; id names the private scratch directory temp<id>. |
| 37 | +  currfile = flist(anfang + 1: ende - 1); |
| 38 | +  currdir = strcat("temp", int2str(id)); |
| 39 | +  startdir = pwd(); |
| 40 | +  data = zeros(26, 1); |
| 41 | +  mkdir(currdir); |
| 42 | +  unwind_protect |
| 43 | +    cd(currdir); |
| 44 | +    urlwrite(strcat(endpoint, currfile), currfile); |
| 45 | +    %Unzip the mfc files to temp dir and add them to the dataset. |
| 46 | +    %TODO: only working in Linux?. |
| 47 | +    untar(currfile); cd(currfile(1:end-4)); cd mfc; |
| 48 | +    mfcs = ls("*.mfc"); |
| 49 | +    for j=1:size(mfcs,1) |
| 50 | +      [d,fp,dt,tc,t]=readhtk(strtrim(mfcs(j, :))); |
| 51 | +      %check if this file contains mfccs (data type 6); skip anything else. |
| 52 | +      if dt == 6 |
| 53 | +        data = [data, d']; |
| 54 | +      end |
| 55 | +    end |
| 56 | +  unwind_protect_cleanup |
| 57 | +    %runs on success and on error: restore the directory, delete the scratch dir |
| 58 | +    cd(startdir); rmdir(currdir, "s"); |
| 59 | +  end_unwind_protect |
| 60 | +end |
| 61 | + |
| 62 | +%Error handler registered with pararrayfun: its result is used as the output |
| 63 | +%of a task whose function call failed. |
| 64 | +%error: failure information handed over by pararrayfun; it is echoed to the |
| 65 | +%output (no semicolon). Returns a 26x1 column filled with 255 as a marker. |
| 66 | +function retcode = eh(error) |
| 67 | +  a = error |
| 68 | +  retcode = zeros(26,1) + 255; |
| 69 | +end |
| 70 | + |
| 71 | + |
| 72 | +% Download and parse the selected archives in parallel (parallel package). |
| 73 | +% numWorkers = 30 both sets the variable and passes 30 as the process count. |
| 74 | +% The anonymous function binds flist and endpoint so that each task only |
| 75 | +% receives its own match bounds (anfang, ende) and a task number (id). |
| 76 | +% Tasks that raise an error are replaced by the output of eh. |
| 77 | +mfccs = pararrayfun(numWorkers = 30, |
| 78 | +        @(anfang, ende, id)fetch_data(flist, endpoint, anfang, ende, id), %currying with anonym funct |
| 79 | +        s, e, 1:size(s,2), %parameters for the function |
| 80 | +        "ErrorHandler" , @eh); |
| 81 | + |
| 82 | +% Print the dimensions of the collected matrix (no semicolon on purpose). |
| 83 | +read_size = size(mfccs) |
| 84 | + |
| 85 | +% save() expects the *name* of the variable as a string, not its value; |
| 86 | +% passing the matrix itself does not select the variable to store. |
| 87 | +save("-mat4-binary", filename, "mfccs"); |
0 commit comments