Skip to content

Commit 93a863f

Browse files
author
josh
committed
german
1 parent 64cc65a commit 93a863f

File tree

2 files changed

+45
-64
lines changed

2 files changed

+45
-64
lines changed

.compute

Lines changed: 44 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -3,89 +3,72 @@
33
set -xe
44

55
apt-get install -y python3-venv
6-
_LANG="zh-TW"
7-
CV="${SHARED_DIR}/data/mozilla/CommonVoice/v2.0-alpha2.0/${_LANG}"
6+
_LANG="en"
7+
CV="${SHARED_DIR}/data/mozilla/CommonVoice/v2.0"
88

9-
# venv
10-
apt-get update -y
11-
apt-get install -y python3-venv swig
12-
python3 -m venv /tmp/venv
13-
source /tmp/venv/bin/activate
149
# check HTTP_PROXY
1510
if ! (( $( env | grep -iq "^http_proxy=" ) )); then
1611
source /etc/profile
1712
fi
1813

14+
# venv
15+
apt-get update -y
16+
apt-get install -y python3-venv swig
17+
python3 -m venv /tmp/venv
18+
source /tmp/venv/bin/activate
1919
pip install -r <(grep -v tensorflow requirements.txt)
2020
pip install tensorflow-gpu==1.13.0-rc2
2121
pip install wheel
2222

23-
pushd ../src/native_client/ctcdecode
24-
make clean
25-
make NUM_PROCESSES=16
26-
pip install dist/*.whl
27-
popd
28-
29-
3023
###############################
3124
### INSTALL KENLM + DECODER ###
3225
###############################
3326

34-
# pip install "/data/rw/home/ds_ctcdecoder-0.5.0a1-cp36-cp36m-manylinux1_x86_64.whl"
35-
# pip install $(python util/taskcluster.py --decoder)
36-
# python util/taskcluster.py --arch gpu --target ../tmp/native_client
27+
pip install "/data/rw/home/ds_ctcdecoder-0.5.0a1-cp36-cp36m-manylinux1_x86_64.whl"
28+
python util/taskcluster.py --arch gpu --target ../tmp/native_client
3729

3830

3931
# kenlm Dependencies
40-
# apt-get install -y build-essential cmake libboost-all-dev zlib1g-dev libbz2-dev liblzma-dev libeigen3-dev
32+
apt-get install -y build-essential cmake libboost-all-dev zlib1g-dev libbz2-dev liblzma-dev libeigen3-dev
4133

4234

43-
###################################
44-
### CREATE ALPHABET / LM / TRIE ###
45-
###################################
4635

47-
# # alphabet.txt
48-
# python util/check_characters.py \
49-
# -csv "${CV}/cv_${_LANG}_valid_train.csv","${CV}/cv_${_LANG}_valid_train.csv","${CV}/cv_${_LANG}_valid_train.csv" \
50-
# -alpha \
51-
# > /data/rw/home/zh-TW/alphabet.txt
36+
### CREATE ALPHABET / LM / TRIE ###
5237

53-
# python util/check_characters.py \
54-
# -csv "/data/rw/home/ky/clips/train.csv","/data/rw/home/ky/clips/test.csv","/data/rw/home/ky/clips/dev.csv" \
55-
# -alpha \
56-
# > /data/rw/home/ky/alphabet.txt
5738

39+
# # alphabet.txt
40+
mkdir /data/rw/home/${_LANG}
41+
python util/check_characters.py \
42+
-csv "${CV}/${_LANG}/clips/train.csv","${CV}/${_LANG}/clips/dev.csv","${CV}/${_LANG}/clips/test.csv" \
43+
-alpha \
44+
> /data/rw/home/${_LANG}/alphabet.txt
5845

5946
# lm.arpa
6047
#TEXT="${SHARED_DIR}/data/wikipedia/${_LANG}/wiki.txt"
61-
# cut -d',' -f3 /data/rw/home/ky/clips/train.csv > /data/rw/home/ky/text.txt
62-
# TEXT="/data/rw/home/ky/text.txt"
63-
# TEXT="${SHARED_DIR}/data/wikipedia/${_LANG}/wiki.txt"
64-
# TEXT="/data/rw/home/CHAR_GRAMS_ZH_TW"
48+
cut -d',' -f3 "${CV}/${_LANG}/clips/train.csv" > /data/rw/home/${_LANG}/text.txt
49+
TEXT="/data/rw/home/${_LANG}/text.txt"
6550

51+
/data/rw/home/kenlm/build/bin/lmplz \
52+
--skip_symbols \
53+
--order 2 \
54+
--text "${TEXT}" \
55+
--arpa /tmp/lm.arpa
6656

67-
# /data/rw/home/kenlm/build/bin/lmplz \
68-
# --skip_symbols \
69-
# --order 2 \
70-
# --text "${TEXT}" \
71-
# --arpa lm.arpa
57+
# # lm.binary
58+
/data/rw/home/kenlm/build/bin/build_binary \
59+
-a 255 \
60+
-q 8 \
61+
trie \
62+
/tmp/lm.arpa \
63+
/data/rw/home/${_LANG}/lm.binary
7264

73-
# # # lm.binary
74-
# /data/rw/home/kenlm/build/bin/build_binary \
75-
# -a 255 \
76-
# -q 8 \
77-
# trie \
78-
# lm.arpa \
79-
# /data/rw/home/ky/lm.binary
65+
# # trie
66+
/data/rw/home/generate_trie \
67+
/data/rw/home/${_LANG}/alphabet.txt \
68+
/data/rw/home/${_LANG}/lm.binary \
69+
/data/rw/home/${_LANG}/trie_utf8
8070

81-
# # # trie
82-
# # ../tmp/native_client/generate_trie \
83-
# /data/rw/home/generate_trie \
84-
# /data/rw/home/ky/alphabet.txt \
85-
# /data/rw/home/ky/lm.binary \
86-
# /data/rw/home/ky/trie_utf8
87-
88-
# rm lm.arpa
71+
rm /tmp/lm.arpa
8972

9073

9174
########################
@@ -94,14 +77,11 @@ popd
9477

9578
mkdir -p ../keep/summaries
9679

97-
# --train_files "${CV}/cv_${_LANG}_valid_dev.csv" \
98-
# --dev_files "${CV}/cv_${_LANG}_valid_test.csv" \
99-
# --test_files "${CV}/cv_${_LANG}_valid_train.csv" \
100-
80+
10181
python -u DeepSpeech.py \
102-
--train_files "/data/rw/home/ky/clips/train.csv" \
103-
--dev_files "/data/rw/home/ky/clips/dev.csv" \
104-
--test_files "/data/rw/home/ky/clips/test.csv" \
82+
--train_files "${CV}/${_LANG}/clips/train.csv" \
83+
--dev_files "${CV}/${_LANG}/clips/dev.csv" \
84+
--test_files "${CV}/${_LANG}/clips/test.csv" \
10585
--train_batch_size 24 \
10686
--dev_batch_size 48 \
10787
--test_batch_size 48 \
@@ -116,6 +96,6 @@ python -u DeepSpeech.py \
11696
--summary_dir "../keep/summaries" \
11797
--report_count 100 \
11898
--test_output_file "../keep/RESULTS.json" \
119-
--lm-binary-path "/data/rw/home/ky/lm.binary" \
120-
--lm-trie-path "/data/rw/home/ky/trie_utf8" \
121-
--alphabet-config-path "/data/rw/home/ky/alphabet.txt"
99+
--lm-binary-path "/data/rw/home/${_LANG}/lm.binary" \
100+
--lm-trie-path "/data/rw/home/${_LANG}/trie_utf8" \
101+
--alphabet-config-path "/data/rw/home/${_LANG}/alphabet.txt"

RESULTS.de

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)