# Trace every command (-x) and abort on the first failing command (-e).
set -xe

# Refresh the package lists before installing: on an image with stale or empty
# lists, `apt-get install` fails and -e would abort the script.
apt-get update -y
# python3-venv provides the `venv` module used to create the virtualenv below.
apt-get install -y python3-venv

# Language code; selects the dataset sub-directory and the per-language
# output directory under /data/rw/home.
_LANG="en"
# Root of the Common Voice v2.0 data; SHARED_DIR must come from the environment.
CV="${SHARED_DIR}/data/mozilla/CommonVoice/v2.0"

# check HTTP_PROXY
# Source /etc/profile only when no http_proxy/HTTP_PROXY variable is exported.
# The test uses grep's exit status directly: `grep -q` prints nothing, so
# wrapping its (empty) output in an arithmetic `(( ... ))` test is always
# false and would make this branch run unconditionally.
# NOTE(review): presumably /etc/profile exports the proxy settings — confirm.
if ! env | grep -iq '^http_proxy='; then
  source /etc/profile
fi

# venv
apt-get update -y
apt-get install -y python3-venv swig
python3 -m venv /tmp/venv
source /tmp/venv/bin/activate
# Install everything from requirements.txt except the tensorflow lines; the
# pinned GPU build is installed explicitly right after.
pip install -r <(grep -v tensorflow requirements.txt)
pip install tensorflow-gpu==1.13.0-rc2
pip install wheel

###############################
### INSTALL KENLM + DECODER ###
###############################

# Prebuilt ds_ctcdecoder wheel (0.5.0a1, CPython 3.6, manylinux1 x86_64).
pip install "/data/rw/home/ds_ctcdecoder-0.5.0a1-cp36-cp36m-manylinux1_x86_64.whl"
# NOTE(review): presumably fetches the GPU native_client binaries into
# ../tmp/native_client — confirm against util/taskcluster.py.
python util/taskcluster.py --arch gpu --target ../tmp/native_client


# kenlm Dependencies
apt-get install -y build-essential cmake libboost-all-dev zlib1g-dev libbz2-dev liblzma-dev libeigen3-dev


### CREATE ALPHABET / LM / TRIE ###


# alphabet.txt
# -p: do not fail (and abort under `set -e`) when the directory already exists.
mkdir -p "/data/rw/home/${_LANG}"
# Run the character check over the train/dev/test CSVs (passed as one
# comma-separated argument) and store its stdout as the alphabet file.
python util/check_characters.py \
  -csv "${CV}/${_LANG}/clips/train.csv","${CV}/${_LANG}/clips/dev.csv","${CV}/${_LANG}/clips/test.csv" \
  -alpha \
  > "/data/rw/home/${_LANG}/alphabet.txt"

# lm.arpa
# TEXT="${SHARED_DIR}/data/wikipedia/${_LANG}/wiki.txt"
# Use the third comma-separated field of train.csv as the LM training text.
# NOTE(review): presumably the transcript column; a plain `cut` keeps the
# header row and mis-splits quoted fields containing commas — confirm.
cut -d',' -f3 "${CV}/${_LANG}/clips/train.csv" > "/data/rw/home/${_LANG}/text.txt"
TEXT="/data/rw/home/${_LANG}/text.txt"

# Estimate an order-2 language model; the ARPA file is an intermediate
# artifact written to /tmp.
/data/rw/home/kenlm/build/bin/lmplz \
  --skip_symbols \
  --order 2 \
  --text "${TEXT}" \
  --arpa /tmp/lm.arpa

# lm.binary
# Convert the intermediate ARPA model into the binary `trie` format.
/data/rw/home/kenlm/build/bin/build_binary \
  -a 255 \
  -q 8 \
  trie \
  /tmp/lm.arpa \
  "/data/rw/home/${_LANG}/lm.binary"

# trie
# Arguments, in order: alphabet file, binary LM, output trie path.
/data/rw/home/generate_trie \
  "/data/rw/home/${_LANG}/alphabet.txt" \
  "/data/rw/home/${_LANG}/lm.binary" \
  "/data/rw/home/${_LANG}/trie_utf8"

# The ARPA file was only needed to produce lm.binary.
rm /tmp/lm.arpa

9073
9174# #######################
9477
9578mkdir -p ../keep/summaries
9679
97- # --train_files "${CV}/cv_${_LANG}_valid_dev.csv" \
98- # --dev_files "${CV}/cv_${_LANG}_valid_test.csv" \
99- # --test_files "${CV}/cv_${_LANG}_valid_train.csv" \
100-
80+
10181python -u DeepSpeech.py \
102- --train_files " /data/rw/home/ky /clips/train.csv" \
103- --dev_files " /data/rw/home/ky /clips/dev.csv" \
104- --test_files " /data/rw/home/ky /clips/test.csv" \
82+ --train_files " ${CV} / ${_LANG} /clips/train.csv" \
83+ --dev_files " ${CV} / ${_LANG} /clips/dev.csv" \
84+ --test_files " ${CV} / ${_LANG} /clips/test.csv" \
10585 --train_batch_size 24 \
10686 --dev_batch_size 48 \
10787 --test_batch_size 48 \
@@ -116,6 +96,6 @@ python -u DeepSpeech.py \
11696 --summary_dir " ../keep/summaries" \
11797 --report_count 100 \
11898 --test_output_file " ../keep/RESULTS.json" \
119- --lm-binary-path " /data/rw/home/ky /lm.binary" \
120- --lm-trie-path " /data/rw/home/ky /trie_utf8" \
121- --alphabet-config-path " /data/rw/home/ky /alphabet.txt"
99+ --lm-binary-path " /data/rw/home/${_LANG} /lm.binary" \
100+ --lm-trie-path " /data/rw/home/${_LANG} /trie_utf8" \
101+ --alphabet-config-path " /data/rw/home/${_LANG} /alphabet.txt"
0 commit comments