Skip to content

Commit db13ba0

Browse files
author
josh
committed
script to make alphabet/lm/trie on cluster
1 parent 864b0da commit db13ba0

File tree

2 files changed

+67
-6
lines changed

2 files changed

+67
-6
lines changed

.compute

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,20 +14,25 @@ pip install $(python3 util/taskcluster.py --decoder)
1414
mkdir -p ../keep/summaries
1515

1616
LANG="sl"
17-
cv="$SHARED_DIR/data/mozilla/CommonVoice/v2.0-alpha2.0/$LANG"
17+
CV="${SHARED_DIR}/data/mozilla/CommonVoice/v2.0-alpha2.0/${LANG}"
18+
19+
# the *.csv on cluster have old paths
20+
cp ${CV}/*.csv .
21+
sed -Ei 's/snakepit/data\/ro/g' cv_${LANG}_valid_*.csv
22+
1823

1924
python -u DeepSpeech.py \
20-
--train_files '$CV/cv_${LANG}_valid_train.csv' \
21-
--dev_files '$CV/cv_${LANG}_valid_dev.csv' \
22-
--test_files '$CV/cv_${LANG}_valid_test.csv' \
25+
--train_files "cv_${LANG}_valid_train.csv" \
26+
--dev_files "cv_${LANG}_valid_dev.csv" \
27+
--test_files "cv_${LANG}_valid_test.csv" \
2328
--train_batch_size 24 \
2429
--dev_batch_size 48 \
2530
--test_batch_size 48 \
2631
--noearly_stop \
2732
--n_hidden 2048 \
2833
--learning_rate 0.0001 \
29-
--dropout_rate 0.15 \
30-
--epoch 1 \
34+
--dropout_rate 0.2 \
35+
--epoch 30 \
3136
--display_step 0 \
3237
--validation_step 1 \
3338
--checkpoint_dir "../keep" \

util/create_alpha_lm_trie.sh

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#!/bin/bash
# Build the alphabet, KenLM language model, and decoder trie for a
# Common Voice language, for use with DeepSpeech.
#
# Usage: util/create_alpha_lm_trie.sh <lang> <text-corpus>
#   $1 - language code (e.g. "sl"); selects cv_<lang>_valid_{train,dev,test}.csv
#        in the current directory
#   $2 - path to a plain-text corpus used to train the language model
#
# Outputs: data/alphabet.txt, data/lm/lm.binary, data/lm/trie
# Side effects: apt-get installs build deps; downloads and compiles KenLM
#               into ./kenlm.
set -euo pipefail

# BUG FIX: the original assigned to LANG, which clobbers the locale
# environment variable; use a lower-case script-local name instead.
lang=${1:?usage: $0 <lang> <text-corpus>}
text=${2:?usage: $0 <lang> <text-corpus>}

echo "$0: Looking for CSV transcripts at cv_${lang}_valid_{train/dev/test}.csv"
echo "$0: Looking for text training corpus at ${text}"

# KenLM build dependencies.
apt-get install -y build-essential cmake libboost-all-dev zlib1g-dev \
  libbz2-dev liblzma-dev libeigen3-dev

# Install KenLM #

wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
mkdir -p kenlm/build          # -p so a re-run does not fail
(
  # Build in a subshell so the cwd of the rest of the script is untouched;
  # under `set -e` a failed cd/cmake/make aborts the whole script.
  cd kenlm/build
  cmake ..
  make -j "$(nproc)"
)

#################
### CREATE LM ###
#################

# Make alphabet.txt #
# BUG FIX: the original passed the *train* CSV three times instead of
# train/dev/test, and piped ('|') into data/alphabet.txt — which would try
# to execute the file — instead of redirecting ('>') into it.
python3 util/check_characters.py \
  -csv "cv_${lang}_valid_train.csv","cv_${lang}_valid_dev.csv","cv_${lang}_valid_test.csv" \
  -alpha \
  > data/alphabet.txt

# Make lm.arpa #
# Use an unpredictable temp name and clean it up on every exit path
# (the original used the fixed /tmp/lm.arpa and removed it only on success).
arpa=$(mktemp /tmp/lm.XXXXXX.arpa)
trap 'rm -f -- "$arpa"' EXIT

kenlm/build/bin/lmplz \
  --order 2 \
  --text "${text}" \
  --arpa "${arpa}"

# Make lm.binary #
mkdir -p data/lm              # build_binary does not create the directory
kenlm/build/bin/build_binary \
  -a 255 \
  -q 8 trie \
  "${arpa}" \
  data/lm/lm.binary

# Make trie #

native_client/generate_trie \
  data/alphabet.txt \
  data/lm/lm.binary \
  data/lm/trie

0 commit comments

Comments
 (0)