
Commit 664c548

josh committed
utf-8 working, trying with and without scorer in evaluate.py
1 parent 0be1d84 commit 664c548

File tree

.compute
evaluate.py

2 files changed: +18 -18 lines changed

.compute

Lines changed: 15 additions & 17 deletions
@@ -17,20 +17,13 @@ fi
 
 pip install -r <(grep -v tensorflow requirements.txt)
 pip install tensorflow-gpu==1.13.0-rc2
+<<<<<<< HEAD
 # Install ds_ctcdecoder package from TaskCluster
 pip install $(python3 util/taskcluster.py --decoder)
 
 # kenlm Dependencies
 apt-get install -y build-essential cmake libboost-all-dev zlib1g-dev libbz2-dev liblzma-dev libeigen3-dev
 
-# Install Kenlm #
-# wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz --no-same-owner
-# mkdir kenlm/build
-# cd kenlm/build
-# cmake ..
-# make -j `nproc`
-# cd ../..
-
 
 ###################################
 ### CREATE ALPHABET / LM / TRIE ###
@@ -43,11 +36,15 @@ python util/check_characters.py \
   > data/alphabet.txt
 
 # lm.arpa
-# TEXT="${SHARED_DIR}/data/wikipedia/zh-tw/wiki.txt"
-# sed -e 's/\(.\)/\1 /g' <$TEXT >CHAR_GRAMS
+# TEXT="${SHARED_DIR}/data/wikipedia/${_LANG}/wiki.txt"
+# sed -e 's/\(.\)/\1 /g' <$TEXT >CHAR_GRAMS_ZH_TW
+TEXT="/data/rw/home/CHAR_GRAMS_ZH_TW"
+
+
 /data/rw/home/kenlm/build/bin/lmplz \
+  --skip_symbols \
   --order 2 \
-  --text "/data/rw/home/CHAR_GRAMS_ZH_TW" \
+  --text "${TEXT}" \
   --arpa lm.arpa
 
 # lm.binary
@@ -57,7 +54,8 @@ python util/check_characters.py \
   lm.arpa \
   data/lm/lm.binary
 
-# trie
+# trie
+# ../tmp/native_client/generate_trie \
 /data/rw/home/generate_trie \
   data/alphabet.txt \
   data/lm/lm.binary \
@@ -73,17 +71,17 @@ rm lm.arpa
 mkdir -p ../keep/summaries
 
 python -u DeepSpeech.py \
-  --train_files "${CV}/cv_${_LANG}_valid_train.csv" \
-  --dev_files "${CV}/cv_${_LANG}_valid_dev.csv" \
-  --test_files "${CV}/cv_${_LANG}_valid_test.csv" \
+  --train_files "${CV}/cv_${_LANG}_valid_dev.csv" \
+  --dev_files "${CV}/cv_${_LANG}_valid_test.csv" \
+  --test_files "${CV}/cv_${_LANG}_valid_train.csv" \
   --train_batch_size 24 \
  --dev_batch_size 48 \
   --test_batch_size 48 \
-  --noearly_stop \
   --n_hidden 2048 \
   --learning_rate 0.0001 \
   --dropout_rate 0.2 \
-  --epoch 2 \
+  --epoch 1000 \
+  --earlystop_nsteps 5 \
   --display_step 0 \
   --validation_step 1 \
   --checkpoint_dir "../keep" \
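
The commented-out sed lines above document how the character-level training text was produced: a space is inserted after every character of the wiki dump so that lmplz sees single characters as tokens for the order-2 model, and the script now points lmplz at the pre-split /data/rw/home/CHAR_GRAMS_ZH_TW file. As a rough illustration only, here is a minimal Python sketch of that preprocessing step, assuming a UTF-8 wiki.txt dump; the file names below are placeholders, not the paths the script uses.

# Approximates the commented-out `sed -e 's/\(.\)/\1 /g'` step: put a space
# between every character so KenLM's lmplz can train a character-level model.
# Input/output file names are illustrative placeholders.
def split_into_chars(line: str) -> str:
    return " ".join(line.rstrip("\n"))

with open("wiki.txt", encoding="utf-8") as src, \
     open("CHAR_GRAMS_ZH_TW", "w", encoding="utf-8") as dst:
    for line in src:
        dst.write(split_into_chars(line) + "\n")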

evaluate.py

Lines changed: 3 additions & 1 deletion
@@ -144,8 +144,10 @@ def create_windows(features):
     # Second pass, decode logits and compute WER and edit distance metrics
     for logits, batch in bar(zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
         seq_lengths = batch['features_len'].values.astype(np.int32)
+        # decoded = ctc_beam_search_decoder_batch(logits, seq_lengths, Config.alphabet, FLAGS.beam_width,
+        #                                         num_processes=num_processes, scorer=scorer)
         decoded = ctc_beam_search_decoder_batch(logits, seq_lengths, Config.alphabet, FLAGS.beam_width,
-                                                num_processes=num_processes, scorer=scorer)
+                                                num_processes=num_processes)
 
         ground_truths.extend(Config.alphabet.decode(l.astype(np.uint8)) for l in batch['transcript'])
         predictions.extend(d[0][1] for d in decoded)
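
The evaluate.py change runs the beam search without the KenLM scorer by commenting out the old call, which matches the commit message's "with and without scorer" experiment. A minimal sketch of one way to keep both variants switchable without editing the call site is below; it reuses the names visible in the diff (ctc_beam_search_decoder_batch, Config.alphabet, FLAGS.beam_width, num_processes, scorer) and assumes the ds_ctcdecoder package that .compute installs, while the helper itself is hypothetical and not part of this commit.

# Hypothetical helper, not in this commit: pass the scorer only when one is
# supplied, otherwise fall back to plain beam search, mirroring the two call
# variants shown in the diff above.
from ds_ctcdecoder import ctc_beam_search_decoder_batch

def decode_batch(logits, seq_lengths, alphabet, beam_width, num_processes, scorer=None):
    kwargs = {"num_processes": num_processes}
    if scorer is not None:
        kwargs["scorer"] = scorer
    return ctc_beam_search_decoder_batch(logits, seq_lengths, alphabet,
                                         beam_width, **kwargs)

# with scorer:    decode_batch(logits, seq_lengths, Config.alphabet, FLAGS.beam_width, num_processes, scorer)
# without scorer: decode_batch(logits, seq_lengths, Config.alphabet, FLAGS.beam_width, num_processes)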
