
Commit 7a718ec

Merge pull request mozilla#1915 from igorfritzsch/master
Improved Node.js streaming inference with VAD and FFmpeg
2 parents 481dd7d + e816916 commit 7a718ec

File tree: 2 files changed (+98, -38 lines)

examples/ffmpeg_vad_streaming/README.MD

Lines changed: 41 additions & 2 deletions
@@ -20,10 +20,49 @@ sudo apt-get install ffmpeg
 
 Here is an example for a local audio file:
 ```bash
-node ./index.js --audio <AUDIO_FILE> --model $HOME/models/output_graph.pbmm --alphabet $HOME/models/alphabet.txt
+node ./index.js --audio <AUDIO_FILE> \
+    --model $HOME/models/output_graph.pbmm \
+    --alphabet $HOME/models/alphabet.txt
 ```
 
 Here is an example for a remote RTMP-Stream:
 ```bash
-node ./index.js --audio rtmp://<IP>:1935/live/teststream --model $HOME/models/output_graph.pbmm --alphabet $HOME/models/alphabet.txt
+node ./index.js --audio rtmp://<IP>:1935/live/teststream \
+    --model $HOME/models/output_graph.pbmm \
+    --alphabet $HOME/models/alphabet.txt
 ```
+
+## Examples
+Real-time streaming inference with DeepSpeech's example audio ([audio-0.4.1.tar.gz](https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz)).
+```bash
+node ./index.js --audio $HOME/audio/2830-3980-0043.wav \
+    --lm $HOME/models/lm.binary \
+    --trie $HOME/models/trie \
+    --model $HOME/models/output_graph.pbmm \
+    --alphabet $HOME/models/alphabet.txt
+```
+```bash
+node ./index.js --audio $HOME/audio/4507-16021-0012.wav \
+    --lm $HOME/models/lm.binary \
+    --trie $HOME/models/trie \
+    --model $HOME/models/output_graph.pbmm \
+    --alphabet $HOME/models/alphabet.txt
+```
+```bash
+node ./index.js --audio $HOME/audio/8455-210777-0068.wav \
+    --lm $HOME/models/lm.binary \
+    --trie $HOME/models/trie \
+    --model $HOME/models/output_graph.pbmm \
+    --alphabet $HOME/models/alphabet.txt
+```
+Real-time streaming inference in combination with an RTMP server.
+```bash
+node ./index.js --audio rtmp://<HOST>/<APP>/<KEY> \
+    --lm $HOME/models/lm.binary \
+    --trie $HOME/models/trie \
+    --model $HOME/models/output_graph.pbmm \
+    --alphabet $HOME/models/alphabet.txt
+```
+
+## Notes
+To get the best results for your own scenario, it might be helpful to adjust the parameters `VAD_MODE` and `DEBOUNCE_TIME`.
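The `VAD_MODE` and `DEBOUNCE_TIME` parameters mentioned in the Notes are plain constants near the top of `index.js`, so tuning them is a one-line change. Here is a minimal sketch of a noisier-input configuration, reusing the `VAD.createStream` options introduced by this commit; the concrete values are illustrative assumptions, not recommendations from the PR:

```js
const VAD = require("node-vad");

// Hypothetical tuning for noisy, lower-quality audio:
// a stricter detection mode plus a longer debounce window.
const VAD_STREAM = VAD.createStream({
  mode: VAD.Mode.AGGRESSIVE,   // this commit's default is VAD.Mode.NORMAL
  audioFrequency: 16000,       // must match ffmpeg's '-ar' output rate
  debounceTime: 500            // this commit's default is 20 ms
});
```

In principle, a longer `debounceTime` holds the speech-active state across short pauses, so utterances are decoded in larger pieces; the 20 ms default triggers decoding more eagerly.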

examples/ffmpeg_vad_streaming/index.js

Lines changed: 57 additions & 36 deletions
@@ -4,11 +4,12 @@ const VAD = require("node-vad");
 const Ds = require('deepspeech');
 const argparse = require('argparse');
 const util = require('util');
+const { spawn } = require('child_process');
 
 // These constants control the beam search decoder
 
 // Beam width used in the CTC decoder when building candidate transcriptions
-const BEAM_WIDTH = 1024;
+const BEAM_WIDTH = 500;
 
 // The alpha hyperparameter of the CTC decoder. Language Model weight
 const LM_ALPHA = 0.75;
@@ -44,7 +45,7 @@ parser.addArgument(['--model'], {required: true, help: 'Path to the model (proto
 parser.addArgument(['--alphabet'], {required: true, help: 'Path to the configuration file specifying the alphabet used by the network'});
 parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
 parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
-parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
+parser.addArgument(['--audio'], {required: true, help: 'Path to the audio source to run (ffmpeg supported formats)'});
 parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'});
 let args = parser.parseArgs();
 
@@ -67,51 +68,71 @@ if (args['lm'] && args['trie']) {
   console.error('Loaded language model in %ds.', totalTime(lm_load_end));
 }
 
-const vad = new VAD(VAD.Mode.NORMAL);
-const voice = {START: true, STOP: false};
-let sctx = model.setupStream(150, 16000);
-let state = voice.STOP;
-
-function finishStream() {
-  const model_load_start = process.hrtime();
-  console.error('Running inference.');
-  console.log('Transcription: ', model.finishStream(sctx));
-  const model_load_end = process.hrtime(model_load_start);
-  console.error('Inference took %ds.', totalTime(model_load_end));
-}
+// Default initial allocation: 150 frames = 3 seconds of audio
+const PRE_ALLOC_FRAMES = 150;
+
+// Default is 16kHz
+const AUDIO_SAMPLE_RATE = 16000;
+
+// Defines different thresholds for voice detection
+// NORMAL: Suitable for high-bitrate, low-noise data. May classify noise as voice, too.
+// LOW_BITRATE: Detection mode optimised for low-bitrate audio.
+// AGGRESSIVE: Detection mode best suited for somewhat noisy, lower-quality audio.
+// VERY_AGGRESSIVE: Detection mode with the lowest miss rate. Works well for most inputs.
+const VAD_MODE = VAD.Mode.NORMAL;
+// const VAD_MODE = VAD.Mode.LOW_BITRATE;
+// const VAD_MODE = VAD.Mode.AGGRESSIVE;
+// const VAD_MODE = VAD.Mode.VERY_AGGRESSIVE;
+
+// Time in milliseconds for debouncing the speech-active state
+const DEBOUNCE_TIME = 20;
+
+// Create the voice activity detection stream
+const VAD_STREAM = VAD.createStream({
+  mode: VAD_MODE,
+  audioFrequency: AUDIO_SAMPLE_RATE,
+  debounceTime: DEBOUNCE_TIME
+});
 
-let ffmpeg = require('child_process').spawn('ffmpeg', [
+// Spawn the ffmpeg process
+const ffmpeg = spawn('ffmpeg', [
   '-hide_banner',
   '-nostats',
   '-loglevel', 'fatal',
   '-i', args['audio'],
-  '-af', 'highpass=f=200,lowpass=f=3000',
   '-vn',
   '-acodec', 'pcm_s16le',
   '-ac', 1,
-  '-ar', 16000,
+  '-ar', AUDIO_SAMPLE_RATE,
   '-f', 's16le',
   'pipe:'
 ]);
 
-ffmpeg.stdout.on('data', chunk => {
-  vad.processAudio(chunk, 16000).then(res => {
-    switch (res) {
-      case VAD.Event.SILENCE:
-        if (state === voice.START) {
-          state = voice.STOP;
-          finishStream();
-          sctx = model.setupStream(150, 16000);
-        }
-        break;
-      case VAD.Event.VOICE:
-        state = voice.START;
-        model.feedAudioContent(sctx, chunk.slice(0, chunk.length / 2));
-        break;
-    }
-  });
-});
+let audioLength = 0;
+let sctx = model.setupStream(PRE_ALLOC_FRAMES, AUDIO_SAMPLE_RATE);
 
-ffmpeg.stdout.on('close', code => {
+function finishStream() {
+  const model_load_start = process.hrtime();
+  console.error('Running inference.');
+  console.log('Transcription: ', model.finishStream(sctx));
+  const model_load_end = process.hrtime(model_load_start);
+  console.error('Inference took %ds for %ds audio file.', totalTime(model_load_end), audioLength.toPrecision(4));
+  audioLength = 0;
+}
+
+function intermediateDecode() {
   finishStream();
-});
+  sctx = model.setupStream(PRE_ALLOC_FRAMES, AUDIO_SAMPLE_RATE);
+}
+
+function feedAudioContent(chunk) {
+  audioLength += (chunk.length / 2) * (1 / AUDIO_SAMPLE_RATE);
+  model.feedAudioContent(sctx, chunk.slice(0, chunk.length / 2));
+}
+
+function processVad(data) {
+  if (data.speech.start || data.speech.state) feedAudioContent(data.audioData);
+  else if (data.speech.end) { feedAudioContent(data.audioData); intermediateDecode(); }
+}
+
+ffmpeg.stdout.pipe(VAD_STREAM).on('data', processVad);
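One detail worth spelling out in the new `feedAudioContent`: ffmpeg emits raw `pcm_s16le` mono audio, so each sample occupies two bytes, and `(chunk.length / 2) * (1 / AUDIO_SAMPLE_RATE)` is simply converting bytes to samples to seconds. A minimal sketch of that arithmetic (the `chunkSeconds` helper is a name made up here for illustration, not part of the commit):

```js
// Raw 16-bit little-endian PCM: 2 bytes per sample, mono.
// seconds = (bytes / 2) / sampleRate
function chunkSeconds(chunk, sampleRate = 16000) {
  return (chunk.length / 2) / sampleRate;
}

// Example: a 32000-byte chunk at 16 kHz is (32000 / 2) / 16000 = 1 second.
```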
