
Commit 7a718ec

Merge pull request mozilla#1915 from igorfritzsch/master
Improved Node.js streaming inference with VAD and FFmpeg
2 parents 481dd7d + e816916 commit 7a718ec

File tree: 2 files changed (+98, -38 lines)

examples/ffmpeg_vad_streaming/README.MD

Lines changed: 41 additions & 2 deletions
@@ -20,10 +20,49 @@ sudo apt-get install ffmpeg
 
 Here is an example for a local audio file:
 ```bash
-node ./index.js --audio <AUDIO_FILE> --model $HOME/models/output_graph.pbmm --alphabet $HOME/models/alphabet.txt
+node ./index.js --audio <AUDIO_FILE> \
+    --model $HOME/models/output_graph.pbmm \
+    --alphabet $HOME/models/alphabet.txt
 ```
 
 Here is an example for a remote RTMP-Stream:
 ```bash
-node ./index.js --audio rtmp://<IP>:1935/live/teststream --model $HOME/models/output_graph.pbmm --alphabet $HOME/models/alphabet.txt
+node ./index.js --audio rtmp://<IP>:1935/live/teststream \
+    --model $HOME/models/output_graph.pbmm \
+    --alphabet $HOME/models/alphabet.txt
 ```
+
+## Examples
+Real-time streaming inference with DeepSpeech's example audio ([audio-0.4.1.tar.gz](https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz)).
+```bash
+node ./index.js --audio $HOME/audio/2830-3980-0043.wav \
+    --lm $HOME/models/lm.binary \
+    --trie $HOME/models/trie \
+    --model $HOME/models/output_graph.pbmm \
+    --alphabet $HOME/models/alphabet.txt
+```
+```bash
+node ./index.js --audio $HOME/audio/4507-16021-0012.wav \
+    --lm $HOME/models/lm.binary \
+    --trie $HOME/models/trie \
+    --model $HOME/models/output_graph.pbmm \
+    --alphabet $HOME/models/alphabet.txt
+```
+```bash
+node ./index.js --audio $HOME/audio/8455-210777-0068.wav \
+    --lm $HOME/models/lm.binary \
+    --trie $HOME/models/trie \
+    --model $HOME/models/output_graph.pbmm \
+    --alphabet $HOME/models/alphabet.txt
+```
+Real-time streaming inference in combination with an RTMP server.
+```bash
+node ./index.js --audio rtmp://<HOST>/<APP>/<KEY> \
+    --lm $HOME/models/lm.binary \
+    --trie $HOME/models/trie \
+    --model $HOME/models/output_graph.pbmm \
+    --alphabet $HOME/models/alphabet.txt
+```
+
+## Notes
+To get the best results for your own scenario, it might be helpful to adjust the parameters `VAD_MODE` and `DEBOUNCE_TIME`.
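The `VAD_MODE` and `DEBOUNCE_TIME` parameters mentioned in the Notes are plain constants near the top of `index.js`, so tuning them is a one-line change. Here is a minimal sketch of a noisier-input configuration, reusing the `VAD.createStream` options introduced by this commit; the concrete values are illustrative assumptions, not recommendations from the PR:

```js
const VAD = require("node-vad");

// Hypothetical tuning for noisy, lower-quality audio:
// a stricter detection mode plus a longer debounce window.
const VAD_STREAM = VAD.createStream({
  mode: VAD.Mode.AGGRESSIVE,   // this commit's default is VAD.Mode.NORMAL
  audioFrequency: 16000,       // must match ffmpeg's '-ar' output rate
  debounceTime: 500            // this commit's default is 20 ms
});
```

In principle, a longer `debounceTime` holds the speech-active state across short pauses, so utterances are decoded in larger pieces; the 20 ms default triggers decoding more eagerly.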

examples/ffmpeg_vad_streaming/index.js

Lines changed: 57 additions & 36 deletions
@@ -4,11 +4,12 @@ const VAD = require("node-vad");
 const Ds = require('deepspeech');
 const argparse = require('argparse');
 const util = require('util');
+const { spawn } = require('child_process');
 
 // These constants control the beam search decoder
 
 // Beam width used in the CTC decoder when building candidate transcriptions
-const BEAM_WIDTH = 1024;
+const BEAM_WIDTH = 500;
 
 // The alpha hyperparameter of the CTC decoder. Language Model weight
 const LM_ALPHA = 0.75;
@@ -44,7 +45,7 @@ parser.addArgument(['--model'], {required: true, help: 'Path to the model (proto
 parser.addArgument(['--alphabet'], {required: true, help: 'Path to the configuration file specifying the alphabet used by the network'});
 parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
 parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
-parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
+parser.addArgument(['--audio'], {required: true, help: 'Path to the audio source to run (ffmpeg supported formats)'});
 parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'});
 let args = parser.parseArgs();
 
@@ -67,51 +68,71 @@ if (args['lm'] && args['trie']) {
   console.error('Loaded language model in %ds.', totalTime(lm_load_end));
 }
 
-const vad = new VAD(VAD.Mode.NORMAL);
-const voice = {START: true, STOP: false};
-let sctx = model.setupStream(150, 16000);
-let state = voice.STOP;
-
-function finishStream() {
-  const model_load_start = process.hrtime();
-  console.error('Running inference.');
-  console.log('Transcription: ', model.finishStream(sctx));
-  const model_load_end = process.hrtime(model_load_start);
-  console.error('Inference took %ds.', totalTime(model_load_end));
-}
+// Default initial allocation: 150 frames = 3 seconds of audio
+const PRE_ALLOC_FRAMES = 150;
+
+// Default is 16kHz
+const AUDIO_SAMPLE_RATE = 16000;
+
+// Defines different thresholds for voice detection
+// NORMAL: Suitable for high-bitrate, low-noise data. May classify noise as voice, too.
+// LOW_BITRATE: Detection mode optimised for low-bitrate audio.
+// AGGRESSIVE: Detection mode best suited for somewhat noisy, lower-quality audio.
+// VERY_AGGRESSIVE: Detection mode with the lowest miss rate. Works well for most inputs.
+const VAD_MODE = VAD.Mode.NORMAL;
+// const VAD_MODE = VAD.Mode.LOW_BITRATE;
+// const VAD_MODE = VAD.Mode.AGGRESSIVE;
+// const VAD_MODE = VAD.Mode.VERY_AGGRESSIVE;
+
+// Time in milliseconds for debouncing the speech-active state
+const DEBOUNCE_TIME = 20;
+
+// Create the voice activity detection stream
+const VAD_STREAM = VAD.createStream({
+  mode: VAD_MODE,
+  audioFrequency: AUDIO_SAMPLE_RATE,
+  debounceTime: DEBOUNCE_TIME
+});
 
-let ffmpeg = require('child_process').spawn('ffmpeg', [
+// Spawn the ffmpeg process
+const ffmpeg = spawn('ffmpeg', [
   '-hide_banner',
   '-nostats',
   '-loglevel', 'fatal',
   '-i', args['audio'],
-  '-af', 'highpass=f=200,lowpass=f=3000',
   '-vn',
   '-acodec', 'pcm_s16le',
   '-ac', 1,
-  '-ar', 16000,
+  '-ar', AUDIO_SAMPLE_RATE,
   '-f', 's16le',
   'pipe:'
 ]);
 
-ffmpeg.stdout.on('data', chunk => {
-  vad.processAudio(chunk, 16000).then(res => {
-    switch (res) {
-      case VAD.Event.SILENCE:
-        if (state === voice.START) {
-          state = voice.STOP;
-          finishStream();
-          sctx = model.setupStream(150, 16000);
-        }
-        break;
-      case VAD.Event.VOICE:
-        state = voice.START;
-        model.feedAudioContent(sctx, chunk.slice(0, chunk.length / 2));
-        break;
-    }
-  });
-});
+let audioLength = 0;
+let sctx = model.setupStream(PRE_ALLOC_FRAMES, AUDIO_SAMPLE_RATE);
 
-ffmpeg.stdout.on('close', code => {
+function finishStream() {
+  const model_load_start = process.hrtime();
+  console.error('Running inference.');
+  console.log('Transcription: ', model.finishStream(sctx));
+  const model_load_end = process.hrtime(model_load_start);
+  console.error('Inference took %ds for %ds audio file.', totalTime(model_load_end), audioLength.toPrecision(4));
+  audioLength = 0;
+}
+
+function intermediateDecode() {
   finishStream();
-});
+  sctx = model.setupStream(PRE_ALLOC_FRAMES, AUDIO_SAMPLE_RATE);
+}
+
+function feedAudioContent(chunk) {
+  audioLength += (chunk.length / 2) * (1 / AUDIO_SAMPLE_RATE);
+  model.feedAudioContent(sctx, chunk.slice(0, chunk.length / 2));
+}
+
+function processVad(data) {
+  if (data.speech.start || data.speech.state) feedAudioContent(data.audioData);
+  else if (data.speech.end) { feedAudioContent(data.audioData); intermediateDecode(); }
+}
+
+ffmpeg.stdout.pipe(VAD_STREAM).on('data', processVad);
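One detail worth spelling out in the new `feedAudioContent`: ffmpeg emits raw `pcm_s16le` mono audio, so each sample occupies two bytes, and `(chunk.length / 2) * (1 / AUDIO_SAMPLE_RATE)` is simply converting bytes to samples to seconds. A minimal sketch of that arithmetic (the `chunkSeconds` helper is a name made up here for illustration, not part of the commit):

```js
// Raw 16-bit little-endian PCM: 2 bytes per sample, mono.
// seconds = (bytes / 2) / sampleRate
function chunkSeconds(chunk, sampleRate = 16000) {
  return (chunk.length / 2) / sampleRate;
}

// Example: a 32000-byte chunk at 16 kHz is (32000 / 2) / 16000 = 1 second.
```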
