commitmoji
diff --git a/‎.install‎
Lines changed: 1 addition & 1 deletion b/‎.install‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎DeepSpeech.py‎
Lines changed: 1 addition & 0 deletions b/‎DeepSpeech.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎Dockerfile‎
Lines changed: 1 addition & 1 deletion b/‎Dockerfile‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md‎
Lines changed: 2 additions & 2 deletions b/‎README.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎VERSION‎
Lines changed: 1 addition & 1 deletion b/‎VERSION‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/ffmpeg_vad_streaming/README.MD‎
Lines changed: 29 additions & 0 deletions b/‎examples/ffmpeg_vad_streaming/README.MD‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎examples/ffmpeg_vad_streaming/index.js‎
Lines changed: 118 additions & 0 deletions b/‎examples/ffmpeg_vad_streaming/index.js‎
Lines changed: 118 additions & 0 deletions
diff --git a/‎examples/ffmpeg_vad_streaming/package.json‎
Lines changed: 16 additions & 0 deletions b/‎examples/ffmpeg_vad_streaming/package.json‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎examples/mic_vad_streaming/README.md‎
Lines changed: 6 additions & 0 deletions b/‎examples/mic_vad_streaming/README.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎native_client/Android.mk‎
Lines changed: 14 additions & 0 deletions b/‎native_client/Android.mk‎
Lines changed: 14 additions & 0 deletions
@@ -3,7 +3,7 @@
 virtualenv -p python3 ../tmp/venv
 source ../tmp/venv/bin/activate
 pip install -r <(grep -v tensorflow requirements.txt)
-pip install tensorflow-gpu==1.12.0rc2
+pip install tensorflow-gpu==1.12.0
 
 python3 util/taskcluster.py --arch gpu --target ../tmp/native_client
 
 
@@ -890,6 +890,7 @@ def main(_):
         if len(FLAGS.worker_hosts) == 0:
             # Only one local task: this process (default case - no cluster)
             with tf.Graph().as_default():
+                tf.set_random_seed(FLAGS.random_seed)
                 train()
             # Now do a final test epoch
             if FLAGS.test:
 
@@ -186,7 +186,7 @@ RUN cp /tensorflow/bazel-bin/native_client/generate_trie /DeepSpeech/native_clie
 
 # Install TensorFlow
 WORKDIR /DeepSpeech/
-RUN pip install tensorflow-gpu==1.12.0rc2
+RUN pip install tensorflow-gpu==1.12.0
 
 
 # Make DeepSpeech and install Python bindings
 
@@ -227,7 +227,7 @@ If you have a capable (Nvidia, at least 8GB of VRAM) GPU, it is highly recommend
 
 ```bash
 pip3 uninstall tensorflow
-pip3 install 'tensorflow-gpu==1.12.0rc2'
+pip3 install 'tensorflow-gpu==1.12.0'
 ```
 
 ### Common Voice training data
@@ -284,7 +284,7 @@ If you are brave enough, you can also include the `other` dataset, which contain
 The central (Python) script is `DeepSpeech.py` in the project's root directory. For its list of command line options, you can call:
 
 ```bash
-./DeepSpeech.py --help
+./DeepSpeech.py --helpfull
 ```
 
 To get the output of this in a slightly better-formatted way, you can also look up the option definitions at the top of `DeepSpeech.py`.
 
@@ -1 +1 @@
-0.4.0-alpha.0
+0.4.0-alpha.2
@@ -0,0 +1,29 @@
+# FFmpeg VAD Streaming
+
+Streaming inference from an arbitrary source (FFmpeg input) to DeepSpeech, using VAD (voice activity detection). A fairly simple example demonstrating the DeepSpeech streaming API in Node.js.
+
+This example was successfully tested with a mobile phone streaming a live feed to an RTMP server (nginx-rtmp), which could then be used by this script for near-real-time speech recognition.
+
+## Installation
+
+```bash
+npm install
+```
+
+FFmpeg must also be installed:
+
+```bash
+sudo apt-get install ffmpeg
+```
+
+## Usage
+
+Here is an example for a local audio file:
+```bash
+node ./index.js --audio <AUDIO_FILE> --model $HOME/models/output_graph.pbmm --alphabet $HOME/models/alphabet.txt
+```
+
+Here is an example for a remote RTMP stream:
+```bash
+node ./index.js  --audio rtmp://<IP>:1935/live/teststream --model $HOME/models/output_graph.pbmm --alphabet $HOME/models/alphabet.txt
+```
@@ -0,0 +1,118 @@
+#!/usr/bin/env node
+
+const VAD = require("node-vad");
+const Ds = require('deepspeech');
+const argparse = require('argparse');
+const util = require('util');
+
+// These constants control the beam search decoder
+
+// Beam width used in the CTC decoder when building candidate transcriptions
+const BEAM_WIDTH = 1024;
+
+// The alpha hyperparameter of the CTC decoder. Language Model weight
+const LM_WEIGHT = 1.50;
+
+// Valid word insertion weight. This is used to lessen the word insertion penalty
+// when the inserted word is part of the vocabulary
+const VALID_WORD_COUNT_WEIGHT = 2.25;
+
+// These constants are tied to the shape of the graph used (changing them changes
+// the geometry of the first layer), so make sure you use the same constants that
+// were used during training
+
+// Number of MFCC features to use
+const N_FEATURES = 26;
+
+// Size of the context window used for producing timesteps in the input vector
+const N_CONTEXT = 9;
+
+let VersionAction = function VersionAction(options) {
+	options = options || {};
+	options.nargs = 0;
+	argparse.Action.call(this, options);
+};
+
+util.inherits(VersionAction, argparse.Action);
+
+VersionAction.prototype.call = function(parser) {
+	Ds.printVersions();
+	process.exit(0);
+};
+
+let parser = new argparse.ArgumentParser({addHelp: true, description: 'Running DeepSpeech inference.'});
+parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
+parser.addArgument(['--alphabet'], {required: true, help: 'Path to the configuration file specifying the alphabet used by the network'});
+parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
+parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
+parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
+parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'});
+let args = parser.parseArgs();
+
+function totalTime(hrtimeValue) {
+	return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4);
+}
+
+console.error('Loading model from file %s', args['model']);
+const model_load_start = process.hrtime();
+let model = new Ds.Model(args['model'], N_FEATURES, N_CONTEXT, args['alphabet'], BEAM_WIDTH);
+const model_load_end = process.hrtime(model_load_start);
+console.error('Loaded model in %ds.', totalTime(model_load_end));
+
+if (args['lm'] && args['trie']) {
+	console.error('Loading language model from files %s %s', args['lm'], args['trie']);
+	const lm_load_start = process.hrtime();
+	model.enableDecoderWithLM(args['alphabet'], args['lm'], args['trie'],
+		LM_WEIGHT, VALID_WORD_COUNT_WEIGHT);
+	const lm_load_end = process.hrtime(lm_load_start);
+	console.error('Loaded language model in %ds.', totalTime(lm_load_end));
+}
+
+const vad = new VAD(VAD.Mode.NORMAL); // voice activity detector; classifies each audio chunk received from FFmpeg
+const voice = {START: true, STOP: false}; // the two values `state` can take: START while speech is in progress, STOP otherwise
+let sctx = model.setupStream(150, 16000); // current streaming context, replaced after every finished utterance; arguments are presumably (pre-allocated frames, sample rate in Hz) - TODO confirm against the deepspeech API
+let state = voice.STOP; // no speech has been detected yet
+
+function finishStream() {
+	const model_load_start = process.hrtime();
+	console.error('Running inference.');
+	console.log('Transcription: ', model.finishStream(sctx));
+	const model_load_end = process.hrtime(model_load_start);
+	console.error('Inference took %ds.', totalTime(model_load_end));
+}
+
+let ffmpeg = require('child_process').spawn('ffmpeg', [
+	'-hide_banner',
+	'-nostats',
+	'-loglevel', 'fatal',
+	'-i', args['audio'],
+	'-af', 'highpass=f=200,lowpass=f=3000',
+	'-vn',
+	'-acodec', 'pcm_s16le',
+	'-ac', 1,
+	'-ar', 16000,
+	'-f', 's16le',
+	'pipe:'
+]);
+
+ffmpeg.stdout.on('data', chunk => {
+	vad.processAudio(chunk, 16000).then(res => {
+		switch (res) {
+			case VAD.Event.SILENCE:
+				if (state === voice.START) {
+					state = voice.STOP;
+					finishStream();
+					sctx = model.setupStream(150,16000);
+				}
+				break;
+			case VAD.Event.VOICE:
+				state = voice.START;
+				model.feedAudioContent(sctx, chunk.slice(0, chunk.length / 2));
+				break;
+		}
+	});
+});
+
+ffmpeg.stdout.on('close', code => {
+	finishStream();
+});
@@ -0,0 +1,16 @@
+{
+  "name": "ffmpeg-vad-streaming",
+  "version": "1.0.0",
+  "description": "Streaming inference from arbitrary source with VAD and FFmpeg",
+  "main": "index.js",
+  "scripts": {
+    "start": "node ./index.js"
+  },
+  "dependencies": {
+    "argparse": "^1.0.10",
+    "deepspeech": "^0.3.0",
+    "node-vad": "^1.1.1",
+    "util": "^0.11.1"
+  },
+  "license" : "MIT"
+}
@@ -14,6 +14,12 @@ Uses portaudio for microphone access, so on Linux, you may need to install its h
 sudo apt install portaudio19-dev
 ```
 
+Installation on macOS may fail because of portaudio; use brew to install it:
+
+```bash
+brew install portaudio
+```
+
 ## Usage
 
 ```
 
@@ -0,0 +1,14 @@
+# ndk-build script for the native client. It declares two modules: the
+# prebuilt libdeepspeech.so and a `deepspeech` executable linked against it.
+# TFDIR is not set in this file and must be provided by the caller.
+LOCAL_PATH := $(call my-dir)
+
+# Module 1: wrap the already-built shared library so other modules can
+# depend on it.
+include $(CLEAR_VARS)
+LOCAL_MODULE    := deepspeech-prebuilt
+LOCAL_SRC_FILES := $(TFDIR)/bazel-bin/native_client/libdeepspeech.so
+include $(PREBUILT_SHARED_LIBRARY)
+
+# Module 2: the command-line client, built from client.cc and linked against
+# the prebuilt library declared above.
+include $(CLEAR_VARS)
+LOCAL_CPP_EXTENSION    := .cc .cxx .cpp
+LOCAL_MODULE           := deepspeech
+LOCAL_SRC_FILES        := client.cc
+LOCAL_SHARED_LIBRARIES := deepspeech-prebuilt
+# Hands --no-as-needed to the linker.
+LOCAL_LDFLAGS          := -Wl,--no-as-needed
+include $(BUILD_EXECUTABLE)