@@ -4,11 +4,12 @@ const VAD = require("node-vad");
 const Ds = require('deepspeech');
 const argparse = require('argparse');
 const util = require('util');
+const { spawn } = require('child_process');

 // These constants control the beam search decoder

 // Beam width used in the CTC decoder when building candidate transcriptions
-const BEAM_WIDTH = 1024;
+const BEAM_WIDTH = 500;

 // The alpha hyperparameter of the CTC decoder. Language Model weight
 const LM_ALPHA = 0.75;
@@ -44,7 +45,7 @@ parser.addArgument(['--model'], {required: true, help: 'Path to the model (proto
 parser.addArgument(['--alphabet'], {required: true, help: 'Path to the configuration file specifying the alphabet used by the network'});
 parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
 parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
-parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
+parser.addArgument(['--audio'], {required: true, help: 'Path to the audio source to run (ffmpeg-supported formats)'});
 parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exit'});
 let args = parser.parseArgs();

@@ -67,51 +68,71 @@ if (args['lm'] && args['trie']) {
     console.error('Loaded language model in %ds.', totalTime(lm_load_end));
 }

-const vad = new VAD(VAD.Mode.NORMAL);
-const voice = {START: true, STOP: false};
-let sctx = model.setupStream(150, 16000);
-let state = voice.STOP;
-
-function finishStream() {
-    const model_load_start = process.hrtime();
-    console.error('Running inference.');
-    console.log('Transcription: ', model.finishStream(sctx));
-    const model_load_end = process.hrtime(model_load_start);
-    console.error('Inference took %ds.', totalTime(model_load_end));
-}
+// Default initial allocation = 3 seconds (150 frames)
+const PRE_ALLOC_FRAMES = 150;
+
+// Default is 16kHz
+const AUDIO_SAMPLE_RATE = 16000;
+
+// Defines different thresholds for voice detection
+// NORMAL: Suitable for high bitrate, low-noise data. May classify noise as voice, too.
+// LOW_BITRATE: Detection mode optimised for low-bitrate audio.
+// AGGRESSIVE: Detection mode best suited for somewhat noisy, lower quality audio.
+// VERY_AGGRESSIVE: Detection mode with lowest miss-rate. Works well for most inputs.
+const VAD_MODE = VAD.Mode.NORMAL;
+// const VAD_MODE = VAD.Mode.LOW_BITRATE;
+// const VAD_MODE = VAD.Mode.AGGRESSIVE;
+// const VAD_MODE = VAD.Mode.VERY_AGGRESSIVE;
+
+// Time in milliseconds for debouncing speech active state
+const DEBOUNCE_TIME = 20;
+
+// Create the voice activity detection stream
+const VAD_STREAM = VAD.createStream({
+    mode: VAD_MODE,
+    audioFrequency: AUDIO_SAMPLE_RATE,
+    debounceTime: DEBOUNCE_TIME
+});

-let ffmpeg = require('child_process').spawn('ffmpeg', [
+// Spawn ffmpeg process
+const ffmpeg = spawn('ffmpeg', [
     '-hide_banner',
     '-nostats',
     '-loglevel', 'fatal',
     '-i', args['audio'],
-    '-af', 'highpass=f=200,lowpass=f=3000',
     '-vn',
     '-acodec', 'pcm_s16le',
     '-ac', 1,
-    '-ar', 16000,
+    '-ar', AUDIO_SAMPLE_RATE,
     '-f', 's16le',
     'pipe:'
 ]);

-ffmpeg.stdout.on('data', chunk => {
-    vad.processAudio(chunk, 16000).then(res => {
-        switch (res) {
-            case VAD.Event.SILENCE:
-                if (state === voice.START) {
-                    state = voice.STOP;
-                    finishStream();
-                    sctx = model.setupStream(150, 16000);
-                }
-                break;
-            case VAD.Event.VOICE:
-                state = voice.START;
-                model.feedAudioContent(sctx, chunk.slice(0, chunk.length / 2));
-                break;
-        }
-    });
-});
+let audioLength = 0;
+let sctx = model.setupStream(PRE_ALLOC_FRAMES, AUDIO_SAMPLE_RATE);

-ffmpeg.stdout.on('close', code => {
+function finishStream() {
+    const model_load_start = process.hrtime();
+    console.error('Running inference.');
+    console.log('Transcription: ', model.finishStream(sctx));
+    const model_load_end = process.hrtime(model_load_start);
+    console.error('Inference took %ds for %ds of audio.', totalTime(model_load_end), audioLength.toPrecision(4));
+    audioLength = 0;
+}
+
+function intermediateDecode() {
     finishStream();
-});
+    sctx = model.setupStream(PRE_ALLOC_FRAMES, AUDIO_SAMPLE_RATE);
+}
+
+function feedAudioContent(chunk) {
+    audioLength += (chunk.length / 2) * (1 / AUDIO_SAMPLE_RATE);
+    model.feedAudioContent(sctx, chunk.slice(0, chunk.length / 2));
+}
+
+function processVad(data) {
+    if (data.speech.start || data.speech.state) feedAudioContent(data.audioData);
+    else if (data.speech.end) { feedAudioContent(data.audioData); intermediateDecode(); }
+}
+
+ffmpeg.stdout.pipe(VAD_STREAM).on('data', processVad);
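
For reference, the spawned process is equivalent to running ffmpeg by hand with
ffmpeg -hide_banner -nostats -loglevel fatal -i <source> -vn -acodec pcm_s16le -ac 1 -ar 16000 -f s16le pipe:
i.e. decode any ffmpeg-readable source to raw mono 16-bit 16 kHz PCM on stdout. Note that the old band-pass filter ('-af', 'highpass=f=200,lowpass=f=3000') is dropped, so the model now receives unfiltered audio.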
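
The rewrite also replaces per-chunk vad.processAudio() calls with node-vad's stream interface. The following is a minimal sketch of the event contract that processVad consumes, assuming only the createStream options and payload fields the diff itself uses; the silence buffer at the end is a hypothetical stand-in input:

const VAD = require("node-vad");

const vadStream = VAD.createStream({
    mode: VAD.Mode.NORMAL,
    audioFrequency: 16000,
    debounceTime: 20
});

vadStream.on("data", data => {
    // data.audioData    -> Buffer with the PCM chunk that was analysed
    // data.speech.start -> true on the first chunk of a detected speech segment
    // data.speech.state -> true while the (debounced) segment stays active
    // data.speech.end   -> true on the chunk that closes the segment
    console.log(data.speech.start, data.speech.state, data.speech.end);
});

// One second of 16 kHz mono s16le audio is 32000 bytes (2 bytes per sample);
// feeding silence should yield payloads with all speech flags false.
vadStream.write(Buffer.alloc(32000));
vadStream.end();

This layout is also why feedAudioContent can meter elapsed audio: chunk.length / 2 converts bytes to 16-bit samples, and dividing by AUDIO_SAMPLE_RATE converts samples to seconds, so a 32000-byte chunk counts as exactly 1.0 s in the inference-time report.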