1 change: 1 addition & 0 deletions apps/web/package.json
@@ -19,6 +19,7 @@
"@ffmpeg/core": "^0.12.10",
"@ffmpeg/ffmpeg": "^0.12.15",
"@ffmpeg/util": "^0.12.2",
"@huggingface/transformers": "^3.1.0",
"@hello-pangea/dnd": "^18.0.1",
"@hookform/resolvers": "^3.9.1",
"@opencut/auth": "workspace:*",
254 changes: 254 additions & 0 deletions apps/web/public/workers/speech-to-text.worker.js
@@ -0,0 +1,254 @@
// Speech-to-Text Worker
// This worker loads @huggingface/transformers from CDN using ESM

let pipeline, WhisperTextStreamer;

// Load transformers from CDN using ESM
const initTransformers = async () => {
  try {
    const transformers = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.1.0/+esm');
    pipeline = transformers.pipeline;
    WhisperTextStreamer = transformers.WhisperTextStreamer;
  } catch (error) {
    console.error('Failed to import transformers:', error);
    // Send error back to main thread
    self.postMessage({
      status: "error",
      data: { message: "Failed to load AI model dependencies" }
    });
    throw error;
  }
};

class ASRPipelineFactory {
  static instance = null;
  static currentModel = null;

  static async getInstance(model, progressCallback) {
    if (this.instance === null || this.currentModel !== model) {
      if (this.instance) {
        try {
          await this.instance.dispose();
        } catch (error) {
          console.warn('Failed to dispose previous model instance:', error);
        }
        this.instance = null;
      }

      this.currentModel = model;
      this.instance = await pipeline("automatic-speech-recognition", model, {
        dtype: {
          encoder_model: model === "onnx-community/whisper-large-v3-turbo" ? "fp16" : "fp32",
          decoder_model_merged: "q4",
        },
        device: "webgpu",
        progress_callback: progressCallback,
      });
    }
    return this.instance;
  }
Comment on lines +27 to +49
⚠️ Potential issue

Make device configuration dynamic and improve dtype selection

The device is hardcoded to 'webgpu' (line 44), but WebGPU might not be available on all devices. The store detects hardware capabilities, yet that information isn't passed to the worker. The dtype selection also only checks for one specific model.

-  static async getInstance(model, progressCallback) {
+  static async getInstance(model, progressCallback, deviceConfig = { device: 'webgpu', dtype: 'fp32' }) {
     if (this.instance === null || this.currentModel !== model) {
       if (this.instance) {
         try {
           await this.instance.dispose();
         } catch (error) {
           console.warn('Failed to dispose previous model instance:', error);
         }
         this.instance = null;
       }

       this.currentModel = model;
+      
+      // Determine dtype based on model and device
+      const getDtype = (model, device) => {
+        const isLargeModel = model.includes('large') || model.includes('medium');
+        const canUseFp16 = device === 'webgpu' && !model.includes('whisper-base');
+        
+        return {
+          encoder_model: canUseFp16 ? 'fp16' : 'fp32',
+          decoder_model_merged: isLargeModel ? 'q4' : 'q8'
+        };
+      };
+      
       this.instance = await pipeline("automatic-speech-recognition", model, {
-        dtype: {
-          encoder_model: model === "onnx-community/whisper-large-v3-turbo" ? "fp16" : "fp32",
-          decoder_model_merged: "q4",
-        },
-        device: "webgpu",
+        dtype: deviceConfig.customDtype || getDtype(model, deviceConfig.device),
+        device: deviceConfig.device,
         progress_callback: progressCallback,
       });
     }
     return this.instance;
   }
🤖 Prompt for AI Agents
In apps/web/public/workers/speech-to-text.worker.js between lines 27 and 49, the
device is hardcoded to 'webgpu', which may not be supported on all devices, and
dtype selection only checks for one specific model. Modify the getInstance
method to accept device capability information as a parameter or retrieve it
dynamically, then use this to set the device option instead of hardcoding
'webgpu'. Also, enhance the dtype selection logic to handle multiple models or
make it configurable rather than only checking for
"onnx-community/whisper-large-v3-turbo".


  static dispose() {
    if (this.instance) {
      try {
        this.instance.dispose();
      } catch (error) {
        console.warn('Failed to dispose model instance:', error);
      }
      this.instance = null;
      this.currentModel = null;
    }
  }
}

// Handle incoming messages
self.addEventListener("message", async (event) => {
  const message = event.data;

  // Handle test connectivity message
  if (message.test) {
    self.postMessage({
      status: "update",
      data: { stage: "ready", progress: 100 }
    });
    return;
  }

  try {
    // Initialize transformers if not already done
    if (!pipeline || !WhisperTextStreamer) {
      await initTransformers();
    }

    const transcript = await transcribe(message);
    if (transcript === null) {
      self.postMessage({
        status: "error",
        data: { message: "Transcription returned null result" },
      });
      return;
    }

    self.postMessage({
      status: "complete",
      data: transcript,
    });
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : 'Unknown transcription error';
    self.postMessage({
      status: "error",
      data: { message: errorMessage },
    });
  }
});

const transcribe = async ({ audio, model, subtask, language }) => {
  if (!audio || audio.length === 0) {
    throw new Error('No audio data provided');
  }

  if (!model) {
    throw new Error('No model specified');
  }

  const isDistilWhisper = model.startsWith("distil-whisper/");
  const isEnglishOnly = model.includes('.en');

  try {
    // Load transcriber model with progress tracking
    const transcriber = await ASRPipelineFactory.getInstance(model, (data) => {
      // Forward progress updates to main thread
      if (data.status === 'downloading' || data.status === 'loading') {
        self.postMessage({
          status: "update",
          data: {
            stage: data.status,
            progress: data.progress || 0,
            file: data.file,
          },
        });
      }
    });

    if (!transcriber?.processor?.feature_extractor?.config) {
      throw new Error('Invalid transcriber configuration - missing feature extractor');
    }

    if (!transcriber.model?.config?.max_source_positions) {
      throw new Error('Invalid transcriber configuration - missing model config');
    }

    const time_precision =
      transcriber.processor.feature_extractor.config.chunk_length /
      transcriber.model.config.max_source_positions;

    // Initialize transcription state
    const chunks = [];
    const chunk_length_s = isDistilWhisper ? 20 : 30;
    const stride_length_s = isDistilWhisper ? 3 : 5;

    let chunk_count = 0;
    let start_time = null;
    let num_tokens = 0;
    let tps;

Comment on lines +151 to +154
⚠️ Potential issue

Initialize tps variable to avoid potential undefined value

The tps variable is declared without initialization and might remain undefined if no tokens are processed.

     let chunk_count = 0;
     let start_time = null;
     let num_tokens = 0;
-    let tps;
+    let tps = 0;
🤖 Prompt for AI Agents
In apps/web/public/workers/speech-to-text.worker.js around lines 151 to 154, the
variable tps is declared but not initialized, which can lead to it being
undefined if no tokens are processed. Initialize tps to a default value such as
0 to ensure it always has a defined numeric value.
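Until that initialization lands, the consuming side can also guard against a missing value. A hypothetical main-thread handler (the worker path and the updateTranscriptionStats hook are assumptions, not part of this diff):

// Hypothetical guard on the main thread: default a missing tokens-per-second value to 0
const sttWorker = new Worker("/workers/speech-to-text.worker.js");
sttWorker.addEventListener("message", (event) => {
  const { status, data } = event.data;
  if (status === "update" || status === "complete") {
    const tokensPerSecond = data.tps ?? 0; // undefined when no tokens were processed
    updateTranscriptionStats(tokensPerSecond); // assumed UI hook
  }
});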

    const streamer = new WhisperTextStreamer(transcriber.tokenizer, {
      time_precision,
      on_chunk_start: (x) => {
        const offset = (chunk_length_s - stride_length_s) * chunk_count;
        chunks.push({
          text: "",
          timestamp: [offset + x, null],
          finalised: false,
          offset,
        });
      },
      token_callback_function: () => {
        start_time ??= performance.now();
        if (num_tokens++ > 0) {
          tps = (num_tokens / (performance.now() - start_time)) * 1000;
        }
      },
      callback_function: (text) => {
        if (chunks.length === 0) return;

        const lastChunk = chunks[chunks.length - 1];
        if (lastChunk) {
          lastChunk.text += text;

          // Calculate progress based on processed chunks
          const estimatedProgress = Math.min(95, (chunk_count * 15) + (chunks.length * 5));

          self.postMessage({
            status: "update",
            data: {
              stage: "transcribing",
              progress: estimatedProgress,
              chunks: chunks.filter(c => c.finalised),
              currentText: lastChunk.text,
              tps,
            },
          });
        }
      },
      on_chunk_end: (x) => {
        const current = chunks[chunks.length - 1];
        if (current) {
          current.timestamp[1] = x + current.offset;
          current.finalised = true;
        }
      },
      on_finalize: () => {
        start_time = null;
        num_tokens = 0;
        ++chunk_count;
      },
    });

    // Configure transcription options
    const transcriptionOptions = {
      top_k: 0,
      do_sample: false,
      chunk_length_s,
      stride_length_s,
      return_timestamps: true,
      force_full_sequences: false,
      streamer,
    };

    // Add language and task for multilingual models
    if (!isEnglishOnly) {
      transcriptionOptions.language = language;
      transcriptionOptions.task = subtask;
    }

    // Run transcription
    const output = await transcriber(audio, transcriptionOptions);

    if (!output) {
      throw new Error('Transcription returned no output');
    }

    // Format and validate chunks
    const formattedChunks = chunks
      .filter(chunk => chunk.finalised && chunk.text.trim())
      .map(chunk => {
        const startTime = chunk.timestamp[0];
        const endTime = chunk.timestamp[1] || startTime;
        return {
          text: chunk.text.trim(),
          timestamp: [startTime, endTime]
        };
      });

    return {
      tps,
      text: output.text || '',
      chunks: formattedChunks,
    };

  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : 'Transcription failed';
    throw new Error(`Transcription error: ${errorMessage}`);
  }
};
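For context on how this worker is driven, a minimal usage sketch follows. The public worker path, the 16 kHz mono decoding step, and the model id are assumptions based on typical transformers.js Whisper usage, not taken from this diff.

// Hypothetical main-thread usage: decode audio to 16 kHz mono Float32Array samples
// (the input Whisper pipelines expect) and send it to the worker.
async function transcribeFile(file) {
  const worker = new Worker("/workers/speech-to-text.worker.js"); // assumed public path
  const buffer = await file.arrayBuffer();
  const audioContext = new AudioContext({ sampleRate: 16000 });
  const decoded = await audioContext.decodeAudioData(buffer); // resampled to 16 kHz
  const audio = decoded.getChannelData(0); // first channel as Float32Array

  worker.postMessage({
    audio,
    model: "onnx-community/whisper-base", // assumed model id
    subtask: "transcribe",
    language: "en",
  });

  return new Promise((resolve, reject) => {
    worker.addEventListener("message", (event) => {
      const { status, data } = event.data;
      if (status === "complete") resolve(data);
      if (status === "error") reject(new Error(data.message));
    });
  });
}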