Implementing Real-time Speech-to-Text Transcription using AWS Transcribe with Next.js API Routes #6655
-
I've added some changes which I'm not sure are enough for your needs. Please let me know if I missed anything.

I checked out your code and it's looking good overall! The main thing I noticed is how you're handling errors. Rather than sending back a 500 status (which triggers those annoying browser errors), I tweaked it to return a 200 status with a friendly message instead. The key difference is in the catch block:

```ts
catch (error) {
  console.error('Error in transcription:', error);
  // Return a 200 status instead of 500 to suppress browser errors
  return NextResponse.json(
    {
      transcript: '',
      status: 'warning',
      message: 'Could not transcribe audio. This service only supports Japanese.'
    },
    { status: 200 }
  );
}
```

This approach prevents browser error messages and gives you more control over how errors are presented to users.

**API**

```ts
// src/app/api/transcribe/route.ts
import { NextRequest, NextResponse } from 'next/server';
import {
  TranscribeStreamingClient,
  StartStreamTranscriptionCommand,
  AudioStream,
} from '@aws-sdk/client-transcribe-streaming';

export const runtime = 'nodejs';

export async function POST(request: NextRequest) {
  let client: TranscribeStreamingClient | undefined;
  try {
    client = new TranscribeStreamingClient({
      region: process.env.NEXT_AWS_REGION,
      credentials: {
        accessKeyId: process.env.NEXT_AWS_TRANSCRIBE_ACCESS_KEY_ID!,
        secretAccessKey: process.env.NEXT_AWS_TRANSCRIBE_SECRET_ACCESS_KEY!,
      },
    });

    const { audioChunk } = await request.json();

    // Wrap the single audio chunk in an async iterable, as the SDK expects
    const audioStream: AsyncIterable<AudioStream> = {
      async *[Symbol.asyncIterator]() {
        yield {
          AudioEvent: {
            AudioChunk: Buffer.from(audioChunk),
          },
        };
      },
    };

    const command = new StartStreamTranscriptionCommand({
      LanguageCode: 'ja-JP',
      MediaEncoding: 'pcm',
      MediaSampleRateHertz: 44100,
      AudioStream: audioStream,
    });

    const response = await client.send(command);

    let transcriptResult = '';
    if (response.TranscriptResultStream) {
      for await (const event of response.TranscriptResultStream) {
        if (event.TranscriptEvent?.Transcript?.Results?.[0]) {
          const result = event.TranscriptEvent.Transcript.Results[0];
          if (result.Alternatives?.[0]?.Transcript) {
            transcriptResult = result.Alternatives[0].Transcript;
          }
        }
      }
    }

    return NextResponse.json({ transcript: transcriptResult });
  } catch (error) {
    console.error('Error in transcription:', error);
    // Return a 200 status instead of 500 to suppress browser errors
    return NextResponse.json(
      {
        transcript: '',
        status: 'warning',
        message: 'Could not transcribe audio. This service only supports Japanese.'
      },
      { status: 200 }
    );
  } finally {
    client?.destroy();
  }
}
```
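One note for the client side: since failures now come back as a 200 with `status: 'warning'` in the body, the caller has to check that field explicitly rather than relying on `response.ok`. A minimal sketch of what that looks like (the `requestTranscript` helper is hypothetical, just to illustrate the branch):

```ts
// Hypothetical client-side helper: treats the 200-with-warning shape
// returned by the route above as a soft failure instead of throwing.
async function requestTranscript(audioChunk: number[]): Promise<string> {
  const res = await fetch('/api/transcribe', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ audioChunk }),
  });
  const data = await res.json();
  if (data.status === 'warning') {
    // Surface the friendly message instead of a browser-level error
    console.warn('Transcription warning:', data.message);
    return '';
  }
  return data.transcript ?? '';
}
```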
-
**Update on the transcription issue:** After implementing the initial changes, I ran into another problem with AWS Transcribe: sending the whole accumulated buffer as a single audio event was rejected with a "stream too big" error.
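A quick back-of-the-envelope check shows why, using the settings from this thread (44.1 kHz, 16-bit mono PCM, 500 ms batches from the hook); the constants below just mirror those values:

```ts
// Rough size estimate for one batch of audio sent to the API,
// assuming the sample rate, encoding, and send interval used in this thread.
const SAMPLE_RATE_HZ = 44100;
const BYTES_PER_SAMPLE = 2; // 16-bit PCM
const SEND_INTERVAL_MS = 500;
const MAX_FRAME_SIZE = 4000;

const bytesPerBatch = SAMPLE_RATE_HZ * BYTES_PER_SAMPLE * (SEND_INTERVAL_MS / 1000);
console.log(bytesPerBatch); // 44100 bytes per 500 ms batch
console.log(Math.ceil(bytesPerBatch / MAX_FRAME_SIZE)); // ~12 frames of 4000 bytes
```

So the fix is to split each incoming payload into frames of at most `MAX_FRAME_SIZE` bytes before yielding them to the stream.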
**Solution:**

**API**

```ts
// src/app/api/transcribe/route.ts
import { NextRequest, NextResponse } from 'next/server';
import {
  TranscribeStreamingClient,
  StartStreamTranscriptionCommand,
  AudioStream,
} from '@aws-sdk/client-transcribe-streaming';

export const runtime = 'nodejs';

const MAX_FRAME_SIZE = 4000;

export async function POST(request: NextRequest) {
  let client: TranscribeStreamingClient | undefined;
  try {
    client = new TranscribeStreamingClient({
      region: process.env.NEXT_AWS_REGION,
      credentials: {
        accessKeyId: process.env.NEXT_AWS_TRANSCRIBE_ACCESS_KEY_ID!,
        secretAccessKey: process.env.NEXT_AWS_TRANSCRIBE_SECRET_ACCESS_KEY!,
      },
      requestHandler: {
        requestTimeout: 10000, // 10-second timeout on requests
      },
    });

    // Parse the audio chunk from the request
    const { audioChunk } = await request.json();
    const fullBuffer = Buffer.from(audioChunk);

    // Split the buffer into smaller chunks to avoid the "stream too big" error
    const chunks: Buffer[] = [];
    for (let i = 0; i < fullBuffer.length; i += MAX_FRAME_SIZE) {
      chunks.push(fullBuffer.slice(i, i + MAX_FRAME_SIZE));
    }

    // Create an audio stream with error handling that sends the smaller chunks
    const audioStream: AsyncIterable<AudioStream> = {
      async *[Symbol.asyncIterator]() {
        try {
          for (const chunk of chunks) {
            yield {
              AudioEvent: {
                AudioChunk: chunk,
              },
            };
          }
        } catch (err) {
          console.error('Error in audio stream generator:', err);
          throw err;
        }
      },
    };

    const command = new StartStreamTranscriptionCommand({
      LanguageCode: 'ja-JP',
      MediaEncoding: 'pcm',
      MediaSampleRateHertz: 44100,
      AudioStream: audioStream,
    });

    const response = await client.send(command);

    let transcriptResult = '';
    if (response.TranscriptResultStream) {
      try {
        for await (const event of response.TranscriptResultStream) {
          if (event.TranscriptEvent?.Transcript?.Results?.[0]) {
            const result = event.TranscriptEvent.Transcript.Results[0];
            if (result.Alternatives?.[0]?.Transcript) {
              transcriptResult = result.Alternatives[0].Transcript;
            }
          }
        }
      } catch (streamError) {
        console.error('Error processing transcript stream:', streamError);
        // Return whatever partial transcript was collected before the failure
        if (transcriptResult) {
          return NextResponse.json({ transcript: transcriptResult });
        }
        throw streamError;
      }
    }

    return NextResponse.json({ transcript: transcriptResult });
  } catch (error) {
    console.error('Error in transcription:', error);
    // Return a 200 status instead of 500 to suppress browser errors
    return NextResponse.json(
      {
        transcript: '',
        status: 'warning',
        message: 'Could not transcribe audio. This service only supports Japanese.'
      },
      { status: 200 }
    );
  } finally {
    if (client) {
      try {
        client.destroy();
      } catch (destroyError) {
        console.error('Error destroying client:', destroyError);
      }
    }
  }
}
```

**HOOK**

```ts
// src/hooks/useTranscribe.ts
"use client"
import MicrophoneStream from 'microphone-stream';
import { useState, useRef } from 'react';
import update from 'immutability-helper';
import { Buffer } from 'buffer';
const pcmEncodeChunk = (chunk: Buffer) => {
const input = MicrophoneStream.toRaw(chunk);
let offset = 0;
const buffer = new ArrayBuffer(input.length * 2);
const view = new DataView(buffer);
for (let i = 0; i < input.length; i++, offset += 2) {
const s = Math.max(-1, Math.min(1, input[i]));
view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7fff, true);
}
return Buffer.from(buffer);
};
const useTranscribe = () => {
const [micStream, setMicStream] = useState<MicrophoneStream | undefined>();
const [recording, setRecording] = useState(false);
const [error, setError] = useState(false);
const [errorMessage, setErrorMessage] = useState<string>('');
const [transcripts, setTranscripts] = useState<
{
isPartial: boolean;
transcript: string;
}[]
>([]);
const abortControllerRef = useRef<AbortController | null>(null);
const startStream = async (mic: MicrophoneStream) => {
// Create a new AbortController for this stream session
abortControllerRef.current = new AbortController();
const signal = abortControllerRef.current.signal;
let accumulatedChunks: Buffer[] = [];
// Collection interval in milliseconds - adjust based on your needs
// Shorter intervals mean more frequent API calls but less data per call
const SEND_INTERVAL_MS = 500;
let lastSendTime = Date.now();
for await (const chunk of mic as unknown as Buffer[]) {
// Check if the stream has been aborted
if (signal.aborted) {
break;
}
try {
const encodedChunk = pcmEncodeChunk(chunk);
// Add to accumulated buffer
accumulatedChunks.push(encodedChunk);
const currentTime = Date.now();
// Only send if we've accumulated enough time or data
if (currentTime - lastSendTime >= SEND_INTERVAL_MS) {
// Combine all accumulated chunks
const combinedChunk = Buffer.concat(accumulatedChunks);
const response = await fetch('/api/transcribe', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
audioChunk: Array.from(combinedChunk)
}),
signal, // Pass the abort signal to the fetch request
});
// Reset accumulation
accumulatedChunks = [];
lastSendTime = currentTime;
if (!response.ok) {
throw new Error(`Transcription request failed: ${response.status}`);
}
const data = await response.json();
if (data.status === 'warning') {
console.warn('Transcription warning:', data.message);
}
if (data.transcript) {
setTranscripts((prev) => {
const index = prev.length - 1;
if (prev.length === 0 || !prev[prev.length - 1].isPartial) {
return update(prev, {
$push: [{ isPartial: false, transcript: data.transcript }],
});
} else {
return update(prev, {
$splice: [[index, 1, { isPartial: false, transcript: data.transcript }]],
});
}
});
}
}
} catch (e) {
if (e instanceof DOMException && e.name === 'AbortError') {
console.log('Fetch request aborted');
continue;
}
console.error('Error in transcription:', e);
setError(true);
setErrorMessage(e instanceof Error ? e.message : String(e));
break;
}
}
};
const startTranscription = async () => {
const mic = new MicrophoneStream();
try {
setMicStream(mic);
const stream = await window.navigator.mediaDevices.getUserMedia({
video: false,
audio: {
channelCount: 1,
sampleRate: 44100,
},
});
mic.setStream(stream);
setError(false);
setErrorMessage('');
setTranscripts([]);
setRecording(true);
await startStream(mic);
} catch (e) {
console.error('Error starting transcription:', e);
setError(true);
setErrorMessage(e instanceof Error ? e.message : String(e));
} finally {
if (mic) {
try {
mic.stop();
} catch (stopError) {
console.error('Error stopping microphone:', stopError);
}
}
setRecording(false);
setMicStream(undefined);
}
};
const stopTranscription = () => {
if (abortControllerRef.current) {
abortControllerRef.current.abort();
abortControllerRef.current = null;
}
if (micStream) {
micStream.stop();
setRecording(false);
setMicStream(undefined);
}
};
const resetTranscripts = () => {
setError(false);
setErrorMessage('');
setTranscripts([]);
};
return {
startTranscription,
stopTranscription,
resetTranscripts,
recording,
transcripts,
error,
errorMessage,
};
};
export default useTranscribe;
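In case it's useful, here's a minimal sketch of wiring the hook into a component. Only the hook's return values come from the code above; the `TranscribePanel` name and the `@/hooks/useTranscribe` import path are assumptions for illustration:

```tsx
// Hypothetical component consuming the useTranscribe hook above.
"use client"

import useTranscribe from '@/hooks/useTranscribe';

export default function TranscribePanel() {
  const {
    startTranscription,
    stopTranscription,
    resetTranscripts,
    recording,
    transcripts,
    error,
    errorMessage,
  } = useTranscribe();

  return (
    <div>
      <button onClick={recording ? stopTranscription : startTranscription}>
        {recording ? 'Stop' : 'Start'}
      </button>
      <button onClick={resetTranscripts}>Reset</button>
      {error && <p>Error: {errorMessage}</p>}
      <ul>
        {transcripts.map((t, i) => (
          <li key={i}>{t.transcript}</li>
        ))}
      </ul>
    </div>
  );
}
```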
-
I want to create an API for speech-to-text transcription using AWS SDK v3's AWS Transcribe with Next.js API Routes.
I tried to create the following API and hooks based on this reference code, but unfortunately, it's not working as expected. I would really appreciate any guidance.
https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/javascriptv3/example_code/cross-services/transcribe-streaming-app/src/libs/transcribeClient.js
Here's my current implementation that I'm struggling with.
**package**

**API**

**hooks**
Could someone please help me understand what I'm doing wrong and how to implement this correctly?
I'm stuck and would really appreciate any assistance.