Implementing Real-time Speech-to-Text Transcription using AWS Transcribe with Next.js API Routes #6655

10000leaves · 2024-11-13T06:23:24Z

10000leaves
Nov 13, 2024

I want to create an API for speech-to-text transcription using AWS SDK v3's AWS Transcribe with Next.js API Routes.

I tried to create the following API and hooks based on this reference code, but unfortunately, it's not working as expected. I would really appreciate any guidance.
https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/javascriptv3/example_code/cross-services/transcribe-streaming-app/src/libs/transcribeClient.js

Here's my current implementation that I'm struggling with.

package

"@aws-sdk/client-transcribe-streaming": "^3.693.0"
"microphone-stream": "^6.0.1"
"immutability-helper": "^3.1.1"
"buffer": "^6.0.3"

API

// src/app/api/transcribe/route.ts
import { NextRequest, NextResponse } from 'next/server';
import {
  TranscribeStreamingClient,
  StartStreamTranscriptionCommand,
  AudioStream,
} from '@aws-sdk/client-transcribe-streaming';

export const runtime = 'nodejs';

export async function POST(request: NextRequest) {
  let client: TranscribeStreamingClient | undefined;
  
  try {
    client = new TranscribeStreamingClient({
      region: process.env.NEXT_AWS_REGION,
      credentials: {
        accessKeyId: process.env.NEXT_AWS_TRANSCRIBE_ACCESS_KEY_ID!,
        secretAccessKey: process.env.NEXT_AWS_TRANSCRIBE_SECRET_ACCESS_KEY!,
      },
    });

    const { audioChunk } = await request.json();

    const audioStream: AsyncIterable<AudioStream> = {
      async *[Symbol.asyncIterator]() {
        yield {
          AudioEvent: {
            AudioChunk: Buffer.from(audioChunk)
          }
        };
      }
    };

    const command = new StartStreamTranscriptionCommand({
      LanguageCode: 'ja-JP',
      MediaEncoding: 'pcm',
      MediaSampleRateHertz: 44100,
      AudioStream: audioStream,
    });

    const response = await client.send(command);
    let transcriptResult = '';

    if (response.TranscriptResultStream) {
      for await (const event of response.TranscriptResultStream) {
        if (event.TranscriptEvent?.Transcript?.Results?.[0]) {
          const result = event.TranscriptEvent.Transcript.Results[0];
          if (result.Alternatives?.[0]?.Transcript) {
            transcriptResult = result.Alternatives[0].Transcript;
          }
        }
      }
    }

    return NextResponse.json({ transcript: transcriptResult });
  } catch (error) {
    console.error('Error in transcription:', error);
    return NextResponse.json(
      { error: 'Transcription failed', details: error },
      { status: 500 }
    );
  } finally {
    client?.destroy();
  }
}

hooks

// src/hooks/useTranscribe.ts
import MicrophoneStream from 'microphone-stream';
import { useState } from 'react';
import update from 'immutability-helper';
import { Buffer } from 'buffer';

const pcmEncodeChunk = (chunk: Buffer) => {
  const input = MicrophoneStream.toRaw(chunk);
  let offset = 0;
  const buffer = new ArrayBuffer(input.length * 2);
  const view = new DataView(buffer);
  for (let i = 0; i < input.length; i++, offset += 2) {
    const s = Math.max(-1, Math.min(1, input[i]));
    view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7fff, true);
  }
  return Buffer.from(buffer);
};

const useTranscribe = () => {
  const [micStream, setMicStream] = useState<MicrophoneStream | undefined>();
  const [recording, setRecording] = useState(false);
  const [error, setError] = useState(false);
  const [errorMessage, setErrorMessage] = useState<string>('');
  const [transcripts, setTranscripts] = useState<
    {
      isPartial: boolean;
      transcript: string;
    }[]
  >([]);

  const startStream = async (mic: MicrophoneStream) => {
    for await (const chunk of mic as unknown as Buffer[]) {
      try {
        const encodedChunk = pcmEncodeChunk(chunk);
        
        const response = await fetch('/api/transcribe', {
          method: 'POST',
          headers: {
            'Content-Type': 'application/json',
          },
          body: JSON.stringify({ 
            audioChunk: Array.from(encodedChunk)
          }),
        });

        if (!response.ok) {
          throw new Error('Transcription request failed');
        }

        const data = await response.json();
        
        if (data.transcript) {
          setTranscripts((prev) => {
            const index = prev.length - 1;
            if (prev.length === 0 || !prev[prev.length - 1].isPartial) {
              return update(prev, {
                $push: [{ isPartial: false, transcript: data.transcript }],
              });
            } else {
              return update(prev, {
                $splice: [[index, 1, { isPartial: false, transcript: data.transcript }]],
              });
            }
          });
        }
      } catch (e) {
        console.error('Error in transcription:', e);
        setError(true);
        setErrorMessage(e');
      }
    }
  };

  const startTranscription = async () => {
    const mic = new MicrophoneStream();
    try {
      setMicStream(mic);
      mic.setStream(
        await window.navigator.mediaDevices.getUserMedia({
          video: false,
          audio: {
            channelCount: 1,
            sampleRate: 44100,
          },
        })
      );
      
      setError(false);
      setErrorMessage('');
      setTranscripts([]);
      setRecording(true);
      await startStream(mic);
    } catch (e) {
      console.error('Error starting transcription:', e);
      setError(true);
      setErrorMessage(e);
    } finally {
      mic.stop();
      setRecording(false);
      setMicStream(undefined);
    }
  };

  const stopTranscription = () => {
    if (micStream) {
      micStream.stop();
      setRecording(false);
      setMicStream(undefined);
    }
  };

  const resetTranscripts = () => {
    setError(false);
    setErrorMessage('');
    setTranscripts([]);
  };

  return {
    startTranscription,
    stopTranscription,
    resetTranscripts,
    recording,
    transcripts,
    error,
    errorMessage,
  };
};

export default useTranscribe;

Could someone please help me understand what I'm doing wrong and how to implement this correctly?
I'm stuck and would really appreciate any assistance.

McVyp · 2025-04-11T07:27:41Z

McVyp
Apr 11, 2025

@10000leaves

I've added some changes which I'm not sure might be enough for your needs. Please let me know if I missed anything.

API

I checked out your code and it's looking good overall! The main thing I noticed is just how you're handling errors. Rather than sending back a 500 status (which triggers those annoying browser errors), I tweaked it to return a 200 status with a friendly message instead.

The key difference is in the catch block:

catch (error) {
    console.error('Error in transcription:', error);
    // Return a 200 status instead of 500 to suppress browser errors
    return NextResponse.json(
        {
            transcript: '',
            status: 'warning',
            message: 'Could not transcribe audio. This service only supports Japanese.'
        },
        { status: 200 }
    );
}

This approach prevents browser error messages and gives you more control over how errors are presented to users

// src/app/api/transcribe/route.ts
import { NextRequest, NextResponse } from 'next/server';
import {
    TranscribeStreamingClient,
    StartStreamTranscriptionCommand,
    AudioStream,
} from '@aws-sdk/client-transcribe-streaming';
export const runtime = 'nodejs';
export async function POST(request: NextRequest) {
    let client: TranscribeStreamingClient | undefined;

    try {
        client = new TranscribeStreamingClient({
            region: process.env.NEXT_AWS_REGION,
            credentials: {
                accessKeyId: process.env.NEXT_AWS_TRANSCRIBE_ACCESS_KEY_ID!,
                secretAccessKey: process.env.NEXT_AWS_TRANSCRIBE_SECRET_ACCESS_KEY!,
            },
        });
        const { audioChunk } = await request.json();
        const audioStream: AsyncIterable<AudioStream> = {
            async *[Symbol.asyncIterator]() {
                yield {
                    AudioEvent: {
                        AudioChunk: Buffer.from(audioChunk)
                    }
                };
            }
        };
        const command = new StartStreamTranscriptionCommand({
            LanguageCode: 'ja-JP',
            MediaEncoding: 'pcm',
            MediaSampleRateHertz: 44100,
            AudioStream: audioStream,
        });
        const response = await client.send(command);
        let transcriptResult = '';
        if (response.TranscriptResultStream) {
            for await (const event of response.TranscriptResultStream) {
                if (event.TranscriptEvent?.Transcript?.Results?.[0]) {
                    const result = event.TranscriptEvent.Transcript.Results[0];
                    if (result.Alternatives?.[0]?.Transcript) {
                        transcriptResult = result.Alternatives[0].Transcript;
                    }
                }
            }
        }
        return NextResponse.json({ transcript: transcriptResult });
    } catch (error) {
        console.error('Error in transcription:', error);
        // Return a 200 status instead of 500 to suppress browser errors
        return NextResponse.json(
            {
                transcript: '',
                status: 'warning',
                message: 'Could not transcribe audio. This service only supports Japanese.'
            },
            { status: 200 }
        );
    } finally {
        client?.destroy();
    }
}

Hook

Added `"use client"`

This change makes the file run on the browser side (client-side). This is necessary because the hook uses React's useState and accesses browser-specific APIs (like navigator.mediaDevices), which don't work on the server.

Fixed a Bug in the Error Handler

There was a syntax error in how the error message was set in the catch block. I fixed it to:

setErrorMessage(e instanceof Error ? e.message : String(e));

Code

// src/hooks/useTranscribe.ts
"use client"
import MicrophoneStream from 'microphone-stream';
import { useState } from 'react';
import update from 'immutability-helper';
import { Buffer } from 'buffer';

const pcmEncodeChunk = (chunk: Buffer) => {
    const input = MicrophoneStream.toRaw(chunk);
    let offset = 0;
    const buffer = new ArrayBuffer(input.length * 2);
    const view = new DataView(buffer);
    for (let i = 0; i < input.length; i++, offset += 2) {
        const s = Math.max(-1, Math.min(1, input[i]));
        view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7fff, true);
    }
    return Buffer.from(buffer);
};

const useTranscribe = () => {
    const [micStream, setMicStream] = useState<MicrophoneStream | undefined>();
    const [recording, setRecording] = useState(false);
    const [error, setError] = useState(false);
    const [errorMessage, setErrorMessage] = useState<string>('');
    const [transcripts, setTranscripts] = useState<
        {
            isPartial: boolean;
            transcript: string;
        }[]
    >([]);

    const startStream = async (mic: MicrophoneStream) => {
        for await (const chunk of mic as unknown as Buffer[]) {
            try {
                const encodedChunk = pcmEncodeChunk(chunk);

                const response = await fetch('/api/transcribe', {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/json',
                    },
                    body: JSON.stringify({
                        audioChunk: Array.from(encodedChunk)
                    }),
                });

                if (!response.ok) {
                    throw new Error('Transcription request failed');
                }

                const data = await response.json();

                if (data.transcript) {
                    setTranscripts((prev) => {
                        const index = prev.length - 1;
                        if (prev.length === 0 || !prev[prev.length - 1].isPartial) {
                            return update(prev, {
                                $push: [{ isPartial: false, transcript: data.transcript }],
                            });
                        } else {
                            return update(prev, {
                                $splice: [[index, 1, { isPartial: false, transcript: data.transcript }]],
                            });
                        }
                    });
                }
            } catch (e) {
                console.error('Error in transcription:', e);
                setError(true);
                setErrorMessage(e instanceof Error ? e.message : String(e));
            }
        }
    };

    const startTranscription = async () => {
        const mic = new MicrophoneStream();
        try {
            setMicStream(mic);
            mic.setStream(
                await window.navigator.mediaDevices.getUserMedia({
                    video: false,
                    audio: {
                        channelCount: 1,
                        sampleRate: 44100,
                    },
                })
            );

            setError(false);
            setErrorMessage('');
            setTranscripts([]);
            setRecording(true);
            await startStream(mic);
        } catch (e) {
            console.error('Error starting transcription:', e);
            setError(true);
            setErrorMessage(e instanceof Error ? e.message : String(e));
        } finally {
            mic.stop();
            setRecording(false);
            setMicStream(undefined);
        }
    };

    const stopTranscription = () => {
        if (micStream) {
            micStream.stop();
            setRecording(false);
            setMicStream(undefined);
        }
    };

    const resetTranscripts = () => {
        setError(false);
        setErrorMessage('');
        setTranscripts([]);
    };

    return {
        startTranscription,
        stopTranscription,
        resetTranscripts,
        recording,
        transcripts,
        error,
        errorMessage,
    };
};

export default useTranscribe;

0 replies

McVyp · 2025-04-11T08:15:23Z

McVyp
Apr 11, 2025

Update on Transcription Issue:

After implementing the initial changes, I encountered another issue with AWS Transcribe:

Error in transcription: BadRequestException: Your stream is too big. Reduce the frame size and try your request again.
    at async POST (src/app/api/translate/route.ts:39:29)

Solution:

API

// src/app/api/transcribe/route.ts
import { NextRequest, NextResponse } from 'next/server';
import {
    TranscribeStreamingClient,
    StartStreamTranscriptionCommand,
    AudioStream,
} from '@aws-sdk/client-transcribe-streaming';

export const runtime = 'nodejs';

const MAX_FRAME_SIZE = 4000;

export async function POST(request: NextRequest) {
    let client: TranscribeStreamingClient | undefined;

    try {
        client = new TranscribeStreamingClient({
            region: process.env.NEXT_AWS_REGION,
            credentials: {
                accessKeyId: process.env.NEXT_AWS_TRANSCRIBE_ACCESS_KEY_ID!,
                secretAccessKey: process.env.NEXT_AWS_TRANSCRIBE_SECRET_ACCESS_KEY!,
            },
            requestHandler: {
                timeoutInMs: 10000 // 10-second timeout on requests
            }
        });

        // Parse the audio chunk from the request
        const { audioChunk } = await request.json();
        const fullBuffer = Buffer.from(audioChunk);

        // Split the buffer into smaller chunks to avoid "stream too big" error
        const chunks: Buffer[] = [];
        for (let i = 0; i < fullBuffer.length; i += MAX_FRAME_SIZE) {
            chunks.push(fullBuffer.slice(i, i + MAX_FRAME_SIZE));
        }

        // Create an audio stream with proper error handling that sends smaller chunks
        const audioStream: AsyncIterable<AudioStream> = {
            async *[Symbol.asyncIterator]() {
                try {
                    for (const chunk of chunks) {
                        yield {
                            AudioEvent: {
                                AudioChunk: chunk
                            }
                        };
                    }
                } catch (err) {
                    console.error('Error in audio stream generator:', err);
                    throw err;
                }
            }
        };

        const command = new StartStreamTranscriptionCommand({
            LanguageCode: 'ja-JP',
            MediaEncoding: 'pcm',
            MediaSampleRateHertz: 44100,
            AudioStream: audioStream,
        });

        const response = await client.send(command);
        let transcriptResult = '';

        if (response.TranscriptResultStream) {
            try {
                for await (const event of response.TranscriptResultStream) {
                    if (event.TranscriptEvent?.Transcript?.Results?.[0]) {
                        const result = event.TranscriptEvent.Transcript.Results[0];
                        if (result.Alternatives?.[0]?.Transcript) {
                            transcriptResult = result.Alternatives[0].Transcript;
                        }
                    }
                }
            } catch (streamError) {
                console.error('Error processing transcript stream:', streamError);
                if (transcriptResult) {
                    return NextResponse.json({ transcript: transcriptResult });
                }
                throw streamError; 
            }
        }

        return NextResponse.json({ transcript: transcriptResult });
    } catch (error) {
        console.error('Error in transcription:', error);
        // Return a 200 status instead of 500 to suppress browser errors
        return NextResponse.json(
            {
                transcript: '',
                status: 'warning',
                message: 'Could not transcribe audio. This service only supports Japanese.'
            },
            { status: 200 }
        );
    } finally {
        if (client) {
            try {
                await client.destroy();
            } catch (destroyError) {
                console.error('Error destroying client:', destroyError);
            }
        }
    }
}

HOOK

// src/hooks/useTranscribe.ts
"use client"
import MicrophoneStream from 'microphone-stream';
import { useState, useRef } from 'react';
import update from 'immutability-helper';
import { Buffer } from 'buffer';

const pcmEncodeChunk = (chunk: Buffer) => {
    const input = MicrophoneStream.toRaw(chunk);
    let offset = 0;
    const buffer = new ArrayBuffer(input.length * 2);
    const view = new DataView(buffer);
    for (let i = 0; i < input.length; i++, offset += 2) {
        const s = Math.max(-1, Math.min(1, input[i]));
        view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7fff, true);
    }
    return Buffer.from(buffer);
};

const useTranscribe = () => {
    const [micStream, setMicStream] = useState<MicrophoneStream | undefined>();
    const [recording, setRecording] = useState(false);
    const [error, setError] = useState(false);
    const [errorMessage, setErrorMessage] = useState<string>('');
    const [transcripts, setTranscripts] = useState<
        {
            isPartial: boolean;
            transcript: string;
        }[]
    >([]);
    const abortControllerRef = useRef<AbortController | null>(null);

    const startStream = async (mic: MicrophoneStream) => {
        // Create a new AbortController for this stream session
        abortControllerRef.current = new AbortController();
        const signal = abortControllerRef.current.signal;

        let accumulatedChunks: Buffer[] = [];
        // Collection interval in milliseconds - adjust based on your needs
        // Shorter intervals mean more frequent API calls but less data per call
        const SEND_INTERVAL_MS = 500;
        let lastSendTime = Date.now();

        for await (const chunk of mic as unknown as Buffer[]) {
            // Check if the stream has been aborted
            if (signal.aborted) {
                break;
            }

            try {
                const encodedChunk = pcmEncodeChunk(chunk);

                // Add to accumulated buffer
                accumulatedChunks.push(encodedChunk);

                const currentTime = Date.now();
                // Only send if we've accumulated enough time or data
                if (currentTime - lastSendTime >= SEND_INTERVAL_MS) {
                    // Combine all accumulated chunks
                    const combinedChunk = Buffer.concat(accumulatedChunks);

                    const response = await fetch('/api/transcribe', {
                        method: 'POST',
                        headers: {
                            'Content-Type': 'application/json',
                        },
                        body: JSON.stringify({
                            audioChunk: Array.from(combinedChunk)
                        }),
                        signal, // Pass the abort signal to the fetch request
                    });

                    // Reset accumulation
                    accumulatedChunks = [];
                    lastSendTime = currentTime;

                    if (!response.ok) {
                        throw new Error(`Transcription request failed: ${response.status}`);
                    }

                    const data = await response.json();
                    if (data.status === 'warning') {
                        console.warn('Transcription warning:', data.message);
                    }

                    if (data.transcript) {
                        setTranscripts((prev) => {
                            const index = prev.length - 1;
                            if (prev.length === 0 || !prev[prev.length - 1].isPartial) {
                                return update(prev, {
                                    $push: [{ isPartial: false, transcript: data.transcript }],
                                });
                            } else {
                                return update(prev, {
                                    $splice: [[index, 1, { isPartial: false, transcript: data.transcript }]],
                                });
                            }
                        });
                    }
                }
            } catch (e) {
                if (e instanceof DOMException && e.name === 'AbortError') {
                    console.log('Fetch request aborted');
                    continue;
                }

                console.error('Error in transcription:', e);
                setError(true);
                setErrorMessage(e instanceof Error ? e.message : String(e));
                break;
            }
        }
    };

    const startTranscription = async () => {
        const mic = new MicrophoneStream();
        try {
            setMicStream(mic);
            const stream = await window.navigator.mediaDevices.getUserMedia({
                video: false,
                audio: {
                    channelCount: 1,
                    sampleRate: 44100,
                },
            });

            mic.setStream(stream);

            setError(false);
            setErrorMessage('');
            setTranscripts([]);
            setRecording(true);
            await startStream(mic);
        } catch (e) {
            console.error('Error starting transcription:', e);
            setError(true);
            setErrorMessage(e instanceof Error ? e.message : String(e));
        } finally {
            if (mic) {
                try {
                    mic.stop();
                } catch (stopError) {
                    console.error('Error stopping microphone:', stopError);
                }
            }
            setRecording(false);
            setMicStream(undefined);
        }
    };

    const stopTranscription = () => {
        if (abortControllerRef.current) {
            abortControllerRef.current.abort();
            abortControllerRef.current = null;
        }

        if (micStream) {
            micStream.stop();
            setRecording(false);
            setMicStream(undefined);
        }
    };

    const resetTranscripts = () => {
        setError(false);
        setErrorMessage('');
        setTranscripts([]);
    };

    return {
        startTranscription,
        stopTranscription,
        resetTranscripts,
        recording,
        transcripts,
        error,
        errorMessage,
    };
};

export default useTranscribe;

Key changes:
1. Added frame size limiting (MAX_FRAME_SIZE = 4000) on the API side
2. Implemented chunk accumulation with time-based intervals in the hook
3. Added proper error handling and AbortController for clean cancellation

1 reply

10000leaves Apr 14, 2025
Author

@McVyp
Thanks for making the change, I'll give it a try!

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Implementing Real-time Speech-to-Text Transcription using AWS Transcribe with Next.js API Routes #6655

{{title}}

{{editor}}'s edit

{{editor}}'s edit

Replies: 2 comments 1 reply

{{title}}

{{title}}

{{title}}

Select a reply

Implementing Real-time Speech-to-Text Transcription using AWS Transcribe with Next.js API Routes #6655

10000leaves Nov 13, 2024

package

API

hooks

Replies: 2 comments · 1 reply

McVyp Apr 11, 2025

API

Hook

Added "use client"

Fixed a Bug in the Error Handler

Code

McVyp Apr 11, 2025

Update on Transcription Issue:

Solution:

API

HOOK

10000leaves Apr 14, 2025 Author

10000leaves
Nov 13, 2024

Replies: 2 comments 1 reply

McVyp
Apr 11, 2025

Added `"use client"`

McVyp
Apr 11, 2025

10000leaves Apr 14, 2025
Author