Skip to content

Commit fc0f409

Browse files
EliEron and jongwook
authored
Write each sentence as a separate line for the txt output (openai#101)
* Write each sentence as a separate line for the txt output

  Write each sentence as a separate line for the txt output

* Update utils.py

Co-authored-by: EliEron <[email protected]>
Co-authored-by: Jong Wook Kim <[email protected]>
1 parent 520796a commit fc0f409

File tree

2 files changed

+7
-2
lines changed

2 files changed

+7
-2
lines changed

whisper/transcribe.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from .audio import SAMPLE_RATE, N_FRAMES, HOP_LENGTH, pad_or_trim, log_mel_spectrogram
1111
from .decoding import DecodingOptions, DecodingResult
1212
from .tokenizer import LANGUAGES, TO_LANGUAGE_CODE, get_tokenizer
13-
from .utils import exact_div, format_timestamp, optional_int, optional_float, str2bool, write_vtt, write_srt
13+
from .utils import exact_div, format_timestamp, optional_int, optional_float, str2bool, write_txt, write_vtt, write_srt
1414

1515
if TYPE_CHECKING:
1616
from .model import Whisper
@@ -295,7 +295,7 @@ def cli():
295295

296296
# save TXT
297297
with open(os.path.join(output_dir, audio_basename + ".txt"), "w", encoding="utf-8") as txt:
298-
print(result["text"], file=txt)
298+
write_txt(result["segments"], file=txt)
299299

300300
# save VTT
301301
with open(os.path.join(output_dir, audio_basename + ".vtt"), "w", encoding="utf-8") as vtt:

whisper/utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@ def format_timestamp(seconds: float, always_include_hours: bool = False):
4444
return f"{hours_marker}{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
4545

4646

47+
def write_txt(transcript: Iterator[dict], file: TextIO):
    """Write a transcript as plain text, one segment per line.

    Args:
        transcript: Segments to write; each one is indexed with the
            ``"text"`` key to obtain the string for that line.
        file: Writable text stream that receives the output.

    Each segment's text has its leading and trailing whitespace removed
    before it is written, and the stream is flushed after every line.
    """
    for segment in transcript:
        # Strip so each segment starts cleanly at the beginning of its line.
        print(segment["text"].strip(), file=file, flush=True)
50+
51+
4752
def write_vtt(transcript: Iterator[dict], file: TextIO):
4853
print("WEBVTT\n", file=file)
4954
for segment in transcript:

0 commit comments

Comments
 (0)