extend whisper_segment with speaker_turn_next field and save in json output
ggml-org · ggerganov · Jul 4, 2023 · Jun 19, 2023 · Jun 19, 2023 · Jun 19, 2023
commit 713c5b61ad024f3da296539f25af95c97028dbad
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
@@ -566,6 +566,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
                 const char * text = whisper_full_get_segment_text(ctx, i);
                 const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
                 const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+                const bool speaker_turn_next = whisper_full_get_segment_speaker_turn_next(ctx, i);
 
                 start_obj(nullptr);
                     start_obj("timestamps");
@@ -576,11 +577,13 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
                         value_i("from", t0 * 10, false);
                         value_i("to", t1 * 10, true);
                     end_obj(false);
-                    value_s("text", text, !params.diarize);
+                    value_s("text", text, false); // never the last key now that "speaker_turn_next" always follows
 
                     if (params.diarize && pcmf32s.size() == 2) {
-                        value_s("speaker", estimate_diarization_speaker(pcmf32s, t0, t1, true).c_str(), true);
+                        value_s("speaker", estimate_diarization_speaker(pcmf32s, t0, t1, true).c_str(), false);
                     }
+                    // always written as the last key of the segment object, so the value_* calls above pass false as their last argument. TODO@Akash - make configurable with flag
+                    value_b("speaker_turn_next", speaker_turn_next, true);
                 end_obj(i == (n_segments - 1));
             }
 

diff --git a/whisper.cpp b/whisper.cpp
@@ -404,6 +404,8 @@ struct whisper_segment {
     std::string text;
 
     std::vector<whisper_token_data> tokens;
+
+    bool speaker_turn_next;
 };
 
 // medium
@@ -4510,18 +4512,24 @@ int whisper_full_with_state(
                 auto t0 = seek + 2*(tokens_cur.front().tid - whisper_token_beg(ctx));
 
                 std::string text;
+                bool speaker_turn_next = false; // must start false: it is copied into each pushed segment even when no solm token was seen
 
                 for (int i = 0; i < (int) tokens_cur.size(); i++) {
                     //printf("%s: %18s %6.3f %18s %6.3f\n", __func__,
                     //        ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p,
                     //        ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt);
 
                     if (params.print_special == false && tokens_cur[i].id >= whisper_token_eot(ctx) &&
-                        tokens_cur[i].id != whisper_token_solm(ctx)) {
+                        tokens_cur[i].id != whisper_token_solm(ctx)) {  // TODO@Akash - make configurable with flag (may not want it in text)
                     } else {
                         text += whisper_token_to_str(ctx, tokens_cur[i].id);
                     }
 
+                    // record if speaker turn was predicted after current segment
+                    if (tokens_cur[i].id == whisper_token_solm(ctx)){
+                        speaker_turn_next = true;
+                    }
+
                     if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
                         const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
 
@@ -4540,7 +4548,7 @@ int whisper_full_with_state(
 
                             //printf("tt0 = %d, tt1 = %d, text = %s, token = %s, token_id = %d, tid = %d\n", tt0, tt1, text.c_str(), ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].id, tokens_cur[i].tid);
 
-                            result_all.push_back({ tt0, tt1, text, {} });
+                            result_all.push_back({ tt0, tt1, text, {} , speaker_turn_next });
                             for (int j = i0; j <= i; j++) {
                                 result_all.back().tokens.push_back(tokens_cur[j]);
                             }
@@ -4566,6 +4574,7 @@ int whisper_full_with_state(
                         i--;
                         t0 = t1;
                         i0 = i + 1;
+                        speaker_turn_next = false;
                     }
                 }
 
@@ -4584,7 +4593,7 @@ int whisper_full_with_state(
                         }
                     }
 
-                    result_all.push_back({ tt0, tt1, text, {} });
+                    result_all.push_back({ tt0, tt1, text, {} , speaker_turn_next });
                     for (int j = i0; j < (int) tokens_cur.size(); j++) {
                         result_all.back().tokens.push_back(tokens_cur[j]);
                     }
@@ -4764,6 +4773,10 @@ int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment)
     return ctx->state->result_all[i_segment].t1;
 }
 
+bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment) { // accessor for the speaker_turn_next flag stored on a result segment
+    return ctx->state->result_all[i_segment].speaker_turn_next; // i_segment indexes result_all directly; no range check is done here
+}
+
 const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment) {
     return state->result_all[i_segment].text.c_str();
 }

diff --git a/whisper.h b/whisper.h
@@ -460,6 +460,9 @@ extern "C" {
     WHISPER_API int64_t whisper_full_get_segment_t1           (struct whisper_context * ctx, int i_segment);
     WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
 
+    // Get whether a speaker turn is predicted to follow the specified segment
+    WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment);
+
     // Get the text of the specified segment
     WHISPER_API const char * whisper_full_get_segment_text           (struct whisper_context * ctx, int i_segment);
     WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);