@@ -56,62 +56,46 @@ def read_transcript_file(json_file):
5656 return json .load (file )
5757
5858
def process_transcript_segments(
    transcripts, video_id, max_chars=300, min_duration=0.2, max_duration=60.0
):
    """
    Processes individual transcript captions, filtering based on length and
    duration constraints.

    Args:
        transcripts (list): List of transcript dictionaries; only entries
            carrying "text", "start" and "duration" keys are considered.
        video_id (str): Video identifier for naming segments.
        max_chars (int): Maximum allowed length of the normalized caption text.
        min_duration (float): Minimum caption duration in seconds (inclusive).
        max_duration (float): Maximum caption duration in seconds (inclusive).

    Returns:
        list: List of processed transcript dictionaries meeting the criteria,
        each with VIDEO_NAME, SENTENCE_NAME, START_REALIGNED, END_REALIGNED
        and SENTENCE keys.
    """
    processed_segments = []
    segment_index = 0

    # Keep only entries that carry every field the filter below needs.
    valid_entries = [
        t for t in transcripts if "text" in t and "start" in t and "duration" in t
    ]
    if not valid_entries:
        print(f"No valid transcripts for video {video_id}")
        return processed_segments

    for entry in valid_entries:
        # Normalize the raw caption text before measuring its length.
        processed_text = normalize_text(entry["text"])

        # Filtering criteria: non-empty text within the character bound, and a
        # caption duration inside the [min_duration, max_duration] window.
        if (
            processed_text
            and len(processed_text) <= max_chars
            and min_duration <= entry["duration"] <= max_duration
        ):
            segment_data = {
                "VIDEO_NAME": video_id,
                "SENTENCE_NAME": f"{video_id}-{segment_index:03d}",
                "START_REALIGNED": entry["start"],
                "END_REALIGNED": entry["start"] + entry["duration"],
                "SENTENCE": processed_text,
            }
            processed_segments.append(segment_data)
            # Index advances only for emitted segments, keeping names dense.
            segment_index += 1

    return processed_segments
117101
@@ -130,6 +114,7 @@ def save_segments_to_csv(segment_data, csv_path):
130114
131115 df .to_csv (
132116 csv_path ,
117+ sep = "\t " ,
133118 mode = mode ,
134119 header = header ,
135120 index = False ,
0 commit comments