@@ -56,62 +56,46 @@ def read_transcript_file(json_file):
5656 return json .load (file )
5757
5858
def process_transcript_segments(
    transcripts, video_id, max_chars=300, min_duration=0.2, max_duration=60.0
):
    """
    Processes individual transcript captions, filtering based on length and
    duration constraints.

    Args:
        transcripts (list): List of transcript dictionaries; only entries
            carrying "text", "start" and "duration" keys are considered.
        video_id (str): Video identifier for naming segments.
        max_chars (int): Maximum allowed length of the normalized caption text.
        min_duration (float): Minimum caption duration in seconds (inclusive).
        max_duration (float): Maximum caption duration in seconds (inclusive).

    Returns:
        list: List of processed transcript dictionaries meeting the criteria,
        each with VIDEO_NAME, SENTENCE_NAME, START_REALIGNED, END_REALIGNED
        and SENTENCE keys.
    """
    processed_segments = []
    segment_index = 0

    # Keep only entries that carry every field the filter below needs.
    valid_entries = [
        t for t in transcripts if "text" in t and "start" in t and "duration" in t
    ]
    if not valid_entries:
        print(f"No valid transcripts for video {video_id}")
        return processed_segments

    for entry in valid_entries:
        # Normalize the raw caption text before measuring its length.
        processed_text = normalize_text(entry["text"])

        # Filtering criteria: non-empty text within the character bound, and a
        # caption duration inside the [min_duration, max_duration] window.
        if (
            processed_text
            and len(processed_text) <= max_chars
            and min_duration <= entry["duration"] <= max_duration
        ):
            segment_data = {
                "VIDEO_NAME": video_id,
                "SENTENCE_NAME": f"{video_id}-{segment_index:03d}",
                "START_REALIGNED": entry["start"],
                "END_REALIGNED": entry["start"] + entry["duration"],
                "SENTENCE": processed_text,
            }
            processed_segments.append(segment_data)
            # Index advances only for emitted segments, keeping names dense.
            segment_index += 1

    return processed_segments
117101
@@ -130,6 +114,7 @@ def save_segments_to_csv(segment_data, csv_path):
130114
131115 df .to_csv (
132116 csv_path ,
117+ sep = "\t " ,
133118 mode = mode ,
134119 header = header ,
135120 index = False ,
0 commit comments