
Commit 4927bf1

Refactor landmark normalization to use isotropic unit bounding box scaling
Changes:
- scripts/4_reduction_normalization.py: Implement isotropic unit cube normalization using a single scale factor across the x, y, z dimensions to preserve aspect ratios of signer motion
- configs/reduction_normalization.py: Update comments to document the isotropic scaling algorithm with a 5-step process
- scripts/3a_extract_mediapipe.py: Improve docstrings and inline comments for the MediaPipe extraction pipeline
- scripts/3b_extract_mmpose.py: Enhance MMPose extraction documentation
- configs/extract_mediapipe.py: Add clearer parameter descriptions
- src/asl_prep/extractors/mediapipe.py: Update extractor documentation
- src/asl_prep/extractors/mmpose.py: Enhance MMPose wrapper comments
- src/asl_prep/transcripts/preprocess.py: Clarify transcript processing
- requirements.txt: Update dependencies

Technical details:
- Old: Per-coordinate min-max scaling (distorts aspect ratios)
- New: Whole-clip 3D bounding box with max_range = max(x, y, z ranges)
- Result: Landmarks fit in a 1×1×1 cube, aspect ratios preserved
- Sentinel values (-999.0) for missing data remain unchanged
- All documentation updated to English with comprehensive explanations
1 parent b3684be commit 4927bf1
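To make the old-vs-new distinction concrete, here is a toy NumPy illustration (editorial note, not code from this commit) of how per-coordinate min-max scaling loses the aspect ratio that the new isotropic scaling keeps:

```python
import numpy as np

# Two keypoints whose x-range (0.4) is twice the y-range (0.2).
points = np.array([[0.1, 0.3], [0.5, 0.5]])
ranges = points.max(axis=0) - points.min(axis=0)           # [0.4, 0.2]

# Old: per-coordinate min-max -> both axes stretched to [0, 1];
# the 2:1 aspect ratio of the motion is lost.
per_coord = (points - points.min(axis=0)) / ranges          # [[0, 0], [1, 1]]

# New: divide every axis by the single largest range -> x spans 1.0,
# y spans 0.5; the aspect ratio is preserved inside the unit box.
isotropic = (points - points.min(axis=0)) / ranges.max()    # [[0, 0], [1, 0.5]]

print(per_coord, isotropic, sep="\n")
```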


9 files changed: +691, -101 lines

configs/extract_mediapipe.py

Lines changed: 4 additions & 9 deletions
@@ -44,13 +44,7 @@
 
 # Hand landmarks: All 21 keypoints per hand (wrist, palm, fingers)
 HAND_IDX = list(range(21))
-
-# Pose landmarks: Key upper body points for ASL recognition
-# Indices correspond to: shoulders, elbows, hips
 POSE_IDX = [11, 12, 13, 14, 23, 24]
-
-# Face landmarks: Selected facial features for expressions and mouth shapes
-# Total: 37 keypoints covering face contour, eyes, eyebrows, nose, and mouth
 FACE_IDX = [
     # Face contour and key points
     0, 4, 13, 14, 17, 33, 37, 39, 46, 52, 55, 61, 64, 81, 82, 93,
@@ -59,6 +53,7 @@
     294, 311, 323, 362, 386, 397, 468, 473
 ]
 
-# Total output dimension: 255 features
-# = 6 pose × 3 coords + 37 face × 3 coords + 21 left_hand × 3 coords + 21 right_hand × 3 coords
-# = 18 + 111 + 63 + 63 = 255
+# Total output dimensions per frame: 340 features (85 keypoints × 4)
+# = 6 pose + 37 face + 21 left_hand + 21 right_hand = 85 keypoints
+# = 85 keypoints × [x, y, z, visibility] = 340 features
+# Note: Reduction to 255 (85 × 3) happens in Step 4 after normalization
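As a quick sanity check on the counts documented above (editorial note, not part of the commit), the per-frame dimensions can be recomputed from the index lists in this config:

```python
# Keypoint counts mirroring configs/extract_mediapipe.py (FACE_IDX abbreviated to its length).
POSE_IDX = [11, 12, 13, 14, 23, 24]   # 6 pose keypoints
HAND_IDX = list(range(21))            # 21 keypoints per hand
NUM_FACE = 37                         # 37 selected face keypoints

num_keypoints = len(POSE_IDX) + NUM_FACE + 2 * len(HAND_IDX)  # 6 + 37 + 21 + 21 = 85
step3_features = num_keypoints * 4   # [x, y, z, visibility] -> 340 (raw extraction)
step4_features = num_keypoints * 3   # [x, y, z] -> 255 (after normalization)

print(num_keypoints, step3_features, step4_features)  # 85 340 255
```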

configs/reduction_normalization.py

Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
+"""Configuration for landmark reduction and normalization (scripts/4_reduction_normalization.py)"""
+import os
+
+# =============================================================================
+# PROJECT PATHS
+# =============================================================================
+
+# Base paths
+ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+# Input directory: raw landmarks from Step 3 (shape: seq_length, num_keypoint, 4)
+INPUT_DIR = os.path.join(ROOT, "dataset", "npy")
+
+# Output directory: normalized landmarks (shape: T, num_keypoint * num_coordinate)
+OUTPUT_DIR = os.path.join(ROOT, "dataset", "npy_normalized")
+
+# =============================================================================
+# PROCESSING CONFIGURATION
+# =============================================================================
+
+# Skip files that already exist in OUTPUT_DIR
+SKIP_EXISTING = True
+
+# Maximum number of worker processes for parallel processing
+# Set to 1 for debugging, increase based on available CPU cores
+MAX_WORKERS = 4
+
+# =============================================================================
+# VISIBILITY MASKING CONFIGURATION
+# =============================================================================
+
+# Enable/disable frame-level masking
+# When True: placeholder frames (all zeros) are set to UNVISIBLE_FRAME
+# Frame-level masking does not use VISIBILITY_THRESHOLD
+MASK_FRAME_LEVEL = True
+UNVISIBLE_FRAME = -999.0
+
+# Enable/disable landmark-level masking
+# When True: individual landmarks with low visibility are set to UNVISIBLE_LANDMARK
+# Requires VISIBILITY_THRESHOLD
+MASK_LANDMARK_LEVEL = True
+UNVISIBLE_LANDMARK = -999.0
+
+# Visibility threshold: landmarks with visibility < this value are masked
+# Only used when MASK_LANDMARK_LEVEL = True
+VISIBILITY_THRESHOLD = 0.3
+
+# =============================================================================
+# NORMALIZATION CONFIGURATION
+# =============================================================================
+
+# Whether to remove z-coordinate (reduces to 2D landmarks)
+# True: Output shape (T, 170) with only x, y coordinates
+# False: Output shape (T, 255) with x, y, z coordinates
+REMOVE_Z = False
+
+# Normalization method: Isotropic unit bounding box scaling
+# Computes single scale factor across x, y, z to preserve aspect ratios
+NORMALIZATION_METHOD = 'minmax'
+
+# =============================================================================
+# NOTES
+# =============================================================================
+
+# Paper methodology (YouTube-ASL):
+# 1. Use MediaPipe Holistic to extract 532 landmarks
+# 2. Reduce to 85 selected landmarks (done in Step 3)
+# 3. Normalize by scaling to fit in unit bounding box across clip duration
+# 4. Represent missing landmarks with large negative value (-999)
+# 5. Ignore visibility in final output (remove 4th dimension)
+# 6. Final output: (T, 255) where 255 = 85 keypoints × 3 coords
+
+# Normalization implementation:
+# - Clip-wise: Compute bounding box across entire video clip, not per-frame
+# - Isotropic: Single scale factor for x, y, z (preserves aspect ratios)
+# - Algorithm:
+#   1. Collect all valid 3D points (x, y, z) from entire clip
+#   2. Compute 3D bounding box: [min_x, min_y, min_z] to [max_x, max_y, max_z]
+#   3. Find max_range = max(x_range, y_range, z_range)
+#   4. Scale: p_norm = (p - coord_min) / max_range
+#   5. Result: Landmarks fit in 1×1×1 unit cube
+# - Sentinel values: UNVISIBLE_FRAME and UNVISIBLE_LANDMARK remain at -999.0
+#
+# Two-level masking system:
+# 1. Frame-level (MASK_FRAME_LEVEL): Entire frames with no detection → UNVISIBLE_FRAME
+# 2. Landmark-level (MASK_LANDMARK_LEVEL): Individual landmarks with low visibility → UNVISIBLE_LANDMARK
+# Both levels are independent and can be enabled/disabled separately
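A minimal NumPy sketch of the clip-wise isotropic scaling described in the NOTES above (editorial illustration; the function name and signature are not the actual code in scripts/4_reduction_normalization.py, and it assumes masking to -999.0 has already been applied):

```python
import numpy as np

SENTINEL = -999.0  # UNVISIBLE_FRAME / UNVISIBLE_LANDMARK

def normalize_clip_isotropic(landmarks: np.ndarray) -> np.ndarray:
    """Isotropic unit bounding box normalization over a whole clip.

    landmarks: (T, K, 3) array of x, y, z coordinates after masking,
    where missing entries are already set to SENTINEL.
    """
    coords = landmarks.astype(np.float64)
    valid = coords != SENTINEL  # keep sentinels untouched

    # 1-2. Collect all valid points and compute the clip-wide bounding box per axis.
    masked = np.where(valid, coords, np.nan)
    coord_min = np.nanmin(masked, axis=(0, 1))  # (3,)
    coord_max = np.nanmax(masked, axis=(0, 1))  # (3,)

    # 3. Single scale factor: the largest range across x, y, z.
    max_range = float(np.max(coord_max - coord_min))
    if not np.isfinite(max_range) or max_range <= 0:
        max_range = 1.0  # degenerate or empty clip; avoid division by zero

    # 4-5. Shift and scale valid points into the 1×1×1 unit cube,
    #      leaving sentinel values unchanged.
    normalized = (coords - coord_min) / max_range
    return np.where(valid, normalized, SENTINEL)
```

Dividing every axis by the same max_range is what keeps the signer's proportions intact, which per-axis min-max scaling would not.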

requirements.txt

Lines changed: 4 additions & 5 deletions
@@ -1,14 +1,13 @@
+torch==2.0.1
+torchaudio==2.0.2
+torchvision==0.15.2
 yt-dlp
 youtube-transcript-api
 tqdm
-numpy
+numpy==1.26.4
 pandas
 opencv-python
 mediapipe
 psutil
-mmpose==1.3.2
-mmcv==2.0.1
-mmdet==3.1.0
-mmengine==0.10.7
 ftfy==6.3.1
 matplotlib==3.9.4

scripts/3a_extract_mediapipe.py

Lines changed: 20 additions & 4 deletions
@@ -20,9 +20,14 @@
     - POSE_IDX, FACE_IDX, HAND_IDX: Landmark indices to extract
 
 Output Format:
-    NumPy arrays (.npy) with shape (T, 255) where:
+    NumPy arrays (.npy) with shape (T, 85, 4) where:
     - T: Number of frames
-    - 255: Flattened landmarks (6 pose + 37 face + 21 left_hand + 21 right_hand) × 3 coords
+    - 85: Total keypoints (6 pose + 37 face + 21 left_hand + 21 right_hand)
+    - 4: [x, y, z, visibility] per keypoint
+
+Note:
+    This is the raw landmark extraction. Normalization and visibility masking
+    are applied in Step 4 (scripts/4_reduction_normalization.py)
 """
 import os
 import sys
@@ -96,6 +101,11 @@ def process_video_segment(
     landmark_sequences = []
     current_frame = start_frame
 
+    # Number of landmarks: pose + face + left hand + right hand
+    num_landmarks = (
+        len(cfg.POSE_IDX) + len(cfg.FACE_IDX) + 2 * len(cfg.HAND_IDX)
+    )
+
     while current_frame <= end_frame:
         ret, frame = cap.read()
         if not ret:
@@ -104,8 +114,14 @@ def process_video_segment(
         # Use sampler to decide whether to process this frame
         if sampler.take():
             landmarks = extractor.process_frame(frame)
-            if landmarks is not None:
-                landmark_sequences.append(landmarks)
+
+            # If no landmarks are detected, append a placeholder
+            if landmarks is None:
+                landmarks = np.zeros(
+                    (num_landmarks, 4), dtype=np.float32
+                )
+
+            landmark_sequences.append(landmarks)
 
         current_frame += 1
 
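The placeholder logic above defers masking to Step 4; a rough sketch of the two-level masking it feeds into (hypothetical helper, assuming the thresholds and sentinels from configs/reduction_normalization.py, not the repository's actual code):

```python
import numpy as np

UNVISIBLE_FRAME = -999.0
UNVISIBLE_LANDMARK = -999.0
VISIBILITY_THRESHOLD = 0.3

def apply_masking(landmarks: np.ndarray) -> np.ndarray:
    """Two-level masking on raw (T, 85, 4) landmarks from Step 3.

    - Frame level: all-zero placeholder frames become UNVISIBLE_FRAME.
    - Landmark level: keypoints with visibility below the threshold
      become UNVISIBLE_LANDMARK.
    """
    masked = landmarks.astype(np.float32).copy()

    # Frame-level: a placeholder frame was stored as all zeros in Step 3.
    placeholder_frames = ~np.any(masked, axis=(1, 2))        # (T,)
    masked[placeholder_frames] = UNVISIBLE_FRAME

    # Landmark-level: use the 4th channel (visibility) as the criterion,
    # skipping frames that were already masked wholesale.
    low_visibility = masked[..., 3] < VISIBILITY_THRESHOLD   # (T, 85)
    low_visibility &= ~placeholder_frames[:, None]
    masked[low_visibility] = UNVISIBLE_LANDMARK

    return masked
```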

scripts/3b_extract_mmpose.py

Lines changed: 39 additions & 10 deletions
@@ -25,10 +25,14 @@
     - MAX_WORKERS: Number of parallel worker processes
 
 Output Format:
-    NumPy arrays (.npy) with shape:
-    - (T, 340) if ADD_VISIBLE=True: 85 keypoints × (x, y, z, visible)
-    - (T, 255) if ADD_VISIBLE=False: 85 keypoints × (x, y, z)
-    where T is the number of frames
+    NumPy arrays (.npy) with shape (T, 85, 4) where:
+    - T: Number of frames
+    - 85: Total keypoints (COCO-WholeBody subset)
+    - 4: [x, y, z, visibility] per keypoint
+
+Note:
+    This is the raw landmark extraction. Normalization and visibility masking
+    are applied in Step 4 (scripts/4_reduction_normalization.py)
 """
 import os
 import sys
@@ -53,7 +57,7 @@
 from src.asl_prep.pipeline.processor import read_manifest_csv, build_processing_tasks
 from src.asl_prep.common.files import get_video_filenames
 from src.asl_prep.common.video import FPSSampler
-from src.asl_prep.extractors.mmpose import MMPoseExtractor
+from src.asl_prep.extractors.mmpose import MMPoseExtractor, MultiPersonDetected
 
 try:
     from mmdet.apis import init_detector
@@ -163,21 +167,46 @@ def process_video_segment(
     landmark_sequences = []
     current_frame = start_frame
 
+    # Track whether multiple persons are detected in this segment
+    multi_person = False
+
+    # Number of landmarks equals the number of selected keypoints
+    num_landmarks = len(cfg.COCO_WHOLEBODY_IDX)
+
     while current_frame <= end_frame:
         ret, frame = cap.read()
        if not ret:
             break
 
         # Use sampler to decide whether to process this frame
         if sampler.take():
-            landmarks = extractor.process_frame(frame)
-            if landmarks is not None:
-                landmark_sequences.append(landmarks)
+            try:
+                landmarks = extractor.process_frame(frame)
+            except MultiPersonDetected as e:
+                logger.warning(
+                    f"Multiple persons detected in segment "
+                    f"{video_path} [{start_time:.3f}, {end_time:.3f}] - {e}. "
+                    f"Skipping this segment."
+                )
+                multi_person = True
+                break
+
+            # If no person is detected in this frame, append a placeholder
+            if landmarks is None:
+                landmarks = np.zeros(
+                    (num_landmarks, 4), dtype=np.float32
+                )
+
+            landmark_sequences.append(landmarks)
 
         current_frame += 1
 
-    # Save landmarks if valid data exists
-    if landmark_sequences:
+    if multi_person:
+        logger.info(
+            f"Segment skipped due to multiple persons: "
+            f"{video_path} [{start_time:.3f}, {end_time:.3f}]"
+        )
+    elif landmark_sequences:
         landmark_array = np.array(landmark_sequences)
         if landmark_array.size > 0 and np.any(landmark_array):
             os.makedirs(os.path.dirname(output_file), exist_ok=True)
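The segment loop above depends on MultiPersonDetected from src/asl_prep/extractors/mmpose.py, whose definition is not part of this diff. A hypothetical sketch of what such an exception and single-person check might look like (an assumption for illustration, not the repository's actual implementation):

```python
# Hypothetical sketch; the real definition lives in src/asl_prep/extractors/mmpose.py
# and may differ in name, message, and where the check is performed.
class MultiPersonDetected(Exception):
    """Raised when the person detector finds more than one person in a frame."""


def check_single_person(num_detections: int) -> None:
    # The extractor's process_frame() is assumed to run a check like this
    # after person detection, before keypoint estimation.
    if num_detections > 1:
        raise MultiPersonDetected(
            f"expected a single signer, found {num_detections} detections"
        )
```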
