balaboom123
diff --git a/‎configs/extract_mediapipe.py‎
Lines changed: 20 additions & 3 deletions b/‎configs/extract_mediapipe.py‎
Lines changed: 20 additions & 3 deletions
diff --git a/‎configs/extract_mmpose.py‎
Lines changed: 15 additions & 1 deletion b/‎configs/extract_mmpose.py‎
Lines changed: 15 additions & 1 deletion
diff --git a/‎configs/reduction_normalization.py‎
Lines changed: 27 additions & 4 deletions b/‎configs/reduction_normalization.py‎
Lines changed: 27 additions & 4 deletions
diff --git a/‎scripts/3a_extract_mediapipe.py‎
Lines changed: 25 additions & 7 deletions b/‎scripts/3a_extract_mediapipe.py‎
Lines changed: 25 additions & 7 deletions
diff --git a/‎scripts/3b_extract_mmpose.py‎
Lines changed: 20 additions & 4 deletions b/‎scripts/3b_extract_mmpose.py‎
Lines changed: 20 additions & 4 deletions
@@ -38,8 +38,21 @@
 # Adjust based on available CPU cores and memory
 MAX_WORKERS = 4
 
+# =============================================================================
+# KEYPOINT REDUCTION CONFIGURATION
+# =============================================================================
+
+# Whether to apply keypoint reduction during extraction (Step 3)
+# True:  Reduce to 85 selected keypoints (6 pose + 37 face + 42 hands)
+#        Output shape: (T, 85, 4) with [x, y, z, visibility]
+# False: Keep all MediaPipe Holistic landmarks (543 total keypoints)
+#        Output shape: (T, 543, 4) with [x, y, z, visibility]
+#        Keypoint reduction deferred to Step 4
+REDUCTION = True
+
 # =============================================================================
 # MEDIAPIPE LANDMARK INDICES (BlazePose Format)
+# Only used when REDUCTION=True
 # =============================================================================
 
 # Hand landmarks: All 21 keypoints per hand (wrist, palm, fingers)
@@ -53,7 +66,11 @@
     294, 311, 323, 362, 386, 397, 468, 473
 ]
 
-# Total output dimensions per frame: 340 features (85 keypoints × 4)
-# = 6 pose + 37 face + 21 left_hand + 21 right_hand = 85 keypoints
+# Total output dimensions per frame (when REDUCTION=True):
+# = 6 pose + 37 face + 21 left_hand + 21 right_hand = 85 keypoints
 # = 85 keypoints × [x, y, z, visibility] = 340 features
-# Note: Reduction to 255 (85 × 3) happens in Step 4 after normalization
+# Note: Visibility channel removal (85 × 3 = 255) happens in Step 4 after normalization
+#
+# When REDUCTION=False: 543 keypoints × 4 = 2172 features
+# MediaPipe Holistic provides: 33 pose + 468 face + 21 left_hand + 21 right_hand = 543 total
+# (Note: Actual count may vary based on MediaPipe version)
@@ -34,8 +34,21 @@
 # Note: Each worker loads models into GPU memory, adjust based on available GPU memory
 MAX_WORKERS = 4
 
+# =============================================================================
+# KEYPOINT REDUCTION CONFIGURATION
+# =============================================================================
+
+# Whether to apply keypoint reduction during extraction (Step 3)
+# True:  Reduce to 85 selected keypoints optimized for ASL
+#        Output shape: (T, 85, 4) with [x, y, z, visibility]
+# False: Keep all COCO-WholeBody keypoints (133 total)
+#        Output shape: (T, 133, 4) with [x, y, z, visibility]
+#        Keypoint reduction deferred to Step 4
+REDUCTION = False
+
 # =============================================================================
 # COCO-WHOLEBODY KEYPOINT SELECTION (85 keypoints total)
+# Only used when REDUCTION=True
 # =============================================================================
 
 # Selected keypoints from COCO-WholeBody format (133 total keypoints)
@@ -57,7 +70,8 @@
     71, 73, 75, 77, 79, 81, 83, 84, 85, 86, 87, 88, 89, 90,
 ] + list(range(91, 133))  # All hand landmarks (42 points): 21 left + 21 right
 
-# Total: 6 body + 37 face + 42 hands = 85 keypoints
+# Total output (when REDUCTION=True): 6 body + 37 face + 42 hands = 85 keypoints
+# When REDUCTION=False: All 133 COCO-WholeBody keypoints (17 body + 68 face + 42 hands + 6 feet)
 
 # =============================================================================
 # MODEL PATHS (RTMPose3D + RTMDet)
 
@@ -25,6 +25,24 @@
 # Set to 1 for debugging, increase based on available CPU cores
 MAX_WORKERS = 4
 
+# =============================================================================
+# KEYPOINT REDUCTION CONFIGURATION
+# =============================================================================
+
+# Whether to apply keypoint reduction during normalization (Step 4)
+# True:  Reduce input keypoints to specified indices (e.g., 133 → 85)
+#        Useful when Step 3 outputs all keypoints (REDUCTION=False in extraction)
+# False: Keep all keypoints from input (no reduction)
+#        Use when Step 3 already applied reduction (REDUCTION=True in extraction)
+REDUCTION = True
+
+# Keypoint indices to select when REDUCTION=True
+# None (default): Auto-select based on input keypoint count
+#   - For 133 keypoints (MMPose): Use COCO-WholeBody 85-keypoint subset
+#   - For 543 keypoints (MediaPipe): Use ASL-optimized 85-keypoint subset
+# List[int]: Custom indices to select (e.g., list(range(85)))
+KEYPOINT_INDICES = None
+
 # =============================================================================
 # VISIBILITY MASKING CONFIGURATION
 # =============================================================================
@@ -38,7 +56,7 @@
 # Enable/disable landmark-level masking
 # When True: individual landmarks with low visibility are set to UNVISIBLE_LANDMARK
 # Requires VISIBILITY_THRESHOLD
-MASK_LANDMARK_LEVEL = True
+MASK_LANDMARK_LEVEL = False
 UNVISIBLE_LANDMARK = -999.0
 
 # Visibility threshold: landmarks with visibility < this value are masked
@@ -54,9 +72,14 @@
 # False: Output shape (T, 255) with x, y, z coordinates
 REMOVE_Z = False
 
-# Normalization method: Isotropic unit bounding box scaling
-# Computes single scale factor across x, y, z to preserve aspect ratios
-NORMALIZATION_METHOD = 'minmax'
+# Normalization mode:
+# - "isotropic_3d": Original YouTube-ASL paper (one scale factor over x,y,z)
+#                   Good for MediaPipe Holistic with metric 3D consistency
+# - "xy_isotropic_z_minmax": Recommended for MMPose 3D (default)
+#                            xy: isotropic scaling in image plane
+#                            z:  per-clip min-max scaling for relative depth
+#                            Better for 2D pose + lifted depth without metric scale
+NORMALIZATION_MODE = "xy_isotropic_z_minmax"
 
 # =============================================================================
 # NOTES
 
@@ -20,9 +20,14 @@
     - POSE_IDX, FACE_IDX, HAND_IDX: Landmark indices to extract
 
 Output Format:
-    NumPy arrays (.npy) with shape (T, 85, 4) where:
+    NumPy arrays (.npy) with shape depending on REDUCTION config:
+    When REDUCTION=True (default):
+        Shape: (T, 85, 4)
+        85 keypoints: 6 pose + 37 face + 21 left_hand + 21 right_hand
+    When REDUCTION=False:
+        Shape: (T, 543, 4)
+        543 keypoints: All MediaPipe Holistic landmarks
     - T: Number of frames
-    - 85: Total keypoints (6 pose + 37 face + 21 left_hand + 21 right_hand)
     - 4: [x, y, z, visibility] per keypoint
 
 Note:
@@ -95,16 +100,24 @@ def process_video_segment(
         extractor = MediaPipeExtractor(
             pose_idx=cfg.POSE_IDX,
             face_idx=cfg.FACE_IDX,
-            hand_idx=cfg.HAND_IDX
+            hand_idx=cfg.HAND_IDX,
+            min_detection_confidence=0.3,
+            min_tracking_confidence=0.3,
+            apply_reduction=cfg.REDUCTION,
         )
 
         landmark_sequences = []
         current_frame = start_frame
 
-        # Number of landmarks: pose + face + left hand + right hand
-        num_landmarks = (
-            len(cfg.POSE_IDX) + len(cfg.FACE_IDX) + 2 * len(cfg.HAND_IDX)
-        )
+        # Number of landmarks depends on REDUCTION setting
+        if cfg.REDUCTION:
+            # Reduced keypoints: pose + face + left hand + right hand
+            num_landmarks = (
+                len(cfg.POSE_IDX) + len(cfg.FACE_IDX) + 2 * len(cfg.HAND_IDX)
+            )
+        else:
+            # All MediaPipe Holistic keypoints: 33 pose + 468 face + 21*2 hands
+            num_landmarks = 543  # Total MediaPipe Holistic landmarks
 
         while current_frame <= end_frame:
             ret, frame = cap.read()
@@ -198,6 +211,11 @@ def main():
 
     logger.info(f"Found {len(video_files)} video files")
     logger.info(f"Configuration:")
+    logger.info(f"  - Keypoint reduction: {cfg.REDUCTION}")
+    if cfg.REDUCTION:
+        logger.info(f"  - Output keypoints: 85 (ASL-optimized subset)")
+    else:
+        logger.info(f"  - Output keypoints: 543 (all MediaPipe Holistic landmarks)")
     logger.info(f"  - FPS reduction: {cfg.REDUCE_FPS_TO}")
     logger.info(f"  - Frame skip: {cfg.FRAME_SKIP}")
     logger.info(f"  - FPS range filter: {cfg.ACCEPT_VIDEO_FPS_WITHIN}")
 
@@ -25,9 +25,14 @@
     - MAX_WORKERS: Number of parallel worker processes
 
 Output Format:
-    NumPy arrays (.npy) with shape (T, 85, 4) where:
+    NumPy arrays (.npy) with shape depending on REDUCTION config:
+    When REDUCTION=True:
+        Shape: (T, 85, 4)
+        85 keypoints: ASL-optimized COCO-WholeBody subset
+    When REDUCTION=False (default):
+        Shape: (T, 133, 4)
+        133 keypoints: All COCO-WholeBody keypoints
     - T: Number of frames
-    - 85: Total keypoints (COCO-WholeBody subset)
     - 4: [x, y, z, visibility] per keypoint
 
 Note:
@@ -162,6 +167,7 @@ def process_video_segment(
             bbox_threshold=cfg.BBOX_THR,
             det_cat_id=cfg.DET_CAT_ID,
             add_visible=cfg.ADD_VISIBLE,
+            apply_reduction=cfg.REDUCTION,
         )
 
         landmark_sequences = []
@@ -170,8 +176,13 @@ def process_video_segment(
         # Track whether multiple persons are detected in this segment
         multi_person = False
 
-        # Number of landmarks equals the number of selected keypoints
-        num_landmarks = len(cfg.COCO_WHOLEBODY_IDX)
+        # Number of landmarks depends on REDUCTION setting
+        if cfg.REDUCTION:
+            # Reduced keypoints: ASL-optimized subset
+            num_landmarks = len(cfg.COCO_WHOLEBODY_IDX)
+        else:
+            # All COCO-WholeBody keypoints
+            num_landmarks = 133
 
         while current_frame <= end_frame:
             ret, frame = cap.read()
@@ -286,6 +297,11 @@ def main():
 
     logger.info(f"Found {len(video_files)} video files")
     logger.info(f"Configuration:")
+    logger.info(f"  - Keypoint reduction: {cfg.REDUCTION}")
+    if cfg.REDUCTION:
+        logger.info(f"  - Output keypoints: 85 (ASL-optimized subset)")
+    else:
+        logger.info(f"  - Output keypoints: 133 (all COCO-WholeBody)")
     logger.info(f"  - FPS reduction: {cfg.REDUCE_FPS_TO}")
     logger.info(f"  - Frame skip: {cfg.FRAME_SKIP}")
     logger.info(f"  - FPS range filter: {cfg.ACCEPT_VIDEO_FPS_WITHIN}")