Skip to content

Commit 9966847

Browse files
committed
Add configurable keypoint reduction with dual normalization modes
Changes: - Add REDUCTION flag to extraction configs to defer keypoint selection to normalization stage - Add KEYPOINT_INDICES parameter to reduction config for flexible keypoint selection - Implement dual normalization modes: isotropic_3d (MediaPipe) and xy_isotropic_z_minmax (MMPose) - Update MediaPipe extraction to support full 532-keypoint output when REDUCTION=False - Update MMPose extraction to support full 133-keypoint COCO-WholeBody output when REDUCTION=False - Refactor normalization stage to handle variable input keypoint counts (85/133/532) - Set MMPose default to REDUCTION=False and MediaPipe default to REDUCTION=True - Update visibility masking: disable landmark-level masking by default - Add auto-detection of keypoint indices based on input shape in normalization
1 parent 4927bf1 commit 9966847

File tree

8 files changed

+466
-118
lines changed

8 files changed

+466
-118
lines changed

configs/extract_mediapipe.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,21 @@
3838
# Adjust based on available CPU cores and memory
3939
MAX_WORKERS = 4
4040

41+
# =============================================================================
42+
# KEYPOINT REDUCTION CONFIGURATION
43+
# =============================================================================
44+
45+
# Whether to apply keypoint reduction during extraction (Step 3)
46+
# True: Reduce to 85 selected keypoints (6 pose + 37 face + 42 hands)
47+
# Output shape: (T, 85, 4) with [x, y, z, visibility]
48+
# False: Keep all MediaPipe Holistic landmarks (543 total keypoints)
49+
# Output shape: (T, 543, 4) with [x, y, z, visibility]
50+
# Keypoint reduction deferred to Step 4
51+
REDUCTION = True
52+
4153
# =============================================================================
4254
# MEDIAPIPE LANDMARK INDICES (BlazePose Format)
55+
# Only used when REDUCTION=True
4356
# =============================================================================
4457

4558
# Hand landmarks: All 21 keypoints per hand (wrist, palm, fingers)
@@ -53,7 +66,11 @@
5366
294, 311, 323, 362, 386, 397, 468, 473
5467
]
5568

56-
# Total output dimensions per frame: 340 features (85 keypoints × 4)
57-
# = 6 pose + 37 face + 21 left_hand + 21 right_hand = 85 keypoints
69+
# Total output dimensions per frame (when REDUCTION=True):
70+
# = 6 pose + 37 face + 21 left_hand + 21 right_hand = 85 keypoints
5871
# = 85 keypoints × [x, y, z, visibility] = 340 features
59-
# Note: Reduction to 255 (85 × 3) happens in Step 4 after normalization
72+
# Note: Visibility channel removal (85 × 3 = 255) happens in Step 4 after normalization
73+
#
74+
# When REDUCTION=False: 543 keypoints × 4 = 2172 features
75+
# MediaPipe Holistic provides: 33 pose + 468 face + 21 left_hand + 21 right_hand = 543 total
76+
# (Note: Actual count may vary based on MediaPipe version)

configs/extract_mmpose.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,21 @@
3434
# Note: Each worker loads models into GPU memory, adjust based on available GPU memory
3535
MAX_WORKERS = 4
3636

37+
# =============================================================================
38+
# KEYPOINT REDUCTION CONFIGURATION
39+
# =============================================================================
40+
41+
# Whether to apply keypoint reduction during extraction (Step 3)
42+
# True: Reduce to 85 selected keypoints optimized for ASL
43+
# Output shape: (T, 85, 4) with [x, y, z, visibility]
44+
# False: Keep all COCO-WholeBody keypoints (133 total)
45+
# Output shape: (T, 133, 4) with [x, y, z, visibility]
46+
# Keypoint reduction deferred to Step 4
47+
REDUCTION = False
48+
3749
# =============================================================================
3850
# COCO-WHOLEBODY KEYPOINT SELECTION (85 keypoints total)
51+
# Only used when REDUCTION=True
3952
# =============================================================================
4053

4154
# Selected keypoints from COCO-WholeBody format (133 total keypoints)
@@ -57,7 +70,8 @@
5770
71, 73, 75, 77, 79, 81, 83, 84, 85, 86, 87, 88, 89, 90,
5871
] + list(range(91, 133)) # All hand landmarks (42 points): 21 left + 21 right
5972

60-
# Total: 6 body + 37 face + 42 hands = 85 keypoints
73+
# Total output (when REDUCTION=True): 6 body + 37 face + 42 hands = 85 keypoints
74+
# When REDUCTION=False: All 133 COCO-WholeBody keypoints (17 body + 68 face + 42 hands + 6 feet)
6175

6276
# =============================================================================
6377
# MODEL PATHS (RTMPose3D + RTMDet)

configs/reduction_normalization.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,24 @@
2525
# Set to 1 for debugging, increase based on available CPU cores
2626
MAX_WORKERS = 4
2727

28+
# =============================================================================
29+
# KEYPOINT REDUCTION CONFIGURATION
30+
# =============================================================================
31+
32+
# Whether to apply keypoint reduction during normalization (Step 4)
33+
# True: Reduce input keypoints to specified indices (e.g., 133 → 85)
34+
# Useful when Step 3 outputs all keypoints (REDUCTION=False in extraction)
35+
# False: Keep all keypoints from input (no reduction)
36+
# Use when Step 3 already applied reduction (REDUCTION=True in extraction)
37+
REDUCTION = True
38+
39+
# Keypoint indices to select when REDUCTION=True
40+
# None (default): Auto-select based on input keypoint count
41+
# - For 133 keypoints (MMPose): Use COCO-WholeBody 85-keypoint subset
42+
# - For 543 keypoints (MediaPipe Holistic, unreduced): Use ASL-optimized 85-keypoint subset
43+
# List[int]: Custom indices to select (e.g., list(range(85)))
44+
KEYPOINT_INDICES = None
45+
2846
# =============================================================================
2947
# VISIBILITY MASKING CONFIGURATION
3048
# =============================================================================
@@ -38,7 +56,7 @@
3856
# Enable/disable landmark-level masking
3957
# When True: individual landmarks with low visibility are set to UNVISIBLE_LANDMARK
4058
# Requires VISIBILITY_THRESHOLD
41-
MASK_LANDMARK_LEVEL = True
59+
MASK_LANDMARK_LEVEL = False
4260
UNVISIBLE_LANDMARK = -999.0
4361

4462
# Visibility threshold: landmarks with visibility < this value are masked
@@ -54,9 +72,14 @@
5472
# False: Output shape (T, 255) with x, y, z coordinates
5573
REMOVE_Z = False
5674

57-
# Normalization method: Isotropic unit bounding box scaling
58-
# Computes single scale factor across x, y, z to preserve aspect ratios
59-
NORMALIZATION_METHOD = 'minmax'
75+
# Normalization mode:
76+
# - "isotropic_3d": Original YouTube-ASL paper (one scale factor over x,y,z)
77+
# Good for MediaPipe Holistic with metric 3D consistency
78+
# - "xy_isotropic_z_minmax": Recommended for MMPose 3D (default)
79+
# xy: isotropic scaling in image plane
80+
# z: per-clip min-max scaling for relative depth
81+
# Better for 2D pose + lifted depth without metric scale
82+
NORMALIZATION_MODE = "xy_isotropic_z_minmax"
6083

6184
# =============================================================================
6285
# NOTES

scripts/3a_extract_mediapipe.py

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,14 @@
2020
- POSE_IDX, FACE_IDX, HAND_IDX: Landmark indices to extract
2121
2222
Output Format:
23-
NumPy arrays (.npy) with shape (T, 85, 4) where:
23+
NumPy arrays (.npy) with shape depending on REDUCTION config:
24+
When REDUCTION=True (default):
25+
Shape: (T, 85, 4)
26+
85 keypoints: 6 pose + 37 face + 21 left_hand + 21 right_hand
27+
When REDUCTION=False:
28+
Shape: (T, 543, 4)
29+
543 keypoints: All MediaPipe Holistic landmarks
2430
- T: Number of frames
25-
- 85: Total keypoints (6 pose + 37 face + 21 left_hand + 21 right_hand)
2631
- 4: [x, y, z, visibility] per keypoint
2732
2833
Note:
@@ -95,16 +100,24 @@ def process_video_segment(
95100
extractor = MediaPipeExtractor(
96101
pose_idx=cfg.POSE_IDX,
97102
face_idx=cfg.FACE_IDX,
98-
hand_idx=cfg.HAND_IDX
103+
hand_idx=cfg.HAND_IDX,
104+
min_detection_confidence=0.3,
105+
min_tracking_confidence=0.3,
106+
apply_reduction=cfg.REDUCTION,
99107
)
100108

101109
landmark_sequences = []
102110
current_frame = start_frame
103111

104-
# Number of landmarks: pose + face + left hand + right hand
105-
num_landmarks = (
106-
len(cfg.POSE_IDX) + len(cfg.FACE_IDX) + 2 * len(cfg.HAND_IDX)
107-
)
112+
# Number of landmarks depends on REDUCTION setting
113+
if cfg.REDUCTION:
114+
# Reduced keypoints: pose + face + left hand + right hand
115+
num_landmarks = (
116+
len(cfg.POSE_IDX) + len(cfg.FACE_IDX) + 2 * len(cfg.HAND_IDX)
117+
)
118+
else:
119+
# All MediaPipe Holistic keypoints: 33 pose + 468 face + 21*2 hands
120+
num_landmarks = 543 # Total MediaPipe Holistic landmarks
108121

109122
while current_frame <= end_frame:
110123
ret, frame = cap.read()
@@ -198,6 +211,11 @@ def main():
198211

199212
logger.info(f"Found {len(video_files)} video files")
200213
logger.info(f"Configuration:")
214+
logger.info(f" - Keypoint reduction: {cfg.REDUCTION}")
215+
if cfg.REDUCTION:
216+
logger.info(f" - Output keypoints: 85 (ASL-optimized subset)")
217+
else:
218+
logger.info(f" - Output keypoints: 543 (all MediaPipe Holistic landmarks)")
201219
logger.info(f" - FPS reduction: {cfg.REDUCE_FPS_TO}")
202220
logger.info(f" - Frame skip: {cfg.FRAME_SKIP}")
203221
logger.info(f" - FPS range filter: {cfg.ACCEPT_VIDEO_FPS_WITHIN}")

scripts/3b_extract_mmpose.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,14 @@
2525
- MAX_WORKERS: Number of parallel worker processes
2626
2727
Output Format:
28-
NumPy arrays (.npy) with shape (T, 85, 4) where:
28+
NumPy arrays (.npy) with shape depending on REDUCTION config:
29+
When REDUCTION=True:
30+
Shape: (T, 85, 4)
31+
85 keypoints: ASL-optimized COCO-WholeBody subset
32+
When REDUCTION=False (default):
33+
Shape: (T, 133, 4)
34+
133 keypoints: All COCO-WholeBody keypoints
2935
- T: Number of frames
30-
- 85: Total keypoints (COCO-WholeBody subset)
3136
- 4: [x, y, z, visibility] per keypoint
3237
3338
Note:
@@ -162,6 +167,7 @@ def process_video_segment(
162167
bbox_threshold=cfg.BBOX_THR,
163168
det_cat_id=cfg.DET_CAT_ID,
164169
add_visible=cfg.ADD_VISIBLE,
170+
apply_reduction=cfg.REDUCTION,
165171
)
166172

167173
landmark_sequences = []
@@ -170,8 +176,13 @@ def process_video_segment(
170176
# Track whether multiple persons are detected in this segment
171177
multi_person = False
172178

173-
# Number of landmarks equals the number of selected keypoints
174-
num_landmarks = len(cfg.COCO_WHOLEBODY_IDX)
179+
# Number of landmarks depends on REDUCTION setting
180+
if cfg.REDUCTION:
181+
# Reduced keypoints: ASL-optimized subset
182+
num_landmarks = len(cfg.COCO_WHOLEBODY_IDX)
183+
else:
184+
# All COCO-WholeBody keypoints
185+
num_landmarks = 133
175186

176187
while current_frame <= end_frame:
177188
ret, frame = cap.read()
@@ -286,6 +297,11 @@ def main():
286297

287298
logger.info(f"Found {len(video_files)} video files")
288299
logger.info(f"Configuration:")
300+
logger.info(f" - Keypoint reduction: {cfg.REDUCTION}")
301+
if cfg.REDUCTION:
302+
logger.info(f" - Output keypoints: 85 (ASL-optimized subset)")
303+
else:
304+
logger.info(f" - Output keypoints: 133 (all COCO-WholeBody)")
289305
logger.info(f" - FPS reduction: {cfg.REDUCE_FPS_TO}")
290306
logger.info(f" - Frame skip: {cfg.FRAME_SKIP}")
291307
logger.info(f" - FPS range filter: {cfg.ACCEPT_VIDEO_FPS_WITHIN}")

0 commit comments

Comments
 (0)