
Commit 4927bf1

Refactor landmark normalization to use isotropic unit bounding box scaling
Changes:
- scripts/4_reduction_normalization.py: Implement isotropic unit cube normalization using a single scale factor across the x, y, z dimensions to preserve aspect ratios of signer motion
- configs/reduction_normalization.py: Update comments to document the isotropic scaling algorithm with a 5-step process
- scripts/3a_extract_mediapipe.py: Improve docstrings and inline comments for the MediaPipe extraction pipeline
- scripts/3b_extract_mmpose.py: Enhance MMPose extraction documentation
- configs/extract_mediapipe.py: Add clearer parameter descriptions
- src/asl_prep/extractors/mediapipe.py: Update extractor documentation
- src/asl_prep/extractors/mmpose.py: Enhance MMPose wrapper comments
- src/asl_prep/transcripts/preprocess.py: Clarify transcript processing
- requirements.txt: Update dependencies

Technical details:
- Old: Per-coordinate min-max scaling (distorts aspect ratios)
- New: Whole-clip 3D bounding box with max_range = max(x, y, z ranges)
- Result: Landmarks fit in a 1×1×1 cube, aspect ratios preserved
- Sentinel values (-999.0) for missing data remain unchanged
- All documentation updated to English with comprehensive explanations
1 parent b3684be commit 4927bf1
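To make the old-vs-new distinction concrete, here is a toy NumPy illustration (editorial note, not code from this commit) of how per-coordinate min-max scaling loses the aspect ratio that the new isotropic scaling keeps:

```python
import numpy as np

# Two keypoints whose x-range (0.4) is twice the y-range (0.2).
points = np.array([[0.1, 0.3], [0.5, 0.5]])
ranges = points.max(axis=0) - points.min(axis=0)           # [0.4, 0.2]

# Old: per-coordinate min-max -> both axes stretched to [0, 1];
# the 2:1 aspect ratio of the motion is lost.
per_coord = (points - points.min(axis=0)) / ranges          # [[0, 0], [1, 1]]

# New: divide every axis by the single largest range -> x spans 1.0,
# y spans 0.5; the aspect ratio is preserved inside the unit box.
isotropic = (points - points.min(axis=0)) / ranges.max()    # [[0, 0], [1, 0.5]]

print(per_coord, isotropic, sep="\n")
```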


9 files changed: +691, -101 lines

configs/extract_mediapipe.py

Lines changed: 4 additions & 9 deletions
@@ -44,13 +44,7 @@
 
 # Hand landmarks: All 21 keypoints per hand (wrist, palm, fingers)
 HAND_IDX = list(range(21))
-
-# Pose landmarks: Key upper body points for ASL recognition
-# Indices correspond to: shoulders, elbows, hips
 POSE_IDX = [11, 12, 13, 14, 23, 24]
-
-# Face landmarks: Selected facial features for expressions and mouth shapes
-# Total: 37 keypoints covering face contour, eyes, eyebrows, nose, and mouth
 FACE_IDX = [
     # Face contour and key points
     0, 4, 13, 14, 17, 33, 37, 39, 46, 52, 55, 61, 64, 81, 82, 93,
@@ -59,6 +53,7 @@
     294, 311, 323, 362, 386, 397, 468, 473
 ]
 
-# Total output dimension: 255 features
-# = 6 pose × 3 coords + 37 face × 3 coords + 21 left_hand × 3 coords + 21 right_hand × 3 coords
-# = 18 + 111 + 63 + 63 = 255
+# Total output dimensions per frame: 340 features (85 keypoints × 4)
+# = 6 pose + 37 face + 21 left_hand + 21 right_hand = 85 keypoints
+# = 85 keypoints × [x, y, z, visibility] = 340 features
+# Note: Reduction to 255 (85 × 3) happens in Step 4 after normalization
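As a quick sanity check on the counts documented above (editorial note, not part of the commit), the per-frame dimensions can be recomputed from the index lists in this config:

```python
# Keypoint counts mirroring configs/extract_mediapipe.py (FACE_IDX abbreviated to its length).
POSE_IDX = [11, 12, 13, 14, 23, 24]   # 6 pose keypoints
HAND_IDX = list(range(21))            # 21 keypoints per hand
NUM_FACE = 37                         # 37 selected face keypoints

num_keypoints = len(POSE_IDX) + NUM_FACE + 2 * len(HAND_IDX)  # 6 + 37 + 21 + 21 = 85
step3_features = num_keypoints * 4   # [x, y, z, visibility] -> 340 (raw extraction)
step4_features = num_keypoints * 3   # [x, y, z] -> 255 (after normalization)

print(num_keypoints, step3_features, step4_features)  # 85 340 255
```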

configs/reduction_normalization.py

Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
+"""Configuration for landmark reduction and normalization (scripts/4_reduction_normalization.py)"""
+import os
+
+# =============================================================================
+# PROJECT PATHS
+# =============================================================================
+
+# Base paths
+ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+# Input directory: raw landmarks from Step 3 (shape: seq_length, num_keypoint, 4)
+INPUT_DIR = os.path.join(ROOT, "dataset", "npy")
+
+# Output directory: normalized landmarks (shape: T, num_keypoint * num_coordinate)
+OUTPUT_DIR = os.path.join(ROOT, "dataset", "npy_normalized")
+
+# =============================================================================
+# PROCESSING CONFIGURATION
+# =============================================================================
+
+# Skip files that already exist in OUTPUT_DIR
+SKIP_EXISTING = True
+
+# Maximum number of worker processes for parallel processing
+# Set to 1 for debugging, increase based on available CPU cores
+MAX_WORKERS = 4
+
+# =============================================================================
+# VISIBILITY MASKING CONFIGURATION
+# =============================================================================
+
+# Enable/disable frame-level masking
+# When True: placeholder frames (all zeros) are set to UNVISIBLE_FRAME
+# Frame-level masking does not use VISIBILITY_THRESHOLD
+MASK_FRAME_LEVEL = True
+UNVISIBLE_FRAME = -999.0
+
+# Enable/disable landmark-level masking
+# When True: individual landmarks with low visibility are set to UNVISIBLE_LANDMARK
+# Requires VISIBILITY_THRESHOLD
+MASK_LANDMARK_LEVEL = True
+UNVISIBLE_LANDMARK = -999.0
+
+# Visibility threshold: landmarks with visibility < this value are masked
+# Only used when MASK_LANDMARK_LEVEL = True
+VISIBILITY_THRESHOLD = 0.3
+
+# =============================================================================
+# NORMALIZATION CONFIGURATION
+# =============================================================================
+
+# Whether to remove z-coordinate (reduces to 2D landmarks)
+# True: Output shape (T, 170) with only x, y coordinates
+# False: Output shape (T, 255) with x, y, z coordinates
+REMOVE_Z = False
+
+# Normalization method: Isotropic unit bounding box scaling
+# Computes single scale factor across x, y, z to preserve aspect ratios
+NORMALIZATION_METHOD = 'minmax'
+
+# =============================================================================
+# NOTES
+# =============================================================================
+
+# Paper methodology (YouTube-ASL):
+# 1. Use MediaPipe Holistic to extract 532 landmarks
+# 2. Reduce to 85 selected landmarks (done in Step 3)
+# 3. Normalize by scaling to fit in unit bounding box across clip duration
+# 4. Represent missing landmarks with large negative value (-999)
+# 5. Ignore visibility in final output (remove 4th dimension)
+# 6. Final output: (T, 255) where 255 = 85 keypoints × 3 coords
+
+# Normalization implementation:
+# - Clip-wise: Compute bounding box across entire video clip, not per-frame
+# - Isotropic: Single scale factor for x, y, z (preserves aspect ratios)
+# - Algorithm:
+#   1. Collect all valid 3D points (x, y, z) from entire clip
+#   2. Compute 3D bounding box: [min_x, min_y, min_z] to [max_x, max_y, max_z]
+#   3. Find max_range = max(x_range, y_range, z_range)
+#   4. Scale: p_norm = (p - coord_min) / max_range
+#   5. Result: Landmarks fit in 1×1×1 unit cube
+# - Sentinel values: UNVISIBLE_FRAME and UNVISIBLE_LANDMARK remain at -999.0
+#
+# Two-level masking system:
+# 1. Frame-level (MASK_FRAME_LEVEL): Entire frames with no detection → UNVISIBLE_FRAME
+# 2. Landmark-level (MASK_LANDMARK_LEVEL): Individual landmarks with low visibility → UNVISIBLE_LANDMARK
+# Both levels are independent and can be enabled/disabled separately
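A minimal NumPy sketch of the clip-wise isotropic scaling described in the NOTES above (editorial illustration; the function name and signature are not the actual code in scripts/4_reduction_normalization.py, and it assumes masking to -999.0 has already been applied):

```python
import numpy as np

SENTINEL = -999.0  # UNVISIBLE_FRAME / UNVISIBLE_LANDMARK

def normalize_clip_isotropic(landmarks: np.ndarray) -> np.ndarray:
    """Isotropic unit bounding box normalization over a whole clip.

    landmarks: (T, K, 3) array of x, y, z coordinates after masking,
    where missing entries are already set to SENTINEL.
    """
    coords = landmarks.astype(np.float64)
    valid = coords != SENTINEL  # keep sentinels untouched

    # 1-2. Collect all valid points and compute the clip-wide bounding box per axis.
    masked = np.where(valid, coords, np.nan)
    coord_min = np.nanmin(masked, axis=(0, 1))  # (3,)
    coord_max = np.nanmax(masked, axis=(0, 1))  # (3,)

    # 3. Single scale factor: the largest range across x, y, z.
    max_range = float(np.max(coord_max - coord_min))
    if not np.isfinite(max_range) or max_range <= 0:
        max_range = 1.0  # degenerate or empty clip; avoid division by zero

    # 4-5. Shift and scale valid points into the 1×1×1 unit cube,
    #      leaving sentinel values unchanged.
    normalized = (coords - coord_min) / max_range
    return np.where(valid, normalized, SENTINEL)
```

Dividing every axis by the same max_range is what keeps the signer's proportions intact, which per-axis min-max scaling would not.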

requirements.txt

Lines changed: 4 additions & 5 deletions
@@ -1,14 +1,13 @@
+torch==2.0.1
+torchaudio==2.0.2
+torchvision==0.15.2
 yt-dlp
 youtube-transcript-api
 tqdm
-numpy
+numpy==1.26.4
 pandas
 opencv-python
 mediapipe
 psutil
-mmpose==1.3.2
-mmcv==2.0.1
-mmdet==3.1.0
-mmengine==0.10.7
 ftfy==6.3.1
 matplotlib==3.9.4

scripts/3a_extract_mediapipe.py

Lines changed: 20 additions & 4 deletions
@@ -20,9 +20,14 @@
     - POSE_IDX, FACE_IDX, HAND_IDX: Landmark indices to extract
 
 Output Format:
-    NumPy arrays (.npy) with shape (T, 255) where:
+    NumPy arrays (.npy) with shape (T, 85, 4) where:
     - T: Number of frames
-    - 255: Flattened landmarks (6 pose + 37 face + 21 left_hand + 21 right_hand) × 3 coords
+    - 85: Total keypoints (6 pose + 37 face + 21 left_hand + 21 right_hand)
+    - 4: [x, y, z, visibility] per keypoint
+
+Note:
+    This is the raw landmark extraction. Normalization and visibility masking
+    are applied in Step 4 (scripts/4_reduction_normalization.py)
 """
 import os
 import sys
@@ -96,6 +101,11 @@ def process_video_segment(
     landmark_sequences = []
     current_frame = start_frame
 
+    # Number of landmarks: pose + face + left hand + right hand
+    num_landmarks = (
+        len(cfg.POSE_IDX) + len(cfg.FACE_IDX) + 2 * len(cfg.HAND_IDX)
+    )
+
     while current_frame <= end_frame:
         ret, frame = cap.read()
         if not ret:
@@ -104,8 +114,14 @@ def process_video_segment(
         # Use sampler to decide whether to process this frame
         if sampler.take():
             landmarks = extractor.process_frame(frame)
-            if landmarks is not None:
-                landmark_sequences.append(landmarks)
+
+            # If no landmarks are detected, append a placeholder
+            if landmarks is None:
+                landmarks = np.zeros(
+                    (num_landmarks, 4), dtype=np.float32
+                )
+
+            landmark_sequences.append(landmarks)
 
         current_frame += 1
 
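The placeholder logic above defers masking to Step 4; a rough sketch of the two-level masking it feeds into (hypothetical helper, assuming the thresholds and sentinels from configs/reduction_normalization.py, not the repository's actual code):

```python
import numpy as np

UNVISIBLE_FRAME = -999.0
UNVISIBLE_LANDMARK = -999.0
VISIBILITY_THRESHOLD = 0.3

def apply_masking(landmarks: np.ndarray) -> np.ndarray:
    """Two-level masking on raw (T, 85, 4) landmarks from Step 3.

    - Frame level: all-zero placeholder frames become UNVISIBLE_FRAME.
    - Landmark level: keypoints with visibility below the threshold
      become UNVISIBLE_LANDMARK.
    """
    masked = landmarks.astype(np.float32).copy()

    # Frame-level: a placeholder frame was stored as all zeros in Step 3.
    placeholder_frames = ~np.any(masked, axis=(1, 2))        # (T,)
    masked[placeholder_frames] = UNVISIBLE_FRAME

    # Landmark-level: use the 4th channel (visibility) as the criterion,
    # skipping frames that were already masked wholesale.
    low_visibility = masked[..., 3] < VISIBILITY_THRESHOLD   # (T, 85)
    low_visibility &= ~placeholder_frames[:, None]
    masked[low_visibility] = UNVISIBLE_LANDMARK

    return masked
```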

scripts/3b_extract_mmpose.py

Lines changed: 39 additions & 10 deletions
@@ -25,10 +25,14 @@
     - MAX_WORKERS: Number of parallel worker processes
 
 Output Format:
-    NumPy arrays (.npy) with shape:
-    - (T, 340) if ADD_VISIBLE=True: 85 keypoints × (x, y, z, visible)
-    - (T, 255) if ADD_VISIBLE=False: 85 keypoints × (x, y, z)
-    where T is the number of frames
+    NumPy arrays (.npy) with shape (T, 85, 4) where:
+    - T: Number of frames
+    - 85: Total keypoints (COCO-WholeBody subset)
+    - 4: [x, y, z, visibility] per keypoint
+
+Note:
+    This is the raw landmark extraction. Normalization and visibility masking
+    are applied in Step 4 (scripts/4_reduction_normalization.py)
 """
 import os
 import sys
@@ -53,7 +57,7 @@
 from src.asl_prep.pipeline.processor import read_manifest_csv, build_processing_tasks
 from src.asl_prep.common.files import get_video_filenames
 from src.asl_prep.common.video import FPSSampler
-from src.asl_prep.extractors.mmpose import MMPoseExtractor
+from src.asl_prep.extractors.mmpose import MMPoseExtractor, MultiPersonDetected
 
 try:
     from mmdet.apis import init_detector
@@ -163,21 +167,46 @@ def process_video_segment(
     landmark_sequences = []
     current_frame = start_frame
 
+    # Track whether multiple persons are detected in this segment
+    multi_person = False
+
+    # Number of landmarks equals the number of selected keypoints
+    num_landmarks = len(cfg.COCO_WHOLEBODY_IDX)
+
     while current_frame <= end_frame:
         ret, frame = cap.read()
        if not ret:
             break
 
         # Use sampler to decide whether to process this frame
         if sampler.take():
-            landmarks = extractor.process_frame(frame)
-            if landmarks is not None:
-                landmark_sequences.append(landmarks)
+            try:
+                landmarks = extractor.process_frame(frame)
+            except MultiPersonDetected as e:
+                logger.warning(
+                    f"Multiple persons detected in segment "
+                    f"{video_path} [{start_time:.3f}, {end_time:.3f}] - {e}. "
+                    f"Skipping this segment."
+                )
+                multi_person = True
+                break
+
+            # If no person is detected in this frame, append a placeholder
+            if landmarks is None:
+                landmarks = np.zeros(
+                    (num_landmarks, 4), dtype=np.float32
+                )
+
+            landmark_sequences.append(landmarks)
 
         current_frame += 1
 
-    # Save landmarks if valid data exists
-    if landmark_sequences:
+    if multi_person:
+        logger.info(
+            f"Segment skipped due to multiple persons: "
+            f"{video_path} [{start_time:.3f}, {end_time:.3f}]"
+        )
+    elif landmark_sequences:
         landmark_array = np.array(landmark_sequences)
         if landmark_array.size > 0 and np.any(landmark_array):
             os.makedirs(os.path.dirname(output_file), exist_ok=True)
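The segment loop above depends on MultiPersonDetected from src/asl_prep/extractors/mmpose.py, whose definition is not part of this diff. A hypothetical sketch of what such an exception and single-person check might look like (an assumption for illustration, not the repository's actual implementation):

```python
# Hypothetical sketch; the real definition lives in src/asl_prep/extractors/mmpose.py
# and may differ in name, message, and where the check is performed.
class MultiPersonDetected(Exception):
    """Raised when the person detector finds more than one person in a frame."""


def check_single_person(num_detections: int) -> None:
    # The extractor's process_frame() is assumed to run a check like this
    # after person detection, before keypoint estimation.
    if num_detections > 1:
        raise MultiPersonDetected(
            f"expected a single signer, found {num_detections} detections"
        )
```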
