Skip to content

Commit f843fb6

Browse files
committed
Add MMPose 3D pose estimation pipeline and improve video processing robustness
- Add s3_mmpose_labelling.py with RTMPose3D-based 3D keypoint extraction using COCO-WholeBody format
- Add MMPose dependencies (mmpose, mmcv, mmdet, mmengine) to requirements.txt
- Implement FPS-based frame sampling with two modes: reduce-to-target-fps and skip-every-N strategies
- Add video FPS validation with configurable acceptable range (ACCEPT_VIDEO_FPS_WITHIN)
- Add minimum segment duration filter (0.2s) to skip too-short segments
- Configure MMPose model checkpoints and detection parameters in conf.py
- Improve text normalization in s2_transcript_preprocess.py using ftfy library for Unicode correction
- Update default CSV_FILE path to How2Sign validation dataset
- Refactor MediaPipe frame sampling to use new FPSSampler class for consistent FPS handling
1 parent 147aee1 commit f843fb6

File tree

5 files changed

+793
-48
lines changed

5 files changed

+793
-48
lines changed

conf.py

Lines changed: 45 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,20 +11,27 @@
1111
TRANSCRIPT_DIR = f"{ROOT}/dataset/transcript/"
1212

1313
# Dataset files
14-
ID = "youtube-asl_youtube_asl_video_ids.txt"
15-
CSV_FILE = f"youtube_asl.csv"
14+
ID = "resource/youtube-asl_youtube_asl_video_ids.txt"
15+
CSV_FILE = f"dataset/how2sign/how2sign_realigned_val.csv" # "resource/youtube_asl.csv"
1616

1717
# =============================================================================
1818
# PROCESSING CONFIGURATION
1919
# =============================================================================
2020

21-
# Frame processing
22-
FRAME_SKIP = 2 # Number of frames to skip when extracting frames from a video
21+
# Option to downsample frames to a fixed FPS (takes priority over FRAME_SKIP)
22+
# Note: Only downsamples, does not upsample (if source fps < REDUCE_FPS_TO, keeps every frame)
23+
REDUCE_FPS_TO = 14.0 # default = 24; set to None to disable FPS reduction
24+
25+
# Frame sampling when NOT using REDUCE_FPS_TO (sample every Nth frame)
26+
FRAME_SKIP = 2 # e.g., 2 means sampling rate is 1/2
27+
28+
# Accepted video FPS range (videos outside this range will be skipped)
29+
ACCEPT_VIDEO_FPS_WITHIN = (24.0, 60.0) # default: (24, 60)
2330

2431
# Threading
2532
MAX_WORKERS = 4
2633

27-
# FPS reduction
34+
# FPS reduction (legacy setting for s4_fps_reduce.py, can be ignored if not used)
2835
TARGET_FPS = 8.0 # Target FPS for reduced landmark data
2936

3037
# Supported languages
@@ -76,18 +83,45 @@
7683
# =============================================================================
7784
# MEDIAPIPE LANDMARK INDICES
7885
# =============================================================================
79-
80-
# Hand landmarks (21 points)
86+
# BlazePose format (Mediapipe)
8187
HAND_IDX = list(range(21))
82-
83-
# Pose landmarks (key body points)
8488
POSE_IDX = [11, 12, 13, 14, 23, 24]
85-
86-
# Face landmarks (key facial features)
8789
FACE_IDX = [
8890
0, 4, 13, 14, 17, 33, 37, 39, 46, 52, 55, 61, 64, 81, 82, 93,
8991
133, 151, 152, 159, 172, 178, 181, 263, 269, 276, 282, 285, 291,
9092
294, 311, 323, 362, 386, 397, 468, 473
9193
]
9294

95+
# =============================================================================
96+
# MMPOSE PROCESSING CONFIGURATION
97+
# =============================================================================
98+
# ==== COCO-WholeBody indices (133 total, 0-indexed) ====
99+
COCO_WHOLEBODY_IDX = [
100+
5, 6, 7, 8, 11, 12, # shoulders, elbows, hips - 6 points
101+
23, 25, 27, 29, 31, 33, 35, 37, 39, # face shape - 9 points
102+
40, 41, 42, 43, 44, 45, 46, 47, 48, 49, # eyes brows - 10 points
103+
52, 54, 56, 58, # nose - 4 points
104+
71, 73, 75, 77, 79, 81, 83, 84, 85, 86, 87, 88, 89, 90 # mouth - 14 points
105+
] + list(range(91, 133)) # all 21*2 hand landmarks - 42 points
106+
107+
# 3D Pose estimator configuration
108+
POSE_MODEL_CHECKPOINT = 'models/checkpoints/rtmw3d-l_8xb64_cocktail14-384x288-794dbc78_20240626.pth'
109+
POSE_MODEL_CHECKPOINT_LINK = 'https://download.openmmlab.com/mmpose/v1/wholebody_3d_keypoint/rtmw3d/rtmw3d-l_8xb64_cocktail14-384x288-794dbc78_20240626.pth'
110+
POSE_MODEL_CONFIG = 'models/configs/rtmw3d-l_8xb64_cocktail14-384x288.py'
111+
112+
# Detector configuration (for person detection)
113+
DET_MODEL_CHECKPOINT = 'models/checkpoints/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth'
114+
DET_MODEL_CHECKPOINT_LINK = 'https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth'
115+
DET_MODEL_CONFIG = 'models/configs/rtmdet_m_640-8xb32_coco-person.py'
116+
117+
# Output format configuration
118+
# If True, output shape is (T, NUM_KEYPOINTS*4) with (x, y, z, visible)
119+
# If False, output shape is (T, NUM_KEYPOINTS*3) with (x, y, z)
120+
ADD_VISIBLE = True
121+
122+
# Detection and inference parameters
123+
BBOX_THR = 0.5 # Bounding box score threshold for person detection
124+
KPT_THR = 0.3 # Keypoint score threshold for pose estimation
125+
DET_CAT_ID = 0 # Category ID for person detection in COCO dataset
126+
93127

requirements.txt

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,10 @@ numpy
55
pandas
66
opencv-python
77
mediapipe
8-
psutil
8+
psutil
9+
mmpose==1.3.2
10+
mmcv==2.0.1
11+
mmdet==3.1.0
12+
mmengine==0.10.7
13+
ftfy==6.3.1
14+
matplotlib==3.9.4

s2_transcript_preprocess.py

Lines changed: 22 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,41 +4,38 @@
44
import re
55
import numpy as np
66
import pandas as pd
7-
7+
import ftfy
88
import conf as c # Keeping original conf import name
99

1010

1111
def normalize_text(text):
    """
    Normalizes caption text by fixing Unicode issues with ftfy and then
    performing minimal, non-semantic cleaning.

    This function:
        1. Corrects mojibake and other Unicode encoding errors (ftfy).
        2. Maps common typographic punctuation (curly quotes, dashes,
           ellipsis) to ASCII equivalents so it survives step 4.
        3. Removes bracketed descriptive content (e.g., [Laughter]).
        4. Removes any remaining non-ASCII characters.
        5. Collapses runs of whitespace (including newlines) to one space.

    It explicitly AVOIDS:
        - Lowercasing.
        - Removing punctuation (beyond bracketed content).
        - Any other semantic normalization (stemming, lemmatization, stop words).

    Args:
        text (str): Input text to be normalized.

    Returns:
        str: Cleaned and corrected text with surrounding whitespace stripped.
    """
    text = ftfy.fix_text(text)

    # Translate typographic punctuation explicitly instead of relying on ftfy:
    # dashes and ellipses are not converted by ftfy, so without this step the
    # non-ASCII strip below would delete them and glue words together
    # ("word\u2014word" -> "wordword").
    ascii_punctuation = str.maketrans({
        "\u201c": '"',
        "\u201d": '"',
        "\u2018": "'",
        "\u2019": "'",
        "\u2013": "-",
        "\u2014": "-",
        "\u2026": "...",
    })
    text = text.translate(ascii_punctuation)

    # Newlines become spaces first so bracketed content that spans a line
    # break is still matched by the non-greedy pattern below.
    text = text.replace("\n", " ").replace("\r", " ")
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"[^\x00-\x7F]+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
4340

4441

s3_mediapipe_labelling.py

Lines changed: 77 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def validate_video_file(video_path: str) -> bool:
3434
"""
3535
if not os.path.exists(video_path):
3636
return False
37-
37+
3838
try:
3939
cap = cv2.VideoCapture(video_path)
4040
is_valid = cap.isOpened()
@@ -44,6 +44,53 @@ def validate_video_file(video_path: str) -> bool:
4444
return False
4545

4646

47+
def _get_video_fps(video_path: str) -> float:
48+
"""
49+
Return video FPS (float). Returns 0.0 if FPS cannot be obtained.
50+
"""
51+
try:
52+
cap = cv2.VideoCapture(video_path)
53+
if not cap.isOpened():
54+
return 0.0
55+
fps = cap.get(cv2.CAP_PROP_FPS) or 0.0
56+
cap.release()
57+
return float(fps)
58+
except Exception:
59+
return 0.0
60+
61+
62+
class FPSSampler:
    """
    Decides, one frame at a time, whether a decoded frame should be kept.

    Two sampling strategies:
        1) reduce mode (priority): downsample the source fps to a target fps.
           An accumulator gains ``target / src`` per frame and a frame is kept
           each time it reaches 1, which handles non-integer ratios such as
           30 -> 24.
        2) skip mode: keep every Nth frame.

    In both modes the first frame offered to the sampler is kept.

    Attributes:
        mode: ``'reduce'`` or ``'skip'``.
        target, r, acc: (reduce mode only) effective target fps, per-frame
            ratio ``target / src_fps`` and the running accumulator.
        n, count: (skip mode only) sampling stride and number of frames seen.
    """

    # The union annotation is quoted so it is not evaluated at definition
    # time; the unquoted ``float | None`` form raises TypeError before
    # Python 3.10.
    def __init__(self, src_fps: float, reduce_to: "float | None", frame_skip_by: int):
        """
        Args:
            src_fps: FPS of the source video. Reduce mode is only used when
                this is greater than 0.
            reduce_to: Target FPS, or None to disable FPS reduction. A
                non-positive value is treated like None, because it would
                otherwise result in no frame ever being sampled.
            frame_skip_by: Stride for skip mode (values below 1 become 1).
        """
        use_reduce = reduce_to is not None and reduce_to > 0 and src_fps > 0
        self.mode = 'reduce' if use_reduce else 'skip'
        if self.mode == 'reduce':
            # Only downsample: if target >= src, every frame is kept.
            self.target = min(reduce_to, src_fps)
            self.r = self.target / src_fps
            # Start "full" so the very first frame is kept, matching skip mode.
            self.acc = 1.0
        else:
            self.n = max(int(frame_skip_by), 1)
            self.count = 0

    def take(self) -> bool:
        """
        Advance the sampler by one frame.

        Returns:
            True if the current frame should be kept, False otherwise.
        """
        if self.mode == 'reduce':
            take_now = self.acc >= 1.0
            if take_now:
                self.acc -= 1.0
            self.acc += self.r
            return take_now
        take_now = (self.count % self.n) == 0
        self.count += 1
        return take_now
92+
93+
4794
def process_mediapipe_detection(image, model):
4895
"""
4996
Processes an image through MediaPipe detection model.
@@ -100,9 +147,10 @@ def process_video_segment(video_path: str, start_time: float, end_time: float, o
100147
logger.error(f"Error opening video: {video_path}")
101148
return
102149

103-
# Determine frame skip rate based on video FPS
104-
fps = cap.get(cv2.CAP_PROP_FPS)
105-
frame_skip = 1 if fps <= 15 else c.FRAME_SKIP
150+
# FPS & create sampler
151+
fps = cap.get(cv2.CAP_PROP_FPS) or 0.0
152+
target_fps = None if getattr(c, "REDUCE_FPS_TO", None) is None else float(c.REDUCE_FPS_TO)
153+
sampler = FPSSampler(src_fps=fps, reduce_to=target_fps, frame_skip_by=c.FRAME_SKIP)
106154

107155
# Calculate frame ranges
108156
start_frame, end_frame = int(start_time * fps), int(end_time * fps)
@@ -124,7 +172,7 @@ def process_video_segment(video_path: str, start_time: float, end_time: float, o
124172
if not ret:
125173
break
126174

127-
if (current_frame - start_frame) % frame_skip == 0:
175+
if sampler.take():
128176
results = process_mediapipe_detection(frame, holistic)
129177
landmark_sequences.append(extract_landmark_coordinates(results))
130178

@@ -206,6 +254,9 @@ def main():
206254
skipped_due_to_invalid_video = 0
207255
skipped_due_to_existing_file = 0
208256
skipped_due_to_duration = 0
257+
skipped_due_to_fps_range = 0
258+
skipped_due_to_too_short = 0
259+
video_fps_cache = {}
209260

210261
processing_tasks = []
211262
for _, row in timestamp_data.iterrows():
@@ -220,30 +271,45 @@ def main():
220271
if sentence_name in processed_files:
221272
skipped_due_to_existing_file += 1
222273
continue
223-
224-
# Skip if duration is too long
225-
if end - start > 60:
274+
275+
# Segment duration limits: 200ms <= duration <= 60 seconds
276+
seg_dur = float(end - start)
277+
if seg_dur < 0.2:
278+
skipped_due_to_too_short += 1
279+
continue
280+
if seg_dur > 60.0:
226281
skipped_due_to_duration += 1
227282
continue
228-
283+
229284
# Validate video file (use cache to avoid repeated checks)
230285
if video_path not in video_validation_cache:
231286
video_validation_cache[video_path] = validate_video_file(video_path)
232287
if not video_validation_cache[video_path]:
233288
invalid_videos.add(video_name)
234289
logger.warning(f"Invalid or missing video file: {video_path}")
235-
290+
236291
if not video_validation_cache[video_path]:
237292
skipped_due_to_invalid_video += 1
238293
continue
239-
294+
295+
# Video FPS filtering
296+
if video_path not in video_fps_cache:
297+
video_fps_cache[video_path] = _get_video_fps(video_path)
298+
vfps = video_fps_cache[video_path]
299+
min_fps, max_fps = c.ACCEPT_VIDEO_FPS_WITHIN
300+
if vfps <= 0.0 or vfps < float(min_fps) or vfps > float(max_fps):
301+
skipped_due_to_fps_range += 1
302+
continue
303+
240304
processing_tasks.append((video_path, start, end, output_path))
241305

242306
# Log summary of skipped tasks
243307
logger.info(f"Task summary:")
244308
logger.info(f" - Tasks to process: {len(processing_tasks)}")
245309
logger.info(f" - Skipped (existing files): {skipped_due_to_existing_file}")
246310
logger.info(f" - Skipped (duration > 60s): {skipped_due_to_duration}")
311+
logger.info(f" - Skipped (duration < 0.2s): {skipped_due_to_too_short}")
312+
logger.info(f" - Skipped (fps out of {c.ACCEPT_VIDEO_FPS_WITHIN}): {skipped_due_to_fps_range}")
247313
logger.info(f" - Skipped (invalid videos): {skipped_due_to_invalid_video}")
248314
if invalid_videos:
249315
logger.warning(f"Invalid video files found: {', '.join(sorted(invalid_videos))}")

0 commit comments

Comments
 (0)