Add MMPose 3D pose estimation pipeline and improve video processing robustness

balaboom123 · balaboom123 · commit f843fb616843 · 2025-10-21T21:19:17.000+08:00
- Add s3_mmpose_labelling.py with RTMPose3D-based 3D keypoint extraction using COCO-WholeBody format
  - Add MMPose dependencies (mmpose, mmcv, mmdet, mmengine) to requirements.txt
  - Implement FPS-based frame sampling with two modes: reduce-to-target-fps and skip-every-N strategies
  - Add video FPS validation with configurable acceptable range (ACCEPT_VIDEO_FPS_WITHIN)
  - Add minimum segment duration filter (0.2s) to skip too-short segments
  - Configure MMPose model checkpoints and detection parameters in conf.py
  - Improve text normalization in s2_transcript_preprocess.py using ftfy library for Unicode correction
  - Update default CSV_FILE path to How2Sign validation dataset
  - Refactor MediaPipe frame sampling to use new FPSSampler class for consistent FPS handling
diff --git a/conf.py b/conf.py
@@ -11,20 +11,27 @@
 TRANSCRIPT_DIR = f"{ROOT}/dataset/transcript/"
 
 # Dataset files
-ID = "youtube-asl_youtube_asl_video_ids.txt"
-CSV_FILE = f"youtube_asl.csv"
+ID = "resource/youtube-asl_youtube_asl_video_ids.txt"
+CSV_FILE = f"dataset/how2sign/how2sign_realigned_val.csv" # "resource/youtube_asl.csv"
 
 # =============================================================================
 # PROCESSING CONFIGURATION
 # =============================================================================
 
-# Frame processing
-FRAME_SKIP = 2  # Number of frames to skip when extracting frames from a video
+# Option to downsample frames to a fixed FPS (takes priority over FRAME_SKIP)
+# Note: Only downsamples, does not upsample (if source fps < REDUCE_FPS_TO, keeps every frame)
+REDUCE_FPS_TO = 14.0  # currently 14.0 (documented default: 24); set to None to disable FPS reduction and fall back to FRAME_SKIP
+
+# Frame sampling when NOT using REDUCE_FPS_TO (sample every Nth frame)
+FRAME_SKIP = 2  # e.g., 2 means sampling rate is 1/2
+
+# Accepted video FPS range (videos outside this range will be skipped)
+ACCEPT_VIDEO_FPS_WITHIN = (24.0, 60.0)  # default: (24, 60)
 
 # Threading
 MAX_WORKERS = 4
 
-# FPS reduction
+# FPS reduction (legacy setting for s4_fps_reduce.py, can be ignored if not used)
 TARGET_FPS = 8.0  # Target FPS for reduced landmark data
 
 # Supported languages
@@ -76,18 +83,45 @@
 # =============================================================================
 # MEDIAPIPE LANDMARK INDICES
 # =============================================================================
-
-# Hand landmarks (21 points)
+# MediaPipe (BlazePose-style) landmark subsets: hand = all 21 points, pose = key body points, face = key facial features
 HAND_IDX = list(range(21))
-
-# Pose landmarks (key body points)
 POSE_IDX = [11, 12, 13, 14, 23, 24]
-
-# Face landmarks (key facial features)
 FACE_IDX = [
     0, 4, 13, 14, 17, 33, 37, 39, 46, 52, 55, 61, 64, 81, 82, 93,
     133, 151, 152, 159, 172, 178, 181, 263, 269, 276, 282, 285, 291,
     294, 311, 323, 362, 386, 397, 468, 473
 ]
 
+# =============================================================================
+# MMPOSE PROCESSING CONFIGURATION
+# =============================================================================
+# ==== COCO-WholeBody indices (133 total, 0-indexed) ====
+COCO_WHOLEBODY_IDX = [
+    5, 6, 7, 8, 11, 12,  # shoulders, elbows, hips - 6 points
+    23, 25, 27, 29, 31, 33, 35, 37, 39,  # face shape - 9 points
+    40, 41, 42, 43, 44, 45, 46, 47, 48, 49,  # eyebrows - 10 points
+    52, 54, 56, 58,  # nose - 4 points
+    71, 73, 75, 77, 79, 81, 83, 84, 85, 86, 87, 88, 89, 90  # mouth - 14 points
+    ] + list(range(91, 133))  # all 21*2 hand landmarks - 42 points
+
+# 3D Pose estimator configuration
+POSE_MODEL_CHECKPOINT = 'models/checkpoints/rtmw3d-l_8xb64_cocktail14-384x288-794dbc78_20240626.pth'
+POSE_MODEL_CHECKPOINT_LINK = 'https://download.openmmlab.com/mmpose/v1/wholebody_3d_keypoint/rtmw3d/rtmw3d-l_8xb64_cocktail14-384x288-794dbc78_20240626.pth'
+POSE_MODEL_CONFIG = 'models/configs/rtmw3d-l_8xb64_cocktail14-384x288.py'
+
+# Detector configuration (for person detection)
+DET_MODEL_CHECKPOINT = 'models/checkpoints/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth'
+DET_MODEL_CHECKPOINT_LINK = 'https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth'
+DET_MODEL_CONFIG = 'models/configs/rtmdet_m_640-8xb32_coco-person.py'
+
+# Output format configuration
+# If True, output shape is (T, NUM_KEYPOINTS*4) with (x, y, z, visible)
+# If False, output shape is (T, NUM_KEYPOINTS*3) with (x, y, z)
+ADD_VISIBLE = True
+
+# Detection and inference parameters
+BBOX_THR = 0.5  # Bounding box score threshold for person detection
+KPT_THR = 0.3   # Keypoint score threshold for pose estimation
+DET_CAT_ID = 0  # Category ID for person detection in COCO dataset
+
 
diff --git a/requirements.txt b/requirements.txt
@@ -5,4 +5,10 @@ numpy
 pandas
 opencv-python
 mediapipe
-psutil
+psutil
+mmpose==1.3.2
+mmcv==2.0.1
+mmdet==3.1.0
+mmengine==0.10.7
+ftfy==6.3.1
+matplotlib==3.9.4
diff --git a/s2_transcript_preprocess.py b/s2_transcript_preprocess.py
@@ -4,41 +4,38 @@
 import re
 import numpy as np
 import pandas as pd
-
+import ftfy
 import conf as c  # Keeping original conf import name
 
 
 def normalize_text(text):
     """
-    Normalizes text by replacing Unicode characters, removing non-ASCII characters,
-    bracketed content, and standardizing whitespace.
+    Normalizes caption text: repairs Unicode issues with ftfy, then applies
+    minimal, non-semantic cleaning in the spirit of the YouTube-ASL paper.
+
+    Steps, in the order applied:
+    1. Correct mojibake and other Unicode encoding errors (ftfy).
+    2. Map dashes / ellipses to "-" / "..." so step 4 cannot fuse words.
+    3. Remove bracketed descriptive content (e.g., [Laughter]).
+    4. Remove remaining non-ASCII characters (English captions assumed).
+    5. Collapse all whitespace runs, including newlines, to single spaces.
+
+    It explicitly AVOIDS:
+    - Lowercasing (as per the paper's statement).
+    - Removing punctuation (beyond bracketed content).
+    - Any other semantic normalization (stemming, lemmatization, stop words).
 
     Args:
-        text (str): Input text to be normalized
+        text (str): Input text to be normalized.
 
     Returns:
-        str: Cleaned and normalized text in lowercase
+        str: Cleaned and corrected text.
     """
-    unicode_mappings = {
-        "\u201c": '"',
-        "\u201d": '"',
-        "\u2014": "-",
-        "\u2018": "'",
-        "\u2019": "'",
-        "\u2026": "...",
-        "\n": " ",
-        "\r": " ",
-    }
-
-    # Replace Unicode characters with ASCII equivalents
-    pattern = re.compile("|".join(map(re.escape, unicode_mappings)))
-    text = pattern.sub(lambda match: unicode_mappings[match.group()], text)
-
-    # Clean text using regex patterns
-    text = re.sub(r"[^\x00-\x7F]+", "", text)  # Remove non-ASCII
-    text = re.sub(r"\[.*?\]", "", text)  # Remove bracketed content
-    text = re.sub(r"\s+", " ", text)  # Standardize whitespace
-
+    text = ftfy.fix_text(text)
+    text = text.replace("\u2014", "-").replace("\u2013", "-").replace("\u2026", "...")
+    text = re.sub(r"\[.*?\]", "", text, flags=re.DOTALL)
+    text = re.sub(r"[^\x00-\x7F]+", "", text)
+    text = re.sub(r"\s+", " ", text)
     return text.strip()
 
 
diff --git a/s3_mediapipe_labelling.py b/s3_mediapipe_labelling.py
@@ -34,7 +34,7 @@ def validate_video_file(video_path: str) -> bool:
     """
 	if not os.path.exists(video_path):
 		return False
-	
+
 	try:
 		cap = cv2.VideoCapture(video_path)
 		is_valid = cap.isOpened()
@@ -44,6 +44,53 @@ def validate_video_file(video_path: str) -> bool:
 		return False
 
 
+def _get_video_fps(video_path: str) -> float:
+	"""Return the FPS OpenCV reports for `video_path`, or 0.0 on any failure."""
+	cap = None
+	try:
+		cap = cv2.VideoCapture(video_path)
+		if not cap.isOpened():
+			return 0.0
+		return float(cap.get(cv2.CAP_PROP_FPS) or 0.0)
+	except Exception:
+		return 0.0
+	finally:
+		if cap is not None:
+			cap.release()  # free the capture handle on every exit path
+
+
+class FPSSampler:
+	"""
+    Decides, frame by frame, which decoded frames to keep. Two strategies:
+      1) reduce mode (priority; needs reduce_to set and src_fps > 0): downsample
+         src_fps to min(reduce_to, src_fps), handling non-integer ratios (30->24).
+      2) skip mode: sample every Nth frame (N = frame_skip_by, at least 1).
+    Both modes keep the first frame they are offered.
+    """
+	def __init__(self, src_fps: float, reduce_to: "float | None", frame_skip_by: int):
+		self.mode = 'reduce' if (reduce_to is not None and src_fps > 0) else 'skip'
+		self.count = 0  # number of frames offered to take() so far
+		if self.mode == 'reduce':
+			# Only downsample: if target >= src, r == 1.0 and every frame is kept
+			self.target = min(reduce_to, src_fps)
+			# r = output frames produced per input frame (at most 1.0)
+			self.r = self.target / max(src_fps, 1e-6)
+		else:
+			self.n = max(int(frame_skip_by), 1)
+
+	def take(self) -> bool:
+		"""Return True if the current frame should be kept; call once per frame."""
+		i = self.count
+		self.count += 1
+		if self.mode == 'reduce':
+			# Keep frame i when the output index int(i * r) advances. Deriving it
+			# from the frame number avoids the rounding drift of a running sum,
+			# and frame 0 is always kept (the running sum used to drop it).
+			return i == 0 or int(i * self.r) > int((i - 1) * self.r)
+		# skip mode: frames 0, n, 2n, ...
+		return (i % self.n) == 0
+
+
 def process_mediapipe_detection(image, model):
 	"""
     Processes an image through MediaPipe detection model.
@@ -100,9 +147,10 @@ def process_video_segment(video_path: str, start_time: float, end_time: float, o
 			logger.error(f"Error opening video: {video_path}")
 			return
 
-		# Determine frame skip rate based on video FPS
-		fps = cap.get(cv2.CAP_PROP_FPS)
-		frame_skip = 1 if fps <= 15 else c.FRAME_SKIP
+		# FPS & create sampler
+		fps = cap.get(cv2.CAP_PROP_FPS) or 0.0
+		target_fps = None if getattr(c, "REDUCE_FPS_TO", None) is None else float(c.REDUCE_FPS_TO)
+		sampler = FPSSampler(src_fps=fps, reduce_to=target_fps, frame_skip_by=c.FRAME_SKIP)
 
 		# Calculate frame ranges
 		start_frame, end_frame = int(start_time * fps), int(end_time * fps)
@@ -124,7 +172,7 @@ def process_video_segment(video_path: str, start_time: float, end_time: float, o
 			if not ret:
 				break
 
-			if (current_frame - start_frame) % frame_skip == 0:
+			if sampler.take():
 				results = process_mediapipe_detection(frame, holistic)
 				landmark_sequences.append(extract_landmark_coordinates(results))
 
@@ -206,6 +254,9 @@ def main():
 	skipped_due_to_invalid_video = 0
 	skipped_due_to_existing_file = 0
 	skipped_due_to_duration = 0
+	skipped_due_to_fps_range = 0
+	skipped_due_to_too_short = 0
+	video_fps_cache = {}
 
 	processing_tasks = []
 	for _, row in timestamp_data.iterrows():
@@ -220,30 +271,45 @@ def main():
 		if sentence_name in processed_files:
 			skipped_due_to_existing_file += 1
 			continue
-		
-		# Skip if duration is too long
-		if end - start > 60:
+
+		# Segment duration limits: 200ms <= duration <= 60 seconds
+		seg_dur = float(end - start)
+		if seg_dur < 0.2:
+			skipped_due_to_too_short += 1
+			continue
+		if seg_dur > 60.0:
 			skipped_due_to_duration += 1
 			continue
-		
+
 		# Validate video file (use cache to avoid repeated checks)
 		if video_path not in video_validation_cache:
 			video_validation_cache[video_path] = validate_video_file(video_path)
 			if not video_validation_cache[video_path]:
 				invalid_videos.add(video_name)
 				logger.warning(f"Invalid or missing video file: {video_path}")
-		
+
 		if not video_validation_cache[video_path]:
 			skipped_due_to_invalid_video += 1
 			continue
-		
+
+		# Video FPS filtering
+		if video_path not in video_fps_cache:
+			video_fps_cache[video_path] = _get_video_fps(video_path)
+		vfps = video_fps_cache[video_path]
+		min_fps, max_fps = c.ACCEPT_VIDEO_FPS_WITHIN
+		if vfps <= 0.0 or vfps < float(min_fps) or vfps > float(max_fps):
+			skipped_due_to_fps_range += 1
+			continue
+
 		processing_tasks.append((video_path, start, end, output_path))
 
 	# Log summary of skipped tasks
 	logger.info(f"Task summary:")
 	logger.info(f"  - Tasks to process: {len(processing_tasks)}")
 	logger.info(f"  - Skipped (existing files): {skipped_due_to_existing_file}")
 	logger.info(f"  - Skipped (duration > 60s): {skipped_due_to_duration}")
+	logger.info(f"  - Skipped (duration < 0.2s): {skipped_due_to_too_short}")
+	logger.info(f"  - Skipped (fps out of {c.ACCEPT_VIDEO_FPS_WITHIN}): {skipped_due_to_fps_range}")
 	logger.info(f"  - Skipped (invalid videos): {skipped_due_to_invalid_video}")
 	if invalid_videos:
 		logger.warning(f"Invalid video files found: {', '.join(sorted(invalid_videos))}")
diff --git a/s3_mmpose_labelling.py b/s3_mmpose_labelling.py