ver 24.12.19

balaboom123 · balaboom123 · commit 605f4994f4bc · 2024-12-19T22:33:49.000+08:00
1. Add How2Sign MediaPipe preprocessing
diff --git a/H2S_mediapipe.py b/H2S_mediapipe.py
@@ -0,0 +1,118 @@
+import os
+import cv2
+import mediapipe as mp
+import numpy as np
+import logging
+from glob import glob
+from typing import List
+from concurrent.futures import ProcessPoolExecutor
+
+import conf as c
+
+# Configure the root logger at DEBUG level as a side effect of importing this module.
+logging.basicConfig(level=logging.DEBUG)
+# Module-level logger used by process_video() and main().
+logger = logging.getLogger(__name__)
+
+# Short alias for the MediaPipe holistic solution module used in process_video().
+mp_holistic = mp.solutions.holistic
+
+
+def find_video_files(directory: str, pattern="*.mp4") -> List[str]:
+    """Return the extension-less base names of files in a directory.
+
+    Args:
+        directory: Directory to search (not recursive).
+        pattern: Glob pattern matched inside `directory`; defaults to "*.mp4".
+
+    Returns:
+        One name per matching file, with directory and extension stripped,
+        in the order produced by `glob`.
+    """
+    stems = []
+    for path in glob(os.path.join(directory, pattern)):
+        filename = os.path.basename(path)
+        stem, _extension = os.path.splitext(filename)
+        stems.append(stem)
+    return stems
+
+
+def mediapipe_detection(image, model):
+    """Convert a BGR frame to RGB and return the result of `model.process` on it.
+
+    Args:
+        image: Frame in BGR channel order (as read from cv2.VideoCapture).
+        model: Object exposing a `process(rgb_image)` method.
+
+    Returns:
+        Whatever `model.process` returns for the converted frame.
+    """
+    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+    return model.process(rgb_image)
+
+
+def extract_keypoints(results):
+    """Flatten the selected pose, face and hand landmarks into one 1-D array.
+
+    For each landmark group only the indices listed in conf (c.POSE_IDX,
+    c.FACE_IDX, c.HAND_IDX) are kept, as (x, y, z) triples. A group with no
+    landmarks contributes zeros of the same size, so the output length is
+    the same for every frame.
+
+    Args:
+        results: Detection result exposing `pose_landmarks`,
+            `face_landmarks`, `left_hand_landmarks` and
+            `right_hand_landmarks`.
+
+    Returns:
+        1-D array laid out as pose, face, left hand, right hand.
+    """
+
+    def select_xyz(group, indices):
+        # getattr falls back to None when the group has no `landmark`
+        # attribute (for example when the group itself is None).
+        points = getattr(group, "landmark", None)
+        if not points:
+            return np.zeros((len(indices), 3))
+        return np.array([[points[i].x, points[i].y, points[i].z] for i in indices])
+
+    # Order here defines the layout of the output vector.
+    groups = (
+        (results.pose_landmarks, c.POSE_IDX),
+        (results.face_landmarks, c.FACE_IDX),
+        (results.left_hand_landmarks, c.HAND_IDX),
+        (results.right_hand_landmarks, c.HAND_IDX),
+    )
+    return np.concatenate(
+        [select_xyz(group, indices).flatten() for group, indices in groups]
+    )
+
+
+def process_video(video_path: str, output_file: str):
+    """Extract holistic keypoints from a video and save them as a .npy file.
+
+    When the frame count reported by OpenCV exceeds c.MAX_FRAME, frames are
+    sampled at a fixed integer stride so that at most c.MAX_FRAME frames are
+    run through MediaPipe. The per-frame keypoint vectors are stacked into a
+    2-D array (frames x features) and written to `output_file`. Nothing is
+    written when no frame was read or when every extracted value is zero.
+
+    Args:
+        video_path: Path of the video to read.
+        output_file: Destination .npy path; its parent directory is created
+            if it does not exist.
+
+    Returns:
+        None. Failure to open the video is logged, not raised.
+    """
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        logger.error(f"Error opening video: {video_path}")
+        return
+
+    all_landmarks = []
+    try:
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        frame_skip = 1
+        if total_frames > c.MAX_FRAME:
+            # Integer stride: ceil keeps the sampled count <= c.MAX_FRAME.
+            frame_skip = int(np.ceil(total_frames / c.MAX_FRAME))
+
+        with mp_holistic.Holistic(
+            model_complexity=1,
+            refine_face_landmarks=True,
+            min_detection_confidence=0.5,
+            min_tracking_confidence=0.5,
+        ) as holistic:
+            current_frame = 0
+            while current_frame < total_frames:
+                ret, frame = cap.read()
+                if not ret:
+                    # Fewer frames were decodable than the container reported.
+                    break
+                if current_frame % frame_skip == 0:
+                    results = mediapipe_detection(frame, holistic)
+                    all_landmarks.append(extract_keypoints(results))
+                current_frame += 1
+    finally:
+        # Release the capture even if detection raises part-way through.
+        cap.release()
+
+    data_array = np.array(all_landmarks)
+    if data_array.size > 0 and np.any(data_array):
+        output_dir = os.path.dirname(output_file)
+        if output_dir:
+            # os.makedirs("") raises, so only create a directory when the
+            # output path actually has a directory component.
+            os.makedirs(output_dir, exist_ok=True)
+        np.save(output_file, data_array)
+        logger.info(f"Saved landmarks to {output_file}")
+    else:
+        logger.info(f"No valid landmarks for video {video_path}, not saving.")
+
+
+def main():
+    """Extract keypoints for every How2Sign video that has no .npy output yet.
+
+    Videos are discovered in c.H2S_VIDEO_DIR; a video is skipped when a .npy
+    file with the same base name already exists in c.H2S_OUTPUT_DIR. The
+    remaining videos are processed in parallel with c.MAX_WORKERS worker
+    processes. A failure in one video is logged and does not stop the rest.
+    """
+    available_videos = find_video_files(c.H2S_VIDEO_DIR)
+    # A set keeps the "already processed" check O(1) per video.
+    existed_files = set(find_video_files(c.H2S_OUTPUT_DIR, pattern="*.npy"))
+
+    tasks = []
+    for video_name in available_videos:
+        video_path = os.path.join(c.H2S_VIDEO_DIR, f"{video_name}.mp4")
+        output_file = os.path.join(c.H2S_OUTPUT_DIR, f"{video_name}.npy")
+        if video_name not in existed_files:
+            tasks.append((video_path, output_file))
+        else:
+            logger.info(f"Skipping existing file: {output_file}")
+
+    # Use multiple processors
+    with ProcessPoolExecutor(max_workers=c.MAX_WORKERS) as executor:
+        futures = {
+            executor.submit(process_video, video_path, output_file): video_path
+            for video_path, output_file in tasks
+        }
+        # An exception raised in a worker is stored on its future and only
+        # surfaces when result() is called; without this it is lost silently.
+        for future, video_path in futures.items():
+            try:
+                future.result()
+            except Exception:
+                logger.exception(f"Failed to process video: {video_path}")
+
+
+# Run the preprocessing only when this file is executed as a script.
+# With the "spawn" start method, worker processes re-import the main module;
+# this guard keeps them from calling main() again.
+if __name__ == "__main__":
+    main()
diff --git a/conf.py b/conf.py
@@ -1,31 +1,48 @@
 import os
 
-
+# gpu setting
 USE_GPU = True
 
+# how2sign dataset
+# These paths are relative (no ROOT prefix, unlike the youtube asl paths below),
+# so they resolve against the current working directory.
+# H2S_mediapipe.py looks for *.mp4 files in H2S_VIDEO_DIR and writes one .npy
+# file per video into H2S_OUTPUT_DIR.
+H2S_VIDEO_DIR = "dataset/how2sign/"
+H2S_OUTPUT_DIR = "dataset/how2sign/npy/"
 
-# video downloader
+# youtube asl dataset
 ROOT = os.path.dirname(os.path.abspath(__file__))
-ID = 'youtube-asl_youtube_asl_video_ids.txt'
-VIDEO_DIR = f'{ROOT}/dataset/origin/'
-OUTPUT_DIR = f'{ROOT}/dataset/10fps/'
-TRANSCRIPT_DIR = f'{ROOT}/dataset/transcript/'
-# transcript_dir = f'{root}/dataset/test/'
-CSV_FILE = f'video_info.csv'
-
+ID = "youtube-asl_youtube_asl_video_ids.txt"
+VIDEO_DIR = f"{ROOT}/dataset/origin/"
+OUTPUT_DIR = f"{ROOT}/dataset/npy/"
+TRANSCRIPT_DIR = f"{ROOT}/dataset/transcript/"
+# NOTE(review): the f-prefix below has no placeholders; a plain string literal
+# would be equivalent.
+CSV_FILE = f"video_info.csv"
+# Upper bound on the number of frames sampled per video:
+# H2S_mediapipe.process_video skips frames at a fixed stride when a video
+# reports more frames than this.
+MAX_FRAME = 512
 DURATION = 16
 OVERLAP = 4
-MAX_WORKERS = 1
-LANGUAGE = ['en', 'ase', 'en-US', 'en-CA', 'en-GB', 'en-AU', 'en-NZ', 'en-IN', 'en-ZA', 'en-IE', 'en-SG', 'en-PH', 'en-NG', 'en-PK', 'en-JM']
+# Passed as max_workers to the ProcessPoolExecutor in H2S_mediapipe.main.
+MAX_WORKERS = 8
+LANGUAGE = [
+    "en",
+    "ase",
+    "en-US",
+    "en-CA",
+    "en-GB",
+    "en-AU",
+    "en-NZ",
+    "en-IN",
+    "en-ZA",
+    "en-IE",
+    "en-SG",
+    "en-PH",
+    "en-NG",
+    "en-PK",
+    "en-JM",
+]
 
-# mediapipe
-POSE_IDX = [11, 12, 13, 14, 23, 24]
-FACE_IDX = [0, 4, 13, 14, 17, 33, 37, 39, 46, 52, 55, 61, 64, 81, 82, 93,
-                133, 151, 152, 159, 172, 178, 181, 263, 269, 276, 282, 285, 291,
-                294, 311, 323, 362, 386, 397, 468, 473]
+# mediapipe landmark indices
 HAND_IDX = list(range(21))
+# Pose landmark indices kept by extract_keypoints in H2S_mediapipe.py.
+POSE_IDX = [11, 12, 13, 14, 23, 24]
+# Face landmark indices kept by extract_keypoints in H2S_mediapipe.py.
+# NOTE(review): 468 and 473 lie beyond the 468-point base face mesh, so they
+# presumably require refine_face_landmarks=True (which
+# H2S_mediapipe.process_video sets) — confirm against the MediaPipe docs.
+FACE_IDX = [
+    0, 4, 13, 14, 17, 33, 37, 39, 46, 52, 55, 61, 64, 81, 82, 93,
+    133, 151, 152, 159, 172, 178, 181, 263, 269, 276, 282, 285, 291,
+    294, 311, 323, 362, 386, 397, 468, 473
+]
+
 
-# import numpy as np
-# data = np.load('dataset/10fps/UznY5SfH0RI-015.npy')
-# print(data.shape)
-# print(data)
diff --git a/test.py b/test.py