WIP

Cadene · Cadene · commit 9f841c305e14 · 2024-04-22T09:43:52.000Z
diff --git a/download_and_upload_dataset.py b/download_and_upload_dataset.py
@@ -3,13 +3,11 @@
 useless dependencies when using datasets.
 """
 
-from dataclasses import dataclass, field
 import io
 import json
 import pickle
 import shutil
 from pathlib import Path
-from typing import Any, ClassVar, Optional
 
 import einops
 import h5py
@@ -23,12 +21,10 @@
 
 from lerobot.common.datasets.utils import compute_stats, flatten_dict, hf_transform_to_torch
 
-
-
 # @dataclass
 # class VideoFrame:
 #     """
-   
+
 #     Example:
 
 #     ```py
@@ -56,6 +52,7 @@
 #     def decode_example(self, value):
 #         return value
 
+
 def download_and_upload(root, revision, dataset_id):
     # TODO(rcadene, adilzouitine): add community_id/user_id (e.g. "lerobot", "cadene") or repo_id (e.g. "lerobot/pusht")
     if "pusht" in dataset_id:
@@ -310,7 +307,7 @@ def download_and_upload_pusht(root, revision, dataset_id="pusht", fps=10):
     data_dict = concatenate_episodes(ep_dicts)
 
     features = {
-        #"observation.image": Image(),
+        # "observation.image": Image(),
         "observation.image": Value(dtype="int64", id="video"),
         "observation.state": Sequence(
             length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
diff --git a/lerobot/common/datasets/lerobot_dataset.py b/lerobot/common/datasets/lerobot_dataset.py
@@ -89,7 +89,6 @@ def __getitem__(self, idx):
         return item
 
 
-
 def yuv_to_rgb(frames):
     assert frames.dtype == torch.uint8
     assert frames.ndim == 4
@@ -135,7 +134,7 @@ def decode_video_frame_torchaudio(video_path, timestamp):
     device = "cpu"
     width = None
     height = None
-    image_format = "rgb" # or "yuv"
+    image_format = "rgb"  # or "yuv"
     frame_rate = None
 
     filter_desc = []
@@ -172,7 +171,7 @@ def decode_video_frame_torchaudio(video_path, timestamp):
             if resize_height:
                 scales.append(f"height={height}")
             filter_desc.append(f"scale={':'.join(scales)}")
-    
+
     # choice of format
     if image_format is not None:
         if device == "cuda":
@@ -196,6 +195,7 @@ def decode_video_frame_torchaudio(video_path, timestamp):
     # create a stream and load a certain number of frame at a certain frame rate
     # TODO(rcadene): make sure it's the most optimal way to do it
     from torchaudio.io import StreamReader
+
     s = StreamReader(str(video_path))
     s.seek(timestamp)
     s.add_video_stream(**video_stream_kwgs)
@@ -204,12 +204,13 @@ def decode_video_frame_torchaudio(video_path, timestamp):
 
     if "yuv" in image_format:
         frames = yuv_to_rgb(frames)
-    
+
     if len(frames) > 1:
         return frames
 
     return frames[0]
 
+
 def decode_video_frames_ffmpegio(video_path, timestamp):
     num_contiguous_frames = 1
     device = "cpu"
@@ -220,11 +221,13 @@ def decode_video_frames_ffmpegio(video_path, timestamp):
     )
     frames = torch.from_numpy(frames)
     import einops
+
     frames = einops.rearrange(frames, "b h w c -> b c h w")
     if device == "cuda":
         frames = frames.to(device)
     return frames
 
+
 def _decode_frames_decord(video_path, timestamp):
     num_contiguous_frames = 1
     device = "cpu"
@@ -239,4 +242,4 @@ def _decode_frames_decord(video_path, timestamp):
     #     frames = vr.get_batch([frame_id])
     # frames = torch.from_numpy(frames.asnumpy())
     # frames = einops.rearrange(frames, "b h w c -> b c h w")
-    # return frames
+    # return frames