merge but remove refactor of save_camera_images

2024-10-16 18:10:01 +02:00
parent cfa5ce02b8 29f6abc997
commit eeea3e5d1d
55 changed files with 7644 additions and 2545 deletions
--- a/lerobot/init.py
+++ b/lerobot/init.py
@@ -28,6 +28,8 @@ Example:
        print(lerobot.available_policies)
        print(lerobot.available_policies_per_env)
        print(lerobot.available_robots)
+        print(lerobot.available_cameras)
+        print(lerobot.available_motors)
    ```

 When implementing a new dataset loadable with LeRobotDataset follow these steps:
@@ -198,12 +200,25 @@ available_robots = [
    "aloha",
 ]

+# lists all available cameras from `lerobot/common/robot_devices/cameras`
+available_cameras = [
+    "opencv",
+    "intelrealsense",
+]
+
+# lists all available motors from `lerobot/common/robot_devices/motors`
+available_motors = [
+    "dynamixel",
+]
+
 # keys and values refer to yaml files
 available_policies_per_env = {
    "aloha": ["act"],
    "pusht": ["diffusion", "vqbet"],
    "xarm": ["tdmpc"],
-    "dora_aloha_real": ["act_real"],
+    "koch_real": ["act_koch_real"],
+    "aloha_real": ["act_aloha_real"],
+    "dora_aloha_real": ["act_aloha_real"],
 }

 env_task_pairs = [(env, task) for env, tasks in available_tasks_per_env.items() for task in tasks]
--- a/lerobot/common/datasets/compute_stats.py
+++ b/lerobot/common/datasets/compute_stats.py
@@ -68,7 +68,7 @@ def get_stats_einops_patterns(dataset, num_workers=0):
    return stats_patterns


-def compute_stats(dataset, batch_size=32, num_workers=16, max_num_samples=None):
+def compute_stats(dataset, batch_size=8, num_workers=8, max_num_samples=None):
    """Compute mean/std and min/max statistics of all data keys in a LeRobotDataset."""
    if max_num_samples is None:
        max_num_samples = len(dataset)
--- a/lerobot/common/datasets/populate_dataset.py
+++ b/lerobot/common/datasets/populate_dataset.py
@@ -0,0 +1,468 @@
+"""Functions to create an empty dataset, and populate it with frames."""
+# TODO(rcadene, aliberts): to adapt as class methods of next version of LeRobotDataset
+
+import concurrent
+import json
+import logging
+import multiprocessing
+import shutil
+from pathlib import Path
+
+import torch
+import tqdm
+from PIL import Image
+
+from lerobot.common.datasets.compute_stats import compute_stats
+from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset
+from lerobot.common.datasets.push_dataset_to_hub.aloha_hdf5_format import to_hf_dataset
+from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes, get_default_encoding
+from lerobot.common.datasets.utils import calculate_episode_data_index, create_branch
+from lerobot.common.datasets.video_utils import encode_video_frames
+from lerobot.common.utils.utils import log_say
+from lerobot.scripts.push_dataset_to_hub import (
+    push_dataset_card_to_hub,
+    push_meta_data_to_hub,
+    push_videos_to_hub,
+    save_meta_data,
+)
+
+########################################################################################
+# Asynchrounous saving of images on disk
+########################################################################################
+
+
+def safe_stop_image_writer(func):
+    # TODO(aliberts): Allow to pass custom exceptions
+    # (e.g. ThreadServiceExit, KeyboardInterrupt, SystemExit, UnpluggedError, DynamixelCommError)
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except Exception as e:
+            image_writer = kwargs.get("dataset", {}).get("image_writer")
+            if image_writer is not None:
+                print("Waiting for image writer to terminate...")
+                stop_image_writer(image_writer, timeout=20)
+            raise e
+
+    return wrapper
+
+
+def save_image(img_tensor, key, frame_index, episode_index, videos_dir: str):
+    img = Image.fromarray(img_tensor.numpy())
+    path = Path(videos_dir) / f"{key}_episode_{episode_index:06d}" / f"frame_{frame_index:06d}.png"
+    path.parent.mkdir(parents=True, exist_ok=True)
+    img.save(str(path), quality=100)
+
+
+def loop_to_save_images_in_threads(image_queue, num_threads):
+    if num_threads < 1:
+        raise NotImplementedError(f"Only `num_threads>=1` is supported for now, but {num_threads=} given.")
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
+        futures = []
+        while True:
+            # Blocks until a frame is available
+            frame_data = image_queue.get()
+
+            # As usually done, exit loop when receiving None to stop the worker
+            if frame_data is None:
+                break
+
+            image, key, frame_index, episode_index, videos_dir = frame_data
+            futures.append(executor.submit(save_image, image, key, frame_index, episode_index, videos_dir))
+
+        # Before exiting function, wait for all threads to complete
+        with tqdm.tqdm(total=len(futures), desc="Writing images") as progress_bar:
+            concurrent.futures.wait(futures)
+            progress_bar.update(len(futures))
+
+
+def start_image_writer_processes(image_queue, num_processes, num_threads_per_process):
+    if num_processes < 1:
+        raise ValueError(f"Only `num_processes>=1` is supported, but {num_processes=} given.")
+
+    if num_threads_per_process < 1:
+        raise NotImplementedError(
+            "Only `num_threads_per_process>=1` is supported for now, but {num_threads_per_process=} given."
+        )
+
+    processes = []
+    for _ in range(num_processes):
+        process = multiprocessing.Process(
+            target=loop_to_save_images_in_threads,
+            args=(image_queue, num_threads_per_process),
+        )
+        process.start()
+        processes.append(process)
+    return processes
+
+
+def stop_processes(processes, queue, timeout):
+    # Send None to each process to signal them to stop
+    for _ in processes:
+        queue.put(None)
+
+    # Wait maximum 20 seconds for all processes to terminate
+    for process in processes:
+        process.join(timeout=timeout)
+
+    # If not terminated after 20 seconds, force termination
+    if process.is_alive():
+        process.terminate()
+
+    # Close the queue, no more items can be put in the queue
+    queue.close()
+
+    # Ensure all background queue threads have finished
+    queue.join_thread()
+
+
+def start_image_writer(num_processes, num_threads):
+    """This function abstract away the initialisation of processes or/and threads to
+    save images on disk asynchrounously, which is critical to control a robot and record data
+    at a high frame rate.
+
+    When `num_processes=0`, it returns a dictionary containing a threads pool of size `num_threads`.
+    When `num_processes>0`, it returns a dictionary containing a processes pool of size `num_processes`,
+    where each subprocess starts their own threads pool of size `num_threads`.
+
+    The optimal number of processes and threads depends on your computer capabilities.
+    We advise to use 4 threads per camera with 0 processes. If the fps is not stable, try to increase or lower
+    the number of threads. If it is still not stable, try to use 1 subprocess, or more.
+    """
+    image_writer = {}
+
+    if num_processes == 0:
+        futures = []
+        threads_pool = concurrent.futures.ThreadPoolExecutor(max_workers=num_threads)
+        image_writer["threads_pool"], image_writer["futures"] = threads_pool, futures
+    else:
+        # TODO(rcadene): When using num_processes>1, `multiprocessing.Manager().Queue()`
+        # might be better than `multiprocessing.Queue()`. Source: https://www.geeksforgeeks.org/python-multiprocessing-queue-vs-multiprocessing-manager-queue
+        image_queue = multiprocessing.Queue()
+        processes_pool = start_image_writer_processes(
+            image_queue, num_processes=num_processes, num_threads_per_process=num_threads
+        )
+        image_writer["processes_pool"], image_writer["image_queue"] = processes_pool, image_queue
+
+    return image_writer
+
+
+def async_save_image(image_writer, image, key, frame_index, episode_index, videos_dir):
+    """This function abstract away the saving of an image on disk asynchrounously. It uses a dictionary
+    called image writer which contains either a pool of processes or a pool of threads.
+    """
+    if "threads_pool" in image_writer:
+        threads_pool, futures = image_writer["threads_pool"], image_writer["futures"]
+        futures.append(threads_pool.submit(save_image, image, key, frame_index, episode_index, videos_dir))
+    else:
+        image_queue = image_writer["image_queue"]
+        image_queue.put((image, key, frame_index, episode_index, videos_dir))
+
+
+def stop_image_writer(image_writer, timeout):
+    if "threads_pool" in image_writer:
+        futures = image_writer["futures"]
+        # Before exiting function, wait for all threads to complete
+        with tqdm.tqdm(total=len(futures), desc="Writing images") as progress_bar:
+            concurrent.futures.wait(futures, timeout=timeout)
+            progress_bar.update(len(futures))
+    else:
+        processes_pool, image_queue = image_writer["processes_pool"], image_writer["image_queue"]
+        stop_processes(processes_pool, image_queue, timeout=timeout)
+
+
+########################################################################################
+# Functions to initialize, resume and populate a dataset
+########################################################################################
+
+
+def init_dataset(
+    repo_id,
+    root,
+    force_override,
+    fps,
+    video,
+    write_images,
+    num_image_writer_processes,
+    num_image_writer_threads,
+):
+    local_dir = Path(root) / repo_id
+    if local_dir.exists() and force_override:
+        shutil.rmtree(local_dir)
+
+    episodes_dir = local_dir / "episodes"
+    episodes_dir.mkdir(parents=True, exist_ok=True)
+
+    videos_dir = local_dir / "videos"
+    videos_dir.mkdir(parents=True, exist_ok=True)
+
+    # Logic to resume data recording
+    rec_info_path = episodes_dir / "data_recording_info.json"
+    if rec_info_path.exists():
+        with open(rec_info_path) as f:
+            rec_info = json.load(f)
+        num_episodes = rec_info["last_episode_index"] + 1
+    else:
+        num_episodes = 0
+
+    dataset = {
+        "repo_id": repo_id,
+        "local_dir": local_dir,
+        "videos_dir": videos_dir,
+        "episodes_dir": episodes_dir,
+        "fps": fps,
+        "video": video,
+        "rec_info_path": rec_info_path,
+        "num_episodes": num_episodes,
+    }
+
+    if write_images:
+        # Initialize processes or/and threads dedicated to save images on disk asynchronously,
+        # which is critical to control a robot and record data at a high frame rate.
+        image_writer = start_image_writer(
+            num_processes=num_image_writer_processes,
+            num_threads=num_image_writer_threads,
+        )
+        dataset["image_writer"] = image_writer
+
+    return dataset
+
+
+def add_frame(dataset, observation, action):
+    if "current_episode" not in dataset:
+        # initialize episode dictionary
+        ep_dict = {}
+        for key in observation:
+            if key not in ep_dict:
+                ep_dict[key] = []
+        for key in action:
+            if key not in ep_dict:
+                ep_dict[key] = []
+
+        ep_dict["episode_index"] = []
+        ep_dict["frame_index"] = []
+        ep_dict["timestamp"] = []
+        ep_dict["next.done"] = []
+
+        dataset["current_episode"] = ep_dict
+        dataset["current_frame_index"] = 0
+
+    ep_dict = dataset["current_episode"]
+    episode_index = dataset["num_episodes"]
+    frame_index = dataset["current_frame_index"]
+    videos_dir = dataset["videos_dir"]
+    video = dataset["video"]
+    fps = dataset["fps"]
+
+    ep_dict["episode_index"].append(episode_index)
+    ep_dict["frame_index"].append(frame_index)
+    ep_dict["timestamp"].append(frame_index / fps)
+    ep_dict["next.done"].append(False)
+
+    img_keys = [key for key in observation if "image" in key]
+    non_img_keys = [key for key in observation if "image" not in key]
+
+    # Save all observed modalities except images
+    for key in non_img_keys:
+        ep_dict[key].append(observation[key])
+
+    # Save actions
+    for key in action:
+        ep_dict[key].append(action[key])
+
+    if "image_writer" not in dataset:
+        dataset["current_frame_index"] += 1
+        return
+
+    # Save images
+    image_writer = dataset["image_writer"]
+    for key in img_keys:
+        imgs_dir = videos_dir / f"{key}_episode_{episode_index:06d}"
+        async_save_image(
+            image_writer,
+            image=observation[key],
+            key=key,
+            frame_index=frame_index,
+            episode_index=episode_index,
+            videos_dir=str(videos_dir),
+        )
+
+        if video:
+            fname = f"{key}_episode_{episode_index:06d}.mp4"
+            frame_info = {"path": f"videos/{fname}", "timestamp": frame_index / fps}
+        else:
+            frame_info = str(imgs_dir / f"frame_{frame_index:06d}.png")
+
+        ep_dict[key].append(frame_info)
+
+    dataset["current_frame_index"] += 1
+
+
+def delete_current_episode(dataset):
+    del dataset["current_episode"]
+    del dataset["current_frame_index"]
+
+    # delete temporary images
+    episode_index = dataset["num_episodes"]
+    videos_dir = dataset["videos_dir"]
+    for tmp_imgs_dir in videos_dir.glob(f"*_episode_{episode_index:06d}"):
+        shutil.rmtree(tmp_imgs_dir)
+
+
+def save_current_episode(dataset):
+    episode_index = dataset["num_episodes"]
+    ep_dict = dataset["current_episode"]
+    episodes_dir = dataset["episodes_dir"]
+    rec_info_path = dataset["rec_info_path"]
+
+    ep_dict["next.done"][-1] = True
+
+    for key in ep_dict:
+        if "observation" in key and "image" not in key:
+            ep_dict[key] = torch.stack(ep_dict[key])
+
+    ep_dict["action"] = torch.stack(ep_dict["action"])
+    ep_dict["episode_index"] = torch.tensor(ep_dict["episode_index"])
+    ep_dict["frame_index"] = torch.tensor(ep_dict["frame_index"])
+    ep_dict["timestamp"] = torch.tensor(ep_dict["timestamp"])
+    ep_dict["next.done"] = torch.tensor(ep_dict["next.done"])
+
+    ep_path = episodes_dir / f"episode_{episode_index}.pth"
+    torch.save(ep_dict, ep_path)
+
+    rec_info = {
+        "last_episode_index": episode_index,
+    }
+    with open(rec_info_path, "w") as f:
+        json.dump(rec_info, f)
+
+    # force re-initialization of episode dictionnary during add_frame
+    del dataset["current_episode"]
+
+    dataset["num_episodes"] += 1
+
+
+def encode_videos(dataset, image_keys, play_sounds):
+    log_say("Encoding videos", play_sounds)
+
+    num_episodes = dataset["num_episodes"]
+    videos_dir = dataset["videos_dir"]
+    local_dir = dataset["local_dir"]
+    fps = dataset["fps"]
+
+    # Use ffmpeg to convert frames stored as png into mp4 videos
+    for episode_index in tqdm.tqdm(range(num_episodes)):
+        for key in image_keys:
+            # key = f"observation.images.{name}"
+            tmp_imgs_dir = videos_dir / f"{key}_episode_{episode_index:06d}"
+            fname = f"{key}_episode_{episode_index:06d}.mp4"
+            video_path = local_dir / "videos" / fname
+            if video_path.exists():
+                # Skip if video is already encoded. Could be the case when resuming data recording.
+                continue
+            # note: `encode_video_frames` is a blocking call. Making it asynchronous shouldn't speedup encoding,
+            # since video encoding with ffmpeg is already using multithreading.
+            encode_video_frames(tmp_imgs_dir, video_path, fps, overwrite=True)
+            shutil.rmtree(tmp_imgs_dir)
+
+
+def from_dataset_to_lerobot_dataset(dataset, play_sounds):
+    log_say("Consolidate episodes", play_sounds)
+
+    num_episodes = dataset["num_episodes"]
+    episodes_dir = dataset["episodes_dir"]
+    videos_dir = dataset["videos_dir"]
+    video = dataset["video"]
+    fps = dataset["fps"]
+    repo_id = dataset["repo_id"]
+
+    ep_dicts = []
+    for episode_index in tqdm.tqdm(range(num_episodes)):
+        ep_path = episodes_dir / f"episode_{episode_index}.pth"
+        ep_dict = torch.load(ep_path)
+        ep_dicts.append(ep_dict)
+    data_dict = concatenate_episodes(ep_dicts)
+
+    if video:
+        image_keys = [key for key in data_dict if "image" in key]
+        encode_videos(dataset, image_keys, play_sounds)
+
+    hf_dataset = to_hf_dataset(data_dict, video)
+    episode_data_index = calculate_episode_data_index(hf_dataset)
+
+    info = {
+        "codebase_version": CODEBASE_VERSION,
+        "fps": fps,
+        "video": video,
+    }
+    if video:
+        info["encoding"] = get_default_encoding()
+
+    lerobot_dataset = LeRobotDataset.from_preloaded(
+        repo_id=repo_id,
+        hf_dataset=hf_dataset,
+        episode_data_index=episode_data_index,
+        info=info,
+        videos_dir=videos_dir,
+    )
+
+    return lerobot_dataset
+
+
+def save_lerobot_dataset_on_disk(lerobot_dataset):
+    hf_dataset = lerobot_dataset.hf_dataset
+    info = lerobot_dataset.info
+    stats = lerobot_dataset.stats
+    episode_data_index = lerobot_dataset.episode_data_index
+    local_dir = lerobot_dataset.videos_dir.parent
+    meta_data_dir = local_dir / "meta_data"
+
+    hf_dataset = hf_dataset.with_format(None)  # to remove transforms that cant be saved
+    hf_dataset.save_to_disk(str(local_dir / "train"))
+
+    save_meta_data(info, stats, episode_data_index, meta_data_dir)
+
+
+def push_lerobot_dataset_to_hub(lerobot_dataset, tags):
+    hf_dataset = lerobot_dataset.hf_dataset
+    local_dir = lerobot_dataset.videos_dir.parent
+    videos_dir = lerobot_dataset.videos_dir
+    repo_id = lerobot_dataset.repo_id
+    video = lerobot_dataset.video
+    meta_data_dir = local_dir / "meta_data"
+
+    if not (local_dir / "train").exists():
+        raise ValueError(
+            "You need to run `save_lerobot_dataset_on_disk(lerobot_dataset)` before pushing to the hub."
+        )
+
+    hf_dataset.push_to_hub(repo_id, revision="main")
+    push_meta_data_to_hub(repo_id, meta_data_dir, revision="main")
+    push_dataset_card_to_hub(repo_id, revision="main", tags=tags)
+    if video:
+        push_videos_to_hub(repo_id, videos_dir, revision="main")
+    create_branch(repo_id, repo_type="dataset", branch=CODEBASE_VERSION)
+
+
+def create_lerobot_dataset(dataset, run_compute_stats, push_to_hub, tags, play_sounds):
+    if "image_writer" in dataset:
+        logging.info("Waiting for image writer to terminate...")
+        image_writer = dataset["image_writer"]
+        stop_image_writer(image_writer, timeout=20)
+
+    lerobot_dataset = from_dataset_to_lerobot_dataset(dataset, play_sounds)
+
+    if run_compute_stats:
+        log_say("Computing dataset statistics", play_sounds)
+        lerobot_dataset.stats = compute_stats(lerobot_dataset)
+    else:
+        logging.info("Skipping computation of the dataset statistics")
+        lerobot_dataset.stats = {}
+
+    save_lerobot_dataset_on_disk(lerobot_dataset)
+
+    if push_to_hub:
+        push_lerobot_dataset_to_hub(lerobot_dataset, tags)
+
+    return lerobot_dataset
--- a/lerobot/common/datasets/utils.py
+++ b/lerobot/common/datasets/utils.py
@@ -32,7 +32,7 @@ DATASET_CARD_TEMPLATE = """
 ---
 # Metadata will go there
 ---
-This dataset was created using [🤗 LeRobot](https://github.com/huggingface/lerobot).
+This dataset was created using [LeRobot](https://github.com/huggingface/lerobot).

 """

--- a/lerobot/common/logger.py
+++ b/lerobot/common/logger.py
@@ -189,7 +189,7 @@ class Logger:
            training_state["scheduler"] = scheduler.state_dict()
        torch.save(training_state, save_dir / self.training_state_file_name)

-    def save_checkpont(
+    def save_checkpoint(
        self,
        train_step: int,
        policy: Policy,
--- a/lerobot/common/robot_devices/cameras/intelrealsense.py
+++ b/lerobot/common/robot_devices/cameras/intelrealsense.py
@@ -2,41 +2,151 @@
 This file contains utilities for recording frames from Intel Realsense cameras.
 """

+import argparse
+import concurrent.futures
+import logging
+import math
+import shutil
 import threading
 import time
 import traceback
+from collections import Counter
 from dataclasses import dataclass, replace
+from pathlib import Path
 from threading import Thread

-import cv2
 import numpy as np
-import pyrealsense2 as rs
+from PIL import Image

 from lerobot.common.robot_devices.utils import (
    RobotDeviceAlreadyConnectedError,
    RobotDeviceNotConnectedError,
+    busy_wait,
 )
 from lerobot.common.utils.utils import capture_timestamp_utc

 SERIAL_NUMBER_INDEX = 1


-def find_camera_indices(raise_when_empty=True) -> list[int]:
+def find_cameras(raise_when_empty=True, mock=False) -> list[dict]:
    """
-    Find the serial numbers of the Intel RealSense cameras
+    Find the names and the serial numbers of the Intel RealSense cameras
    connected to the computer.
    """
-    camera_ids = []
+    if mock:
+        import tests.mock_pyrealsense2 as rs
+    else:
+        import pyrealsense2 as rs
+
+    cameras = []
    for device in rs.context().query_devices():
        serial_number = int(device.get_info(rs.camera_info(SERIAL_NUMBER_INDEX)))
-        camera_ids.append(serial_number)
+        name = device.get_info(rs.camera_info.name)
+        cameras.append(
+            {
+                "serial_number": serial_number,
+                "name": name,
+            }
+        )

-    if raise_when_empty and len(camera_ids) == 0:
+    if raise_when_empty and len(cameras) == 0:
        raise OSError(
            "Not a single camera was detected. Try re-plugging, or re-installing `librealsense` and its python wrapper `pyrealsense2`, or updating the firmware."
        )

-    return camera_ids
+    return cameras
+
+
+def save_image(img_array, serial_number, frame_index, images_dir):
+    try:
+        img = Image.fromarray(img_array)
+        path = images_dir / f"camera_{serial_number}_frame_{frame_index:06d}.png"
+        path.parent.mkdir(parents=True, exist_ok=True)
+        img.save(str(path), quality=100)
+        logging.info(f"Saved image: {path}")
+    except Exception as e:
+        logging.error(f"Failed to save image for camera {serial_number} frame {frame_index}: {e}")
+
+
+def save_images_from_cameras(
+    images_dir: Path,
+    serial_numbers: list[int] | None = None,
+    fps=None,
+    width=None,
+    height=None,
+    record_time_s=2,
+    mock=False,
+):
+    """
+    Initializes all the cameras and saves images to the directory. Useful to visually identify the camera
+    associated to a given serial number.
+    """
+    if serial_numbers is None or len(serial_numbers) == 0:
+        camera_infos = find_cameras(mock=mock)
+        serial_numbers = [cam["serial_number"] for cam in camera_infos]
+
+    if mock:
+        import tests.mock_cv2 as cv2
+    else:
+        import cv2
+
+    print("Connecting cameras")
+    cameras = []
+    for cam_sn in serial_numbers:
+        print(f"{cam_sn=}")
+        camera = IntelRealSenseCamera(cam_sn, fps=fps, width=width, height=height, mock=mock)
+        camera.connect()
+        print(
+            f"IntelRealSenseCamera({camera.serial_number}, fps={camera.fps}, width={camera.width}, height={camera.height}, color_mode={camera.color_mode})"
+        )
+        cameras.append(camera)
+
+    images_dir = Path(images_dir)
+    if images_dir.exists():
+        shutil.rmtree(
+            images_dir,
+        )
+    images_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"Saving images to {images_dir}")
+    frame_index = 0
+    start_time = time.perf_counter()
+    try:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+            while True:
+                now = time.perf_counter()
+
+                for camera in cameras:
+                    # If we use async_read when fps is None, the loop will go full speed, and we will end up
+                    # saving the same images from the cameras multiple times until the RAM/disk is full.
+                    image = camera.read() if fps is None else camera.async_read()
+                    if image is None:
+                        print("No Frame")
+
+                    bgr_converted_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+
+                    executor.submit(
+                        save_image,
+                        bgr_converted_image,
+                        camera.serial_number,
+                        frame_index,
+                        images_dir,
+                    )
+
+                if fps is not None:
+                    dt_s = time.perf_counter() - now
+                    busy_wait(1 / fps - dt_s)
+
+                if time.perf_counter() - start_time > record_time_s:
+                    break
+
+                print(f"Frame: {frame_index:04d}\tLatency (ms): {(time.perf_counter() - now) * 1000:.2f}")
+
+                frame_index += 1
+    finally:
+        print(f"Images have been saved to {images_dir}")
+        for camera in cameras:
+            camera.disconnect()


@dataclass
@@ -50,6 +160,7 @@ class IntelRealSenseCameraConfig:
    IntelRealSenseCameraConfig(90, 640, 480)
    IntelRealSenseCameraConfig(30, 1280, 720)
    IntelRealSenseCameraConfig(30, 640, 480, use_depth=True)
+    IntelRealSenseCameraConfig(30, 640, 480, rotation=90)
    ```
    """

@@ -59,6 +170,8 @@ class IntelRealSenseCameraConfig:
    color_mode: str = "rgb"
    use_depth: bool = False
    force_hardware_reset: bool = True
+    rotation: int | None = None
+    mock: bool = False

    def __post_init__(self):
        if self.color_mode not in ["rgb", "bgr"]:
@@ -66,19 +179,23 @@ class IntelRealSenseCameraConfig:
                f"`color_mode` is expected to be 'rgb' or 'bgr', but {self.color_mode} is provided."
            )

-        if (self.fps or self.width or self.height) and not (self.fps and self.width and self.height):
+        at_least_one_is_not_none = self.fps is not None or self.width is not None or self.height is not None
+        at_least_one_is_none = self.fps is None or self.width is None or self.height is None
+        if at_least_one_is_not_none and at_least_one_is_none:
            raise ValueError(
                "For `fps`, `width` and `height`, either all of them need to be set, or none of them, "
                f"but {self.fps=}, {self.width=}, {self.height=} were provided."
            )

+        if self.rotation not in [-90, None, 90, 180]:
+            raise ValueError(f"`rotation` must be in [-90, None, 90, 180] (got {self.rotation})")
+

 class IntelRealSenseCamera:
    """
    The IntelRealSenseCamera class is similar to OpenCVCamera class but adds additional features for Intel Real Sense cameras:
-    - camera_index corresponds to the serial number of the camera,
-    - camera_index won't randomly change as it can be the case of OpenCVCamera for Linux,
-    - read is more reliable than OpenCVCamera,
+    - is instantiated with the serial number of the camera - won't randomly change as it can be the case of OpenCVCamera for Linux,
+    - can also be instantiated with the camera's name — if it's unique — using IntelRealSenseCamera.init_from_name(),
    - depth map can be returned.

    To find the camera indices of your cameras, you can run our utility script that will save a few frames for each camera:
@@ -93,8 +210,10 @@ class IntelRealSenseCamera:

    Example of usage:
    ```python
-    camera_index = 128422271347
-    camera = IntelRealSenseCamera(camera_index)
+    # Instantiate with its serial number
+    camera = IntelRealSenseCamera(128422271347)
+    # Or by its name if it's unique
+    camera = IntelRealSenseCamera.init_from_name("Intel RealSense D405")
    camera.connect()
    color_image = camera.read()
    # when done using the camera, consider disconnecting
@@ -103,19 +222,19 @@ class IntelRealSenseCamera:

    Example of changing default fps, width, height and color_mode:
    ```python
-    camera = IntelRealSenseCamera(camera_index, fps=30, width=1280, height=720)
+    camera = IntelRealSenseCamera(serial_number, fps=30, width=1280, height=720)
    camera = connect()  # applies the settings, might error out if these settings are not compatible with the camera

-    camera = IntelRealSenseCamera(camera_index, fps=90, width=640, height=480)
+    camera = IntelRealSenseCamera(serial_number, fps=90, width=640, height=480)
    camera = connect()

-    camera = IntelRealSenseCamera(camera_index, fps=90, width=640, height=480, color_mode="bgr")
+    camera = IntelRealSenseCamera(serial_number, fps=90, width=640, height=480, color_mode="bgr")
    camera = connect()
    ```

    Example of returning depth:
    ```python
-    camera = IntelRealSenseCamera(camera_index, use_depth=True)
+    camera = IntelRealSenseCamera(serial_number, use_depth=True)
    camera.connect()
    color_image, depth_map = camera.read()
    ```
@@ -123,7 +242,7 @@ class IntelRealSenseCamera:

    def __init__(
        self,
-        camera_index: int,
+        serial_number: int,
        config: IntelRealSenseCameraConfig | None = None,
        **kwargs,
    ):
@@ -133,13 +252,14 @@ class IntelRealSenseCamera:
        # Overwrite the config arguments using kwargs
        config = replace(config, **kwargs)

-        self.camera_index = camera_index
+        self.serial_number = serial_number
        self.fps = config.fps
        self.width = config.width
        self.height = config.height
        self.color_mode = config.color_mode
        self.use_depth = config.use_depth
        self.force_hardware_reset = config.force_hardware_reset
+        self.mock = config.mock

        self.camera = None
        self.is_connected = False
@@ -149,14 +269,55 @@ class IntelRealSenseCamera:
        self.depth_map = None
        self.logs = {}

+        if self.mock:
+            import tests.mock_cv2 as cv2
+        else:
+            import cv2
+
+        # TODO(alibets): Do we keep original width/height or do we define them after rotation?
+        self.rotation = None
+        if config.rotation == -90:
+            self.rotation = cv2.ROTATE_90_COUNTERCLOCKWISE
+        elif config.rotation == 90:
+            self.rotation = cv2.ROTATE_90_CLOCKWISE
+        elif config.rotation == 180:
+            self.rotation = cv2.ROTATE_180
+
+    @classmethod
+    def init_from_name(cls, name: str, config: IntelRealSenseCameraConfig | None = None, **kwargs):
+        camera_infos = find_cameras()
+        camera_names = [cam["name"] for cam in camera_infos]
+        this_name_count = Counter(camera_names)[name]
+        if this_name_count > 1:
+            # TODO(aliberts): Test this with multiple identical cameras (Aloha)
+            raise ValueError(
+                f"Multiple {name} cameras have been detected. Please use their serial number to instantiate them."
+            )
+
+        name_to_serial_dict = {cam["name"]: cam["serial_number"] for cam in camera_infos}
+        cam_sn = name_to_serial_dict[name]
+
+        if config is None:
+            config = IntelRealSenseCameraConfig()
+
+        # Overwrite the config arguments using kwargs
+        config = replace(config, **kwargs)
+
+        return cls(serial_number=cam_sn, config=config, **kwargs)
+
    def connect(self):
        if self.is_connected:
            raise RobotDeviceAlreadyConnectedError(
-                f"IntelRealSenseCamera({self.camera_index}) is already connected."
+                f"IntelRealSenseCamera({self.serial_number}) is already connected."
            )

+        if self.mock:
+            import tests.mock_pyrealsense2 as rs
+        else:
+            import pyrealsense2 as rs
+
        config = rs.config()
-        config.enable_device(str(self.camera_index))
+        config.enable_device(str(self.serial_number))

        if self.fps and self.width and self.height:
            # TODO(rcadene): can we set rgb8 directly?
@@ -172,7 +333,7 @@ class IntelRealSenseCamera:

        self.camera = rs.pipeline()
        try:
-            self.camera.start(config)
+            profile = self.camera.start(config)
            is_camera_open = True
        except RuntimeError:
            is_camera_open = False
@@ -181,15 +342,41 @@ class IntelRealSenseCamera:
        # If the camera doesn't work, display the camera indices corresponding to
        # valid cameras.
        if not is_camera_open:
-            # Verify that the provided `camera_index` is valid before printing the traceback
-            available_cam_ids = find_camera_indices()
-            if self.camera_index not in available_cam_ids:
+            # Verify that the provided `serial_number` is valid before printing the traceback
+            camera_infos = find_cameras()
+            serial_numbers = [cam["serial_number"] for cam in camera_infos]
+            if self.serial_number not in serial_numbers:
                raise ValueError(
-                    f"`camera_index` is expected to be one of these available cameras {available_cam_ids}, but {self.camera_index} is provided instead. "
-                    "To find the camera index you should use, run `python lerobot/scripts/save_images_from_cameras.py --driver intelrealsense`."
+                    f"`serial_number` is expected to be one of these available cameras {serial_numbers}, but {self.serial_number} is provided instead. "
+                    "To find the serial number you should use, run `python lerobot/common/robot_devices/cameras/intelrealsense.py`."
                )

-            raise OSError(f"Can't access IntelRealSenseCamera({self.camera_index}).")
+            raise OSError(f"Can't access IntelRealSenseCamera({self.serial_number}).")
+
+        color_stream = profile.get_stream(rs.stream.color)
+        color_profile = color_stream.as_video_stream_profile()
+        actual_fps = color_profile.fps()
+        actual_width = color_profile.width()
+        actual_height = color_profile.height()
+
+        # Using `math.isclose` since actual fps can be a float (e.g. 29.9 instead of 30)
+        if self.fps is not None and not math.isclose(self.fps, actual_fps, rel_tol=1e-3):
+            # Using `OSError` since it's a broad that encompasses issues related to device communication
+            raise OSError(
+                f"Can't set {self.fps=} for IntelRealSenseCamera({self.serial_number}). Actual value is {actual_fps}."
+            )
+        if self.width is not None and self.width != actual_width:
+            raise OSError(
+                f"Can't set {self.width=} for IntelRealSenseCamera({self.serial_number}). Actual value is {actual_width}."
+            )
+        if self.height is not None and self.height != actual_height:
+            raise OSError(
+                f"Can't set {self.height=} for IntelRealSenseCamera({self.serial_number}). Actual value is {actual_height}."
+            )
+
+        self.fps = round(actual_fps)
+        self.width = round(actual_width)
+        self.height = round(actual_height)

        self.is_connected = True

@@ -205,9 +392,14 @@ class IntelRealSenseCamera:
        """
        if not self.is_connected:
            raise RobotDeviceNotConnectedError(
-                f"IntelRealSenseCamera({self.camera_index}) is not connected. Try running `camera.connect()` first."
+                f"IntelRealSenseCamera({self.serial_number}) is not connected. Try running `camera.connect()` first."
            )

+        if self.mock:
+            import tests.mock_cv2 as cv2
+        else:
+            import cv2
+
        start_time = time.perf_counter()

        frame = self.camera.wait_for_frames(timeout_ms=5000)
@@ -215,7 +407,7 @@ class IntelRealSenseCamera:
        color_frame = frame.get_color_frame()

        if not color_frame:
-            raise OSError(f"Can't capture color image from IntelRealSenseCamera({self.camera_index}).")
+            raise OSError(f"Can't capture color image from IntelRealSenseCamera({self.serial_number}).")

        color_image = np.asanyarray(color_frame.get_data())

@@ -235,6 +427,9 @@ class IntelRealSenseCamera:
                f"Can't capture color image with expected height and width ({self.height} x {self.width}). ({h} x {w}) returned instead."
            )

+        if self.rotation is not None:
+            color_image = cv2.rotate(color_image, self.rotation)
+
        # log the number of seconds it took to read the image
        self.logs["delta_timestamp_s"] = time.perf_counter() - start_time

@@ -244,7 +439,7 @@ class IntelRealSenseCamera:
        if self.use_depth:
            depth_frame = frame.get_depth_frame()
            if not depth_frame:
-                raise OSError(f"Can't capture depth image from IntelRealSenseCamera({self.camera_index}).")
+                raise OSError(f"Can't capture depth image from IntelRealSenseCamera({self.serial_number}).")

            depth_map = np.asanyarray(depth_frame.get_data())

@@ -254,12 +449,15 @@ class IntelRealSenseCamera:
                    f"Can't capture depth map with expected height and width ({self.height} x {self.width}). ({h} x {w}) returned instead."
                )

+            if self.rotation is not None:
+                depth_map = cv2.rotate(depth_map, self.rotation)
+
            return color_image, depth_map
        else:
            return color_image

    def read_loop(self):
-        while self.stop_event is None or not self.stop_event.is_set():
+        while not self.stop_event.is_set():
            if self.use_depth:
                self.color_image, self.depth_map = self.read()
            else:
@@ -269,7 +467,7 @@ class IntelRealSenseCamera:
        """Access the latest color image"""
        if not self.is_connected:
            raise RobotDeviceNotConnectedError(
-                f"IntelRealSenseCamera({self.camera_index}) is not connected. Try running `camera.connect()` first."
+                f"IntelRealSenseCamera({self.serial_number}) is not connected. Try running `camera.connect()` first."
            )

        if self.thread is None:
@@ -280,6 +478,7 @@ class IntelRealSenseCamera:

        num_tries = 0
        while self.color_image is None:
+            # TODO(rcadene, aliberts): intelrealsense has diverged compared to opencv over here
            num_tries += 1
            time.sleep(1 / self.fps)
            if num_tries > self.fps and (self.thread.ident is None or not self.thread.is_alive()):
@@ -295,7 +494,7 @@ class IntelRealSenseCamera:
    def disconnect(self):
        if not self.is_connected:
            raise RobotDeviceNotConnectedError(
-                f"IntelRealSenseCamera({self.camera_index}) is not connected. Try running `camera.connect()` first."
+                f"IntelRealSenseCamera({self.serial_number}) is not connected. Try running `camera.connect()` first."
            )

        if self.thread is not None and self.thread.is_alive():
@@ -313,3 +512,48 @@ class IntelRealSenseCamera:
    def __del__(self):
        if getattr(self, "is_connected", False):
            self.disconnect()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Save a few frames using `IntelRealSenseCamera` for all cameras connected to the computer, or a selected subset."
+    )
+    parser.add_argument(
+        "--serial-numbers",
+        type=int,
+        nargs="*",
+        default=None,
+        help="List of serial numbers used to instantiate the `IntelRealSenseCamera`. If not provided, find and use all available camera indices.",
+    )
+    parser.add_argument(
+        "--fps",
+        type=int,
+        default=30,
+        help="Set the number of frames recorded per seconds for all cameras. If not provided, use the default fps of each camera.",
+    )
+    parser.add_argument(
+        "--width",
+        type=str,
+        default=640,
+        help="Set the width for all cameras. If not provided, use the default width of each camera.",
+    )
+    parser.add_argument(
+        "--height",
+        type=str,
+        default=480,
+        help="Set the height for all cameras. If not provided, use the default height of each camera.",
+    )
+    parser.add_argument(
+        "--images-dir",
+        type=Path,
+        default="outputs/images_from_intelrealsense_cameras",
+        help="Set directory to save a few frames for each camera.",
+    )
+    parser.add_argument(
+        "--record-time-s",
+        type=float,
+        default=2.0,
+        help="Set the number of seconds used to record the frames. By default, 2 seconds.",
+    )
+    args = parser.parse_args()
+    save_images_from_cameras(**vars(args))
--- a/lerobot/common/robot_devices/cameras/opencv.py
+++ b/lerobot/common/robot_devices/cameras/opencv.py
@@ -2,27 +2,27 @@
 This file contains utilities for recording frames from cameras. For more info look at `OpenCVCamera` docstring.
 """

+import argparse
+import concurrent.futures
 import math
 import platform
+import shutil
 import threading
 import time
 from dataclasses import dataclass, replace
 from pathlib import Path
 from threading import Thread

-import cv2
 import numpy as np
+from PIL import Image

 from lerobot.common.robot_devices.utils import (
    RobotDeviceAlreadyConnectedError,
    RobotDeviceNotConnectedError,
+    busy_wait,
 )
 from lerobot.common.utils.utils import capture_timestamp_utc

-# Use 1 thread to avoid blocking the main thread. Especially useful during data collection
-# when other threads are used to save the images.
-cv2.setNumThreads(1)
-
 # The maximum opencv device index depends on your operating system. For instance,
 # if you have 3 cameras, they should be associated to index 0, 1, and 2. This is the case
 # on MacOS. However, on Ubuntu, the indices are different like 6, 16, 23.
@@ -31,20 +31,44 @@ cv2.setNumThreads(1)
 MAX_OPENCV_INDEX = 60


-def find_camera_indices(raise_when_empty=False, max_index_search_range=MAX_OPENCV_INDEX):
+def find_cameras(raise_when_empty=False, max_index_search_range=MAX_OPENCV_INDEX, mock=False) -> list[dict]:
+    cameras = []
    if platform.system() == "Linux":
-        # Linux uses camera ports
        print("Linux detected. Finding available camera indices through scanning '/dev/video*' ports")
-        possible_camera_ids = []
-        for port in Path("/dev").glob("video*"):
-            camera_idx = int(str(port).replace("/dev/video", ""))
-            possible_camera_ids.append(camera_idx)
+        possible_ports = [str(port) for port in Path("/dev").glob("video*")]
+        ports = _find_cameras(possible_ports, mock=mock)
+        for port in ports:
+            cameras.append(
+                {
+                    "port": port,
+                    "index": int(port.removeprefix("/dev/video")),
+                }
+            )
    else:
        print(
            "Mac or Windows detected. Finding available camera indices through "
            f"scanning all indices from 0 to {MAX_OPENCV_INDEX}"
        )
-        possible_camera_ids = range(max_index_search_range)
+        possible_indices = range(max_index_search_range)
+        indices = _find_cameras(possible_indices, mock=mock)
+        for index in indices:
+            cameras.append(
+                {
+                    "port": None,
+                    "index": index,
+                }
+            )
+
+    return cameras
+
+
+def _find_cameras(
+    possible_camera_ids: list[int | str], raise_when_empty=False, mock=False
+) -> list[int | str]:
+    if mock:
+        import tests.mock_cv2 as cv2
+    else:
+        import cv2

    camera_ids = []
    for camera_idx in possible_camera_ids:
@@ -65,6 +89,92 @@ def find_camera_indices(raise_when_empty=False, max_index_search_range=MAX_OPENC
    return camera_ids


+def is_valid_unix_path(path: str) -> bool:
+    """Note: if 'path' points to a symlink, this will return True only if the target exists"""
+    p = Path(path)
+    return p.is_absolute() and p.exists()
+
+
+def get_camera_index_from_unix_port(port: Path) -> int:
+    return int(str(port.resolve()).removeprefix("/dev/video"))
+
+
+def save_image(img_array, camera_index, frame_index, images_dir):
+    img = Image.fromarray(img_array)
+    path = images_dir / f"camera_{camera_index:02d}_frame_{frame_index:06d}.png"
+    path.parent.mkdir(parents=True, exist_ok=True)
+    img.save(str(path), quality=100)
+
+
+def save_images_from_cameras(
+    images_dir: Path,
+    camera_ids: list | None = None,
+    fps=None,
+    width=None,
+    height=None,
+    record_time_s=2,
+    mock=False,
+):
+    """
+    Initializes all the cameras and saves images to the directory. Useful to visually identify the camera
+    associated to a given camera index.
+    """
+    if camera_ids is None or len(camera_ids) == 0:
+        camera_infos = find_cameras(mock=mock)
+        camera_ids = [cam["index"] for cam in camera_infos]
+
+    print("Connecting cameras")
+    cameras = []
+    for cam_idx in camera_ids:
+        camera = OpenCVCamera(cam_idx, fps=fps, width=width, height=height, mock=mock)
+        camera.connect()
+        print(
+            f"OpenCVCamera({camera.camera_index}, fps={camera.fps}, width={camera.width}, "
+            f"height={camera.height}, color_mode={camera.color_mode})"
+        )
+        cameras.append(camera)
+
+    images_dir = Path(images_dir)
+    if images_dir.exists():
+        shutil.rmtree(
+            images_dir,
+        )
+    images_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"Saving images to {images_dir}")
+    frame_index = 0
+    start_time = time.perf_counter()
+    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+        while True:
+            now = time.perf_counter()
+
+            for camera in cameras:
+                # If we use async_read when fps is None, the loop will go full speed, and we will endup
+                # saving the same images from the cameras multiple times until the RAM/disk is full.
+                image = camera.read() if fps is None else camera.async_read()
+
+                executor.submit(
+                    save_image,
+                    image,
+                    camera.camera_index,
+                    frame_index,
+                    images_dir,
+                )
+
+            if fps is not None:
+                dt_s = time.perf_counter() - now
+                busy_wait(1 / fps - dt_s)
+
+            print(f"Frame: {frame_index:04d}\tLatency (ms): {(time.perf_counter() - now) * 1000:.2f}")
+
+            if time.perf_counter() - start_time > record_time_s:
+                break
+
+            frame_index += 1
+
+    print(f"Images have been saved to {images_dir}")
+
+
@dataclass
 class OpenCVCameraConfig:
    """
@@ -82,6 +192,8 @@ class OpenCVCameraConfig:
    width: int | None = None
    height: int | None = None
    color_mode: str = "rgb"
+    rotation: int | None = None
+    mock: bool = False

    def __post_init__(self):
        if self.color_mode not in ["rgb", "bgr"]:
@@ -89,6 +201,9 @@ class OpenCVCameraConfig:
                f"`color_mode` is expected to be 'rgb' or 'bgr', but {self.color_mode} is provided."
            )

+        if self.rotation not in [-90, None, 90, 180]:
+            raise ValueError(f"`rotation` must be in [-90, None, 90, 180] (got {self.rotation})")
+

 class OpenCVCamera:
    """
@@ -131,7 +246,7 @@ class OpenCVCamera:
    ```
    """

-    def __init__(self, camera_index: int, config: OpenCVCameraConfig | None = None, **kwargs):
+    def __init__(self, camera_index: int | str, config: OpenCVCameraConfig | None = None, **kwargs):
        if config is None:
            config = OpenCVCameraConfig()

@@ -139,10 +254,24 @@ class OpenCVCamera:
        config = replace(config, **kwargs)

        self.camera_index = camera_index
+        self.port = None
+
+        # Linux uses ports for connecting to cameras
+        if platform.system() == "Linux":
+            if isinstance(self.camera_index, int):
+                self.port = Path(f"/dev/video{self.camera_index}")
+            elif isinstance(self.camera_index, str) and is_valid_unix_path(self.camera_index):
+                self.port = Path(self.camera_index)
+                # Retrieve the camera index from a potentially symlinked path
+                self.camera_index = get_camera_index_from_unix_port(self.port)
+            else:
+                raise ValueError(f"Please check the provided camera_index: {camera_index}")
+
        self.fps = config.fps
        self.width = config.width
        self.height = config.height
        self.color_mode = config.color_mode
+        self.mock = config.mock

        self.camera = None
        self.is_connected = False
@@ -151,43 +280,60 @@ class OpenCVCamera:
        self.color_image = None
        self.logs = {}

+        if self.mock:
+            import tests.mock_cv2 as cv2
+        else:
+            import cv2
+
+        # TODO(aliberts): Do we keep original width/height or do we define them after rotation?
+        self.rotation = None
+        if config.rotation == -90:
+            self.rotation = cv2.ROTATE_90_COUNTERCLOCKWISE
+        elif config.rotation == 90:
+            self.rotation = cv2.ROTATE_90_CLOCKWISE
+        elif config.rotation == 180:
+            self.rotation = cv2.ROTATE_180
+
    def connect(self):
        if self.is_connected:
            raise RobotDeviceAlreadyConnectedError(f"OpenCVCamera({self.camera_index}) is already connected.")

+        if self.mock:
+            import tests.mock_cv2 as cv2
+        else:
+            import cv2
+
+            # Use 1 thread to avoid blocking the main thread. Especially useful during data collection
+            # when other threads are used to save the images.
+            cv2.setNumThreads(1)
+
+        camera_idx = f"/dev/video{self.camera_index}" if platform.system() == "Linux" else self.camera_index
        # First create a temporary camera trying to access `camera_index`,
        # and verify it is a valid camera by calling `isOpened`.
-
-        if platform.system() == "Linux":
-            # Linux uses ports for connecting to cameras
-            tmp_camera = cv2.VideoCapture(f"/dev/video{self.camera_index}")
-        else:
-            tmp_camera = cv2.VideoCapture(self.camera_index)
-
+        tmp_camera = cv2.VideoCapture(camera_idx)
        is_camera_open = tmp_camera.isOpened()
        # Release camera to make it accessible for `find_camera_indices`
+        tmp_camera.release()
        del tmp_camera

        # If the camera doesn't work, display the camera indices corresponding to
        # valid cameras.
        if not is_camera_open:
            # Verify that the provided `camera_index` is valid before printing the traceback
-            available_cam_ids = find_camera_indices()
+            cameras_info = find_cameras()
+            available_cam_ids = [cam["index"] for cam in cameras_info]
            if self.camera_index not in available_cam_ids:
                raise ValueError(
                    f"`camera_index` is expected to be one of these available cameras {available_cam_ids}, but {self.camera_index} is provided instead. "
                    "To find the camera index you should use, run `python lerobot/scripts/save_images_from_cameras.py --driver opencv`."
                )

-            raise OSError(f"Can't access OpenCVCamera({self.camera_index}).")
+            raise OSError(f"Can't access OpenCVCamera({camera_idx}).")

        # Secondly, create the camera that will be used downstream.
        # Note: For some unknown reason, calling `isOpened` blocks the camera which then
        # needs to be re-created.
-        if platform.system() == "Linux":
-            self.camera = cv2.VideoCapture(f"/dev/video{self.camera_index}")
-        else:
-            self.camera = cv2.VideoCapture(self.camera_index)
+        self.camera = cv2.VideoCapture(camera_idx)

        if self.fps is not None:
            self.camera.set(cv2.CAP_PROP_FPS, self.fps)
@@ -200,22 +346,24 @@ class OpenCVCamera:
        actual_width = self.camera.get(cv2.CAP_PROP_FRAME_WIDTH)
        actual_height = self.camera.get(cv2.CAP_PROP_FRAME_HEIGHT)

+        # Using `math.isclose` since actual fps can be a float (e.g. 29.9 instead of 30)
        if self.fps is not None and not math.isclose(self.fps, actual_fps, rel_tol=1e-3):
+            # Using `OSError` since it's a broad that encompasses issues related to device communication
            raise OSError(
                f"Can't set {self.fps=} for OpenCVCamera({self.camera_index}). Actual value is {actual_fps}."
            )
-        if self.width is not None and self.width != actual_width:
+        if self.width is not None and not math.isclose(self.width, actual_width, rel_tol=1e-3):
            raise OSError(
                f"Can't set {self.width=} for OpenCVCamera({self.camera_index}). Actual value is {actual_width}."
            )
-        if self.height is not None and self.height != actual_height:
+        if self.height is not None and not math.isclose(self.height, actual_height, rel_tol=1e-3):
            raise OSError(
                f"Can't set {self.height=} for OpenCVCamera({self.camera_index}). Actual value is {actual_height}."
            )

-        self.fps = actual_fps
-        self.width = actual_width
-        self.height = actual_height
+        self.fps = round(actual_fps)
+        self.width = round(actual_width)
+        self.height = round(actual_height)

        self.is_connected = True

@@ -234,6 +382,7 @@ class OpenCVCamera:
        start_time = time.perf_counter()

        ret, color_image = self.camera.read()
+
        if not ret:
            raise OSError(f"Can't capture color image from camera {self.camera_index}.")

@@ -248,6 +397,11 @@ class OpenCVCamera:
        # However, Deep Learning framework such as LeRobot uses RGB format as default to train neural networks,
        # so we convert the image color from BGR to RGB.
        if requested_color_mode == "rgb":
+            if self.mock:
+                import tests.mock_cv2 as cv2
+            else:
+                import cv2
+
            color_image = cv2.cvtColor(color_image, cv2.COLOR_BGR2RGB)

        h, w, _ = color_image.shape
@@ -256,17 +410,25 @@ class OpenCVCamera:
                f"Can't capture color image with expected height and width ({self.height} x {self.width}). ({h} x {w}) returned instead."
            )

+        if self.rotation is not None:
+            color_image = cv2.rotate(color_image, self.rotation)
+
        # log the number of seconds it took to read the image
        self.logs["delta_timestamp_s"] = time.perf_counter() - start_time

        # log the utc time at which the image was received
        self.logs["timestamp_utc"] = capture_timestamp_utc()

+        self.color_image = color_image
+
        return color_image

    def read_loop(self):
-        while self.stop_event is None or not self.stop_event.is_set():
-            self.color_image = self.read()
+        while not self.stop_event.is_set():
+            try:
+                self.color_image = self.read()
+            except Exception as e:
+                print(f"Error reading in thread: {e}")

    def async_read(self):
        if not self.is_connected:
@@ -281,15 +443,14 @@ class OpenCVCamera:
            self.thread.start()

        num_tries = 0
-        while self.color_image is None:
-            num_tries += 1
-            time.sleep(1 / self.fps)
-            if num_tries > self.fps and (self.thread.ident is None or not self.thread.is_alive()):
-                raise Exception(
-                    "The thread responsible for `self.async_read()` took too much time to start. There might be an issue. Verify that `self.thread.start()` has been called."
-                )
+        while True:
+            if self.color_image is not None:
+                return self.color_image

-        return self.color_image
+            time.sleep(1 / self.fps)
+            num_tries += 1
+            if num_tries > self.fps * 2:
+                raise TimeoutError("Timed out waiting for async_read() to start.")

    def disconnect(self):
        if not self.is_connected:
@@ -297,18 +458,61 @@ class OpenCVCamera:
                f"OpenCVCamera({self.camera_index}) is not connected. Try running `camera.connect()` first."
            )

-        if self.thread is not None and self.thread.is_alive():
-            # wait for the thread to finish
+        if self.thread is not None:
            self.stop_event.set()
-            self.thread.join()
+            self.thread.join()  # wait for the thread to finish
            self.thread = None
            self.stop_event = None

        self.camera.release()
        self.camera = None
-
        self.is_connected = False

    def __del__(self):
        if getattr(self, "is_connected", False):
            self.disconnect()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Save a few frames using `OpenCVCamera` for all cameras connected to the computer, or a selected subset."
+    )
+    parser.add_argument(
+        "--camera-ids",
+        type=int,
+        nargs="*",
+        default=None,
+        help="List of camera indices used to instantiate the `OpenCVCamera`. If not provided, find and use all available camera indices.",
+    )
+    parser.add_argument(
+        "--fps",
+        type=int,
+        default=None,
+        help="Set the number of frames recorded per seconds for all cameras. If not provided, use the default fps of each camera.",
+    )
+    parser.add_argument(
+        "--width",
+        type=str,
+        default=None,
+        help="Set the width for all cameras. If not provided, use the default width of each camera.",
+    )
+    parser.add_argument(
+        "--height",
+        type=str,
+        default=None,
+        help="Set the height for all cameras. If not provided, use the default height of each camera.",
+    )
+    parser.add_argument(
+        "--images-dir",
+        type=Path,
+        default="outputs/images_from_opencv_cameras",
+        help="Set directory to save a few frames for each camera.",
+    )
+    parser.add_argument(
+        "--record-time-s",
+        type=float,
+        default=4.0,
+        help="Set the number of seconds used to record the frames. By default, 2 seconds.",
+    )
+    args = parser.parse_args()
+    save_images_from_cameras(**vars(args))
--- a/lerobot/common/robot_devices/cameras/utils.py
+++ b/lerobot/common/robot_devices/cameras/utils.py
@@ -1,55 +1,8 @@
-from pathlib import Path
 from typing import Protocol

-import cv2
-import einops
 import numpy as np


-def write_shape_on_image_inplace(image):
-    height, width = image.shape[:2]
-    text = f"Width: {width} Height: {height}"
-
-    # Define the font, scale, color, and thickness
-    font = cv2.FONT_HERSHEY_SIMPLEX
-    font_scale = 1
-    color = (255, 0, 0)  # Blue in BGR
-    thickness = 2
-
-    position = (10, height - 10)  # 10 pixels from the bottom-left corner
-    cv2.putText(image, text, position, font, font_scale, color, thickness)
-
-
-def save_color_image(image, path, write_shape=False):
-    path = Path(path)
-    path.parent.mkdir(parents=True, exist_ok=True)
-    if write_shape:
-        write_shape_on_image_inplace(image)
-    cv2.imwrite(str(path), image)
-
-
-def save_depth_image(depth, path, write_shape=False):
-    path = Path(path)
-    path.parent.mkdir(parents=True, exist_ok=True)
-
-    # Apply colormap on depth image (image must be converted to 8-bit per pixel first)
-    depth_image = cv2.applyColorMap(cv2.convertScaleAbs(depth, alpha=0.03), cv2.COLORMAP_JET)
-
-    if write_shape:
-        write_shape_on_image_inplace(depth_image)
-    cv2.imwrite(str(path), depth_image)
-
-
-def convert_torch_image_to_cv2(tensor, rgb_to_bgr=True):
-    assert tensor.ndim == 3
-    c, h, w = tensor.shape
-    assert c < h and c < w
-    color_image = einops.rearrange(tensor, "c h w -> h w c").numpy()
-    if rgb_to_bgr:
-        color_image = cv2.cvtColor(color_image, cv2.COLOR_RGB2BGR)
-    return color_image
-
-
 # Defines a camera type
 class Camera(Protocol):
    def connect(self): ...
--- a/lerobot/common/robot_devices/control_utils.py
+++ b/lerobot/common/robot_devices/control_utils.py
@@ -0,0 +1,327 @@
+########################################################################################
+# Utilities
+########################################################################################
+
+
+import logging
+import time
+import traceback
+from contextlib import nullcontext
+from copy import copy
+from functools import cache
+
+import cv2
+import torch
+import tqdm
+from termcolor import colored
+
+from lerobot.common.datasets.populate_dataset import add_frame, safe_stop_image_writer
+from lerobot.common.policies.factory import make_policy
+from lerobot.common.robot_devices.robots.utils import Robot
+from lerobot.common.robot_devices.utils import busy_wait
+from lerobot.common.utils.utils import get_safe_torch_device, init_hydra_config, set_global_seed
+from lerobot.scripts.eval import get_pretrained_policy_path
+
+
+def log_control_info(robot: Robot, dt_s, episode_index=None, frame_index=None, fps=None):
+    log_items = []
+    if episode_index is not None:
+        log_items.append(f"ep:{episode_index}")
+    if frame_index is not None:
+        log_items.append(f"frame:{frame_index}")
+
+    def log_dt(shortname, dt_val_s):
+        nonlocal log_items, fps
+        info_str = f"{shortname}:{dt_val_s * 1000:5.2f} ({1/ dt_val_s:3.1f}hz)"
+        if fps is not None:
+            actual_fps = 1 / dt_val_s
+            if actual_fps < fps - 1:
+                info_str = colored(info_str, "yellow")
+        log_items.append(info_str)
+
+    # total step time displayed in milliseconds and its frequency
+    log_dt("dt", dt_s)
+
+    # TODO(aliberts): move robot-specific logs logic in robot.print_logs()
+    if not robot.robot_type.startswith("stretch"):
+        for name in robot.leader_arms:
+            key = f"read_leader_{name}_pos_dt_s"
+            if key in robot.logs:
+                log_dt("dtRlead", robot.logs[key])
+
+        for name in robot.follower_arms:
+            key = f"write_follower_{name}_goal_pos_dt_s"
+            if key in robot.logs:
+                log_dt("dtWfoll", robot.logs[key])
+
+            key = f"read_follower_{name}_pos_dt_s"
+            if key in robot.logs:
+                log_dt("dtRfoll", robot.logs[key])
+
+        for name in robot.cameras:
+            key = f"read_camera_{name}_dt_s"
+            if key in robot.logs:
+                log_dt(f"dtR{name}", robot.logs[key])
+
+    info_str = " ".join(log_items)
+    logging.info(info_str)
+
+
+@cache
+def is_headless():
+    """Detects if python is running without a monitor."""
+    try:
+        import pynput  # noqa
+
+        return False
+    except Exception:
+        print(
+            "Error trying to import pynput. Switching to headless mode. "
+            "As a result, the video stream from the cameras won't be shown, "
+            "and you won't be able to change the control flow with keyboards. "
+            "For more info, see traceback below.\n"
+        )
+        traceback.print_exc()
+        print()
+        return True
+
+
+def has_method(_object: object, method_name: str):
+    return hasattr(_object, method_name) and callable(getattr(_object, method_name))
+
+
+def predict_action(observation, policy, device, use_amp):
+    observation = copy(observation)
+    with (
+        torch.inference_mode(),
+        torch.autocast(device_type=device.type) if device.type == "cuda" and use_amp else nullcontext(),
+    ):
+        # Convert to pytorch format: channel first and float32 in [0,1] with batch dimension
+        for name in observation:
+            if "image" in name:
+                observation[name] = observation[name].type(torch.float32) / 255
+                observation[name] = observation[name].permute(2, 0, 1).contiguous()
+            observation[name] = observation[name].unsqueeze(0)
+            observation[name] = observation[name].to(device)
+
+        # Compute the next action with the policy
+        # based on the current observation
+        action = policy.select_action(observation)
+
+        # Remove batch dimension
+        action = action.squeeze(0)
+
+        # Move to cpu, if not already the case
+        action = action.to("cpu")
+
+    return action
+
+
+def init_keyboard_listener():
+    # Allow to exit early while recording an episode or resetting the environment,
+    # by tapping the right arrow key '->'. This might require a sudo permission
+    # to allow your terminal to monitor keyboard events.
+    events = {}
+    events["exit_early"] = False
+    events["rerecord_episode"] = False
+    events["stop_recording"] = False
+
+    if is_headless():
+        logging.warning(
+            "Headless environment detected. On-screen cameras display and keyboard inputs will not be available."
+        )
+        listener = None
+        return listener, events
+
+    # Only import pynput if not in a headless environment
+    from pynput import keyboard
+
+    def on_press(key):
+        try:
+            if key == keyboard.Key.right:
+                print("Right arrow key pressed. Exiting loop...")
+                events["exit_early"] = True
+            elif key == keyboard.Key.left:
+                print("Left arrow key pressed. Exiting loop and rerecord the last episode...")
+                events["rerecord_episode"] = True
+                events["exit_early"] = True
+            elif key == keyboard.Key.esc:
+                print("Escape key pressed. Stopping data recording...")
+                events["stop_recording"] = True
+                events["exit_early"] = True
+        except Exception as e:
+            print(f"Error handling key press: {e}")
+
+    listener = keyboard.Listener(on_press=on_press)
+    listener.start()
+
+    return listener, events
+
+
+def init_policy(pretrained_policy_name_or_path, policy_overrides):
+    """Instantiate the policy and load fps, device and use_amp from config yaml"""
+    pretrained_policy_path = get_pretrained_policy_path(pretrained_policy_name_or_path)
+    hydra_cfg = init_hydra_config(pretrained_policy_path / "config.yaml", policy_overrides)
+    policy = make_policy(hydra_cfg=hydra_cfg, pretrained_policy_name_or_path=pretrained_policy_path)
+
+    # Check device is available
+    device = get_safe_torch_device(hydra_cfg.device, log=True)
+    use_amp = hydra_cfg.use_amp
+    policy_fps = hydra_cfg.env.fps
+
+    policy.eval()
+    policy.to(device)
+
+    torch.backends.cudnn.benchmark = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+    set_global_seed(hydra_cfg.seed)
+    return policy, policy_fps, device, use_amp
+
+
+def warmup_record(
+    robot,
+    events,
+    enable_teloperation,
+    warmup_time_s,
+    display_cameras,
+    fps,
+):
+    control_loop(
+        robot=robot,
+        control_time_s=warmup_time_s,
+        display_cameras=display_cameras,
+        events=events,
+        fps=fps,
+        teleoperate=enable_teloperation,
+    )
+
+
+def record_episode(
+    robot,
+    dataset,
+    events,
+    episode_time_s,
+    display_cameras,
+    policy,
+    device,
+    use_amp,
+    fps,
+):
+    control_loop(
+        robot=robot,
+        control_time_s=episode_time_s,
+        display_cameras=display_cameras,
+        dataset=dataset,
+        events=events,
+        policy=policy,
+        device=device,
+        use_amp=use_amp,
+        fps=fps,
+        teleoperate=policy is None,
+    )
+
+
+@safe_stop_image_writer
+def control_loop(
+    robot,
+    control_time_s,
+    teleoperate=False,
+    display_cameras=False,
+    dataset=None,
+    events=None,
+    policy=None,
+    device=None,
+    use_amp=None,
+    fps=None,
+):
+    # TODO(rcadene): Add option to record logs
+    if not robot.is_connected:
+        robot.connect()
+
+    if events is None:
+        events = {"exit_early": False}
+
+    if teleoperate and policy is not None:
+        raise ValueError("When `teleoperate` is True, `policy` should be None.")
+
+    if dataset is not None and fps is not None and dataset["fps"] != fps:
+        raise ValueError(f"The dataset fps should be equal to requested fps ({dataset['fps']} != {fps}).")
+
+    timestamp = 0
+    start_episode_t = time.perf_counter()
+    while timestamp < control_time_s:
+        start_loop_t = time.perf_counter()
+
+        if teleoperate:
+            observation, action = robot.teleop_step(record_data=True)
+        else:
+            observation = robot.capture_observation()
+
+            if policy is not None:
+                pred_action = predict_action(observation, policy, device, use_amp)
+                # Action can eventually be clipped using `max_relative_target`,
+                # so action actually sent is saved in the dataset.
+                action = robot.send_action(pred_action)
+                action = {"action": action}
+
+        if dataset is not None:
+            add_frame(dataset, observation, action)
+
+        if display_cameras and not is_headless():
+            image_keys = [key for key in observation if "image" in key]
+            for key in image_keys:
+                cv2.imshow(key, cv2.cvtColor(observation[key].numpy(), cv2.COLOR_RGB2BGR))
+            cv2.waitKey(1)
+
+        if fps is not None:
+            dt_s = time.perf_counter() - start_loop_t
+            busy_wait(1 / fps - dt_s)
+
+        dt_s = time.perf_counter() - start_loop_t
+        log_control_info(robot, dt_s, fps=fps)
+
+        timestamp = time.perf_counter() - start_episode_t
+        if events["exit_early"]:
+            events["exit_early"] = False
+            break
+
+
+def reset_environment(robot, events, reset_time_s):
+    # TODO(rcadene): refactor warmup_record and reset_environment
+    # TODO(alibets): allow for teleop during reset
+    if has_method(robot, "teleop_safety_stop"):
+        robot.teleop_safety_stop()
+
+    timestamp = 0
+    start_vencod_t = time.perf_counter()
+
+    # Wait if necessary
+    with tqdm.tqdm(total=reset_time_s, desc="Waiting") as pbar:
+        while timestamp < reset_time_s:
+            time.sleep(1)
+            timestamp = time.perf_counter() - start_vencod_t
+            pbar.update(1)
+            if events["exit_early"]:
+                events["exit_early"] = False
+                break
+
+
+def stop_recording(robot, listener, display_cameras):
+    robot.disconnect()
+
+    if not is_headless():
+        if listener is not None:
+            listener.stop()
+
+        if display_cameras:
+            cv2.destroyAllWindows()
+
+
+def sanity_check_dataset_name(repo_id, policy):
+    _, dataset_name = repo_id.split("/")
+    # either repo_id doesnt start with "eval_" and there is no policy
+    # or repo_id starts with "eval_" and there is a policy
+    if dataset_name.startswith("eval_") == (policy is None):
+        raise ValueError(
+            f"Your dataset name begins by 'eval_' ({dataset_name}) but no policy is provided ({policy})."
+        )
--- a/lerobot/common/robot_devices/motors/dynamixel.py
+++ b/lerobot/common/robot_devices/motors/dynamixel.py
@@ -7,17 +7,6 @@ from copy import deepcopy

 import numpy as np
 import tqdm
-from dynamixel_sdk import (
-    COMM_SUCCESS,
-    DXL_HIBYTE,
-    DXL_HIWORD,
-    DXL_LOBYTE,
-    DXL_LOWORD,
-    GroupSyncRead,
-    GroupSyncWrite,
-    PacketHandler,
-    PortHandler,
-)

 from lerobot.common.robot_devices.utils import RobotDeviceAlreadyConnectedError, RobotDeviceNotConnectedError
 from lerobot.common.utils.utils import capture_timestamp_utc
@@ -165,24 +154,29 @@ def convert_degrees_to_steps(degrees: float | np.ndarray, models: str | list[str
    return steps


-def convert_to_bytes(value, bytes):
+def convert_to_bytes(value, bytes, mock=False):
+    if mock:
+        return value
+
+    import dynamixel_sdk as dxl
+
    # Note: No need to convert back into unsigned int, since this byte preprocessing
    # already handles it for us.
    if bytes == 1:
        data = [
-            DXL_LOBYTE(DXL_LOWORD(value)),
+            dxl.DXL_LOBYTE(dxl.DXL_LOWORD(value)),
        ]
    elif bytes == 2:
        data = [
-            DXL_LOBYTE(DXL_LOWORD(value)),
-            DXL_HIBYTE(DXL_LOWORD(value)),
+            dxl.DXL_LOBYTE(dxl.DXL_LOWORD(value)),
+            dxl.DXL_HIBYTE(dxl.DXL_LOWORD(value)),
        ]
    elif bytes == 4:
        data = [
-            DXL_LOBYTE(DXL_LOWORD(value)),
-            DXL_HIBYTE(DXL_LOWORD(value)),
-            DXL_LOBYTE(DXL_HIWORD(value)),
-            DXL_HIBYTE(DXL_HIWORD(value)),
+            dxl.DXL_LOBYTE(dxl.DXL_LOWORD(value)),
+            dxl.DXL_HIBYTE(dxl.DXL_LOWORD(value)),
+            dxl.DXL_LOBYTE(dxl.DXL_HIWORD(value)),
+            dxl.DXL_HIBYTE(dxl.DXL_HIWORD(value)),
        ]
    else:
        raise NotImplementedError(
@@ -303,9 +297,11 @@ class DynamixelMotorsBus:
        motors: dict[str, tuple[int, str]],
        extra_model_control_table: dict[str, list[tuple]] | None = None,
        extra_model_resolution: dict[str, int] | None = None,
+        mock=False,
    ):
        self.port = port
        self.motors = motors
+        self.mock = mock

        self.model_ctrl_table = deepcopy(MODEL_CONTROL_TABLE)
        if extra_model_control_table:
@@ -329,8 +325,13 @@ class DynamixelMotorsBus:
                f"DynamixelMotorsBus({self.port}) is already connected. Do not call `motors_bus.connect()` twice."
            )

-        self.port_handler = PortHandler(self.port)
-        self.packet_handler = PacketHandler(PROTOCOL_VERSION)
+        if self.mock:
+            import tests.mock_dynamixel_sdk as dxl
+        else:
+            import dynamixel_sdk as dxl
+
+        self.port_handler = dxl.PortHandler(self.port)
+        self.packet_handler = dxl.PacketHandler(PROTOCOL_VERSION)

        try:
            if not self.port_handler.openPort():
@@ -348,10 +349,17 @@ class DynamixelMotorsBus:
        self.port_handler.setPacketTimeoutMillis(TIMEOUT_MS)

    def reconnect(self):
-        self.port_handler = PortHandler(self.port)
-        self.packet_handler = PacketHandler(PROTOCOL_VERSION)
+        if self.mock:
+            import tests.mock_dynamixel_sdk as dxl
+        else:
+            import dynamixel_sdk as dxl
+
+        self.port_handler = dxl.PortHandler(self.port)
+        self.packet_handler = dxl.PacketHandler(PROTOCOL_VERSION)
+
        if not self.port_handler.openPort():
            raise OSError(f"Failed to open port '{self.port}'.")
+
        self.is_connected = True

    def are_motors_configured(self):
@@ -631,6 +639,11 @@ class DynamixelMotorsBus:
        return values

    def read_with_motor_ids(self, motor_models, motor_ids, data_name):
+        if self.mock:
+            import tests.mock_dynamixel_sdk as dxl
+        else:
+            import dynamixel_sdk as dxl
+
        return_list = True
        if not isinstance(motor_ids, list):
            return_list = False
@@ -638,12 +651,12 @@ class DynamixelMotorsBus:

        assert_same_address(self.model_ctrl_table, self.motor_models, data_name)
        addr, bytes = self.model_ctrl_table[motor_models[0]][data_name]
-        group = GroupSyncRead(self.port_handler, self.packet_handler, addr, bytes)
+        group = dxl.GroupSyncRead(self.port_handler, self.packet_handler, addr, bytes)
        for idx in motor_ids:
            group.addParam(idx)

        comm = group.txRxPacket()
-        if comm != COMM_SUCCESS:
+        if comm != dxl.COMM_SUCCESS:
            raise ConnectionError(
                f"Read failed due to communication error on port {self.port_handler.port_name} for indices {motor_ids}: "
                f"{self.packet_handler.getTxRxResult(comm)}"
@@ -667,6 +680,11 @@ class DynamixelMotorsBus:

        start_time = time.perf_counter()

+        if self.mock:
+            import tests.mock_dynamixel_sdk as dxl
+        else:
+            import dynamixel_sdk as dxl
+
        if motor_names is None:
            motor_names = self.motor_names

@@ -686,16 +704,18 @@ class DynamixelMotorsBus:

        if data_name not in self.group_readers:
            # create new group reader
-            self.group_readers[group_key] = GroupSyncRead(self.port_handler, self.packet_handler, addr, bytes)
+            self.group_readers[group_key] = dxl.GroupSyncRead(
+                self.port_handler, self.packet_handler, addr, bytes
+            )
            for idx in motor_ids:
                self.group_readers[group_key].addParam(idx)

        for _ in range(NUM_READ_RETRY):
            comm = self.group_readers[group_key].txRxPacket()
-            if comm == COMM_SUCCESS:
+            if comm == dxl.COMM_SUCCESS:
                break

-        if comm != COMM_SUCCESS:
+        if comm != dxl.COMM_SUCCESS:
            raise ConnectionError(
                f"Read failed due to communication error on port {self.port} for group_key {group_key}: "
                f"{self.packet_handler.getTxRxResult(comm)}"
@@ -726,6 +746,11 @@ class DynamixelMotorsBus:
        return values

    def write_with_motor_ids(self, motor_models, motor_ids, data_name, values):
+        if self.mock:
+            import tests.mock_dynamixel_sdk as dxl
+        else:
+            import dynamixel_sdk as dxl
+
        if not isinstance(motor_ids, list):
            motor_ids = [motor_ids]
        if not isinstance(values, list):
@@ -733,13 +758,13 @@ class DynamixelMotorsBus:

        assert_same_address(self.model_ctrl_table, motor_models, data_name)
        addr, bytes = self.model_ctrl_table[motor_models[0]][data_name]
-        group = GroupSyncWrite(self.port_handler, self.packet_handler, addr, bytes)
+        group = dxl.GroupSyncWrite(self.port_handler, self.packet_handler, addr, bytes)
        for idx, value in zip(motor_ids, values, strict=True):
-            data = convert_to_bytes(value, bytes)
+            data = convert_to_bytes(value, bytes, self.mock)
            group.addParam(idx, data)

        comm = group.txPacket()
-        if comm != COMM_SUCCESS:
+        if comm != dxl.COMM_SUCCESS:
            raise ConnectionError(
                f"Write failed due to communication error on port {self.port_handler.port_name} for indices {motor_ids}: "
                f"{self.packet_handler.getTxRxResult(comm)}"
@@ -753,6 +778,11 @@ class DynamixelMotorsBus:

        start_time = time.perf_counter()

+        if self.mock:
+            import tests.mock_dynamixel_sdk as dxl
+        else:
+            import dynamixel_sdk as dxl
+
        if motor_names is None:
            motor_names = self.motor_names

@@ -782,19 +812,19 @@ class DynamixelMotorsBus:

        init_group = data_name not in self.group_readers
        if init_group:
-            self.group_writers[group_key] = GroupSyncWrite(
+            self.group_writers[group_key] = dxl.GroupSyncWrite(
                self.port_handler, self.packet_handler, addr, bytes
            )

        for idx, value in zip(motor_ids, values, strict=True):
-            data = convert_to_bytes(value, bytes)
+            data = convert_to_bytes(value, bytes, self.mock)
            if init_group:
                self.group_writers[group_key].addParam(idx, data)
            else:
                self.group_writers[group_key].changeParam(idx, data)

        comm = self.group_writers[group_key].txPacket()
-        if comm != COMM_SUCCESS:
+        if comm != dxl.COMM_SUCCESS:
            raise ConnectionError(
                f"Write failed due to communication error on port {self.port} for group_key {group_key}: "
                f"{self.packet_handler.getTxRxResult(comm)}"
--- a/lerobot/common/robot_devices/robots/factory.py
+++ b/lerobot/common/robot_devices/robots/factory.py
@@ -1,7 +1,9 @@
 import hydra
 from omegaconf import DictConfig

+from lerobot.common.robot_devices.robots.utils import Robot

-def make_robot(cfg: DictConfig):
+
+def make_robot(cfg: DictConfig) -> Robot:
    robot = hydra.utils.instantiate(cfg)
    return robot
--- a/lerobot/common/robot_devices/robots/manipulator.py
+++ b/lerobot/common/robot_devices/robots/manipulator.py
@@ -350,6 +350,25 @@ class ManipulatorRobot:
        self.is_connected = False
        self.logs = {}

+    @property
+    def has_camera(self):
+        return len(self.cameras) > 0
+
+    @property
+    def num_cameras(self):
+        return len(self.cameras)
+
+    @property
+    def available_arms(self):
+        available_arms = []
+        for name in self.follower_arms:
+            arm_id = get_arm_id(name, "follower")
+            available_arms.append(arm_id)
+        for name in self.leader_arms:
+            arm_id = get_arm_id(name, "leader")
+            available_arms.append(arm_id)
+        return available_arms
+
    def connect(self):
        if self.is_connected:
            raise RobotDeviceAlreadyConnectedError(
@@ -539,8 +558,8 @@ class ManipulatorRobot:
            self.follower_arms[name].write("P_Coefficient", 32, "shoulder_pan")
            # self.follower_arms[name].write("D_Coefficient", 230, "shoulder_pan")
            self.follower_arms[name].write("D_Coefficient", 32, "shoulder_pan")
-            #self.follower_arms[name].write("Acceleration", 254)
-            #self.follower_arms[name].write("Minimum_Startup_Force", 16)
+            # self.follower_arms[name].write("Acceleration", 254)
+            # self.follower_arms[name].write("Minimum_Startup_Force", 16)
            self.follower_arms[name].write("Lock", 0)
            self.follower_arms[name].write("Maximum_Acceleration", 250)

@@ -709,6 +728,10 @@ class ManipulatorRobot:

        return torch.cat(action_sent)

+    def print_logs(self):
+        pass
+        # TODO(aliberts): move robot-specific logs logic here
+
    def disconnect(self):
        if not self.is_connected:
            raise RobotDeviceNotConnectedError(
--- a/lerobot/common/robot_devices/robots/stretch.py
+++ b/lerobot/common/robot_devices/robots/stretch.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+from dataclasses import dataclass, field, replace
+
+import torch
+from stretch_body.gamepad_teleop import GamePadTeleop
+from stretch_body.robot import Robot as StretchAPI
+from stretch_body.robot_params import RobotParams
+
+from lerobot.common.robot_devices.cameras.utils import Camera
+
+
+@dataclass
+class StretchRobotConfig:
+    robot_type: str | None = "stretch"
+    cameras: dict[str, Camera] = field(default_factory=lambda: {})
+    # TODO(aliberts): add feature with max_relative target
+    # TODO(aliberts): add comment on max_relative target
+    max_relative_target: list[float] | float | None = None
+
+
+class StretchRobot(StretchAPI):
+    """Wrapper of stretch_body.robot.Robot"""
+
+    def __init__(self, config: StretchRobotConfig | None = None, **kwargs):
+        super().__init__()
+        if config is None:
+            config = StretchRobotConfig()
+        # Overwrite config arguments using kwargs
+        self.config = replace(config, **kwargs)
+
+        self.robot_type = self.config.robot_type
+        self.cameras = self.config.cameras
+        self.is_connected = False
+        self.teleop = None
+        self.logs = {}
+
+        # TODO(aliberts): test this
+        RobotParams.set_logging_level("WARNING")
+        RobotParams.set_logging_formatter("brief_console_formatter")
+
+        self.state_keys = None
+        self.action_keys = None
+
+    def connect(self) -> None:
+        self.is_connected = self.startup()
+        if not self.is_connected:
+            print("Another process is already using Stretch. Try running 'stretch_free_robot_process.py'")
+            raise ConnectionError()
+
+        for name in self.cameras:
+            self.cameras[name].connect()
+            self.is_connected = self.is_connected and self.cameras[name].is_connected
+
+        if not self.is_connected:
+            print("Could not connect to the cameras, check that all cameras are plugged-in.")
+            raise ConnectionError()
+
+        self.run_calibration()
+
+    def run_calibration(self) -> None:
+        if not self.is_homed():
+            self.home()
+
+    def teleop_step(
+        self, record_data=False
+    ) -> None | tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]:
+        # TODO(aliberts): return ndarrays instead of torch.Tensors
+        if not self.is_connected:
+            raise ConnectionError()
+
+        if self.teleop is None:
+            self.teleop = GamePadTeleop(robot_instance=False)
+            self.teleop.startup(robot=self)
+
+        before_read_t = time.perf_counter()
+        state = self.get_state()
+        action = self.teleop.gamepad_controller.get_state()
+        self.logs["read_pos_dt_s"] = time.perf_counter() - before_read_t
+
+        before_write_t = time.perf_counter()
+        self.teleop.do_motion(robot=self)
+        self.push_command()
+        self.logs["write_pos_dt_s"] = time.perf_counter() - before_write_t
+
+        if self.state_keys is None:
+            self.state_keys = list(state)
+
+        if not record_data:
+            return
+
+        state = torch.as_tensor(list(state.values()))
+        action = torch.as_tensor(list(action.values()))
+
+        # Capture images from cameras
+        images = {}
+        for name in self.cameras:
+            before_camread_t = time.perf_counter()
+            images[name] = self.cameras[name].async_read()
+            images[name] = torch.from_numpy(images[name])
+            self.logs[f"read_camera_{name}_dt_s"] = self.cameras[name].logs["delta_timestamp_s"]
+            self.logs[f"async_read_camera_{name}_dt_s"] = time.perf_counter() - before_camread_t
+
+        # Populate output dictionnaries
+        obs_dict, action_dict = {}, {}
+        obs_dict["observation.state"] = state
+        action_dict["action"] = action
+        for name in self.cameras:
+            obs_dict[f"observation.images.{name}"] = images[name]
+
+        return obs_dict, action_dict
+
+    def get_state(self) -> dict:
+        status = self.get_status()
+        return {
+            "head_pan.pos": status["head"]["head_pan"]["pos"],
+            "head_tilt.pos": status["head"]["head_tilt"]["pos"],
+            "lift.pos": status["lift"]["pos"],
+            "arm.pos": status["arm"]["pos"],
+            "wrist_pitch.pos": status["end_of_arm"]["wrist_pitch"]["pos"],
+            "wrist_roll.pos": status["end_of_arm"]["wrist_roll"]["pos"],
+            "wrist_yaw.pos": status["end_of_arm"]["wrist_yaw"]["pos"],
+            "gripper.pos": status["end_of_arm"]["stretch_gripper"]["pos"],
+            "base_x.vel": status["base"]["x_vel"],
+            "base_y.vel": status["base"]["y_vel"],
+            "base_theta.vel": status["base"]["theta_vel"],
+        }
+
+    def capture_observation(self) -> dict:
+        # TODO(aliberts): return ndarrays instead of torch.Tensors
+        before_read_t = time.perf_counter()
+        state = self.get_state()
+        self.logs["read_pos_dt_s"] = time.perf_counter() - before_read_t
+
+        if self.state_keys is None:
+            self.state_keys = list(state)
+
+        state = torch.as_tensor(list(state.values()))
+
+        # Capture images from cameras
+        images = {}
+        for name in self.cameras:
+            before_camread_t = time.perf_counter()
+            images[name] = self.cameras[name].async_read()
+            images[name] = torch.from_numpy(images[name])
+            self.logs[f"read_camera_{name}_dt_s"] = self.cameras[name].logs["delta_timestamp_s"]
+            self.logs[f"async_read_camera_{name}_dt_s"] = time.perf_counter() - before_camread_t
+
+        # Populate output dictionnaries
+        obs_dict = {}
+        obs_dict["observation.state"] = state
+        for name in self.cameras:
+            obs_dict[f"observation.images.{name}"] = images[name]
+
+        return obs_dict
+
+    def send_action(self, action: torch.Tensor) -> torch.Tensor:
+        # TODO(aliberts): return ndarrays instead of torch.Tensors
+        if not self.is_connected:
+            raise ConnectionError()
+
+        if self.teleop is None:
+            self.teleop = GamePadTeleop(robot_instance=False)
+            self.teleop.startup(robot=self)
+
+        if self.action_keys is None:
+            dummy_action = self.teleop.gamepad_controller.get_state()
+            self.action_keys = list(dummy_action.keys())
+
+        action_dict = dict(zip(self.action_keys, action.tolist(), strict=True))
+
+        before_write_t = time.perf_counter()
+        self.teleop.do_motion(state=action_dict, robot=self)
+        self.push_command()
+        self.logs["write_pos_dt_s"] = time.perf_counter() - before_write_t
+
+        # TODO(aliberts): return action_sent when motion is limited
+        return action
+
+    def print_logs(self) -> None:
+        pass
+        # TODO(aliberts): move robot-specific logs logic here
+
+    def teleop_safety_stop(self) -> None:
+        if self.teleop is not None:
+            self.teleop._safety_stop(robot=self)
+
+    def disconnect(self) -> None:
+        self.stop()
+        if self.teleop is not None:
+            self.teleop.gamepad_controller.stop()
+            self.teleop.stop()
+
+        if len(self.cameras) > 0:
+            for cam in self.cameras.values():
+                cam.disconnect()
+
+        self.is_connected = False
+
+    def __del__(self):
+        self.disconnect()
--- a/lerobot/common/robot_devices/robots/utils.py
+++ b/lerobot/common/robot_devices/robots/utils.py
@@ -9,8 +9,12 @@ def get_arm_id(name, arm_type):


 class Robot(Protocol):
-    def init_teleop(self): ...
+    # TODO(rcadene, aliberts): Add unit test checking the protocol is implemented in the corresponding classes
+    robot_type: str
+
+    def connect(self): ...
    def run_calibration(self): ...
    def teleop_step(self, record_data=False): ...
    def capture_observation(self): ...
    def send_action(self, action): ...
+    def disconnect(self): ...
--- a/lerobot/common/robot_devices/utils.py
+++ b/lerobot/common/robot_devices/utils.py
@@ -16,6 +16,20 @@ def busy_wait(seconds):
            time.sleep(seconds)


+def safe_disconnect(func):
+    # TODO(aliberts): Allow to pass custom exceptions
+    # (e.g. ThreadServiceExit, KeyboardInterrupt, SystemExit, UnpluggedError, DynamixelCommError)
+    def wrapper(robot, *args, **kwargs):
+        try:
+            return func(robot, *args, **kwargs)
+        except Exception as e:
+            if robot.is_connected:
+                robot.disconnect()
+            raise e
+
+    return wrapper
+
+
 class RobotDeviceNotConnectedError(Exception):
    """Exception raised when the robot device is not connected."""

--- a/lerobot/common/utils/utils.py
+++ b/lerobot/common/utils/utils.py
@@ -16,6 +16,7 @@
 import logging
 import os
 import os.path as osp
+import platform
 import random
 from contextlib import contextmanager
 from datetime import datetime, timezone
@@ -28,6 +29,12 @@ import torch
 from omegaconf import DictConfig


+def none_or_int(value):
+    if value == "None":
+        return None
+    return int(value)
+
+
 def inside_slurm():
    """Check whether the python process was launched through slurm"""
    # TODO(rcadene): return False for interactive mode `--pty bash`
@@ -183,3 +190,30 @@ def print_cuda_memory_usage():

 def capture_timestamp_utc():
    return datetime.now(timezone.utc)
+
+
+def say(text, blocking=False):
+    # Check if mac, linux, or windows.
+    if platform.system() == "Darwin":
+        cmd = f'say "{text}"'
+    elif platform.system() == "Linux":
+        cmd = f'spd-say "{text}"'
+    elif platform.system() == "Windows":
+        cmd = (
+            'PowerShell -Command "Add-Type -AssemblyName System.Speech; '
+            f"(New-Object System.Speech.Synthesis.SpeechSynthesizer).Speak('{text}')\""
+        )
+
+    if not blocking and platform.system() in ["Darwin", "Linux"]:
+        # TODO(rcadene): Make it work for Windows
+        # Use the ampersand to run command in the background
+        cmd += " &"
+
+    os.system(cmd)
+
+
+def log_say(text, play_sounds, blocking=False):
+    logging.info(text)
+
+    if play_sounds:
+        say(text, blocking)
--- a/lerobot/configs/default.yaml
+++ b/lerobot/configs/default.yaml
@@ -120,7 +120,7 @@ eval:
  # `batch_size` specifies the number of environments to use in a gym.vector.VectorEnv.
  batch_size: 1
  # `use_async_envs` specifies whether to use asynchronous environments (multiprocessing).
-  use_async_envs: true
+  use_async_envs: false

 wandb:
  enable: false
--- a/lerobot/configs/env/aloha.yaml
+++ b/lerobot/configs/env/aloha.yaml
@@ -2,11 +2,6 @@

 fps: 50

-eval:
-  # `use_async_envs` specifies whether to use asynchronous environments (multiprocessing).
-  # set it to false to avoid some problems of the aloha env
-  use_async_envs: false
-
 env:
  name: aloha
  task: AlohaInsertion-v0
--- a/lerobot/configs/env/aloha_real.yaml
+++ b/lerobot/configs/env/aloha_real.yaml
@@ -0,0 +1,10 @@
+# @package _global_
+
+fps: 30
+
+env:
+  name: real_world
+  task: null
+  state_dim: 18
+  action_dim: 18
+  fps: ${fps}
--- a/lerobot/configs/env/xarm.yaml
+++ b/lerobot/configs/env/xarm.yaml
@@ -2,11 +2,6 @@

 fps: 15

-eval:
-  # `use_async_envs` specifies whether to use asynchronous environments (multiprocessing).
-  # set it to false to avoid some problems of the aloha env
-  use_async_envs: false
-
 env:
  name: xarm
  task: XarmLift-v0
--- a/lerobot/configs/policy/act_aloha_real.yaml
+++ b/lerobot/configs/policy/act_aloha_real.yaml
@@ -1,16 +1,22 @@
 # @package _global_

-# Use `act_real.yaml` to train on real-world Aloha/Aloha2 datasets.
-# Compared to `act.yaml`, it contains 4 cameras (i.e. cam_right_wrist, cam_left_wrist, images,
-# cam_low) instead of 1 camera (i.e. top). Also, `training.eval_freq` is set to -1. This config is used
-# to evaluate checkpoints at a certain frequency of training steps. When it is set to -1, it deactivates evaluation.
-# This is because real-world evaluation is done through [dora-lerobot](https://github.com/dora-rs/dora-lerobot).
-# Look at its README for more information on how to evaluate a checkpoint in the real-world.
+# Use `act_aloha_real.yaml` to train on real-world datasets collected on Aloha or Aloha-2 robots.
+# Compared to `act.yaml`, it contains 4 cameras (i.e. cam_right_wrist, cam_left_wrist, cam_high, cam_low) instead of 1 camera (i.e. top).
+# Also, `training.eval_freq` is set to -1. This config is used to evaluate checkpoints at a certain frequency of training steps.
+# When it is set to -1, it deactivates evaluation. This is because real-world evaluation is done through our `control_robot.py` script.
+# Look at the documentation in header of `control_robot.py` for more information on how to collect data , train and evaluate a policy.
 #
-# Example of usage for training:
+# Example of usage for training and inference with `control_robot.py`:
 # ```bash
 # python lerobot/scripts/train.py \
-#   policy=act_real \
+#   policy=act_aloha_real \
+#   env=aloha_real
+# ```
+#
+# Example of usage for training and inference with [Dora-rs](https://github.com/dora-rs/dora-lerobot):
+# ```bash
+# python lerobot/scripts/train.py \
+#   policy=act_aloha_real \
 #   env=dora_aloha_real
 # ```

@@ -36,10 +42,11 @@ override_dataset_stats:
    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)

 training:
-  offline_steps: 100000
+  offline_steps: 80000
  online_steps: 0
  eval_freq: -1
-  save_freq: 20000
+  save_freq: 10000
+  log_freq: 100
  save_checkpoint: true

  batch_size: 8
@@ -62,7 +69,7 @@ policy:

  # Input / output structure.
  n_obs_steps: 1
-  chunk_size: 100 # chunk_size
+  chunk_size: 100
  n_action_steps: 100

  input_shapes:
@@ -107,7 +114,7 @@ policy:
  n_vae_encoder_layers: 4

  # Inference.
-  temporal_ensemble_coeff: null
+  temporal_ensemble_momentum: null

  # Training and loss computation.
  dropout: 0.1
--- a/lerobot/configs/policy/act_real_no_state.yaml
+++ b/lerobot/configs/policy/act_real_no_state.yaml
@@ -1,110 +0,0 @@
-# @package _global_
-
-# Use `act_real_no_state.yaml` to train on real-world Aloha/Aloha2 datasets when cameras are moving (e.g. wrist cameras)
-# Compared to `act_real.yaml`, it is camera only and does not use the state as input which is vector of robot joint positions.
-# We validated experimentaly that not using state reaches better success rate. Our hypothesis is that `act_real.yaml` might
-# overfits to the state, because the images are more complex to learn from since they are moving.
-#
-# Example of usage for training:
-# ```bash
-# python lerobot/scripts/train.py \
-#   policy=act_real_no_state \
-#   env=dora_aloha_real
-# ```
-
-seed: 1000
-dataset_repo_id: lerobot/aloha_static_vinh_cup
-
-override_dataset_stats:
-  observation.images.cam_right_wrist:
-    # stats from imagenet, since we use a pretrained vision model
-    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
-    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
-  observation.images.cam_left_wrist:
-    # stats from imagenet, since we use a pretrained vision model
-    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
-    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
-  observation.images.cam_high:
-    # stats from imagenet, since we use a pretrained vision model
-    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
-    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
-  observation.images.cam_low:
-    # stats from imagenet, since we use a pretrained vision model
-    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
-    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
-
-training:
-  offline_steps: 100000
-  online_steps: 0
-  eval_freq: -1
-  save_freq: 20000
-  save_checkpoint: true
-
-  batch_size: 8
-  lr: 1e-5
-  lr_backbone: 1e-5
-  weight_decay: 1e-4
-  grad_clip_norm: 10
-  online_steps_between_rollouts: 1
-
-  delta_timestamps:
-    action: "[i / ${fps} for i in range(${policy.chunk_size})]"
-
-eval:
-  n_episodes: 50
-  batch_size: 50
-
-# See `configuration_act.py` for more details.
-policy:
-  name: act
-
-  # Input / output structure.
-  n_obs_steps: 1
-  chunk_size: 100 # chunk_size
-  n_action_steps: 100
-
-  input_shapes:
-    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
-    observation.images.cam_right_wrist: [3, 480, 640]
-    observation.images.cam_left_wrist: [3, 480, 640]
-    observation.images.cam_high: [3, 480, 640]
-    observation.images.cam_low: [3, 480, 640]
-  output_shapes:
-    action: ["${env.action_dim}"]
-
-  # Normalization / Unnormalization
-  input_normalization_modes:
-    observation.images.cam_right_wrist: mean_std
-    observation.images.cam_left_wrist: mean_std
-    observation.images.cam_high: mean_std
-    observation.images.cam_low: mean_std
-  output_normalization_modes:
-    action: mean_std
-
-  # Architecture.
-  # Vision backbone.
-  vision_backbone: resnet18
-  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
-  replace_final_stride_with_dilation: false
-  # Transformer layers.
-  pre_norm: false
-  dim_model: 512
-  n_heads: 8
-  dim_feedforward: 3200
-  feedforward_activation: relu
-  n_encoder_layers: 4
-  # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
-  # that means only the first layer is used. Here we match the original implementation by setting this to 1.
-  # See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
-  n_decoder_layers: 1
-  # VAE.
-  use_vae: true
-  latent_dim: 32
-  n_vae_encoder_layers: 4
-
-  # Inference.
-  temporal_ensemble_coeff: null
-
-  # Training and loss computation.
-  dropout: 0.1
-  kl_weight: 10.0
--- a/lerobot/configs/robot/aloha.yaml
+++ b/lerobot/configs/robot/aloha.yaml
@@ -91,25 +91,25 @@ follower_arms:
 cameras:
  cam_high:
    _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera
-    camera_index: 128422271347
+    serial_number: 128422271347
    fps: 30
    width: 640
    height: 480
  cam_low:
    _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera
-    camera_index: 130322270656
+    serial_number: 130322270656
    fps: 30
    width: 640
    height: 480
  cam_left_wrist:
    _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera
-    camera_index: 218622272670
+    serial_number: 218622272670
    fps: 30
    width: 640
    height: 480
  cam_right_wrist:
    _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera
-    camera_index: 130322272300
+    serial_number: 130322272300
    fps: 30
    width: 640
    height: 480
--- a/lerobot/configs/robot/stretch.yaml
+++ b/lerobot/configs/robot/stretch.yaml
@@ -0,0 +1,24 @@
+_target_: lerobot.common.robot_devices.robots.stretch.StretchRobot
+robot_type: stretch3
+
+cameras:
+  navigation:
+    _target_: lerobot.common.robot_devices.cameras.opencv.OpenCVCamera
+    camera_index: /dev/hello-nav-head-camera
+    fps: 10
+    width: 1280
+    height: 720
+    rotation: -90
+  head:
+    _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera.init_from_name
+    name: Intel RealSense D435I
+    fps: 30
+    width: 640
+    height: 480
+    rotation: 90
+  wrist:
+    _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera.init_from_name
+    name: Intel RealSense D405
+    fps: 30
+    width: 640
+    height: 480
--- a/lerobot/scripts/control_robot.py
+++ b/lerobot/scripts/control_robot.py
@@ -99,161 +99,53 @@ python lerobot/scripts/control_robot.py record \
 """

 import argparse
-import concurrent.futures
-import json
 import logging
-import os
-import platform
-import shutil
 import time
-import traceback
-from contextlib import nullcontext
-from functools import cache
 from pathlib import Path
-
-import cv2
-import torch
-import tqdm
-from omegaconf import DictConfig
-from PIL import Image
-from termcolor import colored
+from typing import List

 # from safetensors.torch import load_file, save_file
-from lerobot.common.datasets.compute_stats import compute_stats
-from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset
-from lerobot.common.datasets.push_dataset_to_hub.aloha_hdf5_format import to_hf_dataset
-from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes, get_default_encoding
-from lerobot.common.datasets.utils import calculate_episode_data_index, create_branch
-from lerobot.common.datasets.video_utils import encode_video_frames
-from lerobot.common.policies.factory import make_policy
-from lerobot.common.robot_devices.robots.factory import make_robot
-from lerobot.common.robot_devices.robots.utils import Robot, get_arm_id
-from lerobot.common.robot_devices.utils import busy_wait
-from lerobot.common.utils.utils import get_safe_torch_device, init_hydra_config, init_logging, set_global_seed
-from lerobot.scripts.eval import get_pretrained_policy_path
-from lerobot.scripts.push_dataset_to_hub import (
-    push_dataset_card_to_hub,
-    push_meta_data_to_hub,
-    push_videos_to_hub,
-    save_meta_data,
+from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
+from lerobot.common.datasets.populate_dataset import (
+    create_lerobot_dataset,
+    delete_current_episode,
+    init_dataset,
+    save_current_episode,
 )
-
-########################################################################################
-# Utilities
-########################################################################################
-
-
-def say(text, blocking=False):
-    # Check if mac, linux, or windows.
-    if platform.system() == "Darwin":
-        cmd = f'say "{text}"'
-    elif platform.system() == "Linux":
-        cmd = f'spd-say "{text}"'
-    elif platform.system() == "Windows":
-        cmd = (
-            'PowerShell -Command "Add-Type -AssemblyName System.Speech; '
-            f"(New-Object System.Speech.Synthesis.SpeechSynthesizer).Speak('{text}')\""
-        )
-
-    if not blocking and platform.system() in ["Darwin", "Linux"]:
-        # TODO(rcadene): Make it work for Windows
-        # Use the ampersand to run command in the background
-        cmd += " &"
-
-    os.system(cmd)
-
-
-def save_image(img_tensor, key, frame_index, episode_index, videos_dir):
-    img = Image.fromarray(img_tensor.numpy())
-    path = videos_dir / f"{key}_episode_{episode_index:06d}" / f"frame_{frame_index:06d}.png"
-    path.parent.mkdir(parents=True, exist_ok=True)
-    img.save(str(path), quality=100)
-
-
-def none_or_int(value):
-    if value == "None":
-        return None
-    return int(value)
-
-
-def log_control_info(robot, dt_s, episode_index=None, frame_index=None, fps=None):
-    log_items = []
-    if episode_index is not None:
-        log_items.append(f"ep:{episode_index}")
-    if frame_index is not None:
-        log_items.append(f"frame:{frame_index}")
-
-    def log_dt(shortname, dt_val_s):
-        nonlocal log_items, fps
-        info_str = f"{shortname}:{dt_val_s * 1000:5.2f} ({1/ dt_val_s:3.1f}hz)"
-        if fps is not None:
-            actual_fps = 1 / dt_val_s
-            if actual_fps < fps - 1:
-                info_str = colored(info_str, "yellow")
-        log_items.append(info_str)
-
-    # total step time displayed in milliseconds and its frequency
-    log_dt("dt", dt_s)
-
-    for name in robot.leader_arms:
-        key = f"read_leader_{name}_pos_dt_s"
-        if key in robot.logs:
-            log_dt("dtRlead", robot.logs[key])
-
-    for name in robot.follower_arms:
-        key = f"write_follower_{name}_goal_pos_dt_s"
-        if key in robot.logs:
-            log_dt("dtWfoll", robot.logs[key])
-
-        key = f"read_follower_{name}_pos_dt_s"
-        if key in robot.logs:
-            log_dt("dtRfoll", robot.logs[key])
-
-    for name in robot.cameras:
-        key = f"read_camera_{name}_dt_s"
-        if key in robot.logs:
-            log_dt(f"dtR{name}", robot.logs[key])
-
-    info_str = " ".join(log_items)
-    logging.info(info_str)
-
-
-@cache
-def is_headless():
-    """Detects if python is running without a monitor."""
-    try:
-        import pynput  # noqa
-
-        return False
-    except Exception:
-        print(
-            "Error trying to import pynput. Switching to headless mode. "
-            "As a result, the video stream from the cameras won't be shown, "
-            "and you won't be able to change the control flow with keyboards. "
-            "For more info, see traceback below.\n"
-        )
-        traceback.print_exc()
-        print()
-        return True
-
+from lerobot.common.robot_devices.control_utils import (
+    control_loop,
+    has_method,
+    init_keyboard_listener,
+    init_policy,
+    log_control_info,
+    record_episode,
+    reset_environment,
+    sanity_check_dataset_name,
+    stop_recording,
+    warmup_record,
+)
+from lerobot.common.robot_devices.robots.factory import make_robot
+from lerobot.common.robot_devices.robots.utils import Robot
+from lerobot.common.robot_devices.utils import busy_wait, safe_disconnect
+from lerobot.common.utils.utils import init_hydra_config, init_logging, log_say, none_or_int

 ########################################################################################
 # Control modes
 ########################################################################################


+@safe_disconnect
 def calibrate(robot: Robot, arms: list[str] | None):
-    available_arms = []
-    for name in robot.follower_arms:
-        arm_id = get_arm_id(name, "follower")
-        available_arms.append(arm_id)
-    for name in robot.leader_arms:
-        arm_id = get_arm_id(name, "leader")
-        available_arms.append(arm_id)
+    # TODO(aliberts): move this code in robots' classes
+    if robot.robot_type.startswith("stretch"):
+        if not robot.is_connected:
+            robot.connect()
+        if not robot.is_homed():
+            robot.home()
+        return

-    unknown_arms = [arm_id for arm_id in arms if arm_id not in available_arms]
-
-    available_arms_str = " ".join(available_arms)
+    unknown_arms = [arm_id for arm_id in arms if arm_id not in robot.available_arms]
+    available_arms_str = " ".join(robot.available_arms)
    unknown_arms_str = " ".join(unknown_arms)

    if arms is None or len(arms) == 0:
@@ -285,34 +177,27 @@ def calibrate(robot: Robot, arms: list[str] | None):
    print("Calibration is done! You can now teleoperate and record datasets!")


-def teleoperate(robot: Robot, fps: int | None = None, teleop_time_s: float | None = None):
-    # TODO(rcadene): Add option to record logs
-    if not robot.is_connected:
-        robot.connect()
-
-    start_teleop_t = time.perf_counter()
-    while True:
-        start_loop_t = time.perf_counter()
-        robot.teleop_step()
-
-        if fps is not None:
-            dt_s = time.perf_counter() - start_loop_t
-            busy_wait(1 / fps - dt_s)
-
-        dt_s = time.perf_counter() - start_loop_t
-        log_control_info(robot, dt_s, fps=fps)
-
-        if teleop_time_s is not None and time.perf_counter() - start_teleop_t > teleop_time_s:
-            break
+@safe_disconnect
+def teleoperate(
+    robot: Robot, fps: int | None = None, teleop_time_s: float | None = None, display_cameras: bool = False
+):
+    control_loop(
+        robot,
+        control_time_s=teleop_time_s,
+        fps=fps,
+        teleoperate=True,
+        display_cameras=display_cameras,
+    )


+@safe_disconnect
 def record(
    robot: Robot,
-    policy: torch.nn.Module | None = None,
-    hydra_cfg: DictConfig | None = None,
+    root: str,
+    repo_id: str,
+    pretrained_policy_name_or_path: str | None = None,
+    policy_overrides: List[str] | None = None,
    fps: int | None = None,
-    root="data",
-    repo_id="lerobot/debug",
    warmup_time_s=2,
    episode_time_s=10,
    reset_time_s=5,
@@ -321,377 +206,115 @@ def record(
    run_compute_stats=True,
    push_to_hub=True,
    tags=None,
-    num_image_writers_per_camera=4,
+    num_image_writer_processes=0,
+    num_image_writer_threads_per_camera=4,
    force_override=False,
+    display_cameras=True,
+    play_sounds=True,
 ):
    # TODO(rcadene): Add option to record logs
-    # TODO(rcadene): Clean this function via decomposition in higher level functions
+    listener = None
+    events = None
+    policy = None
+    device = None
+    use_amp = None

-    _, dataset_name = repo_id.split("/")
-    if dataset_name.startswith("eval_") and policy is None:
-        raise ValueError(
-            f"Your dataset name begins by 'eval_' ({dataset_name}) but no policy is provided ({policy})."
-        )
+    # Load pretrained policy
+    if pretrained_policy_name_or_path is not None:
+        policy, policy_fps, device, use_amp = init_policy(pretrained_policy_name_or_path, policy_overrides)

-    if not video:
-        raise NotImplementedError()
+        if fps is None:
+            fps = policy_fps
+            logging.warning(f"No fps provided, so using the fps from policy config ({policy_fps}).")
+        elif fps != policy_fps:
+            logging.warning(
+                f"There is a mismatch between the provided fps ({fps}) and the one from policy config ({policy_fps})."
+            )
+
+    # Create empty dataset or load existing saved episodes
+    sanity_check_dataset_name(repo_id, policy)
+    dataset = init_dataset(
+        repo_id,
+        root,
+        force_override,
+        fps,
+        video,
+        write_images=robot.has_camera,
+        num_image_writer_processes=num_image_writer_processes,
+        num_image_writer_threads=num_image_writer_threads_per_camera * robot.num_cameras,
+    )

    if not robot.is_connected:
        robot.connect()

-    local_dir = Path(root) / repo_id
-    if local_dir.exists() and force_override:
-        shutil.rmtree(local_dir)
+    listener, events = init_keyboard_listener()

-    episodes_dir = local_dir / "episodes"
-    episodes_dir.mkdir(parents=True, exist_ok=True)
+    # Execute a few seconds without recording to:
+    # 1. teleoperate the robot to move it in starting position if no policy provided,
+    # 2. give times to the robot devices to connect and start synchronizing,
+    # 3. place the cameras windows on screen
+    enable_teleoperation = policy is None
+    log_say("Warmup record", play_sounds)
+    warmup_record(robot, events, enable_teleoperation, warmup_time_s, display_cameras, fps)

-    videos_dir = local_dir / "videos"
-    videos_dir.mkdir(parents=True, exist_ok=True)
+    if has_method(robot, "teleop_safety_stop"):
+        robot.teleop_safety_stop()

-    # Logic to resume data recording
-    rec_info_path = episodes_dir / "data_recording_info.json"
-    if rec_info_path.exists():
-        with open(rec_info_path) as f:
-            rec_info = json.load(f)
-        episode_index = rec_info["last_episode_index"] + 1
-    else:
-        episode_index = 0
+    while True:
+        if dataset["num_episodes"] >= num_episodes:
+            break

-    if is_headless():
-        logging.info(
-            "Headless environment detected. On-screen cameras display and keyboard inputs will not be available."
+        episode_index = dataset["num_episodes"]
+        log_say(f"Recording episode {episode_index}", play_sounds)
+        record_episode(
+            dataset=dataset,
+            robot=robot,
+            events=events,
+            episode_time_s=episode_time_s,
+            display_cameras=display_cameras,
+            policy=policy,
+            device=device,
+            use_amp=use_amp,
+            fps=fps,
        )

-    # Allow to exit early while recording an episode or resetting the environment,
-    # by tapping the right arrow key '->'. This might require a sudo permission
-    # to allow your terminal to monitor keyboard events.
-    exit_early = False
-    rerecord_episode = False
-    stop_recording = False
+        # Execute a few seconds without recording to give time to manually reset the environment
+        # Current code logic doesn't allow to teleoperate during this time.
+        # TODO(rcadene): add an option to enable teleoperation during reset
+        # Skip reset for the last episode to be recorded
+        if not events["stop_recording"] and (
+            (episode_index < num_episodes - 1) or events["rerecord_episode"]
+        ):
+            log_say("Reset the environment", play_sounds)
+            reset_environment(robot, events, reset_time_s)

-    # Only import pynput if not in a headless environment
-    if not is_headless():
-        from pynput import keyboard
+        if events["rerecord_episode"]:
+            log_say("Re-record episode", play_sounds)
+            events["rerecord_episode"] = False
+            events["exit_early"] = False
+            delete_current_episode(dataset)
+            continue

-        def on_press(key):
-            nonlocal exit_early, rerecord_episode, stop_recording
-            try:
-                if key == keyboard.Key.right:
-                    print("Right arrow key pressed. Exiting loop...")
-                    exit_early = True
-                elif key == keyboard.Key.left:
-                    print("Left arrow key pressed. Exiting loop and rerecord the last episode...")
-                    rerecord_episode = True
-                    exit_early = True
-                elif key == keyboard.Key.esc:
-                    print("Escape key pressed. Stopping data recording...")
-                    stop_recording = True
-                    exit_early = True
-            except Exception as e:
-                print(f"Error handling key press: {e}")
+        # Increment by one dataset["current_episode_index"]
+        save_current_episode(dataset)

-        listener = keyboard.Listener(on_press=on_press)
-        listener.start()
+        if events["stop_recording"]:
+            break

-    # Load policy if any
-    if policy is not None:
-        # Check device is available
-        device = get_safe_torch_device(hydra_cfg.device, log=True)
+    log_say("Stop recording", play_sounds, blocking=True)
+    stop_recording(robot, listener, display_cameras)

-        policy.eval()
-        policy.to(device)
+    lerobot_dataset = create_lerobot_dataset(dataset, run_compute_stats, push_to_hub, tags, play_sounds)

-        torch.backends.cudnn.benchmark = True
-        torch.backends.cuda.matmul.allow_tf32 = True
-        set_global_seed(hydra_cfg.seed)
-
-        # override fps using policy fps
-        fps = hydra_cfg.env.fps
-
-    # Execute a few seconds without recording data, to give times
-    # to the robot devices to connect and start synchronizing.
-    timestamp = 0
-    start_warmup_t = time.perf_counter()
-    is_warmup_print = False
-    while timestamp < warmup_time_s:
-        if not is_warmup_print:
-            logging.info("Warming up (no data recording)")
-            say("Warming up")
-            is_warmup_print = True
-
-        start_loop_t = time.perf_counter()
-
-        if policy is None:
-            observation, action = robot.teleop_step(record_data=True)
-        else:
-            observation = robot.capture_observation()
-
-        if not is_headless():
-            image_keys = [key for key in observation if "image" in key]
-            for key in image_keys:
-                cv2.imshow(key, cv2.cvtColor(observation[key].numpy(), cv2.COLOR_RGB2BGR))
-            cv2.waitKey(1)
-
-        dt_s = time.perf_counter() - start_loop_t
-        busy_wait(1 / fps - dt_s)
-
-        dt_s = time.perf_counter() - start_loop_t
-        log_control_info(robot, dt_s, fps=fps)
-
-        timestamp = time.perf_counter() - start_warmup_t
-
-    # Save images using threads to reach high fps (30 and more)
-    # Using `with` to exist smoothly if an execption is raised.
-    futures = []
-    num_image_writers = num_image_writers_per_camera * len(robot.cameras)
-    if num_image_writers == 0:
-        num_image_writers = 1
-    with concurrent.futures.ThreadPoolExecutor(max_workers=num_image_writers) as executor:
-        # Start recording all episodes
-        while episode_index < num_episodes:
-            logging.info(f"Recording episode {episode_index}")
-            say(f"Recording episode {episode_index}")
-            ep_dict = {}
-            frame_index = 0
-            timestamp = 0
-            start_episode_t = time.perf_counter()
-            while timestamp < episode_time_s:
-                start_loop_t = time.perf_counter()
-
-                if policy is None:
-                    observation, action = robot.teleop_step(record_data=True)
-                else:
-                    observation = robot.capture_observation()
-
-                image_keys = [key for key in observation if "image" in key]
-                not_image_keys = [key for key in observation if "image" not in key]
-
-                for key in image_keys:
-                    futures += [
-                        executor.submit(
-                            save_image, observation[key], key, frame_index, episode_index, videos_dir
-                        )
-                    ]
-
-                if not is_headless():
-                    image_keys = [key for key in observation if "image" in key]
-                    for key in image_keys:
-                        cv2.imshow(key, cv2.cvtColor(observation[key].numpy(), cv2.COLOR_RGB2BGR))
-                    cv2.waitKey(1)
-
-                for key in not_image_keys:
-                    if key not in ep_dict:
-                        ep_dict[key] = []
-                    ep_dict[key].append(observation[key])
-
-                if policy is not None:
-                    with (
-                        torch.inference_mode(),
-                        torch.autocast(device_type=device.type)
-                        if device.type == "cuda" and hydra_cfg.use_amp
-                        else nullcontext(),
-                    ):
-                        # Convert to pytorch format: channel first and float32 in [0,1] with batch dimension
-                        for name in observation:
-                            if "image" in name:
-                                observation[name] = observation[name].type(torch.float32) / 255
-                                observation[name] = observation[name].permute(2, 0, 1).contiguous()
-                            observation[name] = observation[name].unsqueeze(0)
-                            observation[name] = observation[name].to(device)
-
-                        # Compute the next action with the policy
-                        # based on the current observation
-                        action = policy.select_action(observation)
-
-                        # Remove batch dimension
-                        action = action.squeeze(0)
-
-                        # Move to cpu, if not already the case
-                        action = action.to("cpu")
-
-                    # Order the robot to move
-                    action_sent = robot.send_action(action)
-
-                    # Action can eventually be clipped using `max_relative_target`,
-                    # so action actually sent is saved in the dataset.
-                    action = {"action": action_sent}
-
-                for key in action:
-                    if key not in ep_dict:
-                        ep_dict[key] = []
-                    ep_dict[key].append(action[key])
-
-                frame_index += 1
-
-                dt_s = time.perf_counter() - start_loop_t
-                busy_wait(1 / fps - dt_s)
-
-                dt_s = time.perf_counter() - start_loop_t
-                log_control_info(robot, dt_s, fps=fps)
-
-                timestamp = time.perf_counter() - start_episode_t
-                if exit_early:
-                    exit_early = False
-                    break
-
-            if not stop_recording:
-                # Start resetting env while the executor are finishing
-                logging.info("Reset the environment")
-                say("Reset the environment")
-
-            timestamp = 0
-            start_vencod_t = time.perf_counter()
-
-            # During env reset we save the data and encode the videos
-            num_frames = frame_index
-
-            for key in image_keys:
-                tmp_imgs_dir = videos_dir / f"{key}_episode_{episode_index:06d}"
-                fname = f"{key}_episode_{episode_index:06d}.mp4"
-                video_path = local_dir / "videos" / fname
-                if video_path.exists():
-                    video_path.unlink()
-                # Store the reference to the video frame, even tho the videos are not yet encoded
-                ep_dict[key] = []
-                for i in range(num_frames):
-                    ep_dict[key].append({"path": f"videos/{fname}", "timestamp": i / fps})
-
-            for key in not_image_keys:
-                ep_dict[key] = torch.stack(ep_dict[key])
-
-            for key in action:
-                ep_dict[key] = torch.stack(ep_dict[key])
-
-            ep_dict["episode_index"] = torch.tensor([episode_index] * num_frames)
-            ep_dict["frame_index"] = torch.arange(0, num_frames, 1)
-            ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps
-
-            done = torch.zeros(num_frames, dtype=torch.bool)
-            done[-1] = True
-            ep_dict["next.done"] = done
-
-            ep_path = episodes_dir / f"episode_{episode_index}.pth"
-            print("Saving episode dictionary...")
-            torch.save(ep_dict, ep_path)
-
-            rec_info = {
-                "last_episode_index": episode_index,
-            }
-            with open(rec_info_path, "w") as f:
-                json.dump(rec_info, f)
-
-            is_last_episode = stop_recording or (episode_index == (num_episodes - 1))
-
-            # Wait if necessary
-            with tqdm.tqdm(total=reset_time_s, desc="Waiting") as pbar:
-                while timestamp < reset_time_s and not is_last_episode:
-                    time.sleep(1)
-                    timestamp = time.perf_counter() - start_vencod_t
-                    pbar.update(1)
-                    if exit_early:
-                        exit_early = False
-                        break
-
-            # Skip updating episode index which forces re-recording episode
-            if rerecord_episode:
-                rerecord_episode = False
-                continue
-
-            episode_index += 1
-
-            if is_last_episode:
-                logging.info("Done recording")
-                say("Done recording", blocking=True)
-                if not is_headless():
-                    listener.stop()
-
-                logging.info("Waiting for threads writing the images on disk to terminate...")
-                for _ in tqdm.tqdm(
-                    concurrent.futures.as_completed(futures), total=len(futures), desc="Writting images"
-                ):
-                    pass
-                break
-
-    robot.disconnect()
-    if not is_headless():
-        cv2.destroyAllWindows()
-
-    num_episodes = episode_index
-
-    logging.info("Encoding videos")
-    say("Encoding videos")
-    # Use ffmpeg to convert frames stored as png into mp4 videos
-    for episode_index in tqdm.tqdm(range(num_episodes)):
-        for key in image_keys:
-            tmp_imgs_dir = videos_dir / f"{key}_episode_{episode_index:06d}"
-            fname = f"{key}_episode_{episode_index:06d}.mp4"
-            video_path = local_dir / "videos" / fname
-            if video_path.exists():
-                # Skip if video is already encoded. Could be the case when resuming data recording.
-                continue
-            # note: `encode_video_frames` is a blocking call. Making it asynchronous shouldn't speedup encoding,
-            # since video encoding with ffmpeg is already using multithreading.
-            encode_video_frames(tmp_imgs_dir, video_path, fps, overwrite=True)
-            shutil.rmtree(tmp_imgs_dir)
-
-    logging.info("Concatenating episodes")
-    ep_dicts = []
-    for episode_index in tqdm.tqdm(range(num_episodes)):
-        ep_path = episodes_dir / f"episode_{episode_index}.pth"
-        ep_dict = torch.load(ep_path)
-        ep_dicts.append(ep_dict)
-    data_dict = concatenate_episodes(ep_dicts)
-
-    total_frames = data_dict["frame_index"].shape[0]
-    data_dict["index"] = torch.arange(0, total_frames, 1)
-
-    hf_dataset = to_hf_dataset(data_dict, video)
-    episode_data_index = calculate_episode_data_index(hf_dataset)
-    info = {
-        "codebase_version": CODEBASE_VERSION,
-        "fps": fps,
-        "video": video,
-    }
-    if video:
-        info["encoding"] = get_default_encoding()
-
-    lerobot_dataset = LeRobotDataset.from_preloaded(
-        repo_id=repo_id,
-        hf_dataset=hf_dataset,
-        episode_data_index=episode_data_index,
-        info=info,
-        videos_dir=videos_dir,
-    )
-    if run_compute_stats:
-        logging.info("Computing dataset statistics")
-        say("Computing dataset statistics")
-        stats = compute_stats(lerobot_dataset)
-        lerobot_dataset.stats = stats
-    else:
-        stats = {}
-        logging.info("Skipping computation of the dataset statistics")
-
-    hf_dataset = hf_dataset.with_format(None)  # to remove transforms that cant be saved
-    hf_dataset.save_to_disk(str(local_dir / "train"))
-
-    meta_data_dir = local_dir / "meta_data"
-    save_meta_data(info, stats, episode_data_index, meta_data_dir)
-
-    if push_to_hub:
-        hf_dataset.push_to_hub(repo_id, revision="main")
-        push_meta_data_to_hub(repo_id, meta_data_dir, revision="main")
-        push_dataset_card_to_hub(repo_id, revision="main", tags=tags)
-        if video:
-            push_videos_to_hub(repo_id, videos_dir, revision="main")
-        create_branch(repo_id, repo_type="dataset", branch=CODEBASE_VERSION)
-
-    logging.info("Exiting")
-    say("Exiting")
+    log_say("Exiting", play_sounds)
    return lerobot_dataset


-def replay(robot: Robot, episode: int, fps: int | None = None, root="data", repo_id="lerobot/debug"):
+@safe_disconnect
+def replay(
+    robot: Robot, episode: int, fps: int | None = None, root="data", repo_id="lerobot/debug", play_sounds=True
+):
+    # TODO(rcadene, aliberts): refactor with control_loop, once `dataset` is an instance of LeRobotDataset
    # TODO(rcadene): Add option to record logs
    local_dir = Path(root) / repo_id
    if not local_dir.exists():
@@ -705,8 +328,7 @@ def replay(robot: Robot, episode: int, fps: int | None = None, root="data", repo
    if not robot.is_connected:
        robot.connect()

-    logging.info("Replaying episode")
-    say("Replaying episode", blocking=True)
+    log_say("Replaying episode", play_sounds, blocking=True)
    for idx in range(from_idx, to_idx):
        start_episode_t = time.perf_counter()

@@ -751,6 +373,12 @@ if __name__ == "__main__":
    parser_teleop.add_argument(
        "--fps", type=none_or_int, default=None, help="Frames per second (set to None to disable)"
    )
+    parser_teleop.add_argument(
+        "--display-cameras",
+        type=int,
+        default=1,
+        help="Display all cameras on screen (set to 1 to display or 0).",
+    )

    parser_record = subparsers.add_parser("record", parents=[base_parser])
    parser_record.add_argument(
@@ -806,12 +434,23 @@ if __name__ == "__main__":
        help="Add tags to your dataset on the hub.",
    )
    parser_record.add_argument(
-        "--num-image-writers-per-camera",
+        "--num-image-writer-processes",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses handling the saving of frames as PNGs. Set to 0 to use threads only; "
+            "set to ≥1 to use subprocesses, each using threads to write images. The best number of processes "
+            "and threads depends on your system. We recommend 4 threads per camera with 0 processes. "
+            "If fps is unstable, adjust the thread count. If still unstable, try using 1 or more subprocesses."
+        ),
+    )
+    parser_record.add_argument(
+        "--num-image-writer-threads-per-camera",
        type=int,
        default=4,
        help=(
            "Number of threads writing the frames as png images on disk, per camera. "
-            "Too much threads might cause unstable teleoperation fps due to main thread being blocked. "
+            "Too many threads might cause unstable teleoperation fps due to main thread being blocked. "
            "Not enough threads might cause low camera fps."
        ),
    )
@@ -877,19 +516,7 @@ if __name__ == "__main__":
        teleoperate(robot, **kwargs)

    elif control_mode == "record":
-        pretrained_policy_name_or_path = args.pretrained_policy_name_or_path
-        policy_overrides = args.policy_overrides
-        del kwargs["pretrained_policy_name_or_path"]
-        del kwargs["policy_overrides"]
-
-        policy_cfg = None
-        if pretrained_policy_name_or_path is not None:
-            pretrained_policy_path = get_pretrained_policy_path(pretrained_policy_name_or_path)
-            policy_cfg = init_hydra_config(pretrained_policy_path / "config.yaml", policy_overrides)
-            policy = make_policy(hydra_cfg=policy_cfg, pretrained_policy_name_or_path=pretrained_policy_path)
-            record(robot, policy, policy_cfg, **kwargs)
-        else:
-            record(robot, **kwargs)
+        record(robot, **kwargs)

    elif control_mode == "replay":
        replay(robot, **kwargs)
--- a/lerobot/scripts/eval.py
+++ b/lerobot/scripts/eval.py
@@ -57,7 +57,7 @@ import gymnasium as gym
 import numpy as np
 import torch
 from huggingface_hub import snapshot_download
-from huggingface_hub.utils._errors import RepositoryNotFoundError
+from huggingface_hub.errors import RepositoryNotFoundError
 from huggingface_hub.utils._validators import HFValidationError
 from torch import Tensor, nn
 from tqdm import trange
--- a/lerobot/scripts/save_images_from_cameras.py
+++ b/lerobot/scripts/save_images_from_cameras.py
@@ -1,158 +0,0 @@
-import argparse
-import concurrent.futures
-import importlib
-import shutil
-import time
-from pathlib import Path
-
-import cv2
-from PIL import Image
-
-from lerobot.scripts.control_robot import busy_wait
-
-
-def save_image(img_array, camera_index, frame_index, images_dir):
-    try:
-        img = Image.fromarray(img_array)
-        path = images_dir / f"camera_{camera_index:02d}_frame_{frame_index:06d}.png"
-        path.parent.mkdir(parents=True, exist_ok=True)
-        img.save(str(path), quality=100)
-        print(f"Image saved to: {path}")
-    except Exception as e:
-        print(f"Failed to save image: {e}")
-
-
-def save_images_from_cameras(
-    driver: str,
-    images_dir: Path,
-    camera_ids: list[int] | None = None,
-    fps=None,
-    width=None,
-    height=None,
-    record_time_s=2,
-):
-    """
-    Initializes all the cameras and saves images to the directory. Useful to visually identify the camera
-    associated to a given camera index.
-    """
-    # Dynamically import the appropriate camera class based on the brand
-    if driver == "intelrealsense":
-        camera_module = importlib.import_module("lerobot.common.robot_devices.cameras.intelrealsense")
-        camera_class = camera_module.IntelRealSenseCamera
-        find_camera_indices = camera_module.find_camera_indices
-    elif driver == "opencv":
-        camera_module = importlib.import_module("lerobot.common.robot_devices.cameras.opencv")
-        camera_class = camera_module.OpenCVCamera
-        find_camera_indices = camera_module.find_camera_indices
-    else:
-        raise ValueError(
-            f"Unsupported camera driver: {driver}. Note: the drivers we currently support are opencv and intelrealsense."
-        )
-
-    if camera_ids is None:
-        camera_ids = find_camera_indices()
-
-    print("Connecting cameras")
-    cameras = []
-    for cam_idx in camera_ids:
-        camera = camera_class(cam_idx, fps=fps, width=width, height=height)
-        camera.connect()
-        print(
-            f"{camera.__class__.__name__}({camera.camera_index}, fps={camera.fps}, width={camera.width}, height={camera.height}, color_mode={camera.color_mode})"
-        )
-        cameras.append(camera)
-
-    images_dir = Path(images_dir)
-    if images_dir.exists():
-        shutil.rmtree(images_dir)
-    images_dir.mkdir(parents=True, exist_ok=True)
-
-    print(f"Saving images to {images_dir}")
-    frame_index = 0
-    start_time = time.perf_counter()
-
-    # Use ThreadPoolExecutor for saving images asynchronously
-    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
-        try:
-            while True:
-                now = time.perf_counter()
-
-                for camera in cameras:
-                    # Capture image
-                    image = camera.read() if fps is None else camera.async_read()
-                    if image is None:
-                        print("No Frame")
-                    else:
-                        bgr_converted_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
-
-                        # Submit the save_image function to be executed in the background
-                        executor.submit(
-                            save_image,
-                            bgr_converted_image,
-                            camera.camera_index,
-                            frame_index,
-                            images_dir,
-                        )
-
-                if fps is not None:
-                    dt_s = time.perf_counter() - now
-                    busy_wait(1 / fps - dt_s)
-
-                if time.perf_counter() - start_time > record_time_s:
-                    break
-
-                print(f"Frame: {frame_index:04d}\tLatency (ms): {(time.perf_counter() - now) * 1000:.2f}")
-
-                frame_index += 1
-        finally:
-            print(f"Images have been saved to {images_dir}")
-            for camera in cameras:
-                camera.disconnect()
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Save a few frames for all cameras connected to the computer, or a selected subset."
-    )
-    parser.add_argument(
-        "--driver", type=str, required=True, help="Camera driver (e.g., intelrealsense, opencv)"
-    )
-    parser.add_argument(
-        "--camera-ids",
-        type=int,
-        nargs="*",
-        default=None,
-        help="List of camera indices used to instantiate the `IntelRealSenseCamera`. If not provided, find and use all available camera indices.",
-    )
-    parser.add_argument(
-        "--fps",
-        type=int,
-        default=30,
-        help="Set the number of frames recorded per second for all cameras. If not provided, use the default fps of each camera.",
-    )
-    parser.add_argument(
-        "--width",
-        type=str,
-        default=640,
-        help="Set the width for all cameras. If not provided, use the default width of each camera.",
-    )
-    parser.add_argument(
-        "--height",
-        type=str,
-        default=480,
-        help="Set the height for all cameras. If not provided, use the default height of each camera.",
-    )
-    parser.add_argument(
-        "--images-dir",
-        type=Path,
-        default="outputs/images_from_cameras",
-        help="Set directory to save a few frames for each camera.",
-    )
-    parser.add_argument(
-        "--record-time-s",
-        type=float,
-        default=2.0,
-        help="Set the number of seconds used to record the frames. By default, 2 seconds.",
-    )
-    args = parser.parse_args()
-    save_images_from_cameras(**vars(args))
--- a/lerobot/scripts/train.py
+++ b/lerobot/scripts/train.py
@@ -383,7 +383,7 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No
            logging.info(f"Checkpoint policy after step {step}")
            # Note: Save with step as the identifier, and format it to have at least 6 digits but more if
            # needed (choose 6 as a minimum for consistency without being overkill).
-            logger.save_checkpont(
+            logger.save_checkpoint(
                step,
                policy,
                optimizer,
--- a/lerobot/templates/visualize_dataset_template.html
+++ b/lerobot/templates/visualize_dataset_template.html
@@ -250,7 +250,7 @@
                    if(!canPlayVideos){
                        this.videoCodecError = true;
                    }
-                    
+
                    // process CSV data
                    this.videos = document.querySelectorAll('video');
                    this.video = this.videos[0];