issacdataengine/policy/lmdb2lerobotv21/convertv21_to_v30.py

#!/usr/bin/env python

# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script will help you convert any LeRobot dataset already pushed to the hub from codebase version 2.1 to
3.0. It will:

- Generate per-episodes stats and writes them in `episodes_stats.jsonl`
- Check consistency between these new stats and the old ones.
- Remove the deprecated `stats.json`.
- Update codebase_version in `info.json`.
- Push this new version to the hub on the 'main' branch and tags it with "v3.0".

Usage:

Convert a local dataset (works in place):
```bash
python convert_dataset_v21_to_v30.py \
    --old-repo-id=v21/lift2_sim_long_horizon \
    --new-repo-id=lift2/lift2_sim_long_horizon
```

"""

import argparse
import logging
import shutil
import glob, os
from pathlib import Path
from typing import Any
from pdb import set_trace

import jsonlines
import pandas as pd
import pyarrow as pa
import tqdm
from datasets import Dataset, Features, Image
from huggingface_hub import HfApi, snapshot_download
from requests import HTTPError

from lerobot.datasets.compute_stats import aggregate_stats
from lerobot.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset
from lerobot.datasets.utils import (
    DEFAULT_CHUNK_SIZE,
    DEFAULT_DATA_FILE_SIZE_IN_MB,
    DEFAULT_DATA_PATH,
    DEFAULT_VIDEO_FILE_SIZE_IN_MB,
    DEFAULT_VIDEO_PATH,
    LEGACY_EPISODES_PATH,
    LEGACY_EPISODES_STATS_PATH,
    LEGACY_TASKS_PATH,
    cast_stats_to_numpy,
    flatten_dict,
    get_file_size_in_mb,
    get_parquet_file_size_in_mb,
    get_parquet_num_frames,
    load_info,
    update_chunk_file_indices,
    write_episodes,
    write_info,
    write_stats,
    write_tasks,
)
from lerobot.datasets.video_utils import concatenate_video_files, get_video_duration_in_s
from lerobot.utils.constants import HF_LEROBOT_HOME
from lerobot.utils.utils import init_logging

V21 = "v2.1"
V30 = "v3.0"

"""
-------------------------
OLD
data/chunk-000/episode_000000.parquet

NEW
data/chunk-000/file_000.parquet
-------------------------
OLD
videos/chunk-000/CAMERA/episode_000000.mp4

NEW
videos/CAMERA/chunk-000/file_000.mp4
-------------------------
OLD
episodes.jsonl
{"episode_index": 1, "tasks": ["Put the blue block in the green bowl"], "length": 266}

NEW
meta/episodes/chunk-000/episodes_000.parquet
episode_index | video_chunk_index | video_file_index | data_chunk_index | data_file_index | tasks | length
-------------------------
OLD
tasks.jsonl
{"task_index": 1, "task": "Put the blue block in the green bowl"}

NEW
meta/tasks/chunk-000/file_000.parquet
task_index | task
-------------------------
OLD
episodes_stats.jsonl

NEW
meta/episodes_stats/chunk-000/file_000.parquet
episode_index | mean | std | min | max
-------------------------
UPDATE
meta/info.json
-------------------------
"""


def load_jsonlines(fpath: Path) -> list[Any]:
    with jsonlines.open(fpath, "r") as reader:
        return list(reader)


def legacy_load_episodes(local_dir: Path) -> dict:
    episodes = load_jsonlines(local_dir / LEGACY_EPISODES_PATH)
    return {item["episode_index"]: item for item in sorted(episodes, key=lambda x: x["episode_index"])}


def legacy_load_episodes_stats(local_dir: Path) -> dict:
    episodes_stats = load_jsonlines(local_dir / LEGACY_EPISODES_STATS_PATH)
    return {
        item["episode_index"]: cast_stats_to_numpy(item["stats"])
        for item in sorted(episodes_stats, key=lambda x: x["episode_index"])
    }


def legacy_load_tasks(local_dir: Path) -> tuple[dict, dict]:
    tasks = load_jsonlines(local_dir / LEGACY_TASKS_PATH)
    tasks = {item["task_index"]: item["task"] for item in sorted(tasks, key=lambda x: x["task_index"])}
    task_to_task_index = {task: task_index for task_index, task in tasks.items()}
    return tasks, task_to_task_index


def validate_local_dataset_version(local_path: Path) -> None:
    """Validate that the local dataset has the expected v2.1 version."""
    info = load_info(local_path)
    dataset_version = info.get("codebase_version", "unknown")
    if dataset_version != V21:
        raise ValueError(
            f"Local dataset has codebase version '{dataset_version}', expected '{V21}'. "
            f"This script is specifically for converting v2.1 datasets to v3.0."
        )


def convert_tasks(root, new_root):
    logging.info(f"Converting tasks from {root} to {new_root}")
    tasks, _ = legacy_load_tasks(root)
    task_indices = tasks.keys()
    task_strings = tasks.values()
    df_tasks = pd.DataFrame({"task_index": task_indices}, index=task_strings)
    write_tasks(df_tasks, new_root)


def concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys):
    import pyarrow.parquet as pq
    import pyarrow as pa
    from datasets import Features, Image

    # 1. Read all tables
    tables = [pq.read_table(f) for f in paths_to_cat]

    # 2. Concatenate with type promotion
    table = pa.concat_tables(tables, promote=True)

    # 3. Build HF Features from arrow schema
    features = Features.from_arrow_schema(table.schema)

    # 4. Override image columns to be HF Image()
    for key in image_keys:
        features[key] = Image()

    # 5. Convert back to arrow schema with updated metadata
    arrow_schema = features.arrow_schema

    # 6. Write parquet with correct schema
    path = new_root / DEFAULT_DATA_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
    path.parent.mkdir(parents=True, exist_ok=True)

    pq.write_table(table.cast(arrow_schema), path)


def convert_data(root: Path, new_root: Path, data_file_size_in_mb: int):
    data_dir = root / "data"
    ep_paths = sorted(data_dir.glob("*/*.parquet"))

    image_keys = get_image_keys(root)

    ep_idx = 0
    chunk_idx = 0
    file_idx = 0
    size_in_mb = 0
    num_frames = 0
    paths_to_cat = []
    episodes_metadata = []

    logging.info(f"Converting data files from {len(ep_paths)} episodes")

    for ep_path in tqdm.tqdm(ep_paths, desc="convert data files"):
        ep_size_in_mb = get_parquet_file_size_in_mb(ep_path)
        ep_num_frames = get_parquet_num_frames(ep_path)
        ep_metadata = {
            "episode_index": ep_idx,
            "data/chunk_index": chunk_idx,
            "data/file_index": file_idx,
            "dataset_from_index": num_frames,
            "dataset_to_index": num_frames + ep_num_frames,
        }
        size_in_mb += ep_size_in_mb
        num_frames += ep_num_frames
        episodes_metadata.append(ep_metadata)
        ep_idx += 1

        if size_in_mb < data_file_size_in_mb:
            paths_to_cat.append(ep_path)
            continue

        if paths_to_cat:
            concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys)

        # Reset for the next file
        size_in_mb = ep_size_in_mb
        paths_to_cat = [ep_path]

        chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, DEFAULT_CHUNK_SIZE)

    # Write remaining data if any
    if paths_to_cat:
        concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys)

    return episodes_metadata


def get_video_keys(root):
    info = load_info(root)
    features = info["features"]
    video_keys = [key for key, ft in features.items() if ft["dtype"] == "video"]
    return video_keys


def get_image_keys(root):
    info = load_info(root)
    features = info["features"]
    image_keys = [key for key, ft in features.items() if ft["dtype"] == "image"]
    return image_keys


def convert_videos(root: Path, new_root: Path, video_file_size_in_mb: int):
    logging.info(f"Converting videos from {root} to {new_root}")

    video_keys = get_video_keys(root)
    if len(video_keys) == 0:
        return None

    video_keys = sorted(video_keys)

    eps_metadata_per_cam = []
    for camera in video_keys:
        eps_metadata = convert_videos_of_camera(root, new_root, camera, video_file_size_in_mb)
        eps_metadata_per_cam.append(eps_metadata)

    num_eps_per_cam = [len(eps_cam_map) for eps_cam_map in eps_metadata_per_cam]
    if len(set(num_eps_per_cam)) != 1:
        raise ValueError(f"All cams dont have same number of episodes ({num_eps_per_cam}).")

    episods_metadata = []
    num_cameras = len(video_keys)
    num_episodes = num_eps_per_cam[0]
    for ep_idx in tqdm.tqdm(range(num_episodes), desc="convert videos"):
        # Sanity check
        ep_ids = [eps_metadata_per_cam[cam_idx][ep_idx]["episode_index"] for cam_idx in range(num_cameras)]
        ep_ids += [ep_idx]
        if len(set(ep_ids)) != 1:
            raise ValueError(f"All episode indices need to match ({ep_ids}).")

        ep_dict = {}
        for cam_idx in range(num_cameras):
            ep_dict.update(eps_metadata_per_cam[cam_idx][ep_idx])
        episods_metadata.append(ep_dict)

    return episods_metadata


def convert_videos_of_camera(root: Path, new_root: Path, video_key: str, video_file_size_in_mb: int):
    # Access old paths to mp4
    videos_dir = root / "videos"
    ep_paths = sorted(videos_dir.glob(f"*/{video_key}/*.mp4"))

    ep_idx = 0
    chunk_idx = 0
    file_idx = 0
    size_in_mb = 0
    duration_in_s = 0.0
    paths_to_cat = []
    episodes_metadata = []

    for ep_path in tqdm.tqdm(ep_paths, desc=f"convert videos of {video_key}"):
        ep_size_in_mb = get_file_size_in_mb(ep_path)
        ep_duration_in_s = get_video_duration_in_s(ep_path)

        # Check if adding this episode would exceed the limit
        if size_in_mb + ep_size_in_mb >= video_file_size_in_mb and len(paths_to_cat) > 0:
            # Size limit would be exceeded, save current accumulation WITHOUT this episode
            concatenate_video_files(
                paths_to_cat,
                new_root
                / DEFAULT_VIDEO_PATH.format(video_key=video_key, chunk_index=chunk_idx, file_index=file_idx),
            )

            # Update episodes metadata for the file we just saved
            for i, _ in enumerate(paths_to_cat):
                past_ep_idx = ep_idx - len(paths_to_cat) + i
                episodes_metadata[past_ep_idx][f"videos/{video_key}/chunk_index"] = chunk_idx
                episodes_metadata[past_ep_idx][f"videos/{video_key}/file_index"] = file_idx

            # Move to next file and start fresh with current episode
            chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, DEFAULT_CHUNK_SIZE)
            size_in_mb = 0
            duration_in_s = 0.0
            paths_to_cat = []

        # Add current episode metadata
        ep_metadata = {
            "episode_index": ep_idx,
            f"videos/{video_key}/chunk_index": chunk_idx,  # Will be updated when file is saved
            f"videos/{video_key}/file_index": file_idx,  # Will be updated when file is saved
            f"videos/{video_key}/from_timestamp": duration_in_s,
            f"videos/{video_key}/to_timestamp": duration_in_s + ep_duration_in_s,
        }
        episodes_metadata.append(ep_metadata)

        # Add current episode to accumulation
        paths_to_cat.append(ep_path)
        size_in_mb += ep_size_in_mb
        duration_in_s += ep_duration_in_s
        ep_idx += 1

    # Write remaining videos if any
    if paths_to_cat:
        concatenate_video_files(
            paths_to_cat,
            new_root
            / DEFAULT_VIDEO_PATH.format(video_key=video_key, chunk_index=chunk_idx, file_index=file_idx),
        )

        # Update episodes metadata for the final file
        for i, _ in enumerate(paths_to_cat):
            past_ep_idx = ep_idx - len(paths_to_cat) + i
            episodes_metadata[past_ep_idx][f"videos/{video_key}/chunk_index"] = chunk_idx
            episodes_metadata[past_ep_idx][f"videos/{video_key}/file_index"] = file_idx

    return episodes_metadata


def generate_episode_metadata_dict(
    episodes_legacy_metadata, episodes_metadata, episodes_stats, episodes_videos=None
):
    num_episodes = len(episodes_metadata)
    episodes_legacy_metadata_vals = list(episodes_legacy_metadata.values())
    episodes_stats_vals = list(episodes_stats.values())
    episodes_stats_keys = list(episodes_stats.keys())

    for i in range(num_episodes):
        ep_legacy_metadata = episodes_legacy_metadata_vals[i]
        ep_metadata = episodes_metadata[i]
        ep_stats = episodes_stats_vals[i]

        ep_ids_set = {
            ep_legacy_metadata["episode_index"],
            ep_metadata["episode_index"],
            episodes_stats_keys[i],
        }

        if episodes_videos is None:
            ep_video = {}
        else:
            ep_video = episodes_videos[i]
            ep_ids_set.add(ep_video["episode_index"])

        if len(ep_ids_set) != 1:
            raise ValueError(f"Number of episodes is not the same ({ep_ids_set}).")

        ep_dict = {**ep_metadata, **ep_video, **ep_legacy_metadata, **flatten_dict({"stats": ep_stats})}
        ep_dict["meta/episodes/chunk_index"] = 0
        ep_dict["meta/episodes/file_index"] = 0
        yield ep_dict


def convert_episodes_metadata(root, new_root, episodes_metadata, episodes_video_metadata=None):
    logging.info(f"Converting episodes metadata from {root} to {new_root}")

    episodes_legacy_metadata = legacy_load_episodes(root)
    episodes_stats = legacy_load_episodes_stats(root)

    num_eps_set = {len(episodes_legacy_metadata), len(episodes_metadata)}
    if episodes_video_metadata is not None:
        num_eps_set.add(len(episodes_video_metadata))

    if len(num_eps_set) != 1:
        raise ValueError(f"Number of episodes is not the same ({num_eps_set}).")

    ds_episodes = Dataset.from_generator(
        lambda: generate_episode_metadata_dict(
            episodes_legacy_metadata, episodes_metadata, episodes_stats, episodes_video_metadata
        )
    )
    write_episodes(ds_episodes, new_root)

    stats = aggregate_stats(list(episodes_stats.values()))
    write_stats(stats, new_root)


def convert_info(root, new_root, data_file_size_in_mb, video_file_size_in_mb):
    info = load_info(root)
    info["codebase_version"] = V30
    del info["total_chunks"]
    del info["total_videos"]
    info["data_files_size_in_mb"] = data_file_size_in_mb
    info["video_files_size_in_mb"] = video_file_size_in_mb
    info["data_path"] = DEFAULT_DATA_PATH
    info["video_path"] = DEFAULT_VIDEO_PATH if info["video_path"] is not None else None
    info["fps"] = int(info["fps"])
    logging.info(f"Converting info from {root} to {new_root}")
    for key in info["features"]:
        if info["features"][key]["dtype"] == "video":
            # already has fps in video_info
            continue
        info["features"][key]["fps"] = info["fps"]
    write_info(info, new_root)


def convert_dataset(
    load_path: str | Path | None = None,
    save_path: str | Path | None = None,
    branch: str | None = None,
    data_file_size_in_mb: int | None = None,
    video_file_size_in_mb: int | None = None,
    push_to_hub: bool = True,
    force_conversion: bool = False,
    start_ratio: float = 0.0,
    end_ratio: float = 1.0,
):
    if data_file_size_in_mb is None:
        data_file_size_in_mb = DEFAULT_DATA_FILE_SIZE_IN_MB
    if video_file_size_in_mb is None:
        video_file_size_in_mb = DEFAULT_VIDEO_FILE_SIZE_IN_MB

    # # First check if the dataset already has a v3.0 version
    # if save_root is None and not force_conversion:
    #     try:
    #         print("Trying to download v3.0 version of the dataset from the hub...")
    #         snapshot_download(old_repo_id, repo_type="dataset", revision=V30, local_dir=HF_LEROBOT_HOME / old_repo_id)
    #     except Exception:
    #         print("Dataset does not have an uploaded v3.0 version. Continuing with conversion.")

    # Set root based on whether local dataset path is provided
    use_local_dataset = False
    # root = HF_LEROBOT_HOME / old_repo_id if root is None else Path(root) / old_repo_id
    # root = Path(load_root) / old_repo_id
    root = Path(load_path)
    if root.exists():
        validate_local_dataset_version(root)
        use_local_dataset = True
        print(f"Using local dataset at {root}")


    # new_root = HF_LEROBOT_HOME / new_repo_id
    new_root = Path(save_path)

    # Handle old_root cleanup if both old_root and root exist

    if new_root.is_dir():
        return
        shutil.rmtree(new_root)

    try:
        convert_info(root, new_root, data_file_size_in_mb, video_file_size_in_mb)
        convert_tasks(root, new_root)
        episodes_metadata = convert_data(root, new_root, data_file_size_in_mb)
        episodes_videos_metadata = convert_videos(root, new_root, video_file_size_in_mb)
        convert_episodes_metadata(root, new_root, episodes_metadata, episodes_videos_metadata)
    except:
        shutil.rmtree(new_root)

if __name__ == "__main__":
    init_logging()
    parser = argparse.ArgumentParser()
    # parser.add_argument(
    #     "--old-repo-id",
    #     type=str,
    #     required=True,
    #     help="Repository identifier on Hugging Face: a community or a user name `/` the name of the dataset "
    #     "(e.g. `lerobot/pusht`, `cadene/aloha_sim_insertion_human`).",
    # )
    # parser.add_argument(
    #     "--new-repo-id",
    #     type=str,
    #     required=True,
    #     help="Repository identifier on Hugging Face: a community or a user name `/` the name of the dataset "
    #     "(e.g. `lerobot/pusht`, `cadene/aloha_sim_insertion_human`).",
    # )
    parser.add_argument(
        "--start_ratio",
        type=float,
        default=0.0
    )
    parser.add_argument(
        "--end_ratio",
        type=float,
        default=1.0
    )
    parser.add_argument(
        "--branch",
        type=str,
        default=None,
        help="Repo branch to push your dataset. Defaults to the main branch.",
    )
    parser.add_argument(
        "--data-file-size-in-mb",
        type=int,
        default=None,
        help="File size in MB. Defaults to 100 for data and 500 for videos.",
    )
    parser.add_argument(
        "--video-file-size-in-mb",
        type=int,
        default=None,
        help="File size in MB. Defaults to 100 for data and 500 for videos.",
    )
    # parser.add_argument(
    #     "--load-root",
    #     type=str,
    #     default=None,
    #     help="Local directory to use for downloading the dataset.",
    # )
    # parser.add_argument(
    #     "--save-root",
    #     type=str,
    #     default=None,
    #     help="Local directory to use for writing the dataset.",
    # )
    parser.add_argument(
        "--push-to-hub",
        type=lambda input: input.lower() == "true",
        default=True,
        help="Push the converted dataset to the hub.",
    )
    parser.add_argument(
        "--force-conversion",
        action="store_true",
        help="Force conversion even if the dataset already has a v3.0 version.",
    )

    args = parser.parse_args()

    load_root_path = "/mnt/shared-storage-user/internvla/InternData-A1-realese/v2.0-stable/InternData-A1/sim"
    save_root_path = "/mnt/shared-storage-user/internvla/InternData-A1-realese/v2.0-stable/InternData-A1/sim_lerobotv30"
    # load_paths = (
    #     glob.glob(os.path.join(load_root_path, "articulation_tasks", "*", "*")) + \
    #     glob.glob(os.path.join(load_root_path, "basic_tasks", "*", "*")) + \
    #     glob.glob(os.path.join(load_root_path, "long_horizon_tasks", "*", "*"))
    # )
    # load_paths += (glob.glob(os.path.join(load_root_path, "pick_and_place_tasks", "*", "*", "*")))

    load_paths = (glob.glob(os.path.join(load_root_path, "long_horizon_tasks", "lift2", "*collaborate_assemble_a_beef_sandwich_part3*")))
    load_paths += (glob.glob(os.path.join(load_root_path, "long_horizon_tasks", "split_aloha", "*collaborate_assemble_a_beef_sandwich*")))

    load_paths.sort()
    num_eps = len(load_paths)
    start_eps = int(num_eps * args.start_ratio)
    end_eps = int(num_eps * args.end_ratio) + 1

    print("start_eps :", start_eps, "end_eps :", end_eps)

    for load_path in tqdm.tqdm(load_paths[start_eps:end_eps]):
        save_path = load_path.replace(load_root_path, save_root_path)
        repo_id = load_path.split("/")[-1]
        robot_id = load_path.split("/")[-2]
        task_type = load_path.split("/")[-3]
        print(f"Converting {task_type} {robot_id} {repo_id} task to lerobot v30")
        args.load_path = load_path
        args.save_path = save_path
        convert_dataset(**vars(args))