# Listing metadata from the source viewer: 884 lines, 40 KiB, Python.
import argparse
import gc
import json
import logging
import os
import pdb
import pickle
import shutil
import time
from concurrent.futures import ALL_COMPLETED, ProcessPoolExecutor, ThreadPoolExecutor, as_completed, wait
from copy import deepcopy
from pathlib import Path
from typing import Callable, Dict, Iterator, List, Optional, Tuple

# Third-party imports keep their original relative order: several of them load
# native extensions, and changing the load order is not known to be safe.
import torchvision
import cv2
import h5py
import lmdb
import numpy as np
import torch
import pinocchio as pin
import ray
import imageio  # imageio-ffmpeg
from PIL import Image
from tqdm import tqdm
from lerobot.common.datasets.compute_stats import auto_downsample_height_width, get_feature_stats, sample_indices
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
from lerobot.common.datasets.utils import check_timestamps_sync, get_episode_data_index, validate_episode_buffer
from ray.runtime_env import RuntimeEnv
from scipy.spatial.transform import Rotation as R
|
||
|
||
"""
|
||
Store both camera image and robot state as a combined observation.
|
||
Args:
|
||
observation: images(camera), states (robot state)
|
||
actions: joint, gripper, ee_pose
|
||
"""
|
||
def _video_feature(height, width):
    """Return the feature spec of a 3-channel video stream of the given size."""
    return {
        "dtype": "video",
        "shape": (height, width, 3),
        "names": ["height", "width", "channel"],
    }


def _vector_feature(names):
    """Return the spec of a float32 vector feature with one entry per name.

    A fresh list is created on every call so no two features share a
    mutable ``names`` object.
    """
    names = list(names)
    return {"dtype": "float32", "shape": (len(names),), "names": names}


def _intrinsics_feature():
    """Return the spec of a pinhole intrinsics vector (fx, fy, cx, cy)."""
    return _vector_feature(["fx", "fy", "cx", "cy"])


def _pose_feature():
    """Return the spec of a 7-D pose: position followed by a w-first quaternion."""
    return _vector_feature(
        [
            "position.x",
            "position.y",
            "position.z",
            "quaternion.w",
            "quaternion.x",
            "quaternion.y",
            "quaternion.z",
        ]
    )


def _joint_feature(side):
    """Return the spec of the six joint positions of the ``side`` arm."""
    return _vector_feature([f"{side}_joint_{index}" for index in range(6)])


def _gripper_feature(side):
    """Return the spec of the single gripper value of the ``side`` arm."""
    return _vector_feature([f"{side}_gripper_0"])


def _arm_features(prefix, side):
    """Return joint, gripper and ee/tcp pose features of one arm.

    ``prefix`` is ``"states"`` or ``"actions"``; the key order below is the
    order in which the features appear in ``FEATURES``.
    """
    return {
        f"{prefix}.{side}_joint.position": _joint_feature(side),
        f"{prefix}.{side}_gripper.position": _gripper_feature(side),
        f"{prefix}.{side}_ee_to_{side}_armbase_pose": _pose_feature(),
        f"{prefix}.{side}_ee_to_robot_pose": _pose_feature(),
        f"{prefix}.{side}_tcp_to_{side}_armbase_pose": _pose_feature(),
        f"{prefix}.{side}_tcp_to_robot_pose": _pose_feature(),
    }


def _master_arm_features(side):
    """Return the master-arm joint and gripper features of one arm."""
    return {
        f"master_actions.{side}_joint.position": _joint_feature(side),
        f"master_actions.{side}_gripper.position": _gripper_feature(side),
        f"master_actions.{side}_gripper.openness": _gripper_feature(side),
    }


# Schema of every frame stored in the dataset. Insertion order is part of the
# schema, so the groups below must stay in this order.
FEATURES = {
    # Camera streams.
    "images.rgb.head": _video_feature(360, 640),
    "images.rgb.hand_left": _video_feature(480, 640),
    "images.rgb.hand_right": _video_feature(480, 640),
    # Camera calibration.
    "head_camera_intrinsics": _intrinsics_feature(),
    "hand_left_camera_intrinsics": _intrinsics_feature(),
    "hand_right_camera_intrinsics": _intrinsics_feature(),
    "head_camera_to_robot_extrinsics": _pose_feature(),
    "hand_left_camera_to_robot_extrinsics": _pose_feature(),
    "hand_right_camera_to_robot_extrinsics": _pose_feature(),
    # Robot state.
    **_arm_features("states", "left"),
    **_arm_features("states", "right"),
    "states.robot_to_env_pose": _pose_feature(),
    # Actions.
    **_arm_features("actions", "left"),
    **_arm_features("actions", "right"),
    # Master-arm actions.
    **_master_arm_features("left"),
    **_master_arm_features("right"),
}
|
||
|
||
class SplitAlohaDataset(LeRobotDataset):
    """LeRobot dataset variant for episodes whose videos are already encoded.

    ``add_frame`` buffers per-frame values in ``self.episode_buffer`` and
    ``save_episode`` copies caller-supplied video files into the dataset tree.
    """

    def __init__(
        self,
        repo_id: str,
        root: str | Path | None = None,
        episodes: list[int] | None = None,
        image_transforms: Callable | None = None,
        delta_timestamps: dict[str, list[float]] | None = None,
        tolerance_s: float = 1e-4,
        download_videos: bool = True,
        local_files_only: bool = False,
        video_backend: str | None = None,
    ):
        """Forward every argument unchanged to ``LeRobotDataset.__init__``."""
        super().__init__(
            repo_id=repo_id,
            root=root,
            episodes=episodes,
            image_transforms=image_transforms,
            delta_timestamps=delta_timestamps,
            tolerance_s=tolerance_s,
            download_videos=download_videos,
            local_files_only=local_files_only,
            video_backend=video_backend,
        )

    def save_episode(self, episode_data: dict | None = None, videos: dict | None = None) -> None:
        """Write one buffered episode (table, metadata, stats, videos) to disk.

        Args:
            episode_data: Episode buffer to save. When empty or ``None`` the
                dataset's own ``self.episode_buffer`` is saved and then reset.
            videos: Mapping from each video feature key to the path of an
                already encoded video file; each file is copied to the
                location the dataset metadata assigns to it.

        Raises:
            TypeError: If the dataset has video features but ``videos`` is
                ``None``.
        """
        # The original only bound ``episode_buffer`` when ``episode_data`` was
        # falsy, so passing ``episode_data`` crashed with an unbound local.
        episode_buffer = episode_data if episode_data else self.episode_buffer

        # Fail before the buffer is mutated below rather than half-way through.
        if videos is None and self.meta.video_keys:
            raise TypeError(
                "save_episode() requires `videos` (video key -> encoded file path) "
                f"for the video features {list(self.meta.video_keys)}."
            )

        validate_episode_buffer(episode_buffer, self.meta.total_episodes, self.features)
        episode_length = episode_buffer.pop("size")
        tasks = episode_buffer.pop("task")
        episode_tasks = list(set(tasks))
        episode_index = episode_buffer["episode_index"]

        episode_buffer["index"] = np.arange(self.meta.total_frames, self.meta.total_frames + episode_length)
        episode_buffer["episode_index"] = np.full((episode_length,), episode_index)

        # Register tasks the metadata has not seen yet, then map every frame
        # to its task index.
        for task in episode_tasks:
            if self.meta.get_task_index(task) is None:
                self.meta.add_task(task)
        episode_buffer["task_index"] = np.array([self.meta.get_task_index(task) for task in tasks])

        for key, ft in self.features.items():
            if key in ["index", "episode_index", "task_index"] or ft["dtype"] in ["video"]:
                continue
            # NOTE(review): squeeze() removes every size-1 axis, so a
            # one-frame episode also loses its time axis — confirm episodes
            # always contain more than one frame.
            episode_buffer[key] = np.stack(episode_buffer[key]).squeeze()

        for key in self.meta.video_keys:
            video_path = self.root / self.meta.get_video_file_path(episode_index, key)
            episode_buffer[key] = str(video_path)  # PosixPath -> str
            video_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.copyfile(videos[key], video_path)

        ep_stats = compute_episode_stats(episode_buffer, self.features)
        self._save_episode_table(episode_buffer, episode_index)
        self.meta.save_episode(episode_index, episode_length, episode_tasks, ep_stats)

        ep_data_index = get_episode_data_index(self.meta.episodes, [episode_index])
        ep_data_index_np = {k: t.numpy() for k, t in ep_data_index.items()}
        check_timestamps_sync(
            episode_buffer["timestamp"],
            episode_buffer["episode_index"],
            ep_data_index_np,
            self.fps,
            self.tolerance_s,
        )

        # Only the internal buffer is reset; a caller-supplied one is theirs.
        if not episode_data:
            self.episode_buffer = self.create_episode_buffer()

    def add_frame(self, frame: dict) -> None:
        """Append one frame to the episode buffer.

        ``torch.Tensor`` values are stored as NumPy arrays. ``frame`` may carry
        a ``"timestamp"``; otherwise ``frame_index / fps`` is used. The
        caller's dict is left unmodified.

        Raises:
            ValueError: If ``frame`` has a key that is neither ``"task"``,
                ``"timestamp"`` nor a dataset feature. Nothing is buffered in
                that case.
        """
        if self.episode_buffer is None:
            self.episode_buffer = self.create_episode_buffer()

        # Validate first so a bad frame cannot leave the buffer half-updated.
        for key in frame:
            if key in ("task", "timestamp"):
                continue
            if key not in self.features:
                raise ValueError(f"An element of the frame is not in the features. '{key}' not in '{self.features.keys()}'.")

        def _as_numpy(value):
            return value.numpy() if isinstance(value, torch.Tensor) else value

        frame_index = self.episode_buffer["size"]
        if "timestamp" in frame:
            timestamp = _as_numpy(frame["timestamp"])
        else:
            timestamp = frame_index / self.fps
        self.episode_buffer["frame_index"].append(frame_index)
        self.episode_buffer["timestamp"].append(timestamp)

        for key, value in frame.items():
            if key == "timestamp":
                continue
            self.episode_buffer[key].append(_as_numpy(value))

        self.episode_buffer["size"] += 1
|
||
|
||
# def crop_resize_no_padding(image, target_size=(480, 640)):
|
||
# """
|
||
# Crop and scale to target size (no padding)
|
||
# :param image: input image (NumPy array)
|
||
# :param target_size: target size (height, width)
|
||
# :return: processed image
|
||
# """
|
||
# h, w = image.shape[:2]
|
||
# target_h, target_w = target_size
|
||
# target_ratio = target_w / target_h # Target aspect ratio (e.g. 640/480=1.333)
|
||
|
||
# # the original image aspect ratio and cropping direction
|
||
# if w / h > target_ratio: # Original image is wider → crop width
|
||
# crop_w = int(h * target_ratio) # Calculate crop width based on target aspect ratio
|
||
# crop_h = h
|
||
# start_x = (w - crop_w) // 2 # Horizontal center starting point
|
||
# start_y = 0
|
||
# else: # Original image is higher → crop height
|
||
# crop_h = int(w / target_ratio) # Calculate clipping height according to target aspect ratio
|
||
# crop_w = w
|
||
# start_x = 0
|
||
# start_y = (h - crop_h) // 2 # Vertical center starting point
|
||
|
||
# # Perform centered cropping (to prevent out-of-bounds)
|
||
# start_x, start_y = max(0, start_x), max(0, start_y)
|
||
# end_x, end_y = min(w, start_x + crop_w), min(h, start_y + crop_h)
|
||
# cropped = image[start_y:end_y, start_x:end_x]
|
||
|
||
# # Resize to target size (bilinear interpolation)
|
||
# resized = cv2.resize(cropped, (target_w, target_h), interpolation=cv2.INTER_LINEAR)
|
||
# return resized
|
||
|
||
def tf2xyzwxyz(posetf):
|
||
translation = posetf[:3, 3]
|
||
orientation = R.from_matrix(posetf[:3,:3]).as_quat(scalar_first=True) # w, x, y, z
|
||
xyzwxyz = (np.concatenate([translation, orientation])).astype("float32")
|
||
|
||
return xyzwxyz
|
||
|
||
def _decode_lmdb_image(txn, key):
    """Unpickle the encoded image buffer stored under ``key`` and decode it."""
    # NOTE(review): cv2.imdecode yields BGR channel order and the frames are
    # handed to imageio unchanged — confirm the stored buffers make that correct.
    return cv2.imdecode(pickle.loads(txn.get(key)), cv2.IMREAD_COLOR)


def _next_step_as_action(states):
    """Return ``states`` shifted one step forward, repeating the last row."""
    return np.concatenate([states[1:], states[-1:]], axis=0)


def _intrinsics_to_vector(camera_matrix):
    """Reduce a camera matrix to a float32 ``[fx, fy, cx, cy]`` vector."""
    return np.array(
        [camera_matrix[0][0], camera_matrix[1][1], camera_matrix[0][2], camera_matrix[1][2]]
    ).astype("float32")


def _arm_pose_tracks(model, data, joint_positions, armbase_to_robot, tcp2ee_pose):
    """Run forward kinematics per step and return the four stacked pose tracks.

    Returns, in order: ee->armbase, ee->robot, tcp->armbase and tcp->robot
    poses, each as one 7-D row (see ``tf2xyzwxyz``) per joint configuration.
    """
    # Frame ids do not depend on the configuration, so look them up once.
    fid_armbase = model.getFrameId("arm_base")
    fid_ee = model.getFrameId("link6")

    ee_to_armbase, ee_to_robot, tcp_to_armbase, tcp_to_robot = [], [], [], []
    for joint_position in joint_positions:
        # Only the leading entries of q are set; any remaining ones stay zero.
        q = np.zeros(model.nq)
        q[: joint_position.shape[0]] = joint_position
        pin.forwardKinematics(model, data, q)
        pin.updateFramePlacements(model, data)

        ee_in_armbase = (data.oMf[fid_armbase].inverse() * data.oMf[fid_ee]).homogeneous
        ee_in_robot = armbase_to_robot @ ee_in_armbase

        ee_to_armbase.append(tf2xyzwxyz(ee_in_armbase))
        ee_to_robot.append(tf2xyzwxyz(ee_in_robot))
        tcp_to_armbase.append(tf2xyzwxyz(ee_in_armbase @ tcp2ee_pose))
        tcp_to_robot.append(tf2xyzwxyz(ee_in_robot @ tcp2ee_pose))

    return (
        np.stack(ee_to_armbase),
        np.stack(ee_to_robot),
        np.stack(tcp_to_armbase),
        np.stack(tcp_to_robot),
    )


def load_lmdb_data(
    episode_path: Path,
    sava_path: Path,
    fps_factor: int,
    target_fps: int,
    urdf_path: str = "../assets/piper100/piper100.urdf",
) -> Optional[Dict]:
    """Load one LMDB episode and encode its camera streams as videos.

    Args:
        episode_path: Episode directory containing ``lmdb/`` and
            ``meta_info.pkl``.
        sava_path: Directory under which ``<episode name>/<camera>.mp4`` files
            are written.
        fps_factor: Keep every ``fps_factor``-th step (must be >= 1).
        target_fps: Frame rate of the encoded videos.
        urdf_path: URDF used for forward kinematics; the default is resolved
            relative to the current working directory.

    Returns:
        ``{"frames": [...], "videos": {image key: video path}}``, or ``None``
        when loading fails (the error is logged) or no frame was selected.
        Failing to load the URDF is not caught and propagates to the caller.
    """
    qpos_keys = [
        "states.left_gripper.position",
        "states.left_joint.position",
        "states.right_gripper.position",
        "states.right_joint.position",
    ]
    master_action_keys = [
        "master_actions.left_gripper.openness",
        "master_actions.left_gripper.position",
        "master_actions.left_joint.position",
        "master_actions.right_gripper.openness",
        "master_actions.right_gripper.position",
        "master_actions.right_joint.position",
    ]
    image_keys = ["images.rgb.head", "images.rgb.hand_left", "images.rgb.hand_right"]
    camera_feature_keys = {
        "camera2env_pose.head": "head_camera_to_robot_extrinsics",
        "camera2env_pose.hand_left": "hand_left_camera_to_robot_extrinsics",
        "camera2env_pose.hand_right": "hand_right_camera_to_robot_extrinsics",
    }

    # Fixed mounting offsets of the two arm bases (translation only).
    armbase_to_robot = {"left": np.eye(4), "right": np.eye(4)}
    armbase_to_robot["left"][:3, 3] = [0.3684792, 0.30600064, 0.65270409]
    armbase_to_robot["right"][:3, 3] = [0.36847978, -0.30599953, 0.65270409]

    # Negates the y and z axes; applied on the right of every camera pose.
    camera_axis_flip = np.diag([1.0, -1.0, -1.0, 1.0])

    # Constant offset along z between the tcp frame and the ee frame.
    tcp2ee_pose = np.eye(4)
    tcp2ee_pose[2, 3] = 0.135

    model = pin.buildModelFromUrdf(urdf_path)
    data = model.createData()

    try:
        if fps_factor < 1:
            raise ValueError(f"fps_factor must be >= 1, got {fps_factor}")

        # SECURITY: pickle executes arbitrary code on load; only run this on
        # episodes from a trusted source.
        with open(episode_path / "meta_info.pkl", "rb") as meta_file:
            meta_info = pickle.load(meta_file)

        env = lmdb.open(
            str(episode_path / "lmdb"),
            readonly=True,
            lock=False,
            max_readers=128,
            readahead=False,
        )
        try:
            with env.begin(write=False) as txn:
                # Keys only: iterating with values would read every image blob.
                lmdb_keys = set(txn.cursor().iternext(keys=True, values=False))

                num_steps = meta_info["num_steps"]
                total_steps = [len(meta_info["keys"][image_key]) for image_key in image_keys]

                state_action_dict = {}

                # Joint / gripper positions, and the next position as action.
                for key in qpos_keys:
                    state_action_dict[key] = np.stack(pickle.loads(txn.get(key.encode())))
                    total_steps.append(len(state_action_dict[key]))
                for key in qpos_keys:
                    state_action_dict[key.replace("states", "actions")] = _next_step_as_action(state_action_dict[key])

                # Master-arm actions.
                for key in master_action_keys:
                    value = pickle.loads(txn.get(key.encode()))
                    if np.isscalar(value):
                        value = np.array([value]).astype("float32")
                    state_action_dict[key] = np.stack(value)
                    total_steps.append(len(state_action_dict[key]))

                # End-effector and tcp poses from forward kinematics.
                pose_state_keys = []
                for side in ("left", "right"):
                    side_keys = [
                        f"states.{side}_ee_to_{side}_armbase_pose",
                        f"states.{side}_ee_to_robot_pose",
                        f"states.{side}_tcp_to_{side}_armbase_pose",
                        f"states.{side}_tcp_to_robot_pose",
                    ]
                    ee2a, ee2r, tcp2a, tcp2r = _arm_pose_tracks(
                        model,
                        data,
                        state_action_dict[f"states.{side}_joint.position"],
                        armbase_to_robot[side],
                        tcp2ee_pose,
                    )
                    # The per-side key order mirrors the tuple returned above.
                    for pose_key, track in zip(side_keys, (ee2a, tcp2a := tcp2a, ee2r, tcp2r)[:0] or (ee2a, ee2r, tcp2a, tcp2r)):
                        state_action_dict[pose_key] = track
                    pose_state_keys.extend(side_keys)

                # Next pose as action.
                for state_key in pose_state_keys:
                    state_action_dict[state_key.replace("states", "actions")] = _next_step_as_action(
                        state_action_dict[state_key]
                    )

                # Camera intrinsics, repeated for every step.
                intrinsics_params = pickle.loads(txn.get(b"json_data"))
                hand_left_camera_params = _intrinsics_to_vector(intrinsics_params["hand_left_camera_params"])
                hand_right_camera_params = _intrinsics_to_vector(intrinsics_params["hand_right_camera_params"])
                head_camera_params = _intrinsics_to_vector(intrinsics_params["head_camera_params"])
                # A principal point this large goes with the 1280-wide stream,
                # which is encoded at half size below.
                if head_camera_params[2] >= 500:
                    head_camera_params /= 2
                state_action_dict["head_camera_intrinsics"] = np.stack([head_camera_params] * num_steps)
                state_action_dict["hand_left_camera_intrinsics"] = np.stack([hand_left_camera_params] * num_steps)
                state_action_dict["hand_right_camera_intrinsics"] = np.stack([hand_right_camera_params] * num_steps)

                # Robot pose in the environment frame.
                robot2env_pose_tfs = pickle.loads(txn.get(b"robot2env_pose"))
                state_action_dict["states.robot_to_env_pose"] = np.stack(
                    [tf2xyzwxyz(robot2env_pose_tf) for robot2env_pose_tf in robot2env_pose_tfs]
                )

                # Camera poses re-expressed in the robot frame.
                for lmdb_key, feature_key in camera_feature_keys.items():
                    camera2env_pose_tfs = pickle.loads(txn.get(lmdb_key.encode()))
                    camera2robot_poses = []
                    for frame_idx, camera2env_posetf in enumerate(camera2env_pose_tfs):
                        camera2robot_pose_tf = (
                            np.linalg.inv(robot2env_pose_tfs[frame_idx]) @ camera2env_posetf @ camera_axis_flip
                        )
                        camera2robot_poses.append(tf2xyzwxyz(camera2robot_pose_tf))
                    state_action_dict[feature_key] = np.stack(camera2robot_poses)

                print("episode_path:", episode_path)
                print("total_steps: ", total_steps)
                # Explicit raises instead of asserts, which vanish under -O.
                if len(set(total_steps)) != 1 or total_steps[0] <= 0:
                    raise ValueError(f"no data found or qpos / image steps mismatch in {episode_path}")
                for side in ("left", "right"):
                    joint_key = f"states.{side}_joint.position"
                    # Written as `not (... < ...)` so NaN values are rejected too.
                    if not (np.max(np.abs(state_action_dict[joint_key])) < 2 * np.pi):
                        raise ValueError(f"{joint_key} outside (-2*pi, 2*pi) in {episode_path}")

                task = meta_info["language_instruction"]
                frames = []
                image_observations = {image_key: [] for image_key in image_keys}
                for step in range(0, total_steps[0], fps_factor):
                    frame = {key: value[step] for key, value in state_action_dict.items()}
                    frame["task"] = task
                    frames.append(frame)
                    for image_key in image_keys:
                        image_lmdb_key = f"{image_key}/{step:04d}".encode()
                        if image_lmdb_key not in lmdb_keys:
                            raise ValueError(f"Image key {image_lmdb_key} not found in LMDB keys.")
                        image_observations[image_key].append(_decode_lmdb_image(txn, image_lmdb_key))
                print(f"load image_observations of {episode_path}")
        finally:
            # Close the environment on failure as well as on success.
            env.close()

        if not frames:
            return None

        episode_video_dir = sava_path / episode_path.name
        os.makedirs(episode_video_dir, exist_ok=True)

        video_paths = {}
        for image_key in image_keys:
            h_ori, w_ori = image_observations[image_key][0].shape[:2]
            # 1280-wide streams are stored at half resolution.
            if w_ori == 1280:
                w_tgt, h_tgt = w_ori // 2, h_ori // 2
            else:
                w_tgt, h_tgt = w_ori, h_ori
            video_path = episode_video_dir / f'{image_key.replace(".", "_")}.mp4'
            imageio.mimsave(
                video_path,
                image_observations[image_key],
                fps=target_fps,
                codec="libsvtav1",
                ffmpeg_params=[
                    "-crf", "28",  # quality control (0-63, default 30)
                    "-preset", "8",  # speed preset (0-13; higher is faster but compresses less)
                    "-pix_fmt", "yuv420p",  # widely compatible pixel format
                    "-movflags", "+faststart",  # metadata at the file start, for network playback
                    "-vf", f"scale={w_tgt}:{h_tgt}",
                    "-y",  # overwrite an existing output file
                ],
            )
            video_paths[image_key] = video_path
        print(f"imageio.mimsave time taken of {episode_path}")

        return {
            "frames": frames,
            "videos": video_paths,
        }

    except Exception as e:
        # Best effort per episode: report the failure and let the caller skip it.
        logging.error(f"Failed to load or process LMDB data: {e}", exc_info=True)
        return None
|
||
|
||
|
||
def get_all_tasks(src_path: Path, output_path: Path) -> Iterator[Tuple[Path, Path]]:
    """Yield the single ``(src_path, output_path)`` conversion task.

    This is a generator, so the output directory is created when the first
    item is requested, not when the function is called. Missing parent
    directories are created as well.
    """
    output_path.mkdir(parents=True, exist_ok=True)
    yield (src_path, output_path)
|
||
|
||
def compute_episode_stats(episode_data: Dict[str, List[str] | np.ndarray], features: Dict) -> Dict:
|
||
ep_stats = {}
|
||
for key, data in episode_data.items():
|
||
if features[key]["dtype"] == "string":
|
||
continue
|
||
elif features[key]["dtype"] in ["image", "video"]:
|
||
ep_ft_array = sample_images(data)
|
||
axes_to_reduce = (0, 2, 3) # keep channel dim
|
||
keepdims = True
|
||
else:
|
||
ep_ft_array = data # data is already a np.ndarray
|
||
axes_to_reduce = 0 # compute stats over the first axis
|
||
keepdims = data.ndim == 1 # keep as np.array
|
||
|
||
ep_stats[key] = get_feature_stats(ep_ft_array, axis=axes_to_reduce, keepdims=keepdims)
|
||
if features[key]["dtype"] in ["image", "video"]:
|
||
ep_stats[key] = {
|
||
k: v if k == "count" else np.squeeze(v / 255.0, axis=0) for k, v in ep_stats[key].items()
|
||
}
|
||
return ep_stats
|
||
|
||
def sample_images(input):
    """Return a uint8 stack of downsampled frames sampled from a video.

    Args:
        input: Either the path of a video file (``str`` or ``os.PathLike``),
            decoded with ``torchvision.io.VideoReader``, or an ``np.ndarray``
            of frames that gets an axis inserted at position 1.

    Returns:
        One downsampled frame per index chosen by ``sample_indices``, or
        ``None`` when no index is chosen.

    Raises:
        TypeError: If ``input`` is neither a path nor an ``np.ndarray``.
    """
    if isinstance(input, (str, os.PathLike)):
        reader = torchvision.io.VideoReader(os.fspath(input), stream="video")
        frames_array = torch.stack([frame["data"] for frame in reader]).numpy()  # Shape: [T, C, H, W]
    elif isinstance(input, np.ndarray):
        # assumes input is (T, H, W) without a channel axis — TODO confirm
        frames_array = input[:, None, :, :]  # Shape: [T, C, H, W]
    else:
        # Previously this fell through both branches and hit an unbound local.
        raise TypeError(
            f"sample_images() expects a video path or a numpy array, got {type(input).__name__}"
        )

    sampled_indices = sample_indices(len(frames_array))
    images = None
    for i, idx in enumerate(sampled_indices):
        img = auto_downsample_height_width(frames_array[idx])
        if images is None:
            # Allocated lazily because the downsampled shape is only known now.
            images = np.empty((len(sampled_indices), *img.shape), dtype=np.uint8)
        images[i] = img
    return images
|
||
|
||
|
||
def load_local_dataset(episode_path: str, save_path: str, origin_fps=30, target_fps=30):
    """Load one episode directory and return its frames and encoded videos.

    Args:
        episode_path: Episode directory; must contain ``lmdb/data.mdb``.
        save_path: Directory the encoded videos are written under.
        origin_fps: Frame rate of the recorded data.
        target_fps: Frame rate to keep; every ``origin_fps // target_fps``-th
            step is selected.

    Returns:
        ``(frames, videos)`` on success, ``(None, None)`` on any failure
        (invalid frame rates, missing episode data, loading error or missing
        video file). The reason is logged.
    """
    # A zero or negative target rate, or one above the source rate, gives no
    # usable downsampling factor; report it here in this function's usual
    # failure style instead of a division error or a per-episode crash later.
    if target_fps <= 0 or origin_fps < target_fps:
        logging.error(f"Invalid fps settings: origin_fps={origin_fps}, target_fps={target_fps}")
        return None, None
    if origin_fps % target_fps != 0:
        logging.warning(
            f"origin_fps={origin_fps} is not a multiple of target_fps={target_fps}; "
            "the effective frame rate will not match target_fps exactly"
        )
    fps_factor = origin_fps // target_fps

    episode_path = Path(episode_path)
    # Downstream code joins onto this with `/`, which needs a Path.
    save_path = Path(save_path)

    if not episode_path.exists():
        logging.warning(f"{episode_path} does not exist")
        return None, None

    if not (episode_path / "lmdb/data.mdb").exists():
        logging.warning(f"LMDB data not found for episode {episode_path}")
        return None, None

    raw_dataset = load_lmdb_data(episode_path, save_path, fps_factor, target_fps)
    if raw_dataset is None:
        return None, None

    frames = raw_dataset["frames"]  # states, actions, task
    videos = raw_dataset["videos"]  # image paths

    # Every camera stream must have produced a video file.
    for camera_name, video_path in videos.items():
        if not os.path.exists(video_path):
            logging.error(f"Video file {video_path} does not exist.")
            print(f"Camera {camera_name} Video file {video_path} does not exist.")
            return None, None
    return frames, videos
|
||
|
||
|
||
def _remove_downsampled_videos(videos):
    """Best-effort removal of the directories that hold an episode's videos.

    Args:
        videos: Mapping whose values are video file paths; the parent
            directory of each path is deleted.
    """
    # De-duplicate parent directories so a directory referenced by more
    # than one video is only removed once.
    parent_dirs = dict.fromkeys(os.path.dirname(path) for path in videos.values())
    for parent_dir in parent_dirs:
        try:
            shutil.rmtree(parent_dir)
        except FileNotFoundError:
            # Already gone: nothing left to clean up.
            continue
        except OSError as exc:
            # Cleanup stays best-effort, but the failure is no longer hidden.
            logging.warning("Could not delete %s: %s", parent_dir, exc)
        else:
            print(f"Successfully deleted: {parent_dir}")


def _add_episode(dataset, frames, videos):
    """Append every frame to ``dataset`` and finalise the episode."""
    for frame_data in frames:
        dataset.add_frame(frame_data)
    dataset.save_episode(videos=videos)


def save_as_lerobot_dataset(task: tuple[Path, Path], repo_id, num_threads, debug, origin_fps=30, target_fps=30, num_demos=None, robot_type="AgileX Split Aloha", delete_downsampled_videos=True):
    """Convert every episode directory of one task into a LeRobot dataset.

    Args:
        task: ``(src_path, save_path)`` pair. ``src_path`` holds one
            sub-directory per episode; ``save_path`` is the dataset root.
            An existing ``save_path`` is deleted first.
        repo_id: Repository identifier passed to ``SplitAlohaDataset.create``.
        num_threads: Number of episodes loaded concurrently per batch
            (non-debug mode only).
        debug: When true, only the first episode is converted, without
            threads.
        origin_fps: Frame rate of the recorded data.
        target_fps: Frame rate of the created dataset.
        num_demos: If not ``None``, only the first ``num_demos`` episode
            directories (sorted by path) are converted.
        robot_type: Robot type passed to ``SplitAlohaDataset.create``.
        delete_downsampled_videos: When true, the parent directory of each
            episode video is removed after the episode is saved.
    """
    src_path, save_path = task
    print(f"**Processing collected** {src_path}")
    print(f"**saving to** {save_path}")
    if save_path.exists():
        print(f"Output directory {save_path} already exists. Deleting it.")
        logging.warning(f"Output directory {save_path} already exists. Deleting it.")
        shutil.rmtree(save_path)

    dataset = SplitAlohaDataset.create(
        repo_id=f"{repo_id}",
        root=save_path,
        fps=target_fps,
        robot_type=robot_type,
        features=FEATURES,
    )
    all_episode_paths = sorted(f.as_posix() for f in src_path.glob("*") if f.is_dir())
    if num_demos is not None:
        all_episode_paths = all_episode_paths[:num_demos]

    if debug:
        # Slicing (rather than indexing [0]) keeps an empty source
        # directory from raising IndexError.
        for episode_path in all_episode_paths[:1]:
            frames, videos = load_local_dataset(
                episode_path=episode_path,
                save_path=save_path,
                origin_fps=origin_fps,
                target_fps=target_fps,
            )
            if frames is None or videos is None:
                print(f"Skipping episode {episode_path} due to missing data.")
                continue
            _add_episode(dataset, frames, videos)
            if delete_downsampled_videos:
                _remove_downsampled_videos(videos)
        return

    counter_episodes_uncomplete = 0
    # Episodes are loaded in batches of ``num_threads``; each batch is fully
    # written to the dataset before the next one is started.
    for batch_start in range(0, len(all_episode_paths), num_threads):
        batch_episode_paths = all_episode_paths[batch_start:batch_start + num_threads]
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            # Remember which path each future belongs to so that messages
            # below name the right episode (completion order is arbitrary).
            future_to_path = {}
            for episode_path in batch_episode_paths:
                print("starting to process episode: ", episode_path)
                future = executor.submit(
                    load_local_dataset,
                    episode_path=episode_path,
                    save_path=save_path,
                    origin_fps=origin_fps,
                    target_fps=target_fps,
                )
                future_to_path[future] = episode_path

            for future in as_completed(future_to_path):
                episode_path = future_to_path[future]
                frames, videos = future.result()
                if frames is None or videos is None:
                    counter_episodes_uncomplete += 1
                    print(f"Skipping episode {episode_path} due to missing data.")
                    continue
                _add_episode(dataset, frames, videos)
                gc.collect()
                print(f"finishing processed {videos}")
                if delete_downsampled_videos:
                    _remove_downsampled_videos(videos)
    print("counter_episodes_uncomplete:", counter_episodes_uncomplete)
|
||
|
||
def main(src_path, save_path, repo_id, num_threads=4, debug=False, origin_fps=30, target_fps=30, num_demos=None):
    """Convert every task returned by ``get_all_tasks(src_path, save_path)``.

    Args:
        src_path: Source root, forwarded to ``get_all_tasks``.
        save_path: Output root, forwarded to ``get_all_tasks``.
        repo_id: Dataset repository identifier.
        num_threads: Number of loader threads used per task.
        debug: When true, only the first task is converted.
        origin_fps: Frame rate of the recorded data.
        target_fps: Frame rate of the converted data.
        num_demos: Optional cap on the number of episodes per task.
    """
    logging.info("Scanning for episodes...")
    tasks = get_all_tasks(src_path, save_path)
    if debug:
        # A bare next() raises StopIteration when the scan finds nothing;
        # the default makes the empty case explicit.
        first_task = next(iter(tasks), None)
        if first_task is None:
            logging.warning("No tasks found under %s; nothing to convert.", src_path)
            return
        tasks = [first_task]

    for task in tasks:
        save_as_lerobot_dataset(
            task,
            repo_id,
            num_threads=num_threads,
            debug=debug,
            origin_fps=origin_fps,
            target_fps=target_fps,
            num_demos=num_demos,
        )
|
||
|
||
def build_arg_parser():
    """Build the command-line parser for the conversion script.

    Returns:
        An ``argparse.ArgumentParser`` exposing ``src_path``, ``save_path``,
        ``debug``, ``num_threads``, ``repo_id``, ``origin_fps``,
        ``target_fps`` and ``num_demos``.
    """
    parser = argparse.ArgumentParser(description="Convert collected data from Piper to Lerobot format.")
    parser.add_argument(
        "--src_path",
        type=str,
        default="/fs-computility/efm/shared/datasets/myData-A1/real/raw_data/agilex_split_aloha/",
        help="Path to the input file containing collected data in Piper format.",
    )
    parser.add_argument(
        "--save_path",
        type=str,
        default="/fs-computility/efm/shared/datasets/myData-A1/real/lerobot_v2_1/agilex_split_aloha/",
        help="Path to the output file where the converted Lerobot format will be saved.",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Run in debug mode with limited episodes",
    )
    # "--num-threads" is the historical spelling; "--num_threads" is accepted
    # as well so the flag matches the underscore style of the other options.
    parser.add_argument(
        "--num-threads",
        "--num_threads",
        type=int,
        default=64,
        help="Number of threads per process",
    )
    parser.add_argument(
        "--repo_id",
        type=str,
        default="SplitAloha_20250714",
        help="identifier for the dataset repository.",
    )
    parser.add_argument(
        "--origin_fps",
        type=int,
        default=30,
        help="Frames per second for the observation video. Default is 30.",
    )
    parser.add_argument(
        "--target_fps",
        type=int,
        default=30,
        help="Frames per second for the downsample video. Default is 30.",
    )
    parser.add_argument(
        "--num_demos",
        type=int,
        default=None,
        help="Demos need to transfer",
    )
    return parser


if __name__ == "__main__":
    parser = build_arg_parser()
    args = parser.parse_args()
    # Validate explicitly: ``assert`` is stripped under ``python -O`` and a
    # zero target_fps would otherwise surface as a ZeroDivisionError.
    if args.origin_fps <= 0 or args.target_fps <= 0 or args.origin_fps % args.target_fps != 0:
        parser.error("origin_fps must be an integer multiple of target_fps")

    start_time = time.time()
    main(
        src_path=Path(args.src_path),
        save_path=Path(args.save_path),
        repo_id=args.repo_id,
        num_threads=args.num_threads,
        debug=args.debug,
        origin_fps=args.origin_fps,
        target_fps=args.target_fps,
        num_demos=args.num_demos,
    )
    elapsed_time = time.time() - start_time
    print(f"Total time taken: {elapsed_time:.2f} seconds")
|