[WIP] Non functional yet

Add ManiSkill environment configuration and wrappers - Introduced `VideoRecordConfig` for video recording settings. - Added `ManiskillEnvConfig` to encapsulate environment-specific configurations. - Implemented various wrappers for the ManiSkill environment, including observation and action scaling. - Enhanced the `make_maniskill` function to create a wrapped ManiSkill environment with video recording and observation processing. - Updated the `actor_server` and `learner_server` to utilize the new configuration structure. - Refactored the training pipeline to accommodate the new environment and policy configurations.
2025-03-26 08:15:05 +00:00
parent 114ec644d0
commit 056f79d358
9 changed files with 667 additions and 436 deletions
--- a/lerobot/common/envs/configs.py
+++ b/lerobot/common/envs/configs.py
@@ -154,3 +154,61 @@ class XarmEnv(EnvConfig):
            "visualization_height": self.visualization_height,
            "max_episode_steps": self.episode_length,
        }
+
+
+@dataclass
+class VideoRecordConfig:
+    """Configuration for video recording in ManiSkill environments."""
+    enabled: bool = False
+    record_dir: str = "videos"
+    trajectory_name: str = "trajectory"
+
+@EnvConfig.register_subclass("maniskill_push")
+@dataclass
+class ManiskillEnvConfig(EnvConfig):
+    """Configuration for the ManiSkill environment."""
+    name: str = "maniskill/pushcube"
+    task: str = "PushCube-v1"
+    image_size: int = 64
+    control_mode: str = "pd_ee_delta_pose"
+    state_dim: int = 25
+    action_dim: int = 7
+    fps: int = 400
+    episode_length: int = 50
+    obs_type: str = "rgb"
+    render_mode: str = "rgb_array"
+    render_size: int = 64
+    device: str = "cuda"
+    robot: str = "so100" # This is a hack to make the robot config work
+    video_record: VideoRecordConfig = field(default_factory=VideoRecordConfig)
+    features: dict[str, PolicyFeature] = field(
+        default_factory=lambda: {
+            "action": PolicyFeature(type=FeatureType.ACTION, shape=(7,)),
+            "observation.image": PolicyFeature(type=FeatureType.VISUAL, shape=(3, 64, 64)),
+            "observation.state": PolicyFeature(type=FeatureType.STATE, shape=(25,)),
+        }
+    )
+    features_map: dict[str, str] = field(
+        default_factory=lambda: {
+            "action": ACTION,
+            "observation.image": OBS_IMAGE,
+            "observation.state": OBS_ROBOT,
+        }
+    )
+    reward_classifier: dict[str, str | None] = field(
+        default_factory=lambda: {
+            "pretrained_path": None,
+            "config_path": None,
+        }
+    )
+
+    @property
+    def gym_kwargs(self) -> dict:
+        return {
+            "obs_type": self.obs_type,
+            "render_mode": self.render_mode,
+            "max_episode_steps": self.episode_length,
+            "control_mode": self.control_mode,
+            "sensor_configs": {"width": self.image_size, "height": self.image_size},
+            "num_envs": 1,
+        }
--- a/lerobot/common/envs/factory.py
+++ b/lerobot/common/envs/factory.py
@@ -69,88 +69,3 @@ def make_env(cfg: EnvConfig, n_envs: int = 1, use_async_envs: bool = False) -> g

    return env

-
-def make_maniskill_env(cfg: DictConfig, n_envs: int | None = None) -> gym.vector.VectorEnv | None:
-    """Make ManiSkill3 gym environment"""
-    from mani_skill.vector.wrappers.gymnasium import ManiSkillVectorEnv
-
-    env = gym.make(
-        cfg.env.task,
-        obs_mode=cfg.env.obs,
-        control_mode=cfg.env.control_mode,
-        render_mode=cfg.env.render_mode,
-        sensor_configs=dict(width=cfg.env.image_size, height=cfg.env.image_size),
-        num_envs=n_envs,
-    )
-    # cfg.env_cfg.control_mode = cfg.eval_env_cfg.control_mode = env.control_mode
-    env = ManiSkillVectorEnv(env, ignore_terminations=True)
-    # state should have the size of 25
-    # env = ConvertToLeRobotEnv(env, n_envs)
-    # env = PixelWrapper(cfg, env, n_envs)
-    env._max_episode_steps = env.max_episode_steps = 50  # gym_utils.find_max_episode_steps_value(env)
-    env.unwrapped.metadata["render_fps"] = 20
-
-    return env
-
-
-class PixelWrapper(gym.Wrapper):
-    """
-    Wrapper for pixel observations. Works with Maniskill vectorized environments
-    """
-
-    def __init__(self, cfg, env, num_envs, num_frames=3):
-        super().__init__(env)
-        self.cfg = cfg
-        self.env = env
-        self.observation_space = gym.spaces.Box(
-            low=0,
-            high=255,
-            shape=(num_envs, num_frames * 3, cfg.env.render_size, cfg.env.render_size),
-            dtype=np.uint8,
-        )
-        self._frames = deque([], maxlen=num_frames)
-        self._render_size = cfg.env.render_size
-
-    def _get_obs(self, obs):
-        frame = obs["sensor_data"]["base_camera"]["rgb"].cpu().permute(0, 3, 1, 2)
-        self._frames.append(frame)
-        return {"pixels": torch.from_numpy(np.concatenate(self._frames, axis=1)).to(self.env.device)}
-
-    def reset(self, seed):
-        obs, info = self.env.reset()  # (seed=seed)
-        for _ in range(self._frames.maxlen):
-            obs_frames = self._get_obs(obs)
-        return obs_frames, info
-
-    def step(self, action):
-        obs, reward, terminated, truncated, info = self.env.step(action)
-        return self._get_obs(obs), reward, terminated, truncated, info
-
-
-# TODO: Remove this
-class ConvertToLeRobotEnv(gym.Wrapper):
-    def __init__(self, env, num_envs):
-        super().__init__(env)
-
-    def reset(self, seed=None, options=None):
-        obs, info = self.env.reset(seed=seed, options={})
-        return self._get_obs(obs), info
-
-    def step(self, action):
-        obs, reward, terminated, truncated, info = self.env.step(action)
-        return self._get_obs(obs), reward, terminated, truncated, info
-
-    def _get_obs(self, observation):
-        sensor_data = observation.pop("sensor_data")
-        del observation["sensor_param"]
-        images = []
-        for cam_data in sensor_data.values():
-            images.append(cam_data["rgb"])
-
-        images = torch.concat(images, axis=-1)
-        # flatten the rest of the data which should just be state data
-        observation = common.flatten_state_dict(observation, use_torch=True, device=self.base_env.device)
-        ret = dict()
-        ret["state"] = observation
-        ret["pixels"] = images
-        return ret
--- a/lerobot/common/policies/sac/configuration_sac.py
+++ b/lerobot/common/policies/sac/configuration_sac.py
@@ -31,12 +31,19 @@ class SACConfig(PreTrainedConfig):
    Args:
        n_obs_steps: Number of environment steps worth of observations to pass to the policy.
        normalization_mapping: Mapping from feature types to normalization modes.
+        dataset_stats: Statistics for normalizing different data types.
        camera_number: Number of cameras to use.
+        device: Device to use for training.
        storage_device: Device to use for storage.
        vision_encoder_name: Name of the vision encoder to use.
        freeze_vision_encoder: Whether to freeze the vision encoder.
        image_encoder_hidden_dim: Hidden dimension for the image encoder.
        shared_encoder: Whether to use a shared encoder.
+        online_steps: Total number of online training steps.
+        online_env_seed: Seed for the online environment.
+        online_buffer_capacity: Capacity of the online replay buffer.
+        online_step_before_learning: Number of steps to collect before starting learning.
+        policy_update_freq: Frequency of policy updates.
        discount: Discount factor for the RL algorithm.
        temperature_init: Initial temperature for entropy regularization.
        num_critics: Number of critic networks.
@@ -54,6 +61,8 @@ class SACConfig(PreTrainedConfig):
        critic_network_kwargs: Additional arguments for critic networks.
        actor_network_kwargs: Additional arguments for actor network.
        policy_kwargs: Additional arguments for policy.
+        actor_learner_config: Configuration for actor-learner communication.
+        concurrency: Configuration for concurrency model.
    """

    # Input / output structure
@@ -86,13 +95,21 @@ class SACConfig(PreTrainedConfig):

    # Architecture specifics
    camera_number: int = 1
+    device: str = "cuda"
    storage_device: str = "cpu"
    # Set to "helper2424/resnet10" for hil serl 
    vision_encoder_name: str | None = None 
    freeze_vision_encoder: bool = True
    image_encoder_hidden_dim: int = 32
    shared_encoder: bool = True
-    
+
+    # Training parameter
+    online_steps: int = 1000000
+    online_env_seed: int = 10000
+    online_buffer_capacity: int = 10000
+    online_step_before_learning: int = 100 
+    policy_update_freq: int = 1
+
    # SAC algorithm parameters
    discount: float = 0.99
    temperature_init: float = 1.0
@@ -132,11 +149,17 @@ class SACConfig(PreTrainedConfig):
        }
    )
    
-    # Deprecated, kept for backward compatibility
    actor_learner_config: dict[str, str | int] = field(
        default_factory=lambda: {
            "learner_host": "127.0.0.1",
            "learner_port": 50051,
+            "policy_parameters_push_frequency": 4,
+        }
+    )
+    concurrency: dict[str, str] = field(
+        default_factory=lambda: {
+            "actor": "threads",
+            "learner": "threads"
        }
    )

--- a/lerobot/common/utils/wandb_utils.py
+++ b/lerobot/common/utils/wandb_utils.py
@@ -92,6 +92,8 @@ class WandBLogger:
            resume="must" if cfg.resume else None,
            mode=self.cfg.mode if self.cfg.mode in ["online", "offline", "disabled"] else "online",
        )
+        # Handle custom step key for rl asynchronous training.
+        self._wandb_custom_step_key: set[str] | None = None
        print(colored("Logs will be synced with wandb.", "blue", attrs=["bold"]))
        logging.info(f"Track this run --> {colored(wandb.run.get_url(), 'yellow', attrs=['bold'])}")
        self._wandb = wandb
@@ -108,9 +110,24 @@ class WandBLogger:
        artifact.add_file(checkpoint_dir / PRETRAINED_MODEL_DIR / SAFETENSORS_SINGLE_FILE)
        self._wandb.log_artifact(artifact)

-    def log_dict(self, d: dict, step: int, mode: str = "train"):
+    def log_dict(self, d: dict, step: int, mode: str = "train", custom_step_key: str | None = None):
        if mode not in {"train", "eval"}:
            raise ValueError(mode)
+        if step is None and custom_step_key is None:
+                    raise ValueError("Either step or custom_step_key must be provided.")
+
+        # NOTE: This is not simple. Wandb step is it must always monotonically increase and it
+        # increases with each wandb.log call, but in the case of asynchronous RL for example,
+        # multiple time steps is possible for example, the interaction step with the environment,
+        # the training step, the evaluation step, etc. So we need to define a custom step key
+        # to log the correct step for each metric.
+        if custom_step_key is not None:
+            if self._wandb_custom_step_key is None:
+                self._wandb_custom_step_key = set()
+            new_custom_key = f"{mode}/{custom_step_key}"
+            if new_custom_key not in self._wandb_custom_step_key:
+                self._wandb_custom_step_key.add(new_custom_key)
+                self._wandb.define_metric(new_custom_key, hidden=True)

        for k, v in d.items():
            if not isinstance(v, (int, float, str)):
@@ -118,7 +135,26 @@ class WandBLogger:
                    f'WandB logging of key "{k}" was ignored as its type is not handled by this wrapper.'
                )
                continue
-            self._wandb.log({f"{mode}/{k}": v}, step=step)
+
+            # Do not log the custom step key itself.
+            if (
+                self._wandb_custom_step_key is not None
+                and k in self._wandb_custom_step_key
+            ):
+                continue
+
+            if custom_step_key is not None:
+                value_custom_step = d[custom_step_key]
+                self._wandb.log(
+                    {
+                        f"{mode}/{k}": v,
+                        f"{mode}/{custom_step_key}": value_custom_step,
+                    }
+                )
+                continue
+
+            self._wandb.log(data={f"{mode}/{k}": v}, step=step)
+

    def log_video(self, video_path: str, step: int, mode: str = "train"):
        if mode not in {"train", "eval"}: