164 lines
6.5 KiB
Python
164 lines
6.5 KiB
Python
#!/usr/bin/env python
|
|
|
|
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
import warnings
|
|
from typing import Any
|
|
|
|
import einops
|
|
import gymnasium as gym
|
|
import numpy as np
|
|
import torch
|
|
from torch import Tensor
|
|
|
|
from lerobot.common.envs.configs import EnvConfig
|
|
from lerobot.common.utils.utils import get_channel_first_image_shape
|
|
from lerobot.configs.types import FeatureType, PolicyFeature
|
|
|
|
|
|
def preprocess_observation(observations: dict[str, np.ndarray]) -> dict[str, Tensor]:
    # TODO(aliberts, rcadene): refactor this to use features from the environment (no hardcoding)
    """Convert environment observation to LeRobot format observation.

    Args:
        observations: Dictionary of observation batches from a Gym vector environment.
            Expected keys: "pixels" (a single array or a dict of per-camera arrays,
            channel-last uint8), optionally "environment_state", and "agent_pos".

    Returns:
        Dictionary of observation batches with keys renamed to LeRobot format and values as tensors.

    Raises:
        AssertionError: If an image is not channel-last or not uint8.
        KeyError: If "agent_pos" is missing from `observations`.
    """
    # map to expected inputs for the policy
    return_observations = {}
    if "pixels" in observations:
        if isinstance(observations["pixels"], dict):
            imgs = {f"observation.images.{key}": img for key, img in observations["pixels"].items()}
        else:
            imgs = {"observation.image": observations["pixels"]}

        for imgkey, img in imgs.items():
            # TODO(aliberts, rcadene): use transforms.ToTensor()?
            img = torch.from_numpy(img)

            # A single (unbatched) frame gets a leading batch dimension of 1.
            if img.dim() == 3:
                img = img.unsqueeze(0)
            # sanity check that images are channel last
            _, h, w, c = img.shape
            assert c < h and c < w, f"expect channel last images, but instead got {img.shape=}"

            # sanity check that images are uint8
            assert img.dtype == torch.uint8, f"expect torch.uint8, but instead {img.dtype=}"

            # convert to channel first of type float32 in range [0,1]
            # (b h w c -> b c h w; native permute replaces the previous einops.rearrange call)
            img = img.permute(0, 3, 1, 2).contiguous()
            img = img.type(torch.float32)
            img /= 255

            return_observations[imgkey] = img

    if "environment_state" in observations:
        env_state = torch.from_numpy(observations["environment_state"]).float()
        # Promote a single (unbatched) state vector to a batch of one.
        if env_state.dim() == 1:
            env_state = env_state.unsqueeze(0)

        return_observations["observation.environment_state"] = env_state

    # TODO(rcadene): enable pixels only baseline with `obs_type="pixels"` in environment by removing
    agent_pos = torch.from_numpy(observations["agent_pos"]).float()
    if agent_pos.dim() == 1:
        agent_pos = agent_pos.unsqueeze(0)
    return_observations["observation.state"] = agent_pos

    return return_observations
|
|
|
|
|
|
class ObservationProcessorWrapper(gym.vector.VectorEnvWrapper):
    """Vector-env wrapper that converts raw gym observations into the LeRobot format."""

    def __init__(self, env: gym.vector.VectorEnv):
        super().__init__(env)

    def _observations(self, observations: dict[str, Any]) -> dict[str, Any]:
        # Key renaming and tensor conversion are delegated to the shared helper.
        return preprocess_observation(observations)

    def reset(
        self,
        *,
        seed: int | list[int] | None = None,
        options: dict[str, Any] | None = None,
    ):
        """Modifies the observation returned from the environment ``reset`` using the :meth:`observation`."""
        raw_observations, infos = self.env.reset(seed=seed, options=options)
        return self._observations(raw_observations), infos

    def step(self, actions):
        """Modifies the observation returned from the environment ``step`` using the :meth:`observation`."""
        raw_observations, rewards, terminations, truncations, infos = self.env.step(actions)
        processed = self._observations(raw_observations)
        return processed, rewards, terminations, truncations, infos
|
|
|
|
|
|
def env_to_policy_features(env_cfg: EnvConfig) -> dict[str, PolicyFeature]:
    # TODO(aliberts, rcadene): remove this hardcoding of keys and just use the nested keys as is
    # (need to also refactor preprocess_observation and externalize normalization from policies)
    """Translate an environment's feature specs into policy features.

    Visual features are converted to channel-first shapes; everything else is
    passed through unchanged. Keys are remapped via ``env_cfg.features_map``.
    """
    policy_features: dict[str, PolicyFeature] = {}
    for env_key, env_feature in env_cfg.features.items():
        if env_feature.type is not FeatureType.VISUAL:
            mapped = env_feature
        else:
            # Visual features must be 3D (e.g. H x W x C) to be reordered.
            if len(env_feature.shape) != 3:
                raise ValueError(f"Number of dimensions of {env_key} != 3 (shape={env_feature.shape})")

            mapped = PolicyFeature(
                type=env_feature.type,
                shape=get_channel_first_image_shape(env_feature.shape),
            )

        policy_features[env_cfg.features_map[env_key]] = mapped

    return policy_features
|
|
|
|
|
|
def are_all_envs_same_type(env: gym.vector.VectorEnv) -> bool:
    """Return True when every sub-environment shares the first one's concrete class."""
    expected_type = type(env.envs[0])
    for sub_env in env.envs:
        # Strict identity check on the class, not isinstance: subclasses count as different.
        if type(sub_env) is not expected_type:
            return False
    return True
|
|
|
|
|
|
def check_env_attributes_and_types(env: gym.vector.VectorEnv) -> None:
    """Warn when the env lacks task attributes or mixes sub-environment types."""
    with warnings.catch_warnings():
        # Deduplicate identical warnings, scoped to this function only.
        warnings.simplefilter("once", UserWarning)

        first_env = env.envs[0]
        has_task_attributes = hasattr(first_env, "task_description") and hasattr(first_env, "task")
        if not has_task_attributes:
            warnings.warn(
                "The environment does not have 'task_description' and 'task'. Some policies require these features.",
                UserWarning,
                stacklevel=2,
            )
        if not are_all_envs_same_type(env):
            warnings.warn(
                "The environments have different types. Make sure you infer the right task from each environment. Empty task will be passed instead.",
                UserWarning,
                stacklevel=2,
            )
|
|
|
|
|
|
def add_envs_task(env: gym.vector.VectorEnv, observation: dict[str, Any]) -> dict[str, Any]:
    """Adds task feature to the observation dict with respect to the first environment attribute."""
    first_env = env.envs[0]
    if hasattr(first_env, "task_description"):
        observation["task"] = env.call("task_description")
    elif hasattr(first_env, "task"):
        observation["task"] = env.call("task")
    else:  # For envs without language instructions, e.g. aloha transfer cube and etc.
        # Derive the batch size from any observation entry and emit empty task strings.
        batch_size = next(iter(observation.values())).shape[0]
        observation["task"] = [""] * batch_size
    return observation
|