Compare commits

..

128 Commits

Author SHA1 Message Date
Michel Aractingi
16b905ee67 adding sac implemenation 2024-12-06 01:32:22 +01:00
Michel Aractingi
44d96a0811 nit 2024-11-27 15:19:20 +01:00
Michel Aractingi
4488e55e94 first commit 2024-11-27 15:01:06 +01:00
Simon Alibert
2945dca65b Comment 2024-11-26 10:38:11 +01:00
Simon Alibert
691d39aa32 Merge remote-tracking branch 'origin/main' into user/aliberts/2024_09_25_reshape_dataset 2024-11-26 10:37:17 +01:00
Simon Alibert
56c01a2944 Activate end-to-end tests 2024-11-26 10:35:10 +01:00
Simon Alibert
49bdcc094c Remove comment 2024-11-26 10:34:14 +01:00
Simon Alibert
6ad84a6561 Refactor pusht_zarr 2024-11-25 18:23:04 +01:00
Simon Alibert
3b5af7eb38 Merge remote-tracking branch 'origin/main' into user/aliberts/2024_09_25_reshape_dataset 2024-11-25 17:43:58 +01:00
Simon Alibert
23f6c875b5 Apply suggestions from code review 2024-11-25 12:44:12 +01:00
Simon Alibert
f56d769dfb Remove todos 2024-11-20 21:10:02 +01:00
Simon Alibert
36b9b60a0e Update example 1 2024-11-20 18:22:46 +01:00
Simon Alibert
93d9bf83c2 Add open X datasets 2024-11-20 14:42:08 +01:00
Simon Alibert
37da50b573 Fix conversion script 2024-11-20 12:16:27 +01:00
Simon Alibert
c6ad495176 Enhance dataset cards 2024-11-20 11:57:27 +01:00
Simon Alibert
f43e5d07f5 Fix tests 2024-11-20 00:26:31 +01:00
Simon Alibert
9ee8711504 Update example 1 2024-11-19 18:49:35 +01:00
Simon Alibert
6203641710 Use HWC for images 2024-11-19 18:48:17 +01:00
Remi
1f13bda25b Improve dataset v2 (#498) 2024-11-19 12:31:47 +01:00
Simon Alibert
acae4b49d2 Add comment on license 2024-11-18 18:54:13 +01:00
Simon Alibert
c72ad49c9b Skip test_examples 2024-11-18 18:46:00 +01:00
Simon Alibert
eda02fade5 Skip test_visualize_local_dataset 2024-11-18 18:06:18 +01:00
Simon Alibert
8546358bc5 Fix test_visualize_dataset_html 2024-11-18 17:54:15 +01:00
Simon Alibert
a91b7c6163 Add extra info to dataset card, various fixes from Remi's review 2024-11-18 17:50:13 +01:00
Simon Alibert
4d15861872 Merge remote-tracking branch 'origin/main' into user/aliberts/2024_09_25_reshape_dataset 2024-11-05 19:10:38 +01:00
Simon Alibert
f3630ad910 Fix tests 2024-11-05 19:09:12 +01:00
Simon Alibert
aed9f4036a Refactor dataset features 2024-11-05 13:10:43 +01:00
Simon Alibert
757ea175d3 Fix test_examples 2024-11-03 19:41:50 +01:00
Simon Alibert
b69a132737 Fix test_examples 2024-11-03 19:30:56 +01:00
Simon Alibert
7b159a6b22 Move calculate_episode_data_index 2024-11-03 19:13:00 +01:00
Simon Alibert
74270c8c91 Remove reset_episode_index 2024-11-03 19:07:43 +01:00
Simon Alibert
a6762ec316 Fix advanced example 2 2024-11-03 19:03:15 +01:00
Simon Alibert
fde29e0167 Merge remote-tracking branch 'origin/main' into user/aliberts/2024_09_25_reshape_dataset 2024-11-03 18:32:14 +01:00
Simon Alibert
56e4603d5b Deactivate policies backward compatibility test 2024-11-03 18:30:50 +01:00
Simon Alibert
f6c90ca35f Remove end-to-end tests 2024-11-03 18:28:54 +01:00
Simon Alibert
c2d6fb6119 Fix werkzeug alert 2024-11-03 18:23:38 +01:00
Simon Alibert
95a4b59b5b Fix vizualize 2024-11-03 18:18:23 +01:00
Simon Alibert
16103cb8b7 Fix hanging 2024-11-03 18:07:55 +01:00
Simon Alibert
e4ba084e25 Add LeRobotDatasetMetadata 2024-11-03 18:07:37 +01:00
Simon Alibert
ac79e8cb36 Fix test_online_buffer.py 2024-11-03 13:15:01 +01:00
Simon Alibert
df2cb51364 Add todo in skipped test 2024-11-02 20:01:34 +01:00
Simon Alibert
7a342db9c4 Add more options to img factories 2024-11-02 20:01:02 +01:00
Simon Alibert
6b2ec1ed77 Add test_image_writer, accept PIL images, improve ImageWriter perf in main process 2024-11-02 20:00:07 +01:00
Simon Alibert
375abd3020 Add img and img_tensor factories 2024-11-02 13:06:38 +01:00
Simon Alibert
293bdc7f67 Simplify, add test content, add todo 2024-11-01 19:55:28 +01:00
Simon Alibert
79d114cc1f Rename num_samples -> num_frames for consistency 2024-11-01 19:47:16 +01:00
Simon Alibert
2650872b76 Add tasks and episodes factories 2024-11-01 13:37:17 +01:00
Simon Alibert
cd1509d805 Mock snapshot_download 2024-11-01 10:58:09 +01:00
Simon Alibert
5ea7c78237 Remove obsolete code 2024-10-31 21:43:57 +01:00
Simon Alibert
443a9eec88 Remove/comment obsolete tests 2024-10-31 21:43:29 +01:00
Simon Alibert
ab23a4fd27 Add fixtures in test_datasets 2024-10-31 21:32:09 +01:00
Simon Alibert
1267c3e955 Split fixtures into factories and files 2024-10-31 21:29:50 +01:00
Simon Alibert
c70b8d0abc Update doc 2024-10-31 14:08:13 +01:00
Simon Alibert
e1845d4dcc Update doc 2024-10-31 14:06:54 +01:00
Simon Alibert
e69f0c5059 Add test_delta_timestamps.py 2024-10-31 13:48:40 +01:00
Simon Alibert
ff84024ee9 Add dataset fixtures 2024-10-31 13:46:46 +01:00
Simon Alibert
ee51f54cb5 Remove dataset from image_transform tests 2024-10-29 16:08:01 +01:00
Simon Alibert
fee5fa5c2e Remove image_writer arg 2024-10-29 16:07:10 +01:00
Simon Alibert
8d57093ce7 Merge remote-tracking branch 'origin/main' into user/aliberts/2024_09_25_reshape_dataset 2024-10-29 15:25:25 +01:00
Simon Alibert
4c22de20a6 Add sanity check 2024-10-28 12:26:07 +01:00
Simon Alibert
51e87f6f97 Fix image writer 2024-10-28 12:01:32 +01:00
Simon Alibert
df3d2ec5df Speedup test 2024-10-26 10:59:06 +02:00
Simon Alibert
e210d795de Add video_info, fix image_writer 2024-10-25 16:55:33 +02:00
Simon Alibert
18ffa4248b Add json/jsonl io functions 2024-10-24 11:49:53 +02:00
Simon Alibert
8bcf81fa24 Add todo 2024-10-24 11:38:32 +02:00
Simon Alibert
615894d3fb Add test_same_attributes_defined 2024-10-24 11:37:44 +02:00
Simon Alibert
450eae310b Add error msg 2024-10-24 00:13:53 +02:00
Simon Alibert
60865e8980 Allow dataset creation without robot 2024-10-24 00:13:21 +02:00
Simon Alibert
0d77be90ee Move ImageWriter creation inside the dataset 2024-10-23 23:12:44 +02:00
Simon Alibert
0098bd264e Nits 2024-10-23 20:55:54 +02:00
Simon Alibert
1aba80d93f Fix consolidate 2024-10-23 18:45:59 +02:00
Simon Alibert
07570f867f Fix _query_videos return shapes 2024-10-23 18:18:28 +02:00
Simon Alibert
b8bdbc1c5b Fix check_delta_timestamps 2024-10-23 18:17:56 +02:00
Simon Alibert
7ae8d05326 Fix visualization 2024-10-23 14:20:27 +02:00
Simon Alibert
a2a8538ac9 add write_stats, changes names, add some typing 2024-10-23 11:38:07 +02:00
Simon Alibert
fb73cdb9a4 Update dataset doc 2024-10-23 00:32:28 +02:00
Simon Alibert
9dca233d7e Fix episode chunk 2024-10-23 00:27:14 +02:00
Simon Alibert
c3c0141738 Update & fix conversion script 2024-10-23 00:05:31 +02:00
Simon Alibert
c72dc23c43 Remove total_episodes from default parquet path 2024-10-23 00:03:30 +02:00
Simon Alibert
237a484be0 Fix paths & add add_frame doc 2024-10-22 22:46:34 +02:00
Simon Alibert
6c2cb6e107 Remove populate dataset 2024-10-22 20:21:26 +02:00
Simon Alibert
b46db7ea73 Fix tests 2024-10-22 20:14:06 +02:00
Simon Alibert
ee52b8b782 Add channels to intelrealsense 2024-10-22 20:07:11 +02:00
Simon Alibert
a805458c7e Add local_files_only, encode_videos, fix bugs to pass tests (WIP) 2024-10-22 19:57:52 +02:00
Simon Alibert
e991a31061 Improve consistency between __init__() and create(), WIP on consolidate 2024-10-22 00:19:25 +02:00
Simon Alibert
c4c0a43de7 add delete_episode, WIP on consolidate 2024-10-21 20:10:13 +02:00
Simon Alibert
299451af81 Add add_episode & task logic 2024-10-21 19:30:20 +02:00
Simon Alibert
9ebf8b88ec Merge remote-tracking branch 'origin/main' into user/aliberts/2024_09_25_reshape_dataset 2024-10-21 00:22:30 +02:00
Simon Alibert
c1232a01e2 Add add_frame, empty dataset creation 2024-10-21 00:16:52 +02:00
Simon Alibert
3b925c3dce Add ImageWriter 2024-10-21 00:15:09 +02:00
Simon Alibert
e46bdb9d30 Change card creation 2024-10-20 14:01:10 +02:00
Simon Alibert
9316cf46ef Add file paths 2024-10-20 14:00:19 +02:00
Simon Alibert
ac3798bd62 Move default paths, use jsonlines for tasks 2024-10-18 17:53:25 +02:00
Simon Alibert
bce3dc3bfa Add load_metadata 2024-10-18 14:59:09 +02:00
Simon Alibert
1a51505ec6 Add download_metadata, move default paths 2024-10-18 14:48:34 +02:00
Simon Alibert
e7355ba595 Fix episodes.jsonl 2024-10-18 12:03:29 +02:00
Simon Alibert
91e8ce772b Remove caret requirement 2024-10-18 09:48:38 +02:00
Simon Alibert
3a9f964429 Add copyrights 2024-10-18 00:31:21 +02:00
Simon Alibert
beacb7e957 Cleanup 2024-10-18 00:30:16 +02:00
Simon Alibert
be64d54bd9 Update doc 2024-10-18 00:29:50 +02:00
Simon Alibert
354f37aed3 Merge remote-tracking branch 'origin/main' into user/aliberts/2024_09_25_reshape_dataset 2024-10-17 23:34:00 +02:00
Simon Alibert
d0d8193625 Add unitreeh and umi 2024-10-17 23:33:51 +02:00
Simon Alibert
7242c57400 Cleanup 2024-10-17 16:08:37 +02:00
Simon Alibert
3ee3739edc Add batch conversion log 2024-10-17 13:08:58 +02:00
Simon Alibert
ad3f112d16 Add fixes for lfs tracking 2024-10-17 12:58:48 +02:00
Simon Alibert
50a75ad3fe Write episodes as jsonlines 2024-10-17 10:17:27 +02:00
Simon Alibert
c146ba936f Add episode chunks logic, move_videos & lfs tracking fix 2024-10-16 23:34:54 +02:00
Simon Alibert
110264000f Add fixes for batch convert 2024-10-15 19:03:11 +02:00
Simon Alibert
9433ac52ec WIP add batch convert 2024-10-15 13:08:31 +02:00
Simon Alibert
da78bbfd16 Update load_tasks doc 2024-10-15 11:06:28 +02:00
Simon Alibert
835ab5a81b Cleanup, fix load_tasks 2024-10-15 11:05:16 +02:00
Simon Alibert
f96773de10 Fix safe_version 2024-10-14 13:51:40 +02:00
Simon Alibert
cbc51e1341 Extend v1 compatibility 2024-10-14 10:14:27 +02:00
Simon Alibert
cf633344be Add multitask support, refactor conversion script 2024-10-13 21:21:40 +02:00
Simon Alibert
8bd406e607 Add suggestions from code review 2024-10-11 18:52:11 +02:00
Simon Alibert
3ea53124e0 Add padding keys and download_data option 2024-10-11 17:38:47 +02:00
Simon Alibert
7f680886b0 Add huggingface-hub patch for offline snapshot_download with local_dir 2024-10-11 11:03:11 +02:00
Simon Alibert
6d2bc11365 Add doc, scrap video_frame_keys attribute 2024-10-11 10:59:38 +02:00
Simon Alibert
b417cebc4e Update LeRobotDataset.__get_item__ 2024-10-10 21:32:14 +02:00
Simon Alibert
3113038beb Merge remote-tracking branch 'origin/main' into user/aliberts/2024_09_25_reshape_dataset 2024-10-09 14:33:58 +02:00
Simon Alibert
096824b5ff Rework LeRobotDataset.__init__ 2024-10-09 14:33:26 +02:00
Simon Alibert
2d75b93ba0 Update info.json format 2024-10-08 15:31:37 +02:00
Simon Alibert
21ba4b5263 Add pixel channels 2024-10-06 11:16:49 +02:00
Simon Alibert
028c17fd48 Merge remote-tracking branch 'origin/main' into user/aliberts/2024_09_25_reshape_dataset 2024-10-04 18:59:41 +02:00
Simon Alibert
07e113ce21 Add info.json link 2024-10-04 14:36:11 +02:00
Simon Alibert
1016a983a1 Add upload folders 2024-10-04 14:26:50 +02:00
Simon Alibert
17a1214e25 Merge remote-tracking branch 'origin/main' into user/aliberts/2024_09_25_reshape_dataset 2024-10-04 11:22:22 +02:00
Simon Alibert
ad115b6c27 WIP 2024-10-03 20:00:44 +02:00
44 changed files with 2097 additions and 2533 deletions

View File

@@ -21,7 +21,7 @@ Provide a simple way for the reviewer to try out your changes.
Examples:
```bash
pytest -sx tests/test_stuff.py::test_something
DATA_DIR=tests/data pytest -sx tests/test_stuff.py::test_something
```
```bash
python lerobot/scripts/train.py --some.option=true

View File

@@ -7,8 +7,10 @@ on:
schedule:
- cron: "0 2 * * *"
# env:
env:
DATA_DIR: tests/data
# SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
jobs:
run_all_tests_cpu:
name: CPU
@@ -28,9 +30,13 @@ jobs:
working-directory: /lerobot
steps:
- name: Tests
env:
DATA_DIR: tests/data
run: pytest -v --cov=./lerobot --disable-warnings tests
- name: Tests end-to-end
env:
DATA_DIR: tests/data
run: make test-end-to-end

View File

@@ -29,6 +29,7 @@ jobs:
name: Pytest
runs-on: ubuntu-latest
env:
DATA_DIR: tests/data
MUJOCO_GL: egl
steps:
- uses: actions/checkout@v4
@@ -69,6 +70,7 @@ jobs:
name: Pytest (minimal install)
runs-on: ubuntu-latest
env:
DATA_DIR: tests/data
MUJOCO_GL: egl
steps:
- uses: actions/checkout@v4
@@ -102,38 +104,39 @@ jobs:
&& rm -rf tests/outputs outputs
# TODO(aliberts, rcadene): redesign after v2 migration / removing hydra
# end-to-end:
# name: End-to-end
# runs-on: ubuntu-latest
# env:
# MUJOCO_GL: egl
# steps:
# - uses: actions/checkout@v4
# with:
# lfs: true # Ensure LFS files are pulled
end-to-end:
name: End-to-end
runs-on: ubuntu-latest
env:
DATA_DIR: tests/data
MUJOCO_GL: egl
steps:
- uses: actions/checkout@v4
with:
lfs: true # Ensure LFS files are pulled
# - name: Install apt dependencies
# # portaudio19-dev is needed to install pyaudio
# run: |
# sudo apt-get update && \
# sudo apt-get install -y libegl1-mesa-dev portaudio19-dev
- name: Install apt dependencies
# portaudio19-dev is needed to install pyaudio
run: |
sudo apt-get update && \
sudo apt-get install -y libegl1-mesa-dev portaudio19-dev
# - name: Install poetry
# run: |
# pipx install poetry && poetry config virtualenvs.in-project true
# echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH
- name: Install poetry
run: |
pipx install poetry && poetry config virtualenvs.in-project true
echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH
# - name: Set up Python 3.10
# uses: actions/setup-python@v5
# with:
# python-version: "3.10"
# cache: "poetry"
- name: Set up Python 3.10
uses: actions/setup-python@v5
with:
python-version: "3.10"
cache: "poetry"
# - name: Install poetry dependencies
# run: |
# poetry install --all-extras
- name: Install poetry dependencies
run: |
poetry install --all-extras
# - name: Test end-to-end
# run: |
# make test-end-to-end \
# && rm -rf outputs
- name: Test end-to-end
run: |
make test-end-to-end \
&& rm -rf outputs

View File

@@ -3,7 +3,7 @@ default_language_version:
python: python3.10
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
rev: v4.6.0
hooks:
- id: check-added-large-files
- id: debug-statements
@@ -14,11 +14,11 @@ repos:
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/asottile/pyupgrade
rev: v3.19.0
rev: v3.16.0
hooks:
- id: pyupgrade
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.8.2
rev: v0.5.2
hooks:
- id: ruff
args: [--fix]
@@ -32,6 +32,6 @@ repos:
- "--check"
- "--no-update"
- repo: https://github.com/gitleaks/gitleaks
rev: v8.21.2
rev: v8.18.4
hooks:
- id: gitleaks

View File

@@ -267,7 +267,7 @@ We use `pytest` in order to run the tests. From the root of the
repository, here's how to run tests with `pytest` for the library:
```bash
python -m pytest -sv ./tests
DATA_DIR="tests/data" python -m pytest -sv ./tests
```

View File

@@ -153,12 +153,10 @@ python lerobot/scripts/visualize_dataset.py \
--episode-index 0
```
or from a dataset in a local folder with the `root` option and the `--local-files-only` (in the following case the dataset will be searched for in `./my_local_data_dir/lerobot/pusht`)
or from a dataset in a local folder with the root `DATA_DIR` environment variable (in the following case the dataset will be searched for in `./my_local_data_dir/lerobot/pusht`)
```bash
python lerobot/scripts/visualize_dataset.py \
DATA_DIR='./my_local_data_dir' python lerobot/scripts/visualize_dataset.py \
--repo-id lerobot/pusht \
--root ./my_local_data_dir \
--local-files-only 1 \
--episode-index 0
```
@@ -210,10 +208,12 @@ dataset attributes:
A `LeRobotDataset` is serialised using several widespread file formats for each of its parts, namely:
- hf_dataset stored using Hugging Face datasets library serialization to parquet
- videos are stored in mp4 format to save space
- metadata are stored in plain json/jsonl files
- videos are stored in mp4 format to save space or png files
- episode_data_index saved using `safetensor` tensor serialization format
- stats saved using `safetensor` tensor serialization format
- info are saved using JSON
Dataset can be uploaded/downloaded from the HuggingFace hub seamlessly. To work on a local dataset, you can use the `local_files_only` argument and specify its location with the `root` argument if it's not in the default `~/.cache/huggingface/lerobot` location.
Dataset can be uploaded/downloaded from the HuggingFace hub seamlessly. To work on a local dataset, you can set the `DATA_DIR` environment variable to your root dataset folder as illustrated in the above section on dataset visualization.
### Evaluate a pretrained policy

View File

@@ -192,6 +192,7 @@ Record 2 episodes and upload your dataset to the hub:
python lerobot/scripts/control_robot.py record \
--robot-path lerobot/configs/robot/so100.yaml \
--fps 30 \
--root data \
--repo-id ${HF_USER}/so100_test \
--tags so100 tutorial \
--warmup-time-s 5 \
@@ -211,6 +212,7 @@ echo ${HF_USER}/so100_test
If you didn't upload with `--push-to-hub 0`, you can also visualize it locally with:
```bash
python lerobot/scripts/visualize_dataset_html.py \
--root data \
--repo-id ${HF_USER}/so100_test
```
@@ -218,9 +220,10 @@ python lerobot/scripts/visualize_dataset_html.py \
Now try to replay the first episode on your robot:
```bash
python lerobot/scripts/control_robot.py replay \
DATA_DIR=data python lerobot/scripts/control_robot.py replay \
--robot-path lerobot/configs/robot/so100.yaml \
--fps 30 \
--root data \
--repo-id ${HF_USER}/so100_test \
--episode 0
```
@@ -229,7 +232,7 @@ python lerobot/scripts/control_robot.py replay \
To train a policy to control your robot, use the [`python lerobot/scripts/train.py`](../lerobot/scripts/train.py) script. A few arguments are required. Here is an example command:
```bash
python lerobot/scripts/train.py \
DATA_DIR=data python lerobot/scripts/train.py \
dataset_repo_id=${HF_USER}/so100_test \
policy=act_so100_real \
env=so100_real \
@@ -245,6 +248,7 @@ Let's explain it:
3. We provided an environment as argument with `env=so100_real`. This loads configurations from [`lerobot/configs/env/so100_real.yaml`](../lerobot/configs/env/so100_real.yaml).
4. We provided `device=cuda` since we are training on a Nvidia GPU, but you can also use `device=mps` if you are using a Mac with Apple silicon, or `device=cpu` otherwise.
5. We provided `wandb.enable=true` to use [Weights and Biases](https://docs.wandb.ai/quickstart) for visualizing training plots. This is optional but if you use it, make sure you are logged in by running `wandb login`.
6. We added `DATA_DIR=data` to access your dataset stored in your local `data` directory. If you dont provide `DATA_DIR`, your dataset will be downloaded from Hugging Face hub to your cache folder `$HOME/.cache/hugginface`. In future versions of `lerobot`, both directories will be in sync.
Training should take several hours. You will find checkpoints in `outputs/train/act_so100_test/checkpoints`.
@@ -255,6 +259,7 @@ You can use the `record` function from [`lerobot/scripts/control_robot.py`](../l
python lerobot/scripts/control_robot.py record \
--robot-path lerobot/configs/robot/so100.yaml \
--fps 30 \
--root data \
--repo-id ${HF_USER}/eval_act_so100_test \
--tags so100 tutorial eval \
--warmup-time-s 5 \

View File

@@ -192,6 +192,7 @@ Record 2 episodes and upload your dataset to the hub:
python lerobot/scripts/control_robot.py record \
--robot-path lerobot/configs/robot/moss.yaml \
--fps 30 \
--root data \
--repo-id ${HF_USER}/moss_test \
--tags moss tutorial \
--warmup-time-s 5 \
@@ -211,6 +212,7 @@ echo ${HF_USER}/moss_test
If you didn't upload with `--push-to-hub 0`, you can also visualize it locally with:
```bash
python lerobot/scripts/visualize_dataset_html.py \
--root data \
--repo-id ${HF_USER}/moss_test
```
@@ -218,9 +220,10 @@ python lerobot/scripts/visualize_dataset_html.py \
Now try to replay the first episode on your robot:
```bash
python lerobot/scripts/control_robot.py replay \
DATA_DIR=data python lerobot/scripts/control_robot.py replay \
--robot-path lerobot/configs/robot/moss.yaml \
--fps 30 \
--root data \
--repo-id ${HF_USER}/moss_test \
--episode 0
```
@@ -229,7 +232,7 @@ python lerobot/scripts/control_robot.py replay \
To train a policy to control your robot, use the [`python lerobot/scripts/train.py`](../lerobot/scripts/train.py) script. A few arguments are required. Here is an example command:
```bash
python lerobot/scripts/train.py \
DATA_DIR=data python lerobot/scripts/train.py \
dataset_repo_id=${HF_USER}/moss_test \
policy=act_moss_real \
env=moss_real \
@@ -245,6 +248,7 @@ Let's explain it:
3. We provided an environment as argument with `env=moss_real`. This loads configurations from [`lerobot/configs/env/moss_real.yaml`](../lerobot/configs/env/moss_real.yaml).
4. We provided `device=cuda` since we are training on a Nvidia GPU, but you can also use `device=mps` if you are using a Mac with Apple silicon, or `device=cpu` otherwise.
5. We provided `wandb.enable=true` to use [Weights and Biases](https://docs.wandb.ai/quickstart) for visualizing training plots. This is optional but if you use it, make sure you are logged in by running `wandb login`.
6. We added `DATA_DIR=data` to access your dataset stored in your local `data` directory. If you dont provide `DATA_DIR`, your dataset will be downloaded from Hugging Face hub to your cache folder `$HOME/.cache/hugginface`. In future versions of `lerobot`, both directories will be in sync.
Training should take several hours. You will find checkpoints in `outputs/train/act_moss_test/checkpoints`.
@@ -255,6 +259,7 @@ You can use the `record` function from [`lerobot/scripts/control_robot.py`](../l
python lerobot/scripts/control_robot.py record \
--robot-path lerobot/configs/robot/moss.yaml \
--fps 30 \
--root data \
--repo-id ${HF_USER}/eval_act_moss_test \
--tags moss tutorial eval \
--warmup-time-s 5 \

View File

@@ -1,83 +0,0 @@
# Training a HIL-SERL Reward Classifier with LeRobot
This tutorial provides step-by-step instructions for training a reward classifier using LeRobot.
---
## Training Script Overview
LeRobot includes a ready-to-use training script located at [`lerobot/scripts/train_hilserl_classifier.py`](../../lerobot/scripts/train_hilserl_classifier.py). Here's an outline of its workflow:
1. **Configuration Loading**
The script uses Hydra to load a configuration file for subsequent steps. (Details on Hydra follow below.)
2. **Dataset Initialization**
It loads a `LeRobotDataset` containing images and rewards. To optimize performance, a weighted random sampler is used to balance class sampling.
3. **Classifier Initialization**
A lightweight classification head is built on top of a frozen, pretrained image encoder from HuggingFace. The classifier outputs either:
- A single probability (binary classification), or
- Logits (multi-class classification).
4. **Training Loop Execution**
The script performs:
- Forward and backward passes,
- Optimization steps,
- Periodic logging, evaluation, and checkpoint saving.
---
## Configuring with Hydra
For detailed information about Hydra usage, refer to [`examples/4_train_policy_with_script.md`](../examples/4_train_policy_with_script.md). However, note that training the reward classifier differs slightly and requires a separate configuration file.
### Config File Setup
The default `default.yaml` cannot launch the reward classifier training directly. Instead, you need a configuration file like [`lerobot/configs/policy/hilserl_classifier.yaml`](../../lerobot/configs/policy/hilserl_classifier.yaml), with the following adjustment:
Replace the `dataset_repo_id` field with the identifier for your dataset, which contains images and sparse rewards:
```yaml
# Example: lerobot/configs/policy/reward_classifier.yaml
dataset_repo_id: "my_dataset_repo_id"
## Typical logs and metrics
```
When you start the training process, you will first see your full configuration being printed in the terminal. You can check it to make sure that you config it correctly and your config is not overrided by other files. The final configuration will also be saved with the checkpoint.
After that, you will see training log like this one:
```
[2024-11-29 18:26:36,999][root][INFO] -
Epoch 5/5
Training: 82%|██████████████████████████████████████████████████████████████████████████████▋ | 91/111 [00:50<00:09, 2.04it/s, loss=0.2999, acc=69.99%]
```
or evaluation log like:
```
Validation: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:20<00:00, 1.37it/s]
```
### Metrics Tracking with Weights & Biases (WandB)
If `wandb.enable` is set to `true`, the training and evaluation logs will also be saved in WandB. This allows you to track key metrics in real-time, including:
- **Training Metrics**:
- `train/accuracy`
- `train/loss`
- `train/dataloading_s`
- **Evaluation Metrics**:
- `eval/accuracy`
- `eval/loss`
- `eval/eval_s`
#### Additional Features
You can also log sample predictions during evaluation. Each logged sample will include:
- The **input image**.
- The **predicted label**.
- The **true label**.
- The **classifier's "confidence" (logits/probability)**.
These logs can be useful for diagnosing and debugging performance issues.

View File

@@ -10,10 +10,10 @@ from torchvision.transforms import ToPILImage, v2
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
dataset_repo_id = "lerobot/aloha_static_screw_driver"
dataset_repo_id = "lerobot/aloha_static_tape"
# Create a LeRobotDataset with no transformations
dataset = LeRobotDataset(dataset_repo_id, episodes=[0])
dataset = LeRobotDataset(dataset_repo_id)
# This is equivalent to `dataset = LeRobotDataset(dataset_repo_id, image_transforms=None)`
# Get the index of the first observation in the first episode
@@ -28,13 +28,12 @@ transforms = v2.Compose(
[
v2.ColorJitter(brightness=(0.5, 1.5)),
v2.ColorJitter(contrast=(0.5, 1.5)),
v2.ColorJitter(hue=(-0.1, 0.1)),
v2.RandomAdjustSharpness(sharpness_factor=2, p=1),
]
)
# Create another LeRobotDataset with the defined transformations
transformed_dataset = LeRobotDataset(dataset_repo_id, episodes=[0], image_transforms=transforms)
transformed_dataset = LeRobotDataset(dataset_repo_id, image_transforms=transforms)
# Get a frame from the transformed dataset
transformed_frame = transformed_dataset[first_idx][transformed_dataset.meta.camera_keys[0]]

View File

@@ -29,7 +29,7 @@ For a visual walkthrough of the assembly process, you can refer to [this video t
## 2. Configure motors, calibrate arms, teleoperate your Koch v1.1
First, install the additional dependencies required for robots built with dynamixel motors like Koch v1.1 by running one of the following commands (make sure gcc is installed).
First, install the additional dependencies required for robots built with dynamixel motors like Koch v1.1 by running one of the following commands.
Using `pip`:
```bash
@@ -778,6 +778,7 @@ Now run this to record 2 episodes:
python lerobot/scripts/control_robot.py record \
--robot-path lerobot/configs/robot/koch.yaml \
--fps 30 \
--root data \
--repo-id ${HF_USER}/koch_test \
--tags tutorial \
--warmup-time-s 5 \
@@ -786,7 +787,7 @@ python lerobot/scripts/control_robot.py record \
--num-episodes 2
```
This will write your dataset locally to `~/.cache/huggingface/lerobot/{repo-id}` (e.g. `data/cadene/koch_test`) and push it on the hub at `https://huggingface.co/datasets/{HF_USER}/{repo-id}`. Your dataset will be automatically tagged with `LeRobot` for the community to find it easily, and you can also add custom tags (in this case `tutorial` for example).
This will write your dataset locally to `{root}/{repo-id}` (e.g. `data/cadene/koch_test`) and push it on the hub at `https://huggingface.co/datasets/{HF_USER}/{repo-id}`. Your dataset will be automatically tagged with `LeRobot` for the community to find it easily, and you can also add custom tags (in this case `tutorial` for example).
You can look for other LeRobot datasets on the hub by searching for `LeRobot` tags: https://huggingface.co/datasets?other=LeRobot
@@ -839,6 +840,7 @@ In the coming months, we plan to release a foundational model for robotics. We a
You can visualize your dataset by running:
```bash
python lerobot/scripts/visualize_dataset_html.py \
--root data \
--repo-id ${HF_USER}/koch_test
```
@@ -856,6 +858,7 @@ To replay the first episode of the dataset you just recorded, run the following
python lerobot/scripts/control_robot.py replay \
--robot-path lerobot/configs/robot/koch.yaml \
--fps 30 \
--root data \
--repo-id ${HF_USER}/koch_test \
--episode 0
```
@@ -868,7 +871,7 @@ Your robot should replicate movements similar to those you recorded. For example
To train a policy to control your robot, use the [`python lerobot/scripts/train.py`](../lerobot/scripts/train.py) script. A few arguments are required. Here is an example command:
```bash
python lerobot/scripts/train.py \
DATA_DIR=data python lerobot/scripts/train.py \
dataset_repo_id=${HF_USER}/koch_test \
policy=act_koch_real \
env=koch_real \
@@ -915,6 +918,7 @@ env:
It should match your dataset (e.g. `fps: 30`) and your robot (e.g. `state_dim: 6` and `action_dim: 6`). We are still working on simplifying this in future versions of `lerobot`.
4. We provided `device=cuda` since we are training on a Nvidia GPU, but you could use `device=mps` to train on Apple silicon.
5. We provided `wandb.enable=true` to use [Weights and Biases](https://docs.wandb.ai/quickstart) for visualizing training plots. This is optional but if you use it, make sure you are logged in by running `wandb login`.
6. We added `DATA_DIR=data` to access your dataset stored in your local `data` directory. If you dont provide `DATA_DIR`, your dataset will be downloaded from Hugging Face hub to your cache folder `$HOME/.cache/hugginface`. In future versions of `lerobot`, both directories will be in sync.
For more information on the `train` script see the previous tutorial: [`examples/4_train_policy_with_script.md`](../examples/4_train_policy_with_script.md)
@@ -987,6 +991,7 @@ To this end, you can use the `record` function from [`lerobot/scripts/control_ro
python lerobot/scripts/control_robot.py record \
--robot-path lerobot/configs/robot/koch.yaml \
--fps 30 \
--root data \
--repo-id ${HF_USER}/eval_koch_test \
--tags tutorial eval \
--warmup-time-s 5 \
@@ -1005,6 +1010,7 @@ As you can see, it's almost the same command as previously used to record your t
You can then visualize your evaluation dataset by running the same command as before but with the new inference dataset as argument:
```bash
python lerobot/scripts/visualize_dataset.py \
--root data \
--repo-id ${HF_USER}/eval_koch_test
```

View File

@@ -128,6 +128,7 @@ Record one episode:
python lerobot/scripts/control_robot.py record \
--robot-path lerobot/configs/robot/stretch.yaml \
--fps 20 \
--root data \
--repo-id ${HF_USER}/stretch_test \
--tags stretch tutorial \
--warmup-time-s 3 \
@@ -145,6 +146,7 @@ Now try to replay this episode (make sure the robot's initial position is the sa
python lerobot/scripts/control_robot.py replay \
--robot-path lerobot/configs/robot/stretch.yaml \
--fps 20 \
--root data \
--repo-id ${HF_USER}/stretch_test \
--episode 0
```

View File

@@ -56,7 +56,7 @@ python lerobot/scripts/control_robot.py teleoperate \
--robot-overrides max_relative_target=5
```
By adding `--robot-overrides max_relative_target=5`, we override the default value for `max_relative_target` defined in `lerobot/configs/robot/aloha.yaml`. It is expected to be `5` to limit the magnitude of the movement for more safety, but the teleoperation won't be smooth. When you feel confident, you can disable this limit by adding `--robot-overrides max_relative_target=null` to the command line:
By adding `--robot-overrides max_relative_target=5`, we override the default value for `max_relative_target` defined in `lerobot/configs/robot/aloha.yaml`. It is expected to be `5` to limit the magnitude of the movement for more safety, but the teloperation won't be smooth. When you feel confident, you can disable this limit by adding `--robot-overrides max_relative_target=null` to the command line:
```bash
python lerobot/scripts/control_robot.py teleoperate \
--robot-path lerobot/configs/robot/aloha.yaml \
@@ -84,6 +84,7 @@ python lerobot/scripts/control_robot.py record \
--robot-path lerobot/configs/robot/aloha.yaml \
--robot-overrides max_relative_target=null \
--fps 30 \
--root data \
--repo-id ${HF_USER}/aloha_test \
--tags aloha tutorial \
--warmup-time-s 5 \
@@ -103,6 +104,7 @@ echo ${HF_USER}/aloha_test
If you didn't upload with `--push-to-hub 0`, you can also visualize it locally with:
```bash
python lerobot/scripts/visualize_dataset_html.py \
--root data \
--repo-id ${HF_USER}/aloha_test
```
@@ -117,6 +119,7 @@ python lerobot/scripts/control_robot.py replay \
--robot-path lerobot/configs/robot/aloha.yaml \
--robot-overrides max_relative_target=null \
--fps 30 \
--root data \
--repo-id ${HF_USER}/aloha_test \
--episode 0
```
@@ -125,7 +128,7 @@ python lerobot/scripts/control_robot.py replay \
To train a policy to control your robot, use the [`python lerobot/scripts/train.py`](../lerobot/scripts/train.py) script. A few arguments are required. Here is an example command:
```bash
python lerobot/scripts/train.py \
DATA_DIR=data python lerobot/scripts/train.py \
dataset_repo_id=${HF_USER}/aloha_test \
policy=act_aloha_real \
env=aloha_real \
@@ -141,6 +144,7 @@ Let's explain it:
3. We provided an environment as argument with `env=aloha_real`. This loads configurations from [`lerobot/configs/env/aloha_real.yaml`](../lerobot/configs/env/aloha_real.yaml). Note: this yaml defines 18 dimensions for the `state_dim` and `action_dim`, corresponding to 18 motors, not 14 motors as used in previous Aloha work. This is because, we include the `shoulder_shadow` and `elbow_shadow` motors for simplicity.
4. We provided `device=cuda` since we are training on a Nvidia GPU.
5. We provided `wandb.enable=true` to use [Weights and Biases](https://docs.wandb.ai/quickstart) for visualizing training plots. This is optional but if you use it, make sure you are logged in by running `wandb login`.
6. We added `DATA_DIR=data` to access your dataset stored in your local `data` directory. If you dont provide `DATA_DIR`, your dataset will be downloaded from Hugging Face hub to your cache folder `$HOME/.cache/hugginface`. In future versions of `lerobot`, both directories will be in sync.
Training should take several hours. You will find checkpoints in `outputs/train/act_aloha_test/checkpoints`.
@@ -152,6 +156,7 @@ python lerobot/scripts/control_robot.py record \
--robot-path lerobot/configs/robot/aloha.yaml \
--robot-overrides max_relative_target=null \
--fps 30 \
--root data \
--repo-id ${HF_USER}/eval_act_aloha_test \
--tags aloha tutorial eval \
--warmup-time-s 5 \

View File

@@ -63,7 +63,7 @@ def build_features(mode: str) -> dict:
return features
def load_raw_dataset(zarr_path: Path):
def load_raw_dataset(zarr_path: Path, load_images: bool = True):
try:
from lerobot.common.datasets.push_dataset_to_hub._diffusion_policy_replay_buffer import (
ReplayBuffer as DiffusionPolicyReplayBuffer,

View File

@@ -28,7 +28,7 @@ def safe_stop_image_writer(func):
try:
return func(*args, **kwargs)
except Exception as e:
dataset = kwargs.get("dataset")
dataset = kwargs.get("dataset", None)
image_writer = getattr(dataset, "image_writer", None) if dataset else None
if image_writer is not None:
print("Waiting for image writer to terminate...")

View File

@@ -291,14 +291,14 @@ class LeRobotDatasetMetadata:
obj.root.mkdir(parents=True, exist_ok=False)
if robot is not None:
features = {**(features or {}), **get_features_from_robot(robot)}
features = get_features_from_robot(robot, use_videos)
robot_type = robot.robot_type
if not all(cam.fps == fps for cam in robot.cameras.values()):
logging.warning(
f"Some cameras in your {robot.robot_type} robot don't have an fps matching the fps of your dataset."
"In this case, frames from lower fps cameras will be repeated to fill in the blanks."
)
elif features is None:
elif robot_type is None or features is None:
raise ValueError(
"Dataset features must either come from a Robot or explicitly passed upon creation."
)

View File

@@ -0,0 +1,639 @@
OPENX_DATASET_CONFIGS:
fractal20220817_data:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- base_pose_tool_reached
- gripper_closed
fps: 3
kuka:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- clip_function_input/base_pose_tool_reached
- gripper_closed
fps: 10
bridge_openx:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- EEF_state
- gripper_state
fps: 5
taco_play:
image_obs_keys:
- rgb_static
- rgb_gripper
depth_obs_keys:
- depth_static
- depth_gripper
state_obs_keys:
- state_eef
- state_gripper
fps: 15
jaco_play:
image_obs_keys:
- image
- image_wrist
depth_obs_keys:
- null
state_obs_keys:
- state_eef
- state_gripper
fps: 10
berkeley_cable_routing:
image_obs_keys:
- image
- top_image
- wrist45_image
- wrist225_image
depth_obs_keys:
- null
state_obs_keys:
- robot_state
fps: 10
roboturk:
image_obs_keys:
- front_rgb
depth_obs_keys:
- null
state_obs_keys:
- null
fps: 10
nyu_door_opening_surprising_effectiveness:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- null
fps: 3
viola:
image_obs_keys:
- agentview_rgb
- eye_in_hand_rgb
depth_obs_keys:
- null
state_obs_keys:
- joint_states
- gripper_states
fps: 20
berkeley_autolab_ur5:
image_obs_keys:
- image
- hand_image
depth_obs_keys:
- image_with_depth
state_obs_keys:
- state
fps: 5
toto:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- state
fps: 30
language_table:
image_obs_keys:
- rgb
depth_obs_keys:
- null
state_obs_keys:
- effector_translation
fps: 10
columbia_cairlab_pusht_real:
image_obs_keys:
- image
- wrist_image
depth_obs_keys:
- null
state_obs_keys:
- robot_state
fps: 10
stanford_kuka_multimodal_dataset_converted_externally_to_rlds:
image_obs_keys:
- image
depth_obs_keys:
- depth_image
state_obs_keys:
- ee_position
- ee_orientation
fps: 20
nyu_rot_dataset_converted_externally_to_rlds:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- eef_state
- gripper_state
fps: 3
io_ai_tech:
image_obs_keys:
- image
- image_fisheye
- image_left_side
- image_right_side
depth_obs_keys:
- null
state_obs_keys:
- state
fps: 3
stanford_hydra_dataset_converted_externally_to_rlds:
image_obs_keys:
- image
- wrist_image
depth_obs_keys:
- null
state_obs_keys:
- eef_state
- gripper_state
fps: 10
austin_buds_dataset_converted_externally_to_rlds:
image_obs_keys:
- image
- wrist_image
depth_obs_keys:
- null
state_obs_keys:
- state
fps: 20
nyu_franka_play_dataset_converted_externally_to_rlds:
image_obs_keys:
- image
- image_additional_view
depth_obs_keys:
- depth
- depth_additional_view
state_obs_keys:
- eef_state
fps: 3
maniskill_dataset_converted_externally_to_rlds:
image_obs_keys:
- image
- wrist_image
depth_obs_keys:
- depth
- wrist_depth
state_obs_keys:
- tcp_pose
- gripper_state
fps: 20
furniture_bench_dataset_converted_externally_to_rlds:
image_obs_keys:
- image
- wrist_image
depth_obs_keys:
- null
state_obs_keys:
- state
fps: 10
cmu_franka_exploration_dataset_converted_externally_to_rlds:
image_obs_keys:
- highres_image
depth_obs_keys:
- null
state_obs_keys:
- null
fps: 10
ucsd_kitchen_dataset_converted_externally_to_rlds:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- joint_state
fps: 2
ucsd_pick_and_place_dataset_converted_externally_to_rlds:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- eef_state
- gripper_state
fps: 3
spoc:
image_obs_keys:
- image
- image_manipulation
depth_obs_keys:
- null
state_obs_keys:
- null
fps: 3
austin_sailor_dataset_converted_externally_to_rlds:
image_obs_keys:
- image
- wrist_image
depth_obs_keys:
- null
state_obs_keys:
- state
fps: 20
austin_sirius_dataset_converted_externally_to_rlds:
image_obs_keys:
- image
- wrist_image
depth_obs_keys:
- null
state_obs_keys:
- state
fps: 20
bc_z:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- present/xyz
- present/axis_angle
- present/sensed_close
fps: 10
utokyo_pr2_opening_fridge_converted_externally_to_rlds:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- eef_state
- gripper_state
fps: 10
utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- eef_state
- gripper_state
fps: 10
utokyo_xarm_pick_and_place_converted_externally_to_rlds:
image_obs_keys:
- image
- image2
- hand_image
depth_obs_keys:
- null
state_obs_keys:
- end_effector_pose
fps: 10
utokyo_xarm_bimanual_converted_externally_to_rlds:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- pose_r
fps: 10
robo_net:
image_obs_keys:
- image
- image1
depth_obs_keys:
- null
state_obs_keys:
- eef_state
- gripper_state
fps: 1
robo_set:
image_obs_keys:
- image_left
- image_right
- image_wrist
depth_obs_keys:
- null
state_obs_keys:
- state
- state_velocity
fps: 5
berkeley_mvp_converted_externally_to_rlds:
image_obs_keys:
- hand_image
depth_obs_keys:
- null
state_obs_keys:
- gripper
- pose
- joint_pos
fps: 5
berkeley_rpt_converted_externally_to_rlds:
image_obs_keys:
- hand_image
depth_obs_keys:
- null
state_obs_keys:
- joint_pos
- gripper
fps: 30
kaist_nonprehensile_converted_externally_to_rlds:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- state
fps: 10
stanford_mask_vit_converted_externally_to_rlds:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- eef_state
- gripper_state
tokyo_u_lsmo_converted_externally_to_rlds:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- eef_state
- gripper_state
fps: 10
dlr_sara_pour_converted_externally_to_rlds:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- state
fps: 10
dlr_sara_grid_clamp_converted_externally_to_rlds:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- state
fps: 10
dlr_edan_shared_control_converted_externally_to_rlds:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- state
fps: 5
asu_table_top_converted_externally_to_rlds:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- eef_state
- gripper_state
fps: 12.5
stanford_robocook_converted_externally_to_rlds:
image_obs_keys:
- image_1
- image_2
depth_obs_keys:
- depth_1
- depth_2
state_obs_keys:
- eef_state
- gripper_state
fps: 5
imperialcollege_sawyer_wrist_cam:
image_obs_keys:
- image
- wrist_image
depth_obs_keys:
- null
state_obs_keys:
- state
fps: 10
iamlab_cmu_pickup_insert_converted_externally_to_rlds:
image_obs_keys:
- image
- wrist_image
depth_obs_keys:
- null
state_obs_keys:
- joint_state
- gripper_state
fps: 20
uiuc_d3field:
image_obs_keys:
- image_1
- image_2
depth_obs_keys:
- depth_1
- depth_2
state_obs_keys:
- null
fps: 1
utaustin_mutex:
image_obs_keys:
- image
- wrist_image
depth_obs_keys:
- null
state_obs_keys:
- state
fps: 20
berkeley_fanuc_manipulation:
image_obs_keys:
- image
- wrist_image
depth_obs_keys:
- null
state_obs_keys:
- joint_state
- gripper_state
fps: 10
cmu_playing_with_food:
image_obs_keys:
- image
- finger_vision_1
depth_obs_keys:
- null
state_obs_keys:
- state
fps: 10
cmu_play_fusion:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- state
fps: 5
cmu_stretch:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- eef_state
- gripper_state
fps: 10
berkeley_gnm_recon:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- state
- position
- yaw
fps: 3
berkeley_gnm_cory_hall:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- state
- position
- yaw
fps: 5
berkeley_gnm_sac_son:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- state
- position
- yaw
fps: 10
droid:
image_obs_keys:
- exterior_image_1_left
- exterior_image_2_left
- wrist_image_left
depth_obs_keys:
- null
state_obs_keys:
- proprio
fps: 15
droid_100:
image_obs_keys:
- exterior_image_1_left
- exterior_image_2_left
- wrist_image_left
depth_obs_keys:
- null
state_obs_keys:
- proprio
fps: 15
fmb:
image_obs_keys:
- image_side_1
- image_side_2
- image_wrist_1
- image_wrist_2
depth_obs_keys:
- image_side_1_depth
- image_side_2_depth
- image_wrist_1_depth
- image_wrist_2_depth
state_obs_keys:
- proprio
fps: 10
dobbe:
image_obs_keys:
- wrist_image
depth_obs_keys:
- null
state_obs_keys:
- proprio
fps: 3.75
usc_cloth_sim_converted_externally_to_rlds:
image_obs_keys:
- image
depth_obs_keys:
- null
state_obs_keys:
- null
fps: 10
plex_robosuite:
image_obs_keys:
- image
- wrist_image
depth_obs_keys:
- null
state_obs_keys:
- state
fps: 20
conq_hose_manipulation:
image_obs_keys:
- frontleft_fisheye_image
- frontright_fisheye_image
- hand_color_image
depth_obs_keys:
- null
state_obs_keys:
- state
fps: 30

View File

@@ -0,0 +1,106 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the Licens e.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
NOTE(YL): Adapted from:
Octo: https://github.com/octo-models/octo/blob/main/octo/data/utils/data_utils.py
data_utils.py
Additional utils for data processing.
"""
from typing import Any, Dict, List
import tensorflow as tf
def binarize_gripper_actions(actions: tf.Tensor) -> tf.Tensor:
"""
Converts gripper actions from continuous to binary values (0 and 1).
We exploit that fact that most of the time, the gripper is fully open (near 1.0) or fully closed (near 0.0). As it
transitions between the two, it sometimes passes through a few intermediate values. We relabel those intermediate
values based on the state that is reached _after_ those intermediate values.
In the edge case that the trajectory ends with an intermediate value, we give up on binarizing and relabel that
chunk of intermediate values as the last action in the trajectory.
The `scan_fn` implements the following logic:
new_actions = np.empty_like(actions)
carry = actions[-1]
for i in reversed(range(actions.shape[0])):
if in_between_mask[i]:
carry = carry
else:
carry = float(open_mask[i])
new_actions[i] = carry
"""
open_mask, closed_mask = actions > 0.95, actions < 0.05
in_between_mask = tf.logical_not(tf.logical_or(open_mask, closed_mask))
is_open_float = tf.cast(open_mask, tf.float32)
def scan_fn(carry, i):
return tf.cond(in_between_mask[i], lambda: tf.cast(carry, tf.float32), lambda: is_open_float[i])
return tf.scan(scan_fn, tf.range(tf.shape(actions)[0]), actions[-1], reverse=True)
def invert_gripper_actions(actions: tf.Tensor) -> tf.Tensor:
return 1 - actions
def rel2abs_gripper_actions(actions: tf.Tensor) -> tf.Tensor:
"""
Converts relative gripper actions (+1 for closing, -1 for opening) to absolute actions (0 = closed; 1 = open).
Assumes that the first relative gripper is not redundant (i.e. close when already closed)!
"""
# Note =>> -1 for closing, 1 for opening, 0 for no change
opening_mask, closing_mask = actions < -0.1, actions > 0.1
thresholded_actions = tf.where(opening_mask, 1, tf.where(closing_mask, -1, 0))
def scan_fn(carry, i):
return tf.cond(thresholded_actions[i] == 0, lambda: carry, lambda: thresholded_actions[i])
# If no relative grasp, assumes open for whole trajectory
start = -1 * thresholded_actions[tf.argmax(thresholded_actions != 0, axis=0)]
start = tf.cond(start == 0, lambda: 1, lambda: start)
# Note =>> -1 for closed, 1 for open
new_actions = tf.scan(scan_fn, tf.range(tf.shape(actions)[0]), start)
new_actions = tf.cast(new_actions, tf.float32) / 2 + 0.5
return new_actions
# === Bridge-V2 =>> Dataset-Specific Transform ===
def relabel_bridge_actions(traj: Dict[str, Any]) -> Dict[str, Any]:
"""Relabels actions to use reached proprioceptive state; discards last timestep (no-action)."""
movement_actions = traj["observation"]["state"][1:, :6] - traj["observation"]["state"][:-1, :6]
traj_truncated = tf.nest.map_structure(lambda x: x[:-1], traj)
traj_truncated["action"] = tf.concat([movement_actions, traj["action"][:-1, -1:]], axis=1)
return traj_truncated
# === RLDS Dataset Initialization Utilities ===
def pprint_data_mixture(dataset_kwargs_list: List[Dict[str, Any]], dataset_weights: List[int]) -> None:
print("\n######################################################################################")
print(f"# Loading the following {len(dataset_kwargs_list)} datasets (incl. sampling weight):{'': >24} #")
for dataset_kwargs, weight in zip(dataset_kwargs_list, dataset_weights, strict=False):
pad = 80 - len(dataset_kwargs["name"])
print(f"# {dataset_kwargs['name']}: {weight:=>{pad}f} #")
print("######################################################################################\n")

View File

@@ -0,0 +1,200 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
NOTE(YL): Adapted from:
OpenVLA: https://github.com/openvla/openvla
Episode transforms for DROID dataset.
"""
from typing import Any, Dict
import tensorflow as tf
import tensorflow_graphics.geometry.transformation as tfg
def rmat_to_euler(rot_mat):
return tfg.euler.from_rotation_matrix(rot_mat)
def euler_to_rmat(euler):
return tfg.rotation_matrix_3d.from_euler(euler)
def invert_rmat(rot_mat):
return tfg.rotation_matrix_3d.inverse(rot_mat)
def rotmat_to_rot6d(mat):
"""
Converts rotation matrix to R6 rotation representation (first two rows in rotation matrix).
Args:
mat: rotation matrix
Returns: 6d vector (first two rows of rotation matrix)
"""
r6 = mat[..., :2, :]
r6_0, r6_1 = r6[..., 0, :], r6[..., 1, :]
r6_flat = tf.concat([r6_0, r6_1], axis=-1)
return r6_flat
def velocity_act_to_wrist_frame(velocity, wrist_in_robot_frame):
"""
Translates velocity actions (translation + rotation) from base frame of the robot to wrist frame.
Args:
velocity: 6d velocity action (3 x translation, 3 x rotation)
wrist_in_robot_frame: 6d pose of the end-effector in robot base frame
Returns: 9d velocity action in robot wrist frame (3 x translation, 6 x rotation as R6)
"""
r_frame = euler_to_rmat(wrist_in_robot_frame[:, 3:6])
r_frame_inv = invert_rmat(r_frame)
# world to wrist: dT_pi = R^-1 dT_rbt
vel_t = (r_frame_inv @ velocity[:, :3][..., None])[..., 0]
# world to wrist: dR_pi = R^-1 dR_rbt R
dr_ = euler_to_rmat(velocity[:, 3:6])
dr_ = r_frame_inv @ (dr_ @ r_frame)
dr_r6 = rotmat_to_rot6d(dr_)
return tf.concat([vel_t, dr_r6], axis=-1)
def rand_swap_exterior_images(img1, img2):
"""
Randomly swaps the two exterior images (for training with single exterior input).
"""
return tf.cond(tf.random.uniform(shape=[]) > 0.5, lambda: (img1, img2), lambda: (img2, img1))
def droid_baseact_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
"""
DROID dataset transformation for actions expressed in *base* frame of the robot.
"""
dt = trajectory["action_dict"]["cartesian_velocity"][:, :3]
dr_ = trajectory["action_dict"]["cartesian_velocity"][:, 3:6]
trajectory["action"] = tf.concat(
(
dt,
dr_,
1 - trajectory["action_dict"]["gripper_position"],
),
axis=-1,
)
trajectory["observation"]["exterior_image_1_left"], trajectory["observation"]["exterior_image_2_left"] = (
rand_swap_exterior_images(
trajectory["observation"]["exterior_image_1_left"],
trajectory["observation"]["exterior_image_2_left"],
)
)
trajectory["observation"]["proprio"] = tf.concat(
(
trajectory["observation"]["cartesian_position"],
trajectory["observation"]["gripper_position"],
),
axis=-1,
)
return trajectory
def droid_wristact_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
"""
DROID dataset transformation for actions expressed in *wrist* frame of the robot.
"""
wrist_act = velocity_act_to_wrist_frame(
trajectory["action_dict"]["cartesian_velocity"], trajectory["observation"]["cartesian_position"]
)
trajectory["action"] = tf.concat(
(
wrist_act,
trajectory["action_dict"]["gripper_position"],
),
axis=-1,
)
trajectory["observation"]["exterior_image_1_left"], trajectory["observation"]["exterior_image_2_left"] = (
rand_swap_exterior_images(
trajectory["observation"]["exterior_image_1_left"],
trajectory["observation"]["exterior_image_2_left"],
)
)
trajectory["observation"]["proprio"] = tf.concat(
(
trajectory["observation"]["cartesian_position"],
trajectory["observation"]["gripper_position"],
),
axis=-1,
)
return trajectory
def droid_finetuning_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
"""
DROID dataset transformation for actions expressed in *base* frame of the robot.
"""
dt = trajectory["action_dict"]["cartesian_velocity"][:, :3]
dr_ = trajectory["action_dict"]["cartesian_velocity"][:, 3:6]
trajectory["action"] = tf.concat(
(
dt,
dr_,
1 - trajectory["action_dict"]["gripper_position"],
),
axis=-1,
)
trajectory["observation"]["proprio"] = tf.concat(
(
trajectory["observation"]["cartesian_position"],
trajectory["observation"]["gripper_position"],
),
axis=-1,
)
return trajectory
def zero_action_filter(traj: Dict) -> bool:
"""
Filters transitions whose actions are all-0 (only relative actions, no gripper action).
Note: this filter is applied *after* action normalization, so need to compare to "normalized 0".
"""
droid_q01 = tf.convert_to_tensor(
[
-0.7776297926902771,
-0.5803514122962952,
-0.5795090794563293,
-0.6464047729969025,
-0.7041108310222626,
-0.8895104378461838,
]
)
droid_q99 = tf.convert_to_tensor(
[
0.7597932070493698,
0.5726242214441299,
0.7351000607013702,
0.6705610305070877,
0.6464948207139969,
0.8897542208433151,
]
)
droid_norm_0_act = (
2 * (tf.zeros_like(traj["action"][:, :6]) - droid_q01) / (droid_q99 - droid_q01 + 1e-8) - 1
)
return tf.reduce_any(tf.math.abs(traj["action"][:, :6] - droid_norm_0_act) > 1e-5)

View File

@@ -0,0 +1,859 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
NOTE(YL): Adapted from:
OpenVLA: https://github.com/openvla/openvla
Octo: https://github.com/octo-models/octo
transforms.py
Defines a registry of per-dataset standardization transforms for each dataset in Open-X Embodiment.
Transforms adopt the following structure:
Input: Dictionary of *batched* features (i.e., has leading time dimension)
Output: Dictionary `step` =>> {
"observation": {
<image_keys, depth_image_keys>
State (in chosen state representation)
},
"action": Action (in chosen action representation),
"language_instruction": str
}
"""
from typing import Any, Dict
import tensorflow as tf
from lerobot.common.datasets.push_dataset_to_hub.openx.data_utils import (
binarize_gripper_actions,
invert_gripper_actions,
rel2abs_gripper_actions,
relabel_bridge_actions,
)
def droid_baseact_transform_fn():
from lerobot.common.datasets.push_dataset_to_hub.openx.droid_utils import droid_baseact_transform
return droid_baseact_transform
def bridge_openx_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
"""
Applies to version of Bridge V2 in Open X-Embodiment mixture.
Note =>> In original Bridge V2 dataset, the first timestep has an all-zero action, so we remove it!
"""
for key in trajectory:
if key == "traj_metadata":
continue
elif key in ["observation", "action"]:
for key2 in trajectory[key]:
trajectory[key][key2] = trajectory[key][key2][1:]
else:
trajectory[key] = trajectory[key][1:]
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
tf.cast(trajectory["action"]["open_gripper"][:, None], tf.float32),
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
trajectory = relabel_bridge_actions(trajectory)
trajectory["observation"]["EEF_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
return trajectory
def bridge_orig_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
"""
Applies to original version of Bridge V2 from the official project website.
Note =>> In original Bridge V2 dataset, the first timestep has an all-zero action, so we remove it!
"""
for key in trajectory:
if key == "traj_metadata":
continue
elif key == "observation":
for key2 in trajectory[key]:
trajectory[key][key2] = trajectory[key][key2][1:]
else:
trajectory[key] = trajectory[key][1:]
trajectory["action"] = tf.concat(
[
trajectory["action"][:, :6],
binarize_gripper_actions(trajectory["action"][:, -1])[:, None],
],
axis=1,
)
trajectory = relabel_bridge_actions(trajectory)
trajectory["observation"]["EEF_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
return trajectory
def ppgm_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
[
trajectory["action"][:, :6],
binarize_gripper_actions(trajectory["action"][:, -1])[:, None],
],
axis=1,
)
trajectory["observation"]["EEF_state"] = trajectory["observation"]["cartesian_position"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["gripper_position"][:, -1:]
return trajectory
def rt1_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# make gripper action absolute action, +1 = open, 0 = close
gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0]
gripper_action = rel2abs_gripper_actions(gripper_action)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
gripper_action[:, None],
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def kuka_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# make gripper action absolute action, +1 = open, 0 = close
gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0]
gripper_action = rel2abs_gripper_actions(gripper_action)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
gripper_action[:, None],
),
axis=-1,
)
# decode compressed state
eef_value = tf.io.decode_compressed(
trajectory["observation"]["clip_function_input/base_pose_tool_reached"],
compression_type="ZLIB",
)
eef_value = tf.io.decode_raw(eef_value, tf.float32)
trajectory["observation"]["clip_function_input/base_pose_tool_reached"] = tf.reshape(eef_value, (-1, 7))
gripper_value = tf.io.decode_compressed(
trajectory["observation"]["gripper_closed"], compression_type="ZLIB"
)
gripper_value = tf.io.decode_raw(gripper_value, tf.float32)
trajectory["observation"]["gripper_closed"] = tf.reshape(gripper_value, (-1, 1))
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def taco_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state_eef"] = trajectory["observation"]["robot_obs"][:, :6]
trajectory["observation"]["state_gripper"] = trajectory["observation"]["robot_obs"][:, 7:8]
trajectory["action"] = trajectory["action"]["rel_actions_world"]
# invert gripper action + clip, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
tf.clip_by_value(trajectory["action"][:, -1:], 0, 1),
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def jaco_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state_eef"] = trajectory["observation"]["end_effector_cartesian_pos"][:, :6]
trajectory["observation"]["state_gripper"] = trajectory["observation"]["end_effector_cartesian_pos"][
:, -1:
]
# make gripper action absolute action, +1 = open, 0 = close
gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0]
gripper_action = rel2abs_gripper_actions(gripper_action)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
tf.zeros_like(trajectory["action"]["world_vector"]),
gripper_action[:, None],
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def berkeley_cable_routing_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
tf.zeros_like(trajectory["action"]["world_vector"][:, :1]),
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def roboturk_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# invert absolute gripper action, +1 = open, 0 = close
gripper_action = invert_gripper_actions(
tf.clip_by_value(trajectory["action"]["gripper_closedness_action"], 0, 1)
)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
gripper_action,
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
trajectory["language_embedding"] = trajectory["observation"]["natural_language_embedding"]
return trajectory
def nyu_door_opening_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# make gripper action absolute action, +1 = open, 0 = close
gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0]
gripper_action = rel2abs_gripper_actions(gripper_action)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
gripper_action[:, None],
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def viola_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# make gripper action, +1 = open, 0 = close
gripper_action = trajectory["action"]["gripper_closedness_action"][:, None]
gripper_action = tf.clip_by_value(gripper_action, 0, 1)
gripper_action = invert_gripper_actions(gripper_action)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
gripper_action,
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def berkeley_autolab_ur5_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state"] = trajectory["observation"]["robot_state"][:, 6:14]
# make gripper action absolute action, +1 = open, 0 = close
gripper_action = trajectory["action"]["gripper_closedness_action"]
gripper_action = rel2abs_gripper_actions(gripper_action)
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
gripper_action[:, None],
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def toto_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
tf.cast(trajectory["action"]["open_gripper"][:, None], tf.float32),
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def language_table_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# default to "open" gripper
trajectory["action"] = tf.concat(
(
trajectory["action"],
tf.zeros_like(trajectory["action"]),
tf.zeros_like(trajectory["action"]),
tf.ones_like(trajectory["action"][:, :1]),
),
axis=-1,
)
# decode language instruction
instruction_bytes = trajectory["observation"]["instruction"]
instruction_encoded = tf.strings.unicode_encode(instruction_bytes, output_encoding="UTF-8")
# Remove trailing padding --> convert RaggedTensor to regular Tensor.
trajectory["language_instruction"] = tf.strings.split(instruction_encoded, "\x00")[:, :1].to_tensor()[
:, 0
]
return trajectory
def pusht_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"]["world_vector"],
trajectory["action"]["rotation_delta"],
trajectory["action"]["gripper_closedness_action"][:, None],
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def stanford_kuka_multimodal_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["depth_image"] = trajectory["observation"]["depth_image"][..., 0]
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :3],
tf.zeros_like(trajectory["action"][:, :3]),
trajectory["action"][:, -1:],
),
axis=-1,
)
return trajectory
def nyu_rot_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][..., :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][..., -1:]
trajectory["action"] = trajectory["action"][..., :7]
return trajectory
def stanford_hydra_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# invert gripper action, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
invert_gripper_actions(trajectory["action"][:, -1:]),
),
axis=-1,
)
trajectory["observation"]["eef_state"] = tf.concat(
(
trajectory["observation"]["state"][:, :3],
trajectory["observation"]["state"][:, 7:10],
),
axis=-1,
)
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -3:-2]
return trajectory
def austin_buds_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# invert gripper action + clip, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
),
axis=-1,
)
trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :8]
return trajectory
def nyu_franka_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["depth"] = tf.cast(trajectory["observation"]["depth"][..., 0], tf.float32)
trajectory["observation"]["depth_additional_view"] = tf.cast(
trajectory["observation"]["depth_additional_view"][..., 0], tf.float32
)
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, -6:]
# clip gripper action, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, -8:-2],
tf.clip_by_value(trajectory["action"][:, -2:-1], 0, 1),
),
axis=-1,
)
return trajectory
def maniskill_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][..., 7:8]
return trajectory
def furniture_bench_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
import tensorflow_graphics.geometry.transformation as tft
trajectory["observation"]["state"] = tf.concat(
(
trajectory["observation"]["state"][:, :7],
trajectory["observation"]["state"][:, -1:],
),
axis=-1,
)
# invert gripper action + clip, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :3],
tft.euler.from_quaternion(trajectory["action"][:, 3:7]),
invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
),
axis=-1,
)
return trajectory
def cmu_franka_exploration_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = trajectory["action"][..., :-1]
return trajectory
def ucsd_kitchen_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["joint_state"] = trajectory["observation"]["state"][:, :7]
trajectory["action"] = trajectory["action"][..., :-1]
return trajectory
def ucsd_pick_place_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :3],
tf.zeros_like(trajectory["action"][:, :3]),
trajectory["action"][:, -1:],
),
axis=-1,
)
return trajectory
def austin_sailor_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# invert gripper action + clip, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
),
axis=-1,
)
return trajectory
def austin_sirius_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# invert gripper action + clip, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
),
axis=-1,
)
return trajectory
def bc_z_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"]["future/xyz_residual"][:, :3],
trajectory["action"]["future/axis_angle_residual"][:, :3],
invert_gripper_actions(tf.cast(trajectory["action"]["future/target_close"][:, :1], tf.float32)),
),
axis=-1,
)
trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
return trajectory
def tokyo_pr2_opening_fridge_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
trajectory["action"] = trajectory["action"][..., :-1]
return trajectory
def tokyo_pr2_tabletop_manipulation_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
trajectory["action"] = trajectory["action"][..., :-1]
return trajectory
def utokyo_xarm_bimanual_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = trajectory["action"][..., -7:]
return trajectory
def robo_net_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = tf.concat(
(
trajectory["observation"]["state"][:, :4],
tf.zeros_like(trajectory["observation"]["state"][:, :2]),
),
axis=-1,
)
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :4],
tf.zeros_like(trajectory["action"][:, :2]),
trajectory["action"][:, -1:],
),
axis=-1,
)
return trajectory
def berkeley_mvp_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
"""
trajectory["observation"]["state"] = tf.concat((
tf.cast(trajectory["observation"]["gripper"][:, None], tf.float32),
trajectory["observation"]["pose"],
trajectory["observation"]["joint_pos"],),
axis=-1,)
"""
trajectory["observation"]["gripper"] = tf.cast(trajectory["observation"]["gripper"][:, None], tf.float32)
return trajectory
def berkeley_rpt_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["gripper"] = tf.cast(trajectory["observation"]["gripper"][:, None], tf.float32)
return trajectory
def kaist_nonprehensible_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state"] = trajectory["observation"]["state"][:, -7:]
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
tf.zeros_like(trajectory["action"][:, :1]),
),
axis=-1,
)
return trajectory
def stanford_mask_vit_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = tf.concat(
(
trajectory["observation"]["end_effector_pose"][:, :4],
tf.zeros_like(trajectory["observation"]["end_effector_pose"][:, :2]),
),
axis=-1,
)
trajectory["observation"]["gripper_state"] = trajectory["observation"]["end_effector_pose"][:, -1:]
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :4],
tf.zeros_like(trajectory["action"][:, :2]),
trajectory["action"][:, -1:],
),
axis=-1,
)
return trajectory
def tokyo_lsmo_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
return trajectory
def dlr_sara_grid_clamp_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :6]
return trajectory
def dlr_edan_shared_control_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# invert gripper action, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
invert_gripper_actions(trajectory["action"][:, -1:]),
),
axis=-1,
)
return trajectory
def asu_table_top_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["ground_truth_states"]["EE"]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
return trajectory
def robocook_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
return trajectory
def imperial_wristcam_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = trajectory["action"][..., :-1]
return trajectory
def iamlab_pick_insert_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
import tensorflow_graphics.geometry.transformation as tft
trajectory["observation"]["joint_state"] = trajectory["observation"]["state"][:, :7]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, 7:8]
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :3],
tft.euler.from_quaternion(trajectory["action"][:, 3:7]),
trajectory["action"][:, 7:8],
),
axis=-1,
)
return trajectory
def uiuc_d3field_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"],
tf.zeros_like(trajectory["action"]),
tf.zeros_like(trajectory["action"][:, :1]),
),
axis=-1,
)
return trajectory
def utaustin_mutex_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :8]
# invert gripper action + clip, +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :6],
invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
),
axis=-1,
)
return trajectory
def berkeley_fanuc_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["joint_state"] = trajectory["observation"]["state"][:, :6]
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, 6:7]
# dataset does not store gripper actions, so use gripper state info, invert so +1 = open, 0 = close
trajectory["action"] = tf.concat(
(
trajectory["action"],
invert_gripper_actions(trajectory["observation"]["gripper_state"]),
),
axis=-1,
)
return trajectory
def cmu_playing_with_food_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
import tensorflow_graphics.geometry.transformation as tft
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :3],
tft.euler.from_quaternion(trajectory["action"][:, 3:7]),
trajectory["action"][:, -1:],
),
axis=-1,
)
return trajectory
def playfusion_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :3],
trajectory["action"][:, -4:],
),
axis=-1,
)
return trajectory
def cmu_stretch_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["eef_state"] = tf.concat(
(
trajectory["observation"]["state"][:, :3],
tf.zeros_like(trajectory["observation"]["state"][:, :3]),
),
axis=-1,
)
trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
trajectory["action"] = trajectory["action"][..., :-1]
return trajectory
def gnm_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
trajectory["observation"]["state"] = tf.concat(
(
trajectory["observation"]["position"],
tf.zeros_like(trajectory["observation"]["state"][:, :3]),
trajectory["observation"]["yaw"],
),
axis=-1,
)
trajectory["action"] = tf.concat(
(
trajectory["action"],
tf.zeros_like(trajectory["action"]),
tf.zeros_like(trajectory["action"]),
tf.zeros_like(trajectory["action"][:, :1]),
),
axis=-1,
)
return trajectory
def fmb_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# every input feature is batched, ie has leading batch dimension
trajectory["observation"]["proprio"] = tf.concat(
(
trajectory["observation"]["eef_pose"],
trajectory["observation"]["state_gripper_pose"][..., None],
),
axis=-1,
)
return trajectory
def dobbe_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# every input feature is batched, ie has leading batch dimension
trajectory["observation"]["proprio"] = trajectory["observation"]["state"]
return trajectory
def robo_set_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
# gripper action is in -1...1 --> clip to 0...1, flip
gripper_action = trajectory["action"][:, -1:]
gripper_action = invert_gripper_actions(tf.clip_by_value(gripper_action, 0, 1))
trajectory["action"] = tf.concat(
(
trajectory["action"][:, :7],
gripper_action,
),
axis=-1,
)
return trajectory
def identity_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
return trajectory
# === Registry ===
OPENX_STANDARDIZATION_TRANSFORMS = {
"bridge_openx": bridge_openx_dataset_transform,
"bridge_orig": bridge_orig_dataset_transform,
"bridge_dataset": bridge_orig_dataset_transform,
"ppgm": ppgm_dataset_transform,
"ppgm_static": ppgm_dataset_transform,
"ppgm_wrist": ppgm_dataset_transform,
"fractal20220817_data": rt1_dataset_transform,
"kuka": kuka_dataset_transform,
"taco_play": taco_play_dataset_transform,
"jaco_play": jaco_play_dataset_transform,
"berkeley_cable_routing": berkeley_cable_routing_dataset_transform,
"roboturk": roboturk_dataset_transform,
"nyu_door_opening_surprising_effectiveness": nyu_door_opening_dataset_transform,
"viola": viola_dataset_transform,
"berkeley_autolab_ur5": berkeley_autolab_ur5_dataset_transform,
"toto": toto_dataset_transform,
"language_table": language_table_dataset_transform,
"columbia_cairlab_pusht_real": pusht_dataset_transform,
"stanford_kuka_multimodal_dataset_converted_externally_to_rlds": stanford_kuka_multimodal_dataset_transform,
"nyu_rot_dataset_converted_externally_to_rlds": nyu_rot_dataset_transform,
"stanford_hydra_dataset_converted_externally_to_rlds": stanford_hydra_dataset_transform,
"austin_buds_dataset_converted_externally_to_rlds": austin_buds_dataset_transform,
"nyu_franka_play_dataset_converted_externally_to_rlds": nyu_franka_play_dataset_transform,
"maniskill_dataset_converted_externally_to_rlds": maniskill_dataset_transform,
"furniture_bench_dataset_converted_externally_to_rlds": furniture_bench_dataset_transform,
"cmu_franka_exploration_dataset_converted_externally_to_rlds": cmu_franka_exploration_dataset_transform,
"ucsd_kitchen_dataset_converted_externally_to_rlds": ucsd_kitchen_dataset_transform,
"ucsd_pick_and_place_dataset_converted_externally_to_rlds": ucsd_pick_place_dataset_transform,
"austin_sailor_dataset_converted_externally_to_rlds": austin_sailor_dataset_transform,
"austin_sirius_dataset_converted_externally_to_rlds": austin_sirius_dataset_transform,
"bc_z": bc_z_dataset_transform,
"utokyo_pr2_opening_fridge_converted_externally_to_rlds": tokyo_pr2_opening_fridge_dataset_transform,
"utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": tokyo_pr2_tabletop_manipulation_dataset_transform,
"utokyo_xarm_pick_and_place_converted_externally_to_rlds": identity_transform,
"utokyo_xarm_bimanual_converted_externally_to_rlds": utokyo_xarm_bimanual_dataset_transform,
"robo_net": robo_net_dataset_transform,
"berkeley_mvp_converted_externally_to_rlds": berkeley_mvp_dataset_transform,
"berkeley_rpt_converted_externally_to_rlds": berkeley_rpt_dataset_transform,
"kaist_nonprehensile_converted_externally_to_rlds": kaist_nonprehensible_dataset_transform,
"stanford_mask_vit_converted_externally_to_rlds": stanford_mask_vit_dataset_transform,
"tokyo_u_lsmo_converted_externally_to_rlds": tokyo_lsmo_dataset_transform,
"dlr_sara_pour_converted_externally_to_rlds": identity_transform,
"dlr_sara_grid_clamp_converted_externally_to_rlds": dlr_sara_grid_clamp_dataset_transform,
"dlr_edan_shared_control_converted_externally_to_rlds": dlr_edan_shared_control_dataset_transform,
"asu_table_top_converted_externally_to_rlds": asu_table_top_dataset_transform,
"stanford_robocook_converted_externally_to_rlds": robocook_dataset_transform,
"imperialcollege_sawyer_wrist_cam": imperial_wristcam_dataset_transform,
"iamlab_cmu_pickup_insert_converted_externally_to_rlds": iamlab_pick_insert_dataset_transform,
"uiuc_d3field": uiuc_d3field_dataset_transform,
"utaustin_mutex": utaustin_mutex_dataset_transform,
"berkeley_fanuc_manipulation": berkeley_fanuc_dataset_transform,
"cmu_playing_with_food": cmu_playing_with_food_dataset_transform,
"cmu_play_fusion": playfusion_dataset_transform,
"cmu_stretch": cmu_stretch_dataset_transform,
"berkeley_gnm_recon": gnm_dataset_transform,
"berkeley_gnm_cory_hall": gnm_dataset_transform,
"berkeley_gnm_sac_son": gnm_dataset_transform,
"droid": droid_baseact_transform_fn(),
"droid_100": droid_baseact_transform_fn(), # first 100 episodes of droid
"fmb": fmb_transform,
"dobbe": dobbe_dataset_transform,
"robo_set": robo_set_dataset_transform,
"usc_cloth_sim_converted_externally_to_rlds": identity_transform,
"plex_robosuite": identity_transform,
"conq_hose_manipulation": identity_transform,
"io_ai_tech": identity_transform,
"spoc": identity_transform,
}

View File

@@ -14,16 +14,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
For all datasets in the RLDS format.
For https://github.com/google-deepmind/open_x_embodiment (OPENX) datasets.
NOTE: You need to install tensorflow and tensorflow_datsets before running this script.
Example:
python lerobot/scripts/push_dataset_to_hub.py \
--raw-dir /path/to/data/bridge_dataset/1.0.0/ \
--repo-id your_hub/sampled_bridge_data_v2 \
--raw-format rlds \
--raw-dir /hdd/tensorflow_datasets/bridge_dataset/1.0.0/ \
--repo-id youliangtan/sampled_bridge_data_v2 \
--raw-format openx_rlds.bridge_orig \
--episodes 3 4 5 8 9
Exact dataset fps defined in openx/config.py, obtained from:
@@ -38,10 +35,12 @@ import tensorflow as tf
import tensorflow_datasets as tfds
import torch
import tqdm
import yaml
from datasets import Dataset, Features, Image, Sequence, Value
from PIL import Image as PILImage
from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION
from lerobot.common.datasets.push_dataset_to_hub.openx.transforms import OPENX_STANDARDIZATION_TRANSFORMS
from lerobot.common.datasets.push_dataset_to_hub.utils import (
calculate_episode_data_index,
concatenate_episodes,
@@ -53,6 +52,11 @@ from lerobot.common.datasets.utils import (
)
from lerobot.common.datasets.video_utils import VideoFrame, encode_video_frames
with open("lerobot/common/datasets/push_dataset_to_hub/openx/configs.yaml") as f:
_openx_list = yaml.safe_load(f)
OPENX_DATASET_CONFIGS = _openx_list["OPENX_DATASET_CONFIGS"]
np.set_printoptions(precision=2)
@@ -104,6 +108,7 @@ def load_from_raw(
video: bool,
episodes: list[int] | None = None,
encoding: dict | None = None,
openx_dataset_name: str | None = None,
):
"""
Args:
@@ -131,17 +136,16 @@ def load_from_raw(
# we will apply the standardization transform if the dataset_name is provided
# if the dataset name is not provided and the goal is to convert any rlds formatted dataset
# search for 'image' keys in the observations
image_keys = []
state_keys = []
observation_info = dataset_info.features["steps"]["observation"]
for key in observation_info:
# check whether the key is for an image or a vector observation
if len(observation_info[key].shape) == 3:
# only adding uint8 images discards depth images
if observation_info[key].dtype == tf.uint8:
image_keys.append(key)
else:
state_keys.append(key)
if openx_dataset_name is not None:
print(" - applying standardization transform for dataset: ", openx_dataset_name)
assert openx_dataset_name in OPENX_STANDARDIZATION_TRANSFORMS
transform_fn = OPENX_STANDARDIZATION_TRANSFORMS[openx_dataset_name]
dataset = dataset.map(transform_fn)
image_keys = OPENX_DATASET_CONFIGS[openx_dataset_name]["image_obs_keys"]
else:
obs_keys = dataset_info.features["steps"]["observation"].keys()
image_keys = [key for key in obs_keys if "image" in key]
lang_key = "language_instruction" if "language_instruction" in dataset.element_spec else None
@@ -189,31 +193,50 @@ def load_from_raw(
num_frames = episode["action"].shape[0]
ep_dict = {}
for key in state_keys:
ep_dict[f"observation.{key}"] = tf_to_torch(episode["observation"][key])
###########################################################
# Handle the episodic data
ep_dict["action"] = tf_to_torch(episode["action"])
ep_dict["next.reward"] = tf_to_torch(episode["reward"]).float()
ep_dict["next.done"] = tf_to_torch(episode["is_last"])
ep_dict["is_terminal"] = tf_to_torch(episode["is_terminal"])
ep_dict["is_first"] = tf_to_torch(episode["is_first"])
ep_dict["discount"] = tf_to_torch(episode["discount"])
# last step of demonstration is considered done
done = torch.zeros(num_frames, dtype=torch.bool)
done[-1] = True
ep_dict = {}
langs = [] # TODO: might be located in "observation"
image_array_dict = {key: [] for key in image_keys}
# We will create the state observation tensor by stacking the state
# obs keys defined in the openx/configs.py
if openx_dataset_name is not None:
state_obs_keys = OPENX_DATASET_CONFIGS[openx_dataset_name]["state_obs_keys"]
# stack the state observations, if is None, pad with zeros
states = []
for key in state_obs_keys:
if key in episode["observation"]:
states.append(tf_to_torch(episode["observation"][key]))
else:
states.append(torch.zeros(num_frames, 1)) # pad with zeros
states = torch.cat(states, dim=1)
# assert states.shape == (num_frames, 8), f"states shape: {states.shape}"
else:
states = tf_to_torch(episode["observation"]["state"])
actions = tf_to_torch(episode["action"])
rewards = tf_to_torch(episode["reward"]).float()
# If lang_key is present, convert the entire tensor at once
if lang_key is not None:
ep_dict["language_instruction"] = [x.numpy().decode("utf-8") for x in episode[lang_key]]
ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps
ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames)
ep_dict["frame_index"] = torch.arange(0, num_frames, 1)
image_array_dict = {key: [] for key in image_keys}
langs = [str(x) for x in episode[lang_key]]
for im_key in image_keys:
imgs = episode["observation"][im_key]
image_array_dict[im_key] = [tf_img_convert(img) for img in imgs]
# simple assertions
for item in [states, actions, rewards, done]:
assert len(item) == num_frames
###########################################################
# loop through all cameras
for im_key in image_keys:
img_key = f"observation.images.{im_key}"
@@ -239,6 +262,17 @@ def load_from_raw(
else:
ep_dict[img_key] = [PILImage.fromarray(x) for x in imgs_array]
if lang_key is not None:
ep_dict["language_instruction"] = langs
ep_dict["observation.state"] = states
ep_dict["action"] = actions
ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps
ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames)
ep_dict["frame_index"] = torch.arange(0, num_frames, 1)
ep_dict["next.reward"] = rewards
ep_dict["next.done"] = done
path_ep_dict = tmp_ep_dicts_dir.joinpath(
"ep_dict_" + "0" * (10 - len(str(ep_idx))) + str(ep_idx) + ".pt"
)
@@ -256,28 +290,30 @@ def load_from_raw(
def to_hf_dataset(data_dict, video) -> Dataset:
features = {}
for key in data_dict:
# check if vector state obs
if key.startswith("observation.") and "observation.images." not in key:
features[key] = Sequence(length=data_dict[key].shape[1], feature=Value(dtype="float32", id=None))
# check if image obs
elif "observation.images." in key:
if video:
features[key] = VideoFrame()
else:
features[key] = Image()
keys = [key for key in data_dict if "observation.images." in key]
for key in keys:
if video:
features[key] = VideoFrame()
else:
features[key] = Image()
features["observation.state"] = Sequence(
length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
)
if "observation.velocity" in data_dict:
features["observation.velocity"] = Sequence(
length=data_dict["observation.velocity"].shape[1], feature=Value(dtype="float32", id=None)
)
if "observation.effort" in data_dict:
features["observation.effort"] = Sequence(
length=data_dict["observation.effort"].shape[1], feature=Value(dtype="float32", id=None)
)
if "language_instruction" in data_dict:
features["language_instruction"] = Value(dtype="string", id=None)
features["action"] = Sequence(
length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None)
)
features["is_terminal"] = Value(dtype="bool", id=None)
features["is_first"] = Value(dtype="bool", id=None)
features["discount"] = Value(dtype="float32", id=None)
features["episode_index"] = Value(dtype="int64", id=None)
features["frame_index"] = Value(dtype="int64", id=None)
features["timestamp"] = Value(dtype="float32", id=None)
@@ -297,8 +333,19 @@ def from_raw_to_lerobot_format(
video: bool = True,
episodes: list[int] | None = None,
encoding: dict | None = None,
openx_dataset_name: str | None = None,
):
data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes, encoding)
"""This is a test impl for rlds conversion"""
if openx_dataset_name is None:
# set a default rlds frame rate if the dataset is not from openx
fps = 30
elif "fps" not in OPENX_DATASET_CONFIGS[openx_dataset_name]:
raise ValueError(
"fps for this dataset is not specified in openx/configs.py yet," "means it is not yet tested"
)
fps = OPENX_DATASET_CONFIGS[openx_dataset_name]["fps"]
data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes, encoding, openx_dataset_name)
hf_dataset = to_hf_dataset(data_dict, video)
episode_data_index = calculate_episode_data_index(hf_dataset)
info = {

View File

@@ -13,7 +13,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib.resources
import json
import logging
import textwrap
@@ -477,8 +476,6 @@ def create_lerobot_dataset_card(
Note: If specified, license must be one of https://huggingface.co/docs/hub/repositories-licenses.
"""
card_tags = ["LeRobot"]
card_template_path = importlib.resources.path("lerobot.common.datasets", "card_template.md")
if tags:
card_tags += tags
if dataset_info:
@@ -496,9 +493,8 @@ def create_lerobot_dataset_card(
}
],
)
return DatasetCard.from_template(
card_data=card_data,
template_path=str(card_template_path),
template_path="./lerobot/common/datasets/card_template.md",
**kwargs,
)

View File

@@ -25,7 +25,6 @@ from glob import glob
from pathlib import Path
import torch
import wandb
from huggingface_hub.constants import SAFETENSORS_SINGLE_FILE
from omegaconf import DictConfig, OmegaConf
from termcolor import colored
@@ -108,6 +107,8 @@ class Logger:
self._wandb = None
else:
os.environ["WANDB_SILENT"] = "true"
import wandb
wandb_run_id = None
if cfg.resume:
wandb_run_id = get_wandb_run_id_from_filesystem(self.checkpoints_dir)
@@ -231,7 +232,7 @@ class Logger:
# TODO(alexander-soare): Add local text log.
if self._wandb is not None:
for k, v in d.items():
if not isinstance(v, (int, float, str, wandb.Table)):
if not isinstance(v, (int, float, str)):
logging.warning(
f'WandB logging of key "{k}" was ignored as its type is not handled by this wrapper.'
)

View File

@@ -1,36 +0,0 @@
import json
import os
from dataclasses import asdict, dataclass
import torch
@dataclass
class ClassifierConfig:
"""Configuration for the Classifier model."""
num_classes: int = 2
hidden_dim: int = 256
dropout_rate: float = 0.1
model_name: str = "microsoft/resnet-50"
device: str = "cuda" if torch.cuda.is_available() else "mps"
model_type: str = "cnn" # "transformer" or "cnn"
def save_pretrained(self, save_dir):
"""Save config to json file."""
os.makedirs(save_dir, exist_ok=True)
# Convert to dict and save as JSON
config_dict = asdict(self)
with open(os.path.join(save_dir, "config.json"), "w") as f:
json.dump(config_dict, f, indent=2)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path):
"""Load config from json file."""
config_file = os.path.join(pretrained_model_name_or_path, "config.json")
with open(config_file) as f:
config_dict = json.load(f)
return cls(**config_dict)

View File

@@ -1,134 +0,0 @@
import logging
from typing import Optional
import torch
from huggingface_hub import PyTorchModelHubMixin
from torch import Tensor, nn
from transformers import AutoImageProcessor, AutoModel
from .configuration_classifier import ClassifierConfig
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
class ClassifierOutput:
"""Wrapper for classifier outputs with additional metadata."""
def __init__(
self, logits: Tensor, probabilities: Optional[Tensor] = None, hidden_states: Optional[Tensor] = None
):
self.logits = logits
self.probabilities = probabilities
self.hidden_states = hidden_states
class Classifier(
nn.Module,
PyTorchModelHubMixin,
# Add Hub metadata
library_name="lerobot",
repo_url="https://github.com/huggingface/lerobot",
tags=["robotics", "vision-classifier"],
):
"""Image classifier built on top of a pre-trained encoder."""
# Add name attribute for factory
name = "classifier"
def __init__(self, config: ClassifierConfig):
super().__init__()
self.config = config
self.processor = AutoImageProcessor.from_pretrained(self.config.model_name, trust_remote_code=True)
encoder = AutoModel.from_pretrained(self.config.model_name, trust_remote_code=True)
# Extract vision model if we're given a multimodal model
if hasattr(encoder, "vision_model"):
logging.info("Multimodal model detected - using vision encoder only")
self.encoder = encoder.vision_model
self.vision_config = encoder.config.vision_config
else:
self.encoder = encoder
self.vision_config = getattr(encoder, "config", None)
# Model type from config
self.is_cnn = self.config.model_type == "cnn"
# For CNNs, initialize backbone
if self.is_cnn:
self._setup_cnn_backbone()
self._freeze_encoder()
self._build_classifier_head()
def _setup_cnn_backbone(self):
"""Set up CNN encoder"""
if hasattr(self.encoder, "fc"):
self.feature_dim = self.encoder.fc.in_features
self.encoder = nn.Sequential(*list(self.encoder.children())[:-1])
elif hasattr(self.encoder.config, "hidden_sizes"):
self.feature_dim = self.encoder.config.hidden_sizes[-1] # Last channel dimension
else:
raise ValueError("Unsupported CNN architecture")
def _freeze_encoder(self) -> None:
"""Freeze the encoder parameters."""
for param in self.encoder.parameters():
param.requires_grad = False
def _build_classifier_head(self) -> None:
"""Initialize the classifier head architecture."""
# Get input dimension based on model type
if self.is_cnn:
input_dim = self.feature_dim
else: # Transformer models
if hasattr(self.encoder.config, "hidden_size"):
input_dim = self.encoder.config.hidden_size
else:
raise ValueError("Unsupported transformer architecture since hidden_size is not found")
self.classifier_head = nn.Sequential(
nn.Linear(input_dim, self.config.hidden_dim),
nn.Dropout(self.config.dropout_rate),
nn.LayerNorm(self.config.hidden_dim),
nn.ReLU(),
nn.Linear(self.config.hidden_dim, 1 if self.config.num_classes == 2 else self.config.num_classes),
)
def _get_encoder_output(self, x: torch.Tensor) -> torch.Tensor:
"""Extract the appropriate output from the encoder."""
# Process images with the processor (handles resizing and normalization)
processed = self.processor(
images=x, # LeRobotDataset already provides proper tensor format
return_tensors="pt",
)
processed = processed["pixel_values"].to(x.device)
with torch.no_grad():
if self.is_cnn:
# The HF ResNet applies pooling internally
outputs = self.encoder(processed)
# Get pooled output directly
features = outputs.pooler_output
if features.dim() > 2:
features = features.squeeze(-1).squeeze(-1)
return features
else: # Transformer models
outputs = self.encoder(processed)
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
return outputs.pooler_output
return outputs.last_hidden_state[:, 0, :]
def forward(self, x: torch.Tensor) -> ClassifierOutput:
"""Forward pass of the classifier."""
# For training, we expect input to be a tensor directly from LeRobotDataset
encoder_output = self._get_encoder_output(x)
logits = self.classifier_head(encoder_output)
if self.config.num_classes == 2:
logits = logits.squeeze(-1)
probabilities = torch.sigmoid(logits)
else:
probabilities = torch.softmax(logits, dim=-1)
return ClassifierOutput(logits=logits, probabilities=probabilities, hidden_states=encoder_output)

View File

@@ -15,7 +15,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from dataclasses import dataclass, field
@dataclass

View File

@@ -15,10 +15,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
import torch.nn.functional as F # noqa: N812
from huggingface_hub import PyTorchModelHubMixin
class HILSerlPolicy(
nn.Module,
PyTorchModelHubMixin,

View File

@@ -15,25 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from dataclasses import dataclass, field
@dataclass
class SACConfig:
discount = 0.99
temperature_init = 1.0
num_critics = 2
critic_lr = 3e-4
actor_lr = 3e-4
critic_network_kwargs = {
"hidden_dims": [256, 256],
"activate_final": True,
}
actor_network_kwargs = {
"hidden_dims": [256, 256],
"activate_final": True,
}
policy_kwargs = {
"tanh_squash_distribution": True,
"std_parameterization": "uniform",
}
discount = 0.99

View File

@@ -15,11 +15,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: (1) better device management
from collections import deque
from copy import deepcopy
from functools import partial
import einops
@@ -31,10 +27,6 @@ from torch import Tensor
from huggingface_hub import PyTorchModelHubMixin
from lerobot.common.policies.normalize import Normalize, Unnormalize
from lerobot.common.policies.sac.configuration_sac import SACConfig
import numpy as np
from typing import Callable, Optional, Tuple, Sequence
class SACPolicy(
nn.Module,
@@ -66,27 +58,12 @@ class SACPolicy(
self.unnormalize_outputs = Unnormalize(
config.output_shapes, config.output_normalization_modes, dataset_stats
)
encoder = SACObservationEncoder(config)
# Define networks
critic_nets = []
for _ in range(config.num_critics):
critic_net = Critic(
encoder=encoder,
network=MLP(**config.critic_network_kwargs)
)
critic_nets.append(critic_net)
self.critic_ensemble = create_critic_ensemble(critic_nets, config.num_critics)
self.critic_target = deepcopy(self.critic_ensemble)
self.critic_ensemble = ...
self.critic_target = ...
self.actor_network = ...
self.actor_network = Policy(
encoder=encoder,
network=MLP(**config.actor_network_kwargs),
action_dim=config.output_shapes["action"][0],
**config.policy_kwargs
)
self.temperature = LagrangeMultiplier(init_value=config.temperature_init)
self.temperature = ...
def reset(self):
"""
@@ -112,18 +89,12 @@ class SACPolicy(
Returns a dictionary with loss as a tensor, and other information as native floats.
"""
batch = self.normalize_inputs(batch)
# batch shape is (b, 2, ...) where index 1 returns the current observation and
# the next observation for caluculating the right td index.
actions = batch["action"][:, 0]
rewards = batch["next.reward"][:, 0]
observations = {}
next_observations = {}
for k in batch:
if k.startswith("observation."):
observations[k] = batch[k][:, 0]
next_observations[k] = batch[k][:, 1]
observation_batch =
next_obaservation_batch =
action_batch =
reward_batch =
dones_batch =
# perform image augmentation
# reward bias
@@ -133,51 +104,34 @@ class SACPolicy(
# calculate critics loss
# 1- compute actions from policy
action_preds, log_probs = self.actor_network(observations)
next_actions = ..
# 2- compute q targets
q_targets = self.target_qs(next_observations, action_preds)
q_targets = self.target_qs(next_obaservation_batch, next_actions)
# critics subsample size
min_q = q_targets.min(dim=0)
# backup entropy
td_target = rewards + self.discount * min_q
td_target = reward_batch + self.discount * min_q
# 3- compute predicted qs
q_preds = self.critic_ensemble(observations, actions)
q_preds = self.critic_ensemble(observation_batch, action_batch)
# 4- Calculate loss
# Compute state-action value loss (TD loss) for all of the Q functions in the ensemble.
critics_loss = (
F.mse_loss(
q_preds,
einops.repeat(td_target, "t b -> e t b", e=q_preds.shape[0]),
reduction="none",
).sum(0) # sum over ensemble
# `q_preds_ensemble` depends on the first observation and the actions.
* ~batch["observation.state_is_pad"][0]
* ~batch["action_is_pad"]
# q_targets depends on the reward and the next observations.
* ~batch["next.reward_is_pad"]
* ~batch["observation.state_is_pad"][1:]
).sum(0).mean()
critics_loss = F.mse_loss(q_preds,
einops.repeat(td_target, "b -> e b", e=q_preds.shape[0])) # dones masks
# calculate actors loss
# 1- temperature
temperature = self.temperature()
# 2- get actions (batch_size, action_dim) and log probs (batch_size,)
actions, log_probs = self.actor_network(observations) \
actions, log_probs = self.actor_network(observation_batch)
# 3- get q-value predictions
with torch.no_grad():
q_preds = self.critic_ensemble(observations, actions, return_type="mean")
actor_loss = (
-(q_preds - temperature * log_probs).mean()
* ~batch["observation.state_is_pad"][0]
* ~batch["action_is_pad"]
).mean()
q_preds = self.critic_ensemble(observation_batch, actions, return_type="mean")
actor_loss = -(q_preds - temperature * log_probs).mean()
# calculate temperature loss
# 1- calculate entropy
@@ -187,8 +141,8 @@ class SACPolicy(
loss = critics_loss + actor_loss + temperature_loss
return {
"critics_loss": critics_loss.item(),
"actor_loss": actor_loss.item(),
"Q_value_loss": critics_loss.item(),
"pi_loss": actor_loss.item(),
"temperature_loss": temperature_loss.item(),
"temperature": temperature.item(),
"entropy": entropy.item(),
@@ -200,484 +154,3 @@ class SACPolicy(
self.critic_target.lerp_(self.critic_ensemble, self.config.critic_target_update_weight)
#for target_param, param in zip(self.critic_target.parameters(), self.critic_ensemble.parameters()):
# target_param.data.copy_(target_param.data * (1.0 - self.config.critic_target_update_weight) + param.data * self.critic_target_update_weight)
class MLP(nn.Module):
def __init__(
self,
config: SACConfig,
activations: Callable[[torch.Tensor], torch.Tensor] | str = nn.SiLU(),
activate_final: bool = False,
dropout_rate: Optional[float] = None,
):
super().__init__()
self.activate_final = config.activate_final
layers = []
for i, size in enumerate(config.network_hidden_dims):
layers.append(nn.Linear(config.network_hidden_dims[i-1] if i > 0 else config.network_hidden_dims[0], size))
if i + 1 < len(config.network_hidden_dims) or activate_final:
if dropout_rate is not None and dropout_rate > 0:
layers.append(nn.Dropout(p=dropout_rate))
layers.append(nn.LayerNorm(size))
layers.append(activations if isinstance(activations, nn.Module) else getattr(nn, activations)())
self.net = nn.Sequential(*layers)
def forward(self, x: torch.Tensor, train: bool = False) -> torch.Tensor:
# in training mode or not. TODO: find better way to do this
self.train(train)
return self.net(x)
class Critic(nn.Module):
def __init__(
self,
encoder: Optional[nn.Module],
network: nn.Module,
init_final: Optional[float] = None,
activate_final: bool = False,
device: str = "cuda"
):
super().__init__()
self.device = torch.device(device)
self.encoder = encoder
self.network = network
self.init_final = init_final
self.activate_final = activate_final
# Output layer
if init_final is not None:
if self.activate_final:
self.output_layer = nn.Linear(network.net[-3].out_features, 1)
else:
self.output_layer = nn.Linear(network.net[-2].out_features, 1)
nn.init.uniform_(self.output_layer.weight, -init_final, init_final)
nn.init.uniform_(self.output_layer.bias, -init_final, init_final)
else:
if self.activate_final:
self.output_layer = nn.Linear(network.net[-3].out_features, 1)
else:
self.output_layer = nn.Linear(network.net[-2].out_features, 1)
orthogonal_init()(self.output_layer.weight)
self.to(self.device)
def forward(
self,
observations: torch.Tensor,
actions: torch.Tensor,
train: bool = False
) -> torch.Tensor:
self.train(train)
observations = observations.to(self.device)
actions = actions.to(self.device)
if self.encoder is not None:
obs_enc = self.encoder(observations)
else:
obs_enc = observations
inputs = torch.cat([obs_enc, actions], dim=-1)
x = self.network(inputs)
value = self.output_layer(x)
return value.squeeze(-1)
def q_value_ensemble(
self,
observations: torch.Tensor,
actions: torch.Tensor,
train: bool = False
) -> torch.Tensor:
observations = observations.to(self.device)
actions = actions.to(self.device)
if len(actions.shape) == 3: # [batch_size, num_actions, action_dim]
batch_size, num_actions = actions.shape[:2]
obs_expanded = observations.unsqueeze(1).expand(-1, num_actions, -1)
obs_flat = obs_expanded.reshape(-1, observations.shape[-1])
actions_flat = actions.reshape(-1, actions.shape[-1])
q_values = self(obs_flat, actions_flat, train)
return q_values.reshape(batch_size, num_actions)
else:
return self(observations, actions, train)
class Policy(nn.Module):
def __init__(
self,
encoder: Optional[nn.Module],
network: nn.Module,
action_dim: int,
std_parameterization: str = "exp",
std_min: float = 1e-5,
std_max: float = 10.0,
tanh_squash_distribution: bool = False,
fixed_std: Optional[torch.Tensor] = None,
init_final: Optional[float] = None,
activate_final: bool = False,
device: str = "cuda"
):
super().__init__()
self.device = torch.device(device)
self.encoder = encoder
self.network = network
self.action_dim = action_dim
self.std_parameterization = std_parameterization
self.std_min = std_min
self.std_max = std_max
self.tanh_squash_distribution = tanh_squash_distribution
self.fixed_std = fixed_std.to(self.device) if fixed_std is not None else None
self.activate_final = activate_final
# Mean layer
if self.activate_final:
self.mean_layer = nn.Linear(network.net[-3].out_features, action_dim)
else:
self.mean_layer = nn.Linear(network.net[-2].out_features, action_dim)
if init_final is not None:
nn.init.uniform_(self.mean_layer.weight, -init_final, init_final)
nn.init.uniform_(self.mean_layer.bias, -init_final, init_final)
else:
orthogonal_init()(self.mean_layer.weight)
# Standard deviation layer or parameter
if fixed_std is None:
if std_parameterization == "uniform":
self.log_stds = nn.Parameter(torch.zeros(action_dim, device=self.device))
else:
if self.activate_final:
self.std_layer = nn.Linear(network.net[-3].out_features, action_dim)
else:
self.std_layer = nn.Linear(network.net[-2].out_features, action_dim)
if init_final is not None:
nn.init.uniform_(self.std_layer.weight, -init_final, init_final)
nn.init.uniform_(self.std_layer.bias, -init_final, init_final)
else:
orthogonal_init()(self.std_layer.weight)
self.to(self.device)
def forward(
self,
observations: torch.Tensor,
temperature: float = 1.0,
train: bool = False,
non_squash_distribution: bool = False
) -> torch.distributions.Distribution:
self.train(train)
# Encode observations if encoder exists
if self.encoder is not None:
with torch.set_grad_enabled(train):
obs_enc = self.encoder(observations, train=train)
else:
obs_enc = observations
# Get network outputs
outputs = self.network(obs_enc)
means = self.mean_layer(outputs)
# Compute standard deviations
if self.fixed_std is None:
if self.std_parameterization == "exp":
log_stds = self.std_layer(outputs)
stds = torch.exp(log_stds)
elif self.std_parameterization == "softplus":
stds = torch.nn.functional.softplus(self.std_layer(outputs))
elif self.std_parameterization == "uniform":
stds = torch.exp(self.log_stds).expand_as(means)
else:
raise ValueError(
f"Invalid std_parameterization: {self.std_parameterization}"
)
else:
assert self.std_parameterization == "fixed"
stds = self.fixed_std.expand_as(means)
# Clip standard deviations and scale with temperature
temperature = torch.tensor(temperature, device=self.device)
stds = torch.clamp(stds, self.std_min, self.std_max) * torch.sqrt(temperature)
# Create distribution
if self.tanh_squash_distribution and not non_squash_distribution:
distribution = TanhMultivariateNormalDiag(
loc=means,
scale_diag=stds,
)
else:
distribution = torch.distributions.Normal(
loc=means,
scale=stds,
)
return distribution
def get_features(self, observations: torch.Tensor) -> torch.Tensor:
"""Get encoded features from observations"""
observations = observations.to(self.device)
if self.encoder is not None:
with torch.no_grad():
return self.encoder(observations, train=False)
return observations
class SACObservationEncoder(nn.Module):
"""Encode image and/or state vector observations.
TODO(ke-wang): The original work allows for (1) stacking multiple history frames and (2) using pretrained resnet encoders.
"""
def __init__(self, config: SACConfig):
"""
Creates encoders for pixel and/or state modalities.
"""
super().__init__()
self.config = config
if "observation.image" in config.input_shapes:
self.image_enc_layers = nn.Sequential(
nn.Conv2d(
config.input_shapes["observation.image"][0], config.image_encoder_hidden_dim, 7, stride=2
),
nn.ReLU(),
nn.Conv2d(config.image_encoder_hidden_dim, config.image_encoder_hidden_dim, 5, stride=2),
nn.ReLU(),
nn.Conv2d(config.image_encoder_hidden_dim, config.image_encoder_hidden_dim, 3, stride=2),
nn.ReLU(),
nn.Conv2d(config.image_encoder_hidden_dim, config.image_encoder_hidden_dim, 3, stride=2),
nn.ReLU(),
)
dummy_batch = torch.zeros(1, *config.input_shapes["observation.image"])
with torch.inference_mode():
out_shape = self.image_enc_layers(dummy_batch).shape[1:]
self.image_enc_layers.extend(
nn.Sequential(
nn.Flatten(),
nn.Linear(np.prod(out_shape), config.latent_dim),
nn.LayerNorm(config.latent_dim),
nn.Tanh(),
)
)
if "observation.state" in config.input_shapes:
self.state_enc_layers = nn.Sequential(
nn.Linear(config.input_shapes["observation.state"][0], config.state_encoder_hidden_dim),
nn.ELU(),
nn.Linear(config.state_encoder_hidden_dim, config.latent_dim),
nn.LayerNorm(config.latent_dim),
nn.Tanh(),
)
if "observation.environment_state" in config.input_shapes:
self.env_state_enc_layers = nn.Sequential(
nn.Linear(
config.input_shapes["observation.environment_state"][0], config.state_encoder_hidden_dim
),
nn.ELU(),
nn.Linear(config.state_encoder_hidden_dim, config.latent_dim),
nn.LayerNorm(config.latent_dim),
nn.Tanh(),
)
def forward(self, obs_dict: dict[str, Tensor]) -> Tensor:
"""Encode the image and/or state vector.
Each modality is encoded into a feature vector of size (latent_dim,) and then a uniform mean is taken
over all features.
"""
feat = []
# Concatenate all images along the channel dimension.
image_keys = [k for k in self.config.input_shapes if k.startswith("observation.image")]
for image_key in image_keys:
feat.append(flatten_forward_unflatten(self.image_enc_layers, obs_dict[image_key]))
if "observation.environment_state" in self.config.input_shapes:
feat.append(self.env_state_enc_layers(obs_dict["observation.environment_state"]))
if "observation.state" in self.config.input_shapes:
feat.append(self.state_enc_layers(obs_dict["observation.state"]))
return torch.stack(feat, dim=0).mean(0)
class LagrangeMultiplier(nn.Module):
def __init__(
self,
init_value: float = 1.0,
constraint_shape: Sequence[int] = (),
device: str = "cuda"
):
super().__init__()
self.device = torch.device(device)
init_value = torch.log(torch.exp(torch.tensor(init_value, device=self.device)) - 1)
# Initialize the Lagrange multiplier as a parameter
self.lagrange = nn.Parameter(
torch.full(constraint_shape, init_value, dtype=torch.float32, device=self.device)
)
self.to(self.device)
def forward(
self,
lhs: Optional[torch.Tensor] = None,
rhs: Optional[torch.Tensor] = None
) -> torch.Tensor:
# Get the multiplier value based on parameterization
multiplier = torch.nn.functional.softplus(self.lagrange)
# Return the raw multiplier if no constraint values provided
if lhs is None:
return multiplier
# Move inputs to device
lhs = lhs.to(self.device)
if rhs is not None:
rhs = rhs.to(self.device)
# Use the multiplier to compute the Lagrange penalty
if rhs is None:
rhs = torch.zeros_like(lhs, device=self.device)
diff = lhs - rhs
assert diff.shape == multiplier.shape, f"Shape mismatch: {diff.shape} vs {multiplier.shape}"
return multiplier * diff
# The TanhMultivariateNormalDiag is a probability distribution that represents a transformed normal (Gaussian) distribution where:
# 1. The base distribution is a diagonal multivariate normal distribution
# 2. The samples from this normal distribution are transformed through a tanh function, which squashes the values to be between -1 and 1
# 3. Optionally, the values can be further transformed to fit within arbitrary bounds [low, high] using an affine transformation
# This type of distribution is commonly used in reinforcement learning, particularly for continuous action spaces
class TanhMultivariateNormalDiag(torch.distributions.TransformedDistribution):
def __init__(
self,
loc: torch.Tensor,
scale_diag: torch.Tensor,
low: Optional[torch.Tensor] = None,
high: Optional[torch.Tensor] = None,
):
# Create base normal distribution
base_distribution = torch.distributions.Normal(loc=loc, scale=scale_diag)
# Create list of transforms
transforms = []
# Add tanh transform
transforms.append(torch.distributions.transforms.TanhTransform())
# Add rescaling transform if bounds are provided
if low is not None and high is not None:
transforms.append(
torch.distributions.transforms.AffineTransform(
loc=(high + low) / 2,
scale=(high - low) / 2
)
)
# Initialize parent class
super().__init__(
base_distribution=base_distribution,
transforms=transforms
)
# Store parameters
self.loc = loc
self.scale_diag = scale_diag
self.low = low
self.high = high
def mode(self) -> torch.Tensor:
"""Get the mode of the transformed distribution"""
# The mode of a normal distribution is its mean
mode = self.loc
# Apply transforms
for transform in self.transforms:
mode = transform(mode)
return mode
def rsample(self, sample_shape=torch.Size()) -> torch.Tensor:
"""
Reparameterized sample from the distribution
"""
# Sample from base distribution
x = self.base_dist.rsample(sample_shape)
# Apply transforms
for transform in self.transforms:
x = transform(x)
return x
def log_prob(self, value: torch.Tensor) -> torch.Tensor:
"""
Compute log probability of a value
Includes the log det jacobian for the transforms
"""
# Initialize log prob
log_prob = torch.zeros_like(value[..., 0])
# Inverse transforms to get back to normal distribution
q = value
for transform in reversed(self.transforms):
q = transform.inv(q)
log_prob = log_prob - transform.log_abs_det_jacobian(q, transform(q))
# Add base distribution log prob
log_prob = log_prob + self.base_dist.log_prob(q).sum(-1)
return log_prob
def sample_and_log_prob(self, sample_shape=torch.Size()) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Sample from the distribution and compute log probability
"""
x = self.rsample(sample_shape)
log_prob = self.log_prob(x)
return x, log_prob
def entropy(self) -> torch.Tensor:
"""
Compute entropy of the distribution
"""
# Start with base distribution entropy
entropy = self.base_dist.entropy().sum(-1)
# Add log det jacobian for each transform
x = self.rsample()
for transform in self.transforms:
entropy = entropy + transform.log_abs_det_jacobian(x, transform(x))
x = transform(x)
return entropy
def create_critic_ensemble(critic_class, num_critics: int, device: str = "cuda") -> nn.ModuleList:
"""Creates an ensemble of critic networks"""
critics = nn.ModuleList([critic_class() for _ in range(num_critics)])
return critics.to(device)
def orthogonal_init():
return lambda x: torch.nn.init.orthogonal_(x, gain=1.0)
# borrowed from tdmpc
def flatten_forward_unflatten(fn: Callable[[Tensor], Tensor], image_tensor: Tensor) -> Tensor:
"""Helper to temporarily flatten extra dims at the start of the image tensor.
Args:
fn: Callable that the image tensor will be passed to. It should accept (B, C, H, W) and return
(B, *), where * is any number of dimensions.
image_tensor: An image tensor of shape (**, C, H, W), where ** is any number of dimensions and
can be more than 1 dimensions, generally different from *.
Returns:
A return value from the callable reshaped to (**, *).
"""
if image_tensor.ndim == 4:
return fn(image_tensor)
start_dims = image_tensor.shape[:-3]
inp = torch.flatten(image_tensor, end_dim=-4)
flat_out = fn(inp)
return torch.reshape(flat_out, (*start_dims, *flat_out.shape[1:]))

View File

@@ -120,22 +120,14 @@ def predict_action(observation, policy, device, use_amp):
return action
def init_keyboard_listener(assign_rewards=False):
"""
Initializes a keyboard listener to enable early termination of an episode
or environment reset by pressing the right arrow key ('->'). This may require
sudo permissions to allow the terminal to monitor keyboard events.
Args:
assign_rewards (bool): If True, allows annotating the collected trajectory
with a binary reward at the end of the episode to indicate success.
"""
def init_keyboard_listener():
# Allow to exit early while recording an episode or resetting the environment,
# by tapping the right arrow key '->'. This might require a sudo permission
# to allow your terminal to monitor keyboard events.
events = {}
events["exit_early"] = False
events["rerecord_episode"] = False
events["stop_recording"] = False
if assign_rewards:
events["next.reward"] = 0
if is_headless():
logging.warning(
@@ -160,13 +152,6 @@ def init_keyboard_listener(assign_rewards=False):
print("Escape key pressed. Stopping data recording...")
events["stop_recording"] = True
events["exit_early"] = True
elif assign_rewards and key == keyboard.Key.space:
events["next.reward"] = 1 if events["next.reward"] == 0 else 0
print(
"Space key pressed. Assigning new reward to the subsequent frames. New reward:",
events["next.reward"],
)
except Exception as e:
print(f"Error handling key press: {e}")
@@ -199,7 +184,7 @@ def init_policy(pretrained_policy_name_or_path, policy_overrides):
def warmup_record(
robot,
events,
enable_teleoperation,
enable_teloperation,
warmup_time_s,
display_cameras,
fps,
@@ -210,7 +195,7 @@ def warmup_record(
display_cameras=display_cameras,
events=events,
fps=fps,
teleoperate=enable_teleoperation,
teleoperate=enable_teloperation,
)
@@ -287,8 +272,6 @@ def control_loop(
if dataset is not None:
frame = {**observation, **action}
if "next.reward" in events:
frame["next.reward"] = events["next.reward"]
dataset.add_frame(frame)
if display_cameras and not is_headless():
@@ -318,8 +301,6 @@ def reset_environment(robot, events, reset_time_s):
timestamp = 0
start_vencod_t = time.perf_counter()
if "next.reward" in events:
events["next.reward"] = 0
# Wait if necessary
with tqdm.tqdm(total=reset_time_s, desc="Waiting") as pbar:
@@ -372,7 +353,7 @@ def sanity_check_dataset_robot_compatibility(
mismatches = []
for field, dataset_value, present_value in fields:
diff = DeepDiff(dataset_value, present_value, exclude_regex_paths=[r".*\['info'\]$"])
diff = DeepDiff(dataset_value, present_value)
if diff:
mismatches.append(f"{field}: expected {present_value}, got {dataset_value}")

View File

@@ -1,48 +0,0 @@
# @package _global_
defaults:
- _self_
seed: 13
dataset_repo_id: "dataset_repo_id"
train_split_proportion: 0.8
# Required by logger
env:
name: "classifier"
task: "binary_classification"
training:
num_epochs: 5
batch_size: 16
learning_rate: 1e-4
num_workers: 4
grad_clip_norm: 10
use_amp: true
log_freq: 1
eval_freq: 1 # How often to run validation (in epochs)
save_freq: 1 # How often to save checkpoints (in epochs)
save_checkpoint: true
image_key: "observation.images.phone"
label_key: "next.reward"
eval:
batch_size: 16
num_samples_to_log: 30 # Number of validation samples to log in the table
policy:
name: "hilserl/classifier"
model_name: "facebook/convnext-base-224"
model_type: "cnn"
wandb:
enable: false
project: "classifier-training"
entity: "wandb_entity"
job_name: "classifier_training_0"
disable_artifact: false
device: "mps"
resume: false
output_dir: "output"

View File

@@ -10,7 +10,7 @@ max_relative_target: null
leader_arms:
main:
_target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus
port: /dev/tty.usbmodem58760430441
port: /dev/tty.usbmodem575E0031751
motors:
# name: (index, model)
shoulder_pan: [1, "xl330-m077"]
@@ -23,7 +23,7 @@ leader_arms:
follower_arms:
main:
_target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus
port: /dev/tty.usbmodem585A0083391
port: /dev/tty.usbmodem575E0032081
motors:
# name: (index, model)
shoulder_pan: [1, "xl430-w250"]

View File

@@ -18,7 +18,7 @@ max_relative_target: null
leader_arms:
main:
_target_: lerobot.common.robot_devices.motors.feetech.FeetechMotorsBus
port: /dev/tty.usbmodem58760433331
port: /dev/tty.usbmodem585A0077581
motors:
# name: (index, model)
shoulder_pan: [1, "sts3215"]

View File

@@ -29,6 +29,7 @@ python lerobot/scripts/control_robot.py teleoperate \
```bash
python lerobot/scripts/control_robot.py record \
--fps 30 \
--root tmp/data \
--repo-id $USER/koch_test \
--num-episodes 1 \
--run-compute-stats 0
@@ -37,6 +38,7 @@ python lerobot/scripts/control_robot.py record \
- Visualize dataset:
```bash
python lerobot/scripts/visualize_dataset.py \
--root tmp/data \
--repo-id $USER/koch_test \
--episode-index 0
```
@@ -45,6 +47,7 @@ python lerobot/scripts/visualize_dataset.py \
```bash
python lerobot/scripts/control_robot.py replay \
--fps 30 \
--root tmp/data \
--repo-id $USER/koch_test \
--episode 0
```
@@ -54,6 +57,7 @@ python lerobot/scripts/control_robot.py replay \
```bash
python lerobot/scripts/control_robot.py record \
--fps 30 \
--root data \
--repo-id $USER/koch_pick_place_lego \
--num-episodes 50 \
--warmup-time-s 2 \
@@ -68,12 +72,12 @@ python lerobot/scripts/control_robot.py record \
- Tap escape key 'esc' to stop the data recording.
This might require a sudo permission to allow your terminal to monitor keyboard events.
**NOTE**: You can resume/continue data recording by running the same data recording command and adding `--resume 1`.
If the dataset you want to extend is not on the hub, you also need to add `--local-files-only 1`.
**NOTE**: You can resume/continue data recording by running the same data recording command twice.
To avoid resuming by deleting the dataset, use `--force-override 1`.
- Train on this dataset with the ACT policy:
```bash
python lerobot/scripts/train.py \
DATA_DIR=data python lerobot/scripts/train.py \
policy=act_koch_real \
env=koch_real \
dataset_repo_id=$USER/koch_pick_place_lego \
@@ -84,6 +88,7 @@ python lerobot/scripts/train.py \
```bash
python lerobot/scripts/control_robot.py record \
--fps 30 \
--root data \
--repo-id $USER/eval_act_koch_real \
--num-episodes 10 \
--warmup-time-s 2 \
@@ -186,12 +191,11 @@ def teleoperate(
@safe_disconnect
def record(
robot: Robot,
root: Path,
root: str,
repo_id: str,
single_task: str,
pretrained_policy_name_or_path: str | None = None,
policy_overrides: List[str] | None = None,
assign_rewards: bool = False,
fps: int | None = None,
warmup_time_s: int | float = 2,
episode_time_s: int | float = 10,
@@ -200,7 +204,6 @@ def record(
video: bool = True,
run_compute_stats: bool = True,
push_to_hub: bool = True,
tags: list[str] | None = None,
num_image_writer_processes: int = 0,
num_image_writer_threads_per_camera: int = 4,
display_cameras: bool = True,
@@ -215,9 +218,6 @@ def record(
policy = None
device = None
use_amp = None
extra_features = (
{"next.reward": {"dtype": "int64", "shape": (1,), "names": None}} if assign_rewards else None
)
if single_task:
task = single_task
@@ -258,12 +258,12 @@ def record(
use_videos=video,
image_writer_processes=num_image_writer_processes,
image_writer_threads=num_image_writer_threads_per_camera * len(robot.cameras),
features=extra_features,
)
if not robot.is_connected:
robot.connect()
listener, events = init_keyboard_listener(assign_rewards=assign_rewards)
listener, events = init_keyboard_listener()
# Execute a few seconds without recording to:
# 1. teleoperate the robot to move it in starting position if no policy provided,
@@ -331,7 +331,7 @@ def record(
dataset.consolidate(run_compute_stats)
if push_to_hub:
dataset.push_to_hub(tags=tags)
dataset.push_to_hub()
log_say("Exiting", play_sounds)
return dataset
@@ -345,7 +345,7 @@ def replay(
episode: int,
fps: int | None = None,
play_sounds: bool = True,
local_files_only: bool = False,
local_files_only: bool = True,
):
# TODO(rcadene, aliberts): refactor with control_loop, once `dataset` is an instance of LeRobotDataset
# TODO(rcadene): Add option to record logs
@@ -427,8 +427,8 @@ if __name__ == "__main__":
parser_record.add_argument(
"--root",
type=Path,
default=None,
help="Root directory where the dataset will be stored (e.g. 'dataset/path').",
default="data",
help="Root directory where the dataset will be stored locally at '{root}/{repo_id}' (e.g. 'data/hf_username/dataset_name').",
)
parser_record.add_argument(
"--repo-id",
@@ -436,12 +436,6 @@ if __name__ == "__main__":
default="lerobot/test",
help="Dataset identifier. By convention it should match '{hf_username}/{dataset_name}' (e.g. `lerobot/test`).",
)
parser_record.add_argument(
"--local-files-only",
type=int,
default=0,
help="Use local files only. By default, this script will try to fetch the dataset from the hub if it exists.",
)
parser_record.add_argument(
"--warmup-time-s",
type=int,
@@ -473,12 +467,12 @@ if __name__ == "__main__":
default=1,
help="Upload dataset to Hugging Face hub.",
)
# parser_record.add_argument(
# "--tags",
# type=str,
# nargs="*",
# help="Add tags to your dataset on the hub.",
# )
parser_record.add_argument(
"--tags",
type=str,
nargs="*",
help="Add tags to your dataset on the hub.",
)
parser_record.add_argument(
"--num-image-writer-processes",
type=int,
@@ -501,10 +495,10 @@ if __name__ == "__main__":
),
)
parser_record.add_argument(
"--resume",
"--force-override",
type=int,
default=0,
help="Resume recording on an existing dataset.",
help="By default, data recording is resumed. When set to 1, delete the local directory and start data recording from scratch.",
)
parser_record.add_argument(
"-p",
@@ -521,12 +515,6 @@ if __name__ == "__main__":
nargs="*",
help="Any key=value arguments to override config values (use dots for.nested=overrides)",
)
parser_record.add_argument(
"--assign-rewards",
type=int,
default=0,
help="Enables the assignation of rewards to frames (by default no assignation). When enabled, assign a 0 reward to frames until the space bar is pressed which assign a 1 reward. Press the space bar a second time to assign a 0 reward. The reward assigned is reset to 0 when the episode ends.",
)
parser_replay = subparsers.add_parser("replay", parents=[base_parser])
parser_replay.add_argument(
@@ -535,8 +523,8 @@ if __name__ == "__main__":
parser_replay.add_argument(
"--root",
type=Path,
default=None,
help="Root directory where the dataset will be stored (e.g. 'dataset/path').",
default="data",
help="Root directory where the dataset will be stored locally at '{root}/{repo_id}' (e.g. 'data/hf_username/dataset_name').",
)
parser_replay.add_argument(
"--repo-id",
@@ -544,12 +532,6 @@ if __name__ == "__main__":
default="lerobot/test",
help="Dataset identifier. By convention it should match '{hf_username}/{dataset_name}' (e.g. `lerobot/test`).",
)
parser_replay.add_argument(
"--local-files-only",
type=int,
default=0,
help="Use local files only. By default, this script will try to fetch the dataset from the hub if it exists.",
)
parser_replay.add_argument("--episode", type=int, default=0, help="Index of the episode to replay.")
args = parser.parse_args()

View File

@@ -1,546 +0,0 @@
"""
Utilities to control a robot in simulation.
Useful to record a dataset, replay a recorded episode and record an evaluation dataset.
Examples of usage:
- Unlimited teleoperation at a limited frequency of 30 Hz, to simulate data recording frequency.
You can modify this value depending on how fast your simulation can run:
```bash
python lerobot/scripts/control_robot.py teleoperate \
--fps 30 \
--robot-path lerobot/configs/robot/your_robot_config.yaml \
--sim-config lerobot/configs/env/your_sim_config.yaml
```
- Record one episode in order to test replay:
```bash
python lerobot/scripts/control_sim_robot.py record \
--robot-path lerobot/configs/robot/your_robot_config.yaml \
--sim-config lerobot/configs/env/your_sim_config.yaml \
--fps 30 \
--repo-id $USER/robot_sim_test \
--num-episodes 1 \
--run-compute-stats 0
```
Enable the --push-to-hub 1 to push the recorded dataset to the huggingface hub.
- Visualize dataset:
```bash
python lerobot/scripts/visualize_dataset.py \
--repo-id $USER/robot_sim_test \
--episode-index 0
```
- Replay a sequence of test episodes:
```bash
python lerobot/scripts/control_sim_robot.py replay \
--robot-path lerobot/configs/robot/your_robot_config.yaml \
--sim-config lerobot/configs/env/your_sim_config.yaml \
--fps 30 \
--repo-id $USER/robot_sim_test \
--episode 0
```
Note: The seed is saved, therefore, during replay we can load the same environment state as the one during collection.
- Record a full dataset in order to train a policy,
30 seconds of recording for each episode, and 10 seconds to reset the environment in between episodes:
```bash
python lerobot/scripts/control_sim_robot.py record \
--robot-path lerobot/configs/robot/your_robot_config.yaml \
--sim-config lerobot/configs/env/your_sim_config.yaml \
--fps 30 \
--repo-id $USER/robot_sim_test \
--num-episodes 50 \
--episode-time-s 30 \
```
**NOTE**: You can use your keyboard to control data recording flow.
- Tap right arrow key '->' to early exit while recording an episode and go to reseting the environment.
- Tap right arrow key '->' to early exit while reseting the environment and got to recording the next episode.
- Tap left arrow key '<-' to early exit and re-record the current episode.
- Tap escape key 'esc' to stop the data recording.
This might require a sudo permission to allow your terminal to monitor keyboard events.
**NOTE**: You can resume/continue data recording by running the same data recording command twice.
"""
import argparse
import importlib
import logging
import time
from pathlib import Path
import cv2
import gymnasium as gym
import numpy as np
import torch
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
from lerobot.common.robot_devices.control_utils import (
init_keyboard_listener,
init_policy,
is_headless,
log_control_info,
predict_action,
sanity_check_dataset_name,
sanity_check_dataset_robot_compatibility,
stop_recording,
)
from lerobot.common.robot_devices.robots.factory import make_robot
from lerobot.common.robot_devices.robots.utils import Robot
from lerobot.common.robot_devices.utils import busy_wait
from lerobot.common.utils.utils import init_hydra_config, init_logging, log_say
DEFAULT_FEATURES = {
"next.reward": {
"dtype": "float32",
"shape": (1,),
"names": None,
},
"next.success": {
"dtype": "bool",
"shape": (1,),
"names": None,
},
"seed": {
"dtype": "int64",
"shape": (1,),
"names": None,
},
"timestamp": {
"dtype": "float32",
"shape": (1,),
"names": None,
},
}
########################################################################################
# Utilities
########################################################################################
def none_or_int(value):
if value == "None":
return None
return int(value)
def init_sim_calibration(robot, cfg):
# Constants necessary for transforming the joint pos of the real robot to the sim
# depending on the robot discription used in that sim.
start_pos = np.array(robot.leader_arms.main.calibration["start_pos"])
axis_directions = np.array(cfg.get("axis_directions", [1]))
offsets = np.array(cfg.get("offsets", [0])) * np.pi
return {"start_pos": start_pos, "axis_directions": axis_directions, "offsets": offsets}
def real_positions_to_sim(real_positions, axis_directions, start_pos, offsets):
"""Counts - starting position -> radians -> align axes -> offset"""
return axis_directions * (real_positions - start_pos) * 2.0 * np.pi / 4096 + offsets
########################################################################################
# Control modes
########################################################################################
def teleoperate(env, robot: Robot, process_action_fn, teleop_time_s=None):
env = env()
env.reset()
start_teleop_t = time.perf_counter()
while True:
leader_pos = robot.leader_arms.main.read("Present_Position")
action = process_action_fn(leader_pos)
env.step(np.expand_dims(action, 0))
if teleop_time_s is not None and time.perf_counter() - start_teleop_t > teleop_time_s:
print("Teleoperation processes finished.")
break
def record(
env,
robot: Robot,
process_action_from_leader,
root: Path,
repo_id: str,
task: str,
fps: int | None = None,
tags: list[str] | None = None,
pretrained_policy_name_or_path: str = None,
policy_overrides: bool | None = None,
episode_time_s: int = 30,
num_episodes: int = 50,
video: bool = True,
push_to_hub: bool = True,
num_image_writer_processes: int = 0,
num_image_writer_threads_per_camera: int = 4,
display_cameras: bool = False,
play_sounds: bool = True,
resume: bool = False,
local_files_only: bool = False,
run_compute_stats: bool = True,
) -> LeRobotDataset:
# Load pretrained policy
policy = None
if pretrained_policy_name_or_path is not None:
policy, policy_fps, device, use_amp = init_policy(pretrained_policy_name_or_path, policy_overrides)
if fps is None:
fps = policy_fps
logging.warning(f"No fps provided, so using the fps from policy config ({policy_fps}).")
if policy is None and process_action_from_leader is None:
raise ValueError("Either policy or process_action_fn has to be set to enable control in sim.")
# initialize listener before sim env
listener, events = init_keyboard_listener()
# create sim env
env = env()
# Create empty dataset or load existing saved episodes
num_cameras = sum([1 if "image" in key else 0 for key in env.observation_space])
# get image keys
image_keys = [key for key in env.observation_space if "image" in key]
state_keys_dict = env_cfg.state_keys
if resume:
dataset = LeRobotDataset(
repo_id,
root=root,
local_files_only=local_files_only,
)
dataset.start_image_writer(
num_processes=num_image_writer_processes,
num_threads=num_image_writer_threads_per_camera * num_cameras,
)
sanity_check_dataset_robot_compatibility(dataset, robot, fps, video)
else:
features = DEFAULT_FEATURES
# add image keys to features
for key in image_keys:
shape = env.observation_space[key].shape
if not key.startswith("observation.image."):
key = "observation.image." + key
features[key] = {"dtype": "video", "names": ["channel", "height", "width"], "shape": shape}
for key, obs_key in state_keys_dict.items():
features[key] = {
"dtype": "float32",
"names": None,
"shape": env.observation_space[obs_key].shape,
}
features["action"] = {"dtype": "float32", "shape": env.action_space.shape, "names": None}
# Create empty dataset or load existing saved episodes
sanity_check_dataset_name(repo_id, policy)
dataset = LeRobotDataset.create(
repo_id,
fps,
root=root,
features=features,
use_videos=video,
image_writer_processes=num_image_writer_processes,
image_writer_threads=num_image_writer_threads_per_camera * num_cameras,
)
recorded_episodes = 0
while True:
log_say(f"Recording episode {dataset.num_episodes}", play_sounds)
if events is None:
events = {"exit_early": False}
if episode_time_s is None:
episode_time_s = float("inf")
timestamp = 0
start_episode_t = time.perf_counter()
seed = np.random.randint(0, 1e5)
observation, info = env.reset(seed=seed)
while timestamp < episode_time_s:
start_loop_t = time.perf_counter()
if policy is not None:
action = predict_action(observation, policy, device, use_amp)
else:
leader_pos = robot.leader_arms.main.read("Present_Position")
action = process_action_from_leader(leader_pos)
observation, reward, terminated, _, info = env.step(action)
success = info.get("is_success", False)
env_timestamp = info.get("timestamp", dataset.episode_buffer["size"] / fps)
frame = {
"action": torch.from_numpy(action),
"next.reward": reward,
"next.success": success,
"seed": seed,
"timestamp": env_timestamp,
}
for key in image_keys:
if not key.startswith("observation.image"):
frame["observation.image." + key] = observation[key]
else:
frame[key] = observation[key]
for key, obs_key in state_keys_dict.items():
frame[key] = torch.from_numpy(observation[obs_key])
dataset.add_frame(frame)
if display_cameras and not is_headless():
for key in image_keys:
cv2.imshow(key, cv2.cvtColor(observation[key], cv2.COLOR_RGB2BGR))
cv2.waitKey(1)
if fps is not None:
dt_s = time.perf_counter() - start_loop_t
busy_wait(1 / fps - dt_s)
dt_s = time.perf_counter() - start_loop_t
log_control_info(robot, dt_s, fps=fps)
timestamp = time.perf_counter() - start_episode_t
if events["exit_early"] or terminated:
events["exit_early"] = False
break
if events["rerecord_episode"]:
log_say("Re-record episode", play_sounds)
events["rerecord_episode"] = False
events["exit_early"] = False
dataset.clear_episode_buffer()
continue
dataset.save_episode(task=task)
recorded_episodes += 1
if events["stop_recording"] or recorded_episodes >= num_episodes:
break
else:
logging.info("Waiting for a few seconds before starting next episode recording...")
busy_wait(3)
log_say("Stop recording", play_sounds, blocking=True)
stop_recording(robot, listener, display_cameras)
if run_compute_stats:
logging.info("Computing dataset statistics")
dataset.consolidate(run_compute_stats)
if push_to_hub:
dataset.push_to_hub(tags=tags)
log_say("Exiting", play_sounds)
return dataset
def replay(
env, root: Path, repo_id: str, episode: int, fps: int | None = None, local_files_only: bool = True
):
env = env()
local_dir = Path(root) / repo_id
if not local_dir.exists():
raise ValueError(local_dir)
dataset = LeRobotDataset(repo_id, root=root, local_files_only=local_files_only)
items = dataset.hf_dataset.select_columns("action")
seeds = dataset.hf_dataset.select_columns("seed")["seed"]
from_idx = dataset.episode_data_index["from"][episode].item()
to_idx = dataset.episode_data_index["to"][episode].item()
env.reset(seed=seeds[from_idx].item())
logging.info("Replaying episode")
log_say("Replaying episode", play_sounds=True)
for idx in range(from_idx, to_idx):
start_episode_t = time.perf_counter()
action = items[idx]["action"]
env.step(action.unsqueeze(0).numpy())
dt_s = time.perf_counter() - start_episode_t
busy_wait(1 / fps - dt_s)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(dest="mode", required=True)
# Set common options for all the subparsers
base_parser = argparse.ArgumentParser(add_help=False)
base_parser.add_argument(
"--robot-path",
type=str,
default="lerobot/configs/robot/koch.yaml",
help="Path to robot yaml file used to instantiate the robot using `make_robot` factory function.",
)
base_parser.add_argument(
"--sim-config",
help="Path to a yaml config you want to use for initializing a sim environment based on gym ",
)
parser_record = subparsers.add_parser("teleoperate", parents=[base_parser])
parser_record = subparsers.add_parser("record", parents=[base_parser])
parser_record.add_argument(
"--fps", type=none_or_int, default=None, help="Frames per second (set to None to disable)"
)
parser_record.add_argument(
"--root",
type=Path,
default=None,
help="Root directory where the dataset will be stored locally at '{root}/{repo_id}' (e.g. 'data/hf_username/dataset_name').",
)
parser_record.add_argument(
"--repo-id",
type=str,
default="lerobot/test",
help="Dataset identifier. By convention it should match '{hf_username}/{dataset_name}' (e.g. `lerobot/test`).",
)
parser_record.add_argument(
"--episode-time-s",
type=int,
default=60,
help="Number of seconds for data recording for each episode.",
)
parser_record.add_argument(
"--task",
type=str,
required=True,
help="A description of the task preformed during recording that can be used as a language instruction.",
)
parser_record.add_argument("--num-episodes", type=int, default=50, help="Number of episodes to record.")
parser_record.add_argument(
"--run-compute-stats",
type=int,
default=1,
help="By default, run the computation of the data statistics at the end of data collection. Compute intensive and not required to just replay an episode.",
)
parser_record.add_argument(
"--push-to-hub",
type=int,
default=1,
help="Upload dataset to Hugging Face hub.",
)
parser_record.add_argument(
"--tags",
type=str,
nargs="*",
help="Add tags to your dataset on the hub.",
)
parser_record.add_argument(
"--num-image-writer-processes",
type=int,
default=0,
help=(
"Number of subprocesses handling the saving of frames as PNGs. Set to 0 to use threads only; "
"set to ≥1 to use subprocesses, each using threads to write images. The best number of processes "
"and threads depends on your system. We recommend 4 threads per camera with 0 processes. "
"If fps is unstable, adjust the thread count. If still unstable, try using 1 or more subprocesses."
),
)
parser_record.add_argument(
"--num-image-writer-threads-per-camera",
type=int,
default=4,
help=(
"Number of threads writing the frames as png images on disk, per camera. "
"Too much threads might cause unstable teleoperation fps due to main thread being blocked. "
"Not enough threads might cause low camera fps."
),
)
parser_record.add_argument(
"--display-cameras",
type=int,
default=0,
help="Visualize image observations with opencv.",
)
parser_record.add_argument(
"--resume",
type=int,
default=0,
help="Resume recording on an existing dataset.",
)
parser_replay = subparsers.add_parser("replay", parents=[base_parser])
parser_replay.add_argument(
"--fps", type=none_or_int, default=None, help="Frames per second (set to None to disable)"
)
parser_replay.add_argument(
"--root",
type=Path,
default=None,
help="Root directory where the dataset will be stored locally (e.g. 'data/hf_username/dataset_name'). By default, stored in cache folder.",
)
parser_replay.add_argument(
"--repo-id",
type=str,
default="lerobot/test",
help="Dataset identifier. By convention it should match '{hf_username}/{dataset_name}' (e.g. `lerobot/test`).",
)
parser_replay.add_argument("--episode", type=int, default=0, help="Index of the episodes to replay.")
args = parser.parse_args()
init_logging()
control_mode = args.mode
robot_path = args.robot_path
env_config_path = args.sim_config
kwargs = vars(args)
del kwargs["mode"]
del kwargs["robot_path"]
del kwargs["sim_config"]
# make gym env
env_cfg = init_hydra_config(env_config_path)
importlib.import_module(f"gym_{env_cfg.env.name}")
def env_constructor():
return gym.make(env_cfg.env.handle, disable_env_checker=True, **env_cfg.env.gym)
robot = None
process_leader_actions_fn = None
if control_mode in ["teleoperate", "record"]:
# make robot
robot_overrides = ["~cameras", "~follower_arms"]
robot_cfg = init_hydra_config(robot_path, robot_overrides)
robot = make_robot(robot_cfg)
robot.connect()
calib_kwgs = init_sim_calibration(robot, env_cfg.calibration)
def process_leader_actions_fn(action):
return real_positions_to_sim(action, **calib_kwgs)
robot.leader_arms.main.calibration = None
if control_mode == "teleoperate":
teleoperate(env_constructor, robot, process_leader_actions_fn)
elif control_mode == "record":
record(env_constructor, robot, process_leader_actions_fn, **kwargs)
elif control_mode == "replay":
replay(env_constructor, **kwargs)
else:
raise ValueError(
f"Invalid control mode: '{control_mode}', only valid modes are teleoperate, record and replay."
)
if robot and robot.is_connected:
# Disconnect manually to avoid a "Core dump" during process
# termination due to camera threads not properly exiting.
robot.disconnect()

View File

@@ -1,335 +0,0 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluate a policy by running rollouts on the real robot and computing metrics.
Usage examples: evaluate a checkpoint from the LeRobot training script for 10 episodes.
```
python lerobot/scripts/eval_on_robot.py \
-p outputs/train/model/checkpoints/005000/pretrained_model \
eval.n_episodes=10
```
**NOTE** (michel-aractingi): This script is incomplete and it is being prepared
for running training on the real robot.
"""
import argparse
import logging
import time
from copy import deepcopy
import numpy as np
import torch
from tqdm import trange
from lerobot.common.policies.policy_protocol import Policy
from lerobot.common.robot_devices.control_utils import busy_wait, is_headless
from lerobot.common.robot_devices.robots.factory import Robot, make_robot
from lerobot.common.utils.utils import (
init_hydra_config,
init_logging,
log_say,
)
def rollout(robot: Robot, policy: Policy, fps: int, control_time_s: float = 20, use_amp: bool = True) -> dict:
"""Run a batched policy rollout on the real robot.
The return dictionary contains:
"robot": A a dictionary of (batch, sequence + 1, *) tensors mapped to observation
keys. NOTE the that this has an extra sequence element relative to the other keys in the
dictionary. This is because an extra observation is included for after the environment is
terminated or truncated.
"action": A (batch, sequence, action_dim) tensor of actions applied based on the observations (not
including the last observations).
"reward": A (batch, sequence) tensor of rewards received for applying the actions.
"success": A (batch, sequence) tensor of success conditions (the only time this can be True is upon
environment termination/truncation).
"done": A (batch, sequence) tensor of **cumulative** done conditions. For any given batch element,
the first True is followed by True's all the way till the end. This can be used for masking
extraneous elements from the sequences above.
Args:
robot: The robot class that defines the interface with the real robot.
policy: The policy. Must be a PyTorch nn module.
Returns:
The dictionary described above.
"""
# assert isinstance(policy, nn.Module), "Policy must be a PyTorch nn module."
# device = get_device_from_parameters(policy)
# define keyboard listener
listener, events = init_keyboard_listener()
# Reset the policy. TODO (michel-aractingi) add real policy evaluation once the code is ready.
# policy.reset()
# Get observation from real robot
observation = robot.capture_observation()
# Calculate reward. TODO (michel-aractingi)
# in HIL-SERL it will be with a reward classifier
reward = calculate_reward(observation)
all_observations = []
all_actions = []
all_rewards = []
all_successes = []
start_episode_t = time.perf_counter()
timestamp = 0.0
while timestamp < control_time_s:
start_loop_t = time.perf_counter()
all_observations.append(deepcopy(observation))
# observation = {key: observation[key].to(device, non_blocking=True) for key in observation}
# Apply the next action.
while events["pause_policy"] and not events["human_intervention_step"]:
busy_wait(0.5)
if events["human_intervention_step"]:
# take over the robot's actions
observation, action = robot.teleop_step(record_data=True)
action = action["action"] # teleop step returns torch tensors but in a dict
else:
# explore with policy
with torch.inference_mode():
action = robot.follower_arms["main"].read("Present_Position")
action = torch.from_numpy(action)
robot.send_action(action)
# action = predict_action(observation, policy, device, use_amp)
observation = robot.capture_observation()
# Calculate reward
# in HIL-SERL it will be with a reward classifier
reward = calculate_reward(observation)
all_actions.append(action)
all_rewards.append(torch.from_numpy(reward))
all_successes.append(torch.tensor([False]))
dt_s = time.perf_counter() - start_loop_t
busy_wait(1 / fps - dt_s)
timestamp = time.perf_counter() - start_episode_t
if events["exit_early"]:
events["exit_early"] = False
events["human_intervention_step"] = False
events["pause_policy"] = False
break
all_observations.append(deepcopy(observation))
dones = torch.tensor([False] * len(all_actions))
dones[-1] = True
# Stack the sequence along the first dimension so that we have (batch, sequence, *) tensors.
ret = {
"action": torch.stack(all_actions, dim=1),
"next.reward": torch.stack(all_rewards, dim=1),
"next.success": torch.stack(all_successes, dim=1),
"done": dones,
}
stacked_observations = {}
for key in all_observations[0]:
stacked_observations[key] = torch.stack([obs[key] for obs in all_observations], dim=1)
ret["observation"] = stacked_observations
listener.stop()
return ret
def eval_policy(
robot: Robot,
policy: torch.nn.Module,
fps: float,
n_episodes: int,
control_time_s: int = 20,
use_amp: bool = True,
) -> dict:
"""
Args:
env: The batch of environments.
policy: The policy.
n_episodes: The number of episodes to evaluate.
Returns:
Dictionary with metrics and data regarding the rollouts.
"""
# TODO (michel-aractingi) comment this out for testing with a fixed policy
# assert isinstance(policy, Policy)
# policy.eval()
sum_rewards = []
max_rewards = []
successes = []
rollouts = []
start_eval = time.perf_counter()
progbar = trange(n_episodes, desc="Evaluating policy on real robot")
for _batch_idx in progbar:
rollout_data = rollout(robot, policy, fps, control_time_s, use_amp)
rollouts.append(rollout_data)
sum_rewards.append(sum(rollout_data["next.reward"]))
max_rewards.append(max(rollout_data["next.reward"]))
successes.append(rollout_data["next.success"][-1])
info = {
"per_episode": [
{
"episode_ix": i,
"sum_reward": sum_reward,
"max_reward": max_reward,
"pc_success": success * 100,
}
for i, (sum_reward, max_reward, success) in enumerate(
zip(
sum_rewards[:n_episodes],
max_rewards[:n_episodes],
successes[:n_episodes],
strict=False,
)
)
],
"aggregated": {
"avg_sum_reward": float(np.nanmean(torch.cat(sum_rewards[:n_episodes]))),
"avg_max_reward": float(np.nanmean(torch.cat(max_rewards[:n_episodes]))),
"pc_success": float(np.nanmean(torch.cat(successes[:n_episodes])) * 100),
"eval_s": time.time() - start_eval,
"eval_ep_s": (time.time() - start_eval) / n_episodes,
},
}
if robot.is_connected:
robot.disconnect()
return info
def calculate_reward(observation):
"""
Method to calculate reward function in some way.
In HIL-SERL this is done through defining a reward classifier
"""
# reward = reward_classifier(observation)
return np.array([0.0])
def init_keyboard_listener():
# Allow to exit early while recording an episode or resetting the environment,
# by tapping the right arrow key '->'. This might require a sudo permission
# to allow your terminal to monitor keyboard events.
events = {}
events["exit_early"] = False
events["rerecord_episode"] = False
events["pause_policy"] = False
events["human_intervention_step"] = False
if is_headless():
logging.warning(
"Headless environment detected. On-screen cameras display and keyboard inputs will not be available."
)
listener = None
return listener, events
# Only import pynput if not in a headless environment
from pynput import keyboard
def on_press(key):
try:
if key == keyboard.Key.right:
print("Right arrow key pressed. Exiting loop...")
events["exit_early"] = True
elif key == keyboard.Key.left:
print("Left arrow key pressed. Exiting loop and rerecord the last episode...")
events["rerecord_episode"] = True
events["exit_early"] = True
elif key == keyboard.Key.space:
# check if first space press then pause the policy for the user to get ready
# if second space press then the user is ready to start intervention
if not events["pause_policy"]:
print(
"Space key pressed. Human intervention required.\n"
"Place the leader in similar pose to the follower and press space again."
)
events["pause_policy"] = True
log_say("Human intervention stage. Get ready to take over.", play_sounds=True)
else:
events["human_intervention_step"] = True
print("Space key pressed. Human intervention starting.")
log_say("Starting human intervention.", play_sounds=True)
except Exception as e:
print(f"Error handling key press: {e}")
listener = keyboard.Listener(on_press=on_press)
listener.start()
return listener, events
if __name__ == "__main__":
init_logging()
parser = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
)
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument(
"--robot-path",
type=str,
default="lerobot/configs/robot/koch.yaml",
help="Path to robot yaml file used to instantiate the robot using `make_robot` factory function.",
)
group.add_argument(
"--robot-overrides",
type=str,
nargs="*",
help="Any key=value arguments to override config values (use dots for.nested=overrides)",
)
group.add_argument(
"-p",
"--pretrained-policy-name-or-path",
help=(
"Either the repo ID of a model hosted on the Hub or a path to a directory containing weights "
"saved using `Policy.save_pretrained`. If not provided, the policy is initialized from scratch "
"(useful for debugging). This argument is mutually exclusive with `--config`."
),
)
group.add_argument(
"--config",
help=(
"Path to a yaml config you want to use for initializing a policy from scratch (useful for "
"debugging). This argument is mutually exclusive with `--pretrained-policy-name-or-path` (`-p`)."
),
)
parser.add_argument("--revision", help="Optionally provide the Hugging Face Hub revision ID.")
parser.add_argument(
"--out-dir",
help=(
"Where to save the evaluation outputs. If not provided, outputs are saved in "
"outputs/eval/{timestamp}_{env_name}_{policy_name}"
),
)
args = parser.parse_args()
robot_cfg = init_hydra_config(args.robot_path, args.robot_overrides)
robot = make_robot(robot_cfg)
if not robot.is_connected:
robot.connect()
eval_policy(robot, None, fps=40, n_episodes=2, control_time_s=100)

View File

@@ -66,7 +66,7 @@ def get_from_raw_to_lerobot_format_fn(raw_format: str):
from lerobot.common.datasets.push_dataset_to_hub.umi_zarr_format import from_raw_to_lerobot_format
elif raw_format == "aloha_hdf5":
from lerobot.common.datasets.push_dataset_to_hub.aloha_hdf5_format import from_raw_to_lerobot_format
elif raw_format in ["rlds", "openx"]:
elif "openx_rlds" in raw_format:
from lerobot.common.datasets.push_dataset_to_hub.openx_rlds_format import from_raw_to_lerobot_format
elif raw_format == "dora_parquet":
from lerobot.common.datasets.push_dataset_to_hub.dora_parquet_format import from_raw_to_lerobot_format
@@ -204,14 +204,24 @@ def push_dataset_to_hub(
# convert dataset from original raw format to LeRobot format
from_raw_to_lerobot_format = get_from_raw_to_lerobot_format_fn(raw_format)
hf_dataset, episode_data_index, info = from_raw_to_lerobot_format(
raw_dir,
videos_dir,
fps,
video,
episodes,
encoding,
)
fmt_kwgs = {
"raw_dir": raw_dir,
"videos_dir": videos_dir,
"fps": fps,
"video": video,
"episodes": episodes,
"encoding": encoding,
}
if "openx_rlds." in raw_format:
# Support for official OXE dataset name inside `raw_format`.
# For instance, `raw_format="oxe_rlds"` uses the default formating (TODO what does that mean?),
# and `raw_format="oxe_rlds.bridge_orig"` uses the brdige_orig formating
_, openx_dataset_name = raw_format.split(".")
print(f"Converting dataset [{openx_dataset_name}] from 'openx_rlds' to LeRobot format.")
fmt_kwgs["openx_dataset_name"] = openx_dataset_name
hf_dataset, episode_data_index, info = from_raw_to_lerobot_format(**fmt_kwgs)
lerobot_dataset = LeRobotDataset.from_preloaded(
repo_id=repo_id,
@@ -280,7 +290,7 @@ def main():
"--raw-format",
type=str,
required=True,
help="Dataset type (e.g. `pusht_zarr`, `umi_zarr`, `aloha_hdf5`, `xarm_pkl`, `dora_parquet`, `rlds`, `openx`).",
help="Dataset type (e.g. `pusht_zarr`, `umi_zarr`, `aloha_hdf5`, `xarm_pkl`, `dora_parquet`, `openx_rlds`).",
)
parser.add_argument(
"--repo-id",

View File

@@ -1,310 +0,0 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import time
from contextlib import nullcontext
from pathlib import Path
from pprint import pformat
import hydra
import torch
import torch.nn as nn
import wandb
from deepdiff import DeepDiff
from omegaconf import DictConfig, OmegaConf
from termcolor import colored
from torch import optim
from torch.cuda.amp import GradScaler
from torch.utils.data import DataLoader, WeightedRandomSampler, random_split
from tqdm import tqdm
from lerobot.common.datasets.factory import resolve_delta_timestamps
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
from lerobot.common.logger import Logger
from lerobot.common.policies.factory import _policy_cfg_from_hydra_cfg
from lerobot.common.policies.hilserl.classifier.configuration_classifier import ClassifierConfig
from lerobot.common.policies.hilserl.classifier.modeling_classifier import Classifier
from lerobot.common.utils.utils import (
format_big_number,
get_safe_torch_device,
init_hydra_config,
set_global_seed,
)
def get_model(cfg, logger):
classifier_config = _policy_cfg_from_hydra_cfg(ClassifierConfig, cfg)
model = Classifier(classifier_config)
if cfg.resume:
model.load_state_dict(Classifier.from_pretrained(str(logger.last_pretrained_model_dir)).state_dict())
return model
def create_balanced_sampler(dataset, cfg):
# Creates a weighted sampler to handle class imbalance
labels = torch.tensor([item[cfg.training.label_key] for item in dataset])
_, counts = torch.unique(labels, return_counts=True)
class_weights = 1.0 / counts.float()
sample_weights = class_weights[labels]
return WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)
def train_epoch(model, train_loader, criterion, optimizer, grad_scaler, device, logger, step, cfg):
# Single epoch training loop with AMP support and progress tracking
model.train()
correct = 0
total = 0
pbar = tqdm(train_loader, desc="Training")
for batch_idx, batch in enumerate(pbar):
start_time = time.perf_counter()
images = batch[cfg.training.image_key].to(device)
labels = batch[cfg.training.label_key].float().to(device)
# Forward pass with optional AMP
with torch.autocast(device_type=device.type) if cfg.training.use_amp else nullcontext():
outputs = model(images)
loss = criterion(outputs.logits, labels)
# Backward pass with gradient scaling if AMP enabled
optimizer.zero_grad()
if cfg.training.use_amp:
grad_scaler.scale(loss).backward()
grad_scaler.step(optimizer)
grad_scaler.update()
else:
loss.backward()
optimizer.step()
# Track metrics
if model.config.num_classes == 2:
predictions = (torch.sigmoid(outputs.logits) > 0.5).float()
else:
predictions = torch.argmax(outputs.logits, dim=1)
correct += (predictions == labels).sum().item()
total += labels.size(0)
current_acc = 100 * correct / total
train_info = {
"loss": loss.item(),
"accuracy": current_acc,
"dataloading_s": time.perf_counter() - start_time,
}
logger.log_dict(train_info, step + batch_idx, mode="train")
pbar.set_postfix({"loss": f"{loss.item():.4f}", "acc": f"{current_acc:.2f}%"})
def validate(model, val_loader, criterion, device, logger, cfg, num_samples_to_log=8):
# Validation loop with metric tracking and sample logging
model.eval()
correct = 0
total = 0
batch_start_time = time.perf_counter()
samples = []
running_loss = 0
with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.training.use_amp else nullcontext():
for batch in tqdm(val_loader, desc="Validation"):
images = batch[cfg.training.image_key].to(device)
labels = batch[cfg.training.label_key].float().to(device)
outputs = model(images)
loss = criterion(outputs.logits, labels)
# Track metrics
if model.config.num_classes == 2:
predictions = (torch.sigmoid(outputs.logits) > 0.5).float()
else:
predictions = torch.argmax(outputs.logits, dim=1)
correct += (predictions == labels).sum().item()
total += labels.size(0)
running_loss += loss.item()
# Log sample predictions for visualization
if len(samples) < num_samples_to_log:
for i in range(min(num_samples_to_log - len(samples), len(images))):
if model.config.num_classes == 2:
confidence = round(outputs.probabilities[i].item(), 3)
else:
confidence = [round(prob, 3) for prob in outputs.probabilities[i].tolist()]
samples.append(
{
"image": wandb.Image(images[i].cpu()),
"true_label": labels[i].item(),
"predicted": predictions[i].item(),
"confidence": confidence,
}
)
accuracy = 100 * correct / total
avg_loss = running_loss / len(val_loader)
eval_info = {
"loss": avg_loss,
"accuracy": accuracy,
"eval_s": time.perf_counter() - batch_start_time,
"eval/prediction_samples": wandb.Table(
data=[[s["image"], s["true_label"], s["predicted"], f"{s['confidence']}"] for s in samples],
columns=["Image", "True Label", "Predicted", "Confidence"],
)
if logger._cfg.wandb.enable
else None,
}
return accuracy, eval_info
@hydra.main(version_base="1.2", config_path="../configs", config_name="hilserl_classifier")
def train(cfg: DictConfig) -> None:
# Main training pipeline with support for resuming training
logging.info(OmegaConf.to_yaml(cfg))
# Initialize training environment
device = get_safe_torch_device(cfg.device, log=True)
set_global_seed(cfg.seed)
out_dir = Path(cfg.output_dir)
out_dir.mkdir(parents=True, exist_ok=True)
logger = Logger(cfg, out_dir, cfg.wandb.job_name if cfg.wandb.enable else None)
# Setup dataset and dataloaders
dataset = LeRobotDataset(cfg.dataset_repo_id)
logging.info(f"Dataset size: {len(dataset)}")
train_size = int(cfg.train_split_proportion * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
sampler = create_balanced_sampler(train_dataset, cfg)
train_loader = DataLoader(
train_dataset,
batch_size=cfg.training.batch_size,
num_workers=cfg.training.num_workers,
sampler=sampler,
pin_memory=True,
)
val_loader = DataLoader(
val_dataset,
batch_size=cfg.eval.batch_size,
shuffle=False,
num_workers=cfg.training.num_workers,
pin_memory=True,
)
# Resume training if requested
step = 0
best_val_acc = 0
if cfg.resume:
if not Logger.get_last_checkpoint_dir(out_dir).exists():
raise RuntimeError(
"You have set resume=True, but there is no model checkpoint in "
f"{Logger.get_last_checkpoint_dir(out_dir)}"
)
checkpoint_cfg_path = str(Logger.get_last_pretrained_model_dir(out_dir) / "config.yaml")
logging.info(
colored(
"You have set resume=True, indicating that you wish to resume a run",
color="yellow",
attrs=["bold"],
)
)
# Load and validate checkpoint configuration
checkpoint_cfg = init_hydra_config(checkpoint_cfg_path)
# Check for differences between the checkpoint configuration and provided configuration.
# Hack to resolve the delta_timestamps ahead of time in order to properly diff.
resolve_delta_timestamps(cfg)
diff = DeepDiff(OmegaConf.to_container(checkpoint_cfg), OmegaConf.to_container(cfg))
# Ignore the `resume` and parameters.
if "values_changed" in diff and "root['resume']" in diff["values_changed"]:
del diff["values_changed"]["root['resume']"]
if len(diff) > 0:
logging.warning(
"At least one difference was detected between the checkpoint configuration and "
f"the provided configuration: \n{pformat(diff)}\nNote that the checkpoint configuration "
"takes precedence.",
)
# Use the checkpoint config instead of the provided config (but keep `resume` parameter).
cfg = checkpoint_cfg
cfg.resume = True
# Initialize model and training components
model = get_model(cfg=cfg, logger=logger).to(device)
optimizer = optim.AdamW(model.parameters(), lr=cfg.training.learning_rate)
# Use BCEWithLogitsLoss for binary classification and CrossEntropyLoss for multi-class
criterion = nn.BCEWithLogitsLoss() if model.config.num_classes == 2 else nn.CrossEntropyLoss()
grad_scaler = GradScaler(enabled=cfg.training.use_amp)
# Log model parameters
num_learnable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
num_total_params = sum(p.numel() for p in model.parameters())
logging.info(f"Learnable parameters: {format_big_number(num_learnable_params)}")
logging.info(f"Total parameters: {format_big_number(num_total_params)}")
if cfg.resume:
step = logger.load_last_training_state(optimizer, None)
# Training loop with validation and checkpointing
for epoch in range(cfg.training.num_epochs):
logging.info(f"\nEpoch {epoch+1}/{cfg.training.num_epochs}")
train_epoch(model, train_loader, criterion, optimizer, grad_scaler, device, logger, step, cfg)
# Periodic validation
if cfg.training.eval_freq > 0 and (epoch + 1) % cfg.training.eval_freq == 0:
val_acc, eval_info = validate(
model,
val_loader,
criterion,
device,
logger,
cfg,
)
logger.log_dict(eval_info, step + len(train_loader), mode="eval")
# Save best model
if val_acc > best_val_acc:
best_val_acc = val_acc
logger.save_checkpoint(
train_step=step + len(train_loader),
policy=model,
optimizer=optimizer,
scheduler=None,
identifier="best",
)
# Periodic checkpointing
if cfg.training.save_checkpoint and (epoch + 1) % cfg.training.save_freq == 0:
logger.save_checkpoint(
train_step=step + len(train_loader),
policy=model,
optimizer=optimizer,
scheduler=None,
identifier=f"{epoch+1:06d}",
)
step += len(train_loader)
logging.info("Training completed")
if __name__ == "__main__":
train()

View File

@@ -207,17 +207,11 @@ def main():
required=True,
help="Episode to visualize.",
)
parser.add_argument(
"--local-files-only",
type=int,
default=0,
help="Use local files only. By default, this script will try to fetch the dataset from the hub if it exists.",
)
parser.add_argument(
"--root",
type=Path,
default=None,
help="Root directory for the dataset stored locally (e.g. `--root data`). By default, the dataset will be loaded from hugging face cache folder, or downloaded from the hub if available.",
help="Root directory for a dataset stored locally (e.g. `--root data`). By default, the dataset will be loaded from hugging face cache folder, or downloaded from the hub if available.",
)
parser.add_argument(
"--output-dir",
@@ -275,10 +269,9 @@ def main():
kwargs = vars(args)
repo_id = kwargs.pop("repo_id")
root = kwargs.pop("root")
local_files_only = kwargs.pop("local_files_only")
logging.info("Loading dataset")
dataset = LeRobotDataset(repo_id, root=root, local_files_only=local_files_only)
dataset = LeRobotDataset(repo_id, root=root, local_files_only=True)
visualize_dataset(dataset, **vars(args))

View File

@@ -234,12 +234,6 @@ def main():
required=True,
help="Name of hugging face repositery containing a LeRobotDataset dataset (e.g. `lerobot/pusht` for https://huggingface.co/datasets/lerobot/pusht).",
)
parser.add_argument(
"--local-files-only",
type=int,
default=0,
help="Use local files only. By default, this script will try to fetch the dataset from the hub if it exists.",
)
parser.add_argument(
"--root",
type=Path,
@@ -288,9 +282,7 @@ def main():
kwargs = vars(args)
repo_id = kwargs.pop("repo_id")
root = kwargs.pop("root")
local_files_only = kwargs.pop("local_files_only")
dataset = LeRobotDataset(repo_id, root=root, local_files_only=local_files_only)
dataset = LeRobotDataset(repo_id, root=root, local_files_only=True)
visualize_dataset_html(dataset, **kwargs)

View File

@@ -158,7 +158,7 @@ def test_record_and_replay_and_policy(tmpdir, request, robot_type, mock):
assert dataset.meta.total_episodes == 2
assert len(dataset) == 2
replay(robot, episode=0, fps=1, root=root, repo_id=repo_id, play_sounds=False, local_files_only=True)
replay(robot, episode=0, fps=1, root=root, repo_id=repo_id, play_sounds=False)
# TODO(rcadene, aliberts): rethink this design
if robot_type == "aloha":
@@ -295,12 +295,24 @@ def test_resume_record(tmpdir, request, robot_type, mock):
dataset = record(**record_kwargs)
assert len(dataset) == 1, f"`dataset` should contain 1 frame, not {len(dataset)}"
# init_dataset_return_value = {}
# def wrapped_init_dataset(*args, **kwargs):
# nonlocal init_dataset_return_value
# init_dataset_return_value = init_dataset(*args, **kwargs)
# return init_dataset_return_value
# with patch("lerobot.scripts.control_robot.init_dataset", wraps=wrapped_init_dataset):
with pytest.raises(FileExistsError):
# Dataset already exists, but resume=False by default
record(**record_kwargs)
dataset = record(**record_kwargs, resume=True)
assert len(dataset) == 2, f"`dataset` should contain 2 frames, not {len(dataset)}"
# assert (
# init_dataset_return_value["num_episodes"] == 2
# ), "`init_dataset` should load the previous episode"
@pytest.mark.parametrize("robot_type, mock", [("koch", True)])

View File

@@ -383,7 +383,7 @@ def test_backward_compatibility(env_name, policy_name, extra_overrides, file_nam
include a report on what changed and how that affected the outputs.
2. Go to the `if __name__ == "__main__"` block of `tests/scripts/save_policy_to_safetensors.py` and
add the policies you want to update the test artifacts for.
3. Run `python tests/scripts/save_policy_to_safetensors.py`. The test artifact
3. Run `DATA_DIR=tests/data python tests/scripts/save_policy_to_safetensors.py`. The test artifact
should be updated.
4. Check that this test now passes.
5. Remember to restore `tests/scripts/save_policy_to_safetensors.py` to its original state.

View File

@@ -5,7 +5,7 @@ we skip them for now in our CI.
Example to run backward compatiblity tests locally:
```
python -m pytest --run-skipped tests/test_push_dataset_to_hub.py::test_push_dataset_to_hub_pusht_backward_compatibility
DATA_DIR=tests/data python -m pytest --run-skipped tests/test_push_dataset_to_hub.py::test_push_dataset_to_hub_pusht_backward_compatibility
```
"""
@@ -330,7 +330,7 @@ def test_push_dataset_to_hub_format(required_packages, tmpdir, raw_format, repo_
],
)
@pytest.mark.skip(
"Not compatible with our CI since it downloads raw datasets. Run with `python -m pytest --run-skipped tests/test_push_dataset_to_hub.py::test_push_dataset_to_hub_pusht_backward_compatibility`"
"Not compatible with our CI since it downloads raw datasets. Run with `DATA_DIR=tests/data python -m pytest --run-skipped tests/test_push_dataset_to_hub.py::test_push_dataset_to_hub_pusht_backward_compatibility`"
)
def test_push_dataset_to_hub_pusht_backward_compatibility(tmpdir, raw_format, repo_id):
_, dataset_id = repo_id.split("/")

View File

@@ -1,251 +0,0 @@
import os
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
import torch
from hydra import compose, initialize_config_dir
from torch import nn
from torch.utils.data import Dataset
from lerobot.common.policies.hilserl.classifier.configuration_classifier import ClassifierConfig
from lerobot.common.policies.hilserl.classifier.modeling_classifier import Classifier
from lerobot.scripts.train_hilserl_classifier import (
create_balanced_sampler,
train,
train_epoch,
validate,
)
class MockDataset(Dataset):
def __init__(self, data):
self.data = data
self.meta = MagicMock()
self.meta.stats = {}
def __getitem__(self, idx):
return self.data[idx]
def __len__(self):
return len(self.data)
def make_dummy_model():
model_config = ClassifierConfig(num_classes=2, model_name="hf-tiny-model-private/tiny-random-ResNetModel")
model = Classifier(config=model_config)
return model
def test_create_balanced_sampler():
# Mock dataset with imbalanced classes
data = [
{"label": 0},
{"label": 0},
{"label": 1},
{"label": 0},
{"label": 1},
{"label": 1},
{"label": 1},
{"label": 1},
]
dataset = MockDataset(data)
cfg = MagicMock()
cfg.training.label_key = "label"
sampler = create_balanced_sampler(dataset, cfg)
# Get weights from the sampler
weights = sampler.weights.float()
# Check that samples have appropriate weights
labels = [item["label"] for item in data]
class_counts = torch.tensor([labels.count(0), labels.count(1)], dtype=torch.float32)
class_weights = 1.0 / class_counts
expected_weights = torch.tensor([class_weights[label] for label in labels], dtype=torch.float32)
# Test that the weights are correct
assert torch.allclose(weights, expected_weights)
def test_train_epoch():
model = make_dummy_model()
# Mock components
model.train = MagicMock()
train_loader = [
{
"image": torch.rand(2, 3, 224, 224),
"label": torch.tensor([0.0, 1.0]),
}
]
criterion = nn.BCEWithLogitsLoss()
optimizer = MagicMock()
grad_scaler = MagicMock()
device = torch.device("cpu")
logger = MagicMock()
step = 0
cfg = MagicMock()
cfg.training.image_key = "image"
cfg.training.label_key = "label"
cfg.training.use_amp = False
# Call the function under test
train_epoch(
model,
train_loader,
criterion,
optimizer,
grad_scaler,
device,
logger,
step,
cfg,
)
# Check that model.train() was called
model.train.assert_called_once()
# Check that optimizer.zero_grad() was called
optimizer.zero_grad.assert_called()
# Check that logger.log_dict was called
logger.log_dict.assert_called()
def test_validate():
model = make_dummy_model()
# Mock components
model.eval = MagicMock()
val_loader = [
{
"image": torch.rand(2, 3, 224, 224),
"label": torch.tensor([0.0, 1.0]),
}
]
criterion = nn.BCEWithLogitsLoss()
device = torch.device("cpu")
logger = MagicMock()
cfg = MagicMock()
cfg.training.image_key = "image"
cfg.training.label_key = "label"
cfg.training.use_amp = False
# Call validate
accuracy, eval_info = validate(model, val_loader, criterion, device, logger, cfg)
# Check that model.eval() was called
model.eval.assert_called_once()
# Check accuracy/eval_info are calculated and of the correct type
assert isinstance(accuracy, float)
assert isinstance(eval_info, dict)
@pytest.mark.parametrize("resume", [True, False])
@patch("lerobot.scripts.train_hilserl_classifier.init_hydra_config")
@patch("lerobot.scripts.train_hilserl_classifier.Logger.get_last_checkpoint_dir")
@patch("lerobot.scripts.train_hilserl_classifier.Logger.get_last_pretrained_model_dir")
@patch("lerobot.scripts.train_hilserl_classifier.Logger")
@patch("lerobot.scripts.train_hilserl_classifier.LeRobotDataset")
@patch("lerobot.scripts.train_hilserl_classifier.make_policy")
def test_resume_function(
mock_make_policy,
mock_dataset,
mock_logger,
mock_get_last_pretrained_model_dir,
mock_get_last_checkpoint_dir,
mock_init_hydra_config,
resume,
):
# Initialize Hydra
test_file_dir = os.path.dirname(os.path.abspath(__file__))
config_dir = os.path.abspath(os.path.join(test_file_dir, "..", "lerobot", "configs", "policy"))
assert os.path.exists(config_dir), f"Config directory does not exist at {config_dir}"
with initialize_config_dir(config_dir=config_dir, job_name="test_app", version_base="1.2"):
cfg = compose(
config_name="reward_classifier",
overrides=[
"device=cpu",
"seed=42",
f"output_dir={tempfile.mkdtemp()}",
"wandb.enable=False",
f"resume={resume}",
"dataset_repo_id=dataset_repo_id",
"train_split_proportion=0.8",
"training.num_workers=0",
"training.batch_size=2",
"training.image_key=image",
"training.label_key=label",
"training.use_amp=False",
"training.num_epochs=1",
"eval.batch_size=2",
],
)
# Mock the init_hydra_config function to return cfg
mock_init_hydra_config.return_value = cfg
# Mock dataset
dataset = MockDataset([{"image": torch.rand(3, 224, 224), "label": i % 2} for i in range(10)])
mock_dataset.return_value = dataset
# Mock checkpoint handling
mock_checkpoint_dir = MagicMock(spec=Path)
mock_checkpoint_dir.exists.return_value = resume # Only exists if resuming
mock_get_last_checkpoint_dir.return_value = mock_checkpoint_dir
mock_get_last_pretrained_model_dir.return_value = Path(tempfile.mkdtemp())
# Mock logger
logger = MagicMock()
resumed_step = 1000
if resume:
logger.load_last_training_state.return_value = resumed_step
else:
logger.load_last_training_state.return_value = 0
mock_logger.return_value = logger
# Instantiate the model and set make_policy to return it
model = make_dummy_model()
mock_make_policy.return_value = model
# Call train
train(cfg)
# Check that checkpoint handling methods were called
if resume:
mock_get_last_checkpoint_dir.assert_called_once_with(Path(cfg.output_dir))
mock_get_last_pretrained_model_dir.assert_called_once_with(Path(cfg.output_dir))
mock_checkpoint_dir.exists.assert_called_once()
logger.load_last_training_state.assert_called_once()
else:
mock_get_last_checkpoint_dir.assert_not_called()
mock_get_last_pretrained_model_dir.assert_not_called()
mock_checkpoint_dir.exists.assert_not_called()
logger.load_last_training_state.assert_not_called()
# Collect the steps from logger.log_dict calls
train_log_calls = logger.log_dict.call_args_list
# Extract the steps used in the train logging
steps = []
for call in train_log_calls:
mode = call.kwargs.get("mode", call.args[2] if len(call.args) > 2 else None)
if mode == "train":
step = call.kwargs.get("step", call.args[1] if len(call.args) > 1 else None)
steps.append(step)
expected_start_step = resumed_step if resume else 0
# Calculate expected_steps
train_size = int(cfg.train_split_proportion * len(dataset))
batch_size = cfg.training.batch_size
num_batches = (train_size + batch_size - 1) // batch_size
expected_steps = [expected_start_step + i for i in range(num_batches)]
assert steps == expected_steps, f"Expected steps {expected_steps}, got {steps}"