Compare commits

..

20 Commits

Author SHA1 Message Date
Thomas Wolf
a7c030076f conversion script from original ACT repo 2024-06-10 23:44:33 +02:00
Thomas Wolf
ddaaa9f279 updating with adding masking in ACT - start adding some tests 2024-06-10 15:30:57 +02:00
Thomas Wolf
ef074d7281 add train and evals 2024-06-09 14:03:47 +02:00
Thomas Wolf
797f79f182 update 2024-06-08 17:54:06 +02:00
Thomas Wolf
7ff93e8a51 Update examples/real_robot_example/0_record_training_data.py
Co-authored-by: Remi <re.cadene@gmail.com>
2024-06-08 12:17:28 +02:00
Thomas Wolf
c304474f6a Update examples/real_robot_example/0_record_training_data.py
Co-authored-by: Remi <re.cadene@gmail.com>
2024-06-08 12:17:19 +02:00
Thomas Wolf
e53a03ca53 Update README.md
Co-authored-by: Remi <re.cadene@gmail.com>
2024-06-08 12:17:12 +02:00
Thomas Wolf
86508a167f Update README.md
Co-authored-by: Remi <re.cadene@gmail.com>
2024-06-08 12:17:05 +02:00
Thomas Wolf
ac9bbf2cd7 Update README.md
Co-authored-by: Remi <re.cadene@gmail.com>
2024-06-08 12:16:55 +02:00
Thomas Wolf
cc3dcf2b89 Update README.md
Co-authored-by: Remi <re.cadene@gmail.com>
2024-06-08 12:16:48 +02:00
Thomas Wolf
5332fef758 Update README.md
Co-authored-by: Remi <re.cadene@gmail.com>
2024-06-08 12:16:41 +02:00
Thomas Wolf
a40d0cbcc7 Update README.md
Co-authored-by: Remi <re.cadene@gmail.com>
2024-06-08 12:16:33 +02:00
Thomas Wolf
3717639f11 Update README.md
Co-authored-by: Remi <re.cadene@gmail.com>
2024-06-08 12:16:25 +02:00
Thomas Wolf
562f09451e Update README.md
Co-authored-by: Remi <re.cadene@gmail.com>
2024-06-08 12:16:17 +02:00
Thomas Wolf
754944151c Update README.md
Co-authored-by: Remi <re.cadene@gmail.com>
2024-06-08 12:15:56 +02:00
Thomas Wolf
480ed50d36 Update README.md
Co-authored-by: Remi <re.cadene@gmail.com>
2024-06-08 12:15:08 +02:00
Thomas Wolf
ac7d6228ed Update README.md
Co-authored-by: Remi <re.cadene@gmail.com>
2024-06-08 12:15:01 +02:00
Thomas Wolf
a461a71277 Update README.md
Co-authored-by: Remi <re.cadene@gmail.com>
2024-06-08 12:14:55 +02:00
Thomas Wolf
57d3d27c78 add simple manual real world gym env example 2024-06-05 00:37:46 +02:00
Thomas Wolf
0935e49c8a add lerobotdataset in readme 2024-06-04 18:45:12 +02:00
1025 changed files with 16763 additions and 41418 deletions

View File

@@ -1,68 +0,0 @@
{
"homing_offset": [
2048,
3072,
3072,
-1024,
-1024,
2048,
-2048,
2048,
-2048
],
"drive_mode": [
1,
1,
1,
0,
0,
1,
0,
1,
0
],
"start_pos": [
2015,
3058,
3061,
1071,
1071,
2035,
2152,
2029,
2499
],
"end_pos": [
-1008,
-1963,
-1966,
2141,
2143,
-971,
3043,
-1077,
3144
],
"calib_mode": [
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"LINEAR"
],
"motor_names": [
"waist",
"shoulder",
"shoulder_shadow",
"elbow",
"elbow_shadow",
"forearm_roll",
"wrist_angle",
"wrist_rotate",
"gripper"
]
}

View File

@@ -1,68 +0,0 @@
{
"homing_offset": [
2048,
3072,
3072,
-1024,
-1024,
2048,
-2048,
2048,
-1024
],
"drive_mode": [
1,
1,
1,
0,
0,
1,
0,
1,
0
],
"start_pos": [
2035,
3024,
3019,
979,
981,
1982,
2166,
2124,
1968
],
"end_pos": [
-990,
-2017,
-2015,
2078,
2076,
-1030,
3117,
-1016,
2556
],
"calib_mode": [
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"LINEAR"
],
"motor_names": [
"waist",
"shoulder",
"shoulder_shadow",
"elbow",
"elbow_shadow",
"forearm_roll",
"wrist_angle",
"wrist_rotate",
"gripper"
]
}

View File

@@ -1,68 +0,0 @@
{
"homing_offset": [
2048,
3072,
3072,
-1024,
-1024,
2048,
-2048,
2048,
-2048
],
"drive_mode": [
1,
1,
1,
0,
0,
1,
0,
1,
0
],
"start_pos": [
2056,
2895,
2896,
1191,
1190,
2018,
2051,
2056,
2509
],
"end_pos": [
-1040,
-2004,
-2006,
2126,
2127,
-1010,
3050,
-1117,
3143
],
"calib_mode": [
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"LINEAR"
],
"motor_names": [
"waist",
"shoulder",
"shoulder_shadow",
"elbow",
"elbow_shadow",
"forearm_roll",
"wrist_angle",
"wrist_rotate",
"gripper"
]
}

View File

@@ -1,68 +0,0 @@
{
"homing_offset": [
2048,
3072,
3072,
-1024,
-1024,
2048,
-2048,
2048,
-2048
],
"drive_mode": [
1,
1,
1,
0,
0,
1,
0,
1,
0
],
"start_pos": [
2068,
3034,
3030,
1038,
1041,
1991,
1948,
2090,
1985
],
"end_pos": [
-1025,
-2014,
-2015,
2058,
2060,
-955,
3091,
-940,
2576
],
"calib_mode": [
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"DEGREE",
"LINEAR"
],
"motor_names": [
"waist",
"shoulder",
"shoulder_shadow",
"elbow",
"elbow_shadow",
"forearm_roll",
"wrist_angle",
"wrist_rotate",
"gripper"
]
}

View File

@@ -1,17 +1,3 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Misc
.git
tmp
@@ -73,12 +59,13 @@ pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
!tests/artifacts
!tests/data
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
@@ -86,11 +73,6 @@ coverage.xml
.hypothesis/
.pytest_cache/
# Ignore .cache except calibration
.cache/*
!.cache/calibration/
!.cache/calibration/**
# Translations
*.mo
*.pot

17
.gitattributes vendored
View File

@@ -1,21 +1,6 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*.memmap filter=lfs diff=lfs merge=lfs -text
*.stl filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.json !text !filter !merge !diff
tests/artifacts/cameras/*.png filter=lfs diff=lfs merge=lfs -text
tests/artifacts/cameras/*.bag filter=lfs diff=lfs merge=lfs -text
*.json filter=lfs diff=lfs merge=lfs -text

View File

@@ -1,17 +1,3 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "\U0001F41B Bug Report"
description: Submit a bug report to help us improve LeRobot
body:

View File

@@ -21,7 +21,7 @@ Provide a simple way for the reviewer to try out your changes.
Examples:
```bash
pytest -sx tests/test_stuff.py::test_something
DATA_DIR=tests/data pytest -sx tests/test_stuff.py::test_something
```
```bash
python lerobot/scripts/train.py --some.option=true

View File

@@ -1,17 +1,3 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Inspired by
# https://github.com/huggingface/peft/blob/main/.github/workflows/build_docker_images.yml
name: Builds
@@ -22,33 +8,34 @@ on:
schedule:
- cron: "0 1 * * *"
permissions: {}
env:
PYTHON_VERSION: "3.10"
jobs:
latest-cpu:
name: CPU
runs-on:
group: aws-general-8-plus
runs-on: ubuntu-latest
steps:
- name: Install Git LFS
- name: Cleanup disk
run: |
sudo apt-get update
sudo apt-get install git-lfs
git lfs install
sudo df -h
# sudo ls -l /usr/local/lib/
# sudo ls -l /usr/share/
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/share/dotnet
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
sudo df -h
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
with:
cache-binary: false
- name: Check out code
uses: actions/checkout@v4
with:
lfs: true
persist-credentials: false
- name: Login to DockerHub
uses: docker/login-action@v3
@@ -68,25 +55,27 @@ jobs:
latest-cuda:
name: GPU
runs-on:
group: aws-general-8-plus
runs-on: ubuntu-latest
steps:
- name: Install Git LFS
- name: Cleanup disk
run: |
sudo apt-get update
sudo apt-get install git-lfs
git lfs install
sudo df -h
# sudo ls -l /usr/local/lib/
# sudo ls -l /usr/share/
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/share/dotnet
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
sudo df -h
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
with:
cache-binary: false
- name: Check out code
uses: actions/checkout@v4
with:
lfs: true
persist-credentials: false
- name: Login to DockerHub
uses: docker/login-action@v3
@@ -106,18 +95,25 @@ jobs:
latest-cuda-dev:
name: GPU Dev
runs-on:
group: aws-general-8-plus
runs-on: ubuntu-latest
steps:
- name: Cleanup disk
run: |
sudo df -h
# sudo ls -l /usr/local/lib/
# sudo ls -l /usr/share/
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/share/dotnet
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
sudo df -h
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
with:
cache-binary: false
- name: Check out code
uses: actions/checkout@v4
with:
persist-credentials: false
- name: Login to DockerHub
uses: docker/login-action@v3

View File

@@ -1,23 +0,0 @@
name: Build documentation
on:
workflow_dispatch:
push:
paths:
- "docs/**"
branches:
- main
- doc-builder*
- v*-release
jobs:
build: # zizmor: ignore[excessive-permissions] We follow the same pattern as in Transformers
uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
with:
commit_sha: ${{ github.sha }}
package: lerobot
additional_args: --not_python_module
secrets:
token: ${{ secrets.HUGGINGFACE_PUSH }}
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}

View File

@@ -1,19 +0,0 @@
name: Build PR Documentation
on:
pull_request:
paths:
- "docs/**"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
build: # zizmor: ignore[excessive-permissions] We follow the same pattern as in Transformers
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
with:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
package: lerobot
additional_args: --not_python_module

View File

@@ -1,17 +1,3 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Inspired by
# https://github.com/huggingface/peft/blob/main/.github/workflows/nightly.yml
name: Nightly
@@ -21,17 +7,16 @@ on:
schedule:
- cron: "0 2 * * *"
permissions: {}
# env:
env:
DATA_DIR: tests/data
# SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
jobs:
run_all_tests_cpu:
name: CPU
strategy:
fail-fast: false
runs-on:
group: aws-general-8-plus
runs-on: ubuntu-latest
container:
image: huggingface/lerobot-cpu:latest
options: --shm-size "16gb"
@@ -44,9 +29,13 @@ jobs:
working-directory: /lerobot
steps:
- name: Tests
env:
DATA_DIR: tests/data
run: pytest -v --cov=./lerobot --disable-warnings tests
- name: Tests end-to-end
env:
DATA_DIR: tests/data
run: make test-end-to-end
@@ -54,8 +43,7 @@ jobs:
name: GPU
strategy:
fail-fast: false
runs-on:
group: aws-g6-4xlarge-plus
runs-on: [single-gpu, nvidia-gpu, t4, ci]
env:
CUDA_VISIBLE_DEVICES: "0"
TEST_TYPE: "single_gpu"

View File

@@ -1,29 +1,15 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Quality
on:
workflow_dispatch:
workflow_call:
pull_request:
branches:
- main
push:
branches:
- main
permissions: {}
env:
PYTHON_VERSION: "3.10"
@@ -33,9 +19,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout Repository
uses: actions/checkout@v4
with:
persist-credentials: false
uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
@@ -46,27 +30,27 @@ jobs:
id: get-ruff-version
run: |
RUFF_VERSION=$(awk '/repo: https:\/\/github.com\/astral-sh\/ruff-pre-commit/{flag=1;next}/rev:/{if(flag){print $2;exit}}' .pre-commit-config.yaml)
echo "ruff_version=${RUFF_VERSION}" >> $GITHUB_OUTPUT
echo "RUFF_VERSION=${RUFF_VERSION}" >> $GITHUB_ENV
- name: Install Ruff
env:
RUFF_VERSION: ${{ steps.get-ruff-version.outputs.ruff_version }}
run: python -m pip install "ruff==${RUFF_VERSION}"
run: python -m pip install "ruff==${{ env.RUFF_VERSION }}"
- name: Ruff check
run: ruff check --output-format=github
run: ruff check
- name: Ruff format
run: ruff format --diff
typos:
name: Typos
poetry_check:
name: Poetry check
runs-on: ubuntu-latest
steps:
- name: Checkout Repository
uses: actions/checkout@v4
with:
persist-credentials: false
uses: actions/checkout@v3
- name: typos-action
uses: crate-ci/typos@v1.29.10
- name: Install poetry
run: pipx install poetry
- name: Poetry check
run: poetry check

View File

@@ -1,29 +1,15 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Inspired by
# https://github.com/huggingface/peft/blob/main/.github/workflows/test-docker-build.yml
name: Test Dockerfiles
on:
pull_request:
branches:
- main
paths:
# Run only when DockerFile files are modified
- "docker/**"
permissions: {}
env:
PYTHON_VERSION: "3.10"
@@ -36,42 +22,51 @@ jobs:
steps:
- name: Check out code
uses: actions/checkout@v4
with:
persist-credentials: false
- name: Get changed files
id: changed-files
uses: tj-actions/changed-files@3f54ebb830831fc121d3263c1857cfbdc310cdb9 #v42
uses: tj-actions/changed-files@v44
with:
files: docker/**
json: "true"
- name: Run step if only the files listed above change # zizmor: ignore[template-injection]
- name: Run step if only the files listed above change
if: steps.changed-files.outputs.any_changed == 'true'
id: set-matrix
env:
ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
run: |
echo "matrix=${{ steps.changed-files.outputs.all_changed_files}}" >> $GITHUB_OUTPUT
build_modified_dockerfiles:
name: Build modified Docker images
needs: get_changed_files
runs-on:
group: aws-general-8-plus
if: needs.get_changed_files.outputs.matrix != ''
runs-on: ubuntu-latest
if: ${{ needs.get_changed_files.outputs.matrix }} != ''
strategy:
fail-fast: false
matrix:
docker-file: ${{ fromJson(needs.get_changed_files.outputs.matrix) }}
steps:
- name: Cleanup disk
run: |
sudo df -h
# sudo ls -l /usr/local/lib/
# sudo ls -l /usr/share/
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/share/dotnet
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
sudo df -h
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
with:
cache-binary: false
- name: Check out code
uses: actions/checkout@v4
with:
persist-credentials: false
- name: Build Docker image
uses: docker/build-push-action@v5

View File

@@ -1,30 +1,15 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Tests
on:
pull_request:
branches:
- main
paths:
- "lerobot/**"
- "tests/**"
- "examples/**"
- ".github/**"
- "pyproject.toml"
- ".pre-commit-config.yaml"
- "Makefile"
- ".cache/**"
- "poetry.lock"
push:
branches:
- main
@@ -33,116 +18,109 @@ on:
- "tests/**"
- "examples/**"
- ".github/**"
- "pyproject.toml"
- ".pre-commit-config.yaml"
- "Makefile"
- ".cache/**"
permissions: {}
env:
UV_VERSION: "0.6.0"
- "poetry.lock"
jobs:
pytest:
name: Pytest
runs-on: ubuntu-latest
env:
DATA_DIR: tests/data
MUJOCO_GL: egl
steps:
- uses: actions/checkout@v4
with:
lfs: true # Ensure LFS files are pulled
persist-credentials: false
- name: Install apt dependencies
# portaudio19-dev is needed to install pyaudio
- name: Install EGL
run: sudo apt-get update && sudo apt-get install -y libegl1-mesa-dev
- name: Install poetry
run: |
sudo apt-get update && \
sudo apt-get install -y libegl1-mesa-dev ffmpeg portaudio19-dev
pipx install poetry && poetry config virtualenvs.in-project true
echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH
- name: Install uv and python
uses: astral-sh/setup-uv@v5
- name: Set up Python 3.10
uses: actions/setup-python@v5
with:
enable-cache: true
version: ${{ env.UV_VERSION }}
python-version: "3.10"
cache: "poetry"
- name: Install lerobot (all extras)
run: uv sync --all-extras
- name: Install poetry dependencies
run: |
poetry install --all-extras
- name: Test with pytest
run: |
uv run pytest tests -v --cov=./lerobot --durations=0 \
pytest tests -v --cov=./lerobot --durations=0 \
-W ignore::DeprecationWarning:imageio_ffmpeg._utils:7 \
-W ignore::UserWarning:torch.utils.data.dataloader:558 \
-W ignore::UserWarning:gymnasium.utils.env_checker:247 \
&& rm -rf tests/outputs outputs
pytest-minimal:
name: Pytest (minimal install)
runs-on: ubuntu-latest
env:
DATA_DIR: tests/data
MUJOCO_GL: egl
steps:
- uses: actions/checkout@v4
with:
lfs: true # Ensure LFS files are pulled
persist-credentials: false
- name: Install apt dependencies
run: sudo apt-get update && sudo apt-get install -y ffmpeg
- name: Install poetry
run: |
pipx install poetry && poetry config virtualenvs.in-project true
echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH
- name: Install uv and python
uses: astral-sh/setup-uv@v5
- name: Set up Python 3.10
uses: actions/setup-python@v5
with:
enable-cache: true
version: ${{ env.UV_VERSION }}
python-version: "3.10"
- name: Install lerobot
run: uv sync --extra "test"
- name: Install poetry dependencies
run: |
poetry install --extras "test"
- name: Test with pytest
run: |
uv run pytest tests -v --cov=./lerobot --durations=0 \
pytest tests -v --cov=./lerobot --durations=0 \
-W ignore::DeprecationWarning:imageio_ffmpeg._utils:7 \
-W ignore::UserWarning:torch.utils.data.dataloader:558 \
-W ignore::UserWarning:gymnasium.utils.env_checker:247 \
&& rm -rf tests/outputs outputs
end-to-end:
name: End-to-end
runs-on: ubuntu-latest
env:
DATA_DIR: tests/data
MUJOCO_GL: egl
steps:
- uses: actions/checkout@v4
with:
lfs: true # Ensure LFS files are pulled
persist-credentials: false
- name: Install apt dependencies
# portaudio19-dev is needed to install pyaudio
- name: Install EGL
run: sudo apt-get update && sudo apt-get install -y libegl1-mesa-dev
- name: Install poetry
run: |
sudo apt-get update && \
sudo apt-get install -y libegl1-mesa-dev ffmpeg portaudio19-dev
pipx install poetry && poetry config virtualenvs.in-project true
echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH
- name: Install uv and python
uses: astral-sh/setup-uv@v5
- name: Set up Python 3.10
uses: actions/setup-python@v5
with:
enable-cache: true
version: ${{ env.UV_VERSION }}
python-version: "3.10"
cache: "poetry"
- name: Install lerobot (all extras)
- name: Install poetry dependencies
run: |
uv venv
uv sync --all-extras
- name: venv
run: |
echo "PYTHON_PATH=${{ github.workspace }}/.venv/bin/python" >> $GITHUB_ENV
poetry install --all-extras
- name: Test end-to-end
run: |

View File

@@ -1,35 +0,0 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
on:
push:
name: Secret Leaks
permissions: {}
jobs:
trufflehog:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Secret Scanning
uses: trufflesecurity/trufflehog@main
with:
extra_args: --only-verified

View File

@@ -1,16 +0,0 @@
name: Upload PR Documentation
on: # zizmor: ignore[dangerous-triggers] We follow the same pattern as in Transformers
workflow_run:
workflows: [ "Build PR Documentation" ]
types:
- completed
jobs:
build: # zizmor: ignore[excessive-permissions] We follow the same pattern as in Transformers
uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
with:
package_name: lerobot
secrets:
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}

28
.gitignore vendored
View File

@@ -1,17 +1,3 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
.dev
# Logging
logs
tmp
@@ -63,10 +49,6 @@ share/python-wheels/
*.egg
MANIFEST
# uv/poetry lock files
poetry.lock
uv.lock
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
@@ -78,12 +60,13 @@ pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
!tests/artifacts
!tests/data
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
@@ -91,11 +74,6 @@ coverage.xml
.hypothesis/
.pytest_cache/
# Ignore .cache except calibration
.cache/*
!.cache/calibration/
!.cache/calibration/**
# Translations
*.mo
*.pot
@@ -143,8 +121,8 @@ celerybeat.pid
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

View File

@@ -1,31 +1,9 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
exclude: "tests/artifacts/.*\\.safetensors$"
exclude: ^(tests/data)
default_language_version:
python: python3.10
repos:
##### Meta #####
- repo: meta
hooks:
- id: check-useless-excludes
- id: check-hooks-apply
##### Style / Misc. #####
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
rev: v4.6.0
hooks:
- id: check-added-large-files
- id: debug-statements
@@ -35,40 +13,21 @@ repos:
- id: check-toml
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/adhtruong/mirrors-typos
rev: v1.31.1
hooks:
- id: typos
args: [--force-exclude]
- repo: https://github.com/asottile/pyupgrade
rev: v3.19.1
rev: v3.15.2
hooks:
- id: pyupgrade
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.5
rev: v0.4.3
hooks:
- id: ruff
args: [--fix]
- id: ruff-format
##### Security #####
- repo: https://github.com/gitleaks/gitleaks
rev: v8.24.3
- repo: https://github.com/python-poetry/poetry
rev: 1.8.0
hooks:
- id: gitleaks
- repo: https://github.com/woodruffw/zizmor-pre-commit
rev: v1.5.2
hooks:
- id: zizmor
- repo: https://github.com/PyCQA/bandit
rev: 1.8.3
hooks:
- id: bandit
args: ["-c", "pyproject.toml"]
additional_dependencies: ["bandit[toml]"]
- id: poetry-check
- id: poetry-lock
args:
- "--check"
- "--no-update"

View File

@@ -20,7 +20,7 @@ Some of the ways you can contribute to 🤗 LeRobot:
* Contributing to the examples or to the documentation.
* Submitting issues related to bugs or desired new features.
Following the guides below, feel free to open issues and PRs and to coordinate your efforts with the community on our [Discord Channel](https://discord.gg/VjFz58wn3R). For specific inquiries, reach out to [Remi Cadene](mailto:remi.cadene@huggingface.co).
Following the guides below, feel free to open issues and PRs and to coordinate your efforts with the community on our [Discord Channel](https://discord.gg/VjFz58wn3R). For specific inquiries, reach out to [Remi Cadene](remi.cadene@huggingface.co).
If you are not sure how to contribute or want to know the next features we working on, look on this project page: [LeRobot TODO](https://github.com/orgs/huggingface/projects/46)
@@ -129,71 +129,38 @@ Follow these steps to start contributing:
🚨 **Do not** work on the `main` branch.
4. for development, we advise to use a tool like `poetry` or `uv` instead of just `pip` to easily track our dependencies.
Follow the instructions to [install poetry](https://python-poetry.org/docs/#installation) (use a version >=2.1.0) or to [install uv](https://docs.astral.sh/uv/getting-started/installation/#installation-methods) if you don't have one of them already.
4. for development, we use `poetry` instead of just `pip` to easily track our dependencies.
If you don't have it already, follow the [instructions](https://python-poetry.org/docs/#installation) to install it.
Set up a development environment with conda or miniconda:
```bash
conda create -y -n lerobot-dev python=3.10 && conda activate lerobot-dev
```
If you're using `uv`, it can manage python versions so you can instead do:
```bash
uv venv --python 3.10 && source .venv/bin/activate
```
To develop on 🤗 LeRobot, you will at least need to install the `dev` and `test` extras dependencies along with the core library:
using `poetry`
```bash
poetry sync --extras "dev test"
```
using `uv`
```bash
uv sync --extra dev --extra test
poetry install --sync --extras "dev test"
```
You can also install the project with all its dependencies (including environments):
using `poetry`
```bash
poetry sync --all-extras
```
using `uv`
```bash
uv sync --all-extras
poetry install --sync --all-extras
```
> **Note:** If you don't install simulation environments with `--all-extras`, the tests that require them will be skipped when running the pytest suite locally. However, they *will* be tested in the CI. In general, we advise you to install everything and test locally before pushing.
Whichever command you chose to install the project (e.g. `poetry sync --all-extras`), you should run it again when pulling code with an updated version of `pyproject.toml` and `poetry.lock` in order to synchronize your virtual environment with the new dependencies.
Whichever command you chose to install the project (e.g. `poetry install --sync --all-extras`), you should run it again when pulling code with an updated version of `pyproject.toml` and `poetry.lock` in order to synchronize your virtual environment with the new dependencies.
The equivalent of `pip install some-package`, would just be:
using `poetry`
```bash
poetry add some-package
```
using `uv`
```bash
uv add some-package
```
When making changes to the poetry sections of the `pyproject.toml`, you should run the following command to lock dependencies.
using `poetry`
```bash
poetry lock
poetry lock --no-update
```
using `uv`
```bash
uv lock
```
5. Develop the features on your branch.
As you work on the features, you should make sure that the test suite
@@ -228,7 +195,7 @@ Follow these steps to start contributing:
git commit
```
Note, if you already committed some changes that have a wrong formatting, you can use:
Note, if you already commited some changes that have a wrong formatting, you can use:
```bash
pre-commit run --all-files
```
@@ -291,7 +258,7 @@ sudo apt-get install git-lfs
git lfs install
```
Pull artifacts if they're not in [tests/artifacts](tests/artifacts)
Pull artifacts if they're not in [tests/data](tests/data)
```bash
git lfs pull
```
@@ -300,7 +267,7 @@ We use `pytest` in order to run the tests. From the root of the
repository, here's how to run tests with `pytest` for the library:
```bash
python -m pytest -sv ./tests
DATA_DIR="tests/data" python -m pytest -sv ./tests
```

223
Makefile
View File

@@ -1,25 +1,11 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
.PHONY: tests
PYTHON_PATH := $(shell which python)
# If uv is installed and a virtual environment exists, use it
UV_CHECK := $(shell command -v uv)
ifneq ($(UV_CHECK),)
PYTHON_PATH := $(shell .venv/bin/python)
# If Poetry is installed, redefine PYTHON_PATH to use the Poetry-managed Python
POETRY_CHECK := $(shell command -v poetry)
ifneq ($(POETRY_CHECK),)
PYTHON_PATH := $(shell poetry run which python)
endif
export PATH := $(dir $(PYTHON_PATH)):$(PATH)
@@ -34,109 +20,144 @@ build-gpu:
test-end-to-end:
${MAKE} DEVICE=$(DEVICE) test-act-ete-train
${MAKE} DEVICE=$(DEVICE) test-act-ete-train-resume
${MAKE} DEVICE=$(DEVICE) test-act-ete-eval
${MAKE} DEVICE=$(DEVICE) test-act-ete-train-amp
${MAKE} DEVICE=$(DEVICE) test-act-ete-eval-amp
${MAKE} DEVICE=$(DEVICE) test-diffusion-ete-train
${MAKE} DEVICE=$(DEVICE) test-diffusion-ete-eval
${MAKE} DEVICE=$(DEVICE) test-tdmpc-ete-train
${MAKE} DEVICE=$(DEVICE) test-tdmpc-ete-eval
${MAKE} DEVICE=$(DEVICE) test-default-ete-eval
${MAKE} DEVICE=$(DEVICE) test-act-pusht-tutorial
test-act-ete-train:
python lerobot/scripts/train.py \
--policy.type=act \
--policy.dim_model=64 \
--policy.n_action_steps=20 \
--policy.chunk_size=20 \
--policy.device=$(DEVICE) \
--env.type=aloha \
--env.episode_length=5 \
--dataset.repo_id=lerobot/aloha_sim_transfer_cube_human \
--dataset.image_transforms.enable=true \
--dataset.episodes="[0]" \
--batch_size=2 \
--steps=4 \
--eval_freq=2 \
--eval.n_episodes=1 \
--eval.batch_size=1 \
--save_freq=2 \
--save_checkpoint=true \
--log_freq=1 \
--wandb.enable=false \
--output_dir=tests/outputs/act/
test-act-ete-train-resume:
python lerobot/scripts/train.py \
--config_path=tests/outputs/act/checkpoints/000002/pretrained_model/train_config.json \
--resume=true
policy=act \
policy.dim_model=64 \
env=aloha \
wandb.enable=False \
training.offline_steps=2 \
training.online_steps=0 \
eval.n_episodes=1 \
eval.batch_size=1 \
device=$(DEVICE) \
training.save_checkpoint=true \
training.save_freq=2 \
policy.n_action_steps=20 \
policy.chunk_size=20 \
training.batch_size=2 \
hydra.run.dir=tests/outputs/act/
test-act-ete-eval:
python lerobot/scripts/eval.py \
--policy.path=tests/outputs/act/checkpoints/000004/pretrained_model \
--policy.device=$(DEVICE) \
--env.type=aloha \
--env.episode_length=5 \
--eval.n_episodes=1 \
--eval.batch_size=1
-p tests/outputs/act/checkpoints/000002/pretrained_model \
eval.n_episodes=1 \
eval.batch_size=1 \
env.episode_length=8 \
device=$(DEVICE) \
test-act-ete-train-amp:
python lerobot/scripts/train.py \
policy=act \
policy.dim_model=64 \
env=aloha \
wandb.enable=False \
training.offline_steps=2 \
training.online_steps=0 \
eval.n_episodes=1 \
eval.batch_size=1 \
device=$(DEVICE) \
training.save_checkpoint=true \
training.save_freq=2 \
policy.n_action_steps=20 \
policy.chunk_size=20 \
training.batch_size=2 \
hydra.run.dir=tests/outputs/act_amp/ \
use_amp=true
test-act-ete-eval-amp:
python lerobot/scripts/eval.py \
-p tests/outputs/act_amp/checkpoints/000002/pretrained_model \
eval.n_episodes=1 \
eval.batch_size=1 \
env.episode_length=8 \
device=$(DEVICE) \
use_amp=true
test-diffusion-ete-train:
python lerobot/scripts/train.py \
--policy.type=diffusion \
--policy.down_dims='[64,128,256]' \
--policy.diffusion_step_embed_dim=32 \
--policy.num_inference_steps=10 \
--policy.device=$(DEVICE) \
--env.type=pusht \
--env.episode_length=5 \
--dataset.repo_id=lerobot/pusht \
--dataset.image_transforms.enable=true \
--dataset.episodes="[0]" \
--batch_size=2 \
--steps=2 \
--eval_freq=2 \
--eval.n_episodes=1 \
--eval.batch_size=1 \
--save_checkpoint=true \
--save_freq=2 \
--log_freq=1 \
--wandb.enable=false \
--output_dir=tests/outputs/diffusion/
policy=diffusion \
policy.down_dims=\[64,128,256\] \
policy.diffusion_step_embed_dim=32 \
policy.num_inference_steps=10 \
env=pusht \
wandb.enable=False \
training.offline_steps=2 \
training.online_steps=0 \
eval.n_episodes=1 \
eval.batch_size=1 \
device=$(DEVICE) \
training.save_checkpoint=true \
training.save_freq=2 \
training.batch_size=2 \
hydra.run.dir=tests/outputs/diffusion/
test-diffusion-ete-eval:
python lerobot/scripts/eval.py \
--policy.path=tests/outputs/diffusion/checkpoints/000002/pretrained_model \
--policy.device=$(DEVICE) \
--env.type=pusht \
--env.episode_length=5 \
--eval.n_episodes=1 \
--eval.batch_size=1
-p tests/outputs/diffusion/checkpoints/000002/pretrained_model \
eval.n_episodes=1 \
eval.batch_size=1 \
env.episode_length=8 \
device=$(DEVICE) \
# TODO(alexander-soare): Restore online_steps to 2 when it is reinstated.
test-tdmpc-ete-train:
python lerobot/scripts/train.py \
--policy.type=tdmpc \
--policy.device=$(DEVICE) \
--env.type=xarm \
--env.task=XarmLift-v0 \
--env.episode_length=5 \
--dataset.repo_id=lerobot/xarm_lift_medium \
--dataset.image_transforms.enable=true \
--dataset.episodes="[0]" \
--batch_size=2 \
--steps=2 \
--eval_freq=2 \
--eval.n_episodes=1 \
--eval.batch_size=1 \
--save_checkpoint=true \
--save_freq=2 \
--log_freq=1 \
--wandb.enable=false \
--output_dir=tests/outputs/tdmpc/
policy=tdmpc \
env=xarm \
env.task=XarmLift-v0 \
dataset_repo_id=lerobot/xarm_lift_medium \
wandb.enable=False \
training.offline_steps=2 \
training.online_steps=0 \
eval.n_episodes=1 \
eval.batch_size=1 \
env.episode_length=2 \
device=$(DEVICE) \
training.save_checkpoint=true \
training.save_freq=2 \
training.batch_size=2 \
hydra.run.dir=tests/outputs/tdmpc/
test-tdmpc-ete-eval:
python lerobot/scripts/eval.py \
--policy.path=tests/outputs/tdmpc/checkpoints/000002/pretrained_model \
--policy.device=$(DEVICE) \
--env.type=xarm \
--env.episode_length=5 \
--env.task=XarmLift-v0 \
--eval.n_episodes=1 \
--eval.batch_size=1
-p tests/outputs/tdmpc/checkpoints/000002/pretrained_model \
eval.n_episodes=1 \
eval.batch_size=1 \
env.episode_length=8 \
device=$(DEVICE) \
test-default-ete-eval:
python lerobot/scripts/eval.py \
--config lerobot/configs/default.yaml \
eval.n_episodes=1 \
eval.batch_size=1 \
env.episode_length=8 \
device=$(DEVICE) \
test-act-pusht-tutorial:
cp examples/advanced/1_train_act_pusht/act_pusht.yaml lerobot/configs/policy/created_by_Makefile.yaml
python lerobot/scripts/train.py \
policy=created_by_Makefile.yaml \
env=pusht \
wandb.enable=False \
training.offline_steps=2 \
eval.n_episodes=1 \
eval.batch_size=1 \
env.episode_length=2 \
device=$(DEVICE) \
training.save_model=true \
training.save_freq=2 \
training.batch_size=2 \
hydra.run.dir=tests/outputs/act_pusht/
rm lerobot/configs/policy/created_by_Makefile.yaml

258
README.md
View File

@@ -22,49 +22,13 @@
</div>
<h2 align="center">
<p><a href="https://github.com/huggingface/lerobot/blob/main/examples/12_use_so101.md">
Build Your Own SO-101 Robot!</a></p>
</h2>
<div align="center">
<div style="display: flex; gap: 1rem; justify-content: center; align-items: center;" >
<img
src="media/so101/so101.webp?raw=true"
alt="SO-101 follower arm"
title="SO-101 follower arm"
style="width: 40%;"
/>
<img
src="media/so101/so101-leader.webp?raw=true"
alt="SO-101 leader arm"
title="SO-101 leader arm"
style="width: 40%;"
/>
</div>
<p><strong>Meet the updated SO100, the SO-101 Just €114 per arm!</strong></p>
<p>Train it in minutes with a few simple moves on your laptop.</p>
<p>Then sit back and watch your creation act autonomously! 🤯</p>
<p><a href="https://github.com/huggingface/lerobot/blob/main/examples/12_use_so101.md">
See the full SO-101 tutorial here.</a></p>
<p>Want to take it to the next level? Make your SO-101 mobile by building LeKiwi!</p>
<p>Check out the <a href="https://github.com/huggingface/lerobot/blob/main/examples/11_use_lekiwi.md">LeKiwi tutorial</a> and bring your robot to life on wheels.</p>
<img src="media/lekiwi/kiwi.webp?raw=true" alt="LeKiwi mobile robot" title="LeKiwi mobile robot" width="50%">
</div>
<br/>
<h3 align="center">
<p>LeRobot: State-of-the-art AI for real-world robotics</p>
<p>State-of-the-art Machine Learning for real-world robotics</p>
</h3>
---
🤗 LeRobot aims to provide models, datasets, and tools for real-world robotics in PyTorch. The goal is to lower the barrier to entry to robotics so that everyone can contribute and benefit from sharing datasets and pretrained models.
🤗 LeRobot contains state-of-the-art approaches that have been shown to transfer to the real-world with a focus on imitation learning and reinforcement learning.
@@ -77,9 +41,9 @@
<table>
<tr>
<td><img src="media/gym/aloha_act.gif" width="100%" alt="ACT policy on ALOHA env"/></td>
<td><img src="media/gym/simxarm_tdmpc.gif" width="100%" alt="TDMPC policy on SimXArm env"/></td>
<td><img src="media/gym/pusht_diffusion.gif" width="100%" alt="Diffusion policy on PushT env"/></td>
<td><img src="http://remicadene.com/assets/gif/aloha_act.gif" width="100%" alt="ACT policy on ALOHA env"/></td>
<td><img src="http://remicadene.com/assets/gif/simxarm_tdmpc.gif" width="100%" alt="TDMPC policy on SimXArm env"/></td>
<td><img src="http://remicadene.com/assets/gif/pusht_diffusion.gif" width="100%" alt="Diffusion policy on PushT env"/></td>
</tr>
<tr>
<td align="center">ACT policy on ALOHA env</td>
@@ -90,46 +54,32 @@
### Acknowledgment
- Thanks to Tony Zhao, Zipeng Fu and colleagues for open sourcing ACT policy, ALOHA environments and datasets. Ours are adapted from [ALOHA](https://tonyzhaozh.github.io/aloha) and [Mobile ALOHA](https://mobile-aloha.github.io).
- Thanks to Tony Zaho, Zipeng Fu and colleagues for open sourcing ACT policy, ALOHA environments and datasets. Ours are adapted from [ALOHA](https://tonyzhaozh.github.io/aloha) and [Mobile ALOHA](https://mobile-aloha.github.io).
- Thanks to Cheng Chi, Zhenjia Xu and colleagues for open sourcing Diffusion policy, Pusht environment and datasets, as well as UMI datasets. Ours are adapted from [Diffusion Policy](https://diffusion-policy.cs.columbia.edu) and [UMI Gripper](https://umi-gripper.github.io).
- Thanks to Nicklas Hansen, Yunhai Feng and colleagues for open sourcing TDMPC policy, Simxarm environments and datasets. Ours are adapted from [TDMPC](https://github.com/nicklashansen/tdmpc) and [FOWM](https://www.yunhaifeng.com/FOWM).
- Thanks to Antonio Loquercio and Ashish Kumar for their early support.
- Thanks to [Seungjae (Jay) Lee](https://sjlee.cc/), [Mahi Shafiullah](https://mahis.life/) and colleagues for open sourcing [VQ-BeT](https://sjlee.cc/vq-bet/) policy and helping us adapt the codebase to our repository. The policy is adapted from [VQ-BeT repo](https://github.com/jayLEE0301/vq_bet_official).
## Installation
Download our source code:
```bash
git clone https://github.com/huggingface/lerobot.git
cd lerobot
git clone https://github.com/huggingface/lerobot.git && cd lerobot
```
Create a virtual environment with Python 3.10 and activate it, e.g. with [`miniconda`](https://docs.anaconda.com/free/miniconda/index.html):
```bash
conda create -y -n lerobot python=3.10
conda activate lerobot
conda create -y -n lerobot python=3.10 && conda activate lerobot
```
When using `miniconda`, install `ffmpeg` in your environment:
```bash
conda install ffmpeg -c conda-forge
```
> **NOTE:** This usually installs `ffmpeg 7.X` for your platform compiled with the `libsvtav1` encoder. If `libsvtav1` is not supported (check supported encoders with `ffmpeg -encoders`), you can:
> - _[On any platform]_ Explicitly install `ffmpeg 7.X` using:
> ```bash
> conda install ffmpeg=7.1.1 -c conda-forge
> ```
> - _[On Linux only]_ Install [ffmpeg build dependencies](https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu#GettheDependencies) and [compile ffmpeg from source with libsvtav1](https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu#libsvtav1), and make sure you use the corresponding ffmpeg binary to your install with `which ffmpeg`.
Install 🤗 LeRobot:
```bash
pip install -e .
pip install .
```
> **NOTE:** If you encounter build errors, you may need to install additional dependencies (`cmake`, `build-essential`, and `ffmpeg libs`). On Linux, run:
`sudo apt-get install cmake build-essential python3-dev pkg-config libavformat-dev libavcodec-dev libavdevice-dev libavutil-dev libswscale-dev libswresample-dev libavfilter-dev pkg-config`. For other systems, see: [Compiling PyAV](https://pyav.org/docs/develop/overview/installation.html#bring-your-own-ffmpeg)
> **NOTE:** Depending on your platform, If you encounter any build errors during this step
you may need to install `cmake` and `build-essential` for building some of our dependencies.
On linux: `sudo apt-get install cmake build-essential`
For simulations, 🤗 LeRobot comes with gymnasium environments that can be installed as extras:
- [aloha](https://github.com/huggingface/gym-aloha)
@@ -138,7 +88,7 @@ For simulations, 🤗 LeRobot comes with gymnasium environments that can be inst
For instance, to install 🤗 LeRobot with aloha and pusht, use:
```bash
pip install -e ".[aloha, pusht]"
pip install ".[aloha, pusht]"
```
To use [Weights and Biases](https://docs.wandb.ai/quickstart) for experiment tracking, log in with
@@ -155,17 +105,18 @@ wandb login
├── examples # contains demonstration examples, start here to learn about LeRobot
| └── advanced # contains even more examples for those who have mastered the basics
├── lerobot
| ├── configs # contains config classes with all options that you can override in the command line
| ├── configs # contains hydra yaml files with all options that you can override in the command line
| | ├── default.yaml # selected by default, it loads pusht environment and diffusion policy
| | ├── env # various sim environments and their datasets: aloha.yaml, pusht.yaml, xarm.yaml
| | └── policy # various policies: act.yaml, diffusion.yaml, tdmpc.yaml
| ├── common # contains classes and utilities
| | ├── datasets # various datasets of human demonstrations: aloha, pusht, xarm
| | ├── envs # various sim environments: aloha, pusht, xarm
| | ├── policies # various policies: act, diffusion, tdmpc
| | ├── robot_devices # various real devices: dynamixel motors, opencv cameras, koch robots
| | └── utils # various utilities
| └── scripts # contains functions to execute via command line
| ├── eval.py # load policy and evaluate it on an environment
| ├── train.py # train a policy via imitation learning and/or reinforcement learning
| ├── control_robot.py # teleoperate a real robot, record data, run a policy
| ├── push_dataset_to_hub.py # convert your dataset into LeRobot dataset format and upload it to the Hugging Face hub
| └── visualize_dataset.py # load a dataset and render its demonstrations
├── outputs # contains results of scripts execution: logs, videos, model checkpoints
@@ -174,7 +125,7 @@ wandb login
### Visualize datasets
Check out [example 1](./examples/1_load_lerobot_dataset.py) that illustrates how to use our dataset class which automatically downloads data from the Hugging Face hub.
Check out [example 1](./examples/1_load_lerobot_dataset.py) that illustrates how to use our dataset class which automatically download data from the Hugging Face hub.
You can also locally visualize episodes from a dataset on the hub by executing our script from the command line:
```bash
@@ -183,12 +134,10 @@ python lerobot/scripts/visualize_dataset.py \
--episode-index 0
```
or from a dataset in a local folder with the `root` option and the `--local-files-only` (in the following case the dataset will be searched for in `./my_local_data_dir/lerobot/pusht`)
or from a dataset in a local folder with the root `DATA_DIR` environment variable
```bash
python lerobot/scripts/visualize_dataset.py \
DATA_DIR='./my_local_data_dir' python lerobot/scripts/visualize_dataset.py \
--repo-id lerobot/pusht \
--root ./my_local_data_dir \
--local-files-only 1 \
--episode-index 0
```
@@ -202,48 +151,48 @@ Our script can also visualize datasets stored on a distant server. See `python l
### The `LeRobotDataset` format
A dataset in `LeRobotDataset` format is very simple to use. It can be loaded from a repository on the Hugging Face hub or a local folder simply with e.g. `dataset = LeRobotDataset("lerobot/aloha_static_coffee")` and can be indexed into like any Hugging Face and PyTorch dataset. For instance `dataset[0]` will retrieve a single temporal frame from the dataset containing observation(s) and an action as PyTorch tensors ready to be fed to a model.
A dataset in `LeRobotDataset` format is very simple to use. It can be loaded from a repository on the Hugging Face hub or a local folder simply with e.g. `dataset = LeRobotDataset("lerobot/aloha_static_coffee")` and can be indexed into like any Hugging Face and Pytorch dataset. For instance `dataset[0]` will retrieve a sample of the dataset observations and actions in pytorch tensors format ready to be fed to a model.
A specificity of `LeRobotDataset` is that, rather than retrieving a single frame by its index, we can retrieve several frames based on their temporal relationship with the indexed frame, by setting `delta_timestamps` to a list of relative times with respect to the indexed frame. For example, with `delta_timestamps = {"observation.image": [-1, -0.5, -0.2, 0]}` one can retrieve, for a given index, 4 frames: 3 "previous" frames 1 second, 0.5 seconds, and 0.2 seconds before the indexed frame, and the indexed frame itself (corresponding to the 0 entry). See example [1_load_lerobot_dataset.py](examples/1_load_lerobot_dataset.py) for more details on `delta_timestamps`.
A specificity of `LeRobotDataset` is that we can retrieve several frames for one sample query. By setting `delta_timestamps` to a list of delta timestamps, e.g. `delta_timestamps = {"observation.image": [-1, -0.5, -0.2, 0]}` one can retrieve, for each query, 4 images including one at -1 second before the current time step, the two others at -0.5 second and -0.2, and the final one at the current time step (0 second). See example [1_load_lerobot_dataset.py](examples/1_load_lerobot_dataset.py) for more details on `delta_timestamps`.
Under the hood, the `LeRobotDataset` format makes use of several ways to serialize data which can be useful to understand if you plan to work more closely with this format. We tried to make a flexible yet simple dataset format that would cover most type of features and specificities present in reinforcement learning and robotics, in simulation and in real-world, with a focus on cameras and robot states but easily extended to other types of sensory inputs as long as they can be represented by a tensor.
Under the hood, the `LeRobotDataset` format makes use of several ways to serialize data which can be useful to understand if you plan to work more closely with this format. We tried to make a flexible yet simple dataset format that would cover most type of features and specificities present in reinforcement learning and robotics, in simulation and in real-world, with a focus on cameras and robot states.
Here are the important details and internal structure organization of a typical `LeRobotDataset` instantiated with `dataset = LeRobotDataset("lerobot/aloha_static_coffee")`. The exact features will change from dataset to dataset but not the main aspects:
```
dataset attributes:
├ hf_dataset: a Hugging Face dataset (backed by Arrow/parquet). Typical features example:
│ ├ observation.images.cam_high (VideoFrame):
│ │ VideoFrame = {'path': path to a mp4 video, 'timestamp' (float32): timestamp in the video}
│ ├ observation.state (list of float32): position of an arm joints (for instance)
│ ├ observation.images.cam_high: VideoFrame
│ │ VideoFrame = {'path': path to a mp4 video, 'timestamp': float32 timestamp in the video}
│ ├ observation.state: List of float32: position of an arm joints (for instance)
│ ... (more observations)
│ ├ action (list of float32): goal position of an arm joints (for instance)
│ ├ episode_index (int64): index of the episode for this sample
│ ├ frame_index (int64): index of the frame for this sample in the episode ; starts at 0 for each episode
│ ├ timestamp (float32): timestamp in the episode
│ ├ next.done (bool): indicates the end of an episode ; True for the last frame in each episode
│ └ index (int64): general index in the whole dataset
│ ├ action: List of float32
│ ├ episode_index: int64: index of the episode for this sample
│ ├ frame_index: int64: index of the frame for this sample in the episode ; starts at 0 for each episode
│ ├ timestamp: float32: timestamp in the episode
│ ├ next.done: bool: indicates the end of en episode ; True for the last frame in each episode
│ └ index: int64: general index in the whole dataset
├ episode_data_index: contains 2 tensors with the start and end indices of each episode
│ ├ from (1D int64 tensor): first frame index for each episode shape (num episodes,) starts with 0
│ └ to: (1D int64 tensor): last frame index for each episode shape (num episodes,)
│ ├ from: 1D int64 tensor of first frame index for each episode: shape (num episodes,) starts with 0
│ └ to: 1D int64 tensor of last frame index for each episode: shape (num episodes,)
├ stats: a dictionary of statistics (max, mean, min, std) for each feature in the dataset, for instance
│ ├ observation.images.cam_high: {'max': tensor with same number of dimensions (e.g. `(c, 1, 1)` for images, `(c,)` for states), etc.}
│ ...
├ info: a dictionary of metadata on the dataset
│ ├ codebase_version (str): this is to keep track of the codebase version the dataset was created with
├ fps (float): frame per second the dataset is recorded/synchronized to
├ video (bool): indicates if frames are encoded in mp4 video files to save space or stored as png files
encoding (dict): if video, this documents the main options that were used with ffmpeg to encode the videos
├ videos_dir (Path): where the mp4 videos or png images are stored/accessed
└ camera_keys (list of string): the keys to access camera features in the item returned by the dataset (e.g. `["observation.images.cam_high", ...]`)
│ ├ fps: float - frame per second the dataset is recorded/synchronized to
└ video: bool - indicates if frames are encoded in mp4 video files to save space or stored as png files
├ videos_dir: path to where the mp4 videos or png images are stored/accessed
camera_keys: List of string: the keys to access camera features in the item returned by the dataset (e.g. `["observation.images.cam_high", ...]`)
```
A `LeRobotDataset` is serialised using several widespread file formats for each of its parts, namely:
- hf_dataset stored using Hugging Face datasets library serialization to parquet
- videos are stored in mp4 format to save space
- metadata are stored in plain json/jsonl files
- videos are stored in mp4 format to save space or png files
- episode_data_index saved using `safetensor` tensor serializtion format
- stats saved using `safetensor` tensor serializtion format
- info are saved using JSON
Dataset can be uploaded/downloaded from the HuggingFace hub seamlessly. To work on a local dataset, you can specify its location with the `root` argument if it's not in the default `~/.cache/huggingface/lerobot` location.
Dataset can uploaded/downloaded from the HuggingFace hub seamlessly. To work on a local dataset, you can set the `DATA_DIR` environment variable to you root dataset folder as illustrated in the above section on dataset visualization.
### Evaluate a pretrained policy
@@ -252,18 +201,15 @@ Check out [example 2](./examples/2_evaluate_pretrained_policy.py) that illustrat
We also provide a more capable script to parallelize the evaluation over multiple environments during the same rollout. Here is an example with a pretrained model hosted on [lerobot/diffusion_pusht](https://huggingface.co/lerobot/diffusion_pusht):
```bash
python lerobot/scripts/eval.py \
--policy.path=lerobot/diffusion_pusht \
--env.type=pusht \
--eval.batch_size=10 \
--eval.n_episodes=10 \
--policy.use_amp=false \
--policy.device=cuda
-p lerobot/diffusion_pusht \
eval.n_episodes=10 \
eval.batch_size=10
```
Note: After training your own policy, you can re-evaluate the checkpoints with:
```bash
python lerobot/scripts/eval.py --policy.path={OUTPUT_DIR}/checkpoints/last/pretrained_model
python lerobot/scripts/eval.py -p {OUTPUT_DIR}/checkpoints/last/pretrained_model
```
See `python lerobot/scripts/eval.py --help` for more instructions.
@@ -272,46 +218,81 @@ See `python lerobot/scripts/eval.py --help` for more instructions.
Check out [example 3](./examples/3_train_policy.py) that illustrates how to train a model using our core library in python, and [example 4](./examples/4_train_policy_with_script.md) that shows how to use our training script from command line.
To use wandb for logging training and evaluation curves, make sure you've run `wandb login` as a one-time setup step. Then, when running the training command above, enable WandB in the configuration by adding `--wandb.enable=true`.
In general, you can use our training script to easily train any policy. Here is an example of training the ACT policy on trajectories collected by humans on the Aloha simulation environment for the insertion task:
A link to the wandb logs for the run will also show up in yellow in your terminal. Here is an example of what they look like in your browser. Please also check [here](./examples/4_train_policy_with_script.md#typical-logs-and-metrics) for the explanation of some commonly used metrics in logs.
```bash
python lerobot/scripts/train.py \
policy=act \
env=aloha \
env.task=AlohaInsertion-v0 \
dataset_repo_id=lerobot/aloha_sim_insertion_human \
```
The experiment directory is automatically generated and will show up in yellow in your terminal. It looks like `outputs/train/2024-05-05/20-21-12_aloha_act_default`. You can manually specify an experiment directory by adding this argument to the `train.py` python command:
```bash
hydra.run.dir=your/new/experiment/dir
```
In the experiment directory there will be a folder called `checkpoints` which will have the following structure:
```bash
checkpoints
├── 000250 # checkpoint_dir for training step 250
│ ├── pretrained_model # Hugging Face pretrained model dir
│ │ ├── config.json # Hugging Face pretrained model config
│ │ ├── config.yaml # consolidated Hydra config
│ │ ├── model.safetensors # model weights
│ │ └── README.md # Hugging Face model card
│ └── training_state.pth # optimizer/scheduler/rng state and training step
```
To use wandb for logging training and evaluation curves, make sure you've run `wandb login` as a one-time setup step. Then, when running the training command above, enable WandB in the configuration by adding:
```bash
wandb.enable=true
```
A link to the wandb logs for the run will also show up in yellow in your terminal. Here is an example of what they look like in your browser:
![](media/wandb.png)
Note: For efficiency, during training every checkpoint is evaluated on a low number of episodes. You may use `--eval.n_episodes=500` to evaluate on more episodes than the default. Or, after training, you may want to re-evaluate your best checkpoints on more episodes or change the evaluation settings. See `python lerobot/scripts/eval.py --help` for more instructions.
Note: For efficiency, during training every checkpoint is evaluated on a low number of episodes. You may use `eval.n_episodes=500` to evaluate on more episodes than the default. Or, after training, you may want to re-evaluate your best checkpoints on more episodes or change the evaluation settings. See `python lerobot/scripts/eval.py --help` for more instructions.
#### Reproduce state-of-the-art (SOTA)
We provide some pretrained policies on our [hub page](https://huggingface.co/lerobot) that can achieve state-of-the-art performances.
You can reproduce their training by loading the config from their run. Simply running:
We have organized our configuration files (found under [`lerobot/configs`](./lerobot/configs)) such that they reproduce SOTA results from a given model variant in their respective original works. Simply running:
```bash
python lerobot/scripts/train.py --config_path=lerobot/diffusion_pusht
python lerobot/scripts/train.py policy=diffusion env=pusht
```
reproduces SOTA results for Diffusion Policy on the PushT task.
Pretrained policies, along with reproduction details, can be found under the "Models" section of https://huggingface.co/lerobot.
## Contribute
If you would like to contribute to 🤗 LeRobot, please check out our [contribution guide](https://github.com/huggingface/lerobot/blob/main/CONTRIBUTING.md).
<!-- ### Add a new dataset
### Add a new dataset
To add a dataset to the hub, you need to login using a write-access token, which can be generated from the [Hugging Face settings](https://huggingface.co/settings/tokens):
```bash
huggingface-cli login --token ${HUGGINGFACE_TOKEN} --add-to-git-credential
```
Then point to your raw dataset folder (e.g. `data/aloha_static_pingpong_test_raw`), and push your dataset to the hub with:
Then move your dataset folder in `data` directory (e.g. `data/aloha_static_pingpong_test`), and push your dataset to the hub with:
```bash
python lerobot/scripts/push_dataset_to_hub.py \
--raw-dir data/aloha_static_pingpong_test_raw \
--out-dir data \
--repo-id lerobot/aloha_static_pingpong_test \
--raw-format aloha_hdf5
--data-dir data \
--dataset-id aloha_static_pingpong_test \
--raw-format aloha_hdf5 \
--community-id lerobot
```
See `python lerobot/scripts/push_dataset_to_hub.py --help` for more instructions.
If your dataset format is not supported, implement your own in `lerobot/common/datasets/push_dataset_to_hub/${raw_format}_format.py` by copying examples like [pusht_zarr](https://github.com/huggingface/lerobot/blob/main/lerobot/common/datasets/push_dataset_to_hub/pusht_zarr_format.py), [umi_zarr](https://github.com/huggingface/lerobot/blob/main/lerobot/common/datasets/push_dataset_to_hub/umi_zarr_format.py), [aloha_hdf5](https://github.com/huggingface/lerobot/blob/main/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py), or [xarm_pkl](https://github.com/huggingface/lerobot/blob/main/lerobot/common/datasets/push_dataset_to_hub/xarm_pkl_format.py). -->
If your dataset format is not supported, implement your own in `lerobot/common/datasets/push_dataset_to_hub/${raw_format}_format.py` by copying examples like [pusht_zarr](https://github.com/huggingface/lerobot/blob/main/lerobot/common/datasets/push_dataset_to_hub/pusht_zarr_format.py), [umi_zarr](https://github.com/huggingface/lerobot/blob/main/lerobot/common/datasets/push_dataset_to_hub/umi_zarr_format.py), [aloha_hdf5](https://github.com/huggingface/lerobot/blob/main/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py), or [xarm_pkl](https://github.com/huggingface/lerobot/blob/main/lerobot/common/datasets/push_dataset_to_hub/xarm_pkl_format.py).
### Add a pretrained policy
@@ -321,7 +302,7 @@ Once you have trained a policy you may upload it to the Hugging Face hub using a
You first need to find the checkpoint folder located inside your experiment directory (e.g. `outputs/train/2024-05-05/20-21-12_aloha_act_default/checkpoints/002500`). Within that there is a `pretrained_model` directory which should contain:
- `config.json`: A serialized version of the policy configuration (following the policy's dataclass config).
- `model.safetensors`: A set of `torch.nn.Module` parameters, saved in [Hugging Face Safetensors](https://huggingface.co/docs/safetensors/index) format.
- `train_config.json`: A consolidated configuration containing all parameters used for training. The policy configuration should match `config.json` exactly. This is useful for anyone who wants to evaluate your policy or for reproducibility.
- `config.yaml`: A consolidated Hydra training configuration containing the policy, environment, and dataset configs. The policy configuration should match `config.json` exactly. The environment config is useful for anyone who wants to evaluate your policy. The dataset config just serves as a paper trail for reproducibility.
To upload these to the hub, run the following:
```bash
@@ -358,7 +339,7 @@ with profile(
## Citation
If you want, you can cite this work with:
```bibtex
```
@misc{cadene2024lerobot,
author = {Cadene, Remi and Alibert, Simon and Soare, Alexander and Gallouedec, Quentin and Zouitine, Adil and Wolf, Thomas},
title = {LeRobot: State-of-the-art Machine Learning for Real-World Robotics in Pytorch},
@@ -366,48 +347,3 @@ If you want, you can cite this work with:
year = {2024}
}
```
Additionally, if you are using any of the particular policy architecture, pretrained models, or datasets, it is recommended to cite the original authors of the work as they appear below:
- [Diffusion Policy](https://diffusion-policy.cs.columbia.edu)
```bibtex
@article{chi2024diffusionpolicy,
author = {Cheng Chi and Zhenjia Xu and Siyuan Feng and Eric Cousineau and Yilun Du and Benjamin Burchfiel and Russ Tedrake and Shuran Song},
title ={Diffusion Policy: Visuomotor Policy Learning via Action Diffusion},
journal = {The International Journal of Robotics Research},
year = {2024},
}
```
- [ACT or ALOHA](https://tonyzhaozh.github.io/aloha)
```bibtex
@article{zhao2023learning,
title={Learning fine-grained bimanual manipulation with low-cost hardware},
author={Zhao, Tony Z and Kumar, Vikash and Levine, Sergey and Finn, Chelsea},
journal={arXiv preprint arXiv:2304.13705},
year={2023}
}
```
- [TDMPC](https://www.nicklashansen.com/td-mpc/)
```bibtex
@inproceedings{Hansen2022tdmpc,
title={Temporal Difference Learning for Model Predictive Control},
author={Nicklas Hansen and Xiaolong Wang and Hao Su},
booktitle={ICML},
year={2022}
}
```
- [VQ-BeT](https://sjlee.cc/vq-bet/)
```bibtex
@article{lee2024behavior,
title={Behavior generation with latent actions},
author={Lee, Seungjae and Wang, Yibin and Etukuru, Haritheja and Kim, H Jin and Shafiullah, Nur Muhammad Mahi and Pinto, Lerrel},
journal={arXiv preprint arXiv:2403.03181},
year={2024}
}
```
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=huggingface/lerobot&type=Timeline)](https://star-history.com/#huggingface/lerobot&Timeline)

View File

@@ -1,271 +0,0 @@
# Video benchmark
## Questions
What is the optimal trade-off between:
- maximizing loading time with random access,
- minimizing memory space on disk,
- maximizing success rate of policies,
- compatibility across devices/platforms for decoding videos (e.g. video players, web browsers).
How to encode videos?
- Which video codec (`-vcodec`) to use? h264, h265, AV1?
- What pixel format to use (`-pix_fmt`)? `yuv444p` or `yuv420p`?
- How much compression (`-crf`)? No compression with `0`, intermediate compression with `25` or extreme with `50+`?
- Which frequency to chose for key frames (`-g`)? A key frame every `10` frames?
How to decode videos?
- Which `decoder`? `torchvision`, `torchaudio`, `ffmpegio`, `decord`, or `nvc`?
- What scenarios to use for the requesting timestamps during benchmark? (`timestamps_mode`)
## Variables
**Image content & size**
We don't expect the same optimal settings for a dataset of images from a simulation, or from real-world in an apartment, or in a factory, or outdoor, or with lots of moving objects in the scene, etc. Similarly, loading times might not vary linearly with the image size (resolution).
For these reasons, we run this benchmark on four representative datasets:
- `lerobot/pusht_image`: (96 x 96 pixels) simulation with simple geometric shapes, fixed camera.
- `aliberts/aloha_mobile_shrimp_image`: (480 x 640 pixels) real-world indoor, moving camera.
- `aliberts/paris_street`: (720 x 1280 pixels) real-world outdoor, moving camera.
- `aliberts/kitchen`: (1080 x 1920 pixels) real-world indoor, fixed camera.
Note: The datasets used for this benchmark need to be image datasets, not video datasets.
**Data augmentations**
We might revisit this benchmark and find better settings if we train our policies with various data augmentations to make them more robust (e.g. robust to color changes, compression, etc.).
### Encoding parameters
| parameter | values |
|-------------|--------------------------------------------------------------|
| **vcodec** | `libx264`, `libx265`, `libsvtav1` |
| **pix_fmt** | `yuv444p`, `yuv420p` |
| **g** | `1`, `2`, `3`, `4`, `5`, `6`, `10`, `15`, `20`, `40`, `None` |
| **crf** | `0`, `5`, `10`, `15`, `20`, `25`, `30`, `40`, `50`, `None` |
Note that `crf` value might be interpreted differently by various video codecs. In other words, the same value used with one codec doesn't necessarily translate into the same compression level with another codec. In fact, the default value (`None`) isn't the same amongst the different video codecs. Importantly, it is also the case for many other ffmpeg arguments like `g` which specifies the frequency of the key frames.
For a comprehensive list and documentation of these parameters, see the ffmpeg documentation depending on the video codec used:
- h264: https://trac.ffmpeg.org/wiki/Encode/H.264
- h265: https://trac.ffmpeg.org/wiki/Encode/H.265
- AV1: https://trac.ffmpeg.org/wiki/Encode/AV1
### Decoding parameters
**Decoder**
We tested two video decoding backends from torchvision:
- `pyav`
- `video_reader` (requires to build torchvision from source)
**Requested timestamps**
Given the way video decoding works, once a keyframe has been loaded, the decoding of subsequent frames is fast.
This of course is affected by the `-g` parameter during encoding, which specifies the frequency of the keyframes. Given our typical use cases in robotics policies which might request a few timestamps in different random places, we want to replicate these use cases with the following scenarios:
- `1_frame`: 1 frame,
- `2_frames`: 2 consecutive frames (e.g. `[t, t + 1 / fps]`),
- `6_frames`: 6 consecutive frames (e.g. `[t + i / fps for i in range(6)]`)
Note that this differs significantly from a typical use case like watching a movie, in which every frame is loaded sequentially from the beginning to the end and it's acceptable to have big values for `-g`.
Additionally, because some policies might request single timestamps that are a few frames apart, we also have the following scenario:
- `2_frames_4_space`: 2 frames with 4 consecutive frames of spacing in between (e.g `[t, t + 5 / fps]`),
However, due to how video decoding is implemented with `pyav`, we don't have access to an accurate seek so in practice this scenario is essentially the same as `6_frames` since all 6 frames between `t` and `t + 5 / fps` will be decoded.
## Metrics
**Data compression ratio (lower is better)**
`video_images_size_ratio` is the ratio of the memory space on disk taken by the encoded video over the memory space taken by the original images. For instance, `video_images_size_ratio=25%` means that the video takes 4 times less memory space on disk compared to the original images.
**Loading time ratio (lower is better)**
`video_images_load_time_ratio` is the ratio of the time it takes to decode frames from the video at a given timestamps over the time it takes to load the exact same original images. Lower is better. For instance, `video_images_load_time_ratio=200%` means that decoding from video is 2 times slower than loading the original images.
**Average Mean Square Error (lower is better)**
`avg_mse` is the average mean square error between each decoded frame and its corresponding original image over all requested timestamps, and also divided by the number of pixels in the image to be comparable when switching to different image sizes.
**Average Peak Signal to Noise Ratio (higher is better)**
`avg_psnr` measures the ratio between the maximum possible power of a signal and the power of corrupting noise that affects the fidelity of its representation. Higher PSNR indicates better quality.
**Average Structural Similarity Index Measure (higher is better)**
`avg_ssim` evaluates the perceived quality of images by comparing luminance, contrast, and structure. SSIM values range from -1 to 1, where 1 indicates perfect similarity.
One aspect that can't be measured here with those metrics is the compatibility of the encoding across platforms, in particular on web browser, for visualization purposes.
h264, h265 and AV1 are all commonly used codecs and should not pose an issue. However, the chroma subsampling (`pix_fmt`) format might affect compatibility:
- `yuv420p` is more widely supported across various platforms, including web browsers.
- `yuv444p` offers higher color fidelity but might not be supported as broadly.
<!-- **Loss of a pretrained policy (higher is better)** (not available)
`loss_pretrained` is the result of evaluating with the selected encoding/decoding settings a policy pretrained on original images. It is easier to understand than `avg_l2_error`.
**Success rate after retraining (higher is better)** (not available)
`success_rate` is the result of training and evaluating a policy with the selected encoding/decoding settings. It is the most difficult metric to get but also the very best. -->
## How the benchmark works
The benchmark evaluates both encoding and decoding of video frames on the first episode of each dataset.
**Encoding:** for each `vcodec` and `pix_fmt` pair, we use a default value for `g` and `crf` upon which we change a single value (either `g` or `crf`) to one of the specified values (we don't test every combination of those as this would be computationally too heavy).
This gives a unique set of encoding parameters which is used to encode the episode.
**Decoding:** Then, for each of those unique encodings, we iterate through every combination of the decoding parameters `backend` and `timestamps_mode`. For each of them, we record the metrics of a number of samples (given by `--num-samples`). This is parallelized for efficiency and the number of processes can be controlled with `--num-workers`. Ideally, it's best to have a `--num-samples` that is divisible by `--num-workers`.
Intermediate results saved for each `vcodec` and `pix_fmt` combination in csv tables.
These are then all concatenated to a single table ready for analysis.
## Caveats
We tried to measure the most impactful parameters for both encoding and decoding. However, for computational reasons we can't test out every combination.
Additional encoding parameters exist that are not included in this benchmark. In particular:
- `-preset` which allows for selecting encoding presets. This represents a collection of options that will provide a certain encoding speed to compression ratio. By leaving this parameter unspecified, it is considered to be `medium` for libx264 and libx265 and `8` for libsvtav1.
- `-tune` which allows to optimize the encoding for certain aspects (e.g. film quality, fast decoding, etc.).
See the documentation mentioned above for more detailed info on these settings and for a more comprehensive list of other parameters.
Similarly on the decoding side, other decoders exist but are not implemented in our current benchmark. To name a few:
- `torchaudio`
- `ffmpegio`
- `decord`
- `nvc`
Note as well that since we are mostly interested in the performance at decoding time (also because encoding is done only once before uploading a dataset), we did not measure encoding times nor have any metrics regarding encoding.
However, besides the necessity to build ffmpeg from source, encoding did not pose any issue and it didn't take a significant amount of time during this benchmark.
## Install
Building ffmpeg from source is required to include libx265 and libaom/libsvtav1 (av1) video codecs ([compilation guide](https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu)).
**Note:** While you still need to build torchvision with a conda-installed `ffmpeg<4.3` to use the `video_reader` decoder (as described in [#220](https://github.com/huggingface/lerobot/pull/220)), you also need another version which is custom-built with all the video codecs for encoding. For the script to then use that version, you can prepend the command above with `PATH="$HOME/bin:$PATH"`, which is where ffmpeg should be built.
## Adding a video decoder
Right now, we're only benchmarking the two video decoder available with torchvision: `pyav` and `video_reader`.
You can easily add a new decoder to benchmark by adding it to this function in the script:
```diff
def decode_video_frames(
video_path: str,
timestamps: list[float],
tolerance_s: float,
backend: str,
) -> torch.Tensor:
if backend in ["pyav", "video_reader"]:
return decode_video_frames_torchvision(
video_path, timestamps, tolerance_s, backend
)
+ elif backend == ["your_decoder"]:
+ return your_decoder_function(
+ video_path, timestamps, tolerance_s, backend
+ )
else:
raise NotImplementedError(backend)
```
## Example
For a quick run, you can try these parameters:
```bash
python benchmark/video/run_video_benchmark.py \
--output-dir outputs/video_benchmark \
--repo-ids \
lerobot/pusht_image \
aliberts/aloha_mobile_shrimp_image \
--vcodec libx264 libx265 \
--pix-fmt yuv444p yuv420p \
--g 2 20 None \
--crf 10 40 None \
--timestamps-modes 1_frame 2_frames \
--backends pyav video_reader \
--num-samples 5 \
--num-workers 5 \
--save-frames 0
```
## Results
### Reproduce
We ran the benchmark with the following parameters:
```bash
# h264 and h265 encodings
python benchmark/video/run_video_benchmark.py \
--output-dir outputs/video_benchmark \
--repo-ids \
lerobot/pusht_image \
aliberts/aloha_mobile_shrimp_image \
aliberts/paris_street \
aliberts/kitchen \
--vcodec libx264 libx265 \
--pix-fmt yuv444p yuv420p \
--g 1 2 3 4 5 6 10 15 20 40 None \
--crf 0 5 10 15 20 25 30 40 50 None \
--timestamps-modes 1_frame 2_frames 6_frames \
--backends pyav video_reader \
--num-samples 50 \
--num-workers 5 \
--save-frames 1
# av1 encoding (only compatible with yuv420p and pyav decoder)
python benchmark/video/run_video_benchmark.py \
--output-dir outputs/video_benchmark \
--repo-ids \
lerobot/pusht_image \
aliberts/aloha_mobile_shrimp_image \
aliberts/paris_street \
aliberts/kitchen \
--vcodec libsvtav1 \
--pix-fmt yuv420p \
--g 1 2 3 4 5 6 10 15 20 40 None \
--crf 0 5 10 15 20 25 30 40 50 None \
--timestamps-modes 1_frame 2_frames 6_frames \
--backends pyav \
--num-samples 50 \
--num-workers 5 \
--save-frames 1
```
The full results are available [here](https://docs.google.com/spreadsheets/d/1OYJB43Qu8fC26k_OyoMFgGBBKfQRCi4BIuYitQnq3sw/edit?usp=sharing)
### Parameters selected for LeRobotDataset
Considering these results, we chose what we think is the best set of encoding parameter:
- vcodec: `libsvtav1`
- pix-fmt: `yuv420p`
- g: `2`
- crf: `30`
Since we're using av1 encoding, we're choosing the `pyav` decoder as `video_reader` does not support it (and `pyav` doesn't require a custom build of `torchvision`).
### Summary
These tables show the results for `g=2` and `crf=30`, using `timestamps-modes=6_frames` and `backend=pyav`
| video_images_size_ratio | vcodec | pix_fmt | | | |
|------------------------------------|------------|---------|-----------|-----------|-----------|
| | libx264 | | libx265 | | libsvtav1 |
| repo_id | yuv420p | yuv444p | yuv420p | yuv444p | yuv420p |
| lerobot/pusht_image | **16.97%** | 17.58% | 18.57% | 18.86% | 22.06% |
| aliberts/aloha_mobile_shrimp_image | 2.14% | 2.11% | 1.38% | **1.37%** | 5.59% |
| aliberts/paris_street | 2.12% | 2.13% | **1.54%** | **1.54%** | 4.43% |
| aliberts/kitchen | 1.40% | 1.39% | **1.00%** | **1.00%** | 2.52% |
| video_images_load_time_ratio | vcodec | pix_fmt | | | |
|------------------------------------|---------|---------|----------|---------|-----------|
| | libx264 | | libx265 | | libsvtav1 |
| repo_id | yuv420p | yuv444p | yuv420p | yuv444p | yuv420p |
| lerobot/pusht_image | 6.45 | 5.19 | **1.90** | 2.12 | 2.47 |
| aliberts/aloha_mobile_shrimp_image | 11.80 | 7.92 | 0.71 | 0.85 | **0.48** |
| aliberts/paris_street | 2.21 | 2.05 | 0.36 | 0.49 | **0.30** |
| aliberts/kitchen | 1.46 | 1.46 | 0.28 | 0.51 | **0.26** |
| | | vcodec | pix_fmt | | | |
|------------------------------------|----------|----------|--------------|----------|-----------|--------------|
| | | libx264 | | libx265 | | libsvtav1 |
| repo_id | metric | yuv420p | yuv444p | yuv420p | yuv444p | yuv420p |
| lerobot/pusht_image | avg_mse | 2.90E-04 | **2.03E-04** | 3.13E-04 | 2.29E-04 | 2.19E-04 |
| | avg_psnr | 35.44 | 37.07 | 35.49 | **37.30** | 37.20 |
| | avg_ssim | 98.28% | **98.85%** | 98.31% | 98.84% | 98.72% |
| aliberts/aloha_mobile_shrimp_image | avg_mse | 2.76E-04 | 2.59E-04 | 3.17E-04 | 3.06E-04 | **1.30E-04** |
| | avg_psnr | 35.91 | 36.21 | 35.88 | 36.09 | **40.17** |
| | avg_ssim | 95.19% | 95.18% | 95.00% | 95.05% | **97.73%** |
| aliberts/paris_street | avg_mse | 6.89E-04 | 6.70E-04 | 4.03E-03 | 4.02E-03 | **3.09E-04** |
| | avg_psnr | 33.48 | 33.68 | 32.05 | 32.15 | **35.40** |
| | avg_ssim | 93.76% | 93.75% | 89.46% | 89.46% | **95.46%** |
| aliberts/kitchen | avg_mse | 2.50E-04 | 2.24E-04 | 4.28E-04 | 4.18E-04 | **1.53E-04** |
| | avg_psnr | 36.73 | 37.33 | 36.56 | 36.75 | **39.12** |
| | avg_ssim | 95.47% | 95.58% | 95.52% | 95.53% | **96.82%** |

View File

@@ -1,102 +0,0 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Capture video feed from a camera as raw images."""
import argparse
import datetime as dt
import os
import time
from pathlib import Path
import cv2
import rerun as rr
# see https://rerun.io/docs/howto/visualization/limit-ram
RERUN_MEMORY_LIMIT = os.getenv("LEROBOT_RERUN_MEMORY_LIMIT", "5%")
def display_and_save_video_stream(output_dir: Path, fps: int, width: int, height: int, duration: int):
rr.init("lerobot_capture_camera_feed")
rr.spawn(memory_limit=RERUN_MEMORY_LIMIT)
now = dt.datetime.now()
capture_dir = output_dir / f"{now:%Y-%m-%d}" / f"{now:%H-%M-%S}"
if not capture_dir.exists():
capture_dir.mkdir(parents=True, exist_ok=True)
# Opens the default webcam
cap = cv2.VideoCapture(0)
if not cap.isOpened():
print("Error: Could not open video stream.")
return
cap.set(cv2.CAP_PROP_FPS, fps)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
frame_index = 0
start_time = time.time()
while time.time() - start_time < duration:
ret, frame = cap.read()
if not ret:
print("Error: Could not read frame.")
break
rr.log("video/stream", rr.Image(frame.numpy()), static=True)
cv2.imwrite(str(capture_dir / f"frame_{frame_index:06d}.png"), frame)
frame_index += 1
# Release the capture
cap.release()
# TODO(Steven): Add a graceful shutdown via a close() method for the Viewer context, though not currently supported in the Rerun API.
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--output-dir",
type=Path,
default=Path("outputs/cam_capture/"),
help="Directory where the capture images are written. A subfolder named with the current date & time will be created inside it for each capture.",
)
parser.add_argument(
"--fps",
type=int,
default=30,
help="Frames Per Second of the capture.",
)
parser.add_argument(
"--width",
type=int,
default=1280,
help="Width of the captured images.",
)
parser.add_argument(
"--height",
type=int,
default=720,
help="Height of the captured images.",
)
parser.add_argument(
"--duration",
type=int,
default=20,
help="Duration in seconds for which the video stream should be captured.",
)
args = parser.parse_args()
display_and_save_video_stream(**vars(args))

View File

@@ -1,490 +0,0 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Assess the performance of video decoding in various configurations.
This script will benchmark different video encoding and decoding parameters.
See the provided README.md or run `python benchmark/video/run_video_benchmark.py --help` for usage info.
"""
import argparse
import datetime as dt
import random
import shutil
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import einops
import numpy as np
import pandas as pd
import PIL
import torch
from skimage.metrics import mean_squared_error, peak_signal_noise_ratio, structural_similarity
from tqdm import tqdm
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
from lerobot.common.datasets.video_utils import (
decode_video_frames_torchvision,
encode_video_frames,
)
from lerobot.common.utils.benchmark import TimeBenchmark
BASE_ENCODING = OrderedDict(
[
("vcodec", "libx264"),
("pix_fmt", "yuv444p"),
("g", 2),
("crf", None),
# TODO(aliberts): Add fastdecode
# ("fastdecode", 0),
]
)
# TODO(rcadene, aliberts): move to `utils.py` folder when we want to refactor
def parse_int_or_none(value) -> int | None:
if value.lower() == "none":
return None
try:
return int(value)
except ValueError as e:
raise argparse.ArgumentTypeError(f"Invalid int or None: {value}") from e
def check_datasets_formats(repo_ids: list) -> None:
for repo_id in repo_ids:
dataset = LeRobotDataset(repo_id)
if len(dataset.meta.video_keys) > 0:
raise ValueError(
f"Use only image dataset for running this benchmark. Video dataset provided: {repo_id}"
)
def get_directory_size(directory: Path) -> int:
total_size = 0
for item in directory.rglob("*"):
if item.is_file():
total_size += item.stat().st_size
return total_size
def load_original_frames(imgs_dir: Path, timestamps: list[float], fps: int) -> torch.Tensor:
frames = []
for ts in timestamps:
idx = int(ts * fps)
frame = PIL.Image.open(imgs_dir / f"frame_{idx:06d}.png")
frame = torch.from_numpy(np.array(frame))
frame = frame.type(torch.float32) / 255
frame = einops.rearrange(frame, "h w c -> c h w")
frames.append(frame)
return torch.stack(frames)
def save_decoded_frames(
imgs_dir: Path, save_dir: Path, frames: torch.Tensor, timestamps: list[float], fps: int
) -> None:
if save_dir.exists() and len(list(save_dir.glob("frame_*.png"))) == len(timestamps):
return
save_dir.mkdir(parents=True, exist_ok=True)
for i, ts in enumerate(timestamps):
idx = int(ts * fps)
frame_hwc = (frames[i].permute((1, 2, 0)) * 255).type(torch.uint8).cpu().numpy()
PIL.Image.fromarray(frame_hwc).save(save_dir / f"frame_{idx:06d}_decoded.png")
shutil.copyfile(imgs_dir / f"frame_{idx:06d}.png", save_dir / f"frame_{idx:06d}_original.png")
def save_first_episode(imgs_dir: Path, dataset: LeRobotDataset) -> None:
ep_num_images = dataset.episode_data_index["to"][0].item()
if imgs_dir.exists() and len(list(imgs_dir.glob("frame_*.png"))) == ep_num_images:
return
imgs_dir.mkdir(parents=True, exist_ok=True)
hf_dataset = dataset.hf_dataset.with_format(None)
# We only save images from the first camera
img_keys = [key for key in hf_dataset.features if key.startswith("observation.image")]
imgs_dataset = hf_dataset.select_columns(img_keys[0])
for i, item in enumerate(
tqdm(imgs_dataset, desc=f"saving {dataset.repo_id} first episode images", leave=False)
):
img = item[img_keys[0]]
img.save(str(imgs_dir / f"frame_{i:06d}.png"), quality=100)
if i >= ep_num_images - 1:
break
def sample_timestamps(timestamps_mode: str, ep_num_images: int, fps: int) -> list[float]:
# Start at 5 to allow for 2_frames_4_space and 6_frames
idx = random.randint(5, ep_num_images - 1)
match timestamps_mode:
case "1_frame":
frame_indexes = [idx]
case "2_frames":
frame_indexes = [idx - 1, idx]
case "2_frames_4_space":
frame_indexes = [idx - 5, idx]
case "6_frames":
frame_indexes = [idx - i for i in range(6)][::-1]
case _:
raise ValueError(timestamps_mode)
return [idx / fps for idx in frame_indexes]
def decode_video_frames(
video_path: str,
timestamps: list[float],
tolerance_s: float,
backend: str,
) -> torch.Tensor:
if backend in ["pyav", "video_reader"]:
return decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend)
else:
raise NotImplementedError(backend)
def benchmark_decoding(
imgs_dir: Path,
video_path: Path,
timestamps_mode: str,
backend: str,
ep_num_images: int,
fps: int,
num_samples: int = 50,
num_workers: int = 4,
save_frames: bool = False,
) -> dict:
def process_sample(sample: int):
time_benchmark = TimeBenchmark()
timestamps = sample_timestamps(timestamps_mode, ep_num_images, fps)
num_frames = len(timestamps)
result = {
"psnr_values": [],
"ssim_values": [],
"mse_values": [],
}
with time_benchmark:
frames = decode_video_frames(video_path, timestamps=timestamps, tolerance_s=5e-1, backend=backend)
result["load_time_video_ms"] = time_benchmark.result_ms / num_frames
with time_benchmark:
original_frames = load_original_frames(imgs_dir, timestamps, fps)
result["load_time_images_ms"] = time_benchmark.result_ms / num_frames
frames_np, original_frames_np = frames.numpy(), original_frames.numpy()
for i in range(num_frames):
result["mse_values"].append(mean_squared_error(original_frames_np[i], frames_np[i]))
result["psnr_values"].append(
peak_signal_noise_ratio(original_frames_np[i], frames_np[i], data_range=1.0)
)
result["ssim_values"].append(
structural_similarity(original_frames_np[i], frames_np[i], data_range=1.0, channel_axis=0)
)
if save_frames and sample == 0:
save_dir = video_path.with_suffix("") / f"{timestamps_mode}_{backend}"
save_decoded_frames(imgs_dir, save_dir, frames, timestamps, fps)
return result
load_times_video_ms = []
load_times_images_ms = []
mse_values = []
psnr_values = []
ssim_values = []
# A sample is a single set of decoded frames specified by timestamps_mode (e.g. a single frame, 2 frames, etc.).
# For each sample, we record metrics (loading time and quality metrics) which are then averaged over all samples.
# As these samples are independent, we run them in parallel threads to speed up the benchmark.
with ThreadPoolExecutor(max_workers=num_workers) as executor:
futures = [executor.submit(process_sample, i) for i in range(num_samples)]
for future in tqdm(as_completed(futures), total=num_samples, desc="samples", leave=False):
result = future.result()
load_times_video_ms.append(result["load_time_video_ms"])
load_times_images_ms.append(result["load_time_images_ms"])
psnr_values.extend(result["psnr_values"])
ssim_values.extend(result["ssim_values"])
mse_values.extend(result["mse_values"])
avg_load_time_video_ms = float(np.array(load_times_video_ms).mean())
avg_load_time_images_ms = float(np.array(load_times_images_ms).mean())
video_images_load_time_ratio = avg_load_time_video_ms / avg_load_time_images_ms
return {
"avg_load_time_video_ms": avg_load_time_video_ms,
"avg_load_time_images_ms": avg_load_time_images_ms,
"video_images_load_time_ratio": video_images_load_time_ratio,
"avg_mse": float(np.mean(mse_values)),
"avg_psnr": float(np.mean(psnr_values)),
"avg_ssim": float(np.mean(ssim_values)),
}
def benchmark_encoding_decoding(
dataset: LeRobotDataset,
video_path: Path,
imgs_dir: Path,
encoding_cfg: dict,
decoding_cfg: dict,
num_samples: int,
num_workers: int,
save_frames: bool,
overwrite: bool = False,
seed: int = 1337,
) -> list[dict]:
fps = dataset.fps
if overwrite or not video_path.is_file():
tqdm.write(f"encoding {video_path}")
encode_video_frames(
imgs_dir=imgs_dir,
video_path=video_path,
fps=fps,
vcodec=encoding_cfg["vcodec"],
pix_fmt=encoding_cfg["pix_fmt"],
g=encoding_cfg.get("g"),
crf=encoding_cfg.get("crf"),
# fast_decode=encoding_cfg.get("fastdecode"),
overwrite=True,
)
ep_num_images = dataset.episode_data_index["to"][0].item()
width, height = tuple(dataset[0][dataset.meta.camera_keys[0]].shape[-2:])
num_pixels = width * height
video_size_bytes = video_path.stat().st_size
images_size_bytes = get_directory_size(imgs_dir)
video_images_size_ratio = video_size_bytes / images_size_bytes
random.seed(seed)
benchmark_table = []
for timestamps_mode in tqdm(
decoding_cfg["timestamps_modes"], desc="decodings (timestamps_modes)", leave=False
):
for backend in tqdm(decoding_cfg["backends"], desc="decodings (backends)", leave=False):
benchmark_row = benchmark_decoding(
imgs_dir,
video_path,
timestamps_mode,
backend,
ep_num_images,
fps,
num_samples,
num_workers,
save_frames,
)
benchmark_row.update(
**{
"repo_id": dataset.repo_id,
"resolution": f"{width} x {height}",
"num_pixels": num_pixels,
"video_size_bytes": video_size_bytes,
"images_size_bytes": images_size_bytes,
"video_images_size_ratio": video_images_size_ratio,
"timestamps_mode": timestamps_mode,
"backend": backend,
},
**encoding_cfg,
)
benchmark_table.append(benchmark_row)
return benchmark_table
def main(
output_dir: Path,
repo_ids: list[str],
vcodec: list[str],
pix_fmt: list[str],
g: list[int],
crf: list[int],
# fastdecode: list[int],
timestamps_modes: list[str],
backends: list[str],
num_samples: int,
num_workers: int,
save_frames: bool,
):
check_datasets_formats(repo_ids)
encoding_benchmarks = {
"g": g,
"crf": crf,
# "fastdecode": fastdecode,
}
decoding_benchmarks = {
"timestamps_modes": timestamps_modes,
"backends": backends,
}
headers = ["repo_id", "resolution", "num_pixels"]
headers += list(BASE_ENCODING.keys())
headers += [
"timestamps_mode",
"backend",
"video_size_bytes",
"images_size_bytes",
"video_images_size_ratio",
"avg_load_time_video_ms",
"avg_load_time_images_ms",
"video_images_load_time_ratio",
"avg_mse",
"avg_psnr",
"avg_ssim",
]
file_paths = []
for video_codec in tqdm(vcodec, desc="encodings (vcodec)"):
for pixel_format in tqdm(pix_fmt, desc="encodings (pix_fmt)", leave=False):
benchmark_table = []
for repo_id in tqdm(repo_ids, desc="encodings (datasets)", leave=False):
dataset = LeRobotDataset(repo_id)
imgs_dir = output_dir / "images" / dataset.repo_id.replace("/", "_")
# We only use the first episode
save_first_episode(imgs_dir, dataset)
for key, values in tqdm(encoding_benchmarks.items(), desc="encodings (g, crf)", leave=False):
for value in tqdm(values, desc=f"encodings ({key})", leave=False):
encoding_cfg = BASE_ENCODING.copy()
encoding_cfg["vcodec"] = video_codec
encoding_cfg["pix_fmt"] = pixel_format
encoding_cfg[key] = value
args_path = Path("_".join(str(value) for value in encoding_cfg.values()))
video_path = output_dir / "videos" / args_path / f"{repo_id.replace('/', '_')}.mp4"
benchmark_table += benchmark_encoding_decoding(
dataset,
video_path,
imgs_dir,
encoding_cfg,
decoding_benchmarks,
num_samples,
num_workers,
save_frames,
)
# Save intermediate results
benchmark_df = pd.DataFrame(benchmark_table, columns=headers)
now = dt.datetime.now()
csv_path = (
output_dir
/ f"{now:%Y-%m-%d}_{now:%H-%M-%S}_{video_codec}_{pixel_format}_{num_samples}-samples.csv"
)
benchmark_df.to_csv(csv_path, header=True, index=False)
file_paths.append(csv_path)
del benchmark_df
# Concatenate all results
df_list = [pd.read_csv(csv_path) for csv_path in file_paths]
concatenated_df = pd.concat(df_list, ignore_index=True)
concatenated_path = output_dir / f"{now:%Y-%m-%d}_{now:%H-%M-%S}_all_{num_samples}-samples.csv"
concatenated_df.to_csv(concatenated_path, header=True, index=False)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--output-dir",
type=Path,
default=Path("outputs/video_benchmark"),
help="Directory where the video benchmark outputs are written.",
)
parser.add_argument(
"--repo-ids",
type=str,
nargs="*",
default=[
"lerobot/pusht_image",
"aliberts/aloha_mobile_shrimp_image",
"aliberts/paris_street",
"aliberts/kitchen",
],
help="Datasets repo-ids to test against. First episodes only are used. Must be images.",
)
parser.add_argument(
"--vcodec",
type=str,
nargs="*",
default=["libx264", "hevc", "libsvtav1"],
help="Video codecs to be tested",
)
parser.add_argument(
"--pix-fmt",
type=str,
nargs="*",
default=["yuv444p", "yuv420p"],
help="Pixel formats (chroma subsampling) to be tested",
)
parser.add_argument(
"--g",
type=parse_int_or_none,
nargs="*",
default=[1, 2, 3, 4, 5, 6, 10, 15, 20, 40, 100, None],
help="Group of pictures sizes to be tested.",
)
parser.add_argument(
"--crf",
type=parse_int_or_none,
nargs="*",
default=[0, 5, 10, 15, 20, 25, 30, 40, 50, None],
help="Constant rate factors to be tested.",
)
# parser.add_argument(
# "--fastdecode",
# type=int,
# nargs="*",
# default=[0, 1],
# help="Use the fastdecode tuning option. 0 disables it. "
# "For libx264 and libx265/hevc, only 1 is possible. "
# "For libsvtav1, 1, 2 or 3 are possible values with a higher number meaning a faster decoding optimization",
# )
parser.add_argument(
"--timestamps-modes",
type=str,
nargs="*",
default=[
"1_frame",
"2_frames",
"2_frames_4_space",
"6_frames",
],
help="Timestamps scenarios to be tested.",
)
parser.add_argument(
"--backends",
type=str,
nargs="*",
default=["pyav", "video_reader"],
help="Torchvision decoding backend to be tested.",
)
parser.add_argument(
"--num-samples",
type=int,
default=50,
help="Number of samples for each encoding x decoding config.",
)
parser.add_argument(
"--num-workers",
type=int,
default=10,
help="Number of processes for parallelized sample processing.",
)
parser.add_argument(
"--save-frames",
type=int,
default=0,
help="Whether to save decoded frames or not. Enter a non-zero number for true.",
)
args = parser.parse_args()
main(**vars(args))

View File

@@ -1,29 +1,31 @@
# Configure image
ARG PYTHON_VERSION=3.10
FROM python:${PYTHON_VERSION}-slim
# Configure environment variables
ARG PYTHON_VERSION
ENV DEBIAN_FRONTEND=noninteractive
ENV MUJOCO_GL="egl"
ENV PATH="/opt/venv/bin:$PATH"
ARG DEBIAN_FRONTEND=noninteractive
# Install dependencies and set up Python in a single layer
# Install apt dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential cmake git \
libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \
speech-dispatcher libgeos-dev \
&& ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python \
&& python -m venv /opt/venv \
&& apt-get clean && rm -rf /var/lib/apt/lists/* \
&& echo "source /opt/venv/bin/activate" >> /root/.bashrc
build-essential cmake \
libglib2.0-0 libgl1-mesa-glx libegl1-mesa \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
# Clone repository and install LeRobot in a single layer
# Create virtual environment
RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
RUN echo "source /opt/venv/bin/activate" >> /root/.bashrc
# Install LeRobot
COPY . /lerobot
WORKDIR /lerobot
RUN /opt/venv/bin/pip install --upgrade --no-cache-dir pip \
&& /opt/venv/bin/pip install --no-cache-dir ".[test, aloha, xarm, pusht, dynamixel]" \
--extra-index-url https://download.pytorch.org/whl/cpu
RUN pip install --upgrade --no-cache-dir pip
RUN pip install --no-cache-dir ".[test, aloha, xarm, pusht]" \
--extra-index-url https://download.pytorch.org/whl/cpu
# Set EGL as the rendering backend for MuJoCo
ENV MUJOCO_GL="egl"
# Execute in bash shell rather than python
CMD ["/bin/bash"]

View File

@@ -1,4 +1,4 @@
FROM nvidia/cuda:12.2.2-devel-ubuntu22.04
FROM nvidia/cuda:12.4.1-base-ubuntu22.04
# Configure image
ARG PYTHON_VERSION=3.10
@@ -8,42 +8,14 @@ ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential cmake \
git git-lfs openssh-client \
nano vim less util-linux tree \
nano vim less util-linux \
htop atop nvtop \
sed gawk grep curl wget zip unzip \
sed gawk grep curl wget \
tcpdump sysstat screen tmux \
libglib2.0-0 libgl1-mesa-glx libegl1-mesa \
speech-dispatcher portaudio19-dev libgeos-dev \
python${PYTHON_VERSION} python${PYTHON_VERSION}-venv python${PYTHON_VERSION}-dev \
libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \
python${PYTHON_VERSION} python${PYTHON_VERSION}-venv \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
# Install ffmpeg build dependencies. See:
# https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu
# TODO(aliberts): create image to build dependencies from source instead
RUN apt-get update && apt-get install -y --no-install-recommends \
autoconf automake yasm \
libass-dev \
libfreetype6-dev \
libgnutls28-dev \
libunistring-dev \
libmp3lame-dev \
libtool \
libvorbis-dev \
meson \
ninja-build \
pkg-config \
texinfo \
yasm \
zlib1g-dev \
nasm \
libx264-dev \
libx265-dev libnuma-dev \
libvpx-dev \
libfdk-aac-dev \
libopus-dev \
libsvtav1-dev libsvtav1enc-dev libsvtav1dec-dev \
libdav1d-dev
# Install gh cli tool
RUN (type -p wget >/dev/null || (apt update && apt-get install wget -y)) \
&& mkdir -p -m 755 /etc/apt/keyrings \

View File

@@ -1,24 +1,29 @@
FROM nvidia/cuda:12.4.1-base-ubuntu22.04
# Configure environment variables
# Configure image
ARG PYTHON_VERSION=3.10
ENV DEBIAN_FRONTEND=noninteractive
ENV MUJOCO_GL="egl"
ENV PATH="/opt/venv/bin:$PATH"
ARG DEBIAN_FRONTEND=noninteractive
# Install dependencies and set up Python in a single layer
# Install apt dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential cmake git \
libglib2.0-0 libgl1-mesa-glx libegl1-mesa ffmpeg \
speech-dispatcher libgeos-dev \
python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python \
&& python -m venv /opt/venv \
&& apt-get clean && rm -rf /var/lib/apt/lists/* \
&& echo "source /opt/venv/bin/activate" >> /root/.bashrc
build-essential cmake \
libglib2.0-0 libgl1-mesa-glx libegl1-mesa \
python${PYTHON_VERSION} python${PYTHON_VERSION}-venv \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
# Clone repository and install LeRobot in a single layer
# Create virtual environment
RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
RUN echo "source /opt/venv/bin/activate" >> /root/.bashrc
# Install LeRobot
COPY . /lerobot
WORKDIR /lerobot
RUN /opt/venv/bin/pip install --upgrade --no-cache-dir pip \
&& /opt/venv/bin/pip install --no-cache-dir ".[test, aloha, xarm, pusht, dynamixel]"
RUN pip install --upgrade --no-cache-dir pip
RUN pip install --no-cache-dir ".[test, aloha, xarm, pusht]"
# Set EGL as the rendering backend for MuJoCo
ENV MUJOCO_GL="egl"

View File

@@ -1,137 +0,0 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Generating the documentation
To generate the documentation, you first have to build it. Several packages are necessary to build the doc,
you can install them with the following command, at the root of the code repository:
```bash
pip install -e ".[docs]"
```
You will also need `nodejs`. Please refer to their [installation page](https://nodejs.org/en/download)
---
**NOTE**
You only need to generate the documentation to inspect it locally (if you're planning changes and want to
check how they look before committing for instance). You don't have to `git commit` the built documentation.
---
## Building the documentation
Once you have setup the `doc-builder` and additional packages, you can generate the documentation by
typing the following command:
```bash
doc-builder build lerobot docs/source/ --build_dir ~/tmp/test-build
```
You can adapt the `--build_dir` to set any temporary folder that you prefer. This command will create it and generate
the MDX files that will be rendered as the documentation on the main website. You can inspect them in your favorite
Markdown editor.
## Previewing the documentation
To preview the docs, first install the `watchdog` module with:
```bash
pip install watchdog
```
Then run the following command:
```bash
doc-builder preview lerobot docs/source/
```
The docs will be viewable at [http://localhost:3000](http://localhost:3000). You can also preview the docs once you have opened a PR. You will see a bot add a comment to a link where the documentation with your changes lives.
---
**NOTE**
The `preview` command only works with existing doc files. When you add a completely new file, you need to update `_toctree.yml` & restart `preview` command (`ctrl-c` to stop it & call `doc-builder preview ...` again).
---
## Adding a new element to the navigation bar
Accepted files are Markdown (.md).
Create a file with its extension and put it in the source directory. You can then link it to the toc-tree by putting
the filename without the extension in the [`_toctree.yml`](https://github.com/huggingface/lerobot/blob/main/docs/source/_toctree.yml) file.
## Renaming section headers and moving sections
It helps to keep the old links working when renaming the section header and/or moving sections from one document to another. This is because the old links are likely to be used in Issues, Forums, and Social media and it'd make for a much more superior user experience if users reading those months later could still easily navigate to the originally intended information.
Therefore, we simply keep a little map of moved sections at the end of the document where the original section was. The key is to preserve the original anchor.
So if you renamed a section from: "Section A" to "Section B", then you can add at the end of the file:
```
Sections that were moved:
[ <a href="#section-b">Section A</a><a id="section-a"></a> ]
```
and of course, if you moved it to another file, then:
```
Sections that were moved:
[ <a href="../new-file#section-b">Section A</a><a id="section-a"></a> ]
```
Use the relative style to link to the new file so that the versioned docs continue to work.
For an example of a rich moved sections set please see the very end of [the transformers Trainer doc](https://github.com/huggingface/transformers/blob/main/docs/source/en/main_classes/trainer.md).
### Adding a new tutorial
Adding a new tutorial or section is done in two steps:
- Add a new file under `./source`. This file can either be ReStructuredText (.rst) or Markdown (.md).
- Link that file in `./source/_toctree.yml` on the correct toc-tree.
Make sure to put your new file under the proper section. If you have a doubt, feel free to ask in a Github Issue or PR.
### Writing source documentation
Values that should be put in `code` should either be surrounded by backticks: \`like so\`. Note that argument names
and objects like True, None or any strings should usually be put in `code`.
#### Writing a multi-line code block
Multi-line code blocks can be useful for displaying examples. They are done between two lines of three backticks as usual in Markdown:
````
```
# first line of code
# second line
# etc
```
````
#### Adding an image
Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos, and other non-text files. We prefer to leverage a hf.co hosted `dataset` like
the ones hosted on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) in which to place these files and reference
them by URL. We recommend putting them in the following dataset: [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
If an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate your images
to this dataset.

View File

@@ -1,12 +0,0 @@
- sections:
- local: index
title: LeRobot
- local: installation
title: Installation
title: Get started
- sections:
- local: assemble_so101
title: Assemble SO-101
- local: getting_started_real_world_robot
title: Getting Started with Real-World Robots
title: "Tutorials"

View File

@@ -1,348 +0,0 @@
# Assemble SO-101
In the steps below we explain how to assemble our flagship robot, the SO-101.
## Source the parts
Follow this [README](https://github.com/TheRobotStudio/SO-ARM100). It contains the bill of materials, with a link to source the parts, as well as the instructions to 3D print the parts,
and advice if it's your first time printing or if you don't own a 3D printer.
Before assembling, you will first need to configure your motors. To this end, we provide a nice script, so let's first install LeRobot. After configuration, we will also guide you through assembly.
## Install LeRobot
To install LeRobot follow our [Installation Guide](./installation)
## Configure motors
To configure the motors designate one bus servo adapter and 6 motors for your leader arm, and similarly the other bus servo adapter and 6 motors for the follower arm. It's convenient to label them and write on each motor if it's for the follower `F` or for the leader `L` and it's ID from 1 to 6.
You now should plug the 5V or 12V power supply to the motor bus. 5V for the STS3215 7.4V motors and 12V for the STS3215 12V motors. Note that the leader arm always uses the 7.4V motors, so watch out that you plug in the right power supply if you have 12V and 7.4V motors, otherwise you might burn your motors! Now, connect the motor bus to your computer via USB. Note that the USB doesn't provide any power, and both the power supply and USB have to be plugged in.
### Find the USB ports associated to each arm
To find the port for each bus servo adapter, run this script:
```bash
python lerobot/scripts/find_motors_bus_port.py
```
##### Example outputs of script
<hfoptions id="example">
<hfoption id="Mac">
Example output leader arm's port: `/dev/tty.usbmodem575E0031751`
```bash
Finding all available ports for the MotorBus.
['/dev/tty.usbmodem575E0032081', '/dev/tty.usbmodem575E0031751']
Remove the usb cable from your MotorsBus and press Enter when done.
[...Disconnect leader arm and press Enter...]
The port of this MotorsBus is /dev/tty.usbmodem575E0031751
Reconnect the usb cable.
```
Example output follower arm port: `/dev/tty.usbmodem575E0032081`
```
Finding all available ports for the MotorBus.
['/dev/tty.usbmodem575E0032081', '/dev/tty.usbmodem575E0031751']
Remove the usb cable from your MotorsBus and press Enter when done.
[...Disconnect follower arm and press Enter...]
The port of this MotorsBus is /dev/tty.usbmodem575E0032081
Reconnect the usb cable.
```
</hfoption>
<hfoption id="Linux">
On Linux, you might need to give access to the USB ports by running:
```bash
sudo chmod 666 /dev/ttyACM0
sudo chmod 666 /dev/ttyACM1
```
Example output leader arm port: `/dev/ttyACM0`
```bash
Finding all available ports for the MotorBus.
['/dev/ttyACM0', '/dev/ttyACM1']
Remove the usb cable from your MotorsBus and press Enter when done.
[...Disconnect leader arm and press Enter...]
The port of this MotorsBus is /dev/ttyACM0
Reconnect the usb cable.
```
Example output follower arm port: `/dev/ttyACM1`
```
Finding all available ports for the MotorBus.
['/dev/ttyACM0', '/dev/ttyACM1']
Remove the usb cable from your MotorsBus and press Enter when done.
[...Disconnect follower arm and press Enter...]
The port of this MotorsBus is /dev/ttyACM1
Reconnect the usb cable.
```
</hfoption>
</hfoptions>
#### Update config file
Now that you have your ports, update the **port** default values of [`SO101RobotConfig`](https://github.com/huggingface/lerobot/blob/main/lerobot/common/robot_devices/robots/configs.py).
You will find a class called `so101` where you can update the `port` values with your actual motor ports:
```diff
@RobotConfig.register_subclass("so101")
@dataclass
class So101RobotConfig(ManipulatorRobotConfig):
calibration_dir: str = ".cache/calibration/so101"
# `max_relative_target` limits the magnitude of the relative positional target vector for safety purposes.
# Set this to a positive scalar to have the same value for all motors, or a list that is the same length as
# the number of motors in your follower arms.
max_relative_target: int | None = None
leader_arms: dict[str, MotorsBusConfig] = field(
default_factory=lambda: {
"main": FeetechMotorsBusConfig(
- port="/dev/tty.usbmodem58760431091",
+ port="{ADD YOUR LEADER PORT}",
motors={
# name: (index, model)
"shoulder_pan": [1, "sts3215"],
"shoulder_lift": [2, "sts3215"],
"elbow_flex": [3, "sts3215"],
"wrist_flex": [4, "sts3215"],
"wrist_roll": [5, "sts3215"],
"gripper": [6, "sts3215"],
},
),
}
)
follower_arms: dict[str, MotorsBusConfig] = field(
default_factory=lambda: {
"main": FeetechMotorsBusConfig(
- port="/dev/tty.usbmodem585A0076891",
+ port="{ADD YOUR FOLLOWER PORT}",
motors={
# name: (index, model)
"shoulder_pan": [1, "sts3215"],
"shoulder_lift": [2, "sts3215"],
"elbow_flex": [3, "sts3215"],
"wrist_flex": [4, "sts3215"],
"wrist_roll": [5, "sts3215"],
"gripper": [6, "sts3215"],
},
),
}
)
```
Here is a video of the process:
<div class="video-container">
<video controls width="600">
<source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/lerobot-find-motorbus.mp4" type="video/mp4" />
</video>
</div>
## Step-by-Step Assembly Instructions
The follower arm uses 6x STS3215 motors with 1/345 gearing. The leader however uses three differently geared motors to make sure it can both sustain its own weight and it can be moved without requiring much force. Which motor is needed for which joint is shown in table below.
| Leader-Arm Axis | Motor | Gear Ratio |
|-----------------|:-------:|:----------:|
| Base / Shoulder Yaw | 1 | 1 / 191 |
| Shoulder Pitch | 2 | 1 / 345 |
| Elbow | 3 | 1 / 191 |
| Wrist Roll | 4 | 1 / 147 |
| Wrist Pitch | 5 | 1 / 147 |
| Gripper | 6 | 1 / 147 |
### Set motor IDs
Plug your motor in one of the two ports of the motor bus and run this script to set its ID to 1. Replace the text after --port to the corresponding control board port.
```bash
python lerobot/scripts/configure_motor.py \
--port /dev/tty.usbmodem58760432961 \
--brand feetech \
--model sts3215 \
--baudrate 1000000 \
--ID 1
```
Then unplug your motor and plug the second motor and set its ID to 2.
```bash
python lerobot/scripts/configure_motor.py \
--port /dev/tty.usbmodem58760432961 \
--brand feetech \
--model sts3215 \
--baudrate 1000000 \
--ID 2
```
Redo this process for all your motors until ID 6. Do the same for the 6 motors of the leader arm, but make sure to change the power supply if you use motors with different voltage and make sure you give the right ID to the right motor according to the table above.
Here is a video of the process:
<div class="video-container">
<video controls width="600">
<source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/lerobot-configure-motor.mp4" type="video/mp4" />
</video>
</div>
### Clean Parts
Remove all support material from the 3D-printed parts, the easiest way to do this is using a small screwdriver to get underneath the support material.
### Joint 1
- Place the first motor into the base.
- Fasten the motor with 4 M2x6mm screws (smallest screws). Two from the top and two from bottom.
- Slide over the first motor holder and fasten it using two M2x6mm screws (one on each side).
- Install both motor horns, securing the top horn with a M3x6mm screw.
- Attach the shoulder part.
- Tighten the shoulder part with 4 M3x6mm screws on top and 4 M3x6mm screws on the bottom
- Add the shoulder motor holder.
<div class="video-container">
<video controls width="600">
<source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/Joint1_v2.mp4" type="video/mp4" />
</video>
</div>
### Joint 2
- Slide the second motor in from the top.
- Fasten the second motor with 4 M2x6mm screws.
- Attach both motor horns to motor 2, again use the M3x6mm horn screw.
- Attach the upper arm with 4 M3x6mm screws on each side.
<div class="video-container">
<video controls width="600">
<source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/Joint2_v2.mp4" type="video/mp4" />
</video>
</div>
### Joint 3
- Insert motor 3 and fasten using 4 M2x6mm screws
- Attach both motor horns to motor 3 and secure one again with a M3x6mm horn screw.
- Connect the forearm to motor 3 using 4 M3x6mm screws on each side.
<div class="video-container">
<video controls width="600">
<source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/Joint3_v2.mp4" type="video/mp4" />
</video>
</div>
### Joint 4
- Slide over motor holder 4.
- Slide in motor 4.
- Fasten motor 4 with 4 M2x6mm screws and attach its motor horns, use a M3x6mm horn screw.
<div class="video-container">
<video controls width="600">
<source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/Joint4_v2.mp4" type="video/mp4" />
</video>
</div>
### Joint 5
- Insert motor 5 into the wrist holder and secure it with 2 M2x6mm front screws.
- Install only one motor horn on the wrist motor and secure it with a M3x6mm horn screw.
- Secure the wrist to motor 4 using 4 M3x6mm screws on both sides.
<div class="video-container">
<video controls width="600">
<source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/Joint5_v2.mp4" type="video/mp4" />
</video>
</div>
### Gripper / Handle
<hfoptions id="assembly">
<hfoption id="Follower">
- Attach the gripper to motor 5, attach it to the motor horn on the wrist using 4 M3x6mm screws.
- Insert the gripper motor and secure it with 2 M2x6mm screws on each side.
- Attach the motor horns and again use a M3x6mm horn screw.
- Install the gripper claw and secure it with 4 M3x6mm screws on both sides.
<div class="video-container">
<video controls width="600">
<source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/Gripper_v2.mp4" type="video/mp4" />
</video>
</div>
</hfoption>
<hfoption id="Leader">
- Mount the leader holder onto the wrist and secure it with 4 M3x6mm screws.
- Attach the handle to motor 5 using 1 M2x6mm screw.
- Insert the gripper motor, secure it with 2 M2x6mm screws on each side, attach a motor horn using a M3x6mm horn screw.
- Attach the follower trigger with 4 M3x6mm screws.
<div class="video-container">
<video controls width="600">
<source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/Leader_v2.mp4" type="video/mp4" />
</video>
</div>
</hfoption>
</hfoptions>
##### Wiring
- Attach the motor controller on the back.
- Then insert all wires, use the wire guides everywhere to make sure the wires don't unplug themselves and stay in place.
<div class="video-container">
<video controls width="600">
<source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/Wiring_v2.mp4" type="video/mp4" />
</video>
</div>
## Calibrate
Next, you'll need to calibrate your SO-101 robot to ensure that the leader and follower arms have the same position values when they are in the same physical position.
The calibration process is very important because it allows a neural network trained on one SO-101 robot to work on another.
#### Manual calibration of follower arm
You will need to move the follower arm to these positions sequentially, note that the rotated position is on the right side of the robot and you have to open the gripper fully.
| 1. Middle position | 2. Zero position | 3. Rotated position | 4. Rest position |
| ------------ |------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/follower_middle.webp?raw=true" alt="SO-101 leader arm middle position" title="SO-101 leader arm middle position" style="width:100%;"> | <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/follower_zero.webp?raw=true" alt="SO-101 leader arm zero position" title="SO-101 leader arm zero position" style="width:100%;"> | <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/follower_rotated.webp?raw=true" alt="SO-101 leader arm rotated position" title="SO-101 leader arm rotated position" style="width:100%;"> | <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/follower_rest.webp?raw=true" alt="SO-101 leader arm rest position" title="SO-101 leader arm rest position" style="width:100%;"> |
Make sure both arms are connected and run this script to launch manual calibration:
```bash
python lerobot/scripts/control_robot.py \
--robot.type=so101 \
--robot.cameras='{}' \
--control.type=calibrate \
--control.arms='["main_follower"]'
```
#### Manual calibration of leader arm
You will also need to move the leader arm to these positions sequentially:
| 1. Middle position | 2. Zero position | 3. Rotated position | 4. Rest position |
| ------------ |------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/leader_middle.webp?raw=true" alt="SO-101 leader arm middle position" title="SO-101 leader arm middle position" style="width:100%;"> | <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/leader_zero.webp?raw=true" alt="SO-101 leader arm zero position" title="SO-101 leader arm zero position" style="width:100%;"> | <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/leader_rotated.webp?raw=true" alt="SO-101 leader arm rotated position" title="SO-101 leader arm rotated position" style="width:100%;"> | <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/leader_rest.webp?raw=true" alt="SO-101 leader arm rest position" title="SO-101 leader arm rest position" style="width:100%;"> |
Run this script to launch manual calibration:
```bash
python lerobot/scripts/control_robot.py \
--robot.type=so101 \
--robot.cameras='{}' \
--control.type=calibrate \
--control.arms='["main_leader"]'
```
Congrats 🎉, your robot is all set to learn a task on its own. Start training it by following this tutorial: [Getting started with real-world robots](./getting_started_real_world_robot)

View File

@@ -1,370 +0,0 @@
# Getting Started with Real-World Robots
This tutorial will explain you how to train a neural network to autonomously control a real robot.
**You'll learn:**
1. How to record and visualize your dataset.
2. How to train a policy using your data and prepare it for evaluation.
3. How to evaluate your policy and visualize the results.
By following these steps, you'll be able to replicate tasks like picking up a Lego block and placing it in a bin with a high success rate, as demonstrated in [this video](https://x.com/RemiCadene/status/1814680760592572934).
This tutorial is specifically made for the affordable [SO-101](https://github.com/TheRobotStudio/SO-ARM100) robot, but it contains additional information to be easily adapted to various types of robots like [Aloha bimanual robot](https://aloha-2.github.io) by changing some configurations. The SO-101 consists of a leader arm and a follower arm, each with 6 motors. It can work with one or several cameras to record the scene, which serve as visual sensors for the robot.
During the data collection phase, you will control the follower arm by moving the leader arm. This process is known as "teleoperation." This technique is used to collect robot trajectories. Afterward, you'll train a neural network to imitate these trajectories and deploy the network to enable your robot to operate autonomously.
If you encounter any issues at any step of the tutorial, feel free to seek help on [Discord](https://discord.com/invite/s3KuuzsPFb) or don't hesitate to iterate with us on the tutorial by creating issues or pull requests.
## Setup and Calibrate
If you haven't yet setup and calibrate the SO-101 follow these steps:
1. [Find ports and update config file](./assemble_so101#find-the-usb-ports-associated-to-each-arm)
2. [Calibrate](./assemble_so101#calibrate)
## Teleoperate
Run this simple script to teleoperate your robot (it won't connect and display the cameras):
```bash
python lerobot/scripts/control_robot.py \
--robot.type=so101 \
--robot.cameras='{}' \
--control.type=teleoperate
```
The teleoperate command will automatically:
1. Identify any missing calibrations and initiate the calibration procedure.
2. Connect the robot and start teleoperation.
## Setup Cameras
To connect a camera you have three options:
1. OpenCVCamera which allows us to use any camera: usb, realsense, laptop webcam
2. iPhone camera with MacOS
3. Phone camera on Linux
### Use OpenCVCamera
The [`OpenCVCamera`](../lerobot/common/robot_devices/cameras/opencv.py) class allows you to efficiently record frames from most cameras using the [`opencv2`](https://docs.opencv.org) library. For more details on compatibility, see [Video I/O with OpenCV Overview](https://docs.opencv.org/4.x/d0/da7/videoio_overview.html).
To instantiate an [`OpenCVCamera`](../lerobot/common/robot_devices/cameras/opencv.py), you need a camera index (e.g. `OpenCVCamera(camera_index=0)`). When you only have one camera like a webcam of a laptop, the camera index is usually `0` but it might differ, and the camera index might change if you reboot your computer or re-plug your camera. This behavior depends on your operating system.
To find the camera indices, run the following utility script, which will save a few frames from each detected camera:
```bash
python lerobot/common/robot_devices/cameras/opencv.py \
--images-dir outputs/images_from_opencv_cameras
```
The output will look something like this if you have two cameras connected:
```
Mac or Windows detected. Finding available camera indices through scanning all indices from 0 to 60
[...]
Camera found at index 0
Camera found at index 1
[...]
Connecting cameras
OpenCVCamera(0, fps=30.0, width=1920.0, height=1080.0, color_mode=rgb)
OpenCVCamera(1, fps=24.0, width=1920.0, height=1080.0, color_mode=rgb)
Saving images to outputs/images_from_opencv_cameras
Frame: 0000 Latency (ms): 39.52
[...]
Frame: 0046 Latency (ms): 40.07
Images have been saved to outputs/images_from_opencv_cameras
```
Check the saved images in `outputs/images_from_opencv_cameras` to identify which camera index corresponds to which physical camera (e.g. `0` for `camera_00` or `1` for `camera_01`):
```
camera_00_frame_000000.png
[...]
camera_00_frame_000047.png
camera_01_frame_000000.png
[...]
camera_01_frame_000047.png
```
Note: Some cameras may take a few seconds to warm up, and the first frame might be black or green.
Now that you have the camera indexes, you should specify the camera's in the config.
### Use your phone
<hfoptions id="use phone">
<hfoption id="Mac">
To use your iPhone as a camera on macOS, enable the Continuity Camera feature:
- Ensure your Mac is running macOS 13 or later, and your iPhone is on iOS 16 or later.
- Sign in both devices with the same Apple ID.
- Connect your devices with a USB cable or turn on Wi-Fi and Bluetooth for a wireless connection.
For more details, visit [Apple support](https://support.apple.com/en-gb/guide/mac-help/mchl77879b8a/mac).
Your iPhone should be detected automatically when running the camera setup script in the next section.
</hfoption>
<hfoption id="Linux">
If you want to use your phone as a camera on Linux, follow these steps to set up a virtual camera
1. *Install `v4l2loopback-dkms` and `v4l-utils`*. Those packages are required to create virtual camera devices (`v4l2loopback`) and verify their settings with the `v4l2-ctl` utility from `v4l-utils`. Install them using:
```python
sudo apt install v4l2loopback-dkms v4l-utils
```
2. *Install [DroidCam](https://droidcam.app) on your phone*. This app is available for both iOS and Android.
3. *Install [OBS Studio](https://obsproject.com)*. This software will help you manage the camera feed. Install it using [Flatpak](https://flatpak.org):
```python
flatpak install flathub com.obsproject.Studio
```
4. *Install the DroidCam OBS plugin*. This plugin integrates DroidCam with OBS Studio. Install it with:
```python
flatpak install flathub com.obsproject.Studio.Plugin.DroidCam
```
5. *Start OBS Studio*. Launch with:
```python
flatpak run com.obsproject.Studio
```
6. *Add your phone as a source*. Follow the instructions [here](https://droidcam.app/obs/usage). Be sure to set the resolution to `640x480`.
7. *Adjust resolution settings*. In OBS Studio, go to `File > Settings > Video`. Change the `Base(Canvas) Resolution` and the `Output(Scaled) Resolution` to `640x480` by manually typing it in.
8. *Start virtual camera*. In OBS Studio, follow the instructions [here](https://obsproject.com/kb/virtual-camera-guide).
9. *Verify the virtual camera setup*. Use `v4l2-ctl` to list the devices:
```python
v4l2-ctl --list-devices
```
You should see an entry like:
```
VirtualCam (platform:v4l2loopback-000):
/dev/video1
```
10. *Check the camera resolution*. Use `v4l2-ctl` to ensure that the virtual camera output resolution is `640x480`. Change `/dev/video1` to the port of your virtual camera from the output of `v4l2-ctl --list-devices`.
```python
v4l2-ctl -d /dev/video1 --get-fmt-video
```
You should see an entry like:
```
>>> Format Video Capture:
>>> Width/Height : 640/480
>>> Pixel Format : 'YUYV' (YUYV 4:2:2)
```
Troubleshooting: If the resolution is not correct you will have to delete the Virtual Camera port and try again as it cannot be changed.
If everything is set up correctly, you can proceed with the rest of the tutorial.
</hfoption>
</hfoptions>
## Teleoperate with cameras
We can now teleoperate again while at the same time visualizing the cameras and joint positions with `rerun`.
```bash
python lerobot/scripts/control_robot.py \
--robot.type=so101 \
--control.type=teleoperate
--control.display_data=true
```
## Record a dataset
Once you're familiar with teleoperation, you can record your first dataset with SO-101.
We use the Hugging Face hub features for uploading your dataset. If you haven't previously used the Hub, make sure you can login via the cli using a write-access token, this token can be generated from the [Hugging Face settings](https://huggingface.co/settings/tokens).
Add your token to the cli by running this command:
```bash
huggingface-cli login --token ${HUGGINGFACE_TOKEN} --add-to-git-credential
```
Then store your Hugging Face repository name in a variable:
```bash
HF_USER=$(huggingface-cli whoami | head -n 1)
echo $HF_USER
```
Now you can record a dataset, to record 2 episodes and upload your dataset to the hub execute this command:
```bash
python lerobot/scripts/control_robot.py \
--robot.type=so101 \
--control.type=record \
--control.fps=30 \
--control.single_task="Grasp a lego block and put it in the bin." \
--control.repo_id=${HF_USER}/so101_test \
--control.tags='["so101","tutorial"]' \
--control.warmup_time_s=5 \
--control.episode_time_s=30 \
--control.reset_time_s=30 \
--control.num_episodes=2 \
--control.push_to_hub=true
```
You will see a lot of lines appearing like this one:
```
INFO 2024-08-10 15:02:58 ol_robot.py:219 dt:33.34 (30.0hz) dtRlead: 5.06 (197.5hz) dtWfoll: 0.25 (3963.7hz) dtRfoll: 6.22 (160.7hz) dtRlaptop: 32.57 (30.7hz) dtRphone: 33.84 (29.5hz)
```
| Field | Meaning |
|:---|:---|
| `2024-08-10 15:02:58` | Timestamp when `print` was called. |
| `ol_robot.py:219` | Source file and line number of the `print` call (`lerobot/scripts/control_robot.py` at line `219`). |
| `dt: 33.34 (30.0 Hz)` | Delta time (ms) between teleop steps (target: 30.0 Hz, `--fps 30`). Yellow if step is too slow. |
| `dtRlead: 5.06 (197.5 Hz)` | Delta time (ms) for reading present position from the **leader arm**. |
| `dtWfoll: 0.25 (3963.7 Hz)` | Delta time (ms) for writing goal position to the **follower arm** (asynchronous). |
| `dtRfoll: 6.22 (160.7 Hz)` | Delta time (ms) for reading present position from the **follower arm**. |
| `dtRlaptop: 32.57 (30.7 Hz)` | Delta time (ms) for capturing an image from the **laptop camera** (async thread). |
| `dtRphone: 33.84 (29.5 Hz)` | Delta time (ms) for capturing an image from the **phone camera** (async thread). |
#### Dataset upload
Locally your dataset is stored in this folder: `~/.cache/huggingface/lerobot/{repo-id}` (e.g. `data/cadene/so101_test`). At the end of data recording, your dataset will be uploaded on your Hugging Face page (e.g. https://huggingface.co/datasets/cadene/so101_test) that you can obtain by running:
```bash
echo https://huggingface.co/datasets/${HF_USER}/so101_test
```
Your dataset will be automatically tagged with `LeRobot` for the community to find it easily, and you can also add custom tags (in this case `tutorial` for example).
You can look for other LeRobot datasets on the hub by searching for `LeRobot` [tags](https://huggingface.co/datasets?other=LeRobot).
#### Record function
The `record` function provides a suite of tools for capturing and managing data during robot operation:
##### 1. Frame Capture and Video Encoding
- Frames from cameras are saved to disk during recording.
- At the end of each episode, frames are encoded into video files.
##### 2. Data Storage
- Data is stored using the `LeRobotDataset` format.
- By default, the dataset is pushed to your Hugging Face page.
- To disable uploading, use `--control.push_to_hub=false`.
##### 3. Checkpointing and Resuming
- Checkpoints are automatically created during recording.
- If an issue occurs, you can resume by re-running the same command with `--control.resume=true`.
- To start recording from scratch, **manually delete** the dataset directory.
##### 4. Recording Parameters
Set the flow of data recording using command-line arguments:
- `--control.warmup_time_s=10`
Number of seconds before starting data collection (default: **10 seconds**).
Allows devices to warm up and synchronize.
- `--control.episode_time_s=60`
Duration of each data recording episode (default: **60 seconds**).
- `--control.reset_time_s=60`
Duration for resetting the environment after each episode (default: **60 seconds**).
- `--control.num_episodes=50`
Total number of episodes to record (default: **50**).
##### 5. Keyboard Controls During Recording
Control the data recording flow using keyboard shortcuts:
- Press **Right Arrow (`→`)**: Early stop the current episode or reset time and move to the next.
- Press **Left Arrow (`←`)**: Cancel the current episode and re-record it.
- Press **Escape (`ESC`)**: Immediately stop the session, encode videos, and upload the dataset.
#### Tips for gathering data
Once you're comfortable with data recording, you can create a larger dataset for training. A good starting task is grasping an object at different locations and placing it in a bin. We suggest recording at least 50 episodes, with 10 episodes per location. Keep the cameras fixed and maintain consistent grasping behavior throughout the recordings. Also make sure the object you are manipulating is visible on the camera's. A good rule of thumb is you should be able to do the task yourself by only looking at the camera images.
In the following sections, youll train your neural network. After achieving reliable grasping performance, you can start introducing more variations during data collection, such as additional grasp locations, different grasping techniques, and altering camera positions.
Avoid adding too much variation too quickly, as it may hinder your results.
#### Troubleshooting:
- On Linux, if the left and right arrow keys and escape key don't have any effect during data recording, make sure you've set the `$DISPLAY` environment variable. See [pynput limitations](https://pynput.readthedocs.io/en/latest/limitations.html#linux).
## Visualize a dataset
If you uploaded your dataset to the hub with `--control.push_to_hub=true`, you can [visualize your dataset online](https://huggingface.co/spaces/lerobot/visualize_dataset) by copy pasting your repo id given by:
```bash
echo ${HF_USER}/so101_test
```
If you didn't upload with `--control.push_to_hub=false`, you can visualize it locally with (via a window in the browser `http://127.0.0.1:9090` with the visualization tool):
```bash
python lerobot/scripts/visualize_dataset_html.py \
--repo-id ${HF_USER}/so101_test \
--local-files-only 1
```
This will launch a local web server that looks like this:
<div style="text-align:center;">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/visualize_dataset_html.webp?raw=true" alt="Koch v1.1 leader and follower arms" title="Koch v1.1 leader and follower arms" width="100%"></img>
</div>
## Replay an episode
A useful feature is the `replay` function, which allows to replay on your robot any episode that you've recorded or episodes from any dataset out there. This function helps you test the repeatability of your robot's actions and assess transferability across robots of the same model.
You can replay the first episode on your robot with:
```bash
python lerobot/scripts/control_robot.py \
--robot.type=so101 \
--control.type=replay \
--control.fps=30 \
--control.repo_id=${HF_USER}/so101_test \
--control.episode=0
```
Your robot should replicate movements similar to those you recorded. For example, check out [this video](https://x.com/RemiCadene/status/1793654950905680090) where we use `replay` on a Aloha robot from [Trossen Robotics](https://www.trossenrobotics.com).
## Train a policy
To train a policy to control your robot, use the [`python lerobot/scripts/train.py`](../lerobot/scripts/train.py) script. A few arguments are required. Here is an example command:
```bash
python lerobot/scripts/train.py \
--dataset.repo_id=${HF_USER}/so101_test \
--policy.type=act \
--output_dir=outputs/train/act_so101_test \
--job_name=act_so101_test \
--policy.device=cuda \
--wandb.enable=true
```
Let's explain the command:
1. We provided the dataset as argument with `--dataset.repo_id=${HF_USER}/so101_test`.
2. We provided the policy with `policy.type=act`. This loads configurations from [`configuration_act.py`](../lerobot/common/policies/act/configuration_act.py). Importantly, this policy will automatically adapt to the number of motor states, motor actions and cameras of your robot (e.g. `laptop` and `phone`) which have been saved in your dataset.
4. We provided `policy.device=cuda` since we are training on a Nvidia GPU, but you could use `policy.device=mps` to train on Apple silicon.
5. We provided `wandb.enable=true` to use [Weights and Biases](https://docs.wandb.ai/quickstart) for visualizing training plots. This is optional but if you use it, make sure you are logged in by running `wandb login`.
Training should take several hours. You will find checkpoints in `outputs/train/act_so101_test/checkpoints`.
To resume training from a checkpoint, below is an example command to resume from `last` checkpoint of the `act_so101_test` policy:
```bash
python lerobot/scripts/train.py \
--config_path=outputs/train/act_so101_test/checkpoints/last/pretrained_model/train_config.json \
--resume=true
```
#### Upload policy checkpoints
Once training is done, upload the latest checkpoint with:
```bash
huggingface-cli upload ${HF_USER}/act_so101_test \
outputs/train/act_so101_test/checkpoints/last/pretrained_model
```
You can also upload intermediate checkpoints with:
```bash
CKPT=010000
huggingface-cli upload ${HF_USER}/act_so101_test${CKPT} \
outputs/train/act_so101_test/checkpoints/${CKPT}/pretrained_model
```
## Evaluate your policy
You can use the `record` function from [`lerobot/scripts/control_robot.py`](../lerobot/scripts/control_robot.py) but with a policy checkpoint as input. For instance, run this command to record 10 evaluation episodes:
```bash
python lerobot/scripts/control_robot.py \
--robot.type=so101 \
--control.type=record \
--control.fps=30 \
--control.single_task="Grasp a lego block and put it in the bin." \
--control.repo_id=${HF_USER}/eval_act_so101_test \
--control.tags='["tutorial"]' \
--control.warmup_time_s=5 \
--control.episode_time_s=30 \
--control.reset_time_s=30 \
--control.num_episodes=10 \
--control.push_to_hub=true \
--control.policy.path=outputs/train/act_so101_test/checkpoints/last/pretrained_model
```
As you can see, it's almost the same command as previously used to record your training dataset. Two things changed:
1. There is an additional `--control.policy.path` argument which indicates the path to your policy checkpoint with (e.g. `outputs/train/eval_act_so101_test/checkpoints/last/pretrained_model`). You can also use the model repository if you uploaded a model checkpoint to the hub (e.g. `${HF_USER}/act_so101_test`).
2. The name of dataset begins by `eval` to reflect that you are running inference (e.g. `${HF_USER}/eval_act_so101_test`).

View File

@@ -1,19 +0,0 @@
<div class="flex justify-center">
<a target="_blank" href="https://huggingface.co/lerobot">
<img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/lerobot-logo-thumbnail.png" style="width: 100%"></img>
</a>
</div>
# LeRobot
**State-of-the-art machine learning for real-world robotics**
🤗 LeRobot aims to provide models, datasets, and tools for real-world robotics in PyTorch. The goal is to lower the barrier for entry to robotics so that everyone can contribute and benefit from sharing datasets and pretrained models.
🤗 LeRobot contains state-of-the-art approaches that have been shown to transfer to the real-world with a focus on imitation learning and reinforcement learning.
🤗 LeRobot already provides a set of pretrained models, datasets with human collected demonstrations, and simulated environments so that everyone can get started.
🤗 LeRobot hosts pretrained models and datasets on the LeRobot HuggingFace page.
Join the LeRobot community on [Discord](https://discord.gg/s3KuuzsPFb)

View File

@@ -1,84 +0,0 @@
# Installation
## Install LeRobot
Download our source code:
```bash
git clone https://github.com/huggingface/lerobot.git
cd lerobot
```
Create a virtual environment with Python 3.10, using [`Miniconda`](https://docs.anaconda.com/miniconda/install/#quick-command-line-install)
```bash
conda create -y -n lerobot python=3.10
```
Now restart the shell by running:
<hfoptions id="shell_restart">
<hfoption id="Windows">
```bash
source ~/.bashrc
```
</hfoption>
<hfoption id="Mac">
```bash
source ~/.bash_profile
```
</hfoption>
<hfoption id="zshell">
```bash
source ~/.zshrc
```
</hfoption>
</hfoptions>
Then activate your conda environment, you have to do this each time you open a shell to use lerobot:
```bash
conda activate lerobot
```
When using `miniconda`, install `ffmpeg` in your environment:
```bash
conda install ffmpeg -c conda-forge
```
> [!TIP]
> This usually installs `ffmpeg 7.X` for your platform compiled with the `libsvtav1` encoder. If `libsvtav1` is not supported (check supported encoders with `ffmpeg -encoders`), you can:
> - _[On any platform]_ Explicitly install `ffmpeg 7.X` using:
> ```bash
> conda install ffmpeg=7.1.1 -c conda-forge
> ```
> - _[On Linux only]_ Install [ffmpeg build dependencies](https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu#GettheDependencies) and [compile ffmpeg from source with libsvtav1](https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu#libsvtav1), and make sure you use the corresponding ffmpeg binary to your install with `which ffmpeg`.
Install 🤗 LeRobot:
```bash
cd lerobot && pip install ".[feetech]"
```
## Troubleshooting
If you encounter build errors, you may need to install additional dependencies: `cmake`, `build-essential`, and `ffmpeg libs`.
To install these for linux run:
```bash
sudo apt-get install cmake build-essential python-dev pkg-config libavformat-dev libavcodec-dev libavdevice-dev libavutil-dev libswscale-dev libswresample-dev libavfilter-dev pkg-config
```
For other systems, see: [Compiling PyAV](https://pyav.org/docs/develop/overview/installation.html#bring-your-own-ffmpeg)
## Sim
For simulations, 🤗 LeRobot comes with gymnasium environments that can be installed as extras:
- [aloha](https://github.com/huggingface/gym-aloha)
- [xarm](https://github.com/huggingface/gym-xarm)
- [pusht](https://github.com/huggingface/gym-pusht)
For instance, to install 🤗 LeRobot with aloha and pusht, use:
```bash
pip install -e ".[aloha, pusht]"
```
## W&B
To use [Weights and Biases](https://docs.wandb.ai/quickstart) for experiment tracking, log in with
```bash
wandb login
```

View File

@@ -1,136 +1,80 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script demonstrates the use of `LeRobotDataset` class for handling and processing robotic datasets from Hugging Face.
It illustrates how to load datasets, manipulate them, and apply transformations suitable for machine learning tasks in PyTorch.
Features included in this script:
- Viewing a dataset's metadata and exploring its properties.
- Loading an existing dataset from the hub or a subset of it.
- Accessing frames by episode number.
- Loading a dataset and accessing its properties.
- Filtering data by episode number.
- Converting tensor data for visualization.
- Saving video files from dataset frames.
- Using advanced dataset features like timestamp-based frame selection.
- Demonstrating compatibility with PyTorch DataLoader for batch processing.
The script ends with examples of how to batch process data using PyTorch's DataLoader.
"""
from pathlib import Path
from pprint import pprint
import imageio
import torch
from huggingface_hub import HfApi
import lerobot
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
# We ported a number of existing datasets ourselves, use this to see the list:
print("List of available datasets:")
pprint(lerobot.available_datasets)
# You can also browse through the datasets created/ported by the community on the hub using the hub api:
hub_api = HfApi()
repo_ids = [info.id for info in hub_api.list_datasets(task_categories="robotics", tags=["LeRobot"])]
pprint(repo_ids)
# Let's take one for this example
repo_id = "lerobot/pusht"
# Or simply explore them in your web browser directly at:
# https://huggingface.co/datasets?other=LeRobot
# Let's take this one for this example
repo_id = "lerobot/aloha_mobile_cabinet"
# We can have a look and fetch its metadata to know more about it:
ds_meta = LeRobotDatasetMetadata(repo_id)
# By instantiating just this class, you can quickly access useful information about the content and the
# structure of the dataset without downloading the actual data yet (only metadata files — which are
# lightweight).
print(f"Total number of episodes: {ds_meta.total_episodes}")
print(f"Average number of frames per episode: {ds_meta.total_frames / ds_meta.total_episodes:.3f}")
print(f"Frames per second used during data collection: {ds_meta.fps}")
print(f"Robot type: {ds_meta.robot_type}")
print(f"keys to access images from cameras: {ds_meta.camera_keys=}\n")
print("Tasks:")
print(ds_meta.tasks)
print("Features:")
pprint(ds_meta.features)
# You can also get a short summary by simply printing the object:
print(ds_meta)
# You can then load the actual dataset from the hub.
# Either load any subset of episodes:
dataset = LeRobotDataset(repo_id, episodes=[0, 10, 11, 23])
# And see how many frames you have:
print(f"Selected episodes: {dataset.episodes}")
print(f"Number of episodes selected: {dataset.num_episodes}")
print(f"Number of frames selected: {dataset.num_frames}")
# Or simply load the entire dataset:
# You can easily load a dataset from a Hugging Face repository
dataset = LeRobotDataset(repo_id)
print(f"Number of episodes selected: {dataset.num_episodes}")
print(f"Number of frames selected: {dataset.num_frames}")
# The previous metadata class is contained in the 'meta' attribute of the dataset:
print(dataset.meta)
# LeRobotDataset actually wraps an underlying Hugging Face dataset
# (see https://huggingface.co/docs/datasets for more information).
# LeRobotDataset is actually a thin wrapper around an underlying Hugging Face dataset
# (see https://huggingface.co/docs/datasets/index for more information).
print(dataset)
print(dataset.hf_dataset)
# LeRobot datasets also subclasses PyTorch datasets so you can do everything you know and love from working
# with the latter, like iterating through the dataset.
# The __getitem__ iterates over the frames of the dataset. Since our datasets are also structured by
# episodes, you can access the frame indices of any episode using the episode_data_index. Here, we access
# frame indices associated to the first episode:
# And provides additional utilities for robotics and compatibility with Pytorch
print(f"\naverage number of frames per episode: {dataset.num_samples / dataset.num_episodes:.3f}")
print(f"frames per second used during data collection: {dataset.fps=}")
print(f"keys to access images from cameras: {dataset.camera_keys=}\n")
# Access frame indexes associated to first episode
episode_index = 0
from_idx = dataset.episode_data_index["from"][episode_index].item()
to_idx = dataset.episode_data_index["to"][episode_index].item()
# Then we grab all the image frames from the first camera:
camera_key = dataset.meta.camera_keys[0]
frames = [dataset[idx][camera_key] for idx in range(from_idx, to_idx)]
# LeRobot datasets actually subclass PyTorch datasets so you can do everything you know and love from working
# with the latter, like iterating through the dataset. Here we grab all the image frames.
frames = [dataset[idx]["observation.image"] for idx in range(from_idx, to_idx)]
# The objects returned by the dataset are all torch.Tensors
print(type(frames[0]))
print(frames[0].shape)
# Video frames are now float32 in range [0,1] channel first (c,h,w) to follow pytorch convention. To visualize
# them, we convert to uint8 in range [0,255]
frames = [(frame * 255).type(torch.uint8) for frame in frames]
# and to channel last (h,w,c).
frames = [frame.permute((1, 2, 0)).numpy() for frame in frames]
# Since we're using pytorch, the shape is in pytorch, channel-first convention (c, h, w).
# We can compare this shape with the information available for that feature
pprint(dataset.features[camera_key])
# In particular:
print(dataset.features[camera_key]["shape"])
# The shape is in (h, w, c) which is a more universal format.
# Finally, we save the frames to a mp4 video for visualization.
Path("outputs/examples/1_load_lerobot_dataset").mkdir(parents=True, exist_ok=True)
imageio.mimsave("outputs/examples/1_load_lerobot_dataset/episode_0.mp4", frames, fps=dataset.fps)
# For many machine learning applications we need to load the history of past observations or trajectories of
# future actions. Our datasets can load previous and future frames for each key/modality, using timestamps
# differences with the current loaded frame. For instance:
delta_timestamps = {
# loads 4 images: 1 second before current frame, 500 ms before, 200 ms before, and current frame
camera_key: [-1, -0.5, -0.20, 0],
# loads 6 state vectors: 1.5 seconds before, 1 second before, ... 200 ms, 100 ms, and current frame
"observation.state": [-1.5, -1, -0.5, -0.20, -0.10, 0],
"observation.image": [-1, -0.5, -0.20, 0],
# loads 8 state vectors: 1.5 seconds before, 1 second before, ... 20 ms, 10 ms, and current frame
"observation.state": [-1.5, -1, -0.5, -0.20, -0.10, -0.02, -0.01, 0],
# loads 64 action vectors: current frame, 1 frame in the future, 2 frames, ... 63 frames in the future
"action": [t / dataset.fps for t in range(64)],
}
# Note that in any case, these delta_timestamps values need to be multiples of (1/fps) so that added to any
# timestamp, you still get a valid timestamp.
dataset = LeRobotDataset(repo_id, delta_timestamps=delta_timestamps)
print(f"\n{dataset[0][camera_key].shape=}") # (4, c, h, w)
print(f"{dataset[0]['observation.state'].shape=}") # (6, c)
print(f"{dataset[0]['action'].shape=}\n") # (64, c)
print(f"\n{dataset[0]['observation.image'].shape=}") # (4,c,h,w)
print(f"{dataset[0]['observation.state'].shape=}") # (8,c)
print(f"{dataset[0]['action'].shape=}\n") # (64,c)
# Finally, our datasets are fully compatible with PyTorch dataloaders and samplers because they are just
# PyTorch datasets.
@@ -140,9 +84,8 @@ dataloader = torch.utils.data.DataLoader(
batch_size=32,
shuffle=True,
)
for batch in dataloader:
print(f"{batch[camera_key].shape=}") # (32, 4, c, h, w)
print(f"{batch['observation.state'].shape=}") # (32, 6, c)
print(f"{batch['action'].shape=}") # (32, 64, c)
print(f"{batch['observation.image'].shape=}") # (32,4,c,h,w)
print(f"{batch['observation.state'].shape=}") # (32,8,c)
print(f"{batch['action'].shape=}") # (32,64,c)
break

View File

@@ -1,25 +1,6 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script demonstrates how to evaluate a pretrained policy from the HuggingFace Hub or from your local
This scripts demonstrates how to evaluate a pretrained policy from the HuggingFace Hub or from your local
training outputs directory. In the latter case, you might want to run examples/3_train_policy.py first.
It requires the installation of the 'gym_pusht' simulation environment. Install it by running:
```bash
pip install -e ".[pusht]"
```
"""
from pathlib import Path
@@ -29,6 +10,7 @@ import gymnasium as gym
import imageio
import numpy
import torch
from huggingface_hub import snapshot_download
from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy
@@ -36,15 +18,16 @@ from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy
output_directory = Path("outputs/eval/example_pusht_diffusion")
output_directory.mkdir(parents=True, exist_ok=True)
# Select your device
device = "cuda"
device = torch.device("cuda")
# Provide the [hugging face repo id](https://huggingface.co/lerobot/diffusion_pusht):
pretrained_policy_path = "lerobot/diffusion_pusht"
# OR a path to a local outputs/train folder.
# Download the diffusion policy for pusht environment
pretrained_policy_path = Path(snapshot_download("lerobot/diffusion_pusht"))
# OR uncomment the following to evaluate a policy from the local outputs/train folder.
# pretrained_policy_path = Path("outputs/train/example_pusht_diffusion")
policy = DiffusionPolicy.from_pretrained(pretrained_policy_path)
policy.eval()
policy.to(device)
# Initialize evaluation environment to render two observation types:
# an image of the scene and state/position of the agent. The environment
@@ -55,17 +38,7 @@ env = gym.make(
max_episode_steps=300,
)
# We can verify that the shapes of the features expected by the policy match the ones from the observations
# produced by the environment
print(policy.config.input_features)
print(env.observation_space)
# Similarly, we can check that the actions produced by the policy will match the actions expected by the
# environment
print(policy.config.output_features)
print(env.action_space)
# Reset the policy and environments to prepare for rollout
# Reset the policy and environmens to prepare for rollout
policy.reset()
numpy_observation, info = env.reset(seed=42)
@@ -119,7 +92,7 @@ while not done:
rewards.append(reward)
frames.append(env.render())
# The rollout is considered done when the success state is reached (i.e. terminated is True),
# The rollout is considered done when the success state is reach (i.e. terminated is True),
# or the maximum number of iterations is reached (i.e. truncated is True)
done = terminated | truncated | done
step += 1

View File

@@ -1,18 +1,4 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This script demonstrates how to train Diffusion Policy on the PushT environment.
"""This scripts demonstrates how to train Diffusion Policy on the PushT environment.
Once you have trained a model with this script, you can try to evaluate it on
examples/2_evaluate_pretrained_policy.py
@@ -22,99 +8,72 @@ from pathlib import Path
import torch
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
from lerobot.common.datasets.utils import dataset_to_policy_features
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
from lerobot.common.policies.diffusion.configuration_diffusion import DiffusionConfig
from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy
from lerobot.configs.types import FeatureType
# Create a directory to store the training checkpoint.
output_directory = Path("outputs/train/example_pusht_diffusion")
output_directory.mkdir(parents=True, exist_ok=True)
def main():
# Create a directory to store the training checkpoint.
output_directory = Path("outputs/train/example_pusht_diffusion")
output_directory.mkdir(parents=True, exist_ok=True)
# Number of offline training steps (we'll only do offline training for this example.)
# Adjust as you prefer. 5000 steps are needed to get something worth evaluating.
training_steps = 5000
device = torch.device("cuda")
log_freq = 250
# # Select your device
device = torch.device("cuda")
# Set up the dataset.
delta_timestamps = {
# Load the previous image and state at -0.1 seconds before current frame,
# then load current image and state corresponding to 0.0 second.
"observation.image": [-0.1, 0.0],
"observation.state": [-0.1, 0.0],
# Load the previous action (-0.1), the next action to be executed (0.0),
# and 14 future actions with a 0.1 seconds spacing. All these actions will be
# used to supervise the policy.
"action": [-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4],
}
dataset = LeRobotDataset("lerobot/pusht", delta_timestamps=delta_timestamps)
# Number of offline training steps (we'll only do offline training for this example.)
# Adjust as you prefer. 5000 steps are needed to get something worth evaluating.
training_steps = 5000
log_freq = 1
# Set up the the policy.
# Policies are initialized with a configuration class, in this case `DiffusionConfig`.
# For this example, no arguments need to be passed because the defaults are set up for PushT.
# If you're doing something different, you will likely need to change at least some of the defaults.
cfg = DiffusionConfig()
policy = DiffusionPolicy(cfg, dataset_stats=dataset.stats)
policy.train()
policy.to(device)
# When starting from scratch (i.e. not from a pretrained policy), we need to specify 2 things before
# creating the policy:
# - input/output shapes: to properly size the policy
# - dataset stats: for normalization and denormalization of input/outputs
dataset_metadata = LeRobotDatasetMetadata("lerobot/pusht")
features = dataset_to_policy_features(dataset_metadata.features)
output_features = {key: ft for key, ft in features.items() if ft.type is FeatureType.ACTION}
input_features = {key: ft for key, ft in features.items() if key not in output_features}
optimizer = torch.optim.Adam(policy.parameters(), lr=1e-4)
# Policies are initialized with a configuration class, in this case `DiffusionConfig`. For this example,
# we'll just use the defaults and so no arguments other than input/output features need to be passed.
cfg = DiffusionConfig(input_features=input_features, output_features=output_features)
# Create dataloader for offline training.
dataloader = torch.utils.data.DataLoader(
dataset,
num_workers=4,
batch_size=64,
shuffle=True,
pin_memory=device != torch.device("cpu"),
drop_last=True,
)
# We can now instantiate our policy with this config and the dataset stats.
policy = DiffusionPolicy(cfg, dataset_stats=dataset_metadata.stats)
policy.train()
policy.to(device)
# Run training loop.
step = 0
done = False
while not done:
for batch in dataloader:
batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}
output_dict = policy.forward(batch)
loss = output_dict["loss"]
loss.backward()
optimizer.step()
optimizer.zero_grad()
# Another policy-dataset interaction is with the delta_timestamps. Each policy expects a given number frames
# which can differ for inputs, outputs and rewards (if there are some).
delta_timestamps = {
"observation.image": [i / dataset_metadata.fps for i in cfg.observation_delta_indices],
"observation.state": [i / dataset_metadata.fps for i in cfg.observation_delta_indices],
"action": [i / dataset_metadata.fps for i in cfg.action_delta_indices],
}
if step % log_freq == 0:
print(f"step: {step} loss: {loss.item():.3f}")
step += 1
if step >= training_steps:
done = True
break
# In this case with the standard configuration for Diffusion Policy, it is equivalent to this:
delta_timestamps = {
# Load the previous image and state at -0.1 seconds before current frame,
# then load current image and state corresponding to 0.0 second.
"observation.image": [-0.1, 0.0],
"observation.state": [-0.1, 0.0],
# Load the previous action (-0.1), the next action to be executed (0.0),
# and 14 future actions with a 0.1 seconds spacing. All these actions will be
# used to supervise the policy.
"action": [-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4],
}
# We can then instantiate the dataset with these delta_timestamps configuration.
dataset = LeRobotDataset("lerobot/pusht", delta_timestamps=delta_timestamps)
# Then we create our optimizer and dataloader for offline training.
optimizer = torch.optim.Adam(policy.parameters(), lr=1e-4)
dataloader = torch.utils.data.DataLoader(
dataset,
num_workers=4,
batch_size=64,
shuffle=True,
pin_memory=device.type != "cpu",
drop_last=True,
)
# Run training loop.
step = 0
done = False
while not done:
for batch in dataloader:
batch = {k: (v.to(device) if isinstance(v, torch.Tensor) else v) for k, v in batch.items()}
loss, _ = policy.forward(batch)
loss.backward()
optimizer.step()
optimizer.zero_grad()
if step % log_freq == 0:
print(f"step: {step} loss: {loss.item():.3f}")
step += 1
if step >= training_steps:
done = True
break
# Save a policy checkpoint.
policy.save_pretrained(output_directory)
if __name__ == "__main__":
main()
# Save a policy checkpoint.
policy.save_pretrained(output_directory)

View File

@@ -1,274 +1,183 @@
This tutorial will explain the training script, how to use it, and particularly how to configure everything needed for the training run.
> **Note:** The following assumes you're running these commands on a machine equipped with a cuda GPU. If you don't have one (or if you're using a Mac), you can add `--policy.device=cpu` (`--policy.device=mps` respectively). However, be advised that the code executes much slower on cpu.
This tutorial will explain the training script, how to use it, and particularly the use of Hydra to configure everything needed for the training run.
## The training script
LeRobot offers a training script at [`lerobot/scripts/train.py`](../lerobot/scripts/train.py). At a high level it does the following:
LeRobot offers a training script at [`lerobot/scripts/train.py`](../../lerobot/scripts/train.py). At a high level it does the following:
- Initialize/load a configuration for the following steps using.
- Instantiates a dataset.
- (Optional) Instantiates a simulation environment corresponding to that dataset.
- Instantiates a policy.
- Loads a Hydra configuration file for the following steps (more on Hydra in a moment).
- Makes a simulation environment.
- Makes a dataset corresponding to that simulation environment.
- Makes a policy.
- Runs a standard training loop with forward pass, backward pass, optimization step, and occasional logging, evaluation (of the policy on the environment), and checkpointing.
## Overview of the configuration system
## Basics of how we use Hydra
Explaining the ins and outs of [Hydra](https://hydra.cc/docs/intro/) is beyond the scope of this document, but here we'll share the main points you need to know.
First, `lerobot/configs` has a directory structure like this:
```
.
├── default.yaml
├── env
│ ├── aloha.yaml
│ ├── pusht.yaml
│ └── xarm.yaml
└── policy
├── act.yaml
├── diffusion.yaml
└── tdmpc.yaml
```
**_For brevity, in the rest of this document we'll drop the leading `lerobot/configs` path. So `default.yaml` really refers to `lerobot/configs/default.yaml`._**
When you run the training script with
In the training script, the main function `train` expects a `TrainPipelineConfig` object:
```python
# train.py
@parser.wrap()
def train(cfg: TrainPipelineConfig):
python lerobot/scripts/train.py
```
You can inspect the `TrainPipelineConfig` defined in [`lerobot/configs/train.py`](../lerobot/configs/train.py) (which is heavily commented and meant to be a reference to understand any option)
Hydra is set up to read `default.yaml` (via the `@hydra.main` decorator). If you take a look at the `@hydra.main`'s arguments you will see `config_path="../configs", config_name="default"`. At the top of `default.yaml`, is a `defaults` section which looks likes this:
When running the script, inputs for the command line are parsed thanks to the `@parser.wrap()` decorator and an instance of this class is automatically generated. Under the hood, this is done with [Draccus](https://github.com/dlwh/draccus) which is a tool dedicated to this purpose. If you're familiar with Hydra, Draccus can similarly load configurations from config files (.json, .yaml) and also override their values through command line inputs. Unlike Hydra, these configurations are pre-defined in the code through dataclasses rather than being defined entirely in config files. This allows for more rigorous serialization/deserialization, typing, and to manipulate configuration as objects directly in the code and not as dictionaries or namespaces (which enables nice features in an IDE such as autocomplete, jump-to-def, etc.)
Let's have a look at a simplified example. Amongst other attributes, the training config has the following attributes:
```python
@dataclass
class TrainPipelineConfig:
dataset: DatasetConfig
env: envs.EnvConfig | None = None
policy: PreTrainedConfig | None = None
```
in which `DatasetConfig` for example is defined as such:
```python
@dataclass
class DatasetConfig:
repo_id: str
episodes: list[int] | None = None
video_backend: str = "pyav"
```yaml
defaults:
- _self_
- env: pusht
- policy: diffusion
```
This creates a hierarchical relationship where, for example assuming we have a `cfg` instance of `TrainPipelineConfig`, we can access the `repo_id` value with `cfg.dataset.repo_id`.
From the command line, we can specify this value by using a very similar syntax `--dataset.repo_id=repo/id`.
This logic tells Hydra to incorporate configuration parameters from `env/pusht.yaml` and `policy/diffusion.yaml`. _Note: Be aware of the order as any configuration parameters with the same name will be overidden. Thus, `default.yaml` is overriden by `env/pusht.yaml` which is overidden by `policy/diffusion.yaml`_.
By default, every field takes its default value specified in the dataclass. If a field doesn't have a default value, it needs to be specified either from the command line or from a config file which path is also given in the command line (more in this below). In the example above, the `dataset` field doesn't have a default value which means it must be specified.
Then, `default.yaml` also contains common configuration parameters such as `device: cuda` or `use_amp: false` (for enabling fp16 training). Some other parameters are set to `???` which indicates that they are expected to be set in additional yaml files. For instance, `training.offline_steps: ???` in `default.yaml` is set to `200000` in `diffusion.yaml`.
Thanks to this `defaults` section in `default.yaml`, if you want to train Diffusion Policy with PushT, you really only need to run:
## Specifying values from the CLI
```bash
python lerobot/scripts/train.py
```
However, you can be more explicit and launch the exact same Diffusion Policy training on PushT with:
```bash
python lerobot/scripts/train.py policy=diffusion env=pusht
```
This way of overriding defaults via the CLI is especially useful when you want to change the policy and/or environment. For instance, you can train ACT on the default Aloha environment with:
```bash
python lerobot/scripts/train.py policy=act env=aloha
```
There are two things to note here:
- Config overrides are passed as `param_name=param_value`.
- Here we have overridden the defaults section. `policy=act` tells Hydra to use `policy/act.yaml`, and `env=aloha` tells Hydra to use `env/aloha.yaml`.
_As an aside: we've set up all of our configurations so that they reproduce state-of-the-art results from papers in the literature._
## Overriding configuration parameters in the CLI
Now let's say that we want to train on a different task in the Aloha environment. If you look in `env/aloha.yaml` you will see something like:
```yaml
# lerobot/configs/env/aloha.yaml
env:
task: AlohaInsertion-v0
```
And if you look in `policy/act.yaml` you will see something like:
```yaml
# lerobot/configs/policy/act.yaml
dataset_repo_id: lerobot/aloha_sim_insertion_human
```
But our Aloha environment actually supports a cube transfer task as well. To train for this task, you could manually modify the two yaml configuration files respectively.
First, we'd need to switch to using the cube transfer task for the ALOHA environment.
```diff
# lerobot/configs/env/aloha.yaml
env:
- task: AlohaInsertion-v0
+ task: AlohaTransferCube-v0
```
Then, we'd also need to switch to using the cube transfer dataset.
```diff
# lerobot/configs/policy/act.yaml
-dataset_repo_id: lerobot/aloha_sim_insertion_human
+dataset_repo_id: lerobot/aloha_sim_transfer_cube_human
```
Then, you'd be able to run:
```bash
python lerobot/scripts/train.py policy=act env=aloha
```
and you'd be training and evaluating on the cube transfer task.
An alternative approach to editing the yaml configuration files, would be to override the defaults via the command line:
Let's say that we want to train [Diffusion Policy](../lerobot/common/policies/diffusion) on the [pusht](https://huggingface.co/datasets/lerobot/pusht) dataset, using the [gym_pusht](https://github.com/huggingface/gym-pusht) environment for evaluation. The command to do so would look like this:
```bash
python lerobot/scripts/train.py \
--dataset.repo_id=lerobot/pusht \
--policy.type=diffusion \
--env.type=pusht
policy=act \
dataset_repo_id=lerobot/aloha_sim_transfer_cube_human \
env=aloha \
env.task=AlohaTransferCube-v0
```
Let's break this down:
- To specify the dataset, we just need to specify its `repo_id` on the hub which is the only required argument in the `DatasetConfig`. The rest of the fields have default values and in this case we are fine with those so we can just add the option `--dataset.repo_id=lerobot/pusht`.
- To specify the policy, we can just select diffusion policy using `--policy` appended with `.type`. Here, `.type` is a special argument which allows us to select config classes inheriting from `draccus.ChoiceRegistry` and that have been decorated with the `register_subclass()` method. To have a better explanation of this feature, have a look at this [Draccus demo](https://github.com/dlwh/draccus?tab=readme-ov-file#more-flexible-configuration-with-choice-types). In our code, we use this mechanism mainly to select policies, environments, robots, and some other components like optimizers. The policies available to select are located in [lerobot/common/policies](../lerobot/common/policies)
- Similarly, we select the environment with `--env.type=pusht`. The different environment configs are available in [`lerobot/common/envs/configs.py`](../lerobot/common/envs/configs.py)
There's something new here. Notice the `.` delimiter used to traverse the configuration hierarchy. _But be aware that the `defaults` section is an exception. As you saw above, we didn't need to write `defaults.policy=act` in the CLI. `policy=act` was enough._
Putting all that knowledge together, here's the command that was used to train https://huggingface.co/lerobot/act_aloha_sim_transfer_cube_human.
Let's see another example. Let's say you've been training [ACT](../lerobot/common/policies/act) on [lerobot/aloha_sim_insertion_human](https://huggingface.co/datasets/lerobot/aloha_sim_insertion_human) using the [gym-aloha](https://github.com/huggingface/gym-aloha) environment for evaluation with:
```bash
python lerobot/scripts/train.py \
--policy.type=act \
--dataset.repo_id=lerobot/aloha_sim_insertion_human \
--env.type=aloha \
--output_dir=outputs/train/act_aloha_insertion
hydra.run.dir=outputs/train/act_aloha_sim_transfer_cube_human \
device=cuda
env=aloha \
env.task=AlohaTransferCube-v0 \
dataset_repo_id=lerobot/aloha_sim_transfer_cube_human \
policy=act \
training.eval_freq=10000 \
training.log_freq=250 \
training.offline_steps=100000 \
training.save_model=true \
training.save_freq=25000 \
eval.n_episodes=50 \
eval.batch_size=50 \
wandb.enable=false \
```
> Notice we added `--output_dir` to explicitly tell where to write outputs from this run (checkpoints, training state, configs etc.). This is not mandatory and if you don't specify it, a default directory will be created from the current date and time, env.type and policy.type. This will typically look like `outputs/train/2025-01-24/16-10-05_aloha_act`.
We now want to train a different policy for aloha on another task. We'll change the dataset and use [lerobot/aloha_sim_transfer_cube_human](https://huggingface.co/datasets/lerobot/aloha_sim_transfer_cube_human) instead. Of course, we also need to change the task of the environment as well to match this other task.
Looking at the [`AlohaEnv`](../lerobot/common/envs/configs.py) config, the task is `"AlohaInsertion-v0"` by default, which corresponds to the task we trained on in the command above. The [gym-aloha](https://github.com/huggingface/gym-aloha?tab=readme-ov-file#description) environment also has the `AlohaTransferCube-v0` task which corresponds to this other task we want to train on. Putting this together, we can train this new policy on this different task using:
There's one new thing here: `hydra.run.dir=outputs/train/act_aloha_sim_transfer_cube_human`, which specifies where to save the training output.
## Using a configuration file not in `lerobot/configs`
Above we discusses the our training script is set up such that Hydra looks for `default.yaml` in `lerobot/configs`. But, if you have a configuration file elsewhere in your filesystem you may use:
```bash
python lerobot/scripts/train.py \
--policy.type=act \
--dataset.repo_id=lerobot/aloha_sim_transfer_cube_human \
--env.type=aloha \
--env.task=AlohaTransferCube-v0 \
--output_dir=outputs/train/act_aloha_transfer
python lerobot/scripts/train.py --config-dir PARENT/PATH --config-name FILE_NAME_WITHOUT_EXTENSION
```
## Loading from a config file
Note: here we use regular syntax for providing CLI arguments to a Python script, not Hydra's `param_name=param_value` syntax.
Now, let's assume that we want to reproduce the run just above. That run has produced a `train_config.json` file in its checkpoints, which serializes the `TrainPipelineConfig` instance it used:
```json
{
"dataset": {
"repo_id": "lerobot/aloha_sim_transfer_cube_human",
"episodes": null,
...
},
"env": {
"type": "aloha",
"task": "AlohaTransferCube-v0",
"fps": 50,
...
},
"policy": {
"type": "act",
"n_obs_steps": 1,
...
},
...
}
```
As a concrete example, this becomes particularly handy when you have a folder with training outputs, and would like to re-run the training. For example, say you previously ran the training script with one of the earlier commands and have `outputs/train/my_experiment/checkpoints/pretrained_model/config.yaml`. This `config.yaml` file will have the full set of configuration parameters within it. To run the training with the same configuration again, do:
We can then simply load the config values from this file using:
```bash
python lerobot/scripts/train.py \
--config_path=outputs/train/act_aloha_transfer/checkpoints/last/pretrained_model/ \
--output_dir=outputs/train/act_aloha_transfer_2
```
`--config_path` is also a special argument which allows to initialize the config from a local config file. It can point to a directory that contains `train_config.json` or to the config file itself directly.
Similarly to Hydra, we can still override some parameters in the CLI if we want to, e.g.:
```bash
python lerobot/scripts/train.py \
--config_path=outputs/train/act_aloha_transfer/checkpoints/last/pretrained_model/ \
--output_dir=outputs/train/act_aloha_transfer_2
--policy.n_action_steps=80
```
> Note: While `--output_dir` is not required in general, in this case we need to specify it since it will otherwise take the value from the `train_config.json` (which is `outputs/train/act_aloha_transfer`). In order to prevent accidental deletion of previous run checkpoints, we raise an error if you're trying to write in an existing directory. This is not the case when resuming a run, which is what you'll learn next.
`--config_path` can also accept the repo_id of a repo on the hub that contains a `train_config.json` file, e.g. running:
```bash
python lerobot/scripts/train.py --config_path=lerobot/diffusion_pusht
```
will start a training run with the same configuration used for training [lerobot/diffusion_pusht](https://huggingface.co/lerobot/diffusion_pusht)
## Resume training
Being able to resume a training run is important in case it crashed or aborted for any reason. We'll demonstrate how to do that here.
Let's reuse the command from the previous run and add a few more options:
```bash
python lerobot/scripts/train.py \
--policy.type=act \
--dataset.repo_id=lerobot/aloha_sim_transfer_cube_human \
--env.type=aloha \
--env.task=AlohaTransferCube-v0 \
--log_freq=25 \
--save_freq=100 \
--output_dir=outputs/train/run_resumption
python lerobot/scripts/train.py --config-dir outputs/train/my_experiment/checkpoints/last/pretrained_model --config-name config
```
Here we've taken care to set up the log frequency and checkpointing frequency to low numbers so we can showcase resumption. You should be able to see some logging and have a first checkpoint within 1 minute (depending on hardware). Wait for the first checkpoint to happen, you should see a line that looks like this in your terminal:
```
INFO 2025-01-24 16:10:56 ts/train.py:263 Checkpoint policy after step 100
```
Now let's simulate a crash by killing the process (hit `ctrl`+`c`). We can then simply resume this run from the last checkpoint available with:
```bash
python lerobot/scripts/train.py \
--config_path=outputs/train/run_resumption/checkpoints/last/pretrained_model/ \
--resume=true
```
You should see from the logging that your training picks up from where it left off.
Another reason for which you might want to resume a run is simply to extend training and add more training steps. The number of training steps is set by the option `--steps`, which is 100 000 by default.
You could double the number of steps of the previous run with:
```bash
python lerobot/scripts/train.py \
--config_path=outputs/train/run_resumption/checkpoints/last/pretrained_model/ \
--resume=true \
--steps=200000
```
## Outputs of a run
In the output directory, there will be a folder called `checkpoints` with the following structure:
```bash
outputs/train/run_resumption/checkpoints
├── 000100 # checkpoint_dir for training step 100
│ ├── pretrained_model/
│ │ ├── config.json # policy config
│ │ ├── model.safetensors # policy weights
│ │ └── train_config.json # train config
│ └── training_state/
│ ├── optimizer_param_groups.json # optimizer param groups
│ ├── optimizer_state.safetensors # optimizer state
│ ├── rng_state.safetensors # rng states
│ ├── scheduler_state.json # scheduler state
│ └── training_step.json # training step
├── 000200
└── last -> 000200 # symlink to the last available checkpoint
```
## Fine-tuning a pre-trained policy
In addition to the features currently in Draccus, we've added a special `.path` argument for the policy, which allows to load a policy as you would with `PreTrainedPolicy.from_pretrained()`. In that case, `path` can be a local directory that contains a checkpoint or a repo_id pointing to a pretrained policy on the hub.
For example, we could fine-tune a [policy pre-trained on the aloha transfer task](https://huggingface.co/lerobot/act_aloha_sim_transfer_cube_human) on the aloha insertion task. We can achieve this with:
```bash
python lerobot/scripts/train.py \
--policy.path=lerobot/act_aloha_sim_transfer_cube_human \
--dataset.repo_id=lerobot/aloha_sim_insertion_human \
--env.type=aloha \
--env.task=AlohaInsertion-v0
```
When doing so, keep in mind that the features of the fine-tuning dataset would have to match the input/output features of the pretrained policy.
## Typical logs and metrics
When you start the training process, you will first see your full configuration being printed in the terminal. You can check it to make sure that you configured your run correctly. The final configuration will also be saved with the checkpoint.
After that, you will see training log like this one:
```
INFO 2024-08-14 13:35:12 ts/train.py:192 step:0 smpl:64 ep:1 epch:0.00 loss:1.112 grdn:15.387 lr:2.0e-07 updt_s:1.738 data_s:4.774
```
or evaluation log:
```
INFO 2024-08-14 13:38:45 ts/train.py:226 step:100 smpl:6K ep:52 epch:0.25 ∑rwrd:20.693 success:0.0% eval_s:120.266
```
These logs will also be saved in wandb if `wandb.enable` is set to `true`. Here are the meaning of some abbreviations:
- `smpl`: number of samples seen during training.
- `ep`: number of episodes seen during training. An episode contains multiple samples in a complete manipulation task.
- `epch`: number of time all unique samples are seen (epoch).
- `grdn`: gradient norm.
- `∑rwrd`: compute the sum of rewards in every evaluation episode and then take an average of them.
- `success`: average success rate of eval episodes. Reward and success are usually different except for the sparsing reward setting, where reward=1 only when the task is completed successfully.
- `eval_s`: time to evaluate the policy in the environment, in second.
- `updt_s`: time to update the network parameters, in second.
- `data_s`: time to load a batch of data, in second.
Some metrics are useful for initial performance profiling. For example, if you find the current GPU utilization is low via the `nvidia-smi` command and `data_s` sometimes is too high, you may need to modify batch size or number of dataloading workers to accelerate dataloading. We also recommend [pytorch profiler](https://github.com/huggingface/lerobot?tab=readme-ov-file#improve-your-code-with-profiling) for detailed performance probing.
## In short
We'll summarize here the main use cases to remember from this tutorial.
#### Train a policy from scratch CLI
```bash
python lerobot/scripts/train.py \
--policy.type=act \ # <- select 'act' policy
--env.type=pusht \ # <- select 'pusht' environment
--dataset.repo_id=lerobot/pusht # <- train on this dataset
```
#### Train a policy from scratch - config file + CLI
```bash
python lerobot/scripts/train.py \
--config_path=path/to/pretrained_model \ # <- can also be a repo_id
--policy.n_action_steps=80 # <- you may still override values
```
#### Resume/continue a training run
```bash
python lerobot/scripts/train.py \
--config_path=checkpoint/pretrained_model/ \
--resume=true \
--steps=200000 # <- you can change some training parameters
```
#### Fine-tuning
```bash
python lerobot/scripts/train.py \
--policy.path=lerobot/act_aloha_sim_transfer_cube_human \ # <- can also be a local path to a checkpoint
--dataset.repo_id=lerobot/aloha_sim_insertion_human \
--env.type=aloha \
--env.task=AlohaInsertion-v0
```
Note that you may still use the regular syntax for config parameter overrides (eg: by adding `training.offline_steps=200000`).
---
Now that you know the basics of how to train a policy, you might want to know how to apply this knowledge to actual robots, or how to record your own datasets and train policies on your specific task?
If that's the case, head over to the next tutorial [`7_get_started_with_real_robot.md`](./7_get_started_with_real_robot.md).
So far we've seen how to train Diffusion Policy for PushT and ACT for ALOHA. Now, what if we want to train ACT for PushT? Well, there are aspects of the ACT configuration that are specific to the ALOHA environments, and these happen to be incompatible with PushT. Therefore, trying to run the following will almost certainly raise an exception of sorts (eg: feature dimension mismatch):
Or in the meantime, happy training! 🤗
```bash
python lerobot/scripts/train.py policy=act env=pusht dataset_repo_id=lerobot/pusht
```
Please, head on over to our [advanced tutorial on adapting policy configuration to various environments](./advanced/train_act_pusht/train_act_pusht.md) to learn more.
Or in the meantime, happy coding! 🤗

View File

@@ -1,998 +0,0 @@
# Getting Started with Real-World Robots
This tutorial will guide you through the process of setting up and training a neural network to autonomously control a real robot.
**What You'll Learn:**
1. How to order and assemble your robot.
2. How to connect, configure, and calibrate your robot.
3. How to record and visualize your dataset.
4. How to train a policy using your data and prepare it for evaluation.
5. How to evaluate your policy and visualize the results.
By following these steps, you'll be able to replicate tasks like picking up a Lego block and placing it in a bin with a high success rate, as demonstrated in [this video](https://x.com/RemiCadene/status/1814680760592572934).
This tutorial is specifically made for the affordable [Koch v1.1](https://github.com/jess-moss/koch-v1-1) robot, but it contains additional information to be easily adapted to various types of robots like [Aloha bimanual robot](https://aloha-2.github.io) by changing some configurations. The Koch v1.1 consists of a leader arm and a follower arm, each with 6 motors. It can work with one or several cameras to record the scene, which serve as visual sensors for the robot.
During the data collection phase, you will control the follower arm by moving the leader arm. This process is known as "teleoperation." This technique is used to collect robot trajectories. Afterward, you'll train a neural network to imitate these trajectories and deploy the network to enable your robot to operate autonomously.
If you encounter any issues at any step of the tutorial, feel free to seek help on [Discord](https://discord.com/invite/s3KuuzsPFb) or don't hesitate to iterate with us on the tutorial by creating issues or pull requests. Thanks!
## 1. Order and Assemble your Koch v1.1
Follow the sourcing and assembling instructions provided on the [Koch v1.1 Github page](https://github.com/jess-moss/koch-v1-1). This will guide you through setting up both the follower and leader arms, as shown in the image below.
<div style="text-align:center;">
<img src="../media/tutorial/koch_v1_1_leader_follower.webp?raw=true" alt="Koch v1.1 leader and follower arms" title="Koch v1.1 leader and follower arms" width="50%">
</div>
For a visual walkthrough of the assembly process, you can refer to [this video tutorial](https://youtu.be/8nQIg9BwwTk).
## 2. Configure motors, calibrate arms, teleoperate your Koch v1.1
First, install the additional dependencies required for robots built with dynamixel motors like Koch v1.1 by running one of the following commands (make sure gcc is installed).
Using `pip`:
```bash
pip install -e ".[dynamixel]"
```
Using `poetry`:
```bash
poetry sync --extras "dynamixel"
```
Using `uv`:
```bash
uv sync --extra "dynamixel"
```
You are now ready to plug the 5V power supply to the motor bus of the leader arm (the smaller one) since all its motors only require 5V.
Then plug the 12V power supply to the motor bus of the follower arm. It has two motors that need 12V, and the rest will be powered with 5V through the voltage convertor.
Finally, connect both arms to your computer via USB. Note that the USB doesn't provide any power, and both arms need to be plugged in with their associated power supply to be detected by your computer.
Now you are ready to configure your motors for the first time, as detailed in the sections below. In the upcoming sections, you'll learn about our classes and functions by running some python code in an interactive session, or by copy-pasting it in a python file.
If you have already configured your motors the first time, you can streamline the process by directly running the teleoperate script (which is detailed further in the tutorial):
> **NOTE:** To visualize the data, enable `--control.display_data=true`. This streams the data using `rerun`.
```bash
python lerobot/scripts/control_robot.py \
--robot.type=koch \
--control.type=teleoperate
```
It will automatically:
1. Identify any missing calibrations and initiate the calibration procedure.
2. Connect the robot and start teleoperation.
### a. Control your motors with DynamixelMotorsBus
You can use the [`DynamixelMotorsBus`](../lerobot/common/robot_devices/motors/dynamixel.py) to communicate with the motors connected as a chain to the corresponding USB bus. This class leverages the Python [Dynamixel SDK](https://emanual.robotis.com/docs/en/software/dynamixel/dynamixel_sdk/sample_code/python_read_write_protocol_2_0/#python-read-write-protocol-20) to facilitate reading from and writing to the motors.
**First Configuration of your motors**
You will need to unplug each motor in turn and run a command the identify the motor. The motor will save its own identification, so you only need to do this once. Start by unplugging all of the motors.
Do the Leader arm first, as all of its motors are of the same type. Plug in your first motor on your leader arm and run this script to set its ID to 1.
```bash
python lerobot/scripts/configure_motor.py \
--port /dev/tty.usbmodem58760432961 \
--brand dynamixel \
--model xl330-m288 \
--baudrate 1000000 \
--id 1
```
Then unplug your first motor and plug the second motor and set its ID to 2.
```bash
python lerobot/scripts/configure_motor.py \
--port /dev/tty.usbmodem58760432961 \
--brand dynamixel \
--model xl330-m288 \
--baudrate 1000000 \
--id 2
```
Redo the process for all your motors until ID 6.
The process for the follower arm is almost the same, but the follower arm has two types of motors. For the first two motors, make sure you set the model to `xl430-w250`. _Important: configuring follower motors requires plugging and unplugging power. Make sure you use the 5V power for the XL330s and the 12V power for the XL430s!_
After all of your motors are configured properly, you're ready to plug them all together in a daisy-chain as shown in the original video.
**Instantiate the DynamixelMotorsBus**
To begin, create two instances of the [`DynamixelMotorsBus`](../lerobot/common/robot_devices/motors/dynamixel.py), one for each arm, using their corresponding USB ports (e.g. `DynamixelMotorsBus(port="/dev/tty.usbmodem575E0031751"`).
To find the correct ports for each arm, run the utility script twice:
```bash
python lerobot/scripts/find_motors_bus_port.py
```
Example output when identifying the leader arm's port (e.g., `/dev/tty.usbmodem575E0031751` on Mac, or possibly `/dev/ttyACM0` on Linux):
```
Finding all available ports for the MotorBus.
['/dev/tty.usbmodem575E0032081', '/dev/tty.usbmodem575E0031751']
Remove the usb cable from your DynamixelMotorsBus and press Enter when done.
[...Disconnect leader arm and press Enter...]
The port of this DynamixelMotorsBus is /dev/tty.usbmodem575E0031751
Reconnect the usb cable.
```
Example output when identifying the follower arm's port (e.g., `/dev/tty.usbmodem575E0032081`, or possibly `/dev/ttyACM1` on Linux):
```
Finding all available ports for the MotorBus.
['/dev/tty.usbmodem575E0032081', '/dev/tty.usbmodem575E0031751']
Remove the usb cable from your DynamixelMotorsBus and press Enter when done.
[...Disconnect follower arm and press Enter...]
The port of this DynamixelMotorsBus is /dev/tty.usbmodem575E0032081
Reconnect the usb cable.
```
Troubleshooting: On Linux, you might need to give access to the USB ports by running this command with your ports:
```bash
sudo chmod 666 /dev/tty.usbmodem575E0032081
sudo chmod 666 /dev/tty.usbmodem575E0031751
```
*Listing and Configuring Motors*
Next, you'll need to list the motors for each arm, including their name, index, and model. Initially, each motor is assigned the factory default index `1`. Since each motor requires a unique index to function correctly when connected in a chain on a common bus, you'll need to assign different indices. It's recommended to use an ascending index order, starting from `1` (e.g., `1, 2, 3, 4, 5, 6`). These indices will be saved in the persistent memory of each motor during the first connection.
To assign indices to the motors, run this code in an interactive Python session. Replace the `port` values with the ones you identified earlier:
```python
from lerobot.common.robot_devices.motors.configs import DynamixelMotorsBusConfig
from lerobot.common.robot_devices.motors.dynamixel import DynamixelMotorsBus
leader_config = DynamixelMotorsBusConfig(
port="/dev/tty.usbmodem575E0031751",
motors={
# name: (index, model)
"shoulder_pan": (1, "xl330-m077"),
"shoulder_lift": (2, "xl330-m077"),
"elbow_flex": (3, "xl330-m077"),
"wrist_flex": (4, "xl330-m077"),
"wrist_roll": (5, "xl330-m077"),
"gripper": (6, "xl330-m077"),
},
)
follower_config = DynamixelMotorsBusConfig(
port="/dev/tty.usbmodem575E0032081",
motors={
# name: (index, model)
"shoulder_pan": (1, "xl430-w250"),
"shoulder_lift": (2, "xl430-w250"),
"elbow_flex": (3, "xl330-m288"),
"wrist_flex": (4, "xl330-m288"),
"wrist_roll": (5, "xl330-m288"),
"gripper": (6, "xl330-m288"),
},
)
leader_arm = DynamixelMotorsBus(leader_config)
follower_arm = DynamixelMotorsBus(follower_config)
```
IMPORTANTLY: Now that you have your ports, update [`KochRobotConfig`](../lerobot/common/robot_devices/robots/configs.py). You will find something like:
```python
@RobotConfig.register_subclass("koch")
@dataclass
class KochRobotConfig(ManipulatorRobotConfig):
calibration_dir: str = ".cache/calibration/koch"
# `max_relative_target` limits the magnitude of the relative positional target vector for safety purposes.
# Set this to a positive scalar to have the same value for all motors, or a list that is the same length as
# the number of motors in your follower arms.
max_relative_target: int | None = None
leader_arms: dict[str, MotorsBusConfig] = field(
default_factory=lambda: {
"main": DynamixelMotorsBusConfig(
port="/dev/tty.usbmodem585A0085511", <-- UPDATE HERE
motors={
# name: (index, model)
"shoulder_pan": [1, "xl330-m077"],
"shoulder_lift": [2, "xl330-m077"],
"elbow_flex": [3, "xl330-m077"],
"wrist_flex": [4, "xl330-m077"],
"wrist_roll": [5, "xl330-m077"],
"gripper": [6, "xl330-m077"],
},
),
}
)
follower_arms: dict[str, MotorsBusConfig] = field(
default_factory=lambda: {
"main": DynamixelMotorsBusConfig(
port="/dev/tty.usbmodem585A0076891", <-- UPDATE HERE
motors={
# name: (index, model)
"shoulder_pan": [1, "xl430-w250"],
"shoulder_lift": [2, "xl430-w250"],
"elbow_flex": [3, "xl330-m288"],
"wrist_flex": [4, "xl330-m288"],
"wrist_roll": [5, "xl330-m288"],
"gripper": [6, "xl330-m288"],
},
),
}
)
```
**Connect and Configure your Motors**
Before you can start using your motors, you'll need to configure them to ensure proper communication. When you first connect the motors, the [`DynamixelMotorsBus`](../lerobot/common/robot_devices/motors/dynamixel.py) automatically detects any mismatch between the current motor indices (factory set to `1`) and the specified indices (e.g., `1, 2, 3, 4, 5, 6`). This triggers a configuration procedure that requires you to unplug the power cord and motors, then reconnect each motor sequentially, starting from the one closest to the bus.
For a visual guide, refer to the [video tutorial of the configuration procedure](https://youtu.be/U78QQ9wCdpY).
To connect and configure the leader arm, run the following code in the same Python interactive session as earlier in the tutorial:
```python
leader_arm.connect()
```
When you connect the leader arm for the first time, you might see an output similar to this:
```
Read failed due to communication error on port /dev/tty.usbmodem575E0032081 for group_key ID_shoulder_pan_shoulder_lift_elbow_flex_wrist_flex_wrist_roll_gripper: [TxRxResult] There is no status packet!
/!\ A configuration issue has been detected with your motors:
If this is the first time you are using these motors, press enter to configure your motors... but before verify that all the cables are connected the proper way. If you find an issue, before making a modification, kill the python process, unplug the power cord to not damage the motors, rewire correctly, then plug the power again and relaunch the script.
Motor indices detected: {9600: [1]}
1. Unplug the power cord
2. Plug/unplug minimal number of cables to only have the first 1 motor(s) (['shoulder_pan']) connected.
3. Re-plug the power cord
Press Enter to continue...
*Follow the procedure*
Setting expected motor indices: [1, 2, 3, 4, 5, 6]
```
Once the leader arm is configured, repeat the process for the follower arm by running:
```python
follower_arm.connect()
```
Congratulations! Both arms are now properly configured and connected. You won't need to go through the configuration procedure again in the future.
**Troubleshooting**:
If the configuration process fails, you may need to do the configuration process via the Dynamixel Wizard.
Known failure modes:
- Calling `arm.connect()` raises `OSError: No motor found, but one new motor expected. Verify power cord is plugged in and retry` on Ubuntu 22.
Steps:
1. Visit https://emanual.robotis.com/docs/en/software/dynamixel/dynamixel_wizard2/#connect-dynamixel.
2. Follow the software installation instructions in section 3 of the web page.
3. Launch the software.
4. Configure the device scanning options in the menu under `Tools` > `Options` > `Scan`. Check only Protocol 2.0, select only the USB port identifier of interest, select all baudrates, set the ID range to `[0, 10]`. _While this step was not strictly necessary, it greatly speeds up scanning_.
5. For each motor in turn:
- Disconnect the power to the driver board.
- Connect **only** the motor of interest to the driver board, making sure to disconnect it from any other motors.
- Reconnect the power to the driver board.
- From the software menu select `Device` > `Scan` and let the scan run. A device should appear.
- If the device has an asterisk (*) near it, it means the firmware is indeed outdated. From the software menu, select `Tools` > `Firmware Update`. Follow the prompts.
- The main panel should have table with various parameters of the device (refer to the web page, section 5). Select the row with `ID`, and then set the desired ID on the bottom right panel by selecting and clicking `Save`.
- Just like you did with the ID, also set the `Baud Rate` to 1 Mbps.
6. Check everything has been done right:
- Rewire the arms in their final configuration and power both of them.
- Scan for devices. All 12 motors should appear.
- Select the motors one by one and move the arm. Check that the graphical indicator near the top right shows the movement.
** There is a common issue with the Dynamixel XL430-W250 motors where the motors become undiscoverable after upgrading their firmware from Mac and Windows Dynamixel Wizard2 applications. When this occurs, it is required to do a firmware recovery (Select `DYNAMIXEL Firmware Recovery` and follow the prompts). There are two known workarounds to conduct this firmware reset:
1) Install the Dynamixel Wizard on a linux machine and complete the firmware recovery
2) Use the Dynamixel U2D2 in order to perform the reset with Windows or Mac. This U2D2 can be purchased [here](https://www.robotis.us/u2d2/).
For either solution, open DYNAMIXEL Wizard 2.0 and select the appropriate port. You will likely be unable to see the motor in the GUI at this time. Select `Firmware Recovery`, carefully choose the correct model, and wait for the process to complete. Finally, re-scan to confirm the firmware recovery was successful.
**Read and Write with DynamixelMotorsBus**
To get familiar with how `DynamixelMotorsBus` communicates with the motors, you can start by reading data from them. Copy past this code in the same interactive python session:
```python
leader_pos = leader_arm.read("Present_Position")
follower_pos = follower_arm.read("Present_Position")
print(leader_pos)
print(follower_pos)
```
Expected output might look like:
```
array([2054, 523, 3071, 1831, 3049, 2441], dtype=int32)
array([2003, 1601, 56, 2152, 3101, 2283], dtype=int32)
```
Try moving the arms to various positions and observe how the values change.
Now let's try to enable torque in the follower arm by copy pasting this code:
```python
from lerobot.common.robot_devices.motors.dynamixel import TorqueMode
follower_arm.write("Torque_Enable", TorqueMode.ENABLED.value)
```
With torque enabled, the follower arm will be locked in its current position. Do not attempt to manually move the arm while torque is enabled, as this could damage the motors.
Now, to get more familiar with reading and writing, let's move the arm programmatically copy pasting the following example code:
```python
# Get the current position
position = follower_arm.read("Present_Position")
# Update first motor (shoulder_pan) position by +10 steps
position[0] += 10
follower_arm.write("Goal_Position", position)
# Update all motors position by -30 steps
position -= 30
follower_arm.write("Goal_Position", position)
# Update gripper by +30 steps
position[-1] += 30
follower_arm.write("Goal_Position", position[-1], "gripper")
```
When you're done playing, you can try to disable the torque, but make sure you hold your robot so that it doesn't fall:
```python
follower_arm.write("Torque_Enable", TorqueMode.DISABLED.value)
```
Finally, disconnect the arms:
```python
leader_arm.disconnect()
follower_arm.disconnect()
```
Alternatively, you can unplug the power cord, which will automatically disable torque and disconnect the motors.
*/!\ Warning*: These motors tend to overheat, especially under torque or if left plugged in for too long. Unplug after use.
### b. Teleoperate your Koch v1.1 with ManipulatorRobot
**Instantiate the ManipulatorRobot**
Before you can teleoperate your robot, you need to instantiate the [`ManipulatorRobot`](../lerobot/common/robot_devices/robots/manipulator.py) using the previously defined `leader_config` and `follower_config`.
For the Koch v1.1 robot, we only have one leader, so we refer to it as `"main"` and define it as `leader_arms={"main": leader_config}`. We do the same for the follower arm. For other robots (like the Aloha), which may have two pairs of leader and follower arms, you would define them like this: `leader_arms={"left": left_leader_config, "right": right_leader_config},`. Same thing for the follower arms.
Run the following code to instantiate your manipulator robot:
```python
from lerobot.common.robot_devices.robots.configs import KochRobotConfig
from lerobot.common.robot_devices.robots.manipulator import ManipulatorRobot
robot_config = KochRobotConfig(
leader_arms={"main": leader_config},
follower_arms={"main": follower_config},
cameras={}, # We don't use any camera for now
)
robot = ManipulatorRobot(robot_config)
```
The `KochRobotConfig` is used to set the associated settings and calibration process. For instance, we activate the torque of the gripper of the leader Koch v1.1 arm and position it at a 40 degree angle to use it as a trigger.
For the [Aloha bimanual robot](https://aloha-2.github.io), we would use `AlohaRobotConfig` to set different settings such as a secondary ID for shadow joints (shoulder, elbow). Specific to Aloha, LeRobot comes with default calibration files stored in `.cache/calibration/aloha_default`. Assuming the motors have been properly assembled, no manual calibration step is expected for Aloha.
**Calibrate and Connect the ManipulatorRobot**
Next, you'll need to calibrate your Koch robot to ensure that the leader and follower arms have the same position values when they are in the same physical position. This calibration is essential because it allows a neural network trained on one Koch robot to work on another.
When you connect your robot for the first time, the [`ManipulatorRobot`](../lerobot/common/robot_devices/robots/manipulator.py) will detect if the calibration file is missing and trigger the calibration procedure. During this process, you will be guided to move each arm to three different positions.
Here are the positions you'll move the follower arm to:
| 1. Zero position | 2. Rotated position | 3. Rest position |
| ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| <img src="../media/koch/follower_zero.webp?raw=true" alt="Koch v1.1 follower arm zero position" title="Koch v1.1 follower arm zero position" style="width:100%;"> | <img src="../media/koch/follower_rotated.webp?raw=true" alt="Koch v1.1 follower arm rotated position" title="Koch v1.1 follower arm rotated position" style="width:100%;"> | <img src="../media/koch/follower_rest.webp?raw=true" alt="Koch v1.1 follower arm rest position" title="Koch v1.1 follower arm rest position" style="width:100%;"> |
And here are the corresponding positions for the leader arm:
| 1. Zero position | 2. Rotated position | 3. Rest position |
| ----------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
| <img src="../media/koch/leader_zero.webp?raw=true" alt="Koch v1.1 leader arm zero position" title="Koch v1.1 leader arm zero position" style="width:100%;"> | <img src="../media/koch/leader_rotated.webp?raw=true" alt="Koch v1.1 leader arm rotated position" title="Koch v1.1 leader arm rotated position" style="width:100%;"> | <img src="../media/koch/leader_rest.webp?raw=true" alt="Koch v1.1 leader arm rest position" title="Koch v1.1 leader arm rest position" style="width:100%;"> |
You can watch a [video tutorial of the calibration procedure](https://youtu.be/8drnU9uRY24) for more details.
During calibration, we count the number of full 360-degree rotations your motors have made since they were first used. That's why we ask you to move to this arbitrary "zero" position. We don't actually "set" the zero position, so you don't need to be accurate. After calculating these "offsets" to shift the motor values around 0, we need to assess the rotation direction of each motor, which might differ. That's why we ask you to rotate all motors to roughly 90 degrees, to measure if the values changed negatively or positively.
Finally, the rest position ensures that the follower and leader arms are roughly aligned after calibration, preventing sudden movements that could damage the motors when starting teleoperation.
Importantly, once calibrated, all Koch robots will move to the same positions (e.g. zero and rotated position) when commanded.
Run the following code to calibrate and connect your robot:
```python
robot.connect()
```
The output will look like this:
```
Connecting main follower arm
Connecting main leader arm
Missing calibration file '.cache/calibration/koch/main_follower.json'
Running calibration of koch main follower...
Move arm to zero position
[...]
Move arm to rotated position
[...]
Move arm to rest position
[...]
Calibration is done! Saving calibration file '.cache/calibration/koch/main_follower.json'
Missing calibration file '.cache/calibration/koch/main_leader.json'
Running calibration of koch main leader...
Move arm to zero position
[...]
Move arm to rotated position
[...]
Move arm to rest position
[...]
Calibration is done! Saving calibration file '.cache/calibration/koch/main_leader.json'
```
*Verifying Calibration*
Once calibration is complete, you can check the positions of the leader and follower arms to ensure they match. If the calibration was successful, the positions should be very similar.
Run this code to get the positions in degrees:
```python
leader_pos = robot.leader_arms["main"].read("Present_Position")
follower_pos = robot.follower_arms["main"].read("Present_Position")
print(leader_pos)
print(follower_pos)
```
Example output:
```
array([-0.43945312, 133.94531, 179.82422, -18.984375, -1.9335938, 34.541016], dtype=float32)
array([-0.58723712, 131.72314, 174.98743, -16.872612, 0.786213, 35.271973], dtype=float32)
```
These values are in degrees, which makes them easier to interpret and debug. The zero position used during calibration should roughly correspond to 0 degrees for each motor, and the rotated position should roughly correspond to 90 degrees for each motor.
**Teleoperate your Koch v1.1**
You can easily teleoperate your robot by reading the positions from the leader arm and sending them as goal positions to the follower arm.
To teleoperate your robot for 30 seconds at a frequency of approximately 200Hz, run the following code:
```python
import tqdm
seconds = 30
frequency = 200
for _ in tqdm.tqdm(range(seconds*frequency)):
leader_pos = robot.leader_arms["main"].read("Present_Position")
robot.follower_arms["main"].write("Goal_Position", leader_pos)
```
*Using `teleop_step` for Teleoperation*
Alternatively, you can teleoperate the robot using the `teleop_step` method from [`ManipulatorRobot`](../lerobot/common/robot_devices/robots/manipulator.py).
Run this code to teleoperate:
```python
for _ in tqdm.tqdm(range(seconds*frequency)):
robot.teleop_step()
```
*Recording data during Teleoperation*
Teleoperation is particularly useful for recording data. You can use the `teleop_step(record_data=True)` to returns both the follower arm's position as `"observation.state"` and the leader arm's position as `"action"`. This function also converts the numpy arrays into PyTorch tensors. If you're working with a robot that has two leader and two follower arms (like the Aloha), the positions are concatenated.
Run the following code to see how slowly moving the leader arm affects the observation and action:
```python
leader_pos = robot.leader_arms["main"].read("Present_Position")
follower_pos = robot.follower_arms["main"].read("Present_Position")
observation, action = robot.teleop_step(record_data=True)
print(follower_pos)
print(observation)
print(leader_pos)
print(action)
```
Expected output:
```
array([7.8223, 131.1328, 165.5859, -23.4668, -0.9668, 32.4316], dtype=float32)
{'observation.state': tensor([7.8223, 131.1328, 165.5859, -23.4668, -0.9668, 32.4316])}
array([3.4277, 134.1211, 179.8242, -18.5449, -1.5820, 34.7168], dtype=float32)
{'action': tensor([3.4277, 134.1211, 179.8242, -18.5449, -1.5820, 34.7168])}
```
*Asynchronous Frame Recording*
Additionally, `teleop_step` can asynchronously record frames from multiple cameras and include them in the observation dictionary as `"observation.images.CAMERA_NAME"`. This feature will be covered in more detail in the next section.
*Disconnecting the Robot*
When you're finished, make sure to disconnect your robot by running:
```python
robot.disconnect()
```
Alternatively, you can unplug the power cord, which will also disable torque.
*/!\ Warning*: These motors tend to overheat, especially under torque or if left plugged in for too long. Unplug after use.
### c. Add your cameras with OpenCVCamera
**(Optional) Use your phone as camera on Linux**
If you want to use your phone as a camera on Linux, follow these steps to set up a virtual camera
1. *Install `v4l2loopback-dkms` and `v4l-utils`*. Those packages are required to create virtual camera devices (`v4l2loopback`) and verify their settings with the `v4l2-ctl` utility from `v4l-utils`. Install them using:
```python
sudo apt install v4l2loopback-dkms v4l-utils
```
2. *Install [DroidCam](https://droidcam.app) on your phone*. This app is available for both iOS and Android.
3. *Install [OBS Studio](https://obsproject.com)*. This software will help you manage the camera feed. Install it using [Flatpak](https://flatpak.org):
```python
flatpak install flathub com.obsproject.Studio
```
4. *Install the DroidCam OBS plugin*. This plugin integrates DroidCam with OBS Studio. Install it with:
```python
flatpak install flathub com.obsproject.Studio.Plugin.DroidCam
```
5. *Start OBS Studio*. Launch with:
```python
flatpak run com.obsproject.Studio
```
6. *Add your phone as a source*. Follow the instructions [here](https://droidcam.app/obs/usage). Be sure to set the resolution to `640x480`.
7. *Adjust resolution settings*. In OBS Studio, go to `File > Settings > Video`. Change the `Base(Canvas) Resolution` and the `Output(Scaled) Resolution` to `640x480` by manually typing it in.
8. *Start virtual camera*. In OBS Studio, follow the instructions [here](https://obsproject.com/kb/virtual-camera-guide).
9. *Verify the virtual camera setup*. Use `v4l2-ctl` to list the devices:
```python
v4l2-ctl --list-devices
```
You should see an entry like:
```
VirtualCam (platform:v4l2loopback-000):
/dev/video1
```
10. *Check the camera resolution*. Use `v4l2-ctl` to ensure that the virtual camera output resolution is `640x480`. Change `/dev/video1` to the port of your virtual camera from the output of `v4l2-ctl --list-devices`.
```python
v4l2-ctl -d /dev/video1 --get-fmt-video
```
You should see an entry like:
```
>>> Format Video Capture:
>>> Width/Height : 640/480
>>> Pixel Format : 'YUYV' (YUYV 4:2:2)
```
Troubleshooting: If the resolution is not correct you will have to delete the Virtual Camera port and try again as it cannot be changed.
If everything is set up correctly, you can proceed with the rest of the tutorial.
**(Optional) Use your iPhone as a camera on MacOS**
To use your iPhone as a camera on macOS, enable the Continuity Camera feature:
- Ensure your Mac is running macOS 13 or later, and your iPhone is on iOS 16 or later.
- Sign in both devices with the same Apple ID.
- Connect your devices with a USB cable or turn on Wi-Fi and Bluetooth for a wireless connection.
For more details, visit [Apple support](https://support.apple.com/en-gb/guide/mac-help/mchl77879b8a/mac).
Your iPhone should be detected automatically when running the camera setup script in the next section.
**Instantiate an OpenCVCamera**
The [`OpenCVCamera`](../lerobot/common/robot_devices/cameras/opencv.py) class allows you to efficiently record frames from most cameras using the [`opencv2`](https://docs.opencv.org) library. For more details on compatibility, see [Video I/O with OpenCV Overview](https://docs.opencv.org/4.x/d0/da7/videoio_overview.html).
To instantiate an [`OpenCVCamera`](../lerobot/common/robot_devices/cameras/opencv.py), you need a camera index (e.g. `OpenCVCamera(camera_index=0)`). When you only have one camera like a webcam of a laptop, the camera index is usually `0` but it might differ, and the camera index might change if you reboot your computer or re-plug your camera. This behavior depends on your operating system.
To find the camera indices, run the following utility script, which will save a few frames from each detected camera:
```bash
python lerobot/common/robot_devices/cameras/opencv.py \
--images-dir outputs/images_from_opencv_cameras
```
The output will look something like this if you have two cameras connected:
```
Mac or Windows detected. Finding available camera indices through scanning all indices from 0 to 60
[...]
Camera found at index 0
Camera found at index 1
[...]
Connecting cameras
OpenCVCamera(0, fps=30.0, width=1920.0, height=1080.0, color_mode=rgb)
OpenCVCamera(1, fps=24.0, width=1920.0, height=1080.0, color_mode=rgb)
Saving images to outputs/images_from_opencv_cameras
Frame: 0000 Latency (ms): 39.52
[...]
Frame: 0046 Latency (ms): 40.07
Images have been saved to outputs/images_from_opencv_cameras
```
Check the saved images in `outputs/images_from_opencv_cameras` to identify which camera index corresponds to which physical camera (e.g. `0` for `camera_00` or `1` for `camera_01`):
```
camera_00_frame_000000.png
[...]
camera_00_frame_000047.png
camera_01_frame_000000.png
[...]
camera_01_frame_000047.png
```
Note: Some cameras may take a few seconds to warm up, and the first frame might be black or green.
Finally, run this code to instantiate and connect your camera:
```python
from lerobot.common.robot_devices.cameras.configs import OpenCVCameraConfig
from lerobot.common.robot_devices.cameras.opencv import OpenCVCamera
config = OpenCVCameraConfig(camera_index=0)
camera = OpenCVCamera(config)
camera.connect()
color_image = camera.read()
print(color_image.shape)
print(color_image.dtype)
```
Expected output for a laptop camera on MacBookPro:
```
(1080, 1920, 3)
uint8
```
Or like this if you followed our tutorial to set a virtual camera:
```
(480, 640, 3)
uint8
```
With certain camera, you can also specify additional parameters like frame rate, resolution, and color mode during instantiation. For instance:
```python
config = OpenCVCameraConfig(camera_index=0, fps=30, width=640, height=480)
```
If the provided arguments are not compatible with the camera, an exception will be raised.
*Disconnecting the camera*
When you're done using the camera, disconnect it by running:
```python
camera.disconnect()
```
**Instantiate your robot with cameras**
Additionally, you can set up your robot to work with your cameras.
Modify the following Python code with the appropriate camera names and configurations:
```python
robot = ManipulatorRobot(
KochRobotConfig(
leader_arms={"main": leader_arm},
follower_arms={"main": follower_arm},
calibration_dir=".cache/calibration/koch",
cameras={
"laptop": OpenCVCameraConfig(0, fps=30, width=640, height=480),
"phone": OpenCVCameraConfig(1, fps=30, width=640, height=480),
},
)
)
robot.connect()
```
As a result, `teleop_step(record_data=True` will return a frame for each camera following the pytorch "channel first" convention but we keep images in `uint8` with pixels in range [0,255] to easily save them.
Modify this code with the names of your cameras and run it:
```python
observation, action = robot.teleop_step(record_data=True)
print(observation["observation.images.laptop"].shape)
print(observation["observation.images.phone"].shape)
print(observation["observation.images.laptop"].min().item())
print(observation["observation.images.laptop"].max().item())
```
The output should look like this:
```
torch.Size([3, 480, 640])
torch.Size([3, 480, 640])
0
255
```
### d. Use `control_robot.py` and our `teleoperate` function
Instead of manually running the python code in a terminal window, you can use [`lerobot/scripts/control_robot.py`](../lerobot/scripts/control_robot.py) to instantiate your robot by providing the robot configurations via command line and control your robot with various modes as explained next.
Try running this code to teleoperate your robot (if you dont have a camera, keep reading):
```bash
python lerobot/scripts/control_robot.py \
--robot.type=koch \
--control.type=teleoperate
```
You will see a lot of lines appearing like this one:
```
INFO 2024-08-10 11:15:03 ol_robot.py:209 dt: 5.12 (195.1hz) dtRlead: 4.93 (203.0hz) dtWfoll: 0.19 (5239.0hz)
```
It contains
- `2024-08-10 11:15:03` which is the date and time of the call to the print function.
- `ol_robot.py:209` which is the end of the file name and the line number where the print function is called (`lerobot/scripts/control_robot.py` line `209`).
- `dt: 5.12 (195.1hz)` which is the "delta time" or the number of milliseconds spent between the previous call to `robot.teleop_step()` and the current one, associated with the frequency (5.12 ms equals 195.1 Hz) ; note that you can control the maximum frequency by adding fps as argument such as `--fps 30`.
- `dtRlead: 4.93 (203.0hz)` which is the number of milliseconds it took to read the position of the leader arm using `leader_arm.read("Present_Position")`.
- `dtWfoll: 0.22 (4446.9hz)` which is the number of milliseconds it took to set a new goal position for the follower arm using `follower_arm.write("Goal_position", leader_pos)` ; note that writing is done asynchronously so it takes less time than reading.
Importantly: If you don't have any camera, you can remove them dynamically with this [draccus](https://github.com/dlwh/draccus) syntax `--robot.cameras='{}'`:
```bash
python lerobot/scripts/control_robot.py \
--robot.type=koch \
--robot.cameras='{}' \
--control.type=teleoperate
```
We advise to create a new yaml file when the command becomes too long.
## 3. Record your Dataset and Visualize it
Using what you've learned previously, you can now easily record a dataset of states and actions for one episode. You can use `busy_wait` to control the speed of teleoperation and record at a fixed `fps` (frame per seconds).
Try this code to record 30 seconds at 60 fps:
```python
import time
from lerobot.scripts.control_robot import busy_wait
record_time_s = 30
fps = 60
states = []
actions = []
for _ in range(record_time_s * fps):
start_time = time.perf_counter()
observation, action = robot.teleop_step(record_data=True)
states.append(observation["observation.state"])
actions.append(action["action"])
dt_s = time.perf_counter() - start_time
busy_wait(1 / fps - dt_s)
# Note that observation and action are available in RAM, but
# you could potentially store them on disk with pickle/hdf5 or
# our optimized format `LeRobotDataset`. More on this next.
```
Importantly, many utilities are still missing. For instance, if you have cameras, you will need to save the images on disk to not go out of RAM, and to do so in threads to not slow down communication with your robot. Also, you will need to store your data in a format optimized for training and web sharing like [`LeRobotDataset`](../lerobot/common/datasets/lerobot_dataset.py). More on this in the next section.
### a. Use the `record` function
You can use the `record` function from [`lerobot/scripts/control_robot.py`](../lerobot/scripts/control_robot.py) to achieve efficient data recording. It encompasses many recording utilities:
1. Frames from cameras are saved on disk in threads, and encoded into videos at the end of each episode recording.
2. Video streams from cameras are displayed in window so that you can verify them.
3. Data is stored with [`LeRobotDataset`](../lerobot/common/datasets/lerobot_dataset.py) format which is pushed to your Hugging Face page (unless `--control.push_to_hub=false` is provided).
4. Checkpoints are done during recording, so if any issue occurs, you can resume recording by re-running the same command again with `--control.resume=true`. You will need to manually delete the dataset directory if you want to start recording from scratch.
5. Set the flow of data recording using command line arguments:
- `--control.warmup_time_s=10` defines the number of seconds before starting data collection. It allows the robot devices to warmup and synchronize (10 seconds by default).
- `--control.episode_time_s=60` defines the number of seconds for data recording for each episode (60 seconds by default).
- `--control.reset_time_s=60` defines the number of seconds for resetting the environment after each episode (60 seconds by default).
- `--control.num_episodes=50` defines the number of episodes to record (50 by default).
6. Control the flow during data recording using keyboard keys:
- Press right arrow `->` at any time during episode recording to early stop and go to resetting. Same during resetting, to early stop and to go to the next episode recording.
- Press left arrow `<-` at any time during episode recording or resetting to early stop, cancel the current episode, and re-record it.
- Press escape `ESC` at any time during episode recording to end the session early and go straight to video encoding and dataset uploading.
7. Similarly to `teleoperate`, you can also use the command line to override anything.
Before trying `record`, if you want to push your dataset to the hub, make sure you've logged in using a write-access token, which can be generated from the [Hugging Face settings](https://huggingface.co/settings/tokens):
```bash
huggingface-cli login --token ${HUGGINGFACE_TOKEN} --add-to-git-credential
```
Also, store your Hugging Face repository name in a variable (e.g. `cadene` or `lerobot`). For instance, run this to use your Hugging Face user name as repository:
```bash
HF_USER=$(huggingface-cli whoami | head -n 1)
echo $HF_USER
```
If you don't want to push to hub, use `--control.push_to_hub=false`.
Now run this to record 2 episodes:
```bash
python lerobot/scripts/control_robot.py \
--robot.type=koch \
--control.type=record \
--control.single_task="Grasp a lego block and put it in the bin." \
--control.fps=30 \
--control.repo_id=${HF_USER}/koch_test \
--control.tags='["tutorial"]' \
--control.warmup_time_s=5 \
--control.episode_time_s=30 \
--control.reset_time_s=30 \
--control.num_episodes=2 \
--control.push_to_hub=true
```
This will write your dataset locally to `~/.cache/huggingface/lerobot/{repo-id}` (e.g. `data/cadene/koch_test`) and push it on the hub at `https://huggingface.co/datasets/{HF_USER}/{repo-id}`. Your dataset will be automatically tagged with `LeRobot` for the community to find it easily, and you can also add custom tags (in this case `tutorial` for example).
You can look for other LeRobot datasets on the hub by searching for `LeRobot` tags: https://huggingface.co/datasets?other=LeRobot
You will see a lot of lines appearing like this one:
```
INFO 2024-08-10 15:02:58 ol_robot.py:219 dt:33.34 (30.0hz) dtRlead: 5.06 (197.5hz) dtWfoll: 0.25 (3963.7hz) dtRfoll: 6.22 (160.7hz) dtRlaptop: 32.57 (30.7hz) dtRphone: 33.84 (29.5hz)
```
It contains:
- `2024-08-10 15:02:58` which is the date and time of the call to the print function,
- `ol_robot.py:219` which is the end of the file name and the line number where the print function is called (`lerobot/scripts/control_robot.py` line `219`).
- `dt:33.34 (30.0hz)` which is the "delta time" or the number of milliseconds spent between the previous call to `robot.teleop_step(record_data=True)` and the current one, associated with the frequency (33.34 ms equals 30.0 Hz) ; note that we use `--fps 30` so we expect 30.0 Hz ; when a step takes more time, the line appears in yellow.
- `dtRlead: 5.06 (197.5hz)` which is the delta time of reading the present position of the leader arm.
- `dtWfoll: 0.25 (3963.7hz)` which is the delta time of writing the goal position on the follower arm ; writing is asynchronous so it takes less time than reading.
- `dtRfoll: 6.22 (160.7hz)` which is the delta time of reading the present position on the follower arm.
- `dtRlaptop:32.57 (30.7hz) ` which is the delta time of capturing an image from the laptop camera in the thread running asynchronously.
- `dtRphone:33.84 (29.5hz)` which is the delta time of capturing an image from the phone camera in the thread running asynchronously.
Troubleshooting:
- On Linux, if the left and right arrow keys and escape key don't have any effect during data recording, make sure you've set the `$DISPLAY` environment variable. See [pynput limitations](https://pynput.readthedocs.io/en/latest/limitations.html#linux).
At the end of data recording, your dataset will be uploaded on your Hugging Face page (e.g. https://huggingface.co/datasets/cadene/koch_test) that you can obtain by running:
```bash
echo https://huggingface.co/datasets/${HF_USER}/koch_test
```
### b. Advice for recording dataset
Once you're comfortable with data recording, it's time to create a larger dataset for training. A good starting task is grasping an object at different locations and placing it in a bin. We suggest recording at least 50 episodes, with 10 episodes per location. Keep the cameras fixed and maintain consistent grasping behavior throughout the recordings.
In the following sections, youll train your neural network. After achieving reliable grasping performance, you can start introducing more variations during data collection, such as additional grasp locations, different grasping techniques, and altering camera positions.
Avoid adding too much variation too quickly, as it may hinder your results.
In the coming months, we plan to release a foundational model for robotics. We anticipate that fine-tuning this model will enhance generalization, reducing the need for strict consistency during data collection.
### c. Visualize all episodes
You can visualize your dataset by running:
```bash
python lerobot/scripts/visualize_dataset_html.py \
--repo-id ${HF_USER}/koch_test
```
Note: You might need to add `--local-files-only 1` if your dataset was not uploaded to hugging face hub.
This will launch a local web server that looks like this:
<div style="text-align:center;">
<img src="../media/tutorial/visualize_dataset_html.webp?raw=true" alt="Koch v1.1 leader and follower arms" title="Koch v1.1 leader and follower arms" width="100%">
</div>
### d. Replay episode on your robot with the `replay` function
A useful feature of [`lerobot/scripts/control_robot.py`](../lerobot/scripts/control_robot.py) is the `replay` function, which allows to replay on your robot any episode that you've recorded or episodes from any dataset out there. This function helps you test the repeatability of your robot's actions and assess transferability across robots of the same model.
To replay the first episode of the dataset you just recorded, run the following command:
```bash
python lerobot/scripts/control_robot.py \
--robot.type=koch \
--control.type=replay \
--control.fps=30 \
--control.repo_id=${HF_USER}/koch_test \
--control.episode=0
```
Your robot should replicate movements similar to those you recorded. For example, check out [this video](https://x.com/RemiCadene/status/1793654950905680090) where we use `replay` on a Aloha robot from [Trossen Robotics](https://www.trossenrobotics.com).
## 4. Train a policy on your data
### a. Use the `train` script
To train a policy to control your robot, use the [`python lerobot/scripts/train.py`](../lerobot/scripts/train.py) script. A few arguments are required. Here is an example command:
```bash
python lerobot/scripts/train.py \
--dataset.repo_id=${HF_USER}/koch_test \
--policy.type=act \
--output_dir=outputs/train/act_koch_test \
--job_name=act_koch_test \
--policy.device=cuda \
--wandb.enable=true
```
Let's explain it:
1. We provided the dataset as argument with `--dataset.repo_id=${HF_USER}/koch_test`.
2. We provided the policy with `policy.type=act`. This loads configurations from [`configuration_act.py`](../lerobot/common/policies/act/configuration_act.py). Importantly, this policy will automatically adapt to the number of motor sates, motor actions and cameras of your robot (e.g. `laptop` and `phone`) which have been saved in your dataset.
4. We provided `policy.device=cuda` since we are training on a Nvidia GPU, but you could use `policy.device=mps` to train on Apple silicon.
5. We provided `wandb.enable=true` to use [Weights and Biases](https://docs.wandb.ai/quickstart) for visualizing training plots. This is optional but if you use it, make sure you are logged in by running `wandb login`.
For more information on the `train` script see the previous tutorial: [`examples/4_train_policy_with_script.md`](../examples/4_train_policy_with_script.md)
### b. (Optional) Upload policy checkpoints to the hub
Once training is done, upload the latest checkpoint with:
```bash
huggingface-cli upload ${HF_USER}/act_koch_test \
outputs/train/act_koch_test/checkpoints/last/pretrained_model
```
You can also upload intermediate checkpoints with:
```bash
CKPT=010000
huggingface-cli upload ${HF_USER}/act_koch_test_${CKPT} \
outputs/train/act_koch_test/checkpoints/${CKPT}/pretrained_model
```
## 5. Evaluate your policy
Now that you have a policy checkpoint, you can easily control your robot with it using methods from [`ManipulatorRobot`](../lerobot/common/robot_devices/robots/manipulator.py) and the policy.
Try this code for running inference for 60 seconds at 30 fps:
```python
from lerobot.common.policies.act.modeling_act import ACTPolicy
inference_time_s = 60
fps = 30
device = "cuda" # TODO: On Mac, use "mps" or "cpu"
ckpt_path = "outputs/train/act_koch_test/checkpoints/last/pretrained_model"
policy = ACTPolicy.from_pretrained(ckpt_path)
policy.to(device)
for _ in range(inference_time_s * fps):
start_time = time.perf_counter()
# Read the follower state and access the frames from the cameras
observation = robot.capture_observation()
# Convert to pytorch format: channel first and float32 in [0,1]
# with batch dimension
for name in observation:
if "image" in name:
observation[name] = observation[name].type(torch.float32) / 255
observation[name] = observation[name].permute(2, 0, 1).contiguous()
observation[name] = observation[name].unsqueeze(0)
observation[name] = observation[name].to(device)
# Compute the next action with the policy
# based on the current observation
action = policy.select_action(observation)
# Remove batch dimension
action = action.squeeze(0)
# Move to cpu, if not already the case
action = action.to("cpu")
# Order the robot to move
robot.send_action(action)
dt_s = time.perf_counter() - start_time
busy_wait(1 / fps - dt_s)
```
### a. Use our `record` function
Ideally, when controlling your robot with your neural network, you would want to record evaluation episodes and to be able to visualize them later on, or even train on them like in Reinforcement Learning. This pretty much corresponds to recording a new dataset but with a neural network providing the actions instead of teleoperation.
To this end, you can use the `record` function from [`lerobot/scripts/control_robot.py`](../lerobot/scripts/control_robot.py) but with a policy checkpoint as input. For instance, run this command to record 10 evaluation episodes:
```bash
python lerobot/scripts/control_robot.py \
--robot.type=koch \
--control.type=record \
--control.fps=30 \
--control.repo_id=${HF_USER}/eval_act_koch_test \
--control.tags='["tutorial"]' \
--control.warmup_time_s=5 \
--control.episode_time_s=30 \
--control.reset_time_s=30 \
--control.num_episodes=10 \
--control.push_to_hub=true \
--control.policy.path=outputs/train/act_koch_test/checkpoints/last/pretrained_model
```
As you can see, it's almost the same command as previously used to record your training dataset. Two things changed:
1. There is an additional `--control.policy.path` argument which indicates the path to your policy checkpoint with (e.g. `outputs/train/eval_koch_test/checkpoints/last/pretrained_model`). You can also use the model repository if you uploaded a model checkpoint to the hub (e.g. `${HF_USER}/act_koch_test`).
2. The name of dataset begins by `eval` to reflect that you are running inference (e.g. `${HF_USER}/eval_act_koch_test`).
### b. Visualize evaluation afterwards
You can then visualize your evaluation dataset by running the same command as before but with the new inference dataset as argument:
```bash
python lerobot/scripts/visualize_dataset.py \
--repo-id ${HF_USER}/eval_act_koch_test
```
## 6. Next step
Join our [Discord](https://discord.com/invite/s3KuuzsPFb) to collaborate on data collection and help us train a fully open-source foundational models for robotics!

View File

@@ -0,0 +1,37 @@
This tutorial explains how to resume a training run that you've started with the training script. If you don't know how our training script and configuration system works, please read [4_train_policy_with_script.md](./4_train_policy_with_script.md) first.
## Basic training resumption
Let's consider the example of training ACT for one of the ALOHA tasks. Here's a command that can achieve that:
```bash
python lerobot/scripts/train.py \
hydra.run.dir=outputs/train/run_resumption \
policy=act \
dataset_repo_id=lerobot/aloha_sim_transfer_cube_human \
env=aloha \
env.task=AlohaTransferCube-v0 \
training.log_freq=25 \
training.save_checkpoint=true \
training.save_freq=100
```
Here we're using the default dataset and environment for ACT, and we've taken care to set up the log frequency and checkpointing frequency to low numbers so we can test resumption. You should be able to see some logging and have a first checkpoint within 1 minute. Please interrupt the training after the first checkpoint.
To resume, all that we have to do is run the training script, providing the run directory, and the resume option:
```bash
python lerobot/scripts/train.py \
hydra.run.dir=outputs/train/run_resumption \
resume=true
```
You should see from the logging that your training picks up from where it left off.
Note that with `resume=true`, the configuration file from the last checkpoint in the training output directory is loaded. So it doesn't matter that we haven't provided all the other configuration parameters from our previous command (although there may be warnings to notify you that your command has a different configuration than than the checkpoint).
---
Now you should know how to resume your training run in case it gets interrupted or you want to extend a finished training run.
Happy coding! 🤗

View File

@@ -1,67 +0,0 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script demonstrates how to use torchvision's image transformation with LeRobotDataset for data
augmentation purposes. The transformations are passed to the dataset as an argument upon creation, and
transforms are applied to the observation images before they are returned in the dataset's __getitem__.
"""
from pathlib import Path
from torchvision.transforms import ToPILImage, v2
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
dataset_repo_id = "lerobot/aloha_static_screw_driver"
# Create a LeRobotDataset with no transformations
dataset = LeRobotDataset(dataset_repo_id, episodes=[0])
# This is equivalent to `dataset = LeRobotDataset(dataset_repo_id, image_transforms=None)`
# Get the index of the first observation in the first episode
first_idx = dataset.episode_data_index["from"][0].item()
# Get the frame corresponding to the first camera
frame = dataset[first_idx][dataset.meta.camera_keys[0]]
# Define the transformations
transforms = v2.Compose(
[
v2.ColorJitter(brightness=(0.5, 1.5)),
v2.ColorJitter(contrast=(0.5, 1.5)),
v2.ColorJitter(hue=(-0.1, 0.1)),
v2.RandomAdjustSharpness(sharpness_factor=2, p=1),
]
)
# Create another LeRobotDataset with the defined transformations
transformed_dataset = LeRobotDataset(dataset_repo_id, episodes=[0], image_transforms=transforms)
# Get a frame from the transformed dataset
transformed_frame = transformed_dataset[first_idx][transformed_dataset.meta.camera_keys[0]]
# Create a directory to store output images
output_dir = Path("outputs/image_transforms")
output_dir.mkdir(parents=True, exist_ok=True)
# Save the original frame
to_pil = ToPILImage()
to_pil(frame).save(output_dir / "original_frame.png", quality=100)
print(f"Original frame saved to {output_dir / 'original_frame.png'}.")
# Save the transformed frame
to_pil(transformed_frame).save(output_dir / "transformed_frame.png", quality=100)
print(f"Transformed frame saved to {output_dir / 'transformed_frame.png'}.")

View File

@@ -0,0 +1,87 @@
# @package _global_
# Change the seed to match what PushT eval uses
# (to avoid evaluating on seeds used for generating the training data).
seed: 100000
# Change the dataset repository to the PushT one.
dataset_repo_id: lerobot/pusht
override_dataset_stats:
observation.image:
# stats from imagenet, since we use a pretrained vision model
mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
training:
offline_steps: 80000
online_steps: 0
eval_freq: 10000
save_freq: 100000
log_freq: 250
save_model: true
batch_size: 8
lr: 1e-5
lr_backbone: 1e-5
weight_decay: 1e-4
grad_clip_norm: 10
online_steps_between_rollouts: 1
delta_timestamps:
action: "[i / ${fps} for i in range(${policy.chunk_size})]"
eval:
n_episodes: 50
batch_size: 50
# See `configuration_act.py` for more details.
policy:
name: act
# Input / output structure.
n_obs_steps: 1
chunk_size: 100 # chunk_size
n_action_steps: 100
input_shapes:
observation.image: [3, 96, 96]
observation.state: ["${env.state_dim}"]
output_shapes:
action: ["${env.action_dim}"]
# Normalization / Unnormalization
input_normalization_modes:
observation.image: mean_std
# Use min_max normalization just because it's more standard.
observation.state: min_max
output_normalization_modes:
# Use min_max normalization just because it's more standard.
action: min_max
# Architecture.
# Vision backbone.
vision_backbone: resnet18
pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
replace_final_stride_with_dilation: false
# Transformer layers.
pre_norm: false
dim_model: 512
n_heads: 8
dim_feedforward: 3200
feedforward_activation: relu
n_encoder_layers: 4
# Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
# that means only the first layer is used. Here we match the original implementation by setting this to 1.
# See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
n_decoder_layers: 1
# VAE.
use_vae: true
latent_dim: 32
n_vae_encoder_layers: 4
# Inference.
temporal_ensemble_momentum: null
# Training and loss computation.
dropout: 0.1
kl_weight: 10.0

View File

@@ -0,0 +1,70 @@
In this tutorial we will learn how to adapt a policy configuration to be compatible with a new environment and dataset. As a concrete example, we will adapt the default configuration for ACT to be compatible with the PushT environment and dataset.
If you haven't already read our tutorial on the [training script and configuration tooling](../4_train_policy_with_script.md) please do so prior to tackling this tutorial.
Let's get started!
Suppose we want to train ACT for PushT. Well, there are aspects of the ACT configuration that are specific to the ALOHA environments, and these happen to be incompatible with PushT. Therefore, trying to run the following will almost certainly raise an exception of sorts (eg: feature dimension mismatch):
```bash
python lerobot/scripts/train.py policy=act env=pusht dataset_repo_id=lerobot/pusht
```
We need to adapt the parameters of the ACT policy configuration to the PushT environment. The most important ones are the image keys.
ALOHA's datasets and environments typically use a variable number of cameras. In `lerobot/configs/policy/act.yaml` you may notice two relevant sections. Here we show you the minimal diff needed to adjust to PushT:
```diff
override_dataset_stats:
- observation.images.top:
+ observation.image:
# stats from imagenet, since we use a pretrained vision model
mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
policy:
input_shapes:
- observation.images.top: [3, 480, 640]
+ observation.image: [3, 96, 96]
observation.state: ["${env.state_dim}"]
output_shapes:
action: ["${env.action_dim}"]
input_normalization_modes:
- observation.images.top: mean_std
+ observation.image: mean_std
observation.state: min_max
output_normalization_modes:
action: min_max
```
Here we've accounted for the following:
- PushT uses "observation.image" for its image key.
- PushT provides smaller images.
_Side note: technically we could override these via the CLI, but with many changes it gets a bit messy, and we also have a bit of a challenge in that we're using `.` in our observation keys which is treated by Hydra as a hierarchical separator_.
For your convenience, we provide [`act_pusht.yaml`](./act_pusht.yaml) in this directory. It contains the diff above, plus some other (optional) ones that are explained within. Please copy it into `lerobot/configs/policy` with:
```bash
cp examples/advanced/1_train_act_pusht/act_pusht.yaml lerobot/configs/policy/act_pusht.yaml
```
(remember from a [previous tutorial](../4_train_policy_with_script.md) that Hydra will look in the `lerobot/configs` directory). Now try running the following.
<!-- Note to contributor: are you changing this command? Note that it's tested in `Makefile`, so change it there too! -->
```bash
python lerobot/scripts/train.py policy=act_pusht env=pusht
```
Notice that this is much the same as the command that failed at the start of the tutorial, only:
- Now we are using `policy=act_pusht` to point to our new configuration file.
- We can drop `dataset_repo_id=lerobot/pusht` as the change is incorporated in our new configuration file.
Hurrah! You're now training ACT for the PushT environment.
---
The bottom line of this tutorial is that when training policies for different environments and datasets you will need to understand what parts of the policy configuration are specific to those and make changes accordingly.
Happy coding! 🤗

View File

@@ -1,17 +1,3 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This script demonstrates how to slice a dataset and calculate the loss on a subset of the data.
This technique can be useful for debugging and testing purposes, as well as identifying whether a policy
@@ -23,82 +9,82 @@ on the target environment, whether that be in simulation or the real world.
"""
import math
from pathlib import Path
import torch
from huggingface_hub import snapshot_download
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy
device = torch.device("cuda")
def main():
device = torch.device("cuda")
# Download the diffusion policy for pusht environment
pretrained_policy_path = Path(snapshot_download("lerobot/diffusion_pusht"))
# OR uncomment the following to evaluate a policy from the local outputs/train folder.
# pretrained_policy_path = Path("outputs/train/example_pusht_diffusion")
# Download the diffusion policy for pusht environment
pretrained_policy_path = "lerobot/diffusion_pusht"
# OR uncomment the following to evaluate a policy from the local outputs/train folder.
# pretrained_policy_path = Path("outputs/train/example_pusht_diffusion")
policy = DiffusionPolicy.from_pretrained(pretrained_policy_path)
policy.eval()
policy.to(device)
policy = DiffusionPolicy.from_pretrained(pretrained_policy_path)
policy.eval()
policy.to(device)
# Set up the dataset.
delta_timestamps = {
# Load the previous image and state at -0.1 seconds before current frame,
# then load current image and state corresponding to 0.0 second.
"observation.image": [-0.1, 0.0],
"observation.state": [-0.1, 0.0],
# Load the previous action (-0.1), the next action to be executed (0.0),
# and 14 future actions with a 0.1 seconds spacing. All these actions will be
# used to calculate the loss.
"action": [-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4],
}
# Set up the dataset.
delta_timestamps = {
# Load the previous image and state at -0.1 seconds before current frame,
# then load current image and state corresponding to 0.0 second.
"observation.image": [-0.1, 0.0],
"observation.state": [-0.1, 0.0],
# Load the previous action (-0.1), the next action to be executed (0.0),
# and 14 future actions with a 0.1 seconds spacing. All these actions will be
# used to calculate the loss.
"action": [-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4],
}
# Load the last 10% of episodes of the dataset as a validation set.
# - Load full dataset
full_dataset = LeRobotDataset("lerobot/pusht", split="train")
# - Calculate train and val subsets
num_train_episodes = math.floor(full_dataset.num_episodes * 90 / 100)
num_val_episodes = full_dataset.num_episodes - num_train_episodes
print(f"Number of episodes in full dataset: {full_dataset.num_episodes}")
print(f"Number of episodes in training dataset (90% subset): {num_train_episodes}")
print(f"Number of episodes in validation dataset (10% subset): {num_val_episodes}")
# - Get first frame index of the validation set
first_val_frame_index = full_dataset.episode_data_index["from"][num_train_episodes].item()
# - Load frames subset belonging to validation set using the `split` argument.
# It utilizes the `datasets` library's syntax for slicing datasets.
# For more information on the Slice API, please see:
# https://huggingface.co/docs/datasets/v2.19.0/loading#slice-splits
train_dataset = LeRobotDataset(
"lerobot/pusht", split=f"train[:{first_val_frame_index}]", delta_timestamps=delta_timestamps
)
val_dataset = LeRobotDataset(
"lerobot/pusht", split=f"train[{first_val_frame_index}:]", delta_timestamps=delta_timestamps
)
print(f"Number of frames in training dataset (90% subset): {len(train_dataset)}")
print(f"Number of frames in validation dataset (10% subset): {len(val_dataset)}")
# Load the last 10% of episodes of the dataset as a validation set.
# - Load dataset metadata
dataset_metadata = LeRobotDatasetMetadata("lerobot/pusht")
# - Calculate train and val episodes
total_episodes = dataset_metadata.total_episodes
episodes = list(range(dataset_metadata.total_episodes))
num_train_episodes = math.floor(total_episodes * 90 / 100)
train_episodes = episodes[:num_train_episodes]
val_episodes = episodes[num_train_episodes:]
print(f"Number of episodes in full dataset: {total_episodes}")
print(f"Number of episodes in training dataset (90% subset): {len(train_episodes)}")
print(f"Number of episodes in validation dataset (10% subset): {len(val_episodes)}")
# - Load train and val datasets
train_dataset = LeRobotDataset(
"lerobot/pusht", episodes=train_episodes, delta_timestamps=delta_timestamps
)
val_dataset = LeRobotDataset("lerobot/pusht", episodes=val_episodes, delta_timestamps=delta_timestamps)
print(f"Number of frames in training dataset (90% subset): {len(train_dataset)}")
print(f"Number of frames in validation dataset (10% subset): {len(val_dataset)}")
# Create dataloader for evaluation.
val_dataloader = torch.utils.data.DataLoader(
val_dataset,
num_workers=4,
batch_size=64,
shuffle=False,
pin_memory=device != torch.device("cpu"),
drop_last=False,
)
# Create dataloader for evaluation.
val_dataloader = torch.utils.data.DataLoader(
val_dataset,
num_workers=4,
batch_size=64,
shuffle=False,
pin_memory=device != torch.device("cpu"),
drop_last=False,
)
# Run validation loop.
loss_cumsum = 0
n_examples_evaluated = 0
for batch in val_dataloader:
batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}
output_dict = policy.forward(batch)
# Run validation loop.
loss_cumsum = 0
n_examples_evaluated = 0
for batch in val_dataloader:
batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}
loss, _ = policy.forward(batch)
loss_cumsum += output_dict["loss"].item()
n_examples_evaluated += batch["index"].shape[0]
loss_cumsum += loss.item()
n_examples_evaluated += batch["index"].shape[0]
# Calculate the average loss over the validation set.
average_loss = loss_cumsum / n_examples_evaluated
# Calculate the average loss over the validation set.
average_loss = loss_cumsum / n_examples_evaluated
print(f"Average loss on validation set: {average_loss:.4f}")
if __name__ == "__main__":
main()
print(f"Average loss on validation set: {average_loss:.4f}")

View File

@@ -0,0 +1,89 @@
# Using `lerobot` on a real world arm
In this example, we'll be using `lerobot` on a real world arm to:
- record a dataset in the `lerobot` format
- (soon) train a policy on it
- (soon) run the policy in the real-world
## Which robotic arm to use
In this example we're using the [open-source low-cost arm from Alexander Koch](https://github.com/AlexanderKoch-Koch/low_cost_robot) in the specific setup of:
- having 6 servos per arm, i.e. using the elbow-to-wrist extension
- adding two cameras around it, one on top and one in the front
- having a teleoperation arm as well (build the leader and the follower arms in A. Koch repo, both with elbow-to-wrist extensions)
I'm using these cameras (but the setup should not be sensitive to the exact cameras you're using):
- C922 Pro Stream Webcam
- Intel(R) RealSense D455 (using only the RGB input)
In general, this example should be very easily extendable to any type of arm using Dynamixel servos with at least one camera by changing a couple of configuration in the gym env.
## Install the example
Follow these steps:
- install `lerobot`
- install the Dynamixel-sdk: ` pip install dynamixel-sdk`
## Usage
### 0 - record examples
Run the `record_training_data.py` example, selecting the duration and number of episodes you want to record, e.g.
```
DATA_DIR='./data' python record_training_data.py \
--repo-id=thomwolf/blue_red_sort \
--num-episodes=50 \
--num-frames=400
```
TODO:
- various length episodes
- being able to drop episodes
- checking uploading to the hub
### 1 - visualize the dataset
Use the standard dataset visualization script pointing it to the right folder:
```
DATA_DIR='./data' python ../../lerobot/scripts/visualize_dataset.py \
--repo-id thomwolf/blue_red_sort \
--episode-index 0
```
### 2 - Train a policy
From the example directory let's run this command to train a model using ACT
```
DATA_DIR='./data' python ../../lerobot/scripts/train.py \
device=cuda \
hydra.searchpath=[file://./train_config/] \
hydra.run.dir=./outputs/train/blue_red_sort \
dataset_repo_id=thomwolf/blue_red_sort \
env=gym_real_world \
policy=act_real_world \
wandb.enable=false
```
### 3 - Evaluate the policy in the real world
From the example directory let's run this command to evaluate our policy.
The configuration for running the policy is in the checkpoint of the model.
You can override parameters as follow:
```
python run_policy.py \
-p ./outputs/train/blue_red_sort/checkpoints/last/pretrained_model/
env.episode_length=1000
```
## Convert a hdf5 dataset recorded with the original ACT repo
You can convert a dataset from the raw data format of HDF5 files like in: https://github.com/tonyzhaozh/act with the following command:
```
python ./lerobot/scripts/push_dataset_to_hub.py
```

View File

@@ -0,0 +1,840 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from safetensors.torch import load_file, save_file\n",
"from pprint import pprint"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"original_ckpt_path = \"/home/thomwolf/Documents/Github/ACT/checkpoints/blue_red_sort/policy_last.ckpt\"\n",
"converted_ckpt_path = \"/home/thomwolf/Documents/Github/ACT/checkpoints/blue_red_sort/model.safetensors\"\n",
"\n",
"comparison_main_path = \"/home/thomwolf/Documents/Github/lerobot/examples/real_robot_example/outputs/train/blue_red_debug_no_masking/checkpoints/last/pretrained_model/\"\n",
"comparison_safetensor_path = comparison_main_path + \"model.safetensors\"\n",
"comparison_config_json_path = comparison_main_path + \"config.json\"\n",
"comparison_config_yaml_path = comparison_main_path + \"config.yaml\""
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"a = torch.load(original_ckpt_path)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"b = load_file(comparison_safetensor_path)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['model.action_head.bias',\n",
" 'model.action_head.weight',\n",
" 'model.backbone.bn1.bias',\n",
" 'model.backbone.bn1.running_mean',\n",
" 'model.backbone.bn1.running_var',\n",
" 'model.backbone.bn1.weight',\n",
" 'model.backbone.conv1.weight',\n",
" 'model.backbone.layer1.0.bn1.bias',\n",
" 'model.backbone.layer1.0.bn1.running_mean',\n",
" 'model.backbone.layer1.0.bn1.running_var',\n",
" 'model.backbone.layer1.0.bn1.weight',\n",
" 'model.backbone.layer1.0.bn2.bias',\n",
" 'model.backbone.layer1.0.bn2.running_mean',\n",
" 'model.backbone.layer1.0.bn2.running_var',\n",
" 'model.backbone.layer1.0.bn2.weight',\n",
" 'model.backbone.layer1.0.conv1.weight',\n",
" 'model.backbone.layer1.0.conv2.weight',\n",
" 'model.backbone.layer1.1.bn1.bias',\n",
" 'model.backbone.layer1.1.bn1.running_mean',\n",
" 'model.backbone.layer1.1.bn1.running_var',\n",
" 'model.backbone.layer1.1.bn1.weight',\n",
" 'model.backbone.layer1.1.bn2.bias',\n",
" 'model.backbone.layer1.1.bn2.running_mean',\n",
" 'model.backbone.layer1.1.bn2.running_var',\n",
" 'model.backbone.layer1.1.bn2.weight',\n",
" 'model.backbone.layer1.1.conv1.weight',\n",
" 'model.backbone.layer1.1.conv2.weight',\n",
" 'model.backbone.layer2.0.bn1.bias',\n",
" 'model.backbone.layer2.0.bn1.running_mean',\n",
" 'model.backbone.layer2.0.bn1.running_var',\n",
" 'model.backbone.layer2.0.bn1.weight',\n",
" 'model.backbone.layer2.0.bn2.bias',\n",
" 'model.backbone.layer2.0.bn2.running_mean',\n",
" 'model.backbone.layer2.0.bn2.running_var',\n",
" 'model.backbone.layer2.0.bn2.weight',\n",
" 'model.backbone.layer2.0.conv1.weight',\n",
" 'model.backbone.layer2.0.conv2.weight',\n",
" 'model.backbone.layer2.0.downsample.0.weight',\n",
" 'model.backbone.layer2.0.downsample.1.bias',\n",
" 'model.backbone.layer2.0.downsample.1.running_mean',\n",
" 'model.backbone.layer2.0.downsample.1.running_var',\n",
" 'model.backbone.layer2.0.downsample.1.weight',\n",
" 'model.backbone.layer2.1.bn1.bias',\n",
" 'model.backbone.layer2.1.bn1.running_mean',\n",
" 'model.backbone.layer2.1.bn1.running_var',\n",
" 'model.backbone.layer2.1.bn1.weight',\n",
" 'model.backbone.layer2.1.bn2.bias',\n",
" 'model.backbone.layer2.1.bn2.running_mean',\n",
" 'model.backbone.layer2.1.bn2.running_var',\n",
" 'model.backbone.layer2.1.bn2.weight',\n",
" 'model.backbone.layer2.1.conv1.weight',\n",
" 'model.backbone.layer2.1.conv2.weight',\n",
" 'model.backbone.layer3.0.bn1.bias',\n",
" 'model.backbone.layer3.0.bn1.running_mean',\n",
" 'model.backbone.layer3.0.bn1.running_var',\n",
" 'model.backbone.layer3.0.bn1.weight',\n",
" 'model.backbone.layer3.0.bn2.bias',\n",
" 'model.backbone.layer3.0.bn2.running_mean',\n",
" 'model.backbone.layer3.0.bn2.running_var',\n",
" 'model.backbone.layer3.0.bn2.weight',\n",
" 'model.backbone.layer3.0.conv1.weight',\n",
" 'model.backbone.layer3.0.conv2.weight',\n",
" 'model.backbone.layer3.0.downsample.0.weight',\n",
" 'model.backbone.layer3.0.downsample.1.bias',\n",
" 'model.backbone.layer3.0.downsample.1.running_mean',\n",
" 'model.backbone.layer3.0.downsample.1.running_var',\n",
" 'model.backbone.layer3.0.downsample.1.weight',\n",
" 'model.backbone.layer3.1.bn1.bias',\n",
" 'model.backbone.layer3.1.bn1.running_mean',\n",
" 'model.backbone.layer3.1.bn1.running_var',\n",
" 'model.backbone.layer3.1.bn1.weight',\n",
" 'model.backbone.layer3.1.bn2.bias',\n",
" 'model.backbone.layer3.1.bn2.running_mean',\n",
" 'model.backbone.layer3.1.bn2.running_var',\n",
" 'model.backbone.layer3.1.bn2.weight',\n",
" 'model.backbone.layer3.1.conv1.weight',\n",
" 'model.backbone.layer3.1.conv2.weight',\n",
" 'model.backbone.layer4.0.bn1.bias',\n",
" 'model.backbone.layer4.0.bn1.running_mean',\n",
" 'model.backbone.layer4.0.bn1.running_var',\n",
" 'model.backbone.layer4.0.bn1.weight',\n",
" 'model.backbone.layer4.0.bn2.bias',\n",
" 'model.backbone.layer4.0.bn2.running_mean',\n",
" 'model.backbone.layer4.0.bn2.running_var',\n",
" 'model.backbone.layer4.0.bn2.weight',\n",
" 'model.backbone.layer4.0.conv1.weight',\n",
" 'model.backbone.layer4.0.conv2.weight',\n",
" 'model.backbone.layer4.0.downsample.0.weight',\n",
" 'model.backbone.layer4.0.downsample.1.bias',\n",
" 'model.backbone.layer4.0.downsample.1.running_mean',\n",
" 'model.backbone.layer4.0.downsample.1.running_var',\n",
" 'model.backbone.layer4.0.downsample.1.weight',\n",
" 'model.backbone.layer4.1.bn1.bias',\n",
" 'model.backbone.layer4.1.bn1.running_mean',\n",
" 'model.backbone.layer4.1.bn1.running_var',\n",
" 'model.backbone.layer4.1.bn1.weight',\n",
" 'model.backbone.layer4.1.bn2.bias',\n",
" 'model.backbone.layer4.1.bn2.running_mean',\n",
" 'model.backbone.layer4.1.bn2.running_var',\n",
" 'model.backbone.layer4.1.bn2.weight',\n",
" 'model.backbone.layer4.1.conv1.weight',\n",
" 'model.backbone.layer4.1.conv2.weight',\n",
" 'model.decoder.layers.0.linear1.bias',\n",
" 'model.decoder.layers.0.linear1.weight',\n",
" 'model.decoder.layers.0.linear2.bias',\n",
" 'model.decoder.layers.0.linear2.weight',\n",
" 'model.decoder.layers.0.multihead_attn.in_proj_bias',\n",
" 'model.decoder.layers.0.multihead_attn.in_proj_weight',\n",
" 'model.decoder.layers.0.multihead_attn.out_proj.bias',\n",
" 'model.decoder.layers.0.multihead_attn.out_proj.weight',\n",
" 'model.decoder.layers.0.norm1.bias',\n",
" 'model.decoder.layers.0.norm1.weight',\n",
" 'model.decoder.layers.0.norm2.bias',\n",
" 'model.decoder.layers.0.norm2.weight',\n",
" 'model.decoder.layers.0.norm3.bias',\n",
" 'model.decoder.layers.0.norm3.weight',\n",
" 'model.decoder.layers.0.self_attn.in_proj_bias',\n",
" 'model.decoder.layers.0.self_attn.in_proj_weight',\n",
" 'model.decoder.layers.0.self_attn.out_proj.bias',\n",
" 'model.decoder.layers.0.self_attn.out_proj.weight',\n",
" 'model.decoder_pos_embed.weight',\n",
" 'model.encoder.layers.0.linear1.bias',\n",
" 'model.encoder.layers.0.linear1.weight',\n",
" 'model.encoder.layers.0.linear2.bias',\n",
" 'model.encoder.layers.0.linear2.weight',\n",
" 'model.encoder.layers.0.norm1.bias',\n",
" 'model.encoder.layers.0.norm1.weight',\n",
" 'model.encoder.layers.0.norm2.bias',\n",
" 'model.encoder.layers.0.norm2.weight',\n",
" 'model.encoder.layers.0.self_attn.in_proj_bias',\n",
" 'model.encoder.layers.0.self_attn.in_proj_weight',\n",
" 'model.encoder.layers.0.self_attn.out_proj.bias',\n",
" 'model.encoder.layers.0.self_attn.out_proj.weight',\n",
" 'model.encoder.layers.1.linear1.bias',\n",
" 'model.encoder.layers.1.linear1.weight',\n",
" 'model.encoder.layers.1.linear2.bias',\n",
" 'model.encoder.layers.1.linear2.weight',\n",
" 'model.encoder.layers.1.norm1.bias',\n",
" 'model.encoder.layers.1.norm1.weight',\n",
" 'model.encoder.layers.1.norm2.bias',\n",
" 'model.encoder.layers.1.norm2.weight',\n",
" 'model.encoder.layers.1.self_attn.in_proj_bias',\n",
" 'model.encoder.layers.1.self_attn.in_proj_weight',\n",
" 'model.encoder.layers.1.self_attn.out_proj.bias',\n",
" 'model.encoder.layers.1.self_attn.out_proj.weight',\n",
" 'model.encoder.layers.2.linear1.bias',\n",
" 'model.encoder.layers.2.linear1.weight',\n",
" 'model.encoder.layers.2.linear2.bias',\n",
" 'model.encoder.layers.2.linear2.weight',\n",
" 'model.encoder.layers.2.norm1.bias',\n",
" 'model.encoder.layers.2.norm1.weight',\n",
" 'model.encoder.layers.2.norm2.bias',\n",
" 'model.encoder.layers.2.norm2.weight',\n",
" 'model.encoder.layers.2.self_attn.in_proj_bias',\n",
" 'model.encoder.layers.2.self_attn.in_proj_weight',\n",
" 'model.encoder.layers.2.self_attn.out_proj.bias',\n",
" 'model.encoder.layers.2.self_attn.out_proj.weight',\n",
" 'model.encoder.layers.3.linear1.bias',\n",
" 'model.encoder.layers.3.linear1.weight',\n",
" 'model.encoder.layers.3.linear2.bias',\n",
" 'model.encoder.layers.3.linear2.weight',\n",
" 'model.encoder.layers.3.norm1.bias',\n",
" 'model.encoder.layers.3.norm1.weight',\n",
" 'model.encoder.layers.3.norm2.bias',\n",
" 'model.encoder.layers.3.norm2.weight',\n",
" 'model.encoder.layers.3.self_attn.in_proj_bias',\n",
" 'model.encoder.layers.3.self_attn.in_proj_weight',\n",
" 'model.encoder.layers.3.self_attn.out_proj.bias',\n",
" 'model.encoder.layers.3.self_attn.out_proj.weight',\n",
" 'model.encoder_img_feat_input_proj.bias',\n",
" 'model.encoder_img_feat_input_proj.weight',\n",
" 'model.encoder_latent_input_proj.bias',\n",
" 'model.encoder_latent_input_proj.weight',\n",
" 'model.encoder_robot_and_latent_pos_embed.weight',\n",
" 'model.encoder_robot_state_input_proj.bias',\n",
" 'model.encoder_robot_state_input_proj.weight',\n",
" 'model.vae_encoder.layers.0.linear1.bias',\n",
" 'model.vae_encoder.layers.0.linear1.weight',\n",
" 'model.vae_encoder.layers.0.linear2.bias',\n",
" 'model.vae_encoder.layers.0.linear2.weight',\n",
" 'model.vae_encoder.layers.0.norm1.bias',\n",
" 'model.vae_encoder.layers.0.norm1.weight',\n",
" 'model.vae_encoder.layers.0.norm2.bias',\n",
" 'model.vae_encoder.layers.0.norm2.weight',\n",
" 'model.vae_encoder.layers.0.self_attn.in_proj_bias',\n",
" 'model.vae_encoder.layers.0.self_attn.in_proj_weight',\n",
" 'model.vae_encoder.layers.0.self_attn.out_proj.bias',\n",
" 'model.vae_encoder.layers.0.self_attn.out_proj.weight',\n",
" 'model.vae_encoder.layers.1.linear1.bias',\n",
" 'model.vae_encoder.layers.1.linear1.weight',\n",
" 'model.vae_encoder.layers.1.linear2.bias',\n",
" 'model.vae_encoder.layers.1.linear2.weight',\n",
" 'model.vae_encoder.layers.1.norm1.bias',\n",
" 'model.vae_encoder.layers.1.norm1.weight',\n",
" 'model.vae_encoder.layers.1.norm2.bias',\n",
" 'model.vae_encoder.layers.1.norm2.weight',\n",
" 'model.vae_encoder.layers.1.self_attn.in_proj_bias',\n",
" 'model.vae_encoder.layers.1.self_attn.in_proj_weight',\n",
" 'model.vae_encoder.layers.1.self_attn.out_proj.bias',\n",
" 'model.vae_encoder.layers.1.self_attn.out_proj.weight',\n",
" 'model.vae_encoder.layers.2.linear1.bias',\n",
" 'model.vae_encoder.layers.2.linear1.weight',\n",
" 'model.vae_encoder.layers.2.linear2.bias',\n",
" 'model.vae_encoder.layers.2.linear2.weight',\n",
" 'model.vae_encoder.layers.2.norm1.bias',\n",
" 'model.vae_encoder.layers.2.norm1.weight',\n",
" 'model.vae_encoder.layers.2.norm2.bias',\n",
" 'model.vae_encoder.layers.2.norm2.weight',\n",
" 'model.vae_encoder.layers.2.self_attn.in_proj_bias',\n",
" 'model.vae_encoder.layers.2.self_attn.in_proj_weight',\n",
" 'model.vae_encoder.layers.2.self_attn.out_proj.bias',\n",
" 'model.vae_encoder.layers.2.self_attn.out_proj.weight',\n",
" 'model.vae_encoder.layers.3.linear1.bias',\n",
" 'model.vae_encoder.layers.3.linear1.weight',\n",
" 'model.vae_encoder.layers.3.linear2.bias',\n",
" 'model.vae_encoder.layers.3.linear2.weight',\n",
" 'model.vae_encoder.layers.3.norm1.bias',\n",
" 'model.vae_encoder.layers.3.norm1.weight',\n",
" 'model.vae_encoder.layers.3.norm2.bias',\n",
" 'model.vae_encoder.layers.3.norm2.weight',\n",
" 'model.vae_encoder.layers.3.self_attn.in_proj_bias',\n",
" 'model.vae_encoder.layers.3.self_attn.in_proj_weight',\n",
" 'model.vae_encoder.layers.3.self_attn.out_proj.bias',\n",
" 'model.vae_encoder.layers.3.self_attn.out_proj.weight',\n",
" 'model.vae_encoder_action_input_proj.bias',\n",
" 'model.vae_encoder_action_input_proj.weight',\n",
" 'model.vae_encoder_cls_embed.weight',\n",
" 'model.vae_encoder_latent_output_proj.bias',\n",
" 'model.vae_encoder_latent_output_proj.weight',\n",
" 'model.vae_encoder_pos_enc',\n",
" 'model.vae_encoder_robot_state_input_proj.bias',\n",
" 'model.vae_encoder_robot_state_input_proj.weight',\n",
" 'normalize_inputs.buffer_observation_images_front.mean',\n",
" 'normalize_inputs.buffer_observation_images_front.std',\n",
" 'normalize_inputs.buffer_observation_images_top.mean',\n",
" 'normalize_inputs.buffer_observation_images_top.std',\n",
" 'normalize_inputs.buffer_observation_state.mean',\n",
" 'normalize_inputs.buffer_observation_state.std',\n",
" 'normalize_targets.buffer_action.mean',\n",
" 'normalize_targets.buffer_action.std',\n",
" 'unnormalize_outputs.buffer_action.mean',\n",
" 'unnormalize_outputs.buffer_action.std']\n"
]
}
],
"source": [
"dest = list(b.keys())\n",
"pprint(dest)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['model.pos_table',\n",
" 'model.transformer.encoder.layers.0.self_attn.in_proj_weight',\n",
" 'model.transformer.encoder.layers.0.self_attn.in_proj_bias',\n",
" 'model.transformer.encoder.layers.0.self_attn.out_proj.weight',\n",
" 'model.transformer.encoder.layers.0.self_attn.out_proj.bias',\n",
" 'model.transformer.encoder.layers.0.linear1.weight',\n",
" 'model.transformer.encoder.layers.0.linear1.bias',\n",
" 'model.transformer.encoder.layers.0.linear2.weight',\n",
" 'model.transformer.encoder.layers.0.linear2.bias',\n",
" 'model.transformer.encoder.layers.0.norm1.weight',\n",
" 'model.transformer.encoder.layers.0.norm1.bias',\n",
" 'model.transformer.encoder.layers.0.norm2.weight',\n",
" 'model.transformer.encoder.layers.0.norm2.bias',\n",
" 'model.transformer.encoder.layers.1.self_attn.in_proj_weight',\n",
" 'model.transformer.encoder.layers.1.self_attn.in_proj_bias',\n",
" 'model.transformer.encoder.layers.1.self_attn.out_proj.weight',\n",
" 'model.transformer.encoder.layers.1.self_attn.out_proj.bias',\n",
" 'model.transformer.encoder.layers.1.linear1.weight',\n",
" 'model.transformer.encoder.layers.1.linear1.bias',\n",
" 'model.transformer.encoder.layers.1.linear2.weight',\n",
" 'model.transformer.encoder.layers.1.linear2.bias',\n",
" 'model.transformer.encoder.layers.1.norm1.weight',\n",
" 'model.transformer.encoder.layers.1.norm1.bias',\n",
" 'model.transformer.encoder.layers.1.norm2.weight',\n",
" 'model.transformer.encoder.layers.1.norm2.bias',\n",
" 'model.transformer.encoder.layers.2.self_attn.in_proj_weight',\n",
" 'model.transformer.encoder.layers.2.self_attn.in_proj_bias',\n",
" 'model.transformer.encoder.layers.2.self_attn.out_proj.weight',\n",
" 'model.transformer.encoder.layers.2.self_attn.out_proj.bias',\n",
" 'model.transformer.encoder.layers.2.linear1.weight',\n",
" 'model.transformer.encoder.layers.2.linear1.bias',\n",
" 'model.transformer.encoder.layers.2.linear2.weight',\n",
" 'model.transformer.encoder.layers.2.linear2.bias',\n",
" 'model.transformer.encoder.layers.2.norm1.weight',\n",
" 'model.transformer.encoder.layers.2.norm1.bias',\n",
" 'model.transformer.encoder.layers.2.norm2.weight',\n",
" 'model.transformer.encoder.layers.2.norm2.bias',\n",
" 'model.transformer.encoder.layers.3.self_attn.in_proj_weight',\n",
" 'model.transformer.encoder.layers.3.self_attn.in_proj_bias',\n",
" 'model.transformer.encoder.layers.3.self_attn.out_proj.weight',\n",
" 'model.transformer.encoder.layers.3.self_attn.out_proj.bias',\n",
" 'model.transformer.encoder.layers.3.linear1.weight',\n",
" 'model.transformer.encoder.layers.3.linear1.bias',\n",
" 'model.transformer.encoder.layers.3.linear2.weight',\n",
" 'model.transformer.encoder.layers.3.linear2.bias',\n",
" 'model.transformer.encoder.layers.3.norm1.weight',\n",
" 'model.transformer.encoder.layers.3.norm1.bias',\n",
" 'model.transformer.encoder.layers.3.norm2.weight',\n",
" 'model.transformer.encoder.layers.3.norm2.bias',\n",
" 'model.transformer.decoder.layers.0.self_attn.in_proj_weight',\n",
" 'model.transformer.decoder.layers.0.self_attn.in_proj_bias',\n",
" 'model.transformer.decoder.layers.0.self_attn.out_proj.weight',\n",
" 'model.transformer.decoder.layers.0.self_attn.out_proj.bias',\n",
" 'model.transformer.decoder.layers.0.multihead_attn.in_proj_weight',\n",
" 'model.transformer.decoder.layers.0.multihead_attn.in_proj_bias',\n",
" 'model.transformer.decoder.layers.0.multihead_attn.out_proj.weight',\n",
" 'model.transformer.decoder.layers.0.multihead_attn.out_proj.bias',\n",
" 'model.transformer.decoder.layers.0.linear1.weight',\n",
" 'model.transformer.decoder.layers.0.linear1.bias',\n",
" 'model.transformer.decoder.layers.0.linear2.weight',\n",
" 'model.transformer.decoder.layers.0.linear2.bias',\n",
" 'model.transformer.decoder.layers.0.norm1.weight',\n",
" 'model.transformer.decoder.layers.0.norm1.bias',\n",
" 'model.transformer.decoder.layers.0.norm2.weight',\n",
" 'model.transformer.decoder.layers.0.norm2.bias',\n",
" 'model.transformer.decoder.layers.0.norm3.weight',\n",
" 'model.transformer.decoder.layers.0.norm3.bias',\n",
" 'model.transformer.decoder.layers.1.self_attn.in_proj_weight',\n",
" 'model.transformer.decoder.layers.1.self_attn.in_proj_bias',\n",
" 'model.transformer.decoder.layers.1.self_attn.out_proj.weight',\n",
" 'model.transformer.decoder.layers.1.self_attn.out_proj.bias',\n",
" 'model.transformer.decoder.layers.1.multihead_attn.in_proj_weight',\n",
" 'model.transformer.decoder.layers.1.multihead_attn.in_proj_bias',\n",
" 'model.transformer.decoder.layers.1.multihead_attn.out_proj.weight',\n",
" 'model.transformer.decoder.layers.1.multihead_attn.out_proj.bias',\n",
" 'model.transformer.decoder.layers.1.linear1.weight',\n",
" 'model.transformer.decoder.layers.1.linear1.bias',\n",
" 'model.transformer.decoder.layers.1.linear2.weight',\n",
" 'model.transformer.decoder.layers.1.linear2.bias',\n",
" 'model.transformer.decoder.layers.1.norm1.weight',\n",
" 'model.transformer.decoder.layers.1.norm1.bias',\n",
" 'model.transformer.decoder.layers.1.norm2.weight',\n",
" 'model.transformer.decoder.layers.1.norm2.bias',\n",
" 'model.transformer.decoder.layers.1.norm3.weight',\n",
" 'model.transformer.decoder.layers.1.norm3.bias',\n",
" 'model.transformer.decoder.layers.2.self_attn.in_proj_weight',\n",
" 'model.transformer.decoder.layers.2.self_attn.in_proj_bias',\n",
" 'model.transformer.decoder.layers.2.self_attn.out_proj.weight',\n",
" 'model.transformer.decoder.layers.2.self_attn.out_proj.bias',\n",
" 'model.transformer.decoder.layers.2.multihead_attn.in_proj_weight',\n",
" 'model.transformer.decoder.layers.2.multihead_attn.in_proj_bias',\n",
" 'model.transformer.decoder.layers.2.multihead_attn.out_proj.weight',\n",
" 'model.transformer.decoder.layers.2.multihead_attn.out_proj.bias',\n",
" 'model.transformer.decoder.layers.2.linear1.weight',\n",
" 'model.transformer.decoder.layers.2.linear1.bias',\n",
" 'model.transformer.decoder.layers.2.linear2.weight',\n",
" 'model.transformer.decoder.layers.2.linear2.bias',\n",
" 'model.transformer.decoder.layers.2.norm1.weight',\n",
" 'model.transformer.decoder.layers.2.norm1.bias',\n",
" 'model.transformer.decoder.layers.2.norm2.weight',\n",
" 'model.transformer.decoder.layers.2.norm2.bias',\n",
" 'model.transformer.decoder.layers.2.norm3.weight',\n",
" 'model.transformer.decoder.layers.2.norm3.bias',\n",
" 'model.transformer.decoder.layers.3.self_attn.in_proj_weight',\n",
" 'model.transformer.decoder.layers.3.self_attn.in_proj_bias',\n",
" 'model.transformer.decoder.layers.3.self_attn.out_proj.weight',\n",
" 'model.transformer.decoder.layers.3.self_attn.out_proj.bias',\n",
" 'model.transformer.decoder.layers.3.multihead_attn.in_proj_weight',\n",
" 'model.transformer.decoder.layers.3.multihead_attn.in_proj_bias',\n",
" 'model.transformer.decoder.layers.3.multihead_attn.out_proj.weight',\n",
" 'model.transformer.decoder.layers.3.multihead_attn.out_proj.bias',\n",
" 'model.transformer.decoder.layers.3.linear1.weight',\n",
" 'model.transformer.decoder.layers.3.linear1.bias',\n",
" 'model.transformer.decoder.layers.3.linear2.weight',\n",
" 'model.transformer.decoder.layers.3.linear2.bias',\n",
" 'model.transformer.decoder.layers.3.norm1.weight',\n",
" 'model.transformer.decoder.layers.3.norm1.bias',\n",
" 'model.transformer.decoder.layers.3.norm2.weight',\n",
" 'model.transformer.decoder.layers.3.norm2.bias',\n",
" 'model.transformer.decoder.layers.3.norm3.weight',\n",
" 'model.transformer.decoder.layers.3.norm3.bias',\n",
" 'model.transformer.decoder.layers.4.self_attn.in_proj_weight',\n",
" 'model.transformer.decoder.layers.4.self_attn.in_proj_bias',\n",
" 'model.transformer.decoder.layers.4.self_attn.out_proj.weight',\n",
" 'model.transformer.decoder.layers.4.self_attn.out_proj.bias',\n",
" 'model.transformer.decoder.layers.4.multihead_attn.in_proj_weight',\n",
" 'model.transformer.decoder.layers.4.multihead_attn.in_proj_bias',\n",
" 'model.transformer.decoder.layers.4.multihead_attn.out_proj.weight',\n",
" 'model.transformer.decoder.layers.4.multihead_attn.out_proj.bias',\n",
" 'model.transformer.decoder.layers.4.linear1.weight',\n",
" 'model.transformer.decoder.layers.4.linear1.bias',\n",
" 'model.transformer.decoder.layers.4.linear2.weight',\n",
" 'model.transformer.decoder.layers.4.linear2.bias',\n",
" 'model.transformer.decoder.layers.4.norm1.weight',\n",
" 'model.transformer.decoder.layers.4.norm1.bias',\n",
" 'model.transformer.decoder.layers.4.norm2.weight',\n",
" 'model.transformer.decoder.layers.4.norm2.bias',\n",
" 'model.transformer.decoder.layers.4.norm3.weight',\n",
" 'model.transformer.decoder.layers.4.norm3.bias',\n",
" 'model.transformer.decoder.layers.5.self_attn.in_proj_weight',\n",
" 'model.transformer.decoder.layers.5.self_attn.in_proj_bias',\n",
" 'model.transformer.decoder.layers.5.self_attn.out_proj.weight',\n",
" 'model.transformer.decoder.layers.5.self_attn.out_proj.bias',\n",
" 'model.transformer.decoder.layers.5.multihead_attn.in_proj_weight',\n",
" 'model.transformer.decoder.layers.5.multihead_attn.in_proj_bias',\n",
" 'model.transformer.decoder.layers.5.multihead_attn.out_proj.weight',\n",
" 'model.transformer.decoder.layers.5.multihead_attn.out_proj.bias',\n",
" 'model.transformer.decoder.layers.5.linear1.weight',\n",
" 'model.transformer.decoder.layers.5.linear1.bias',\n",
" 'model.transformer.decoder.layers.5.linear2.weight',\n",
" 'model.transformer.decoder.layers.5.linear2.bias',\n",
" 'model.transformer.decoder.layers.5.norm1.weight',\n",
" 'model.transformer.decoder.layers.5.norm1.bias',\n",
" 'model.transformer.decoder.layers.5.norm2.weight',\n",
" 'model.transformer.decoder.layers.5.norm2.bias',\n",
" 'model.transformer.decoder.layers.5.norm3.weight',\n",
" 'model.transformer.decoder.layers.5.norm3.bias',\n",
" 'model.transformer.decoder.layers.6.self_attn.in_proj_weight',\n",
" 'model.transformer.decoder.layers.6.self_attn.in_proj_bias',\n",
" 'model.transformer.decoder.layers.6.self_attn.out_proj.weight',\n",
" 'model.transformer.decoder.layers.6.self_attn.out_proj.bias',\n",
" 'model.transformer.decoder.layers.6.multihead_attn.in_proj_weight',\n",
" 'model.transformer.decoder.layers.6.multihead_attn.in_proj_bias',\n",
" 'model.transformer.decoder.layers.6.multihead_attn.out_proj.weight',\n",
" 'model.transformer.decoder.layers.6.multihead_attn.out_proj.bias',\n",
" 'model.transformer.decoder.layers.6.linear1.weight',\n",
" 'model.transformer.decoder.layers.6.linear1.bias',\n",
" 'model.transformer.decoder.layers.6.linear2.weight',\n",
" 'model.transformer.decoder.layers.6.linear2.bias',\n",
" 'model.transformer.decoder.layers.6.norm1.weight',\n",
" 'model.transformer.decoder.layers.6.norm1.bias',\n",
" 'model.transformer.decoder.layers.6.norm2.weight',\n",
" 'model.transformer.decoder.layers.6.norm2.bias',\n",
" 'model.transformer.decoder.layers.6.norm3.weight',\n",
" 'model.transformer.decoder.layers.6.norm3.bias',\n",
" 'model.transformer.decoder.norm.weight',\n",
" 'model.transformer.decoder.norm.bias',\n",
" 'model.encoder.layers.0.self_attn.in_proj_weight',\n",
" 'model.encoder.layers.0.self_attn.in_proj_bias',\n",
" 'model.encoder.layers.0.self_attn.out_proj.weight',\n",
" 'model.encoder.layers.0.self_attn.out_proj.bias',\n",
" 'model.encoder.layers.0.linear1.weight',\n",
" 'model.encoder.layers.0.linear1.bias',\n",
" 'model.encoder.layers.0.linear2.weight',\n",
" 'model.encoder.layers.0.linear2.bias',\n",
" 'model.encoder.layers.0.norm1.weight',\n",
" 'model.encoder.layers.0.norm1.bias',\n",
" 'model.encoder.layers.0.norm2.weight',\n",
" 'model.encoder.layers.0.norm2.bias',\n",
" 'model.encoder.layers.1.self_attn.in_proj_weight',\n",
" 'model.encoder.layers.1.self_attn.in_proj_bias',\n",
" 'model.encoder.layers.1.self_attn.out_proj.weight',\n",
" 'model.encoder.layers.1.self_attn.out_proj.bias',\n",
" 'model.encoder.layers.1.linear1.weight',\n",
" 'model.encoder.layers.1.linear1.bias',\n",
" 'model.encoder.layers.1.linear2.weight',\n",
" 'model.encoder.layers.1.linear2.bias',\n",
" 'model.encoder.layers.1.norm1.weight',\n",
" 'model.encoder.layers.1.norm1.bias',\n",
" 'model.encoder.layers.1.norm2.weight',\n",
" 'model.encoder.layers.1.norm2.bias',\n",
" 'model.encoder.layers.2.self_attn.in_proj_weight',\n",
" 'model.encoder.layers.2.self_attn.in_proj_bias',\n",
" 'model.encoder.layers.2.self_attn.out_proj.weight',\n",
" 'model.encoder.layers.2.self_attn.out_proj.bias',\n",
" 'model.encoder.layers.2.linear1.weight',\n",
" 'model.encoder.layers.2.linear1.bias',\n",
" 'model.encoder.layers.2.linear2.weight',\n",
" 'model.encoder.layers.2.linear2.bias',\n",
" 'model.encoder.layers.2.norm1.weight',\n",
" 'model.encoder.layers.2.norm1.bias',\n",
" 'model.encoder.layers.2.norm2.weight',\n",
" 'model.encoder.layers.2.norm2.bias',\n",
" 'model.encoder.layers.3.self_attn.in_proj_weight',\n",
" 'model.encoder.layers.3.self_attn.in_proj_bias',\n",
" 'model.encoder.layers.3.self_attn.out_proj.weight',\n",
" 'model.encoder.layers.3.self_attn.out_proj.bias',\n",
" 'model.encoder.layers.3.linear1.weight',\n",
" 'model.encoder.layers.3.linear1.bias',\n",
" 'model.encoder.layers.3.linear2.weight',\n",
" 'model.encoder.layers.3.linear2.bias',\n",
" 'model.encoder.layers.3.norm1.weight',\n",
" 'model.encoder.layers.3.norm1.bias',\n",
" 'model.encoder.layers.3.norm2.weight',\n",
" 'model.encoder.layers.3.norm2.bias',\n",
" 'model.action_head.weight',\n",
" 'model.action_head.bias',\n",
" 'model.is_pad_head.weight',\n",
" 'model.is_pad_head.bias',\n",
" 'model.query_embed.weight',\n",
" 'model.input_proj.weight',\n",
" 'model.input_proj.bias',\n",
" 'model.backbones.0.0.body.conv1.weight',\n",
" 'model.backbones.0.0.body.bn1.weight',\n",
" 'model.backbones.0.0.body.bn1.bias',\n",
" 'model.backbones.0.0.body.bn1.running_mean',\n",
" 'model.backbones.0.0.body.bn1.running_var',\n",
" 'model.backbones.0.0.body.bn1.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer1.0.conv1.weight',\n",
" 'model.backbones.0.0.body.layer1.0.bn1.weight',\n",
" 'model.backbones.0.0.body.layer1.0.bn1.bias',\n",
" 'model.backbones.0.0.body.layer1.0.bn1.running_mean',\n",
" 'model.backbones.0.0.body.layer1.0.bn1.running_var',\n",
" 'model.backbones.0.0.body.layer1.0.bn1.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer1.0.conv2.weight',\n",
" 'model.backbones.0.0.body.layer1.0.bn2.weight',\n",
" 'model.backbones.0.0.body.layer1.0.bn2.bias',\n",
" 'model.backbones.0.0.body.layer1.0.bn2.running_mean',\n",
" 'model.backbones.0.0.body.layer1.0.bn2.running_var',\n",
" 'model.backbones.0.0.body.layer1.0.bn2.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer1.1.conv1.weight',\n",
" 'model.backbones.0.0.body.layer1.1.bn1.weight',\n",
" 'model.backbones.0.0.body.layer1.1.bn1.bias',\n",
" 'model.backbones.0.0.body.layer1.1.bn1.running_mean',\n",
" 'model.backbones.0.0.body.layer1.1.bn1.running_var',\n",
" 'model.backbones.0.0.body.layer1.1.bn1.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer1.1.conv2.weight',\n",
" 'model.backbones.0.0.body.layer1.1.bn2.weight',\n",
" 'model.backbones.0.0.body.layer1.1.bn2.bias',\n",
" 'model.backbones.0.0.body.layer1.1.bn2.running_mean',\n",
" 'model.backbones.0.0.body.layer1.1.bn2.running_var',\n",
" 'model.backbones.0.0.body.layer1.1.bn2.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer2.0.conv1.weight',\n",
" 'model.backbones.0.0.body.layer2.0.bn1.weight',\n",
" 'model.backbones.0.0.body.layer2.0.bn1.bias',\n",
" 'model.backbones.0.0.body.layer2.0.bn1.running_mean',\n",
" 'model.backbones.0.0.body.layer2.0.bn1.running_var',\n",
" 'model.backbones.0.0.body.layer2.0.bn1.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer2.0.conv2.weight',\n",
" 'model.backbones.0.0.body.layer2.0.bn2.weight',\n",
" 'model.backbones.0.0.body.layer2.0.bn2.bias',\n",
" 'model.backbones.0.0.body.layer2.0.bn2.running_mean',\n",
" 'model.backbones.0.0.body.layer2.0.bn2.running_var',\n",
" 'model.backbones.0.0.body.layer2.0.bn2.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer2.0.downsample.0.weight',\n",
" 'model.backbones.0.0.body.layer2.0.downsample.1.weight',\n",
" 'model.backbones.0.0.body.layer2.0.downsample.1.bias',\n",
" 'model.backbones.0.0.body.layer2.0.downsample.1.running_mean',\n",
" 'model.backbones.0.0.body.layer2.0.downsample.1.running_var',\n",
" 'model.backbones.0.0.body.layer2.0.downsample.1.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer2.1.conv1.weight',\n",
" 'model.backbones.0.0.body.layer2.1.bn1.weight',\n",
" 'model.backbones.0.0.body.layer2.1.bn1.bias',\n",
" 'model.backbones.0.0.body.layer2.1.bn1.running_mean',\n",
" 'model.backbones.0.0.body.layer2.1.bn1.running_var',\n",
" 'model.backbones.0.0.body.layer2.1.bn1.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer2.1.conv2.weight',\n",
" 'model.backbones.0.0.body.layer2.1.bn2.weight',\n",
" 'model.backbones.0.0.body.layer2.1.bn2.bias',\n",
" 'model.backbones.0.0.body.layer2.1.bn2.running_mean',\n",
" 'model.backbones.0.0.body.layer2.1.bn2.running_var',\n",
" 'model.backbones.0.0.body.layer2.1.bn2.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer3.0.conv1.weight',\n",
" 'model.backbones.0.0.body.layer3.0.bn1.weight',\n",
" 'model.backbones.0.0.body.layer3.0.bn1.bias',\n",
" 'model.backbones.0.0.body.layer3.0.bn1.running_mean',\n",
" 'model.backbones.0.0.body.layer3.0.bn1.running_var',\n",
" 'model.backbones.0.0.body.layer3.0.bn1.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer3.0.conv2.weight',\n",
" 'model.backbones.0.0.body.layer3.0.bn2.weight',\n",
" 'model.backbones.0.0.body.layer3.0.bn2.bias',\n",
" 'model.backbones.0.0.body.layer3.0.bn2.running_mean',\n",
" 'model.backbones.0.0.body.layer3.0.bn2.running_var',\n",
" 'model.backbones.0.0.body.layer3.0.bn2.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer3.0.downsample.0.weight',\n",
" 'model.backbones.0.0.body.layer3.0.downsample.1.weight',\n",
" 'model.backbones.0.0.body.layer3.0.downsample.1.bias',\n",
" 'model.backbones.0.0.body.layer3.0.downsample.1.running_mean',\n",
" 'model.backbones.0.0.body.layer3.0.downsample.1.running_var',\n",
" 'model.backbones.0.0.body.layer3.0.downsample.1.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer3.1.conv1.weight',\n",
" 'model.backbones.0.0.body.layer3.1.bn1.weight',\n",
" 'model.backbones.0.0.body.layer3.1.bn1.bias',\n",
" 'model.backbones.0.0.body.layer3.1.bn1.running_mean',\n",
" 'model.backbones.0.0.body.layer3.1.bn1.running_var',\n",
" 'model.backbones.0.0.body.layer3.1.bn1.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer3.1.conv2.weight',\n",
" 'model.backbones.0.0.body.layer3.1.bn2.weight',\n",
" 'model.backbones.0.0.body.layer3.1.bn2.bias',\n",
" 'model.backbones.0.0.body.layer3.1.bn2.running_mean',\n",
" 'model.backbones.0.0.body.layer3.1.bn2.running_var',\n",
" 'model.backbones.0.0.body.layer3.1.bn2.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer4.0.conv1.weight',\n",
" 'model.backbones.0.0.body.layer4.0.bn1.weight',\n",
" 'model.backbones.0.0.body.layer4.0.bn1.bias',\n",
" 'model.backbones.0.0.body.layer4.0.bn1.running_mean',\n",
" 'model.backbones.0.0.body.layer4.0.bn1.running_var',\n",
" 'model.backbones.0.0.body.layer4.0.bn1.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer4.0.conv2.weight',\n",
" 'model.backbones.0.0.body.layer4.0.bn2.weight',\n",
" 'model.backbones.0.0.body.layer4.0.bn2.bias',\n",
" 'model.backbones.0.0.body.layer4.0.bn2.running_mean',\n",
" 'model.backbones.0.0.body.layer4.0.bn2.running_var',\n",
" 'model.backbones.0.0.body.layer4.0.bn2.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer4.0.downsample.0.weight',\n",
" 'model.backbones.0.0.body.layer4.0.downsample.1.weight',\n",
" 'model.backbones.0.0.body.layer4.0.downsample.1.bias',\n",
" 'model.backbones.0.0.body.layer4.0.downsample.1.running_mean',\n",
" 'model.backbones.0.0.body.layer4.0.downsample.1.running_var',\n",
" 'model.backbones.0.0.body.layer4.0.downsample.1.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer4.1.conv1.weight',\n",
" 'model.backbones.0.0.body.layer4.1.bn1.weight',\n",
" 'model.backbones.0.0.body.layer4.1.bn1.bias',\n",
" 'model.backbones.0.0.body.layer4.1.bn1.running_mean',\n",
" 'model.backbones.0.0.body.layer4.1.bn1.running_var',\n",
" 'model.backbones.0.0.body.layer4.1.bn1.num_batches_tracked',\n",
" 'model.backbones.0.0.body.layer4.1.conv2.weight',\n",
" 'model.backbones.0.0.body.layer4.1.bn2.weight',\n",
" 'model.backbones.0.0.body.layer4.1.bn2.bias',\n",
" 'model.backbones.0.0.body.layer4.1.bn2.running_mean',\n",
" 'model.backbones.0.0.body.layer4.1.bn2.running_var',\n",
" 'model.backbones.0.0.body.layer4.1.bn2.num_batches_tracked',\n",
" 'model.input_proj_robot_state.weight',\n",
" 'model.input_proj_robot_state.bias',\n",
" 'model.cls_embed.weight',\n",
" 'model.encoder_action_proj.weight',\n",
" 'model.encoder_action_proj.bias',\n",
" 'model.encoder_joint_proj.weight',\n",
" 'model.encoder_joint_proj.bias',\n",
" 'model.latent_proj.weight',\n",
" 'model.latent_proj.bias',\n",
" 'model.latent_out_proj.weight',\n",
" 'model.latent_out_proj.bias',\n",
" 'model.additional_pos_embed.weight']\n"
]
}
],
"source": [
"orig = list(a.keys())\n",
"pprint(orig)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"a = torch.load(original_ckpt_path)\n",
"\n",
"to_remove_startswith = ['model.transformer.decoder.layers.1.',\n",
" 'model.transformer.decoder.layers.2.',\n",
" 'model.transformer.decoder.layers.3.',\n",
" 'model.transformer.decoder.layers.4.',\n",
" 'model.transformer.decoder.layers.5.',\n",
" 'model.transformer.decoder.layers.6.',\n",
" 'model.transformer.decoder.norm.',\n",
" 'model.is_pad_head']\n",
"\n",
"to_remove_in = ['num_batches_tracked',]\n",
"\n",
"conv = {}\n",
"\n",
"keys = list(a.keys())\n",
"for k in keys:\n",
" if any(k.startswith(tr) for tr in to_remove_startswith):\n",
" a.pop(k)\n",
" continue\n",
" if any(tr in k for tr in to_remove_in):\n",
" a.pop(k)\n",
" continue\n",
" if k.startswith('model.transformer.encoder.layers.'):\n",
" conv[k.replace('transformer.', '')] = a.pop(k)\n",
" if k.startswith('model.transformer.decoder.layers.0.'):\n",
" conv[k.replace('transformer.', '')] = a.pop(k)\n",
" if k.startswith('model.encoder.layers.'):\n",
" conv[k.replace('encoder.', 'vae_encoder.')] = a.pop(k)\n",
" if k.startswith('model.action_head.'):\n",
" conv[k] = a.pop(k)\n",
" if k.startswith('model.pos_table'):\n",
" conv[k.replace('pos_table', 'vae_encoder_pos_enc')] = a.pop(k)\n",
" if k.startswith('model.query_embed.'):\n",
" conv[k.replace('query_embed', 'decoder_pos_embed')] = a.pop(k)\n",
" if k.startswith('model.input_proj.'):\n",
" conv[k.replace('input_proj.', 'encoder_img_feat_input_proj.')] = a.pop(k)\n",
" if k.startswith('model.input_proj_robot_state.'):\n",
" conv[k.replace('input_proj_robot_state.', 'encoder_robot_state_input_proj.')] = a.pop(k)\n",
" if k.startswith('model.backbones.0.0.body.'):\n",
" conv[k.replace('backbones.0.0.body', 'backbone')] = a.pop(k)\n",
" if k.startswith('model.cls_embed.'):\n",
" conv[k.replace('cls_embed', 'vae_encoder_cls_embed')] = a.pop(k)\n",
" if k.startswith('model.encoder_action_proj.'):\n",
" conv[k.replace('encoder_action_proj', 'vae_encoder_action_input_proj')] = a.pop(k)\n",
" if k.startswith('model.encoder_joint_proj.'):\n",
" conv[k.replace('encoder_joint_proj', 'vae_encoder_robot_state_input_proj')] = a.pop(k)\n",
" if k.startswith('model.latent_proj.'):\n",
" conv[k.replace('latent_proj', 'vae_encoder_latent_output_proj')] = a.pop(k)\n",
" if k.startswith('model.latent_out_proj.'):\n",
" conv[k.replace('latent_out_proj', 'encoder_latent_input_proj')] = a.pop(k)\n",
" if k.startswith('model.additional_pos_embed.'):\n",
" conv[k.replace('additional_pos_embed', 'encoder_robot_and_latent_pos_embed')] = a.pop(k)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"OrderedDict()"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"for k, v in conv.items():\n",
" assert b[k].shape == v.shape\n",
" b[k] = v"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"save_file(b, converted_ckpt_path)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/home/thomwolf/Documents/Github/ACT/checkpoints/blue_red_sort/config.yaml'"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Now also copy the config files\n",
"import shutil\n",
"shutil.copy(comparison_config_json_path, converted_ckpt_path.replace('model.safetensors', 'config.json'))\n",
"shutil.copy(comparison_config_yaml_path, converted_ckpt_path.replace('model.safetensors', 'config.yaml'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "lerobot",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,8 @@
from gymnasium.envs.registration import register
register(
id="gym_real_world/RealEnv-v0",
entry_point="gym_real_world.gym_environment:RealEnv",
max_episode_steps=300,
nondeterministic=True,
)

View File

@@ -0,0 +1,363 @@
# ruff: noqa
"""From Alexander Koch low_cost_robot codebase at https://github.com/AlexanderKoch-Koch/low_cost_robot
Dynamixel class to control the dynamixel servos
"""
from __future__ import annotations
import enum
import math
import os
from dataclasses import dataclass
import numpy as np
from dynamixel_sdk import * # Uses Dynamixel SDK library
def pos2pwm(pos: np.ndarray) -> np.ndarray:
"""
:param pos: numpy array of joint positions in range [-pi, pi]
:return: numpy array of pwm values in range [0, 4096]
"""
return ((pos / 3.14 + 1.0) * 2048).astype(np.int64)
def pwm2pos(pwm: np.ndarray) -> np.ndarray:
"""
:param pwm: numpy array of pwm values in range [0, 4096]
:return: numpy array of joint positions in range [-pi, pi]
"""
return (pwm / 2048 - 1) * 3.14
def pwm2vel(pwm: np.ndarray) -> np.ndarray:
"""
:param pwm: numpy array of pwm/s joint velocities
:return: numpy array of rad/s joint velocities
"""
return pwm * 3.14 / 2048
def vel2pwm(vel: np.ndarray) -> np.ndarray:
"""
:param vel: numpy array of rad/s joint velocities
:return: numpy array of pwm/s joint velocities
"""
return (vel * 2048 / 3.14).astype(np.int64)
class ReadAttribute(enum.Enum):
TEMPERATURE = 146
VOLTAGE = 145
VELOCITY = 128
POSITION = 132
CURRENT = 126
PWM = 124
HARDWARE_ERROR_STATUS = 70
HOMING_OFFSET = 20
BAUDRATE = 8
class OperatingMode(enum.Enum):
VELOCITY = 1
POSITION = 3
CURRENT_CONTROLLED_POSITION = 5
PWM = 16
UNKNOWN = -1
class Dynamixel:
ADDR_TORQUE_ENABLE = 64
ADDR_GOAL_POSITION = 116
ADDR_VELOCITY_LIMIT = 44
ADDR_GOAL_PWM = 100
OPERATING_MODE_ADDR = 11
POSITION_I = 82
POSITION_P = 84
ADDR_ID = 7
@dataclass
class Config:
def instantiate(self):
return Dynamixel(self)
baudrate: int = 57600
protocol_version: float = 2.0
device_name: str = "" # /dev/tty.usbserial-1120'
dynamixel_id: int = 1
def __init__(self, config: Config):
self.config = config
self.connect()
def connect(self):
if self.config.device_name == "":
for port_name in os.listdir("/dev"):
if "ttyUSB" in port_name or "ttyACM" in port_name:
self.config.device_name = "/dev/" + port_name
print(f"using device {self.config.device_name}")
self.portHandler = PortHandler(self.config.device_name)
# self.portHandler.LA
self.packetHandler = PacketHandler(self.config.protocol_version)
if not self.portHandler.openPort():
raise Exception(f"Failed to open port {self.config.device_name}")
if not self.portHandler.setBaudRate(self.config.baudrate):
raise Exception(f"failed to set baudrate to {self.config.baudrate}")
# self.operating_mode = OperatingMode.UNKNOWN
# self.torque_enabled = False
# self._disable_torque()
self.operating_modes = [None for _ in range(32)]
self.torque_enabled = [None for _ in range(32)]
return True
def disconnect(self):
self.portHandler.closePort()
def set_goal_position(self, motor_id, goal_position):
# if self.operating_modes[motor_id] is not OperatingMode.POSITION:
# self._disable_torque(motor_id)
# self.set_operating_mode(motor_id, OperatingMode.POSITION)
# if not self.torque_enabled[motor_id]:
# self._enable_torque(motor_id)
# self._enable_torque(motor_id)
dxl_comm_result, dxl_error = self.packetHandler.write4ByteTxRx(
self.portHandler, motor_id, self.ADDR_GOAL_POSITION, goal_position
)
# self._process_response(dxl_comm_result, dxl_error)
# print(f'set position of motor {motor_id} to {goal_position}')
def set_pwm_value(self, motor_id: int, pwm_value, tries=3):
if self.operating_modes[motor_id] is not OperatingMode.PWM:
self._disable_torque(motor_id)
self.set_operating_mode(motor_id, OperatingMode.PWM)
if not self.torque_enabled[motor_id]:
self._enable_torque(motor_id)
# print(f'enabling torque')
dxl_comm_result, dxl_error = self.packetHandler.write2ByteTxRx(
self.portHandler, motor_id, self.ADDR_GOAL_PWM, pwm_value
)
# self._process_response(dxl_comm_result, dxl_error)
# print(f'set pwm of motor {motor_id} to {pwm_value}')
if dxl_comm_result != COMM_SUCCESS:
if tries <= 1:
raise ConnectionError(f"dxl_comm_result: {self.packetHandler.getTxRxResult(dxl_comm_result)}")
else:
print(f"dynamixel pwm setting failure trying again with {tries - 1} tries")
self.set_pwm_value(motor_id, pwm_value, tries=tries - 1)
elif dxl_error != 0:
print(f"dxl error {dxl_error}")
raise ConnectionError(f"dynamixel error: {self.packetHandler.getTxRxResult(dxl_error)}")
def read_temperature(self, motor_id: int):
return self._read_value(motor_id, ReadAttribute.TEMPERATURE, 1)
def read_velocity(self, motor_id: int):
pos = self._read_value(motor_id, ReadAttribute.VELOCITY, 4)
if pos > 2**31:
pos -= 2**32
# print(f'read position {pos} for motor {motor_id}')
return pos
def read_position(self, motor_id: int):
pos = self._read_value(motor_id, ReadAttribute.POSITION, 4)
if pos > 2**31:
pos -= 2**32
# print(f'read position {pos} for motor {motor_id}')
return pos
def read_position_degrees(self, motor_id: int) -> float:
return (self.read_position(motor_id) / 4096) * 360
def read_position_radians(self, motor_id: int) -> float:
return (self.read_position(motor_id) / 4096) * 2 * math.pi
def read_current(self, motor_id: int):
current = self._read_value(motor_id, ReadAttribute.CURRENT, 2)
if current > 2**15:
current -= 2**16
return current
def read_present_pwm(self, motor_id: int):
return self._read_value(motor_id, ReadAttribute.PWM, 2)
def read_hardware_error_status(self, motor_id: int):
return self._read_value(motor_id, ReadAttribute.HARDWARE_ERROR_STATUS, 1)
def disconnect(self):
self.portHandler.closePort()
def set_id(self, old_id, new_id, use_broadcast_id: bool = False):
"""
sets the id of the dynamixel servo
@param old_id: current id of the servo
@param new_id: new id
@param use_broadcast_id: set ids of all connected dynamixels if True.
If False, change only servo with self.config.id
@return:
"""
if use_broadcast_id:
current_id = 254
else:
current_id = old_id
dxl_comm_result, dxl_error = self.packetHandler.write1ByteTxRx(
self.portHandler, current_id, self.ADDR_ID, new_id
)
self._process_response(dxl_comm_result, dxl_error, old_id)
self.config.id = id
def _enable_torque(self, motor_id):
dxl_comm_result, dxl_error = self.packetHandler.write1ByteTxRx(
self.portHandler, motor_id, self.ADDR_TORQUE_ENABLE, 1
)
self._process_response(dxl_comm_result, dxl_error, motor_id)
self.torque_enabled[motor_id] = True
def _disable_torque(self, motor_id):
dxl_comm_result, dxl_error = self.packetHandler.write1ByteTxRx(
self.portHandler, motor_id, self.ADDR_TORQUE_ENABLE, 0
)
self._process_response(dxl_comm_result, dxl_error, motor_id)
self.torque_enabled[motor_id] = False
def _process_response(self, dxl_comm_result: int, dxl_error: int, motor_id: int):
if dxl_comm_result != COMM_SUCCESS:
raise ConnectionError(
f"dxl_comm_result for motor {motor_id}: {self.packetHandler.getTxRxResult(dxl_comm_result)}"
)
elif dxl_error != 0:
print(f"dxl error {dxl_error}")
raise ConnectionError(
f"dynamixel error for motor {motor_id}: {self.packetHandler.getTxRxResult(dxl_error)}"
)
def set_operating_mode(self, motor_id: int, operating_mode: OperatingMode):
dxl_comm_result, dxl_error = self.packetHandler.write2ByteTxRx(
self.portHandler, motor_id, self.OPERATING_MODE_ADDR, operating_mode.value
)
self._process_response(dxl_comm_result, dxl_error, motor_id)
self.operating_modes[motor_id] = operating_mode
def set_pwm_limit(self, motor_id: int, limit: int):
dxl_comm_result, dxl_error = self.packetHandler.write2ByteTxRx(self.portHandler, motor_id, 36, limit)
self._process_response(dxl_comm_result, dxl_error, motor_id)
def set_velocity_limit(self, motor_id: int, velocity_limit):
dxl_comm_result, dxl_error = self.packetHandler.write4ByteTxRx(
self.portHandler, motor_id, self.ADDR_VELOCITY_LIMIT, velocity_limit
)
self._process_response(dxl_comm_result, dxl_error, motor_id)
def set_P(self, motor_id: int, P: int):
dxl_comm_result, dxl_error = self.packetHandler.write2ByteTxRx(
self.portHandler, motor_id, self.POSITION_P, P
)
self._process_response(dxl_comm_result, dxl_error, motor_id)
def set_I(self, motor_id: int, I: int):
dxl_comm_result, dxl_error = self.packetHandler.write2ByteTxRx(
self.portHandler, motor_id, self.POSITION_I, I
)
self._process_response(dxl_comm_result, dxl_error, motor_id)
def read_home_offset(self, motor_id: int):
self._disable_torque(motor_id)
# dxl_comm_result, dxl_error = self.packetHandler.write4ByteTxRx(self.portHandler, motor_id,
# ReadAttribute.HOMING_OFFSET.value, home_position)
home_offset = self._read_value(motor_id, ReadAttribute.HOMING_OFFSET, 4)
# self._process_response(dxl_comm_result, dxl_error)
self._enable_torque(motor_id)
return home_offset
def set_home_offset(self, motor_id: int, home_position: int):
self._disable_torque(motor_id)
dxl_comm_result, dxl_error = self.packetHandler.write4ByteTxRx(
self.portHandler, motor_id, ReadAttribute.HOMING_OFFSET.value, home_position
)
self._process_response(dxl_comm_result, dxl_error, motor_id)
self._enable_torque(motor_id)
def set_baudrate(self, motor_id: int, baudrate):
# translate baudrate into dynamixel baudrate setting id
if baudrate == 57600:
baudrate_id = 1
elif baudrate == 1_000_000:
baudrate_id = 3
elif baudrate == 2_000_000:
baudrate_id = 4
elif baudrate == 3_000_000:
baudrate_id = 5
elif baudrate == 4_000_000:
baudrate_id = 6
else:
raise Exception("baudrate not implemented")
self._disable_torque(motor_id)
dxl_comm_result, dxl_error = self.packetHandler.write1ByteTxRx(
self.portHandler, motor_id, ReadAttribute.BAUDRATE.value, baudrate_id
)
self._process_response(dxl_comm_result, dxl_error, motor_id)
def _read_value(self, motor_id, attribute: ReadAttribute, num_bytes: int, tries=10):
try:
if num_bytes == 1:
value, dxl_comm_result, dxl_error = self.packetHandler.read1ByteTxRx(
self.portHandler, motor_id, attribute.value
)
elif num_bytes == 2:
value, dxl_comm_result, dxl_error = self.packetHandler.read2ByteTxRx(
self.portHandler, motor_id, attribute.value
)
elif num_bytes == 4:
value, dxl_comm_result, dxl_error = self.packetHandler.read4ByteTxRx(
self.portHandler, motor_id, attribute.value
)
except Exception:
if tries == 0:
raise Exception
else:
return self._read_value(motor_id, attribute, num_bytes, tries=tries - 1)
if dxl_comm_result != COMM_SUCCESS:
if tries <= 1:
# print("%s" % self.packetHandler.getTxRxResult(dxl_comm_result))
raise ConnectionError(f"dxl_comm_result {dxl_comm_result} for servo {motor_id} value {value}")
else:
print(f"dynamixel read failure for servo {motor_id} trying again with {tries - 1} tries")
time.sleep(0.02)
return self._read_value(motor_id, attribute, num_bytes, tries=tries - 1)
elif dxl_error != 0: # # print("%s" % self.packetHandler.getRxPacketError(dxl_error))
# raise ConnectionError(f'dxl_error {dxl_error} binary ' + "{0:b}".format(37))
if tries == 0 and dxl_error != 128:
raise Exception(f"Failed to read value from motor {motor_id} error is {dxl_error}")
else:
return self._read_value(motor_id, attribute, num_bytes, tries=tries - 1)
return value
def set_home_position(self, motor_id: int):
print(f"setting home position for motor {motor_id}")
self.set_home_offset(motor_id, 0)
current_position = self.read_position(motor_id)
print(f"position before {current_position}")
self.set_home_offset(motor_id, -current_position)
# dynamixel.set_home_offset(motor_id, -4096)
# dynamixel.set_home_offset(motor_id, -4294964109)
current_position = self.read_position(motor_id)
# print(f'signed position {current_position - 2** 32}')
print(f"position after {current_position}")
if __name__ == "__main__":
dynamixel = Dynamixel.Config(baudrate=1_000_000, device_name="/dev/tty.usbmodem57380045631").instantiate()
motor_id = 1
pos = dynamixel.read_position(motor_id)
for i in range(10):
s = time.monotonic()
pos = dynamixel.read_position(motor_id)
delta = time.monotonic() - s
print(f"read position took {delta}")
print(f"position {pos}")

View File

@@ -0,0 +1,192 @@
import time
from unittest.mock import MagicMock
import cv2
import gymnasium as gym
import numpy as np
from gymnasium import spaces
from .dynamixel import pos2pwm, pwm2pos
from .robot import Robot
FPS = 30
CAMERAS_SHAPES = {
"images.high": (480, 640, 3),
"images.low": (480, 640, 3),
}
CAMERAS_PORTS = {
"images.high": "/dev/video6",
"images.low": "/dev/video0",
}
LEADER_PORT = "/dev/ttyACM1"
FOLLOWER_PORT = "/dev/ttyACM0"
MockRobot = MagicMock()
MockRobot.read_position = MagicMock()
MockRobot.read_position.return_value = np.array([0.0, 1.0, 2.0, 3.0, 4.0, 5.0])
MockCamera = MagicMock()
MockCamera.isOpened = MagicMock(return_value=True)
MockCamera.read = MagicMock(return_value=(True, np.zeros((480, 640, 3), dtype=np.uint8)))
def capture_image(cam, cam_width, cam_height):
# Capture a single frame
_, frame = cam.read()
image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
# # Define your crop coordinates (top left corner and bottom right corner)
# x1, y1 = 400, 0 # Example starting coordinates (top left of the crop rectangle)
# x2, y2 = 1600, 900 # Example ending coordinates (bottom right of the crop rectangle)
# # Crop the image
# image = image[y1:y2, x1:x2]
# Resize the image
image = cv2.resize(image, (cam_width, cam_height), interpolation=cv2.INTER_AREA)
return image
class RealEnv(gym.Env):
metadata = {}
def __init__(
self,
record: bool = False,
num_joints: int = 6,
cameras_shapes: dict = CAMERAS_SHAPES,
cameras_ports: dict = CAMERAS_PORTS,
follower_port: str = FOLLOWER_PORT,
leader_port: str = LEADER_PORT,
warmup_steps: int = 100,
trigger_torque=70,
fps: int = FPS,
fps_tolerance: float = 0.1,
mock: bool = False,
):
self.num_joints = num_joints
self.cameras_shapes = cameras_shapes
self.cameras_ports = cameras_ports
self.warmup_steps = warmup_steps
assert len(self.cameras_shapes) == len(self.cameras_ports), "Number of cameras and shapes must match."
self.follower_port = follower_port
self.leader_port = leader_port
self.record = record
self.fps = fps
self.fps_tolerance = fps_tolerance
# Initialize the robot
self.follower = Robot(device_name=self.follower_port) if not mock else MockRobot
if self.record:
self.leader = Robot(device_name=self.leader_port) if not mock else MockRobot
self.leader.set_trigger_torque(trigger_torque)
# Initialize the cameras - sorted by camera names
self.cameras = {}
for cn, p in sorted(self.cameras_ports.items()):
self.cameras[cn] = cv2.VideoCapture(p) if not mock else MockCamera
if not self.cameras[cn].isOpened():
raise OSError(
f"Cannot open camera port {p} for {cn}."
f" Make sure the camera is connected and the port is correct."
f"Also check you are not spinning several instances of the same environment (eval.batch_size)"
)
# Specify gym action and observation spaces
observation_space = {}
if self.num_joints > 0:
observation_space["agent_pos"] = spaces.Box(
low=-1000.0,
high=1000.0,
shape=(num_joints,),
dtype=np.float64,
)
if self.record:
observation_space["leader_pos"] = spaces.Box(
low=-1000.0,
high=1000.0,
shape=(num_joints,),
dtype=np.float64,
)
if self.cameras_shapes:
for cn, hwc_shape in self.cameras_shapes.items():
# Assumes images are unsigned int8 in [0,255]
observation_space[cn] = spaces.Box(
low=0,
high=255,
# height x width x channels (e.g. 480 x 640 x 3)
shape=hwc_shape,
dtype=np.uint8,
)
self.observation_space = spaces.Dict(observation_space)
self.action_space = spaces.Box(low=-1, high=1, shape=(num_joints,), dtype=np.float32)
self._observation = {}
self._terminated = False
self.timestamps = []
def _get_obs(self):
qpos = self.follower.read_position()
self._observation["agent_pos"] = pwm2pos(qpos)
for cn, c in self.cameras.items():
self._observation[cn] = capture_image(c, self.cameras_shapes[cn][1], self.cameras_shapes[cn][0])
if self.record:
action = self.leader.read_position()
self._observation["leader_pos"] = pwm2pos(action)
def reset(self, seed: int | None = None):
# Reset the robot and sync the leader and follower if we are recording
for _ in range(self.warmup_steps):
self._get_obs()
if self.record:
self.follower.set_goal_pos(pos2pwm(self._observation["leader_pos"]))
self._terminated = False
info = {}
self.timestamps = []
return self._observation, info
def step(self, action: np.ndarray = None):
if self.timestamps:
# wait the right amount of time to stay at the desired fps
time.sleep(max(0, 1 / self.fps - (time.time() - self.timestamps[-1])))
self.timestamps.append(time.time())
# Get the observation
self._get_obs()
if self.record:
# Teleoperate the leader
self.follower.set_goal_pos(pos2pwm(self._observation["leader_pos"]))
else:
# Apply the action to the follower
self.follower.set_goal_pos(pos2pwm(action))
reward = 0
terminated = truncated = self._terminated
info = {"timestamp": self.timestamps[-1] - self.timestamps[0], "fps_error": False}
# Check if we are able to keep up with the desired fps
if len(self.timestamps) > 1 and (self.timestamps[-1] - self.timestamps[-2]) > 1 / (
self.fps - self.fps_tolerance
):
print(
f"Error: recording fps {1 / (self.timestamps[-1] - self.timestamps[-2]):.5f} is lower"
f" than min admited fps {(self.fps - self.fps_tolerance):.5f}"
f" at frame {len(self.timestamps)}"
)
info["fps_error"] = True
return self._observation, reward, terminated, truncated, info
def render(self): ...
def close(self):
self.follower._disable_torque()
if self.record:
self.leader._disable_torque()

View File

@@ -0,0 +1,168 @@
# ruff: noqa
"""From Alexander Koch low_cost_robot codebase at https://github.com/AlexanderKoch-Koch/low_cost_robot
Class to control the robot using dynamixel servos.
"""
from enum import Enum, auto
from typing import Union
import numpy as np
from dynamixel_sdk import DXL_HIBYTE, DXL_HIWORD, DXL_LOBYTE, DXL_LOWORD, GroupSyncRead, GroupSyncWrite
from .dynamixel import Dynamixel, OperatingMode, ReadAttribute
class MotorControlType(Enum):
PWM = auto()
POSITION_CONTROL = auto()
DISABLED = auto()
UNKNOWN = auto()
class Robot:
def __init__(self, device_name: str, baudrate=1_000_000, servo_ids=[1, 2, 3, 4, 5, 6]) -> None:
self.servo_ids = servo_ids
self.dynamixel = Dynamixel.Config(baudrate=baudrate, device_name=device_name).instantiate()
self._init_motors()
def _init_motors(self):
self.position_reader = GroupSyncRead(
self.dynamixel.portHandler, self.dynamixel.packetHandler, ReadAttribute.POSITION.value, 4
)
for id in self.servo_ids:
self.position_reader.addParam(id)
self.velocity_reader = GroupSyncRead(
self.dynamixel.portHandler, self.dynamixel.packetHandler, ReadAttribute.VELOCITY.value, 4
)
for id in self.servo_ids:
self.velocity_reader.addParam(id)
self.pos_writer = GroupSyncWrite(
self.dynamixel.portHandler, self.dynamixel.packetHandler, self.dynamixel.ADDR_GOAL_POSITION, 4
)
for id in self.servo_ids:
self.pos_writer.addParam(id, [2048])
self.pwm_writer = GroupSyncWrite(
self.dynamixel.portHandler, self.dynamixel.packetHandler, self.dynamixel.ADDR_GOAL_PWM, 2
)
for id in self.servo_ids:
self.pwm_writer.addParam(id, [2048])
self._disable_torque()
self.motor_control_state = MotorControlType.DISABLED
def read_position(self, tries=2):
"""
Reads the joint positions of the robot. 2048 is the center position. 0 and 4096 are 180 degrees in each direction.
:param tries: maximum number of tries to read the position
:return: list of joint positions in range [0, 4096]
"""
result = self.position_reader.txRxPacket()
if result != 0:
if tries > 0:
return self.read_position(tries=tries - 1)
else:
print("failed to read position!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
positions = []
for id in self.servo_ids:
position = self.position_reader.getData(id, ReadAttribute.POSITION.value, 4)
if position > 2**31:
position -= 2**32
positions.append(position)
return np.array(positions)
def read_velocity(self):
"""
Reads the joint velocities of the robot.
:return: list of joint velocities,
"""
self.velocity_reader.txRxPacket()
velocties = []
for id in self.servo_ids:
velocity = self.velocity_reader.getData(id, ReadAttribute.VELOCITY.value, 4)
if velocity > 2**31:
velocity -= 2**32
velocties.append(velocity)
return np.array(velocties)
def set_goal_pos(self, action):
"""
:param action: list or numpy array of target joint positions in range [0, 4096]
"""
if self.motor_control_state is not MotorControlType.POSITION_CONTROL:
self._set_position_control()
for i, motor_id in enumerate(self.servo_ids):
data_write = [
DXL_LOBYTE(DXL_LOWORD(action[i])),
DXL_HIBYTE(DXL_LOWORD(action[i])),
DXL_LOBYTE(DXL_HIWORD(action[i])),
DXL_HIBYTE(DXL_HIWORD(action[i])),
]
self.pos_writer.changeParam(motor_id, data_write)
self.pos_writer.txPacket()
def set_pwm(self, action):
"""
Sets the pwm values for the servos.
:param action: list or numpy array of pwm values in range [0, 885]
"""
if self.motor_control_state is not MotorControlType.PWM:
self._set_pwm_control()
for i, motor_id in enumerate(self.servo_ids):
data_write = [
DXL_LOBYTE(DXL_LOWORD(action[i])),
DXL_HIBYTE(DXL_LOWORD(action[i])),
]
self.pwm_writer.changeParam(motor_id, data_write)
self.pwm_writer.txPacket()
def set_trigger_torque(self, torque: int):
"""
Sets a constant torque torque for the last servo in the chain. This is useful for the trigger of the leader arm
"""
self.dynamixel._enable_torque(self.servo_ids[-1])
self.dynamixel.set_pwm_value(self.servo_ids[-1], torque)
def limit_pwm(self, limit: Union[int, list, np.ndarray]):
"""
Limits the pwm values for the servos in for position control
@param limit: 0 ~ 885
@return:
"""
if isinstance(limit, int):
limits = [
limit,
] * 5
else:
limits = limit
self._disable_torque()
for motor_id, limit in zip(self.servo_ids, limits, strict=False):
self.dynamixel.set_pwm_limit(motor_id, limit)
self._enable_torque()
def _disable_torque(self):
print(f"disabling torque for servos {self.servo_ids}")
for motor_id in self.servo_ids:
self.dynamixel._disable_torque(motor_id)
def _enable_torque(self):
print(f"enabling torque for servos {self.servo_ids}")
for motor_id in self.servo_ids:
self.dynamixel._enable_torque(motor_id)
def _set_pwm_control(self):
self._disable_torque()
for motor_id in self.servo_ids:
self.dynamixel.set_operating_mode(motor_id, OperatingMode.PWM)
self._enable_torque()
self.motor_control_state = MotorControlType.PWM
def _set_position_control(self):
self._disable_torque()
for motor_id in self.servo_ids:
self.dynamixel.set_operating_mode(motor_id, OperatingMode.POSITION)
self._enable_torque()
self.motor_control_state = MotorControlType.POSITION_CONTROL

View File

@@ -0,0 +1,237 @@
"""This script demonstrates how to record a LeRobot dataset of training data
using a very simple gym environment (see in examples/real_robot_example/gym_real_world/gym_environment.py).
"""
import argparse
import copy
import os
from pathlib import Path
import gym_real_world # noqa: F401
import gymnasium as gym
import numpy as np
import torch
from datasets import Dataset, Features, Sequence, Value
from omegaconf import OmegaConf
from tqdm import tqdm
from lerobot.common.datasets.compute_stats import compute_stats
from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, DATA_DIR, LeRobotDataset
from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes, save_images_concurrently
from lerobot.common.datasets.utils import (
hf_transform_to_torch,
)
from lerobot.common.datasets.video_utils import VideoFrame, encode_video_frames
from lerobot.scripts.push_dataset_to_hub import push_meta_data_to_hub, push_videos_to_hub, save_meta_data
# parse the repo_id name via command line
parser = argparse.ArgumentParser()
parser.add_argument("--repo-id", type=str, default="thomwolf/blue_red_sort")
parser.add_argument("--num-episodes", type=int, default=2)
parser.add_argument("--num-frames", type=int, default=400)
parser.add_argument("--num-workers", type=int, default=16)
parser.add_argument("--keep-last", action="store_true")
parser.add_argument("--data_dir", type=str, default=None)
parser.add_argument("--push-to-hub", action="store_true")
parser.add_argument("--fps", type=int, default=30, help="Frames per second of the recording.")
parser.add_argument(
"--fps_tolerance",
type=float,
default=0.5,
help="Tolerance in fps for the recording before dropping episodes.",
)
parser.add_argument(
"--revision", type=str, default=CODEBASE_VERSION, help="Codebase version used to generate the dataset."
)
parser.add_argument("--gym-config", type=str, default=None, help="Path to the gym config file.")
parser.add_argument("--mock_robot", action="store_true")
args = parser.parse_args()
repo_id = args.repo_id
num_episodes = args.num_episodes
num_frames = args.num_frames
revision = args.revision
fps = args.fps
fps_tolerance = args.fps_tolerance
out_data = DATA_DIR / repo_id if args.data_dir is None else Path(args.data_dir)
# During data collection, frames are stored as png images in `images_dir`
images_dir = out_data / "images"
# After data collection, png images of each episode are encoded into a mp4 file stored in `videos_dir`
videos_dir = out_data / "videos"
meta_data_dir = out_data / "meta_data"
gym_config = None
if args.config is not None:
gym_config = OmegaConf.load(args.config)
# Create image and video directories
if not os.path.exists(images_dir):
os.makedirs(images_dir, exist_ok=True)
if not os.path.exists(videos_dir):
os.makedirs(videos_dir, exist_ok=True)
if __name__ == "__main__":
# Create the gym environment - check the kwargs in gym_real_world/gym_environment.py
gym_handle = "gym_real_world/RealEnv-v0"
gym_kwargs = {}
if gym_config is not None:
gym_kwargs = OmegaConf.to_container(gym_config.gym_kwargs)
env = gym.make(
gym_handle, disable_env_checker=True, record=True, fps=fps, fps_tolerance=fps_tolerance, mock=True
)
ep_dicts = []
episode_data_index = {"from": [], "to": []}
ep_fps = []
id_from = 0
id_to = 0
os.system('spd-say "gym environment created"')
ep_idx = 0
while ep_idx < num_episodes:
# bring the follower to the leader and start camera
env.reset()
os.system(f'spd-say "go {ep_idx}"')
# init buffers
obs_replay = {k: [] for k in env.observation_space}
drop_episode = False
timestamps = []
for _ in tqdm(range(num_frames)):
# Apply the next action
observation, _, _, _, info = env.step(action=None)
# images_stacked = np.hstack(list(observation['pixels'].values()))
# images_stacked = cv2.cvtColor(images_stacked, cv2.COLOR_RGB2BGR)
# cv2.imshow('frame', images_stacked)
if info["fps_error"]:
os.system(f'spd-say "Error fps too low, dropping episode {ep_idx}"')
drop_episode = True
break
# store data
for key in observation:
obs_replay[key].append(copy.deepcopy(observation[key]))
timestamps.append(info["timestamp"])
# if cv2.waitKey(1) & 0xFF == ord('q'):
# break
os.system('spd-say "stop"')
if not drop_episode:
os.system(f'spd-say "saving episode {ep_idx}"')
ep_dict = {}
# store images in png and create the video
for img_key in env.cameras:
save_images_concurrently(
obs_replay[img_key],
images_dir / f"{img_key}_episode_{ep_idx:06d}",
args.num_workers,
)
fname = f"{img_key}_episode_{ep_idx:06d}.mp4"
# store the reference to the video frame
ep_dict[f"observation.{img_key}"] = [
{"path": f"videos/{fname}", "timestamp": tstp} for tstp in timestamps
]
state = torch.tensor(np.array(obs_replay["agent_pos"]))
action = torch.tensor(np.array(obs_replay["leader_pos"]))
next_done = torch.zeros(num_frames, dtype=torch.bool)
next_done[-1] = True
ep_dict["observation.state"] = state
ep_dict["action"] = action
ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames, dtype=torch.int64)
ep_dict["frame_index"] = torch.arange(0, num_frames, 1)
ep_dict["timestamp"] = torch.tensor(timestamps)
ep_dict["next.done"] = next_done
ep_fps.append(num_frames / timestamps[-1])
ep_dicts.append(ep_dict)
print(f"Episode {ep_idx} done, fps: {ep_fps[-1]:.2f}")
episode_data_index["from"].append(id_from)
episode_data_index["to"].append(
id_from + num_frames if args.keep_last else id_from + num_frames - 1
)
id_to = id_from + num_frames if args.keep_last else id_from + num_frames - 1
id_from = id_to
ep_idx += 1
env.close()
os.system('spd-say "encode video frames"')
for ep_idx in range(num_episodes):
for img_key in env.cameras:
# If necessary, we may want to encode the video
# with variable frame rate: https://superuser.com/questions/1661901/encoding-video-from-vfr-still-images
encode_video_frames(
images_dir / f"{img_key}_episode_{ep_idx:06d}",
videos_dir / f"{img_key}_episode_{ep_idx:06d}.mp4",
ep_fps[ep_idx],
)
os.system('spd-say "concatenate episodes"')
data_dict = concatenate_episodes(
ep_dicts, drop_episodes_last_frame=not args.keep_last
) # Since our fps varies we are sometimes off tolerance for the last frame
features = {}
keys = [key for key in data_dict if "observation.images." in key]
for key in keys:
features[key] = VideoFrame()
features["observation.state"] = Sequence(
length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None)
)
features["action"] = Sequence(
length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None)
)
features["episode_index"] = Value(dtype="int64", id=None)
features["frame_index"] = Value(dtype="int64", id=None)
features["timestamp"] = Value(dtype="float32", id=None)
features["next.done"] = Value(dtype="bool", id=None)
features["index"] = Value(dtype="int64", id=None)
hf_dataset = Dataset.from_dict(data_dict, features=Features(features))
hf_dataset.set_transform(hf_transform_to_torch)
info = {
"fps": sum(ep_fps) / len(ep_fps), # to have a good tolerance in data processing for the slowest video
"video": 1,
}
os.system('spd-say "from preloaded"')
lerobot_dataset = LeRobotDataset.from_preloaded(
repo_id=repo_id,
version=revision,
hf_dataset=hf_dataset,
episode_data_index=episode_data_index,
info=info,
videos_dir=videos_dir,
)
os.system('spd-say "compute stats"')
stats = compute_stats(lerobot_dataset)
os.system('spd-say "save to disk"')
hf_dataset = hf_dataset.with_format(None) # to remove transforms that cant be saved
hf_dataset.save_to_disk(str(out_data / "train"))
save_meta_data(info, stats, episode_data_index, meta_data_dir)
if args.push_to_hub:
hf_dataset.push_to_hub(repo_id, token=True, revision="main")
hf_dataset.push_to_hub(repo_id, token=True, revision=revision)
push_meta_data_to_hub(repo_id, meta_data_dir, revision="main")
push_meta_data_to_hub(repo_id, meta_data_dir, revision=revision)
push_videos_to_hub(repo_id, videos_dir, revision="main")
push_videos_to_hub(repo_id, videos_dir, revision=revision)

View File

@@ -0,0 +1,60 @@
import argparse
import logging
from pathlib import Path
import gym_real_world # noqa: F401
import gymnasium as gym # noqa: F401
from huggingface_hub import snapshot_download
from huggingface_hub.utils._errors import RepositoryNotFoundError
from huggingface_hub.utils._validators import HFValidationError
from lerobot.common.utils.utils import init_logging
from lerobot.scripts.eval import eval
if __name__ == "__main__":
init_logging()
parser = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
)
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument(
"-p",
"--pretrained-policy-name-or-path",
help=(
"Either the repo ID of a model hosted on the Hub or a path to a directory containing weights "
"saved using `Policy.save_pretrained`. If not provided, the policy is initialized from scratch "
"(useful for debugging). This argument is mutually exclusive with `--config`."
),
)
parser.add_argument("--revision", help="Optionally provide the Hugging Face Hub revision ID.")
parser.add_argument(
"overrides",
nargs="*",
help="Any key=value arguments to override config values (use dots for.nested=overrides)",
)
args = parser.parse_args()
try:
pretrained_policy_path = Path(
snapshot_download(args.pretrained_policy_name_or_path, revision=args.revision)
)
except (HFValidationError, RepositoryNotFoundError) as e:
if isinstance(e, HFValidationError):
error_message = (
"The provided pretrained_policy_name_or_path is not a valid Hugging Face Hub repo ID."
)
else:
error_message = (
"The provided pretrained_policy_name_or_path was not found on the Hugging Face Hub."
)
logging.warning(f"{error_message} Treating it as a local directory.")
pretrained_policy_path = Path(args.pretrained_policy_name_or_path)
if not pretrained_policy_path.is_dir() or not pretrained_policy_path.exists():
raise ValueError(
"The provided pretrained_policy_name_or_path is not a valid/existing Hugging Face Hub "
"repo ID, nor is it an existing local directory."
)
eval(pretrained_policy_path=pretrained_policy_path, config_overrides=args.overrides)

View File

@@ -0,0 +1,19 @@
# @package _global_
fps: 30
env:
name: real_world
task: RealEnv-v0
state_dim: 6
action_dim: 6
fps: ${fps}
episode_length: 200
real_world: true
gym:
cameras_shapes:
images.high: [480, 640, 3]
images.low: [480, 640, 3]
cameras_ports:
images.high: /dev/video6
images.low: /dev/video0

View File

@@ -0,0 +1,19 @@
# @package _global_
fps: 30
env:
name: real_world
task: RealEnv-v0
state_dim: 6
action_dim: 6
fps: ${fps}
episode_length: 200
real_world: true
gym:
cameras_shapes:
images.top: [480, 640, 3]
images.front: [480, 640, 3]
cameras_ports:
images.top: /dev/video6
images.front: /dev/video0

View File

@@ -0,0 +1,103 @@
# @package _global_
# Use `act_real.yaml` to train on real-world Aloha/Aloha2 datasets.
# Compared to `act.yaml`, it contains 4 cameras (i.e. right_wrist, left_wrist, images,
# low) instead of 1 camera (i.e. top). Also, `training.eval_freq` is set to -1. This config is used
# to evaluate checkpoints at a certain frequency of training steps. When it is set to -1, it deactivates evaluation.
# This is because real-world evaluation is done through [dora-lerobot](https://github.com/dora-rs/dora-lerobot).
# Look at its README for more information on how to evaluate a checkpoint in the real-world.
#
# Example of usage for training:
# ```bash
# python lerobot/scripts/train.py \
# policy=act_real \
# env=aloha_real
# ```
seed: 1000
dataset_repo_id: ???
override_dataset_stats:
observation.images.high:
# stats from imagenet, since we use a pretrained vision model
mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
observation.images.low:
# stats from imagenet, since we use a pretrained vision model
mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
training:
offline_steps: 1000
online_steps: 0
eval_freq: -1
save_freq: 1000
log_freq: 100
save_checkpoint: true
batch_size: 8
lr: 1e-5
lr_backbone: 1e-5
weight_decay: 1e-4
grad_clip_norm: 10
online_steps_between_rollouts: 1
delta_timestamps:
action: "[i / ${fps} for i in range(1, ${policy.chunk_size} + 1)]"
eval:
n_episodes: 1
batch_size: 1
# See `configuration_act.py` for more details.
policy:
name: act
# Input / output structure.
n_obs_steps: 1
chunk_size: 100 # chunk_size
n_action_steps: 100
input_shapes:
# TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
observation.images.high: [3, 480, 640]
observation.images.low: [3, 480, 640]
observation.state: ["${env.state_dim}"]
output_shapes:
action: ["${env.action_dim}"]
# Normalization / Unnormalization
input_normalization_modes:
observation.images.high: mean_std
observation.images.low: mean_std
observation.state: mean_std
output_normalization_modes:
action: mean_std
# Architecture.
# Vision backbone.
vision_backbone: resnet18
pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
replace_final_stride_with_dilation: false
# Transformer layers.
pre_norm: false
dim_model: 512
n_heads: 8
dim_feedforward: 3200
feedforward_activation: relu
n_encoder_layers: 4
# Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
# that means only the first layer is used. Here we match the original implementation by setting this to 1.
# See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
n_decoder_layers: 1
# VAE.
use_vae: true
latent_dim: 32
n_vae_encoder_layers: 4
# Inference.
temporal_ensemble_momentum: null
# Training and loss computation.
dropout: 0.1
kl_weight: 10.0

View File

@@ -0,0 +1,103 @@
# @package _global_
# Use `act_real.yaml` to train on real-world Aloha/Aloha2 datasets.
# Compared to `act.yaml`, it contains 4 cameras (i.e. right_wrist, left_wrist, images,
# front) instead of 1 camera (i.e. top). Also, `training.eval_freq` is set to -1. This config is used
# to evaluate checkpoints at a certain frequency of training steps. When it is set to -1, it deactivates evaluation.
# This is because real-world evaluation is done through [dora-lerobot](https://github.com/dora-rs/dora-lerobot).
# Look at its README for more information on how to evaluate a checkpoint in the real-world.
#
# Example of usage for training:
# ```bash
# python lerobot/scripts/train.py \
# policy=act_real \
# env=aloha_real
# ```
seed: 1000
dataset_repo_id: ???
override_dataset_stats:
observation.images.top:
# stats from imagenet, since we use a pretrained vision model
mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
observation.images.front:
# stats from imagenet, since we use a pretrained vision model
mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
training:
offline_steps: 1000
online_steps: 0
eval_freq: -1
save_freq: 1000
log_freq: 100
save_checkpoint: true
batch_size: 8
lr: 1e-5
lr_backbone: 1e-5
weight_decay: 1e-4
grad_clip_norm: 10
online_steps_between_rollouts: 1
delta_timestamps:
action: "[i / ${fps} for i in range(1, ${policy.chunk_size} + 1)]"
eval:
n_episodes: 1
batch_size: 1
# See `configuration_act.py` for more details.
policy:
name: act
# Input / output structure.
n_obs_steps: 1
chunk_size: 100 # chunk_size
n_action_steps: 100
input_shapes:
# TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
observation.images.top: [3, 480, 640]
observation.images.front: [3, 480, 640]
observation.state: ["${env.state_dim}"]
output_shapes:
action: ["${env.action_dim}"]
# Normalization / Unnormalization
input_normalization_modes:
observation.images.top: mean_std
observation.images.front: mean_std
observation.state: mean_std
output_normalization_modes:
action: mean_std
# Architecture.
# Vision backbone.
vision_backbone: resnet18
pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
replace_final_stride_with_dilation: false
# Transformer layers.
pre_norm: false
dim_model: 512
n_heads: 8
dim_feedforward: 3200
feedforward_activation: relu
n_encoder_layers: 4
# Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
# that means only the first layer is used. Here we match the original implementation by setting this to 1.
# See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
n_decoder_layers: 1
# VAE.
use_vae: true
latent_dim: 32
n_vae_encoder_layers: 4
# Inference.
temporal_ensemble_momentum: null
# Training and loss computation.
dropout: 0.1
kl_weight: 10.0

View File

@@ -1,98 +0,0 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import time
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
from lerobot.common.robots.lekiwi.config_lekiwi import LeKiwiClientConfig
from lerobot.common.robots.lekiwi.lekiwi_client import OBS_STATE, LeKiwiClient
from lerobot.common.teleoperators.keyboard import KeyboardTeleop, KeyboardTeleopConfig
from lerobot.common.teleoperators.so100 import SO100Leader, SO100LeaderConfig
NB_CYCLES_CLIENT_CONNECTION = 250
def main():
logging.info("Configuring Teleop Devices")
leader_arm_config = SO100LeaderConfig(port="/dev/tty.usbmodem58760434171")
leader_arm = SO100Leader(leader_arm_config)
keyboard_config = KeyboardTeleopConfig()
keyboard = KeyboardTeleop(keyboard_config)
logging.info("Configuring LeKiwi Client")
robot_config = LeKiwiClientConfig(remote_ip="192.0.2.42", id="lekiwi")
robot = LeKiwiClient(robot_config)
logging.info("Creating LeRobot Dataset")
# The observations that we get are expected to be in body frame (x,y,theta)
obs_dict = {f"{OBS_STATE}." + key: value for key, value in robot.state_feature.items()}
# The actions that we send are expected to be in wheel frame (motor encoders)
act_dict = {"action." + key: value for key, value in robot.action_feature.items()}
features_dict = {
**act_dict,
**obs_dict,
**robot.camera_features,
}
dataset = LeRobotDataset.create(
repo_id="user/lekiwi" + str(int(time.time())),
fps=10,
features=features_dict,
)
logging.info("Connecting Teleop Devices")
leader_arm.connect()
keyboard.connect()
logging.info("Connecting remote LeKiwi")
robot.connect()
if not robot.is_connected or not leader_arm.is_connected or not keyboard.is_connected:
logging.error("Failed to connect to all devices")
return
logging.info("Starting LeKiwi teleoperation")
i = 0
while i < NB_CYCLES_CLIENT_CONNECTION:
arm_action = leader_arm.get_action()
base_action = keyboard.get_action()
action = {**arm_action, **base_action} if len(base_action) > 0 else arm_action
action_sent = robot.send_action(action)
observation = robot.get_observation()
frame = {**action_sent, **observation}
frame.update({"task": "Dummy Example Task Dataset"})
logging.info("Saved a frame into the dataset")
dataset.add_frame(frame)
i += 1
logging.info("Disconnecting Teleop Devices and LeKiwi Client")
robot.disconnect()
leader_arm.disconnect()
keyboard.disconnect()
logging.info("Uploading dataset to the hub")
dataset.save_episode()
dataset.push_to_hub()
logging.info("Finished LeKiwi cleanly")
if __name__ == "__main__":
main()

View File

@@ -27,9 +27,6 @@ Example:
print(lerobot.available_real_world_datasets)
print(lerobot.available_policies)
print(lerobot.available_policies_per_env)
print(lerobot.available_robots)
print(lerobot.available_cameras)
print(lerobot.available_motors)
```
When implementing a new dataset loadable with LeRobotDataset follow these steps:
@@ -58,6 +55,7 @@ available_tasks_per_env = {
],
"pusht": ["PushT-v0"],
"xarm": ["XarmLift-v0"],
"dora_aloha_real": ["DoraAloha-v0", "DoraKoch-v0", "DoraReachy2-v0"],
}
available_envs = list(available_tasks_per_env.keys())
@@ -72,8 +70,6 @@ available_datasets_per_env = {
"lerobot/aloha_sim_transfer_cube_human_image",
"lerobot/aloha_sim_transfer_cube_scripted_image",
],
# TODO(alexander-soare): Add "lerobot/pusht_keypoints". Right now we can't because this is too tightly
# coupled with tests.
"pusht": ["lerobot/pusht", "lerobot/pusht_image"],
"xarm": [
"lerobot/xarm_lift_medium",
@@ -85,6 +81,23 @@ available_datasets_per_env = {
"lerobot/xarm_push_medium_image",
"lerobot/xarm_push_medium_replay_image",
],
"dora_aloha_real": [
"lerobot/aloha_static_battery",
"lerobot/aloha_static_candy",
"lerobot/aloha_static_coffee",
"lerobot/aloha_static_coffee_new",
"lerobot/aloha_static_cups_open",
"lerobot/aloha_static_fork_pick_up",
"lerobot/aloha_static_pingpong_test",
"lerobot/aloha_static_pro_pencil",
"lerobot/aloha_static_screw_driver",
"lerobot/aloha_static_tape",
"lerobot/aloha_static_thread_velcro",
"lerobot/aloha_static_towel",
"lerobot/aloha_static_vinh_cup",
"lerobot/aloha_static_vinh_cup_left",
"lerobot/aloha_static_ziploc_slide",
],
}
available_real_world_datasets = [
@@ -110,99 +123,25 @@ available_real_world_datasets = [
"lerobot/aloha_static_vinh_cup_left",
"lerobot/aloha_static_ziploc_slide",
"lerobot/umi_cup_in_the_wild",
"lerobot/unitreeh1_fold_clothes",
"lerobot/unitreeh1_rearrange_objects",
"lerobot/unitreeh1_two_robot_greeting",
"lerobot/unitreeh1_warehouse",
"lerobot/nyu_rot_dataset",
"lerobot/utokyo_saytap",
"lerobot/imperialcollege_sawyer_wrist_cam",
"lerobot/utokyo_xarm_bimanual",
"lerobot/tokyo_u_lsmo",
"lerobot/utokyo_pr2_opening_fridge",
"lerobot/cmu_franka_exploration_dataset",
"lerobot/cmu_stretch",
"lerobot/asu_table_top",
"lerobot/utokyo_pr2_tabletop_manipulation",
"lerobot/utokyo_xarm_pick_and_place",
"lerobot/ucsd_kitchen_dataset",
"lerobot/austin_buds_dataset",
"lerobot/dlr_sara_grid_clamp",
"lerobot/conq_hose_manipulation",
"lerobot/columbia_cairlab_pusht_real",
"lerobot/dlr_sara_pour",
"lerobot/dlr_edan_shared_control",
"lerobot/ucsd_pick_and_place_dataset",
"lerobot/berkeley_cable_routing",
"lerobot/nyu_franka_play_dataset",
"lerobot/austin_sirius_dataset",
"lerobot/cmu_play_fusion",
"lerobot/berkeley_gnm_sac_son",
"lerobot/nyu_door_opening_surprising_effectiveness",
"lerobot/berkeley_fanuc_manipulation",
"lerobot/jaco_play",
"lerobot/viola",
"lerobot/kaist_nonprehensile",
"lerobot/berkeley_mvp",
"lerobot/uiuc_d3field",
"lerobot/berkeley_gnm_recon",
"lerobot/austin_sailor_dataset",
"lerobot/utaustin_mutex",
"lerobot/roboturk",
"lerobot/stanford_hydra_dataset",
"lerobot/berkeley_autolab_ur5",
"lerobot/stanford_robocook",
"lerobot/toto",
"lerobot/fmb",
"lerobot/droid_100",
"lerobot/berkeley_rpt",
"lerobot/stanford_kuka_multimodal_dataset",
"lerobot/iamlab_cmu_pickup_insert",
"lerobot/taco_play",
"lerobot/berkeley_gnm_cory_hall",
"lerobot/usc_cloth_sim",
]
available_datasets = sorted(
set(itertools.chain(*available_datasets_per_env.values(), available_real_world_datasets))
available_datasets = list(
itertools.chain(*available_datasets_per_env.values(), available_real_world_datasets)
)
# lists all available policies from `lerobot/common/policies`
# lists all available policies from `lerobot/common/policies` by their class attribute: `name`.
available_policies = [
"act",
"diffusion",
"tdmpc",
"vqbet",
]
# lists all available robots from `lerobot/common/robot_devices/robots`
available_robots = [
"koch",
"koch_bimanual",
"aloha",
"so100",
"so101",
]
# lists all available cameras from `lerobot/common/robot_devices/cameras`
available_cameras = [
"opencv",
"intelrealsense",
]
# lists all available motors from `lerobot/common/robot_devices/motors`
available_motors = [
"dynamixel",
"feetech",
]
# keys and values refer to yaml files
available_policies_per_env = {
"aloha": ["act"],
"pusht": ["diffusion", "vqbet"],
"pusht": ["diffusion"],
"xarm": ["tdmpc"],
"koch_real": ["act_koch_real"],
"aloha_real": ["act_aloha_real"],
"dora_aloha_real": ["act_real"],
}
env_task_pairs = [(env, task) for env, tasks in available_tasks_per_env.items() for task in tasks]

View File

@@ -1,79 +0,0 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Helper to recalibrate your device (robot or teleoperator).
Example:
```shell
python -m lerobot.calibrate \
--teleop.type=so100_leader \
--teleop.port=/dev/tty.usbmodem58760431551 \
--teleop.id=blue
```
"""
import logging
from dataclasses import asdict, dataclass
from pprint import pformat
import draccus
from lerobot.common.robots import ( # noqa: F401
Robot,
RobotConfig,
koch_follower,
make_robot_from_config,
so100_follower,
)
from lerobot.common.teleoperators import ( # noqa: F401
Teleoperator,
TeleoperatorConfig,
make_teleoperator_from_config,
)
from lerobot.common.utils.utils import init_logging
from .common.teleoperators import koch_leader, so100_leader # noqa: F401
@dataclass
class CalibrateConfig:
teleop: TeleoperatorConfig | None = None
robot: RobotConfig | None = None
def __post_init__(self):
if bool(self.teleop) == bool(self.robot):
raise ValueError("Choose either a teleop or a robot.")
self.device = self.robot if self.robot else self.teleop
@draccus.wrap()
def calibrate(cfg: CalibrateConfig):
init_logging()
logging.info(pformat(asdict(cfg)))
if isinstance(cfg.device, RobotConfig):
device = make_robot_from_config(cfg.device)
elif isinstance(cfg.device, TeleoperatorConfig):
device = make_teleoperator_from_config(cfg.device)
device.connect(calibrate=False)
device.calibrate()
device.disconnect()
if __name__ == "__main__":
calibrate()

View File

@@ -1,17 +0,0 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .camera import Camera
from .configs import CameraConfig
from .utils import make_cameras_from_configs

View File

@@ -1,49 +0,0 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import numpy as np
from .configs import CameraConfig, ColorMode
class Camera(abc.ABC):
def __init__(self, config: CameraConfig):
self.fps: int | None = config.fps
self.width: int | None = config.width
self.height: int | None = config.height
@property
@abc.abstractmethod
def is_connected(self) -> bool:
pass
@abc.abstractmethod
def connect(self, do_warmup_read: bool = True) -> None:
pass
@abc.abstractmethod
def read(self, color_mode: ColorMode | None = None) -> np.ndarray:
pass
@abc.abstractmethod
def async_read(self, timeout_ms: float = 2000) -> np.ndarray:
pass
@abc.abstractmethod
def disconnect(self) -> None:
pass

View File

@@ -1,44 +0,0 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
from dataclasses import dataclass
from enum import Enum
import draccus
class ColorMode(Enum):
RGB = "rgb"
BGR = "bgr"
class Cv2Rotation(Enum):
NO_ROTATION = 0
ROTATE_90 = 90
ROTATE_180 = 180
ROTATE_270 = -90
@dataclass(kw_only=True)
class CameraConfig(draccus.ChoiceRegistry, abc.ABC):
fps: int | None = None
width: int | None = None
height: int | None = None
@property
def type(self) -> str:
return self.get_choice_name(self.__class__)

View File

@@ -1,16 +0,0 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .camera_realsense import RealSenseCamera
from .configuration_realsense import RealSenseCameraConfig

View File

@@ -1,672 +0,0 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Provides the RealSenseCamera class for capturing frames from Intel RealSense cameras.
"""
import contextlib
import logging
import math
import queue
import time
from threading import Event, Thread
from typing import Any, Dict, List
import cv2
import numpy as np
import pyrealsense2 as rs
from lerobot.common.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
from lerobot.common.utils.utils import capture_timestamp_utc
from ..camera import Camera
from ..configs import ColorMode
from ..utils import get_cv2_rotation
from .configuration_realsense import RealSenseCameraConfig
logger = logging.getLogger(__name__)
class RealSenseCamera(Camera):
"""
Manages interactions with Intel RealSense cameras for frame and depth recording.
This class provides an interface similar to `OpenCVCamera` but tailored for
RealSense devices, leveraging the `pyrealsense2` library. It uses the camera's
unique serial number for identification, offering more stability than device
indices, especially on Linux. It also supports capturing depth maps alongside
color frames.
Use the provided utility script to find available camera indices and default profiles:
```bash
python -m lerobot.find_cameras
```
A `RealSenseCamera` instance requires a configuration object specifying the
camera's serial number or a unique device name. If using the name, ensure only
one camera with that name is connected.
The camera's default settings (FPS, resolution, color mode) from the stream
profile are used unless overridden in the configuration.
Args:
config (RealSenseCameraConfig): Configuration object containing settings like
serial number or name, desired FPS, width, height, color mode, rotation,
and whether to capture depth.
Example:
```python
from lerobot.common.cameras.intel.camera_realsense import RealSenseCamera
from lerobot.common.cameras.intel.configuration_realsense import RealSenseCameraConfig
from lerobot.common.cameras.configs import ColorMode
# Basic usage with serial number
config = RealSenseCameraConfig(serial_number="1234567890") # Replace with actual SN
camera = RealSenseCamera(config)
try:
camera.connect()
print(f"Connected to {camera}")
color_image = camera.read() # Synchronous read (color only)
print(f"Read frame shape: {color_image.shape}")
async_image = camera.async_read() # Asynchronous read
print(f"Async read frame shape: {async_image.shape}")
except Exception as e:
print(f"An error occurred: {e}")
finally:
camera.disconnect()
print(f"Disconnected from {camera}")
# Example with depth capture and custom settings
custom_config = RealSenseCameraConfig(
serial_number="1234567890", # Replace with actual SN
fps=30,
width=1280,
height=720,
color_mode=ColorMode.BGR, # Request BGR output
rotation=0,
use_depth=True
)
depth_camera = RealSenseCamera(custom_config)
try:
depth_camera.connect()
color_image, depth_map = depth_camera.read() # Returns tuple
print(f"Color shape: {color_image.shape}, Depth shape: {depth_map.shape}")
finally:
depth_camera.disconnect()
# Example using a unique camera name
name_config = RealSenseCameraConfig(name="Intel RealSense D435") # If unique
name_camera = RealSenseCamera(name_config)
# ... connect, read, disconnect ...
```
"""
def __init__(self, config: RealSenseCameraConfig):
"""
Initializes the RealSenseCamera instance.
Args:
config: The configuration settings for the camera.
"""
super().__init__(config)
self.config = config
if config.name is not None: # NOTE(Steven): Do we want to continue supporting this?
self.serial_number = self._find_serial_number_from_name(config.name)
elif config.serial_number is not None:
self.serial_number = str(config.serial_number)
else:
raise ValueError("RealSenseCameraConfig must provide either 'serial_number' or 'name'.")
self.fps: int | None = config.fps
self.channels: int = config.channels
self.color_mode: ColorMode = config.color_mode
self.use_depth: bool = config.use_depth
self.rs_pipeline: rs.pipeline | None = None
self.rs_profile: rs.pipeline_profile | None = None
self.thread: Thread | None = None
self.stop_event: Event | None = None
self.frame_queue: queue.Queue = queue.Queue(maxsize=1)
self.logs: dict = {} # For timestamping or other metadata
self.rotation: int | None = get_cv2_rotation(config.rotation)
if self.height and self.width:
if self.rotation in [cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE]:
self.prerotated_width, self.prerotated_height = self.height, self.width
else:
self.prerotated_width, self.prerotated_height = self.width, self.height
def __str__(self) -> str:
"""Returns a string representation of the camera instance."""
return f"{self.__class__.__name__}({self.serial_number})"
@property
def is_connected(self) -> bool:
"""Checks if the camera pipeline is started and streams are active."""
return self.rs_pipeline is not None and self.rs_profile is not None
@staticmethod
def find_cameras(raise_when_empty: bool = True) -> List[Dict[str, Any]]:
"""
Detects available Intel RealSense cameras connected to the system.
Args:
raise_when_empty (bool): If True, raises an OSError if no cameras are found.
Returns:
List[Dict[str, Any]]: A list of dictionaries,
where each dictionary contains 'type', 'id' (serial number), 'name',
firmware version, USB type, and other available specs, and the default profile properties (width, height, fps, format).
Raises:
OSError: If `raise_when_empty` is True and no cameras are detected,
or if pyrealsense2 is not installed.
ImportError: If pyrealsense2 is not installed.
"""
found_cameras_info = []
context = rs.context()
devices = context.query_devices()
if not devices:
logger.warning("No RealSense devices detected.")
if raise_when_empty:
raise OSError(
"No RealSense devices detected. Ensure cameras are connected, "
"library (`pyrealsense2`) is installed, and firmware is up-to-date."
)
for device in devices:
camera_info = {
"name": device.get_info(rs.camera_info.name),
"type": "RealSense",
"id": device.get_info(rs.camera_info.serial_number),
"firmware_version": device.get_info(rs.camera_info.firmware_version),
"usb_type_descriptor": device.get_info(rs.camera_info.usb_type_descriptor),
"physical_port": device.get_info(rs.camera_info.physical_port),
"product_id": device.get_info(rs.camera_info.product_id),
"product_line": device.get_info(rs.camera_info.product_line),
}
# Get stream profiles for each sensor
sensors = device.query_sensors()
for sensor in sensors:
profiles = sensor.get_stream_profiles()
for profile in profiles:
if profile.is_video_stream_profile() and profile.is_default():
vprofile = profile.as_video_stream_profile()
stream_info = {
"stream_type": vprofile.stream_name(),
"format": vprofile.format().name,
"width": vprofile.width(),
"height": vprofile.height(),
"fps": vprofile.fps(),
}
camera_info["default_stream_profile"] = stream_info
found_cameras_info.append(camera_info)
logger.debug(f"Found RealSense camera: {camera_info}")
logger.info(f"Detected RealSense cameras: {[cam['id'] for cam in found_cameras_info]}")
return found_cameras_info
def _find_serial_number_from_name(self, name: str) -> str:
"""Finds the serial number for a given unique camera name."""
camera_infos = self.find_cameras(raise_when_empty=True)
found_devices = [cam for cam in camera_infos if str(cam["name"]) == name]
if not found_devices:
available_names = [cam["name"] for cam in camera_infos]
raise ValueError(
f"No RealSense camera found with name '{name}'. Available camera names: {available_names}"
)
if len(found_devices) > 1:
serial_numbers = [dev["serial_number"] for dev in found_devices]
raise ValueError(
f"Multiple RealSense cameras found with name '{name}'. "
f"Please use a unique serial number instead. Found SNs: {serial_numbers}"
)
serial_number = str(found_devices[0]["serial_number"])
logger.info(f"Found serial number '{serial_number}' for camera name '{name}'.")
return serial_number
def _configure_realsense_settings(self) -> rs.config:
"""Creates and configures the RealSense pipeline configuration object."""
rs_config = rs.config()
rs.config.enable_device(rs_config, self.serial_number)
if self.width and self.height and self.fps:
logger.debug(
f"Requesting Color Stream: {self.prerotated_width}x{self.prerotated_height} @ {self.fps} FPS, Format: {rs.format.rgb8}"
)
rs_config.enable_stream(
rs.stream.color, self.prerotated_width, self.prerotated_height, rs.format.rgb8, self.fps
)
if self.use_depth:
logger.debug(
f"Requesting Depth Stream: {self.prerotated_width}x{self.prerotated_height} @ {self.fps} FPS, Format: {rs.format.z16}"
)
rs_config.enable_stream(
rs.stream.depth, self.prerotated_width, self.prerotated_height, rs.format.z16, self.fps
)
else:
logger.debug(f"Requesting Color Stream: Default settings, Format: {rs.stream.color}")
rs_config.enable_stream(rs.stream.color)
if self.use_depth:
logger.debug(f"Requesting Depth Stream: Default settings, Format: {rs.stream.depth}")
rs_config.enable_stream(rs.stream.depth)
return rs_config
def _validate_capture_settings(self) -> None:
"""
Validates if the actual stream settings match the requested configuration.
This method compares the requested FPS, width, and height against the
actual settings obtained from the active RealSense profile after the
pipeline has started.
Raises:
RuntimeError: If the actual camera settings significantly deviate
from the requested ones.
DeviceNotConnectedError: If the camera is not connected when attempting
to validate settings.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"Cannot validate settings for {self} as it is not connected.")
self._validate_fps(self.rs_profile.get_stream(rs.stream.color).as_video_stream_profile())
self._validate_width_and_height(self.rs_profile.get_stream(rs.stream.color).as_video_stream_profile())
if self.use_depth:
self._validate_fps(self.rs_profile.get_stream(rs.stream.depth).as_video_stream_profile())
self._validate_width_and_height(
self.rs_profile.get_stream(rs.stream.depth).as_video_stream_profile()
)
def connect(self, do_warmup_read: bool = True):
"""
Connects to the RealSense camera specified in the configuration.
Initializes the RealSense pipeline, configures the required streams (color
and optionally depth), starts the pipeline, and validates the actual stream settings.
Raises:
DeviceAlreadyConnectedError: If the camera is already connected.
ValueError: If the configuration is invalid (e.g., missing serial/name, name not unique).
ConnectionError: If the camera is found but fails to start the pipeline.
RuntimeError: If the pipeline starts but fails to apply requested settings.
OSError: If no RealSense devices are detected at all.
"""
if self.is_connected:
raise DeviceAlreadyConnectedError(f"{self} is already connected.")
logger.debug(f"Attempting to connect to camera {self.serial_number}...")
self.rs_pipeline = rs.pipeline()
rs_config = self._configure_realsense_settings()
try:
self.rs_profile = self.rs_pipeline.start(rs_config)
logger.debug(f"Successfully started pipeline for camera {self.serial_number}.")
except RuntimeError as e:
self.rs_profile = None
self.rs_pipeline = None
raise ConnectionError(
f"Failed to open RealSense camera {self.serial_number}. Error: {e}. "
f"Run 'python -m find_cameras list-cameras' for details."
) from e
logger.debug(f"Validating stream configuration for {self.serial_number}...")
self._validate_capture_settings()
if do_warmup_read:
logger.debug(f"Reading a warm-up frame for {self.serial_number}...")
self.read() # NOTE(Steven): For now we just read one frame, we could also loop for X frames/secs
logger.info(f"Camera {self.serial_number} connected and configured successfully.")
def _validate_fps(self, stream) -> None:
"""Validates and sets the internal FPS based on actual stream FPS."""
actual_fps = stream.fps()
if self.fps is None:
self.fps = actual_fps
logger.info(f"FPS not specified, using camera default: {self.fps} FPS.")
return
# Use math.isclose for robust float comparison
if not math.isclose(self.fps, actual_fps, rel_tol=1e-3):
logger.warning(
f"Requested FPS {self.fps} for {self}, but camera reported {actual_fps}. "
"This might be due to camera limitations."
)
raise RuntimeError(
f"Failed to set requested FPS {self.fps} for {self}. Actual value reported: {actual_fps}."
)
logger.debug(f"FPS set to {actual_fps} for {self}.")
def _validate_width_and_height(self, stream) -> None:
"""Validates and sets the internal capture width and height based on actual stream width."""
actual_width = int(round(stream.width()))
actual_height = int(round(stream.height()))
if self.width is None or self.height is None:
if self.rotation in [cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE]:
self.width, self.height = actual_height, actual_width
self.prerotated_width, self.prerotated_height = actual_width, actual_height
else:
self.width, self.height = actual_width, actual_height
self.prerotated_width, self.prerotated_height = actual_width, actual_height
logger.info(f"Capture width set to camera default: {self.width}.")
logger.info(f"Capture height set to camera default: {self.height}.")
return
if self.prerotated_width != actual_width:
logger.warning(
f"Requested capture width {self.prerotated_width} for {self}, but camera reported {actual_width}."
)
raise RuntimeError(
f"Failed to set requested capture width {self.prerotated_width} for {self}. Actual value: {actual_width}."
)
logger.debug(f"Capture width set to {actual_width} for {self}.")
if self.prerotated_height != actual_height:
logger.warning(
f"Requested capture height {self.prerotated_height} for {self}, but camera reported {actual_height}."
)
raise RuntimeError(
f"Failed to set requested capture height {self.prerotated_height} for {self}. Actual value: {actual_height}."
)
logger.debug(f"Capture height set to {actual_height} for {self}.")
def read_depth(self, timeout_ms: int = 5000) -> np.ndarray:
"""
Reads a single frame (depth) synchronously from the camera.
This is a blocking call. It waits for a coherent set of frames (depth)
from the camera hardware via the RealSense pipeline.
Args:
timeout_ms (int): Maximum time in milliseconds to wait for a frame. Defaults to 5000ms.
Returns:
np.ndarray: The depth map as a NumPy array (height, width)
of type `np.uint16` (raw depth values in millimeters) and rotation.
Raises:
DeviceNotConnectedError: If the camera is not connected.
RuntimeError: If reading frames from the pipeline fails or frames are invalid.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
if not self.use_depth:
raise RuntimeError(
f"Failed to capture depth frame from {self}. '.read_depth()'. Depth stream is not enabled."
)
start_time = time.perf_counter()
ret, frame = self.rs_pipeline.try_wait_for_frames(
timeout_ms=timeout_ms
) # NOTE(Steven): This read has a timeout
if not ret or frame is None:
raise RuntimeError(
f"Failed to capture frame from {self}. '.read_depth()' returned status={ret} and frame is None."
)
depth_frame = frame.get_depth_frame()
depth_map = np.asanyarray(depth_frame.get_data())
depth_map_processed = self._postprocess_image(depth_map)
read_duration_ms = (time.perf_counter() - start_time) * 1e3
logger.debug(f"{self} synchronous read took: {read_duration_ms:.1f}ms")
self.logs["timestamp_utc"] = capture_timestamp_utc()
return depth_map_processed
def read(self, color_mode: ColorMode | None = None, timeout_ms: int = 5000) -> np.ndarray:
"""
Reads a single frame (color) synchronously from the camera.
This is a blocking call. It waits for a coherent set of frames (color)
from the camera hardware via the RealSense pipeline.
Args:
timeout_ms (int): Maximum time in milliseconds to wait for a frame. Defaults to 5000ms.
Returns:
np.ndarray: The captured color frame as a NumPy array
(height, width, channels), processed according to `color_mode` and rotation.
Raises:
DeviceNotConnectedError: If the camera is not connected.
RuntimeError: If reading frames from the pipeline fails or frames are invalid.
ValueError: If an invalid `color_mode` is requested.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
start_time = time.perf_counter()
ret, frame = self.rs_pipeline.try_wait_for_frames(
timeout_ms=timeout_ms
) # NOTE(Steven): This read has a timeout while opencv doesn't
if not ret or frame is None:
raise RuntimeError(
f"Failed to capture frame from {self}. '.read()' returned status={ret} and frame is None."
)
color_frame = frame.get_color_frame()
color_image_raw = np.asanyarray(color_frame.get_data())
color_image_processed = self._postprocess_image(color_image_raw, color_mode)
read_duration_ms = (time.perf_counter() - start_time) * 1e3
logger.debug(f"{self} synchronous read took: {read_duration_ms:.1f}ms")
self.logs["timestamp_utc"] = capture_timestamp_utc()
return color_image_processed
def _postprocess_image(self, image: np.ndarray, color_mode: ColorMode | None = None) -> np.ndarray:
"""
Applies color conversion, dimension validation, and rotation to a raw color frame.
Args:
image (np.ndarray): The raw image frame (expected RGB format from RealSense).
color_mode (Optional[ColorMode]): The target color mode (RGB or BGR). If None,
uses the instance's default `self.color_mode`.
Returns:
np.ndarray: The processed image frame according to `self.color_mode` and `self.rotation`.
Raises:
ValueError: If the requested `color_mode` is invalid.
RuntimeError: If the raw frame dimensions do not match the configured
`width` and `height`.
"""
if color_mode and color_mode not in (ColorMode.RGB, ColorMode.BGR):
raise ValueError(
f"Invalid requested color mode '{color_mode}'. Expected {ColorMode.RGB} or {ColorMode.BGR}."
)
h, w, c = image.shape
if h != self.prerotated_height or w != self.prerotated_width:
raise RuntimeError(
f"Captured frame dimensions ({h}x{w}) do not match configured capture dimensions ({self.prerotated_height}x{self.prerotated_width}) for {self}."
)
if c != self.channels:
logger.warning(
f"Captured frame channels ({c}) do not match configured channels ({self.channels}) for {self}."
)
processed_image = image
if self.color_mode == ColorMode.BGR:
processed_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
logger.debug(f"Converted frame from RGB to BGR for {self}.")
if self.rotation in [cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE]:
processed_image = cv2.rotate(processed_image, self.rotation)
logger.debug(f"Rotated frame by {self.config.rotation} degrees for {self}.")
return processed_image
def _read_loop(self):
"""
Internal loop run by the background thread for asynchronous reading.
Continuously reads frames (color and optional depth) using `read()`
and places the latest result (single image or tuple) into the `frame_queue`.
It overwrites any previous frame in the queue.
"""
logger.debug(f"Starting read loop thread for {self}.")
while not self.stop_event.is_set():
try:
frame_data = self.read(timeout_ms=500)
with contextlib.suppress(queue.Empty):
_ = self.frame_queue.get_nowait()
self.frame_queue.put(frame_data)
logger.debug(f"Frame data placed in queue for {self}.")
except DeviceNotConnectedError:
logger.error(f"Read loop for {self} stopped: Camera disconnected.")
break
except Exception as e:
logger.warning(f"Error reading frame in background thread for {self}: {e}")
logger.debug(f"Stopping read loop thread for {self}.")
def _ensure_read_thread_running(self):
"""Starts or restarts the background read thread if it's not running."""
if self.thread is not None and self.thread.is_alive():
self.thread.join(timeout=0.1)
if self.stop_event is not None:
self.stop_event.set()
self.stop_event = Event()
self.thread = Thread(
target=self._read_loop, args=(), name=f"RealSenseReadLoop-{self}-{self.serial_number}"
)
self.thread.daemon = True
self.thread.start()
logger.debug(f"Read thread started for {self}.")
# NOTE(Steven): Missing implementation for depth for now
def async_read(self, timeout_ms: float = 2000) -> np.ndarray:
"""
Reads the latest available frame data (color or color+depth) asynchronously.
This method retrieves the most recent frame captured by the background
read thread. It does not block waiting for the camera hardware directly,
only waits for a frame to appear in the internal queue up to the specified
timeout.
Args:
timeout_ms (float): Maximum time in milliseconds to wait for a frame
to become available in the queue. Defaults to 2000ms (2 seconds).
Returns:
np.ndarray:
The latest captured frame data (color image), processed according to configuration.
Raises:
DeviceNotConnectedError: If the camera is not connected.
TimeoutError: If no frame data becomes available within the specified timeout.
RuntimeError: If the background thread died unexpectedly or another queue error occurs.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
if self.thread is None or not self.thread.is_alive():
self._ensure_read_thread_running()
try:
return self.frame_queue.get(timeout=timeout_ms / 1000.0)
except queue.Empty as e:
thread_alive = self.thread is not None and self.thread.is_alive()
logger.error(
f"Timeout waiting for frame from {self} queue after {timeout_ms}ms. "
f"(Read thread alive: {thread_alive})"
)
raise TimeoutError(
f"Timed out waiting for frame from camera {self.serial_number} after {timeout_ms} ms. "
f"Read thread alive: {thread_alive}."
) from e
except Exception as e:
logger.exception(f"Unexpected error getting frame data from queue for {self}: {e}")
raise RuntimeError(
f"Error getting frame data from queue for camera {self.serial_number}: {e}"
) from e
def _shutdown_read_thread(self):
"""Signals the background read thread to stop and waits for it to join."""
if self.stop_event is not None:
logger.debug(f"Signaling stop event for read thread of {self}.")
self.stop_event.set()
if self.thread is not None and self.thread.is_alive():
logger.debug(f"Waiting for read thread of {self} to join...")
self.thread.join(timeout=2.0)
if self.thread.is_alive():
logger.warning(f"Read thread for {self} did not terminate gracefully after 2 seconds.")
else:
logger.debug(f"Read thread for {self} joined successfully.")
self.thread = None
self.stop_event = None
def disconnect(self):
"""
Disconnects from the camera, stops the pipeline, and cleans up resources.
Stops the background read thread (if running) and stops the RealSense pipeline.
Raises:
DeviceNotConnectedError: If the camera is already disconnected (pipeline not running).
"""
if not self.is_connected and self.thread is None:
raise DeviceNotConnectedError(
f"Attempted to disconnect {self}, but it appears already disconnected."
)
logger.debug(f"Disconnecting from camera {self.serial_number}...")
if self.thread is not None:
self._shutdown_read_thread()
if self.rs_pipeline is not None:
logger.debug(f"Stopping RealSense pipeline object for {self}.")
self.rs_pipeline.stop()
self.rs_pipeline = None
self.rs_profile = None
logger.info(f"Camera {self.serial_number} disconnected successfully.")

View File

@@ -1,87 +0,0 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from ..configs import CameraConfig, ColorMode, Cv2Rotation
@CameraConfig.register_subclass("intelrealsense")
@dataclass
class RealSenseCameraConfig(CameraConfig):
"""Configuration class for Intel RealSense cameras.
This class provides specialized configuration options for Intel RealSense cameras,
including support for depth sensing and device identification via serial number or name.
Example configurations for Intel RealSense D405:
```python
# Basic configurations
RealSenseCameraConfig(128422271347, 30, 1280, 720) # 1280x720 @ 30FPS
RealSenseCameraConfig(128422271347, 60, 640, 480) # 640x480 @ 60FPS
# Advanced configurations
RealSenseCameraConfig(128422271347, 30, 640, 480, use_depth=True) # With depth sensing
RealSenseCameraConfig(128422271347, 30, 640, 480, rotation=Cv2Rotation.ROTATE_90) # With 90° rotation
```
Attributes:
fps: Requested frames per second for the color stream.
width: Requested frame width in pixels for the color stream.
height: Requested frame height in pixels for the color stream.
name: Optional human-readable name to identify the camera.
serial_number: Optional unique serial number to identify the camera.
Either name or serial_number must be provided.
color_mode: Color mode for image output (RGB or BGR). Defaults to RGB.
channels: Number of color channels (currently only 3 is supported).
use_depth: Whether to enable depth stream. Defaults to False.
rotation: Image rotation setting (0°, 90°, 180°, or 270°). Defaults to no rotation.
Note:
- Either name or serial_number must be specified, but not both.
- Depth stream configuration (if enabled) will use the same FPS as the color stream.
- The actual resolution and FPS may be adjusted by the camera to the nearest supported mode.
- Only 3-channel color output (RGB/BGR) is currently supported.
"""
name: str | None = None
serial_number: int | None = None
color_mode: ColorMode = ColorMode.RGB
channels: int | None = 3
use_depth: bool = False
rotation: Cv2Rotation = Cv2Rotation.NO_ROTATION # NOTE(Steven): Check if draccus can parse to an enum
def __post_init__(self):
if self.color_mode not in (ColorMode.RGB, ColorMode.BGR):
raise ValueError(
f"`color_mode` is expected to be {ColorMode.RGB.value} or {ColorMode.BGR.value}, but {self.color_mode} is provided."
)
if self.rotation not in (
Cv2Rotation.NO_ROTATION,
Cv2Rotation.ROTATE_90,
Cv2Rotation.ROTATE_180,
Cv2Rotation.ROTATE_270,
):
raise ValueError(
f"`rotation` is expected to be in {(Cv2Rotation.NO_ROTATION, Cv2Rotation.ROTATE_90, Cv2Rotation.ROTATE_180, Cv2Rotation.ROTATE_270)}, but {self.rotation} is provided."
)
if self.channels != 3:
raise NotImplementedError(f"Unsupported number of channels: {self.channels}")
if bool(self.name) and bool(self.serial_number):
raise ValueError(
f"One of them must be set: name or serial_number, but {self.name=} and {self.serial_number=} provided."
)

View File

@@ -1,16 +0,0 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .camera_opencv import OpenCVCamera
from .configuration_opencv import OpenCVCameraConfig

View File

@@ -1,555 +0,0 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Provides the OpenCVCamera class for capturing frames from cameras using OpenCV.
"""
import contextlib
import logging
import math
import platform
import queue
import time
from pathlib import Path
from threading import Event, Thread
from typing import Any, Dict, List
import cv2
import numpy as np
from lerobot.common.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
from lerobot.common.utils.utils import capture_timestamp_utc
from ..camera import Camera
from ..utils import IndexOrPath, get_cv2_backend, get_cv2_rotation
from .configuration_opencv import ColorMode, OpenCVCameraConfig
# NOTE(Steven): The maximum opencv device index depends on your operating system. For instance,
# if you have 3 cameras, they should be associated to index 0, 1, and 2. This is the case
# on MacOS. However, on Ubuntu, the indices are different like 6, 16, 23.
# When you change the USB port or reboot the computer, the operating system might
# treat the same cameras as new devices. Thus we select a higher bound to search indices.
MAX_OPENCV_INDEX = 60
logger = logging.getLogger(__name__)
class OpenCVCamera(Camera):
"""
Manages camera interactions using OpenCV for efficient frame recording.
This class provides a high-level interface to connect to, configure, and read
frames from cameras compatible with OpenCV's VideoCapture. It supports both
synchronous and asynchronous frame reading.
An OpenCVCamera instance requires a camera index (e.g., 0) or a device path
(e.g., '/dev/video0' on Linux). Camera indices can be unstable across reboots
or port changes, especially on Linux. Use the provided utility script to find
available camera indices or paths:
```bash
python -m lerobot.find_cameras
```
The camera's default settings (FPS, resolution, color mode) are used unless
overridden in the configuration.
Args:
config (OpenCVCameraConfig): Configuration object containing settings like
camera index/path, desired FPS, width, height, color mode, and rotation.
Example:
```python
from lerobot.common.cameras.opencv import OpenCVCamera
from lerobot.common.cameras.configuration_opencv import OpenCVCameraConfig, ColorMode
# Basic usage with camera index 0
config = OpenCVCameraConfig(index_or_path=0)
camera = OpenCVCamera(config)
try:
camera.connect()
print(f"Connected to {camera}")
color_image = camera.read() # Synchronous read
print(f"Read frame shape: {color_image.shape}")
async_image = camera.async_read() # Asynchronous read
print(f"Async read frame shape: {async_image.shape}")
except Exception as e:
print(f"An error occurred: {e}")
finally:
camera.disconnect()
print(f"Disconnected from {camera}")
# Example with custom settings
custom_config = OpenCVCameraConfig(
index_or_path='/dev/video0', # Or use an index
fps=30,
width=1280,
height=720,
color_mode=ColorMode.RGB,
rotation=90
)
custom_camera = OpenCVCamera(custom_config)
# ... connect, read, disconnect ...
```
"""
def __init__(self, config: OpenCVCameraConfig):
"""
Initializes the OpenCVCamera instance.
Args:
config: The configuration settings for the camera.
"""
super().__init__(config)
self.config = config
self.index_or_path: IndexOrPath = config.index_or_path
self.fps: int | None = config.fps
self.channels: int = config.channels
self.color_mode: ColorMode = config.color_mode
self.videocapture_camera: cv2.VideoCapture | None = None
self.thread: Thread | None = None
self.stop_event: Event | None = None
self.frame_queue: queue.Queue = queue.Queue(maxsize=1)
self.logs: dict = {} # NOTE(Steven): Might be removed in the future
self.rotation: int | None = get_cv2_rotation(config.rotation)
self.backend: int = get_cv2_backend() # NOTE(Steven): If we specify backend the opencv open fails
if self.height and self.width:
if self.rotation in [cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE]:
self.prerotated_width, self.prerotated_height = self.height, self.width
else:
self.prerotated_width, self.prerotated_height = self.width, self.height
def __str__(self) -> str:
"""Returns a string representation of the camera instance."""
return f"{self.__class__.__name__}({self.index_or_path})"
@property
def is_connected(self) -> bool:
"""Checks if the camera is currently connected and opened."""
return isinstance(self.videocapture_camera, cv2.VideoCapture) and self.videocapture_camera.isOpened()
def _configure_capture_settings(self) -> None:
"""
Applies the specified FPS, width, and height settings to the connected camera.
This method attempts to set the camera properties via OpenCV. It checks if
the camera successfully applied the settings and raises an error if not.
Args:
fps: The desired frames per second. If None, the setting is skipped.
width: The desired capture width. If None, the setting is skipped.
height: The desired capture height. If None, the setting is skipped.
Raises:
RuntimeError: If the camera fails to set any of the specified properties
to the requested value.
DeviceNotConnectedError: If the camera is not connected when attempting
to configure settings.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"Cannot configure settings for {self} as it is not connected.")
self._validate_fps()
self._validate_width_and_height()
def connect(self, do_warmup_read: bool = True):
"""
Connects to the OpenCV camera specified in the configuration.
Initializes the OpenCV VideoCapture object, sets desired camera properties
(FPS, width, height), and performs initial checks.
Raises:
DeviceAlreadyConnectedError: If the camera is already connected.
ValueError: If the specified camera index/path is not found or accessible.
ConnectionError: If the camera is found but fails to open.
RuntimeError: If the camera opens but fails to apply requested FPS/resolution settings.
"""
if self.is_connected:
raise DeviceAlreadyConnectedError(f"{self} is already connected.")
# Use 1 thread for OpenCV operations to avoid potential conflicts or
# blocking in multi-threaded applications, especially during data collection.
cv2.setNumThreads(1)
logger.debug(f"Attempting to connect to camera {self.index_or_path} using backend {self.backend}...")
self.videocapture_camera = cv2.VideoCapture(self.index_or_path)
if not self.videocapture_camera.isOpened():
self.videocapture_camera.release()
self.videocapture_camera = None
raise ConnectionError(
f"Failed to open OpenCV camera {self.index_or_path}."
f"Run 'python -m find_cameras list-cameras' for details."
)
logger.debug(f"Successfully opened camera {self.index_or_path}. Applying configuration...")
self._configure_capture_settings()
if do_warmup_read:
logger.debug(f"Reading a warm-up frame for {self.index_or_path}...")
self.read() # NOTE(Steven): For now we just read one frame, we could also loop for X frames/secs
logger.debug(f"Camera {self.index_or_path} connected and configured successfully.")
def _validate_fps(self) -> None:
"""Validates and sets the camera's frames per second (FPS)."""
if self.fps is None:
self.fps = self.videocapture_camera.get(cv2.CAP_PROP_FPS)
logger.info(f"FPS set to camera default: {self.fps}.")
return
success = self.videocapture_camera.set(cv2.CAP_PROP_FPS, float(self.fps))
actual_fps = self.videocapture_camera.get(cv2.CAP_PROP_FPS)
# Use math.isclose for robust float comparison
if not success or not math.isclose(self.fps, actual_fps, rel_tol=1e-3):
logger.warning(
f"Requested FPS {self.fps} for {self}, but camera reported {actual_fps} (set success: {success}). "
"This might be due to camera limitations."
)
raise RuntimeError(
f"Failed to set requested FPS {self.fps} for {self}. Actual value reported: {actual_fps}."
)
logger.debug(f"FPS set to {actual_fps} for {self}.")
def _validate_width_and_height(self) -> None:
"""Validates and sets the camera's frame capture width and height."""
default_width = int(round(self.videocapture_camera.get(cv2.CAP_PROP_FRAME_WIDTH)))
default_height = int(round(self.videocapture_camera.get(cv2.CAP_PROP_FRAME_HEIGHT)))
if self.width is None or self.height is None:
if self.rotation in [cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE]:
self.width, self.height = default_height, default_width
self.prerotated_width, self.prerotated_height = default_width, default_height
else:
self.width, self.height = default_width, default_height
self.prerotated_width, self.prerotated_height = default_width, default_height
logger.info(f"Capture width set to camera default: {self.width}.")
logger.info(f"Capture height set to camera default: {self.height}.")
return
success = self.videocapture_camera.set(cv2.CAP_PROP_FRAME_WIDTH, float(self.prerotated_width))
actual_width = int(round(self.videocapture_camera.get(cv2.CAP_PROP_FRAME_WIDTH)))
if not success or self.prerotated_width != actual_width:
logger.warning(
f"Requested capture width {self.prerotated_width} for {self}, but camera reported {actual_width} (set success: {success})."
)
raise RuntimeError(
f"Failed to set requested capture width {self.prerotated_width} for {self}. Actual value: {actual_width}."
)
logger.debug(f"Capture width set to {actual_width} for {self}.")
success = self.videocapture_camera.set(cv2.CAP_PROP_FRAME_HEIGHT, float(self.prerotated_height))
actual_height = int(round(self.videocapture_camera.get(cv2.CAP_PROP_FRAME_HEIGHT)))
if not success or self.prerotated_height != actual_height:
logger.warning(
f"Requested capture height {self.prerotated_height} for {self}, but camera reported {actual_height} (set success: {success})."
)
raise RuntimeError(
f"Failed to set requested capture height {self.prerotated_height} for {self}. Actual value: {actual_height}."
)
logger.debug(f"Capture height set to {actual_height} for {self}.")
@staticmethod
def find_cameras(
max_index_search_range=MAX_OPENCV_INDEX, raise_when_empty: bool = True
) -> List[Dict[str, Any]]:
"""
Detects available OpenCV cameras connected to the system.
On Linux, it scans '/dev/video*' paths. On other systems (like macOS, Windows),
it checks indices from 0 up to `max_index_search_range`.
Args:
max_index_search_range (int): The maximum index to check on non-Linux systems.
raise_when_empty (bool): If True, raises an OSError if no cameras are found.
Returns:
List[Dict[str, Any]]: A list of dictionaries,
where each dictionary contains 'type', 'id' (port index or path),
and the default profile properties (width, height, fps, format).
"""
found_cameras_info = []
if platform.system() == "Linux":
logger.info("Linux detected. Scanning '/dev/video*' device paths...")
possible_paths = sorted(Path("/dev").glob("video*"), key=lambda p: p.name)
targets_to_scan = [str(p) for p in possible_paths]
logger.debug(f"Found potential paths: {targets_to_scan}")
else:
logger.info(
f"{platform.system()} system detected. Scanning indices from 0 to {max_index_search_range}..."
)
targets_to_scan = list(range(max_index_search_range))
for target in targets_to_scan:
camera = cv2.VideoCapture(target)
if camera.isOpened():
default_width = int(camera.get(cv2.CAP_PROP_FRAME_WIDTH))
default_height = int(camera.get(cv2.CAP_PROP_FRAME_HEIGHT))
default_fps = camera.get(cv2.CAP_PROP_FPS)
default_format = camera.get(cv2.CAP_PROP_FORMAT)
camera_info = {
"name": f"OpenCV Camera @ {target}",
"type": "OpenCV",
"id": target,
"backend_api": camera.getBackendName(),
"default_stream_profile": {
"format": default_format,
"width": default_width,
"height": default_height,
"fps": default_fps,
},
}
found_cameras_info.append(camera_info)
logger.debug(f"Found OpenCV camera:: {camera_info}")
camera.release()
if not found_cameras_info:
logger.warning("No OpenCV devices detected.")
if raise_when_empty:
raise OSError("No OpenCV devices detected. Ensure cameras are connected.")
logger.info(f"Detected OpenCV cameras: {[cam['id'] for cam in found_cameras_info]}")
return found_cameras_info
def read(self, color_mode: ColorMode | None = None) -> np.ndarray:
"""
Reads a single frame synchronously from the camera.
This is a blocking call. It waits for the next available frame from the
camera hardware via OpenCV.
Args:
color_mode (Optional[ColorMode]): If specified, overrides the default
color mode (`self.color_mode`) for this read operation (e.g.,
request RGB even if default is BGR).
Returns:
np.ndarray: The captured frame as a NumPy array in the format
(height, width, channels), using the specified or default
color mode and applying any configured rotation.
Raises:
DeviceNotConnectedError: If the camera is not connected.
RuntimeError: If reading the frame from the camera fails or if the
received frame dimensions don't match expectations before rotation.
ValueError: If an invalid `color_mode` is requested.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
start_time = time.perf_counter()
# NOTE(Steven): Are we okay with this blocking an undefined amount of time?
ret, frame = self.videocapture_camera.read()
if not ret or frame is None:
raise RuntimeError(
f"Failed to capture frame from {self}. '.read()' returned status={ret} and frame is None."
)
# Post-process the frame (color conversion, dimension check, rotation)
processed_frame = self._postprocess_image(frame, color_mode)
read_duration_ms = (time.perf_counter() - start_time) * 1e3
logger.debug(f"{self} synchronous read took: {read_duration_ms:.1f}ms")
self.logs["timestamp_utc"] = capture_timestamp_utc()
return processed_frame
def _postprocess_image(self, image: np.ndarray, color_mode: ColorMode | None = None) -> np.ndarray:
"""
Applies color conversion, dimension validation, and rotation to a raw frame.
Args:
image (np.ndarray): The raw image frame (expected BGR format from OpenCV).
color_mode (Optional[ColorMode]): The target color mode (RGB or BGR). If None,
uses the instance's default `self.color_mode`.
Returns:
np.ndarray: The processed image frame.
Raises:
ValueError: If the requested `color_mode` is invalid.
RuntimeError: If the raw frame dimensions do not match the configured
`width` and `height`.
"""
requested_color_mode = self.color_mode if color_mode is None else color_mode
if requested_color_mode not in (ColorMode.RGB, ColorMode.BGR):
raise ValueError(
f"Invalid requested color mode '{requested_color_mode}'. Expected {ColorMode.RGB} or {ColorMode.BGR}."
)
h, w, c = image.shape
if h != self.prerotated_height or w != self.prerotated_width:
raise RuntimeError(
f"Captured frame dimensions ({h}x{w}) do not match configured capture dimensions ({self.prerotated_height}x{self.prerotated_width}) for {self}."
)
if c != self.channels:
logger.warning(
f"Captured frame channels ({c}) do not match configured channels ({self.channels}) for {self}."
)
processed_image = image
if requested_color_mode == ColorMode.RGB:
processed_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
logger.debug(f"Converted frame from BGR to RGB for {self}.")
if self.rotation in [cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE]:
processed_image = cv2.rotate(processed_image, self.rotation)
logger.debug(f"Rotated frame by {self.config.rotation} degrees for {self}.")
return processed_image
def _read_loop(self):
"""
Internal loop run by the background thread for asynchronous reading.
Continuously reads frames from the camera using the synchronous `read()`
method and places the latest frame into the `frame_queue`. It overwrites
any previous frame in the queue.
"""
logger.debug(f"Starting read loop thread for {self}.")
while not self.stop_event.is_set():
try:
color_image = self.read()
with contextlib.suppress(queue.Empty):
_ = self.frame_queue.get_nowait()
self.frame_queue.put(color_image)
logger.debug(f"Frame placed in queue for {self}.")
except DeviceNotConnectedError:
logger.error(f"Read loop for {self} stopped: Camera disconnected.")
break
except Exception as e:
logger.warning(f"Error reading frame in background thread for {self}: {e}")
logger.debug(f"Stopping read loop thread for {self}.")
def _ensure_read_thread_running(self):
"""Starts or restarts the background read thread if it's not running."""
if self.thread is not None and self.thread.is_alive():
self.thread.join(timeout=0.1)
if self.stop_event is not None:
self.stop_event.set()
self.stop_event = Event()
self.thread = Thread(
target=self._read_loop, args=(), name=f"OpenCVCameraReadLoop-{self}-{self.index_or_path}"
)
self.thread.daemon = True
self.thread.start()
logger.debug(f"Read thread started for {self}.")
def async_read(self, timeout_ms: float = 2000) -> np.ndarray:
"""
Reads the latest available frame asynchronously.
This method retrieves the most recent frame captured by the background
read thread. It does not block waiting for the camera hardware directly,
only waits for a frame to appear in the internal queue up to the specified
timeout.
Args:
timeout_ms (float): Maximum time in milliseconds to wait for a frame
to become available in the queue. Defaults to 2000ms (2 seconds).
Returns:
np.ndarray: The latest captured frame as a NumPy array in the format
(height, width, channels), processed according to configuration.
Raises:
DeviceNotConnectedError: If the camera is not connected.
TimeoutError: If no frame becomes available within the specified timeout.
RuntimeError: If an unexpected error occurs while retrieving from the queue.
"""
if not self.is_connected:
raise DeviceNotConnectedError(f"{self} is not connected.")
if self.thread is None or not self.thread.is_alive():
self._ensure_read_thread_running()
try:
return self.frame_queue.get(timeout=timeout_ms / 1000.0)
except queue.Empty as e:
thread_alive = self.thread is not None and self.thread.is_alive()
logger.error(
f"Timeout waiting for frame from {self} queue after {timeout_ms}ms. "
f"(Read thread alive: {thread_alive})"
)
raise TimeoutError(
f"Timed out waiting for frame from camera {self.index_or_path} after {timeout_ms} ms. "
f"Read thread alive: {thread_alive}."
) from e
except Exception as e:
logger.exception(f"Unexpected error getting frame from queue for {self}: {e}")
raise RuntimeError(f"Error getting frame from queue for camera {self.index_or_path}: {e}") from e
def _shutdown_read_thread(self):
"""Signals the background read thread to stop and waits for it to join."""
if self.stop_event is not None:
logger.debug(f"Signaling stop event for read thread of {self}.")
self.stop_event.set()
if self.thread is not None and self.thread.is_alive():
logger.debug(f"Waiting for read thread of {self} to join...")
self.thread.join(timeout=2.0)
if self.thread.is_alive():
logger.warning(f"Read thread for {self} did not terminate gracefully after 2 seconds.")
else:
logger.debug(f"Read thread for {self} joined successfully.")
self.thread = None
self.stop_event = None
def disconnect(self):
"""
Disconnects from the camera and cleans up resources.
Stops the background read thread (if running) and releases the OpenCV
VideoCapture object.
Raises:
DeviceNotConnectedError: If the camera is already disconnected.
"""
if not self.is_connected and self.thread is None:
raise DeviceNotConnectedError(
f"Attempted to disconnect {self}, but it appears already disconnected."
)
logger.debug(f"Disconnecting from camera {self.index_or_path}...")
if self.thread is not None:
self._shutdown_read_thread()
if self.videocapture_camera is not None:
logger.debug(f"Releasing OpenCV VideoCapture object for {self}.")
self.videocapture_camera.release()
self.videocapture_camera = None
logger.info(f"Camera {self.index_or_path} disconnected successfully.")

View File

@@ -1,76 +0,0 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from pathlib import Path
from ..configs import CameraConfig, ColorMode, Cv2Rotation
@CameraConfig.register_subclass("opencv")
@dataclass
class OpenCVCameraConfig(CameraConfig):
"""Configuration class for OpenCV-based camera devices or video files.
This class provides configuration options for cameras accessed through OpenCV,
supporting both physical camera devices and video files. It includes settings
for resolution, frame rate, color mode, and image rotation.
Example configurations:
```python
# Basic configurations
OpenCVCameraConfig(0, 30, 1280, 720) # 1280x720 @ 30FPS
OpenCVCameraConfig(/dev/video4, 60, 640, 480) # 640x480 @ 60FPS
# Advanced configurations
OpenCVCameraConfig(128422271347, 30, 640, 480, rotation=Cv2Rotation.ROTATE_90) # With 90° rotation
```
Attributes:
index_or_path: Either an integer representing the camera device index,
or a Path object pointing to a video file.
fps: Requested frames per second for the color stream.
width: Requested frame width in pixels for the color stream.
height: Requested frame height in pixels for the color stream.
color_mode: Color mode for image output (RGB or BGR). Defaults to RGB.
channels: Number of color channels (currently only 3 is supported).
rotation: Image rotation setting (0°, 90°, 180°, or 270°). Defaults to no rotation.
Note:
- Only 3-channel color output (RGB/BGR) is currently supported.
"""
index_or_path: int | Path
color_mode: ColorMode = ColorMode.RGB
channels: int = 3 # NOTE(Steven): Why is this a config?
rotation: Cv2Rotation = Cv2Rotation.NO_ROTATION
def __post_init__(self):
if self.color_mode not in (ColorMode.RGB, ColorMode.BGR):
raise ValueError(
f"`color_mode` is expected to be {ColorMode.RGB.value} or {ColorMode.BGR.value}, but {self.color_mode} is provided."
)
if self.rotation not in (
Cv2Rotation.NO_ROTATION,
Cv2Rotation.ROTATE_90,
Cv2Rotation.ROTATE_180,
Cv2Rotation.ROTATE_270,
):
raise ValueError(
f"`rotation` is expected to be in {(Cv2Rotation.NO_ROTATION, Cv2Rotation.ROTATE_90, Cv2Rotation.ROTATE_180, Cv2Rotation.ROTATE_270)}, but {self.rotation} is provided."
)
if self.channels != 3:
raise NotImplementedError(f"Unsupported number of channels: {self.channels}")

View File

@@ -1,73 +0,0 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import platform
from pathlib import Path
from typing import TypeAlias
import numpy as np
from PIL import Image
from .camera import Camera
from .configs import CameraConfig, Cv2Rotation
IndexOrPath: TypeAlias = int | Path
def make_cameras_from_configs(camera_configs: dict[str, CameraConfig]) -> dict[str, Camera]:
cameras = {}
for key, cfg in camera_configs.items():
if cfg.type == "opencv":
from .opencv import OpenCVCamera
cameras[key] = OpenCVCamera(cfg)
elif cfg.type == "intelrealsense":
from .intel.camera_realsense import RealSenseCamera
cameras[key] = RealSenseCamera(cfg)
else:
raise ValueError(f"The motor type '{cfg.type}' is not valid.")
return cameras
def get_cv2_rotation(rotation: Cv2Rotation) -> int:
import cv2
return {
Cv2Rotation.ROTATE_270: cv2.ROTATE_90_COUNTERCLOCKWISE,
Cv2Rotation.ROTATE_90: cv2.ROTATE_90_CLOCKWISE,
Cv2Rotation.ROTATE_180: cv2.ROTATE_180,
}.get(rotation)
def get_cv2_backend() -> int:
import cv2
return {
"Linux": cv2.CAP_DSHOW,
"Windows": cv2.CAP_AVFOUNDATION,
"Darwin": cv2.CAP_ANY,
}.get(platform.system(), cv2.CAP_V4L2)
def save_image(img_array: np.ndarray, camera_index: int, frame_index: int, images_dir: Path):
img = Image.fromarray(img_array)
path = images_dir / f"camera_{camera_index:02d}_frame_{frame_index:06d}.png"
path.parent.mkdir(parents=True, exist_ok=True)
img.save(str(path), quality=100)

View File

@@ -1,52 +0,0 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# keys
import os
from pathlib import Path
from huggingface_hub.constants import HF_HOME
OBS_ENV_STATE = "observation.environment_state"
OBS_STATE = "observation.state"
OBS_IMAGE = "observation.image"
OBS_IMAGES = "observation.images"
ACTION = "action"
ROBOTS = "robots"
TELEOPERATORS = "teleoperators"
# files & directories
CHECKPOINTS_DIR = "checkpoints"
LAST_CHECKPOINT_LINK = "last"
PRETRAINED_MODEL_DIR = "pretrained_model"
TRAINING_STATE_DIR = "training_state"
RNG_STATE = "rng_state.safetensors"
TRAINING_STEP = "training_step.json"
OPTIMIZER_STATE = "optimizer_state.safetensors"
OPTIMIZER_PARAM_GROUPS = "optimizer_param_groups.json"
SCHEDULER_STATE = "scheduler_state.json"
if "LEROBOT_HOME" in os.environ:
raise ValueError(
f"You have a 'LEROBOT_HOME' environment variable set to '{os.getenv('LEROBOT_HOME')}'.\n"
"'LEROBOT_HOME' is deprecated, please use 'HF_LEROBOT_HOME' instead."
)
# cache dir
default_cache_path = Path(HF_HOME) / "lerobot"
HF_LEROBOT_HOME = Path(os.getenv("HF_LEROBOT_HOME", default_cache_path)).expanduser()
# calibration dir
default_calibration_path = HF_LEROBOT_HOME / "calibration"
HF_LEROBOT_CALIBRATION = Path(os.getenv("HF_LEROBOT_CALIBRATION", default_calibration_path)).expanduser()

View File

@@ -0,0 +1,334 @@
# Video benchmark
## Questions
What is the optimal trade-off between:
- maximizing loading time with random access,
- minimizing memory space on disk,
- maximizing success rate of policies?
How to encode videos?
- How much compression (`-crf`)? Low compression with `0`, normal compression with `20` or extreme with `56`?
- What pixel format to use (`-pix_fmt`)? `yuv444p` or `yuv420p`?
- How many key frames (`-g`)? A key frame every `10` frames?
How to decode videos?
- Which `decoder`? `torchvision`, `torchaudio`, `ffmpegio`, `decord`, or `nvc`?
## Metrics
**Percentage of data compression (higher is better)**
`compression_factor` is the ratio of the memory space on disk taken by the original images to encode, to the memory space taken by the encoded video. For instance, `compression_factor=4` means that the video takes 4 times less memory space on disk compared to the original images.
**Percentage of loading time (higher is better)**
`load_time_factor` is the ratio of the time it takes to load original images at given timestamps, to the time it takes to decode the exact same frames from the video. Higher is better. For instance, `load_time_factor=0.5` means that decoding from video is 2 times slower than loading the original images.
**Average L2 error per pixel (lower is better)**
`avg_per_pixel_l2_error` is the average L2 error between each decoded frame and its corresponding original image over all requested timestamps, and also divided by the number of pixels in the image to be comparable when switching to different image sizes.
**Loss of a pretrained policy (higher is better)** (not available)
`loss_pretrained` is the result of evaluating with the selected encoding/decoding settings a policy pretrained on original images. It is easier to understand than `avg_l2_error`.
**Success rate after retraining (higher is better)** (not available)
`success_rate` is the result of training and evaluating a policy with the selected encoding/decoding settings. It is the most difficult metric to get but also the very best.
## Variables
**Image content**
We don't expect the same optimal settings for a dataset of images from a simulation, or from real-world in an appartment, or in a factory, or outdoor, etc. Hence, we run this benchmark on two datasets: `pusht` (simulation) and `umi` (real-world outdoor).
**Requested timestamps**
In this benchmark, we focus on the loading time of random access, so we are not interested in sequentially loading all frames of a video like in a movie. However, the number of consecutive timestamps requested and their spacing can greatly affect the `load_time_factor`. In fact, it is expected to get faster loading time by decoding a large number of consecutive frames from a video, than to load the same data from individual images. To reflect our robotics use case, we consider a few settings:
- `single_frame`: 1 frame,
- `2_frames`: 2 consecutive frames (e.g. `[t, t + 1 / fps]`),
- `2_frames_4_space`: 2 consecutive frames with 4 frames of spacing (e.g `[t, t + 4 / fps]`),
**Data augmentations**
We might revisit this benchmark and find better settings if we train our policies with various data augmentations to make them more robust (e.g. robust to color changes, compression, etc.).
## Results
**`decoder`**
| repo_id | decoder | load_time_factor | avg_per_pixel_l2_error |
| --- | --- | --- | --- |
| lerobot/pusht | <span style="color: #32CD32;">torchvision</span> | 0.166 | 0.0000119 |
| lerobot/pusht | ffmpegio | 0.009 | 0.0001182 |
| lerobot/pusht | torchaudio | 0.138 | 0.0000359 |
| lerobot/umi_cup_in_the_wild | <span style="color: #32CD32;">torchvision</span> | 0.174 | 0.0000174 |
| lerobot/umi_cup_in_the_wild | ffmpegio | 0.010 | 0.0000735 |
| lerobot/umi_cup_in_the_wild | torchaudio | 0.154 | 0.0000340 |
### `1_frame`
**`pix_fmt`**
| repo_id | pix_fmt | compression_factor | load_time_factor | avg_per_pixel_l2_error |
| --- | --- | --- | --- | --- |
| lerobot/pusht | yuv420p | 3.788 | 0.224 | 0.0000760 |
| lerobot/pusht | yuv444p | 3.646 | 0.185 | 0.0000443 |
| lerobot/umi_cup_in_the_wild | yuv420p | 14.391 | 0.388 | 0.0000469 |
| lerobot/umi_cup_in_the_wild | yuv444p | 14.932 | 0.329 | 0.0000397 |
**`g`**
| repo_id | g | compression_factor | load_time_factor | avg_per_pixel_l2_error |
| --- | --- | --- | --- | --- |
| lerobot/pusht | 1 | 2.543 | 0.204 | 0.0000556 |
| lerobot/pusht | 2 | 3.646 | 0.182 | 0.0000443 |
| lerobot/pusht | 3 | 4.431 | 0.174 | 0.0000450 |
| lerobot/pusht | 4 | 5.103 | 0.163 | 0.0000448 |
| lerobot/pusht | 5 | 5.625 | 0.163 | 0.0000436 |
| lerobot/pusht | 6 | 5.974 | 0.155 | 0.0000427 |
| lerobot/pusht | 10 | 6.814 | 0.130 | 0.0000410 |
| lerobot/pusht | 15 | 7.431 | 0.105 | 0.0000406 |
| lerobot/pusht | 20 | 7.662 | 0.097 | 0.0000400 |
| lerobot/pusht | 40 | 8.163 | 0.061 | 0.0000405 |
| lerobot/pusht | 100 | 8.761 | 0.039 | 0.0000422 |
| lerobot/pusht | None | 8.909 | 0.024 | 0.0000431 |
| lerobot/umi_cup_in_the_wild | 1 | 14.411 | 0.444 | 0.0000601 |
| lerobot/umi_cup_in_the_wild | 2 | 14.932 | 0.345 | 0.0000397 |
| lerobot/umi_cup_in_the_wild | 3 | 20.174 | 0.282 | 0.0000416 |
| lerobot/umi_cup_in_the_wild | 4 | 24.889 | 0.271 | 0.0000415 |
| lerobot/umi_cup_in_the_wild | 5 | 28.825 | 0.260 | 0.0000415 |
| lerobot/umi_cup_in_the_wild | 6 | 31.635 | 0.249 | 0.0000415 |
| lerobot/umi_cup_in_the_wild | 10 | 39.418 | 0.195 | 0.0000399 |
| lerobot/umi_cup_in_the_wild | 15 | 44.577 | 0.169 | 0.0000394 |
| lerobot/umi_cup_in_the_wild | 20 | 47.907 | 0.140 | 0.0000390 |
| lerobot/umi_cup_in_the_wild | 40 | 52.554 | 0.096 | 0.0000384 |
| lerobot/umi_cup_in_the_wild | 100 | 58.241 | 0.046 | 0.0000390 |
| lerobot/umi_cup_in_the_wild | None | 60.530 | 0.022 | 0.0000400 |
**`crf`**
| repo_id | crf | compression_factor | load_time_factor | avg_per_pixel_l2_error |
| --- | --- | --- | --- | --- |
| lerobot/pusht | 0 | 1.699 | 0.175 | 0.0000035 |
| lerobot/pusht | 5 | 1.409 | 0.181 | 0.0000080 |
| lerobot/pusht | 10 | 1.842 | 0.172 | 0.0000123 |
| lerobot/pusht | 15 | 2.322 | 0.187 | 0.0000211 |
| lerobot/pusht | 20 | 3.050 | 0.181 | 0.0000346 |
| lerobot/pusht | None | 3.646 | 0.189 | 0.0000443 |
| lerobot/pusht | 25 | 3.969 | 0.186 | 0.0000521 |
| lerobot/pusht | 30 | 5.687 | 0.184 | 0.0000850 |
| lerobot/pusht | 40 | 10.818 | 0.193 | 0.0001726 |
| lerobot/pusht | 50 | 18.185 | 0.183 | 0.0002606 |
| lerobot/umi_cup_in_the_wild | 0 | 1.918 | 0.165 | 0.0000056 |
| lerobot/umi_cup_in_the_wild | 5 | 3.207 | 0.171 | 0.0000111 |
| lerobot/umi_cup_in_the_wild | 10 | 4.818 | 0.212 | 0.0000153 |
| lerobot/umi_cup_in_the_wild | 15 | 7.329 | 0.261 | 0.0000218 |
| lerobot/umi_cup_in_the_wild | 20 | 11.361 | 0.312 | 0.0000317 |
| lerobot/umi_cup_in_the_wild | None | 14.932 | 0.339 | 0.0000397 |
| lerobot/umi_cup_in_the_wild | 25 | 17.741 | 0.297 | 0.0000452 |
| lerobot/umi_cup_in_the_wild | 30 | 27.983 | 0.406 | 0.0000629 |
| lerobot/umi_cup_in_the_wild | 40 | 82.449 | 0.468 | 0.0001184 |
| lerobot/umi_cup_in_the_wild | 50 | 186.145 | 0.515 | 0.0001879 |
**best**
| repo_id | compression_factor | load_time_factor | avg_per_pixel_l2_error |
| --- | --- | --- | --- |
| lerobot/pusht | 3.646 | 0.188 | 0.0000443 |
| lerobot/umi_cup_in_the_wild | 14.932 | 0.339 | 0.0000397 |
### `2_frames`
**`pix_fmt`**
| repo_id | pix_fmt | compression_factor | load_time_factor | avg_per_pixel_l2_error |
| --- | --- | --- | --- | --- |
| lerobot/pusht | yuv420p | 3.788 | 0.314 | 0.0000799 |
| lerobot/pusht | yuv444p | 3.646 | 0.303 | 0.0000496 |
| lerobot/umi_cup_in_the_wild | yuv420p | 14.391 | 0.642 | 0.0000503 |
| lerobot/umi_cup_in_the_wild | yuv444p | 14.932 | 0.529 | 0.0000436 |
**`g`**
| repo_id | g | compression_factor | load_time_factor | avg_per_pixel_l2_error |
| --- | --- | --- | --- | --- |
| lerobot/pusht | 1 | 2.543 | 0.308 | 0.0000599 |
| lerobot/pusht | 2 | 3.646 | 0.279 | 0.0000496 |
| lerobot/pusht | 3 | 4.431 | 0.259 | 0.0000498 |
| lerobot/pusht | 4 | 5.103 | 0.243 | 0.0000501 |
| lerobot/pusht | 5 | 5.625 | 0.235 | 0.0000492 |
| lerobot/pusht | 6 | 5.974 | 0.230 | 0.0000481 |
| lerobot/pusht | 10 | 6.814 | 0.194 | 0.0000468 |
| lerobot/pusht | 15 | 7.431 | 0.152 | 0.0000460 |
| lerobot/pusht | 20 | 7.662 | 0.151 | 0.0000455 |
| lerobot/pusht | 40 | 8.163 | 0.095 | 0.0000454 |
| lerobot/pusht | 100 | 8.761 | 0.062 | 0.0000472 |
| lerobot/pusht | None | 8.909 | 0.037 | 0.0000479 |
| lerobot/umi_cup_in_the_wild | 1 | 14.411 | 0.638 | 0.0000625 |
| lerobot/umi_cup_in_the_wild | 2 | 14.932 | 0.537 | 0.0000436 |
| lerobot/umi_cup_in_the_wild | 3 | 20.174 | 0.493 | 0.0000437 |
| lerobot/umi_cup_in_the_wild | 4 | 24.889 | 0.458 | 0.0000446 |
| lerobot/umi_cup_in_the_wild | 5 | 28.825 | 0.438 | 0.0000445 |
| lerobot/umi_cup_in_the_wild | 6 | 31.635 | 0.424 | 0.0000444 |
| lerobot/umi_cup_in_the_wild | 10 | 39.418 | 0.345 | 0.0000435 |
| lerobot/umi_cup_in_the_wild | 15 | 44.577 | 0.313 | 0.0000417 |
| lerobot/umi_cup_in_the_wild | 20 | 47.907 | 0.264 | 0.0000421 |
| lerobot/umi_cup_in_the_wild | 40 | 52.554 | 0.185 | 0.0000414 |
| lerobot/umi_cup_in_the_wild | 100 | 58.241 | 0.090 | 0.0000420 |
| lerobot/umi_cup_in_the_wild | None | 60.530 | 0.042 | 0.0000424 |
**`crf`**
| repo_id | crf | compression_factor | load_time_factor | avg_per_pixel_l2_error |
| --- | --- | --- | --- | --- |
| lerobot/pusht | 0 | 1.699 | 0.302 | 0.0000097 |
| lerobot/pusht | 5 | 1.409 | 0.287 | 0.0000142 |
| lerobot/pusht | 10 | 1.842 | 0.283 | 0.0000184 |
| lerobot/pusht | 15 | 2.322 | 0.305 | 0.0000268 |
| lerobot/pusht | 20 | 3.050 | 0.285 | 0.0000402 |
| lerobot/pusht | None | 3.646 | 0.285 | 0.0000496 |
| lerobot/pusht | 25 | 3.969 | 0.293 | 0.0000572 |
| lerobot/pusht | 30 | 5.687 | 0.293 | 0.0000893 |
| lerobot/pusht | 40 | 10.818 | 0.319 | 0.0001762 |
| lerobot/pusht | 50 | 18.185 | 0.304 | 0.0002626 |
| lerobot/umi_cup_in_the_wild | 0 | 1.918 | 0.235 | 0.0000112 |
| lerobot/umi_cup_in_the_wild | 5 | 3.207 | 0.261 | 0.0000166 |
| lerobot/umi_cup_in_the_wild | 10 | 4.818 | 0.333 | 0.0000207 |
| lerobot/umi_cup_in_the_wild | 15 | 7.329 | 0.406 | 0.0000267 |
| lerobot/umi_cup_in_the_wild | 20 | 11.361 | 0.489 | 0.0000361 |
| lerobot/umi_cup_in_the_wild | None | 14.932 | 0.537 | 0.0000436 |
| lerobot/umi_cup_in_the_wild | 25 | 17.741 | 0.578 | 0.0000487 |
| lerobot/umi_cup_in_the_wild | 30 | 27.983 | 0.453 | 0.0000655 |
| lerobot/umi_cup_in_the_wild | 40 | 82.449 | 0.767 | 0.0001192 |
| lerobot/umi_cup_in_the_wild | 50 | 186.145 | 0.816 | 0.0001881 |
**best**
| repo_id | compression_factor | load_time_factor | avg_per_pixel_l2_error |
| --- | --- | --- | --- |
| lerobot/pusht | 3.646 | 0.283 | 0.0000496 |
| lerobot/umi_cup_in_the_wild | 14.932 | 0.543 | 0.0000436 |
### `2_frames_4_space`
**`pix_fmt`**
| repo_id | pix_fmt | compression_factor | load_time_factor | avg_per_pixel_l2_error |
| --- | --- | --- | --- | --- |
| lerobot/pusht | yuv420p | 3.788 | 0.257 | 0.0000855 |
| lerobot/pusht | yuv444p | 3.646 | 0.261 | 0.0000556 |
| lerobot/umi_cup_in_the_wild | yuv420p | 14.391 | 0.493 | 0.0000476 |
| lerobot/umi_cup_in_the_wild | yuv444p | 14.932 | 0.371 | 0.0000404 |
**`g`**
| repo_id | g | compression_factor | load_time_factor | avg_per_pixel_l2_error |
| --- | --- | --- | --- | --- |
| lerobot/pusht | 1 | 2.543 | 0.226 | 0.0000670 |
| lerobot/pusht | 2 | 3.646 | 0.222 | 0.0000556 |
| lerobot/pusht | 3 | 4.431 | 0.217 | 0.0000567 |
| lerobot/pusht | 4 | 5.103 | 0.204 | 0.0000555 |
| lerobot/pusht | 5 | 5.625 | 0.179 | 0.0000556 |
| lerobot/pusht | 6 | 5.974 | 0.188 | 0.0000544 |
| lerobot/pusht | 10 | 6.814 | 0.160 | 0.0000531 |
| lerobot/pusht | 15 | 7.431 | 0.150 | 0.0000521 |
| lerobot/pusht | 20 | 7.662 | 0.123 | 0.0000519 |
| lerobot/pusht | 40 | 8.163 | 0.092 | 0.0000519 |
| lerobot/pusht | 100 | 8.761 | 0.053 | 0.0000533 |
| lerobot/pusht | None | 8.909 | 0.034 | 0.0000541 |
| lerobot/umi_cup_in_the_wild | 1 | 14.411 | 0.409 | 0.0000607 |
| lerobot/umi_cup_in_the_wild | 2 | 14.932 | 0.381 | 0.0000404 |
| lerobot/umi_cup_in_the_wild | 3 | 20.174 | 0.355 | 0.0000418 |
| lerobot/umi_cup_in_the_wild | 4 | 24.889 | 0.346 | 0.0000425 |
| lerobot/umi_cup_in_the_wild | 5 | 28.825 | 0.354 | 0.0000419 |
| lerobot/umi_cup_in_the_wild | 6 | 31.635 | 0.336 | 0.0000419 |
| lerobot/umi_cup_in_the_wild | 10 | 39.418 | 0.314 | 0.0000402 |
| lerobot/umi_cup_in_the_wild | 15 | 44.577 | 0.269 | 0.0000397 |
| lerobot/umi_cup_in_the_wild | 20 | 47.907 | 0.246 | 0.0000395 |
| lerobot/umi_cup_in_the_wild | 40 | 52.554 | 0.171 | 0.0000390 |
| lerobot/umi_cup_in_the_wild | 100 | 58.241 | 0.091 | 0.0000399 |
| lerobot/umi_cup_in_the_wild | None | 60.530 | 0.043 | 0.0000409 |
**`crf`**
| repo_id | crf | compression_factor | load_time_factor | avg_per_pixel_l2_error |
| --- | --- | --- | --- | --- |
| lerobot/pusht | 0 | 1.699 | 0.212 | 0.0000193 |
| lerobot/pusht | 5 | 1.409 | 0.211 | 0.0000232 |
| lerobot/pusht | 10 | 1.842 | 0.199 | 0.0000270 |
| lerobot/pusht | 15 | 2.322 | 0.198 | 0.0000347 |
| lerobot/pusht | 20 | 3.050 | 0.211 | 0.0000469 |
| lerobot/pusht | None | 3.646 | 0.206 | 0.0000556 |
| lerobot/pusht | 25 | 3.969 | 0.210 | 0.0000626 |
| lerobot/pusht | 30 | 5.687 | 0.223 | 0.0000927 |
| lerobot/pusht | 40 | 10.818 | 0.227 | 0.0001763 |
| lerobot/pusht | 50 | 18.185 | 0.223 | 0.0002625 |
| lerobot/umi_cup_in_the_wild | 0 | 1.918 | 0.147 | 0.0000071 |
| lerobot/umi_cup_in_the_wild | 5 | 3.207 | 0.182 | 0.0000125 |
| lerobot/umi_cup_in_the_wild | 10 | 4.818 | 0.222 | 0.0000166 |
| lerobot/umi_cup_in_the_wild | 15 | 7.329 | 0.270 | 0.0000229 |
| lerobot/umi_cup_in_the_wild | 20 | 11.361 | 0.325 | 0.0000326 |
| lerobot/umi_cup_in_the_wild | None | 14.932 | 0.362 | 0.0000404 |
| lerobot/umi_cup_in_the_wild | 25 | 17.741 | 0.390 | 0.0000459 |
| lerobot/umi_cup_in_the_wild | 30 | 27.983 | 0.437 | 0.0000633 |
| lerobot/umi_cup_in_the_wild | 40 | 82.449 | 0.499 | 0.0001186 |
| lerobot/umi_cup_in_the_wild | 50 | 186.145 | 0.564 | 0.0001879 |
**best**
| repo_id | compression_factor | load_time_factor | avg_per_pixel_l2_error |
| --- | --- | --- | --- |
| lerobot/pusht | 3.646 | 0.224 | 0.0000556 |
| lerobot/umi_cup_in_the_wild | 14.932 | 0.368 | 0.0000404 |
### `6_frames`
**`pix_fmt`**
| repo_id | pix_fmt | compression_factor | load_time_factor | avg_per_pixel_l2_error |
| --- | --- | --- | --- | --- |
| lerobot/pusht | yuv420p | 3.788 | 0.660 | 0.0000839 |
| lerobot/pusht | yuv444p | 3.646 | 0.546 | 0.0000542 |
| lerobot/umi_cup_in_the_wild | yuv420p | 14.391 | 1.225 | 0.0000497 |
| lerobot/umi_cup_in_the_wild | yuv444p | 14.932 | 0.908 | 0.0000428 |
**`g`**
| repo_id | g | compression_factor | load_time_factor | avg_per_pixel_l2_error |
| --- | --- | --- | --- | --- |
| lerobot/pusht | 1 | 2.543 | 0.552 | 0.0000646 |
| lerobot/pusht | 2 | 3.646 | 0.534 | 0.0000542 |
| lerobot/pusht | 3 | 4.431 | 0.563 | 0.0000546 |
| lerobot/pusht | 4 | 5.103 | 0.537 | 0.0000545 |
| lerobot/pusht | 5 | 5.625 | 0.477 | 0.0000532 |
| lerobot/pusht | 6 | 5.974 | 0.515 | 0.0000530 |
| lerobot/pusht | 10 | 6.814 | 0.410 | 0.0000512 |
| lerobot/pusht | 15 | 7.431 | 0.405 | 0.0000503 |
| lerobot/pusht | 20 | 7.662 | 0.345 | 0.0000500 |
| lerobot/pusht | 40 | 8.163 | 0.247 | 0.0000496 |
| lerobot/pusht | 100 | 8.761 | 0.147 | 0.0000510 |
| lerobot/pusht | None | 8.909 | 0.100 | 0.0000519 |
| lerobot/umi_cup_in_the_wild | 1 | 14.411 | 0.997 | 0.0000620 |
| lerobot/umi_cup_in_the_wild | 2 | 14.932 | 0.911 | 0.0000428 |
| lerobot/umi_cup_in_the_wild | 3 | 20.174 | 0.869 | 0.0000433 |
| lerobot/umi_cup_in_the_wild | 4 | 24.889 | 0.874 | 0.0000438 |
| lerobot/umi_cup_in_the_wild | 5 | 28.825 | 0.864 | 0.0000439 |
| lerobot/umi_cup_in_the_wild | 6 | 31.635 | 0.834 | 0.0000440 |
| lerobot/umi_cup_in_the_wild | 10 | 39.418 | 0.781 | 0.0000421 |
| lerobot/umi_cup_in_the_wild | 15 | 44.577 | 0.679 | 0.0000411 |
| lerobot/umi_cup_in_the_wild | 20 | 47.907 | 0.652 | 0.0000410 |
| lerobot/umi_cup_in_the_wild | 40 | 52.554 | 0.465 | 0.0000404 |
| lerobot/umi_cup_in_the_wild | 100 | 58.241 | 0.245 | 0.0000413 |
| lerobot/umi_cup_in_the_wild | None | 60.530 | 0.116 | 0.0000417 |
**`crf`**
| repo_id | crf | compression_factor | load_time_factor | avg_per_pixel_l2_error |
| --- | --- | --- | --- | --- |
| lerobot/pusht | 0 | 1.699 | 0.534 | 0.0000163 |
| lerobot/pusht | 5 | 1.409 | 0.524 | 0.0000205 |
| lerobot/pusht | 10 | 1.842 | 0.510 | 0.0000245 |
| lerobot/pusht | 15 | 2.322 | 0.512 | 0.0000324 |
| lerobot/pusht | 20 | 3.050 | 0.508 | 0.0000452 |
| lerobot/pusht | None | 3.646 | 0.518 | 0.0000542 |
| lerobot/pusht | 25 | 3.969 | 0.534 | 0.0000616 |
| lerobot/pusht | 30 | 5.687 | 0.530 | 0.0000927 |
| lerobot/pusht | 40 | 10.818 | 0.552 | 0.0001777 |
| lerobot/pusht | 50 | 18.185 | 0.564 | 0.0002644 |
| lerobot/umi_cup_in_the_wild | 0 | 1.918 | 0.401 | 0.0000101 |
| lerobot/umi_cup_in_the_wild | 5 | 3.207 | 0.499 | 0.0000156 |
| lerobot/umi_cup_in_the_wild | 10 | 4.818 | 0.599 | 0.0000197 |
| lerobot/umi_cup_in_the_wild | 15 | 7.329 | 0.704 | 0.0000258 |
| lerobot/umi_cup_in_the_wild | 20 | 11.361 | 0.834 | 0.0000352 |
| lerobot/umi_cup_in_the_wild | None | 14.932 | 0.925 | 0.0000428 |
| lerobot/umi_cup_in_the_wild | 25 | 17.741 | 0.978 | 0.0000480 |
| lerobot/umi_cup_in_the_wild | 30 | 27.983 | 1.088 | 0.0000648 |
| lerobot/umi_cup_in_the_wild | 40 | 82.449 | 1.324 | 0.0001190 |
| lerobot/umi_cup_in_the_wild | 50 | 186.145 | 1.436 | 0.0001880 |
**best**
| repo_id | compression_factor | load_time_factor | avg_per_pixel_l2_error |
| --- | --- | --- | --- |
| lerobot/pusht | 3.646 | 0.546 | 0.0000542 |
| lerobot/umi_cup_in_the_wild | 14.932 | 0.934 | 0.0000428 |

View File

@@ -0,0 +1,372 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import random
import shutil
import subprocess
import time
from pathlib import Path
import einops
import numpy
import PIL
import torch
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
from lerobot.common.datasets.video_utils import (
decode_video_frames_torchvision,
)
def get_directory_size(directory):
total_size = 0
# Iterate over all files and subdirectories recursively
for item in directory.rglob("*"):
if item.is_file():
# Add the file size to the total
total_size += item.stat().st_size
return total_size
def run_video_benchmark(
output_dir,
cfg,
timestamps_mode,
seed=1337,
):
output_dir = Path(output_dir)
if output_dir.exists():
shutil.rmtree(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
repo_id = cfg["repo_id"]
# TODO(rcadene): rewrite with hardcoding of original images and episodes
dataset = LeRobotDataset(repo_id)
# Get fps
fps = dataset.fps
# we only load first episode
ep_num_images = dataset.episode_data_index["to"][0].item()
# Save/Load image directory for the first episode
imgs_dir = Path(f"tmp/data/images/{repo_id}/observation.image_episode_000000")
if not imgs_dir.exists():
imgs_dir.mkdir(parents=True, exist_ok=True)
hf_dataset = dataset.hf_dataset.with_format(None)
imgs_dataset = hf_dataset.select_columns("observation.image")
for i, item in enumerate(imgs_dataset):
img = item["observation.image"]
img.save(str(imgs_dir / f"frame_{i:06d}.png"), quality=100)
if i >= ep_num_images - 1:
break
sum_original_frames_size_bytes = get_directory_size(imgs_dir)
# Encode images into video
video_path = output_dir / "episode_0.mp4"
g = cfg.get("g")
crf = cfg.get("crf")
pix_fmt = cfg["pix_fmt"]
cmd = f"ffmpeg -r {fps} "
cmd += "-f image2 "
cmd += "-loglevel error "
cmd += f"-i {str(imgs_dir / 'frame_%06d.png')} "
cmd += "-vcodec libx264 "
if g is not None:
cmd += f"-g {g} " # ensures at least 1 keyframe every 10 frames
# cmd += "-keyint_min 10 " set a minimum of 10 frames between 2 key frames
# cmd += "-sc_threshold 0 " disable scene change detection to lower the number of key frames
if crf is not None:
cmd += f"-crf {crf} "
cmd += f"-pix_fmt {pix_fmt} "
cmd += f"{str(video_path)}"
subprocess.run(cmd.split(" "), check=True)
video_size_bytes = video_path.stat().st_size
# Set decoder
decoder = cfg["decoder"]
decoder_kwgs = cfg["decoder_kwgs"]
device = cfg["device"]
if decoder == "torchvision":
decode_frames_fn = decode_video_frames_torchvision
else:
raise ValueError(decoder)
# Estimate average loading time
def load_original_frames(imgs_dir, timestamps):
frames = []
for ts in timestamps:
idx = int(ts * fps)
frame = PIL.Image.open(imgs_dir / f"frame_{idx:06d}.png")
frame = torch.from_numpy(numpy.array(frame))
frame = frame.type(torch.float32) / 255
frame = einops.rearrange(frame, "h w c -> c h w")
frames.append(frame)
return frames
list_avg_load_time = []
list_avg_load_time_from_images = []
per_pixel_l2_errors = []
random.seed(seed)
for t in range(50):
# test loading 2 frames that are 4 frames appart, which might be a common setting
ts = random.randint(fps, ep_num_images - fps) / fps
if timestamps_mode == "1_frame":
timestamps = [ts]
elif timestamps_mode == "2_frames":
timestamps = [ts - 1 / fps, ts]
elif timestamps_mode == "2_frames_4_space":
timestamps = [ts - 4 / fps, ts]
elif timestamps_mode == "6_frames":
timestamps = [ts - i / fps for i in range(6)][::-1]
else:
raise ValueError(timestamps_mode)
num_frames = len(timestamps)
start_time_s = time.monotonic()
frames = decode_frames_fn(
video_path, timestamps=timestamps, tolerance_s=1e-4, device=device, **decoder_kwgs
)
avg_load_time = (time.monotonic() - start_time_s) / num_frames
list_avg_load_time.append(avg_load_time)
start_time_s = time.monotonic()
original_frames = load_original_frames(imgs_dir, timestamps)
avg_load_time_from_images = (time.monotonic() - start_time_s) / num_frames
list_avg_load_time_from_images.append(avg_load_time_from_images)
# Estimate average L2 error between original frames and decoded frames
for i, ts in enumerate(timestamps):
# are_close = torch.allclose(frames[i], original_frames[i], atol=0.02)
num_pixels = original_frames[i].numel()
per_pixel_l2_error = torch.norm(frames[i] - original_frames[i], p=2).item() / num_pixels
# save decoded frames
if t == 0:
frame_hwc = (frames[i].permute((1, 2, 0)) * 255).type(torch.uint8).cpu().numpy()
PIL.Image.fromarray(frame_hwc).save(output_dir / f"frame_{i:06d}.png")
# save original_frames
idx = int(ts * fps)
if t == 0:
original_frame = PIL.Image.open(imgs_dir / f"frame_{idx:06d}.png")
original_frame.save(output_dir / f"original_frame_{i:06d}.png")
per_pixel_l2_errors.append(per_pixel_l2_error)
avg_load_time = float(numpy.array(list_avg_load_time).mean())
avg_load_time_from_images = float(numpy.array(list_avg_load_time_from_images).mean())
avg_per_pixel_l2_error = float(numpy.array(per_pixel_l2_errors).mean())
# Save benchmark info
info = {
"sum_original_frames_size_bytes": sum_original_frames_size_bytes,
"video_size_bytes": video_size_bytes,
"avg_load_time_from_images": avg_load_time_from_images,
"avg_load_time": avg_load_time,
"compression_factor": sum_original_frames_size_bytes / video_size_bytes,
"load_time_factor": avg_load_time_from_images / avg_load_time,
"avg_per_pixel_l2_error": avg_per_pixel_l2_error,
}
with open(output_dir / "info.json", "w") as f:
json.dump(info, f)
return info
def display_markdown_table(headers, rows):
for i, row in enumerate(rows):
new_row = []
for col in row:
if col is None:
new_col = "None"
elif isinstance(col, float):
new_col = f"{col:.3f}"
if new_col == "0.000":
new_col = f"{col:.7f}"
elif isinstance(col, int):
new_col = f"{col}"
else:
new_col = col
new_row.append(new_col)
rows[i] = new_row
header_line = "| " + " | ".join(headers) + " |"
separator_line = "| " + " | ".join(["---" for _ in headers]) + " |"
body_lines = ["| " + " | ".join(row) + " |" for row in rows]
markdown_table = "\n".join([header_line, separator_line] + body_lines)
print(markdown_table)
print()
def load_info(out_dir):
with open(out_dir / "info.json") as f:
info = json.load(f)
return info
def main():
out_dir = Path("tmp/run_video_benchmark")
dry_run = False
repo_ids = ["lerobot/pusht", "lerobot/umi_cup_in_the_wild"]
timestamps_modes = [
"1_frame",
"2_frames",
"2_frames_4_space",
"6_frames",
]
for timestamps_mode in timestamps_modes:
bench_dir = out_dir / timestamps_mode
print(f"### `{timestamps_mode}`")
print()
print("**`pix_fmt`**")
headers = ["repo_id", "pix_fmt", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"]
rows = []
for repo_id in repo_ids:
for pix_fmt in ["yuv420p", "yuv444p"]:
cfg = {
"repo_id": repo_id,
# video encoding
"g": 2,
"crf": None,
"pix_fmt": pix_fmt,
# video decoding
"device": "cpu",
"decoder": "torchvision",
"decoder_kwgs": {},
}
if not dry_run:
run_video_benchmark(bench_dir / repo_id / f"torchvision_{pix_fmt}", cfg, timestamps_mode)
info = load_info(bench_dir / repo_id / f"torchvision_{pix_fmt}")
rows.append(
[
repo_id,
pix_fmt,
info["compression_factor"],
info["load_time_factor"],
info["avg_per_pixel_l2_error"],
]
)
display_markdown_table(headers, rows)
print("**`g`**")
headers = ["repo_id", "g", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"]
rows = []
for repo_id in repo_ids:
for g in [1, 2, 3, 4, 5, 6, 10, 15, 20, 40, 100, None]:
cfg = {
"repo_id": repo_id,
# video encoding
"g": g,
"pix_fmt": "yuv444p",
# video decoding
"device": "cpu",
"decoder": "torchvision",
"decoder_kwgs": {},
}
if not dry_run:
run_video_benchmark(bench_dir / repo_id / f"torchvision_g_{g}", cfg, timestamps_mode)
info = load_info(bench_dir / repo_id / f"torchvision_g_{g}")
rows.append(
[
repo_id,
g,
info["compression_factor"],
info["load_time_factor"],
info["avg_per_pixel_l2_error"],
]
)
display_markdown_table(headers, rows)
print("**`crf`**")
headers = ["repo_id", "crf", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"]
rows = []
for repo_id in repo_ids:
for crf in [0, 5, 10, 15, 20, None, 25, 30, 40, 50]:
cfg = {
"repo_id": repo_id,
# video encoding
"g": 2,
"crf": crf,
"pix_fmt": "yuv444p",
# video decoding
"device": "cpu",
"decoder": "torchvision",
"decoder_kwgs": {},
}
if not dry_run:
run_video_benchmark(bench_dir / repo_id / f"torchvision_crf_{crf}", cfg, timestamps_mode)
info = load_info(bench_dir / repo_id / f"torchvision_crf_{crf}")
rows.append(
[
repo_id,
crf,
info["compression_factor"],
info["load_time_factor"],
info["avg_per_pixel_l2_error"],
]
)
display_markdown_table(headers, rows)
print("**best**")
headers = ["repo_id", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"]
rows = []
for repo_id in repo_ids:
cfg = {
"repo_id": repo_id,
# video encoding
"g": 2,
"crf": None,
"pix_fmt": "yuv444p",
# video decoding
"device": "cpu",
"decoder": "torchvision",
"decoder_kwgs": {},
}
if not dry_run:
run_video_benchmark(bench_dir / repo_id / "torchvision_best", cfg, timestamps_mode)
info = load_info(bench_dir / repo_id / "torchvision_best")
rows.append(
[
repo_id,
info["compression_factor"],
info["load_time_factor"],
info["avg_per_pixel_l2_error"],
]
)
display_markdown_table(headers, rows)
if __name__ == "__main__":
main()

View File

@@ -1,68 +0,0 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import packaging.version
V2_MESSAGE = """
The dataset you requested ({repo_id}) is in {version} format.
We introduced a new format since v2.0 which is not backward compatible with v1.x.
Please, use our conversion script. Modify the following command with your own task description:
```
python lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py \\
--repo-id {repo_id} \\
--single-task "TASK DESCRIPTION." # <---- /!\\ Replace TASK DESCRIPTION /!\\
```
A few examples to replace TASK DESCRIPTION: "Pick up the blue cube and place it into the bin.", "Insert the
peg into the socket.", "Slide open the ziploc bag.", "Take the elevator to the 1st floor.", "Open the top
cabinet, store the pot inside it then close the cabinet.", "Push the T-shaped block onto the T-shaped
target.", "Grab the spray paint on the shelf and place it in the bin on top of the robot dog.", "Fold the
sweatshirt.", ...
If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
"""
V21_MESSAGE = """
The dataset you requested ({repo_id}) is in {version} format.
While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
stats instead of per-episode stats. Update your dataset stats to the new format using this command:
```
python lerobot/common/datasets/v21/convert_dataset_v20_to_v21.py --repo-id={repo_id}
```
If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
"""
FUTURE_MESSAGE = """
The dataset you requested ({repo_id}) is only available in {version} format.
As we cannot ensure forward compatibility with it, please update your current version of lerobot.
"""
class CompatibilityError(Exception): ...
class BackwardCompatibilityError(CompatibilityError):
def __init__(self, repo_id: str, version: packaging.version.Version):
message = V2_MESSAGE.format(repo_id=repo_id, version=version)
super().__init__(message)
class ForwardCompatibilityError(CompatibilityError):
def __init__(self, repo_id: str, version: packaging.version.Version):
message = FUTURE_MESSAGE.format(repo_id=repo_id, version=version)
super().__init__(message)

View File

@@ -1,27 +0,0 @@
---
# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1
# Doc / guide: https://huggingface.co/docs/hub/datasets-cards
{{ card_data }}
---
This dataset was created using [LeRobot](https://github.com/huggingface/lerobot).
## Dataset Description
{{ dataset_description | default("", true) }}
- **Homepage:** {{ url | default("[More Information Needed]", true)}}
- **Paper:** {{ paper | default("[More Information Needed]", true)}}
- **License:** {{ license | default("[More Information Needed]", true)}}
## Dataset Structure
{{ dataset_structure | default("[More Information Needed]", true)}}
## Citation
**BibTeX:**
```bibtex
{{ citation_bibtex | default("[More Information Needed]", true)}}
```

View File

@@ -13,164 +13,197 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from copy import deepcopy
from math import ceil
from lerobot.common.datasets.utils import load_image_as_numpy
import einops
import torch
import tqdm
from datasets import Image
from lerobot.common.datasets.video_utils import VideoFrame
def estimate_num_samples(
dataset_len: int, min_num_samples: int = 100, max_num_samples: int = 10_000, power: float = 0.75
) -> int:
"""Heuristic to estimate the number of samples based on dataset size.
The power controls the sample growth relative to dataset size.
Lower the power for less number of samples.
def get_stats_einops_patterns(dataset, num_workers=0):
"""These einops patterns will be used to aggregate batches and compute statistics.
For default arguments, we have:
- from 1 to ~500, num_samples=100
- at 1000, num_samples=177
- at 2000, num_samples=299
- at 5000, num_samples=594
- at 10000, num_samples=1000
- at 20000, num_samples=1681
Note: We assume the images are in channel first format
"""
if dataset_len < min_num_samples:
min_num_samples = dataset_len
return max(min_num_samples, min(int(dataset_len**power), max_num_samples))
dataloader = torch.utils.data.DataLoader(
dataset,
num_workers=num_workers,
batch_size=2,
shuffle=False,
)
batch = next(iter(dataloader))
def sample_indices(data_len: int) -> list[int]:
num_samples = estimate_num_samples(data_len)
return np.round(np.linspace(0, data_len - 1, num_samples)).astype(int).tolist()
stats_patterns = {}
for key, feats_type in dataset.features.items():
# sanity check that tensors are not float64
assert batch[key].dtype != torch.float64
if isinstance(feats_type, (VideoFrame, Image)):
# sanity check that images are channel first
_, c, h, w = batch[key].shape
assert c < h and c < w, f"expect channel first images, but instead {batch[key].shape}"
def auto_downsample_height_width(img: np.ndarray, target_size: int = 150, max_size_threshold: int = 300):
_, height, width = img.shape
# sanity check that images are float32 in range [0,1]
assert batch[key].dtype == torch.float32, f"expect torch.float32, but instead {batch[key].dtype=}"
assert batch[key].max() <= 1, f"expect pixels lower than 1, but instead {batch[key].max()=}"
assert batch[key].min() >= 0, f"expect pixels greater than 1, but instead {batch[key].min()=}"
if max(width, height) < max_size_threshold:
# no downsampling needed
return img
downsample_factor = int(width / target_size) if width > height else int(height / target_size)
return img[:, ::downsample_factor, ::downsample_factor]
def sample_images(image_paths: list[str]) -> np.ndarray:
sampled_indices = sample_indices(len(image_paths))
images = None
for i, idx in enumerate(sampled_indices):
path = image_paths[idx]
# we load as uint8 to reduce memory usage
img = load_image_as_numpy(path, dtype=np.uint8, channel_first=True)
img = auto_downsample_height_width(img)
if images is None:
images = np.empty((len(sampled_indices), *img.shape), dtype=np.uint8)
images[i] = img
return images
def get_feature_stats(array: np.ndarray, axis: tuple, keepdims: bool) -> dict[str, np.ndarray]:
return {
"min": np.min(array, axis=axis, keepdims=keepdims),
"max": np.max(array, axis=axis, keepdims=keepdims),
"mean": np.mean(array, axis=axis, keepdims=keepdims),
"std": np.std(array, axis=axis, keepdims=keepdims),
"count": np.array([len(array)]),
}
def compute_episode_stats(episode_data: dict[str, list[str] | np.ndarray], features: dict) -> dict:
ep_stats = {}
for key, data in episode_data.items():
if features[key]["dtype"] == "string":
continue # HACK: we should receive np.arrays of strings
elif features[key]["dtype"] in ["image", "video"]:
ep_ft_array = sample_images(data) # data is a list of image paths
axes_to_reduce = (0, 2, 3) # keep channel dim
keepdims = True
stats_patterns[key] = "b c h w -> c 1 1"
elif batch[key].ndim == 2:
stats_patterns[key] = "b c -> c "
elif batch[key].ndim == 1:
stats_patterns[key] = "b -> 1"
else:
ep_ft_array = data # data is already a np.ndarray
axes_to_reduce = 0 # compute stats over the first axis
keepdims = data.ndim == 1 # keep as np.array
raise ValueError(f"{key}, {feats_type}, {batch[key].shape}")
ep_stats[key] = get_feature_stats(ep_ft_array, axis=axes_to_reduce, keepdims=keepdims)
# finally, we normalize and remove batch dim for images
if features[key]["dtype"] in ["image", "video"]:
ep_stats[key] = {
k: v if k == "count" else np.squeeze(v / 255.0, axis=0) for k, v in ep_stats[key].items()
}
return ep_stats
return stats_patterns
def _assert_type_and_shape(stats_list: list[dict[str, dict]]):
for i in range(len(stats_list)):
for fkey in stats_list[i]:
for k, v in stats_list[i][fkey].items():
if not isinstance(v, np.ndarray):
raise ValueError(
f"Stats must be composed of numpy array, but key '{k}' of feature '{fkey}' is of type '{type(v)}' instead."
)
if v.ndim == 0:
raise ValueError("Number of dimensions must be at least 1, and is 0 instead.")
if k == "count" and v.shape != (1,):
raise ValueError(f"Shape of 'count' must be (1), but is {v.shape} instead.")
if "image" in fkey and k != "count" and v.shape != (3, 1, 1):
raise ValueError(f"Shape of '{k}' must be (3,1,1), but is {v.shape} instead.")
def compute_stats(dataset, batch_size=32, num_workers=16, max_num_samples=None):
"""Compute mean/std and min/max statistics of all data keys in a LeRobotDataset."""
if max_num_samples is None:
max_num_samples = len(dataset)
# for more info on why we need to set the same number of workers, see `load_from_videos`
stats_patterns = get_stats_einops_patterns(dataset, num_workers)
# mean and std will be computed incrementally while max and min will track the running value.
mean, std, max, min = {}, {}, {}, {}
for key in stats_patterns:
mean[key] = torch.tensor(0.0).float()
std[key] = torch.tensor(0.0).float()
max[key] = torch.tensor(-float("inf")).float()
min[key] = torch.tensor(float("inf")).float()
def create_seeded_dataloader(dataset, batch_size, seed):
generator = torch.Generator()
generator.manual_seed(seed)
dataloader = torch.utils.data.DataLoader(
dataset,
num_workers=num_workers,
batch_size=batch_size,
shuffle=True,
drop_last=False,
generator=generator,
)
return dataloader
# Note: Due to be refactored soon. The point of storing `first_batch` is to make sure we don't get
# surprises when rerunning the sampler.
first_batch = None
running_item_count = 0 # for online mean computation
dataloader = create_seeded_dataloader(dataset, batch_size, seed=1337)
for i, batch in enumerate(
tqdm.tqdm(dataloader, total=ceil(max_num_samples / batch_size), desc="Compute mean, min, max")
):
this_batch_size = len(batch["index"])
running_item_count += this_batch_size
if first_batch is None:
first_batch = deepcopy(batch)
for key, pattern in stats_patterns.items():
batch[key] = batch[key].float()
# Numerically stable update step for mean computation.
batch_mean = einops.reduce(batch[key], pattern, "mean")
# Hint: to update the mean we need x̄ₙ = (Nₙ₋₁x̄ₙ₋₁ + Bₙxₙ) / Nₙ, where the subscript represents
# the update step, N is the running item count, B is this batch size, x̄ is the running mean,
# and x is the current batch mean. Some rearrangement is then required to avoid risking
# numerical overflow. Another hint: Nₙ₋₁ = Nₙ - Bₙ. Rearrangement yields
# x̄ₙ = x̄ₙ₋₁ + Bₙ * (xₙ - x̄ₙ₋₁) / Nₙ
mean[key] = mean[key] + this_batch_size * (batch_mean - mean[key]) / running_item_count
max[key] = torch.maximum(max[key], einops.reduce(batch[key], pattern, "max"))
min[key] = torch.minimum(min[key], einops.reduce(batch[key], pattern, "min"))
if i == ceil(max_num_samples / batch_size) - 1:
break
first_batch_ = None
running_item_count = 0 # for online std computation
dataloader = create_seeded_dataloader(dataset, batch_size, seed=1337)
for i, batch in enumerate(
tqdm.tqdm(dataloader, total=ceil(max_num_samples / batch_size), desc="Compute std")
):
this_batch_size = len(batch["index"])
running_item_count += this_batch_size
# Sanity check to make sure the batches are still in the same order as before.
if first_batch_ is None:
first_batch_ = deepcopy(batch)
for key in stats_patterns:
assert torch.equal(first_batch_[key], first_batch[key])
for key, pattern in stats_patterns.items():
batch[key] = batch[key].float()
# Numerically stable update step for mean computation (where the mean is over squared
# residuals).See notes in the mean computation loop above.
batch_std = einops.reduce((batch[key] - mean[key]) ** 2, pattern, "mean")
std[key] = std[key] + this_batch_size * (batch_std - std[key]) / running_item_count
if i == ceil(max_num_samples / batch_size) - 1:
break
for key in stats_patterns:
std[key] = torch.sqrt(std[key])
stats = {}
for key in stats_patterns:
stats[key] = {
"mean": mean[key],
"std": std[key],
"max": max[key],
"min": min[key],
}
return stats
def aggregate_feature_stats(stats_ft_list: list[dict[str, dict]]) -> dict[str, dict[str, np.ndarray]]:
"""Aggregates stats for a single feature."""
means = np.stack([s["mean"] for s in stats_ft_list])
variances = np.stack([s["std"] ** 2 for s in stats_ft_list])
counts = np.stack([s["count"] for s in stats_ft_list])
total_count = counts.sum(axis=0)
def aggregate_stats(ls_datasets) -> dict[str, torch.Tensor]:
"""Aggregate stats of multiple LeRobot datasets into one set of stats without recomputing from scratch.
# Prepare weighted mean by matching number of dimensions
while counts.ndim < means.ndim:
counts = np.expand_dims(counts, axis=-1)
The final stats will have the union of all data keys from each of the datasets.
# Compute the weighted mean
weighted_means = means * counts
total_mean = weighted_means.sum(axis=0) / total_count
# Compute the variance using the parallel algorithm
delta_means = means - total_mean
weighted_variances = (variances + delta_means**2) * counts
total_variance = weighted_variances.sum(axis=0) / total_count
return {
"min": np.min(np.stack([s["min"] for s in stats_ft_list]), axis=0),
"max": np.max(np.stack([s["max"] for s in stats_ft_list]), axis=0),
"mean": total_mean,
"std": np.sqrt(total_variance),
"count": total_count,
}
def aggregate_stats(stats_list: list[dict[str, dict]]) -> dict[str, dict[str, np.ndarray]]:
"""Aggregate stats from multiple compute_stats outputs into a single set of stats.
The final stats will have the union of all data keys from each of the stats dicts.
For instance:
- new_min = min(min_dataset_0, min_dataset_1, ...)
The final stats will have the union of all data keys from each of the datasets. For instance:
- new_max = max(max_dataset_0, max_dataset_1, ...)
- new_mean = (mean of all data, weighted by counts)
- new_min = min(min_dataset_0, min_dataset_1, ...)
- new_mean = (mean of all data)
- new_std = (std of all data)
"""
_assert_type_and_shape(stats_list)
data_keys = {key for stats in stats_list for key in stats}
aggregated_stats = {key: {} for key in data_keys}
for key in data_keys:
stats_with_key = [stats[key] for stats in stats_list if key in stats]
aggregated_stats[key] = aggregate_feature_stats(stats_with_key)
return aggregated_stats
data_keys = set()
for dataset in ls_datasets:
data_keys.update(dataset.stats.keys())
stats = {k: {} for k in data_keys}
for data_key in data_keys:
for stat_key in ["min", "max"]:
# compute `max(dataset_0["max"], dataset_1["max"], ...)`
stats[data_key][stat_key] = einops.reduce(
torch.stack([d.stats[data_key][stat_key] for d in ls_datasets if data_key in d.stats], dim=0),
"n ... -> ...",
stat_key,
)
total_samples = sum(d.num_samples for d in ls_datasets if data_key in d.stats)
# Compute the "sum" statistic by multiplying each mean by the number of samples in the respective
# dataset, then divide by total_samples to get the overall "mean".
# NOTE: the brackets around (d.num_samples / total_samples) are needed tor minimize the risk of
# numerical overflow!
stats[data_key]["mean"] = sum(
d.stats[data_key]["mean"] * (d.num_samples / total_samples)
for d in ls_datasets
if data_key in d.stats
)
# The derivation for standard deviation is a little more involved but is much in the same spirit as
# the computation of the mean.
# Given two sets of data where the statistics are known:
# σ_combined = sqrt[ (n1 * (σ1^2 + d1^2) + n2 * (σ2^2 + d2^2)) / (n1 + n2) ]
# where d1 = μ1 - μ_combined, d2 = μ2 - μ_combined
# NOTE: the brackets around (d.num_samples / total_samples) are needed tor minimize the risk of
# numerical overflow!
stats[data_key]["std"] = torch.sqrt(
sum(
(d.stats[data_key]["std"] ** 2 + (d.stats[data_key]["mean"] - stats[data_key]["mean"]) ** 2)
* (d.num_samples / total_samples)
for d in ls_datasets
if data_key in d.stats
)
)
return stats

View File

@@ -14,105 +14,81 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pprint import pformat
import torch
from omegaconf import ListConfig, OmegaConf
from lerobot.common.datasets.lerobot_dataset import (
LeRobotDataset,
LeRobotDatasetMetadata,
MultiLeRobotDataset,
)
from lerobot.common.datasets.transforms import ImageTransforms
from lerobot.configs.policies import PreTrainedConfig
from lerobot.configs.train import TrainPipelineConfig
IMAGENET_STATS = {
"mean": [[[0.485]], [[0.456]], [[0.406]]], # (c,1,1)
"std": [[[0.229]], [[0.224]], [[0.225]]], # (c,1,1)
}
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset, MultiLeRobotDataset
def resolve_delta_timestamps(
cfg: PreTrainedConfig, ds_meta: LeRobotDatasetMetadata
) -> dict[str, list] | None:
"""Resolves delta_timestamps by reading from the 'delta_indices' properties of the PreTrainedConfig.
def resolve_delta_timestamps(cfg):
"""Resolves delta_timestamps config key (in-place) by using `eval`.
Args:
cfg (PreTrainedConfig): The PreTrainedConfig to read delta_indices from.
ds_meta (LeRobotDatasetMetadata): The dataset from which features and fps are used to build
delta_timestamps against.
Returns:
dict[str, list] | None: A dictionary of delta_timestamps, e.g.:
{
"observation.state": [-0.04, -0.02, 0]
"observation.action": [-0.02, 0, 0.02]
}
returns `None` if the resulting dict is empty.
Doesn't do anything if delta_timestamps is not specified or has already been resolve (as evidenced by
the data type of its values).
"""
delta_timestamps = {}
for key in ds_meta.features:
if key == "next.reward" and cfg.reward_delta_indices is not None:
delta_timestamps[key] = [i / ds_meta.fps for i in cfg.reward_delta_indices]
if key == "action" and cfg.action_delta_indices is not None:
delta_timestamps[key] = [i / ds_meta.fps for i in cfg.action_delta_indices]
if key.startswith("observation.") and cfg.observation_delta_indices is not None:
delta_timestamps[key] = [i / ds_meta.fps for i in cfg.observation_delta_indices]
if len(delta_timestamps) == 0:
delta_timestamps = None
return delta_timestamps
delta_timestamps = cfg.training.get("delta_timestamps")
if delta_timestamps is not None:
for key in delta_timestamps:
if isinstance(delta_timestamps[key], str):
# TODO(rcadene, alexander-soare): remove `eval` to avoid exploit
cfg.training.delta_timestamps[key] = eval(delta_timestamps[key])
def make_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset | MultiLeRobotDataset:
"""Handles the logic of setting up delta timestamps and image transforms before creating a dataset.
Args:
cfg (TrainPipelineConfig): A TrainPipelineConfig config which contains a DatasetConfig and a PreTrainedConfig.
Raises:
NotImplementedError: The MultiLeRobotDataset is currently deactivated.
Returns:
LeRobotDataset | MultiLeRobotDataset
def make_dataset(cfg, split: str = "train") -> LeRobotDataset | MultiLeRobotDataset:
"""
image_transforms = (
ImageTransforms(cfg.dataset.image_transforms) if cfg.dataset.image_transforms.enable else None
)
if isinstance(cfg.dataset.repo_id, str):
ds_meta = LeRobotDatasetMetadata(
cfg.dataset.repo_id, root=cfg.dataset.root, revision=cfg.dataset.revision
Args:
cfg: A Hydra config as per the LeRobot config scheme.
split: Select the data subset used to create an instance of LeRobotDataset.
All datasets hosted on [lerobot](https://huggingface.co/lerobot) contain only one subset: "train".
Thus, by default, `split="train"` selects all the available data. `split` aims to work like the
slicer in the hugging face datasets:
https://huggingface.co/docs/datasets/v2.19.0/loading#slice-splits
As of now, it only supports `split="train[:n]"` to load the first n frames of the dataset or
`split="train[n:]"` to load the last n frames. For instance `split="train[:1000]"`.
Returns:
The LeRobotDataset.
"""
if not isinstance(cfg.dataset_repo_id, (str, ListConfig)):
raise ValueError(
"Expected cfg.dataset_repo_id to be either a single string to load one dataset or a list of "
"strings to load multiple datasets."
)
delta_timestamps = resolve_delta_timestamps(cfg.policy, ds_meta)
# A soft check to warn if the environment matches the dataset. Don't check if we are using a real world env (dora).
if not cfg.env.real_world:
if isinstance(cfg.dataset_repo_id, str):
dataset_repo_ids = [cfg.dataset_repo_id] # single dataset
else:
dataset_repo_ids = cfg.dataset_repo_id # multiple datasets
for dataset_repo_id in dataset_repo_ids:
if cfg.env.name not in dataset_repo_id:
logging.warning(
f"There might be a mismatch between your training dataset ({dataset_repo_id=}) and your "
f"environment ({cfg.env.name=})."
)
resolve_delta_timestamps(cfg)
# TODO(rcadene): add data augmentations
if isinstance(cfg.dataset_repo_id, str):
dataset = LeRobotDataset(
cfg.dataset.repo_id,
root=cfg.dataset.root,
episodes=cfg.dataset.episodes,
delta_timestamps=delta_timestamps,
image_transforms=image_transforms,
revision=cfg.dataset.revision,
video_backend=cfg.dataset.video_backend,
cfg.dataset_repo_id,
split=split,
delta_timestamps=cfg.training.get("delta_timestamps"),
)
else:
raise NotImplementedError("The MultiLeRobotDataset isn't supported for now.")
dataset = MultiLeRobotDataset(
cfg.dataset.repo_id,
# TODO(aliberts): add proper support for multi dataset
# delta_timestamps=delta_timestamps,
image_transforms=image_transforms,
video_backend=cfg.dataset.video_backend,
)
logging.info(
"Multiple datasets were provided. Applied the following index mapping to the provided datasets: "
f"{pformat(dataset.repo_id_to_index, indent=2)}"
cfg.dataset_repo_id, split=split, delta_timestamps=cfg.training.get("delta_timestamps")
)
if cfg.dataset.use_imagenet_stats:
for key in dataset.meta.camera_keys:
for stats_type, stats in IMAGENET_STATS.items():
dataset.meta.stats[key][stats_type] = torch.tensor(stats, dtype=torch.float32)
if cfg.get("override_dataset_stats"):
for key, stats_dict in cfg.override_dataset_stats.items():
for stats_type, listconfig in stats_dict.items():
# example of stats_type: min, max, mean, std
stats = OmegaConf.to_container(listconfig, resolve=True)
dataset.stats[key][stats_type] = torch.tensor(stats, dtype=torch.float32)
return dataset

View File

@@ -1,178 +0,0 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import multiprocessing
import queue
import threading
from pathlib import Path
import numpy as np
import PIL.Image
import torch
def safe_stop_image_writer(func):
def wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except Exception as e:
dataset = kwargs.get("dataset")
image_writer = getattr(dataset, "image_writer", None) if dataset else None
if image_writer is not None:
print("Waiting for image writer to terminate...")
image_writer.stop()
raise e
return wrapper
def image_array_to_pil_image(image_array: np.ndarray, range_check: bool = True) -> PIL.Image.Image:
# TODO(aliberts): handle 1 channel and 4 for depth images
if image_array.ndim != 3:
raise ValueError(f"The array has {image_array.ndim} dimensions, but 3 is expected for an image.")
if image_array.shape[0] == 3:
# Transpose from pytorch convention (C, H, W) to (H, W, C)
image_array = image_array.transpose(1, 2, 0)
elif image_array.shape[-1] != 3:
raise NotImplementedError(
f"The image has {image_array.shape[-1]} channels, but 3 is required for now."
)
if image_array.dtype != np.uint8:
if range_check:
max_ = image_array.max().item()
min_ = image_array.min().item()
if max_ > 1.0 or min_ < 0.0:
raise ValueError(
"The image data type is float, which requires values in the range [0.0, 1.0]. "
f"However, the provided range is [{min_}, {max_}]. Please adjust the range or "
"provide a uint8 image with values in the range [0, 255]."
)
image_array = (image_array * 255).astype(np.uint8)
return PIL.Image.fromarray(image_array)
def write_image(image: np.ndarray | PIL.Image.Image, fpath: Path):
try:
if isinstance(image, np.ndarray):
img = image_array_to_pil_image(image)
elif isinstance(image, PIL.Image.Image):
img = image
else:
raise TypeError(f"Unsupported image type: {type(image)}")
img.save(fpath)
except Exception as e:
print(f"Error writing image {fpath}: {e}")
def worker_thread_loop(queue: queue.Queue):
while True:
item = queue.get()
if item is None:
queue.task_done()
break
image_array, fpath = item
write_image(image_array, fpath)
queue.task_done()
def worker_process(queue: queue.Queue, num_threads: int):
threads = []
for _ in range(num_threads):
t = threading.Thread(target=worker_thread_loop, args=(queue,))
t.daemon = True
t.start()
threads.append(t)
for t in threads:
t.join()
class AsyncImageWriter:
"""
This class abstract away the initialisation of processes or/and threads to
save images on disk asynchrounously, which is critical to control a robot and record data
at a high frame rate.
When `num_processes=0`, it creates a threads pool of size `num_threads`.
When `num_processes>0`, it creates processes pool of size `num_processes`, where each subprocess starts
their own threads pool of size `num_threads`.
The optimal number of processes and threads depends on your computer capabilities.
We advise to use 4 threads per camera with 0 processes. If the fps is not stable, try to increase or lower
the number of threads. If it is still not stable, try to use 1 subprocess, or more.
"""
def __init__(self, num_processes: int = 0, num_threads: int = 1):
self.num_processes = num_processes
self.num_threads = num_threads
self.queue = None
self.threads = []
self.processes = []
self._stopped = False
if num_threads <= 0 and num_processes <= 0:
raise ValueError("Number of threads and processes must be greater than zero.")
if self.num_processes == 0:
# Use threading
self.queue = queue.Queue()
for _ in range(self.num_threads):
t = threading.Thread(target=worker_thread_loop, args=(self.queue,))
t.daemon = True
t.start()
self.threads.append(t)
else:
# Use multiprocessing
self.queue = multiprocessing.JoinableQueue()
for _ in range(self.num_processes):
p = multiprocessing.Process(target=worker_process, args=(self.queue, self.num_threads))
p.daemon = True
p.start()
self.processes.append(p)
def save_image(self, image: torch.Tensor | np.ndarray | PIL.Image.Image, fpath: Path):
if isinstance(image, torch.Tensor):
# Convert tensor to numpy array to minimize main process time
image = image.cpu().numpy()
self.queue.put((image, fpath))
def wait_until_done(self):
self.queue.join()
def stop(self):
if self._stopped:
return
if self.num_processes == 0:
for _ in self.threads:
self.queue.put(None)
for t in self.threads:
t.join()
else:
num_nones = self.num_processes * self.num_threads
for _ in range(num_nones):
self.queue.put(None)
for p in self.processes:
p.join()
if p.is_alive():
p.terminate()
self.queue.close()
self.queue.join_thread()
self._stopped = True

File diff suppressed because it is too large Load Diff

View File

@@ -1,384 +0,0 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""An online buffer for the online training loop in train.py
Note to maintainers: This duplicates some logic from LeRobotDataset and EpisodeAwareSampler. We should
consider converging to one approach. Here we have opted to use numpy.memmap to back the data buffer. It's much
faster than using HuggingFace Datasets as there's no conversion to an intermediate non-python object. Also it
supports in-place slicing and mutation which is very handy for a dynamic buffer.
"""
import os
from pathlib import Path
from typing import Any
import numpy as np
import torch
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
def _make_memmap_safe(**kwargs) -> np.memmap:
"""Make a numpy memmap with checks on available disk space first.
Expected kwargs are: "filename", "dtype" (must by np.dtype), "mode" and "shape"
For information on dtypes:
https://numpy.org/doc/stable/reference/arrays.dtypes.html#arrays-dtypes-constructing
"""
if kwargs["mode"].startswith("w"):
required_space = kwargs["dtype"].itemsize * np.prod(kwargs["shape"]) # bytes
stats = os.statvfs(Path(kwargs["filename"]).parent)
available_space = stats.f_bavail * stats.f_frsize # bytes
if required_space >= available_space * 0.8:
raise RuntimeError(
f"You're about to take up {required_space} of {available_space} bytes available."
)
return np.memmap(**kwargs)
class OnlineBuffer(torch.utils.data.Dataset):
"""FIFO data buffer for the online training loop in train.py.
Follows the protocol of LeRobotDataset as much as is required to have it be used by the online training
loop in the same way that a LeRobotDataset would be used.
The underlying data structure will have data inserted in a circular fashion. Always insert after the
last index, and when you reach the end, wrap around to the start.
The data is stored in a numpy memmap.
"""
NEXT_INDEX_KEY = "_next_index"
OCCUPANCY_MASK_KEY = "_occupancy_mask"
INDEX_KEY = "index"
FRAME_INDEX_KEY = "frame_index"
EPISODE_INDEX_KEY = "episode_index"
TIMESTAMP_KEY = "timestamp"
IS_PAD_POSTFIX = "_is_pad"
def __init__(
self,
write_dir: str | Path,
data_spec: dict[str, Any] | None,
buffer_capacity: int | None,
fps: float | None = None,
delta_timestamps: dict[str, list[float]] | dict[str, np.ndarray] | None = None,
):
"""
The online buffer can be provided from scratch or you can load an existing online buffer by passing
a `write_dir` associated with an existing buffer.
Args:
write_dir: Where to keep the numpy memmap files. One memmap file will be stored for each data key.
Note that if the files already exist, they are opened in read-write mode (used for training
resumption.)
data_spec: A mapping from data key to data specification, like {data_key: {"shape": tuple[int],
"dtype": np.dtype}}. This should include all the data that you wish to record into the buffer,
but note that "index", "frame_index" and "episode_index" are already accounted for by this
class, so you don't need to include them.
buffer_capacity: How many frames should be stored in the buffer as a maximum. Be aware of your
system's available disk space when choosing this.
fps: Same as the fps concept in LeRobot dataset. Here it needs to be provided for the
delta_timestamps logic. You can pass None if you are not using delta_timestamps.
delta_timestamps: Same as the delta_timestamps concept in LeRobotDataset. This is internally
converted to dict[str, np.ndarray] for optimization purposes.
"""
self.set_delta_timestamps(delta_timestamps)
self._fps = fps
# Tolerance in seconds used to discard loaded frames when their timestamps are not close enough from
# the requested frames. It is only used when `delta_timestamps` is provided.
# minus 1e-4 to account for possible numerical error
self.tolerance_s = 1 / self.fps - 1e-4 if fps is not None else None
self._buffer_capacity = buffer_capacity
data_spec = self._make_data_spec(data_spec, buffer_capacity)
Path(write_dir).mkdir(parents=True, exist_ok=True)
self._data = {}
for k, v in data_spec.items():
self._data[k] = _make_memmap_safe(
filename=Path(write_dir) / k,
dtype=v["dtype"] if v is not None else None,
mode="r+" if (Path(write_dir) / k).exists() else "w+",
shape=tuple(v["shape"]) if v is not None else None,
)
@property
def delta_timestamps(self) -> dict[str, np.ndarray] | None:
return self._delta_timestamps
def set_delta_timestamps(self, value: dict[str, list[float]] | None):
"""Set delta_timestamps converting the values to numpy arrays.
The conversion is for an optimization in the __getitem__. The loop is much slower if the arrays
need to be converted into numpy arrays.
"""
if value is not None:
self._delta_timestamps = {k: np.array(v) for k, v in value.items()}
else:
self._delta_timestamps = None
def _make_data_spec(self, data_spec: dict[str, Any], buffer_capacity: int) -> dict[str, dict[str, Any]]:
"""Makes the data spec for np.memmap."""
if any(k.startswith("_") for k in data_spec):
raise ValueError(
"data_spec keys should not start with '_'. This prefix is reserved for internal logic."
)
preset_keys = {
OnlineBuffer.INDEX_KEY,
OnlineBuffer.FRAME_INDEX_KEY,
OnlineBuffer.EPISODE_INDEX_KEY,
OnlineBuffer.TIMESTAMP_KEY,
}
if len(intersection := set(data_spec).intersection(preset_keys)) > 0:
raise ValueError(
f"data_spec should not contain any of {preset_keys} as these are handled internally. "
f"The provided data_spec has {intersection}."
)
complete_data_spec = {
# _next_index will be a pointer to the next index that we should start filling from when we add
# more data.
OnlineBuffer.NEXT_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": ()},
# Since the memmap is initialized with all-zeros, this keeps track of which indices are occupied
# with real data rather than the dummy initialization.
OnlineBuffer.OCCUPANCY_MASK_KEY: {"dtype": np.dtype("?"), "shape": (buffer_capacity,)},
OnlineBuffer.INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (buffer_capacity,)},
OnlineBuffer.FRAME_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (buffer_capacity,)},
OnlineBuffer.EPISODE_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (buffer_capacity,)},
OnlineBuffer.TIMESTAMP_KEY: {"dtype": np.dtype("float64"), "shape": (buffer_capacity,)},
}
for k, v in data_spec.items():
complete_data_spec[k] = {"dtype": v["dtype"], "shape": (buffer_capacity, *v["shape"])}
return complete_data_spec
def add_data(self, data: dict[str, np.ndarray]):
"""Add new data to the buffer, which could potentially mean shifting old data out.
The new data should contain all the frames (in order) of any number of episodes. The indices should
start from 0 (note to the developer: this can easily be generalized). See the `rollout` and
`eval_policy` functions in `eval.py` for more information on how the data is constructed.
Shift the incoming data index and episode_index to continue on from the last frame. Note that this
will be done in place!
"""
if len(missing_keys := (set(self.data_keys).difference(set(data)))) > 0:
raise ValueError(f"Missing data keys: {missing_keys}")
new_data_length = len(data[self.data_keys[0]])
if not all(len(data[k]) == new_data_length for k in self.data_keys):
raise ValueError("All data items should have the same length")
next_index = self._data[OnlineBuffer.NEXT_INDEX_KEY]
# Sanity check to make sure that the new data indices start from 0.
assert data[OnlineBuffer.EPISODE_INDEX_KEY][0].item() == 0
assert data[OnlineBuffer.INDEX_KEY][0].item() == 0
# Shift the incoming indices if necessary.
if self.num_frames > 0:
last_episode_index = self._data[OnlineBuffer.EPISODE_INDEX_KEY][next_index - 1]
last_data_index = self._data[OnlineBuffer.INDEX_KEY][next_index - 1]
data[OnlineBuffer.EPISODE_INDEX_KEY] += last_episode_index + 1
data[OnlineBuffer.INDEX_KEY] += last_data_index + 1
# Insert the new data starting from next_index. It may be necessary to wrap around to the start.
n_surplus = max(0, new_data_length - (self._buffer_capacity - next_index))
for k in self.data_keys:
if n_surplus == 0:
slc = slice(next_index, next_index + new_data_length)
self._data[k][slc] = data[k]
self._data[OnlineBuffer.OCCUPANCY_MASK_KEY][slc] = True
else:
self._data[k][next_index:] = data[k][:-n_surplus]
self._data[OnlineBuffer.OCCUPANCY_MASK_KEY][next_index:] = True
self._data[k][:n_surplus] = data[k][-n_surplus:]
if n_surplus == 0:
self._data[OnlineBuffer.NEXT_INDEX_KEY] = next_index + new_data_length
else:
self._data[OnlineBuffer.NEXT_INDEX_KEY] = n_surplus
@property
def data_keys(self) -> list[str]:
keys = set(self._data)
keys.remove(OnlineBuffer.OCCUPANCY_MASK_KEY)
keys.remove(OnlineBuffer.NEXT_INDEX_KEY)
return sorted(keys)
@property
def fps(self) -> float | None:
return self._fps
@property
def num_episodes(self) -> int:
return len(
np.unique(self._data[OnlineBuffer.EPISODE_INDEX_KEY][self._data[OnlineBuffer.OCCUPANCY_MASK_KEY]])
)
@property
def num_frames(self) -> int:
return np.count_nonzero(self._data[OnlineBuffer.OCCUPANCY_MASK_KEY])
def __len__(self):
return self.num_frames
def _item_to_tensors(self, item: dict) -> dict:
item_ = {}
for k, v in item.items():
if isinstance(v, torch.Tensor):
item_[k] = v
elif isinstance(v, np.ndarray):
item_[k] = torch.from_numpy(v)
else:
item_[k] = torch.tensor(v)
return item_
def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
if idx >= len(self) or idx < -len(self):
raise IndexError
item = {k: v[idx] for k, v in self._data.items() if not k.startswith("_")}
if self.delta_timestamps is None:
return self._item_to_tensors(item)
episode_index = item[OnlineBuffer.EPISODE_INDEX_KEY]
current_ts = item[OnlineBuffer.TIMESTAMP_KEY]
episode_data_indices = np.where(
np.bitwise_and(
self._data[OnlineBuffer.EPISODE_INDEX_KEY] == episode_index,
self._data[OnlineBuffer.OCCUPANCY_MASK_KEY],
)
)[0]
episode_timestamps = self._data[OnlineBuffer.TIMESTAMP_KEY][episode_data_indices]
for data_key in self.delta_timestamps:
# Note: The logic in this loop is copied from `load_previous_and_future_frames`.
# Get timestamps used as query to retrieve data of previous/future frames.
query_ts = current_ts + self.delta_timestamps[data_key]
# Compute distances between each query timestamp and all timestamps of all the frames belonging to
# the episode.
dist = np.abs(query_ts[:, None] - episode_timestamps[None, :])
argmin_ = np.argmin(dist, axis=1)
min_ = dist[np.arange(dist.shape[0]), argmin_]
is_pad = min_ > self.tolerance_s
# Check violated query timestamps are all outside the episode range.
assert (
(query_ts[is_pad] < episode_timestamps[0]) | (episode_timestamps[-1] < query_ts[is_pad])
).all(), (
f"One or several timestamps unexpectedly violate the tolerance ({min_} > {self.tolerance_s=}"
") inside the episode range."
)
# Load frames for this data key.
item[data_key] = self._data[data_key][episode_data_indices[argmin_]]
item[f"{data_key}{OnlineBuffer.IS_PAD_POSTFIX}"] = is_pad
return self._item_to_tensors(item)
def get_data_by_key(self, key: str) -> torch.Tensor:
"""Returns all data for a given data key as a Tensor."""
return torch.from_numpy(self._data[key][self._data[OnlineBuffer.OCCUPANCY_MASK_KEY]])
def compute_sampler_weights(
offline_dataset: LeRobotDataset,
offline_drop_n_last_frames: int = 0,
online_dataset: OnlineBuffer | None = None,
online_sampling_ratio: float | None = None,
online_drop_n_last_frames: int = 0,
) -> torch.Tensor:
"""Compute the sampling weights for the online training dataloader in train.py.
Args:
offline_dataset: The LeRobotDataset used for offline pre-training.
online_drop_n_last_frames: Number of frames to drop from the end of each offline dataset episode.
online_dataset: The OnlineBuffer used in online training.
online_sampling_ratio: The proportion of data that should be sampled from the online dataset. If an
online dataset is provided, this value must also be provided.
online_drop_n_first_frames: See `offline_drop_n_last_frames`. This is the same, but for the online
dataset.
Returns:
Tensor of weights for [offline_dataset; online_dataset], normalized to 1.
Notes to maintainers:
- This duplicates some logic from EpisodeAwareSampler. We should consider converging to one approach.
- When used with `torch.utils.data.WeightedRandomSampler`, it could completely replace
`EpisodeAwareSampler` as the online dataset related arguments are optional. The only missing feature
is the ability to turn shuffling off.
- Options `drop_first_n_frames` and `episode_indices_to_use` can be added easily. They were not
included here to avoid adding complexity.
"""
if len(offline_dataset) == 0 and (online_dataset is None or len(online_dataset) == 0):
raise ValueError("At least one of `offline_dataset` or `online_dataset` should be contain data.")
if (online_dataset is None) ^ (online_sampling_ratio is None):
raise ValueError(
"`online_dataset` and `online_sampling_ratio` must be provided together or not at all."
)
offline_sampling_ratio = 0 if online_sampling_ratio is None else 1 - online_sampling_ratio
weights = []
if len(offline_dataset) > 0:
offline_data_mask_indices = []
for start_index, end_index in zip(
offline_dataset.episode_data_index["from"],
offline_dataset.episode_data_index["to"],
strict=True,
):
offline_data_mask_indices.extend(
range(start_index.item(), end_index.item() - offline_drop_n_last_frames)
)
offline_data_mask = torch.zeros(len(offline_dataset), dtype=torch.bool)
offline_data_mask[torch.tensor(offline_data_mask_indices)] = True
weights.append(
torch.full(
size=(len(offline_dataset),),
fill_value=offline_sampling_ratio / offline_data_mask.sum(),
)
* offline_data_mask
)
if online_dataset is not None and len(online_dataset) > 0:
online_data_mask_indices = []
episode_indices = online_dataset.get_data_by_key("episode_index")
for episode_idx in torch.unique(episode_indices):
where_episode = torch.where(episode_indices == episode_idx)
start_index = where_episode[0][0]
end_index = where_episode[0][-1] + 1
online_data_mask_indices.extend(
range(start_index.item(), end_index.item() - online_drop_n_last_frames)
)
online_data_mask = torch.zeros(len(online_dataset), dtype=torch.bool)
online_data_mask[torch.tensor(online_data_mask_indices)] = True
weights.append(
torch.full(
size=(len(online_dataset),),
fill_value=online_sampling_ratio / online_data_mask.sum(),
)
* online_data_mask
)
weights = torch.cat(weights)
if weights.sum() == 0:
weights += 1 / len(weights)
else:
weights /= weights.sum()
return weights

View File

@@ -0,0 +1,85 @@
https://drive.google.com/file/d/1_SOJkgfP5yZyVjMhTt3nwhvyUjcnlI51/view?usp=drive_link
https://drive.google.com/file/d/1rmgN8UUzph1qwJnzG1d-uOafodn-gLvb/view?usp=drive_link
https://drive.google.com/file/d/1NYQ-XxsBVinB6dUoZmVWweT83367P3i2/view?usp=drive_link
https://drive.google.com/file/d/1oAv_j74zxxCJieMG7r5Vl2BeHK1__3s3/view?usp=drive_link
https://drive.google.com/file/d/1wFUJQROsrTJt64YRuIeExhFjr2wnK5uu/view?usp=drive_link
https://drive.google.com/file/d/1KzL3Tt0Le7jVl58XVRUcmigmXjyiuhbK/view?usp=drive_link
https://drive.google.com/file/d/1qy_YBladeHtianSSGtgAPSHtMin7msvf/view?usp=drive_link
https://drive.google.com/file/d/1rA_F0V_qL_nyuC_0aBKCisF4-0TIkF2Y/view?usp=drive_link
https://drive.google.com/file/d/1hw-8qMpz9VgSt62XoASqNRuPECpCwJQP/view?usp=drive_link
https://drive.google.com/file/d/1BpHOl9rKMzdvNGka6js7C0s40hH6vnDA/view?usp=drive_link
https://drive.google.com/file/d/1PazhkhiDnJ-OUMyDVDFxEZNKQQqHiNWS/view?usp=drive_link
https://drive.google.com/file/d/1lZ665R6ATl57dypxH4dGJ2NSt6XYnbuz/view?usp=drive_link
https://drive.google.com/file/d/1V9HzLaf-tlG15wUzT7KrTDCS_z1vi5NV/view?usp=drive_link
https://drive.google.com/file/d/1aKauWiXoKqbNwn_2xs4MrmLlaNYlVNmO/view?usp=drive_link
https://drive.google.com/file/d/1WVD5DFhriO1YmmOgiVHhacR6HWoTPxav/view?usp=drive_link
https://drive.google.com/file/d/1_X43WgeBAsfkhH9EmpyPki8U9joMeAGC/view?usp=drive_link
https://drive.google.com/file/d/1t8x0GqWoNKWtnBsB7_D40Z34nL9ak4kf/view?usp=drive_link
https://drive.google.com/file/d/15V_f26WaKOXjKnq2T3HRWAmtQUi4lbu2/view?usp=drive_link
https://drive.google.com/file/d/11VFIAsiSDsMOBANgrOcZBpKB9AFWnLy7/view?usp=drive_link
https://drive.google.com/file/d/1M0NS7vVaxJv3FHnuRYtdwTFYF7We4LxP/view?usp=drive_link
https://drive.google.com/file/d/1mR0OItTNqFnVLoczcyKYlm6drAy778lO/view?usp=drive_link
https://drive.google.com/file/d/1NbVFWDQAh-z4JJ4D-Zw6Lps9kdvpqh2j/view?usp=drive_link
https://drive.google.com/file/d/1JQoZGBzl4W3QG26-n39tefcGN0fDRMbB/view?usp=drive_link
https://drive.google.com/file/d/1VBjHl-TvZpncopvasIP5G9gecbB2a5f6/view?usp=drive_link
https://drive.google.com/file/d/1VzSf6zaB21nahm7MsPwroXbJ84NIwq0b/view?usp=drive_link
https://drive.google.com/file/d/1OtNnfMEydNtZOcivs4k6E_uJSpf8PkGy/view?usp=drive_link
https://drive.google.com/file/d/14nVvpvsrFr_03Pa_N7MKzwnRwibOUYM6/view?usp=drive_link
https://drive.google.com/file/d/1M8li6duiO2r3lv_9HhF_XJn0oZUIEK5F/view?usp=drive_link
https://drive.google.com/file/d/1Cpzea6fO14lxAaNfSBifqoa4ekhCiLD1/view?usp=drive_link
https://drive.google.com/file/d/1mbxRTm5vlbsY9UJ0jfjM6j9D7kPJjBpG/view?usp=drive_link
https://drive.google.com/file/d/1RXD1i6IfWsHRlCxVmG04h2h5Ycm_WwZN/view?usp=drive_link
https://drive.google.com/file/d/1QFqFSwDGOk1BkgGmqgCcc2BRWnJ6R3MA/view?usp=drive_link
https://drive.google.com/file/d/1bFqWR8DQM0ZUxxtS2bl-RANQvukeFLzp/view?usp=drive_link
https://drive.google.com/file/d/1pR-rH3yNGoyPdD4hJ6-3lXQ-PstBx9du/view?usp=drive_link
https://drive.google.com/file/d/107OAwLY-hva9HeQLIK7VCh-ytdDabVjr/view?usp=drive_link
https://drive.google.com/file/d/1Tpl08QOaSZ37GTO4awFWSdD8wBR9xdlT/view?usp=drive_link
https://drive.google.com/file/d/1MR164AOM-0S1T6RX8xKTV2IHyaCvpqAW/view?usp=drive_link
https://drive.google.com/file/d/1_wknJfVnStIhJ82lU_QtcrwahsqYIsr8/view?usp=drive_link
https://drive.google.com/file/d/1ZuEktWrbYkTx0l5pj3WiZ2CJrfbDOHNo/view?usp=drive_link
https://drive.google.com/file/d/15G_10hkkkq6yxvyI5NGZirlF-RzduR2F/view?usp=drive_link
https://drive.google.com/file/d/1DBKxg3ONqh7dhLuX6oh1Yyo2x383V1Hp/view?usp=drive_link
https://drive.google.com/file/d/1B5iDBkTUr5vopDddV_fHud18SqAHhauS/view?usp=drive_link
https://drive.google.com/file/d/1acwFV0eenRkki1QcjSKH5xqOtys-P3Pr/view?usp=drive_link
https://drive.google.com/file/d/1S47BI83xyrh-FKXsvAQqer98Biu_p8XK/view?usp=drive_link
https://drive.google.com/file/d/1JL6DmBZl3uyq9dyLfgSqtGF06e7E9JwM/view?usp=drive_link
https://drive.google.com/file/d/16WvRS4Kjog8Pxgr0E3sGGnI01YwL9Uql/view?usp=drive_link
https://drive.google.com/file/d/12ttGqL33IPWg0-s1SD44rr22M6LiSQBr/view?usp=drive_link
https://drive.google.com/file/d/1OyZqqnldTU_DliRbr6x0C4a_iWPwIN7j/view?usp=drive_link
https://drive.google.com/file/d/1oYk00IpLnR9fesLfD15Ebe7nVBffEbcS/view?usp=drive_link
https://drive.google.com/file/d/1eyE2-MQduCEqCd-5_kl5zsoOEERAzpZD/view?usp=drive_link
https://drive.google.com/file/d/1ir1Ya-vO0d97pfvbePlUeuKTTRc0qIMU/view?usp=drive_link
https://drive.google.com/file/d/1hOi-JnqlMt47gVnLZHMTqeojyYVErohl/view?usp=drive_link
https://drive.google.com/file/d/1NFFw5_PqigQ7xGqsL-MNq2B1r5yAscCf/view?usp=drive_link
https://drive.google.com/file/d/1uftq1-Zlh8d2sNLWrlVcKYQUwZTD7o24/view?usp=drive_link
https://drive.google.com/file/d/1-ax19dSLPacVgk000T-m3l4flPcg07pM/view?usp=drive_link
https://drive.google.com/file/d/126y-lgn86-ZmCz8hooF1THKJGGObw3OB/view?usp=drive_link
https://drive.google.com/file/d/1JiDniK0VmDIkk92AbBILb8J2Ba59PWML/view?usp=drive_link
https://drive.google.com/file/d/1kr8nPIRljiU0R4J9SMgj80o1FPQxzu9z/view?usp=drive_link
https://drive.google.com/file/d/1bbThWRij1pKBh_kFgV8FwK0sXtTHBoLX/view?usp=drive_link
https://drive.google.com/file/d/1WenzDW6lxk1xkOFm-OiGFfc0ROskAuKU/view?usp=drive_link
https://drive.google.com/file/d/1MiKRzuzUn1yN-k_6kPJJzIGy7dT-nnsD/view?usp=drive_link
https://drive.google.com/file/d/17rRg2tcmB-gNhQ0KoZJQmNfyFeoij1jH/view?usp=drive_link
https://drive.google.com/file/d/11mokBpvrY3ld6sY5WztREtJ1jgqfQV70/view?usp=drive_link
https://drive.google.com/file/d/1Il_6IOx9NDp1bX_KHizJfBwzTufTmn86/view?usp=drive_link
https://drive.google.com/file/d/1KswtJGsxJ7eeBDAmNA_aeLjOxcH6MIxa/view?usp=drive_link
https://drive.google.com/file/d/1gzMhi5uWu4C3Y6WbQ3L-08V96GxTZrRR/view?usp=drive_link
https://drive.google.com/file/d/1nRQFtaBxfUCYc2W90Qibh0kHCt6YQCfc/view?usp=drive_link
https://drive.google.com/file/d/1vs-gyW-KheqHbUATwAhA2mmR9GOGw7f_/view?usp=drive_link
https://drive.google.com/file/d/1MuxzGOA2fgLaHryq82KkQumtuRJGcUOC/view?usp=drive_link
https://drive.google.com/file/d/1IIwxZnGlqrXLUXqG6yMO0r7uhCvhpk9e/view?usp=drive_link
https://drive.google.com/file/d/1vE7XPyaFcXP4DtTY5Y9WKIt7zWgmX-Cr/view?usp=drive_link
https://drive.google.com/file/d/1j-bIV09gr21RC3-x1N_pK4RPLV3fmWKz/view?usp=drive_link
https://drive.google.com/file/d/1t3nW1rD3S-EL0Oymb5U7ZAj5UMkydkln/view?usp=drive_link
https://drive.google.com/file/d/14hbfHCdMKtJZ41F9CQReMec2jeRFTOqR/view?usp=drive_link
https://drive.google.com/file/d/1x-hUyOSne5BW0AzQ3W6_Pf4g5yXQWi9M/view?usp=drive_link
https://drive.google.com/file/d/1sw9JqRg6E-3P84I3ZhzTrJMu0vuiaMmP/view?usp=drive_link
https://drive.google.com/file/d/1LuqhQlL4MGZhB_6THmkovRxrlP26BbdC/view?usp=drive_link
https://drive.google.com/file/d/15C5K6v_lkjnMSmUvVyqHQKwh2N166e7K/view?usp=drive_link
https://drive.google.com/file/d/1ns_9eSsQeeoZ10nlbkLy8tu0GmJFSnkt/view?usp=drive_link
https://drive.google.com/file/d/1NpzWJeK6CqjxzjIMYe6aYdX8xGsQwD4o/view?usp=drive_link
https://drive.google.com/file/d/1NMLezwufKJ9_8xTc9KQThSzVVD71B9Ui/view?usp=drive_link
https://drive.google.com/file/d/1aa71DCUqs6oXlIxX35jgsmsgm-NlDxPV/view?usp=drive_link
https://drive.google.com/file/d/1UJzkIZzAL0j-D5YQBnoq7mHvttASy12O/view?usp=drive_link
https://drive.google.com/file/d/1nPgx36HIJFb7oI94VbRzWjpPP2GANxzG/view?usp=drive_link
https://drive.google.com/file/d/1NovAP-KVJjqcuvWy3d6G4ptGGAIDqcCx/view?usp=drive_link

View File

@@ -0,0 +1,55 @@
https://drive.google.com/file/d/11M3Ye0r5agMaaicPbVGD0q2Hb3rGklbb/view?usp=drive_link
https://drive.google.com/file/d/1-tx7SvYYgSvXCvnf_EI2OVdwK-CkFY6S/view?usp=drive_link
https://drive.google.com/file/d/1EWJunmOpMHaU1hE106wwpbkGYcjQXYAF/view?usp=drive_link
https://drive.google.com/file/d/1IDn95Z7FSiCckrSENtGV4u3RyFHNQSDY/view?usp=drive_link
https://drive.google.com/file/d/1CwzvWj1i7QOtqrZvsCZ6BdZaKNDfpN32/view?usp=drive_link
https://drive.google.com/file/d/1HvAvlhm77nAD3Td24QPSeq8lw-Rl_aOh/view?usp=drive_link
https://drive.google.com/file/d/1t-suKYOPhXH666RpAYNRp2QU_DOy3AeM/view?usp=drive_link
https://drive.google.com/file/d/18xpKgWh7RWyjMN5PkLTOo-AxsAadAuRw/view?usp=drive_link
https://drive.google.com/file/d/1oci5Eto-ztv-AQNz8EnwZveBIhxvk-xJ/view?usp=drive_link
https://drive.google.com/file/d/1Y-t_4vxdE6NpHO0DLJR8f3mD0Q-Wj5-c/view?usp=drive_link
https://drive.google.com/file/d/1lylRqbbbB8bgtpsBWMPACmHJreuKmllv/view?usp=drive_link
https://drive.google.com/file/d/1yliSyMig_NXShWfQx6qyW7Ijf2Y5lFK6/view?usp=drive_link
https://drive.google.com/file/d/1XXhwJsJbeb7KXAooGvJapnm9bjnGUmxS/view?usp=drive_link
https://drive.google.com/file/d/1_xs1f3hW2JArKyvfF7UWubWjyROGTLs6/view?usp=drive_link
https://drive.google.com/file/d/1WVEHpr6EqKCZbkHapQSTXJq4xE4SWFT-/view?usp=drive_link
https://drive.google.com/file/d/1RqOHv9pEQGvW8NUA7ynffFmG999TL_Az/view?usp=drive_link
https://drive.google.com/file/d/1cu5AgD2gh-uA3PFJmzxxzNaF3qOSlYY1/view?usp=drive_link
https://drive.google.com/file/d/1SsrXqiPclNrnYToPZ9Uq-k3y0C4qdHT1/view?usp=drive_link
https://drive.google.com/file/d/1-J7EXf0vjkLIfSqT8ICEsP6CTjzSLBop/view?usp=drive_link
https://drive.google.com/file/d/11O7ewUmoZXfyyKjy_6B5RW4DpjICxqBT/view?usp=drive_link
https://drive.google.com/file/d/1iic44kZoCsjNsfAz2cMstZ9-WQvAhblF/view?usp=drive_link
https://drive.google.com/file/d/1yLV1lVX-2WnWQldGlnQZ0x7QBuDiVkL3/view?usp=drive_link
https://drive.google.com/file/d/1Tybp9ru98TTbGn4eyROpUQwDFuALWXmk/view?usp=drive_link
https://drive.google.com/file/d/13E9OTMiipVJByDs5-J19oWwAz7l94LTN/view?usp=drive_link
https://drive.google.com/file/d/1EeTpJQdMSliw4JzSMtJ6CyTvVdexjM4M/view?usp=drive_link
https://drive.google.com/file/d/1NHyNwoFqzeAu-1_PSpq5JfxaiD_xbpn9/view?usp=drive_link
https://drive.google.com/file/d/1fJcS0phDp4xm_FyGaJ5wr9Pe4KqtHaxD/view?usp=drive_link
https://drive.google.com/file/d/12AqrLUaewDPEcFRqPZeZFb_TQ0Lfi3At/view?usp=drive_link
https://drive.google.com/file/d/1x_hd4Qsq1oJS-aj2t3qM7WbbV7KZj05b/view?usp=drive_link
https://drive.google.com/file/d/14OUSUArmsB068hs6BuEIXQhI1Cyz8Sf0/view?usp=drive_link
https://drive.google.com/file/d/16zlzh1T5zeUJQnFf382NXkFEKEnDub4O/view?usp=drive_link
https://drive.google.com/file/d/1IbDltmN-NEFCNtr1TO4ILxEgQ94rtjWv/view?usp=drive_link
https://drive.google.com/file/d/15gmlf8Gx9455pZ1AlqcCSwh3nDPxMzSr/view?usp=drive_link
https://drive.google.com/file/d/1qHpRL1oZfIMo_vxnm8qfwQ-7l0BZIVva/view?usp=drive_link
https://drive.google.com/file/d/1H1xskIgiFZivkYn23rMzH3xePGOh3VTC/view?usp=drive_link
https://drive.google.com/file/d/1avls6Pv0kYiCMNVknbc1zQsgy64MUDMM/view?usp=drive_link
https://drive.google.com/file/d/1MmWVgCj5khc8KMIifmt3EzF1o-CtPyyn/view?usp=drive_link
https://drive.google.com/file/d/1U0kCc_xqW0WNppf4sbnK14euWKdPZtzB/view?usp=drive_link
https://drive.google.com/file/d/16CaEyQscOuhLj23PEGDTL9DeyNkohkMn/view?usp=drive_link
https://drive.google.com/file/d/1Iu8uM6UUJ0zW8tvN-9UiOe_4oSNzEutg/view?usp=drive_link
https://drive.google.com/file/d/1UImqiBaIxCR-1DNJaZhHqeHhaySOtVIr/view?usp=drive_link
https://drive.google.com/file/d/1VpU2V_leIoRIyv_lAvE7eLHBG8DxCTnp/view?usp=drive_link
https://drive.google.com/file/d/1_Q8J27OT3Xby7QY6yHvIJauFRWEMxkRm/view?usp=drive_link
https://drive.google.com/file/d/1bantmVo1L9Xz4tbiNw_a1UC2Z_HPO1wT/view?usp=drive_link
https://drive.google.com/file/d/1IRIXMJMCBDkBjbaHvAlEiBogSvZ1jK_3/view?usp=drive_link
https://drive.google.com/file/d/1mAHXKjiFbjwydypW2t5Lv8_H5x6nHegl/view?usp=drive_link
https://drive.google.com/file/d/1SfyY796fLrBCMY39OcyuxZafqSCRZPZk/view?usp=drive_link
https://drive.google.com/file/d/1X-44sZ8CcfzIskc0dvSx882o1yFhHaZB/view?usp=drive_link
https://drive.google.com/file/d/1BOIWCCCk6DLD4Bmvc75ZbbLi9AQm-1ao/view?usp=drive_link
https://drive.google.com/file/d/1RuyDtRE1kk76sw-wP8vx5SgLoPF3PA_H/view?usp=drive_link
https://drive.google.com/file/d/1c4eoQiBbGuy3CTAQDUSkd84Ponh1roAQ/view?usp=drive_link
https://drive.google.com/file/d/19PXB9z4Ljq6dsbf9TqcOrrP5SRbw2Tc_/view?usp=drive_link
https://drive.google.com/file/d/1nn1VVZVoIXWdYDozR7XHXE4mPLQG80PQ/view?usp=drive_link
https://drive.google.com/file/d/1MBdFGOKPV8GUhwoSsJ_Ky3qAMLM2Bv3K/view?usp=drive_link
https://drive.google.com/file/d/1of3k_M-7Nh3I1TndcWedxK4ca9dn8Sc5/view?usp=drive_link

View File

@@ -0,0 +1,20 @@
https://drive.google.com/file/d/12ctkOAdkCNGN1JLbZb5ww3XTBn2LFpGI/view?usp=drive_link
https://drive.google.com/file/d/1G_Vd46_4fq6O64gHHjUbJX5Ld44ZZx0y/view?usp=drive_link
https://drive.google.com/file/d/1uKgUy73B3xBogQAOUhfZjO0X5qZGsi2c/view?usp=drive_link
https://drive.google.com/file/d/1fu9cIrfI-fE2LhdGUxbx7-8Ci_PF8Ypm/view?usp=drive_link
https://drive.google.com/file/d/1Ygk9ZPJzx8xw2A9JF3NHbJ44TqnvSTQR/view?usp=drive_link
https://drive.google.com/file/d/18m5xPuccNsEB20WPshm3zhxmXc6k63ED/view?usp=drive_link
https://drive.google.com/file/d/1DiqqxC44rriviRQpqogcv0-EB-Y6nr9g/view?usp=drive_link
https://drive.google.com/file/d/1qPdaoTVDizJXkfXLioWU7iJ8hqCXSyOQ/view?usp=drive_link
https://drive.google.com/file/d/1Fj9kIA_mG7f67WFfACJEaZ7izcHG7vUm/view?usp=drive_link
https://drive.google.com/file/d/1WpYehZnI2P7dUdJPfkE-ij1rqCnjZEbB/view?usp=drive_link
https://drive.google.com/file/d/1_zwWkT4jPyzB38STWb6whlzsPzXmfA9r/view?usp=drive_link
https://drive.google.com/file/d/1U6-J4I_fPlSFFGfhZPxS5_YzKXwXIZYp/view?usp=drive_link
https://drive.google.com/file/d/1pRhxxcTfZp5tQo_EScvJUwfc3amiS6Vk/view?usp=drive_link
https://drive.google.com/file/d/1lWLntqra83RlYU_gN7Vostnfydf6gutd/view?usp=drive_link
https://drive.google.com/file/d/1vIBKo0x-NYEHV1FvRpco1lQMpRdAWAIL/view?usp=drive_link
https://drive.google.com/file/d/1pdrLV3JTQou_XH0Aap61Ssf60iVKm1jJ/view?usp=drive_link
https://drive.google.com/file/d/1QTsLoQ7SwmKdQHjBGVDaR2uTwfFwtrOf/view?usp=drive_link
https://drive.google.com/file/d/1Gytai8M_12J36GY6L_TulEcOC-035jwS/view?usp=drive_link
https://drive.google.com/file/d/14LJudNc629NT-i8xreXtzl27ce_DxOFJ/view?usp=drive_link
https://drive.google.com/file/d/1sBvPCODbzxGAI0S3lgN5cSG9Go3lRi00/view?usp=drive_link

View File

@@ -0,0 +1,18 @@
https://drive.google.com/file/d/1MJn9GbC8p9lN4gC9KDMLEkTkP_gGpXj0/view?usp=drive_link
https://drive.google.com/file/d/1-4LXgjl7ZCOgp-8GCJmFRD8OeqN5Jf7-/view?usp=drive_link
https://drive.google.com/file/d/1Ho06Ce0SPbqU3juaMxNUwAt3zCRLGC8W/view?usp=drive_link
https://drive.google.com/file/d/1ivHoj7_7olBSxH-Y8kqXEW7ttITK-45j/view?usp=drive_link
https://drive.google.com/file/d/1qjY4hM_IvZ8cq2II_n9MeJbvyeuN4oBP/view?usp=drive_link
https://drive.google.com/file/d/1rKVhO_f92-7sw13T8hTVrza3B9oAVgoy/view?usp=drive_link
https://drive.google.com/file/d/1pcLPHO8fBkc1-CRa88tyQtEueE4xiXNi/view?usp=drive_link
https://drive.google.com/file/d/1Vev_chCsIeEdvQ8poEYNsOJFGy_QU8kZ/view?usp=drive_link
https://drive.google.com/file/d/1l5G4zpRkxSLCQjvGPYSN4zfCvVRQuzMz/view?usp=drive_link
https://drive.google.com/file/d/14vgthE1eoakXkr2-DRw50E6lAqYOiUuE/view?usp=drive_link
https://drive.google.com/file/d/17nPSmKKmgQ2B7zkzWrZYiLM3RBuFod82/view?usp=drive_link
https://drive.google.com/file/d/1QcDsxplVvb_ID9BVrihl5FvlC-j7waXi/view?usp=drive_link
https://drive.google.com/file/d/18pEejBpI-eEVaWAAjBCyC0vgbX3T1Esj/view?usp=drive_link
https://drive.google.com/file/d/1H8eH6_IRODtEFT6WoM77ltR5OoOrqXmI/view?usp=drive_link
https://drive.google.com/file/d/1IWlpFRZhoxyG4nS13CWK4leZVk5wbNx4/view?usp=drive_link
https://drive.google.com/file/d/1PbZA8_OCGmMLxNP9xbkLRSChniL4uGxl/view?usp=drive_link
https://drive.google.com/file/d/1p9XAdmG2f_WeflNO4DIJ_tr1rK6M9B4B/view?usp=drive_link
https://drive.google.com/file/d/1nS59Et1cNAvKo3Y4SeSGRuZD5TvBbCF3/view?usp=drive_link

View File

@@ -0,0 +1 @@
https://drive.google.com/drive/folders/1S8eFg98IaGAIKVZ8QFWG1bx4mHa-O204

View File

@@ -0,0 +1,4 @@
https://drive.google.com/drive/folders/1tC_g1AJ8lglBLY-fjsQrG6DMBa3Ucp-0
https://drive.google.com/file/d/1fG_Yi2MJrFjiUVN3XoiWXLtTxHlwwaDv/view?usp=drive_link
https://drive.google.com/file/d/1WX32VWfzzX3Blmd06DRxLwFbMJfVe7P4/view?usp=drive_link
https://drive.google.com/file/d/18onsX3vXg3xkFwP5bVUCjdV4n9TRn0C9/view?usp=drive_link

View File

@@ -0,0 +1,3 @@
https://drive.google.com/drive/folders/1RgyD0JgTX30H4IM5XZn8I3zSV_mr8pyF
https://drive.google.com/file/d/18Cudl6nikDtgRolea7je8iF_gGKzynOP/view?usp=drive_link
https://drive.google.com/file/d/1C1kZYyROzs-PrLc0SkDgUgMi4-L3lauE/view?usp=drive_link

View File

@@ -0,0 +1,3 @@
https://drive.google.com/drive/folders/1TsojQQSXtHEoGnqgJ3gmpPQR2DPLtS2N
https://drive.google.com/file/d/1wfMSZ24oOh5KR_0aaP3Cnu_c4ZCveduB/view?usp=drive_link
https://drive.google.com/file/d/17EuCUWS6uCCr6yyNzpXdcdE-_TTNCKtf/view?usp=drive_link

View File

@@ -0,0 +1,3 @@
https://drive.google.com/drive/folders/1sc-E4QYW7A0o23m1u2VWNGVq5smAsfCo
https://drive.google.com/file/d/18smMymtr8tIxaNUQ61gW6dG50pt3MvGq/view?usp=drive_link
https://drive.google.com/file/d/1Nk7l53d9sJoGDBKAOnNrExX5nLacATc6/view?usp=drive_link

View File

@@ -0,0 +1,3 @@
https://drive.google.com/drive/folders/1aRyoOhQwxhyt1J8XgEig4s6kzaw__LXj
https://drive.google.com/file/d/1pnGIOd-E4-rhz2P3VxpknMKRZCoKt6eI/view?usp=drive_link
https://drive.google.com/file/d/1GKReZHrXU73NMiC5zKCq_UtqPVtYq8eo/view?usp=drive_link

View File

@@ -0,0 +1,2 @@
https://drive.google.com/drive/folders/19qS_n7vKgDcPeTMnvDHQ5-n73xEbJz5D
https://drive.google.com/file/d/1oC31By0A2bsBeHyUwBdQw1z4ng6yi9Za/view?usp=drive_link

View File

@@ -0,0 +1,2 @@
https://drive.google.com/drive/folders/1m5rQ6UVH8Q9RQp_6c0CxkQ88-L-ScO7q
https://drive.google.com/file/d/1wHz2qcmwcVG0C0CZ9MjQDQcmj4OY9_a3/view?usp=drive_link

View File

@@ -0,0 +1,2 @@
https://drive.google.com/drive/folders/1seQGay470nGQ-knBI5TjsTr8iL9Qws5q
https://drive.google.com/file/d/1T89hSX5U99wLGvGTE7yUBaQPOpyj6Sai/view?usp=drive_link

View File

@@ -0,0 +1,2 @@
https://drive.google.com/drive/folders/1t3eDc5Rg0DveyRe8oTm6Dia_FYU5mXyf
https://drive.google.com/file/d/1TXFaduTakvS0ZWJqKCX-HIvYglum_5CY/view?usp=drive_link

View File

@@ -0,0 +1,2 @@
https://drive.google.com/drive/folders/1Z9X3DNzd6LS0FFjQemNUMoMA5yk5VQOh
https://drive.google.com/file/d/1Wlyc0vTkjXuWB6zbaVOWhEfD7BmPgUV_/view?usp=drive_link

View File

@@ -0,0 +1,53 @@
https://drive.google.com/drive/folders/1DYgB4ifX4uIid9m9jnC0Zdz8Nf7ZC0fc
https://drive.google.com/file/d/1Eb-NRNk_FmVleCbU_Ng5Y4dfcjTKN7Rv/view?usp=drive_link
https://drive.google.com/file/d/1dkhjEADakT-44l9jf-nK4x89kr4yG_qb/view?usp=drive_link
https://drive.google.com/file/d/14hDhgcZkVqNExGb4tIXpSjMshhqZETch/view?usp=drive_link
https://drive.google.com/file/d/1zVMEHpHbuNyP5A_lYU7RPSLB-4V0yfZw/view?usp=drive_link
https://drive.google.com/file/d/1JtgDjBvy7FnRpFzrx_foC3quorYQFAR-/view?usp=drive_link
https://drive.google.com/file/d/1EHdneB6F-PP0dQlX8qPaXbxmKoBy_YwO/view?usp=drive_link
https://drive.google.com/file/d/17Z0jjVBy1OPKREPu77_n_rQzorDiapji/view?usp=drive_link
https://drive.google.com/file/d/1F4i23qPJ_qTf5jWjfLo4ARGJChznYWt3/view?usp=drive_link
https://drive.google.com/file/d/1kZtXWM3uS0-rLblydBfJ0mMcVnMMXw9w/view?usp=drive_link
https://drive.google.com/file/d/1mNODox87xFfY5Z_o5mcLsr8SHb39jDik/view?usp=drive_link
https://drive.google.com/file/d/1Ob44VdmEUA93FKDECiRb5Ogz2xQg5IWp/view?usp=drive_link
https://drive.google.com/file/d/1fdQLdjj3Cwv33R1wZhfrLz9Del8mqgHb/view?usp=drive_link
https://drive.google.com/file/d/1Yu3L3ft21zP__XL8pCfhb788ZleuW1n5/view?usp=drive_link
https://drive.google.com/file/d/1ozBBWXVZ9hXDh9ooHUNroHdYm8UDqnhJ/view?usp=drive_link
https://drive.google.com/file/d/1o0TGqvfWw_Lunxb5ubKDS21Lr_WC0h75/view?usp=drive_link
https://drive.google.com/file/d/1jZnd5eP5L6BH5l98BPN6OnoQx3fu8e9n/view?usp=drive_link
https://drive.google.com/file/d/1S5sYbz8wcLYp0V67v13i4PRcBxodn4Hg/view?usp=drive_link
https://drive.google.com/file/d/1rFeg_x6ftJYwPtBv34D3h2L2cpDLeR4G/view?usp=drive_link
https://drive.google.com/file/d/1GvS3lcm4o6nm_scUk0XxKeVFNmzjucDZ/view?usp=drive_link
https://drive.google.com/file/d/1-9i0riphC7NhhDahcQfD1QoBXP5gF90A/view?usp=drive_link
https://drive.google.com/file/d/15p_IqGsMbKuvzMS872THAZr-3SBtb1Fr/view?usp=drive_link
https://drive.google.com/file/d/1ToyYcBfJL8gbQn0q_59zPLsFmm7dmMJo/view?usp=drive_link
https://drive.google.com/file/d/1e_7PNH7CYafE4pAebP7ZdI7XFbmEcy_i/view?usp=drive_link
https://drive.google.com/file/d/1JoabvGVsIQdug2xOhUIhetEIyDM91y_Y/view?usp=drive_link
https://drive.google.com/file/d/1kOMw1y0lmnVaCjwZICfzCsx6e0Z8MNGR/view?usp=drive_link
https://drive.google.com/file/d/16it_wd1JOevUQTK2_CvF_pBACTgpIPgM/view?usp=drive_link
https://drive.google.com/file/d/1IRcCj9HnJSfbyMgr5XEERGlEnWeZQwOc/view?usp=drive_link
https://drive.google.com/file/d/1Z2dIJfq_S3liGmPN9Rphvkmucnmw7tlb/view?usp=drive_link
https://drive.google.com/file/d/1J3NoAjzndGx9yNyaBOJHdNny1epzUoBt/view?usp=drive_link
https://drive.google.com/file/d/18nOvxV1k8FSmBrhT4TPo2sKKSZXougyx/view?usp=drive_link
https://drive.google.com/file/d/1CT8FxclafFMjSd7gCWVw3VSeryeiF04i/view?usp=drive_link
https://drive.google.com/file/d/16M9KVqQMFfSsXfypK0bocFft8Nz3j2Rt/view?usp=drive_link
https://drive.google.com/file/d/18QPVkw6bj6HW8LTPrQLWrrUX4R6RcF42/view?usp=drive_link
https://drive.google.com/file/d/1hQTVtA5hBTE_StXpJafTZJ3tgt2VQQ_t/view?usp=drive_link
https://drive.google.com/file/d/1Dn-d5g69H6EgAWgsFdrcbJKtz7ySsCQ8/view?usp=drive_link
https://drive.google.com/file/d/13hMr16483P7ALYv73yMRUN37fJdVQM62/view?usp=drive_link
https://drive.google.com/file/d/1848yN3XMN5zJMEgApt6KzrWgfRPfimtv/view?usp=drive_link
https://drive.google.com/file/d/1oAD9kSnS0fTgj-CjD4u9VdZ5X67IOIMa/view?usp=drive_link
https://drive.google.com/file/d/1ilzIWLCCG5b_KgF5s0wdN2I5-lFNpwC1/view?usp=drive_link
https://drive.google.com/file/d/1rjsT2YBjnidxod1s9s-myAYz8boHr-WB/view?usp=drive_link
https://drive.google.com/file/d/18Gg48HTub15bd8qzbhiCUufbVy0fbN5G/view?usp=drive_link
https://drive.google.com/file/d/1WsSnQSqmMTVSRwrhT1Y-v782My2zcjLm/view?usp=drive_link
https://drive.google.com/file/d/1ea9ZCvoyc-xqiFXgeDcA_mOWsw7VUuoi/view?usp=drive_link
https://drive.google.com/file/d/1wv1v3-XhPgbNzp62BXbJTDzMPu2tlDUc/view?usp=drive_link
https://drive.google.com/file/d/18-ikzt8LoZ83Gi3goKCELs4U4z8hrRoF/view?usp=drive_link
https://drive.google.com/file/d/16Bjhp7JNCXkGuLvyNcZowAx3W-Y-15DV/view?usp=drive_link
https://drive.google.com/file/d/1Gc-KRI-xwcp1fMR55ugbrLg_5y3SPde-/view?usp=drive_link
https://drive.google.com/file/d/1oP72Q386Z4Sy5MMm-t5yNogIe5Van_9k/view?usp=drive_link
https://drive.google.com/file/d/112T90eDUDVH-SyOV7UnZl5bscAH2hcfq/view?usp=drive_link
https://drive.google.com/file/d/1y-uKOesRRhjgDtFbG_j65f4SGg0v8XDg/view?usp=drive_link
https://drive.google.com/file/d/1LOP05OagoI3km-ZKQBrS204A85UVk7Ok/view?usp=drive_link
https://drive.google.com/file/d/1QkHQKgasVzWsmdPvkXgGhWyQ84d93_Az/view?usp=drive_link

View File

@@ -0,0 +1 @@
https://drive.google.com/drive/folders/1Ut2cv6o6Pkfgg46DgwVUM7Z5PkNG8eJ-

View File

@@ -0,0 +1 @@
https://drive.google.com/drive/folders/1FqxPV0PgvgIu8XFjtvZSPSExuNcxVVAY

View File

@@ -0,0 +1,2 @@
https://drive.google.com/drive/folders/1SKtG0ct9q0nVdYssJNMWSOjikcXliT58
https://drive.google.com/file/d/1nchD21O30B3i3LDoqramo1zgW5YvpJIN/view?usp=drive_link

View File

@@ -0,0 +1,2 @@
https://drive.google.com/drive/folders/1_4DHf2cma0xsChLQFghwigX6Ukti5-zQ
https://drive.google.com/file/d/1_8vS4hDNDgUQY-SmekrNaa7dF67QJYU-/view?usp=drive_link

View File

@@ -0,0 +1,2 @@
https://drive.google.com/drive/folders/1_4DHf2cma0xsChLQFghwigX6Ukti5-zQ
https://drive.google.com/file/d/1_8vS4hDNDgUQY-SmekrNaa7dF67QJYU-/view?usp=drive_link

Some files were not shown because too many files have changed in this diff Show More