Compare commits

6 Commits

Author SHA1 Message Date
Pepijn
57491b44ee Remove v5 link diffusion policy 2025-06-02 10:28:55 +02:00
Quentin Gallouédec
bb3d014677 Use HF Papers 2025-05-17 04:02:07 +00:00
masato-ka
a445d9c9da bug fix for #1071 When --display_data=true, Failed running control_robot. (#1073) 2025-05-09 16:53:40 +02:00
CharlesCNorton
f24030d4d8 Update 12_use_so101.md (#1081) 2025-05-09 11:04:25 +02:00
Co-authored-by: Pepijn <138571049+pkooij@users.noreply.github.com>
Mishig
7598aeaad7 Update 10_use_so100.md; use diff syntax (#944) 2025-05-09 11:01:12 +02:00
Co-authored-by: Pepijn <138571049+pkooij@users.noreply.github.com>
Pepijn
4485cc0b5b docs: minor corrections and clean-up (#1089) 2025-05-09 11:00:25 +02:00
13 changed files with 87 additions and 79 deletions

View File

@@ -96,8 +96,8 @@ Reconnect the usb cable.
#### Update config file
Now that you have your ports, update the **port** default values of [`SO101RobotConfig`](https://github.com/huggingface/lerobot/blob/main/lerobot/common/robot_devices/robots/configs.py).
You will find something like, update the `port` values with your actual motor ports:
```python
You will find a class called `so101` where you can update the `port` values with your actual motor ports:
```diff
@RobotConfig.register_subclass("so101")
@dataclass
class So101RobotConfig(ManipulatorRobotConfig):
@@ -110,7 +110,8 @@ class So101RobotConfig(ManipulatorRobotConfig):
leader_arms: dict[str, MotorsBusConfig] = field(
default_factory=lambda: {
"main": FeetechMotorsBusConfig(
port="/dev/tty.usbmodem58760431091", <-- UPDATE HERE
- port="/dev/tty.usbmodem58760431091",
+ port="{ADD YOUR LEADER PORT}",
motors={
# name: (index, model)
"shoulder_pan": [1, "sts3215"],
@@ -127,7 +128,8 @@ class So101RobotConfig(ManipulatorRobotConfig):
follower_arms: dict[str, MotorsBusConfig] = field(
default_factory=lambda: {
"main": FeetechMotorsBusConfig(
port="/dev/tty.usbmodem585A0076891", <-- UPDATE HERE
- port="/dev/tty.usbmodem585A0076891",
+ port="{ADD YOUR FOLLOWER PORT}",
motors={
# name: (index, model)
"shoulder_pan": [1, "sts3215"],
@@ -297,7 +299,7 @@ Remove all support material from the 3D-printed parts, the easiest way to do thi
##### Wiring
- Attach the motor controller on the back.
- Then insert all wires, use the wire guides everywhere to make sure the wires don't unplug themself and stay in place.
- Then insert all wires, use the wire guides everywhere to make sure the wires don't unplug themselves and stay in place.
<div class="video-container">
<video controls width="600">

View File

@@ -83,7 +83,7 @@ camera_01_frame_000047.png
Note: Some cameras may take a few seconds to warm up, and the first frame might be black or green.
Now that you have the camera indexes, you should specify the camera's in the config. TODO(pepijn): add more info about setting camera config, rotate etc..
Now that you have the camera indexes, you should specify the cameras in the config.
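For illustration, a camera entry in the robot config might look roughly like the sketch below; the `OpenCVCameraConfig` field names are assumptions in keeping with the surrounding config style, not something this diff confirms:

```python
# Sketch only: the camera_index/fps/width/height field names are assumed.
cameras: dict[str, CameraConfig] = field(
    default_factory=lambda: {
        "front": OpenCVCameraConfig(
            camera_index=0,  # index found in the previous step
            fps=30,
            width=640,
            height=480,
        ),
    }
)
```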
### Use your phone
<hfoptions id="use phone">
@@ -152,7 +152,7 @@ If everything is set up correctly, you can proceed with the rest of the tutorial
## Teleoperate with cameras
We can now teleoperate again while at the same time visualizing the camera's and joint positions with `rerun`.
We can now teleoperate again while at the same time visualizing the cameras and joint positions with `rerun`.
```bash
python lerobot/scripts/control_robot.py \

View File

@@ -128,7 +128,7 @@ sudo chmod 666 /dev/ttyACM1
#### d. Update config file
IMPORTANTLY: Now that you have your ports, update the **port** default values of [`SO100RobotConfig`](../lerobot/common/robot_devices/robots/configs.py). You will find something like:
```python
```diff
@RobotConfig.register_subclass("so100")
@dataclass
class So100RobotConfig(ManipulatorRobotConfig):
@@ -141,7 +141,8 @@ class So100RobotConfig(ManipulatorRobotConfig):
leader_arms: dict[str, MotorsBusConfig] = field(
default_factory=lambda: {
"main": FeetechMotorsBusConfig(
port="/dev/tty.usbmodem58760431091", <-- UPDATE HERE
- port="/dev/tty.usbmodem58760431091",
+ port="{ADD YOUR LEADER PORT}",
motors={
# name: (index, model)
"shoulder_pan": [1, "sts3215"],
@@ -158,7 +159,8 @@ class So100RobotConfig(ManipulatorRobotConfig):
follower_arms: dict[str, MotorsBusConfig] = field(
default_factory=lambda: {
"main": FeetechMotorsBusConfig(
port="/dev/tty.usbmodem585A0076891", <-- UPDATE HERE
- port="/dev/tty.usbmodem585A0076891",
+ port="{ADD YOUR FOLLOWER PORT}",
motors={
# name: (index, model)
"shoulder_pan": [1, "sts3215"],

View File

@@ -145,8 +145,8 @@ Reconnect the usb cable.
#### Update config file
Now that you have your ports, update the **port** default values of [`SO101RobotConfig`](https://github.com/huggingface/lerobot/blob/main/lerobot/common/robot_devices/robots/configs.py).
You will find something a class called `so101` where you can update the `port` values with your actual motor ports:
```python
You will find a class called `so101` where you can update the `port` values with your actual motor ports:
```diff
@RobotConfig.register_subclass("so101")
@dataclass
class So101RobotConfig(ManipulatorRobotConfig):
@@ -159,7 +159,8 @@ class So101RobotConfig(ManipulatorRobotConfig):
leader_arms: dict[str, MotorsBusConfig] = field(
default_factory=lambda: {
"main": FeetechMotorsBusConfig(
port="/dev/tty.usbmodem58760431091", <-- UPDATE HERE
- port="/dev/tty.usbmodem58760431091",
+ port="{ADD YOUR LEADER PORT}",
motors={
# name: (index, model)
"shoulder_pan": [1, "sts3215"],
@@ -176,7 +177,8 @@ class So101RobotConfig(ManipulatorRobotConfig):
follower_arms: dict[str, MotorsBusConfig] = field(
default_factory=lambda: {
"main": FeetechMotorsBusConfig(
port="/dev/tty.usbmodem585A0076891", <-- UPDATE HERE
- port="/dev/tty.usbmodem585A0076891",
+ port="{ADD YOUR FOLLOWER PORT}",
motors={
# name: (index, model)
"shoulder_pan": [1, "sts3215"],
@@ -308,7 +310,7 @@ Remove all support material from the 3D-printed parts.
##### Wiring
- Attach the motor controller on the back.
- Then insert all wires, use the wire guides everywhere to make sure the wires don't unplug themself and stay in place.
- Then insert all wires, use the wire guides everywhere to make sure the wires don't unplug themselves and stay in place.
<video controls width="640" src="https://github.com/user-attachments/assets/4c2cacfd-9276-4ee4-8bf2-ba2492667b78" type="video/mp4"></video>
@@ -351,7 +353,7 @@ python lerobot/scripts/control_robot.py \
```
## Control your robot
Congrats 🎉, your robot is all set to learn a task on its own. Next we will explain you how to train a neural network to autonomously control a real robot.
Congrats 🎉, your robot is all set to learn a task on its own. Next we will explain to you how to train a neural network to autonomously control a real robot.
**You'll learn to:**
1. How to record and visualize your dataset.
@@ -515,7 +517,7 @@ If you have an additional camera you can add a wrist camera to the SO101. There
## Teleoperate with cameras
We can now teleoperate again while at the same time visualizing the camera's and joint positions with `rerun`.
We can now teleoperate again while at the same time visualizing the cameras and joint positions with `rerun`.
```bash
python lerobot/scripts/control_robot.py \
@@ -526,7 +528,7 @@ python lerobot/scripts/control_robot.py \
## Record a dataset
Once you're familiar with teleoperation, you can record your first dataset with SO-100.
Once you're familiar with teleoperation, you can record your first dataset with SO-101.
We use the Hugging Face Hub features for uploading your dataset. If you haven't previously used the Hub, make sure you can log in via the CLI using a write-access token; this token can be generated from the [Hugging Face settings](https://huggingface.co/settings/tokens).
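A minimal sketch of verifying Hub access from Python before recording (using `huggingface_hub`; paste a write-access token when prompted):

```python
from huggingface_hub import login, whoami

login()                  # or login(token="hf_...") to skip the prompt
print(whoami()["name"])  # should print your Hub username
```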

View File

@@ -36,7 +36,7 @@ ALOHA_MOBILE_INFO = {
"robot_config": AlohaRobotConfig(),
"license": "mit",
"url": "https://mobile-aloha.github.io/",
"paper": "https://arxiv.org/abs/2401.02117",
"paper": "https://huggingface.co/papers/2401.02117",
"citation_bibtex": dedent(r"""
@inproceedings{fu2024mobile,
author = {Fu, Zipeng and Zhao, Tony Z. and Finn, Chelsea},
@@ -49,7 +49,7 @@ ALOHA_STATIC_INFO = {
"robot_config": AlohaRobotConfig(),
"license": "mit",
"url": "https://tonyzhaozh.github.io/aloha/",
"paper": "https://arxiv.org/abs/2304.13705",
"paper": "https://huggingface.co/papers/2304.13705",
"citation_bibtex": dedent(r"""
@article{Zhao2023LearningFB,
title={Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware},
@@ -57,13 +57,13 @@ ALOHA_STATIC_INFO = {
journal={RSS},
year={2023},
volume={abs/2304.13705},
url={https://arxiv.org/abs/2304.13705}
url={https://huggingface.co/papers/2304.13705}
}""").lstrip(),
}
PUSHT_INFO = {
"license": "mit",
"url": "https://diffusion-policy.cs.columbia.edu/",
"paper": "https://arxiv.org/abs/2303.04137v5",
"paper": "https://huggingface.co/papers/2303.04137v5",
"citation_bibtex": dedent(r"""
@article{chi2024diffusionpolicy,
author = {Cheng Chi and Zhenjia Xu and Siyuan Feng and Eric Cousineau and Yilun Du and Benjamin Burchfiel and Russ Tedrake and Shuran Song},
@@ -75,7 +75,7 @@ PUSHT_INFO = {
XARM_INFO = {
"license": "mit",
"url": "https://www.nicklashansen.com/td-mpc/",
"paper": "https://arxiv.org/abs/2203.04955",
"paper": "https://huggingface.co/papers/2203.04955",
"citation_bibtex": dedent(r"""
@inproceedings{Hansen2022tdmpc,
title={Temporal Difference Learning for Model Predictive Control},
@@ -244,7 +244,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://ut-austin-rpl.github.io/BUDS-website/",
"paper": "https://arxiv.org/abs/2109.13841",
"paper": "https://huggingface.co/papers/2109.13841",
"citation_bibtex": dedent(r"""
@article{zhu2022bottom,
title={Bottom-Up Skill Discovery From Unsegmented Demonstrations for Long-Horizon Robot Manipulation},
@@ -261,7 +261,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://ut-austin-rpl.github.io/sailor/",
"paper": "https://arxiv.org/abs/2210.11435",
"paper": "https://huggingface.co/papers/2210.11435",
"citation_bibtex": dedent(r"""
@inproceedings{nasiriany2022sailor,
title={Learning and Retrieval from Prior Data for Skill-based Imitation Learning},
@@ -274,7 +274,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://ut-austin-rpl.github.io/sirius/",
"paper": "https://arxiv.org/abs/2211.08416",
"paper": "https://huggingface.co/papers/2211.08416",
"citation_bibtex": dedent(r"""
@inproceedings{liu2022robot,
title = {Robot Learning on the Job: Human-in-the-Loop Autonomy and Learning During Deployment},
@@ -298,14 +298,14 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "cc-by-4.0",
"url": "https://sites.google.com/view/cablerouting/home",
"paper": "https://arxiv.org/abs/2307.08927",
"paper": "https://huggingface.co/papers/2307.08927",
"citation_bibtex": dedent(r"""
@article{luo2023multistage,
author = {Jianlan Luo and Charles Xu and Xinyang Geng and Gilbert Feng and Kuan Fang and Liam Tan and Stefan Schaal and Sergey Levine},
title = {Multi-Stage Cable Routing through Hierarchical Imitation Learning},
journal = {arXiv pre-print},
year = {2023},
url = {https://arxiv.org/abs/2307.08927},
url = {https://huggingface.co/papers/2307.08927},
}""").lstrip(),
},
"berkeley_fanuc_manipulation": {
@@ -322,7 +322,7 @@ DATASETS = {
"berkeley_gnm_cory_hall": {
"tasks_col": "language_instruction",
"license": "mit",
"paper": "https://arxiv.org/abs/1709.10489",
"paper": "https://huggingface.co/papers/1709.10489",
"citation_bibtex": dedent(r"""
@inproceedings{kahn2018self,
title={Self-supervised deep reinforcement learning with generalized computation graphs for robot navigation},
@@ -337,7 +337,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://sites.google.com/view/recon-robot",
"paper": "https://arxiv.org/abs/2104.05859",
"paper": "https://huggingface.co/papers/2104.05859",
"citation_bibtex": dedent(r"""
@inproceedings{shah2021rapid,
title={Rapid Exploration for Open-World Navigation with Latent Goal Models},
@@ -351,7 +351,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://sites.google.com/view/SACSoN-review",
"paper": "https://arxiv.org/abs/2306.01874",
"paper": "https://huggingface.co/papers/2306.01874",
"citation_bibtex": dedent(r"""
@article{hirose2023sacson,
title={SACSoN: Scalable Autonomous Data Collection for Social Navigation},
@@ -363,7 +363,7 @@ DATASETS = {
"berkeley_mvp": {
"tasks_col": "language_instruction",
"license": "mit",
"paper": "https://arxiv.org/abs/2203.06173",
"paper": "https://huggingface.co/papers/2203.06173",
"citation_bibtex": dedent(r"""
@InProceedings{Radosavovic2022,
title = {Real-World Robot Learning with Masked Visual Pre-training},
@@ -375,7 +375,7 @@ DATASETS = {
"berkeley_rpt": {
"tasks_col": "language_instruction",
"license": "mit",
"paper": "https://arxiv.org/abs/2306.10007",
"paper": "https://huggingface.co/papers/2306.10007",
"citation_bibtex": dedent(r"""
@article{Radosavovic2023,
title={Robot Learning with Sensorimotor Pre-training},
@@ -388,7 +388,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://human-world-model.github.io/",
"paper": "https://arxiv.org/abs/2308.10901",
"paper": "https://huggingface.co/papers/2308.10901",
"citation_bibtex": dedent(r"""
@inproceedings{mendonca2023structured,
title={Structured World Models from Human Videos},
@@ -401,7 +401,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://play-fusion.github.io/",
"paper": "https://arxiv.org/abs/2312.04549",
"paper": "https://huggingface.co/papers/2312.04549",
"citation_bibtex": dedent(r"""
@inproceedings{chen2023playfusion,
title={PlayFusion: Skill Acquisition via Diffusion from Language-Annotated Play},
@@ -414,7 +414,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://robo-affordances.github.io/",
"paper": "https://arxiv.org/abs/2304.08488",
"paper": "https://huggingface.co/papers/2304.08488",
"citation_bibtex": dedent(r"""
@inproceedings{bahl2023affordances,
title={Affordances from Human Videos as a Versatile Representation for Robotics},
@@ -433,7 +433,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://diffusion-policy.cs.columbia.edu/",
"paper": "https://arxiv.org/abs/2303.04137v5",
"paper": "https://huggingface.co/papers/2303.04137",
"citation_bibtex": dedent(r"""
@inproceedings{chi2023diffusionpolicy,
title={Diffusion Policy: Visuomotor Policy Learning via Action Diffusion},
@@ -505,7 +505,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://droid-dataset.github.io/",
"paper": "https://arxiv.org/abs/2403.12945",
"paper": "https://huggingface.co/papers/2403.12945",
"citation_bibtex": dedent(r"""
@article{khazatsky2024droid,
title = {DROID: A Large-Scale In-The-Wild Robot Manipulation Dataset},
@@ -517,7 +517,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "cc-by-4.0",
"url": "https://functional-manipulation-benchmark.github.io/",
"paper": "https://arxiv.org/abs/2401.08553",
"paper": "https://huggingface.co/papers/2401.08553",
"citation_bibtex": dedent(r"""
@article{luo2024fmb,
title={FMB: a Functional Manipulation Benchmark for Generalizable Robotic Learning},
@@ -530,7 +530,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://openreview.net/forum?id=WuBv9-IGDUA",
"paper": "https://arxiv.org/abs/2401.14502",
"paper": "https://huggingface.co/papers/2401.14502",
"citation_bibtex": dedent(r"""
@inproceedings{saxena2023multiresolution,
title={Multi-Resolution Sensing for Real-Time Control with Vision-Language Models},
@@ -575,7 +575,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://jyopari.github.io/VINN/",
"paper": "https://arxiv.org/abs/2112.01511",
"paper": "https://huggingface.co/papers/2112.01511",
"citation_bibtex": dedent(r"""
@misc{pari2021surprising,
title={The Surprising Effectiveness of Representation Learning for Visual Imitation},
@@ -590,7 +590,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://play-to-policy.github.io/",
"paper": "https://arxiv.org/abs/2210.10047",
"paper": "https://huggingface.co/papers/2210.10047",
"citation_bibtex": dedent(r"""
@article{cui2022play,
title = {From Play to Policy: Conditional Behavior Generation from Uncurated Robot Data},
@@ -603,7 +603,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://rot-robot.github.io/",
"paper": "https://arxiv.org/abs/2206.15469",
"paper": "https://huggingface.co/papers/2206.15469",
"citation_bibtex": dedent(r"""
@inproceedings{haldar2023watch,
title={Watch and match: Supercharging imitation with regularized optimal transport},
@@ -633,7 +633,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://sites.google.com/view/hydra-il-2023",
"paper": "https://arxiv.org/abs/2306.17237",
"paper": "https://huggingface.co/papers/2306.17237",
"citation_bibtex": dedent(r"""
@article{belkhale2023hydra,
title={HYDRA: Hybrid Robot Actions for Imitation Learning},
@@ -646,21 +646,21 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://sites.google.com/view/visionandtouch",
"paper": "https://arxiv.org/abs/1810.10191",
"paper": "https://huggingface.co/papers/1810.10191",
"citation_bibtex": dedent(r"""
@inproceedings{lee2019icra,
title={Making sense of vision and touch: Self-supervised learning of multimodal representations for contact-rich tasks},
author={Lee, Michelle A and Zhu, Yuke and Srinivasan, Krishnan and Shah, Parth and Savarese, Silvio and Fei-Fei, Li and Garg, Animesh and Bohg, Jeannette},
booktitle={2019 IEEE International Conference on Robotics and Automation (ICRA)},
year={2019},
url={https://arxiv.org/abs/1810.10191}
url={https://huggingface.co/papers/1810.10191}
}""").lstrip(),
},
"stanford_robocook": {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://hshi74.github.io/robocook/",
"paper": "https://arxiv.org/abs/2306.14447",
"paper": "https://huggingface.co/papers/2306.14447",
"citation_bibtex": dedent(r"""
@article{shi2023robocook,
title={RoboCook: Long-Horizon Elasto-Plastic Object Manipulation with Diverse Tools},
@@ -673,7 +673,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "cc-by-4.0",
"url": "https://www.kaggle.com/datasets/oiermees/taco-robot",
"paper": "https://arxiv.org/abs/2209.08959, https://arxiv.org/abs/2210.01911",
"paper": "https://huggingface.co/papers/2209.08959, https://huggingface.co/papers/2210.01911",
"citation_bibtex": dedent(r"""
@inproceedings{rosete2022tacorl,
author = {Erick Rosete-Beas and Oier Mees and Gabriel Kalweit and Joschka Boedecker and Wolfram Burgard},
@@ -693,7 +693,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "URL",
"paper": "https://arxiv.org/abs/2107.05842",
"paper": "https://huggingface.co/papers/2107.05842",
"citation_bibtex": dedent(r"""
@Article{Osa22,
author = {Takayuki Osa},
@@ -709,7 +709,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://toto-benchmark.org/",
"paper": "https://arxiv.org/abs/2306.00942",
"paper": "https://huggingface.co/papers/2306.00942",
"citation_bibtex": dedent(r"""
@inproceedings{zhou2023train,
author={Zhou, Gaoyue and Dean, Victoria and Srirama, Mohan Kumar and Rajeswaran, Aravind and Pari, Jyothish and Hatch, Kyle and Jain, Aryan and Yu, Tianhe and Abbeel, Pieter and Pinto, Lerrel and Finn, Chelsea and Gupta, Abhinav},
@@ -733,7 +733,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://owmcorl.github.io/#",
"paper": "https://arxiv.org/abs/2310.16029",
"paper": "https://huggingface.co/papers/2310.16029",
"citation_bibtex": dedent(r"""
@preprint{Feng2023Finetuning,
title={Finetuning Offline World Models in the Real World},
@@ -745,7 +745,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://robopil.github.io/d3fields/",
"paper": "https://arxiv.org/abs/2309.16118",
"paper": "https://huggingface.co/papers/2309.16118",
"citation_bibtex": dedent(r"""
@article{wang2023d3field,
title={D^3Field: Dynamic 3D Descriptor Fields for Generalizable Robotic Manipulation},
@@ -758,7 +758,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://uscresl.github.io/dmfd/",
"paper": "https://arxiv.org/abs/2207.10148",
"paper": "https://huggingface.co/papers/2207.10148",
"citation_bibtex": dedent(r"""
@article{salhotra2022dmfd,
author={Salhotra, Gautam and Liu, I-Chun Arthur and Dominguez-Kuhne, Marcus and Sukhatme, Gaurav S.},
@@ -775,7 +775,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://ut-austin-rpl.github.io/MUTEX/",
"paper": "https://arxiv.org/abs/2309.14320",
"paper": "https://huggingface.co/papers/2309.14320",
"citation_bibtex": dedent(r"""
@inproceedings{shah2023mutex,
title={{MUTEX}: Learning Unified Policies from Multimodal Task Specifications},
@@ -811,7 +811,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://saytap.github.io/",
"paper": "https://arxiv.org/abs/2306.07580",
"paper": "https://huggingface.co/papers/2306.07580",
"citation_bibtex": dedent(r"""
@article{saytap2023,
author = {Yujin Tang and Wenhao Yu and Jie Tan and Heiga Zen and Aleksandra Faust and
@@ -847,7 +847,7 @@ DATASETS = {
"tasks_col": "language_instruction",
"license": "mit",
"url": "https://ut-austin-rpl.github.io/VIOLA/",
"paper": "https://arxiv.org/abs/2210.11339",
"paper": "https://huggingface.co/papers/2210.11339",
"citation_bibtex": dedent(r"""
@article{zhu2022viola,
title={VIOLA: Imitation Learning for Vision-Based Manipulation with Object Proposal Priors},

View File

@@ -15,7 +15,7 @@
# limitations under the License.
"""Action Chunking Transformer Policy
As per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware (https://arxiv.org/abs/2304.13705).
As per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware (https://huggingface.co/papers/2304.13705).
The majority of changes here involve removing unused code, unifying naming, and adding helpful comments.
"""
@@ -41,7 +41,7 @@ from lerobot.common.policies.pretrained import PreTrainedPolicy
class ACTPolicy(PreTrainedPolicy):
"""
Action Chunking Transformer Policy as per Learning Fine-Grained Bimanual Manipulation with Low-Cost
Hardware (paper: https://arxiv.org/abs/2304.13705, code: https://github.com/tonyzhaozh/act)
Hardware (paper: https://huggingface.co/papers/2304.13705, code: https://github.com/tonyzhaozh/act)
"""
config_class = ACTConfig
@@ -161,7 +161,7 @@ class ACTPolicy(PreTrainedPolicy):
# Calculate Dₖₗ(latent_pdf || standard_normal). Note: After computing the KL-divergence for
# each dimension independently, we sum over the latent dimension to get the total
# KL-divergence per batch element, then take the mean over the batch.
# (See App. B of https://arxiv.org/abs/1312.6114 for more details).
# (See App. B of https://huggingface.co/papers/1312.6114 for more details).
mean_kld = (
(-0.5 * (1 + log_sigma_x2_hat - mu_hat.pow(2) - (log_sigma_x2_hat).exp())).sum(-1).mean()
)
@@ -175,7 +175,7 @@ class ACTPolicy(PreTrainedPolicy):
class ACTTemporalEnsembler:
def __init__(self, temporal_ensemble_coeff: float, chunk_size: int) -> None:
"""Temporal ensembling as described in Algorithm 2 of https://arxiv.org/abs/2304.13705.
"""Temporal ensembling as described in Algorithm 2 of https://huggingface.co/papers/2304.13705.
The weights are calculated as wᵢ = exp(-temporal_ensemble_coeff * i) where w₀ is the oldest action.
They are then normalized to sum to 1 by dividing by Σwᵢ. Here's some intuition around how the
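As a quick illustration of the weighting scheme described here (a standalone sketch, not the class itself; the `coeff` and `chunk_size` values are made up):

```python
import torch

coeff, chunk_size = 0.01, 100      # example values only
i = torch.arange(chunk_size)
weights = torch.exp(-coeff * i)    # w_i = exp(-coeff * i), w_0 = oldest action
weights = weights / weights.sum()  # normalize so the weights sum to 1
```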

View File

@@ -81,7 +81,7 @@ class DiffusionConfig(PreTrainedConfig):
n_groups: Number of groups used in the group norm of the Unet's convolutional blocks.
diffusion_step_embed_dim: The Unet is conditioned on the diffusion timestep via a small non-linear
network. This is the output dimension of that network, i.e., the embedding dimension.
use_film_scale_modulation: FiLM (https://arxiv.org/abs/1709.07871) is used for the Unet conditioning.
use_film_scale_modulation: FiLM (https://huggingface.co/papers/1709.07871) is used for the Unet conditioning.
Bias modulation is used by default, while this parameter indicates whether to also use scale
modulation.
noise_scheduler_type: Name of the noise scheduler to use. Supported options: ["DDPM", "DDIM"].

View File

@@ -48,7 +48,7 @@ from lerobot.common.policies.utils import (
class DiffusionPolicy(PreTrainedPolicy):
"""
Diffusion Policy as per "Diffusion Policy: Visuomotor Policy Learning via Action Diffusion"
(paper: https://arxiv.org/abs/2303.04137, code: https://github.com/real-stanford/diffusion_policy).
(paper: https://huggingface.co/papers/2303.04137, code: https://github.com/real-stanford/diffusion_policy).
"""
config_class = DiffusionConfig
@@ -370,7 +370,7 @@ class DiffusionModel(nn.Module):
class SpatialSoftmax(nn.Module):
"""
Spatial Soft Argmax operation described in "Deep Spatial Autoencoders for Visuomotor Learning" by Finn et al.
(https://arxiv.org/pdf/1509.06113). A minimal port of the robomimic implementation.
(https://huggingface.co/papers/1509.06113). A minimal port of the robomimic implementation.
At a high level, this takes 2D feature maps (from a convnet/ViT) and returns the "center of mass"
of activations of each channel, i.e., keypoints in the image space for the policy to focus on.
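A self-contained sketch of the operation this docstring describes (softmax over the spatial dimensions, then the expected image coordinate per channel); an illustration, not the module's actual implementation:

```python
import torch
import torch.nn.functional as F

def spatial_soft_argmax(feat: torch.Tensor) -> torch.Tensor:
    """feat: (B, C, H, W) -> (B, C, 2) expected (x, y) per channel in [-1, 1]."""
    b, c, h, w = feat.shape
    probs = F.softmax(feat.flatten(2), dim=-1).view(b, c, h, w)
    xs = torch.linspace(-1.0, 1.0, w, device=feat.device)
    ys = torch.linspace(-1.0, 1.0, h, device=feat.device)
    exp_x = (probs.sum(dim=2) * xs).sum(dim=-1)  # marginalize rows, take E[x]
    exp_y = (probs.sum(dim=3) * ys).sum(dim=-1)  # marginalize columns, take E[y]
    return torch.stack([exp_x, exp_y], dim=-1)
```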
@@ -728,7 +728,7 @@ class DiffusionConditionalResidualBlock1d(nn.Module):
self.conv1 = DiffusionConv1dBlock(in_channels, out_channels, kernel_size, n_groups=n_groups)
# FiLM modulation (https://arxiv.org/abs/1709.07871) outputs per-channel bias and (maybe) scale.
# FiLM modulation (https://huggingface.co/papers/1709.07871) outputs per-channel bias and (maybe) scale.
cond_channels = out_channels * 2 if use_film_scale_modulation else out_channels
self.cond_encoder = nn.Sequential(nn.Mish(), nn.Linear(cond_dim, cond_channels))
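To make the bias/scale distinction concrete, a hedged sketch of how the conditioning vector can modulate the convolution features (shapes and names are illustrative, not the module's forward pass):

```python
import torch

def film_modulate(features: torch.Tensor, cond: torch.Tensor, use_scale: bool) -> torch.Tensor:
    """features: (B, C, T); cond: (B, 2*C) if use_scale else (B, C)."""
    if use_scale:
        scale, bias = cond.chunk(2, dim=1)  # per-channel scale and bias
        return scale.unsqueeze(-1) * features + bias.unsqueeze(-1)
    return features + cond.unsqueeze(-1)    # bias-only modulation
```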

View File

@@ -17,7 +17,7 @@
"""
π0+FAST: Efficient Action Tokenization for Vision-Language-Action Models
[Paper](https://arxiv.org/abs/2501.09747)
[Paper](https://huggingface.co/papers/2501.09747)
[Jax code](https://github.com/Physical-Intelligence/openpi)
Designed by Physical Intelligence. Ported from Jax by Hugging Face.

View File

@@ -17,8 +17,8 @@
"""Implementation of Finetuning Offline World Models in the Real World.
The comments in this code may sometimes refer to these references:
TD-MPC paper: Temporal Difference Learning for Model Predictive Control (https://arxiv.org/abs/2203.04955)
FOWM paper: Finetuning Offline World Models in the Real World (https://arxiv.org/abs/2310.16029)
TD-MPC paper: Temporal Difference Learning for Model Predictive Control (https://huggingface.co/papers/2203.04955)
FOWM paper: Finetuning Offline World Models in the Real World (https://huggingface.co/papers/2310.16029)
"""
# ruff: noqa: N806

View File

@@ -162,7 +162,7 @@ class VQBeTPolicy(PreTrainedPolicy):
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
batch["observation.images"] = torch.stack([batch[key] for key in self.config.image_features], dim=-4)
batch = self.normalize_targets(batch)
# VQ-BeT discretizes action using VQ-VAE before training BeT (please refer to section 3.2 in the VQ-BeT paper https://arxiv.org/pdf/2403.03181)
# VQ-BeT discretizes action using VQ-VAE before training BeT (please refer to section 3.2 in the VQ-BeT paper https://huggingface.co/papers/2403.03181)
if not self.vqbet.action_head.vqvae_model.discretized.item():
# loss: total loss of training RVQ
# n_different_codes: how many of the total possible VQ codes are being used in single batch (how many of them have at least one encoder embedding as a nearest neighbor). This can be at most `vqvae_n_embed * number of layers of RVQ (=2)`.
@@ -185,7 +185,7 @@ class VQBeTPolicy(PreTrainedPolicy):
class SpatialSoftmax(nn.Module):
"""
Spatial Soft Argmax operation described in "Deep Spatial Autoencoders for Visuomotor Learning" by Finn et al.
(https://arxiv.org/pdf/1509.06113). A minimal port of the robomimic implementation.
(https://huggingface.co/papers/1509.06113). A minimal port of the robomimic implementation.
At a high level, this takes 2D feature maps (from a convnet/ViT) and returns the "center of mass"
of activations of each channel, i.e., keypoints in the image space for the policy to focus on.
@@ -387,7 +387,7 @@ class VQBeTModel(nn.Module):
# only extract the output tokens at the position of action query:
# Behavior Transformer (BeT), and VQ-BeT are both sequence-to-sequence prediction models,
# mapping sequential observation to sequential action (please refer to section 2.2 in BeT paper https://arxiv.org/pdf/2206.11251).
# mapping sequential observation to sequential action (please refer to section 2.2 in BeT paper https://huggingface.co/papers/2206.11251).
# Thus, it predicts a historical action sequence, in addition to current and future actions (predicting future actions : optional).
if len_additional_action_token > 0:
features = torch.cat(
@@ -824,8 +824,8 @@ class VqVae(nn.Module):
return einops.rearrange(output, "N (T A) -> N T A", A=self.config.action_feature.shape[0])
def get_code(self, state):
# in phase 2 of VQ-BeT training, we need a `ground truth labels of action data` to calculate the Focal loss for code prediction head. (please refer to section 3.3 in the paper https://arxiv.org/pdf/2403.03181)
# this function outputs the `GT code` of given action using frozen encoder and quantization layers. (please refer to Figure 2. in the paper https://arxiv.org/pdf/2403.03181)
# in phase 2 of VQ-BeT training, we need a `ground truth labels of action data` to calculate the Focal loss for code prediction head. (please refer to section 3.3 in the paper https://huggingface.co/papers/2403.03181)
# this function outputs the `GT code` of given action using frozen encoder and quantization layers. (please refer to Figure 2. in the paper https://huggingface.co/papers/2403.03181)
state = einops.rearrange(state, "N T A -> N (T A)")
with torch.no_grad():
state_rep = self.encoder(state)
@@ -838,7 +838,7 @@ class VqVae(nn.Module):
return state_vq, vq_code
def vqvae_forward(self, state):
# This function passes the given data through Residual VQ with Encoder and Decoder. Please refer to section 3.2 in the paper https://arxiv.org/pdf/2403.03181).
# This function passes the given data through Residual VQ with Encoder and Decoder. Please refer to section 3.2 in the paper https://huggingface.co/papers/2403.03181).
state = einops.rearrange(state, "N T A -> N (T A)")
# We start with passing action (or action chunk) at:t+n through the encoder ϕ.
state_rep = self.encoder(state)

View File

@@ -336,7 +336,7 @@ class ResidualVQ(nn.Module):
"""
Residual VQ is composed of multiple VectorQuantize layers.
Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
Follows Algorithm 1. in https://huggingface.co/papers/2107.03312
"Residual Vector Quantizer (a.k.a. multi-stage vector quantizer [36]) cascades Nq layers of VQ as follows. The unquantized input vector is
passed through a first VQ and quantization residuals are computed. The residuals are then iteratively quantized by a sequence of additional
Nq -1 vector quantizers, as described in Algorithm 1."
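A toy sketch of the cascade the quote describes (nearest-neighbour lookup per stage, each stage quantizing the running residual); the codebook shapes here are illustrative:

```python
import torch

def residual_vq(x: torch.Tensor, codebooks: list[torch.Tensor]):
    """x: (N, D); each codebook: (K, D). Returns the summed quantization and per-stage codes."""
    residual, quantized, indices = x, torch.zeros_like(x), []
    for codebook in codebooks:
        idx = torch.cdist(residual, codebook).argmin(dim=-1)  # nearest code per vector
        q = codebook[idx]
        quantized = quantized + q  # accumulate stage outputs
        residual = residual - q    # next stage quantizes what is left
        indices.append(idx)
    return quantized, torch.stack(indices, dim=-1)
```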
@@ -1006,7 +1006,7 @@ def gumbel_sample(
if not straight_through or temperature <= 0.0 or not training:
return ind, one_hot
# use reinmax for better second-order accuracy - https://arxiv.org/abs/2304.08612
# use reinmax for better second-order accuracy - https://huggingface.co/papers/2304.08612
# algorithm 2
if reinmax:
@@ -1156,7 +1156,7 @@ def batched_embedding(indices, embeds):
def orthogonal_loss_fn(t):
# eq (2) from https://arxiv.org/abs/2112.00384
# eq (2) from https://huggingface.co/papers/2112.00384
h, n = t.shape[:2]
normed_codes = F.normalize(t, p=2, dim=-1)
cosine_sim = einsum("h i d, h j d -> h i j", normed_codes, normed_codes)

View File

@@ -250,6 +250,7 @@ def control_loop(
observation, action = robot.teleop_step(record_data=True)
else:
observation = robot.capture_observation()
action = None
if policy is not None:
pred_action = predict_action(
@@ -266,9 +267,10 @@ def control_loop(
# TODO(Steven): This should be more general (for RemoteRobot instead of checking the name, but anyways it will change soon)
if (display_data and not is_headless()) or (display_data and robot.robot_type.startswith("lekiwi")):
for k, v in action.items():
for i, vv in enumerate(v):
rr.log(f"sent_{k}_{i}", rr.Scalar(vv.numpy()))
if action is not None:
for k, v in action.items():
for i, vv in enumerate(v):
rr.log(f"sent_{k}_{i}", rr.Scalar(vv.numpy()))
image_keys = [key for key in observation if "image" in key]
for key in image_keys: