CI nightlies cpu/gpu & cleanup (#75)

This commit is contained in:
Simon Alibert
2024-04-25 14:58:39 +02:00
committed by GitHub
parent 659c69a1c0
commit b980c5dd9e
14 changed files with 771 additions and 187 deletions

View File

@@ -0,0 +1,207 @@
# Nightly Docker image builds.
# Adapted from
# https://github.com/huggingface/peft/blob/main/.github/workflows/build_docker_images.yml
name: Nightly Builds

on:
  # Daily at 01:00 (GitHub evaluates cron schedules in UTC).
  schedule:
    - cron: '0 1 * * *'
  # Manual runs.
  workflow_dispatch:
  # Allows other workflows to call this one.
  workflow_call:

# Currently disabled:
# concurrency:
#   group: docker-image-builds
#   cancel-in-progress: false

env:
  # Forwarded to the Docker builds as the PYTHON_VERSION build argument.
  PYTHON_VERSION: '3.10'
  # CI_SLACK_CHANNEL: ${{ secrets.CI_DOCKER_CHANNEL }}
jobs:
latest-cpu:
  # Builds ./docker/lerobot-cpu/Dockerfile and pushes it as huggingface/lerobot-cpu.
  name: 'Build CPU'
  runs-on: ubuntu-latest
  steps:
    # Deletes the Android and .NET directories on the runner; the df/du calls
    # only log disk usage before and after the removal.
    - name: Cleanup disk
      run: |
        sudo df -h
        # sudo ls -l /usr/local/lib/
        # sudo ls -l /usr/share/
        sudo du -sh /usr/local/lib/
        sudo du -sh /usr/share/
        sudo rm -rf /usr/local/lib/android
        sudo rm -rf /usr/share/dotnet
        sudo du -sh /usr/local/lib/
        sudo du -sh /usr/share/
        sudo df -h

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    - name: Check out code
      uses: actions/checkout@v4

    # The three gym environments are checked out under envs/ using an SSH key.
    - name: Checkout gym-aloha
      uses: actions/checkout@v4
      with:
        ssh-key: ${{ secrets.SSH_PRIVATE_KEY }}
        repository: huggingface/gym-aloha
        path: envs/gym-aloha

    - name: Checkout gym-xarm
      uses: actions/checkout@v4
      with:
        ssh-key: ${{ secrets.SSH_PRIVATE_KEY }}
        repository: huggingface/gym-xarm
        path: envs/gym-xarm

    - name: Checkout gym-pusht
      uses: actions/checkout@v4
      with:
        ssh-key: ${{ secrets.SSH_PRIVATE_KEY }}
        repository: huggingface/gym-pusht
        path: envs/gym-pusht

    # HACK(aliberts): temporary, to be removed for release -- begin
    - name: Set up Python 3.10
      uses: actions/setup-python@v5
      with:
        python-version: '3.10'

    # Going by the step name, the script points the env dependencies at the
    # local checkouts above (script not visible here -- confirm).
    - name: Change envs dependencies as local path
      run: python .github/scripts/dep_build.py
    # HACK(aliberts): temporary, to be removed for release -- end

    - name: Login to DockerHub
      uses: docker/login-action@v3
      with:
        username: ${{ secrets.DOCKERHUB_USERNAME }}
        password: ${{ secrets.DOCKERHUB_PASSWORD }}

    - name: Build and Push CPU
      uses: docker/build-push-action@v5
      with:
        file: ./docker/lerobot-cpu/Dockerfile
        context: .
        build-args: PYTHON_VERSION=${{ env.PYTHON_VERSION }}
        tags: huggingface/lerobot-cpu
        push: true

    # Slack notification, currently disabled:
    # - name: Post to a Slack channel
    #   id: slack
    #   # uses: slackapi/slack-github-action@v1.25.0
    #   uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
    #   with:
    #     # Slack channel id, channel name, or user id to post message.
    #     # See also: https://api.slack.com/methods/chat.postMessage#channels
    #     channel-id: ${{ env.CI_SLACK_CHANNEL }}
    #     # For posting a rich message using Block Kit
    #     payload: |
    #       {
    #         "text": "lerobot-cpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}",
    #         "blocks": [
    #           {
    #             "type": "section",
    #             "text": {
    #               "type": "mrkdwn",
    #               "text": "lerobot-cpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}"
    #             }
    #           }
    #         ]
    #       }
    #   env:
    #     SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-cuda:
  # Builds ./docker/lerobot-gpu/Dockerfile and pushes it as huggingface/lerobot-gpu.
  name: 'Build GPU'
  runs-on: ubuntu-latest
  steps:
    # Deletes the Android and .NET directories on the runner; the df/du calls
    # only log disk usage before and after the removal.
    - name: Cleanup disk
      run: |
        sudo df -h
        # sudo ls -l /usr/local/lib/
        # sudo ls -l /usr/share/
        sudo du -sh /usr/local/lib/
        sudo du -sh /usr/share/
        sudo rm -rf /usr/local/lib/android
        sudo rm -rf /usr/share/dotnet
        sudo du -sh /usr/local/lib/
        sudo du -sh /usr/share/
        sudo df -h

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    - name: Check out code
      uses: actions/checkout@v4

    # The three gym environments are checked out under envs/ using an SSH key.
    - name: Checkout gym-aloha
      uses: actions/checkout@v4
      with:
        ssh-key: ${{ secrets.SSH_PRIVATE_KEY }}
        repository: huggingface/gym-aloha
        path: envs/gym-aloha

    - name: Checkout gym-xarm
      uses: actions/checkout@v4
      with:
        ssh-key: ${{ secrets.SSH_PRIVATE_KEY }}
        repository: huggingface/gym-xarm
        path: envs/gym-xarm

    - name: Checkout gym-pusht
      uses: actions/checkout@v4
      with:
        ssh-key: ${{ secrets.SSH_PRIVATE_KEY }}
        repository: huggingface/gym-pusht
        path: envs/gym-pusht

    # HACK(aliberts): temporary, to be removed for release -- begin
    - name: Set up Python 3.10
      uses: actions/setup-python@v5
      with:
        python-version: '3.10'

    # Going by the step name, the script points the env dependencies at the
    # local checkouts above (script not visible here -- confirm).
    - name: Change envs dependencies as local path
      run: python .github/scripts/dep_build.py
    # HACK(aliberts): temporary, to be removed for release -- end

    - name: Login to DockerHub
      uses: docker/login-action@v3
      with:
        username: ${{ secrets.DOCKERHUB_USERNAME }}
        password: ${{ secrets.DOCKERHUB_PASSWORD }}

    - name: Build and Push GPU
      uses: docker/build-push-action@v5
      with:
        file: ./docker/lerobot-gpu/Dockerfile
        context: .
        build-args: PYTHON_VERSION=${{ env.PYTHON_VERSION }}
        tags: huggingface/lerobot-gpu
        push: true

    # Slack notification, currently disabled:
    # - name: Post to a Slack channel
    #   id: slack
    #   # uses: slackapi/slack-github-action@v1.25.0
    #   uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
    #   with:
    #     # Slack channel id, channel name, or user id to post message.
    #     # See also: https://api.slack.com/methods/chat.postMessage#channels
    #     channel-id: ${{ env.CI_SLACK_CHANNEL }}
    #     # For posting a rich message using Block Kit
    #     payload: |
    #       {
    #         "text": "lerobot-gpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}",
    #         "blocks": [
    #           {
    #             "type": "section",
    #             "text": {
    #               "type": "mrkdwn",
    #               "text": "lerobot-gpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}"
    #             }
    #           }
    #         ]
    #       }
    #   env:
    #     SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

85
.github/workflows/nightly-tests.yml vendored Normal file
View File

@@ -0,0 +1,85 @@
# Nightly test runs inside the huggingface/lerobot-cpu and lerobot-gpu images.
# Adapted from
# https://github.com/huggingface/peft/blob/main/.github/workflows/nightly.yml
name: Nightly Tests

on:
  # Daily at 02:00 (GitHub evaluates cron schedules in UTC).
  schedule:
    - cron: '0 2 * * *'
  # Manual runs.
  workflow_dispatch:

env:
  DATA_DIR: tests/data
  # SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
jobs:
run_all_tests_cpu:
  # Runs the test suite and the end-to-end target inside the CPU image.
  name: 'Test CPU'
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
  container:
    image: huggingface/lerobot-cpu:latest
    credentials:
      username: ${{ secrets.DOCKERHUB_USERNAME }}
      password: ${{ secrets.DOCKERHUB_PASSWORD }}
    options: --shm-size "16gb"
  defaults:
    run:
      # Every `run` step executes with bash from /lerobot inside the container.
      working-directory: /lerobot
      shell: bash
  steps:
    - name: Tests
      run: pytest -v --cov=./lerobot --disable-warnings tests
      env:
        DATA_DIR: tests/data

    - name: Tests end-to-end
      run: make test-end-to-end
      env:
        DATA_DIR: tests/data
run_all_tests_single_gpu:
  # Runs the test suite and the end-to-end target inside the GPU image.
  name: 'Test GPU'
  # The runner must carry all four labels.
  runs-on:
    - single-gpu
    - nvidia-gpu
    - t4
    - ci
  strategy:
    fail-fast: false
  env:
    CUDA_VISIBLE_DEVICES: '0'
    TEST_TYPE: 'single_gpu'
  container:
    image: huggingface/lerobot-gpu:latest
    credentials:
      username: ${{ secrets.DOCKERHUB_USERNAME }}
      password: ${{ secrets.DOCKERHUB_PASSWORD }}
    options: --gpus all --shm-size "16gb"
  defaults:
    run:
      # Every `run` step executes with bash from /lerobot inside the container.
      working-directory: /lerobot
      shell: bash
  steps:
    - name: Nvidia-smi
      run: nvidia-smi

    - name: Test
      run: pytest -v --cov=./lerobot --cov-report=xml --disable-warnings tests

    # TODO(aliberts): Link with HF Codecov account
    # - name: Upload coverage reports to Codecov with GitHub Action
    #   uses: codecov/codecov-action@v4
    #   with:
    #     files: ./coverage.xml
    #     verbose: true

    - name: Tests end-to-end
      run: make test-end-to-end

    # Only runs when an earlier step failed or runner debug logging is on.
    # NOTE(review): waitForSSH presumably holds the job open for an SSH
    # session -- confirm against the action's documentation.
    - name: Tailscale Wait
      if: ${{ failure() || runner.debug == '1' }}
      uses: huggingface/tailscale-action@v1
      with:
        authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
        waitForSSH: true
        slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
        slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

    # Report generation, currently disabled:
    # - name: Generate Report
    #   if: always()
    #   run: |
    #     pip install slack_sdk tabulate
    #     python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY

View File

@@ -0,0 +1,120 @@
# name: Tests poetry
# on:
# pull_request:
# branches:
# - main
# push:
# branches:
# - main
# jobs:
# tests:
# runs-on: ubuntu-latest
# env:
# POETRY_VERSION: 1.8.2
# DATA_DIR: tests/data
# MUJOCO_GL: egl
# steps:
# #----------------------------------------------
# # check-out repo and set-up python
# #----------------------------------------------
# - name: Check out repository
# uses: actions/checkout@v4
# with:
# lfs: true
# - name: Set up python
# id: setup-python
# uses: actions/setup-python@v5
# with:
# python-version: '3.10'
# - name: Add SSH key for installing envs
# uses: webfactory/ssh-agent@v0.9.0
# with:
# ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
# #----------------------------------------------
# # install & configure poetry
# #----------------------------------------------
# - name: Load cached Poetry installation
# id: restore-poetry-cache
# uses: actions/cache/restore@v3
# with:
# path: ~/.local
# key: poetry-${{ env.POETRY_VERSION }}
# - name: Install Poetry
# if: steps.restore-poetry-cache.outputs.cache-hit != 'true'
# uses: snok/install-poetry@v1
# with:
# version: ${{ env.POETRY_VERSION }}
# virtualenvs-create: true
# installer-parallel: true
# - name: Save cached Poetry installation
# if: |
# steps.restore-poetry-cache.outputs.cache-hit != 'true' &&
# github.ref_name == 'main'
# id: save-poetry-cache
# uses: actions/cache/save@v3
# with:
# path: ~/.local
# key: poetry-${{ env.POETRY_VERSION }}
# - name: Configure Poetry
# run: poetry config virtualenvs.in-project true
# #----------------------------------------------
# # install dependencies
# #----------------------------------------------
# # TODO(aliberts): move to gpu runners
# - name: Select cpu dependencies # HACK
# run: cp -t . .github/poetry/cpu/pyproject.toml .github/poetry/cpu/poetry.lock
# - name: Load cached venv
# id: restore-dependencies-cache
# uses: actions/cache/restore@v3
# with:
# path: .venv
# key: venv-${{ steps.setup-python.outputs.python-version }}-${{ env.POETRY_VERSION }}-${{ hashFiles('**/poetry.lock') }}
# - name: Install dependencies
# if: steps.restore-dependencies-cache.outputs.cache-hit != 'true'
# run: poetry install --no-interaction --no-root --all-extras
# - name: Save cached venv
# if: |
# steps.restore-dependencies-cache.outputs.cache-hit != 'true' &&
# github.ref_name == 'main'
# id: save-dependencies-cache
# uses: actions/cache/save@v3
# with:
# path: .venv
# key: venv-${{ steps.setup-python.outputs.python-version }}-${{ env.POETRY_VERSION }}-${{ hashFiles('**/poetry.lock') }}
# - name: Install EGL
# run: sudo apt-get update && sudo apt-get install -y libegl1-mesa-dev
# #----------------------------------------------
# # install project
# #----------------------------------------------
# - name: Install project
# run: poetry install --no-interaction --all-extras
# #----------------------------------------------
# # run tests & coverage
# #----------------------------------------------
# - name: Run tests
# run: |
# source .venv/bin/activate
# pytest -v --cov=./lerobot tests
# #----------------------------------------------
# # run end-to-end tests
# #----------------------------------------------
# - name: Tests end-to-end
# run: |
# source .venv/bin/activate
# make test-end-to-end

View File

@@ -4,210 +4,43 @@ on:
pull_request:
branches:
- main
types: [opened, synchronize, reopened, labeled]
push:
branches:
- main
jobs:
tests:
# Run on every push, and on pull requests only when they carry the 'CI' label.
# The condition is written as a single bare expression: mixing ${{ }} fragments
# with literal text makes GitHub evaluate the result as a non-empty string,
# which is always truthy, so the label check would never skip the job.
if: >-
  github.event_name == 'push' ||
  (github.event_name == 'pull_request' &&
  contains(github.event.pull_request.labels.*.name, 'CI'))
runs-on: ubuntu-latest
env:
POETRY_VERSION: 1.8.2
DATA_DIR: tests/data
MUJOCO_GL: egl
steps:
#----------------------------------------------
# check-out repo and set-up python
#----------------------------------------------
- name: Check out repository
uses: actions/checkout@v4
with:
lfs: true
- name: Set up python
id: setup-python
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Add SSH key for installing envs
uses: webfactory/ssh-agent@v0.9.0
with:
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
#----------------------------------------------
# install & configure poetry
#----------------------------------------------
- name: Load cached Poetry installation
id: restore-poetry-cache
uses: actions/cache/restore@v3
- uses: actions/checkout@v4
with:
path: ~/.local
key: poetry-${{ env.POETRY_VERSION }}
lfs: true
- name: Install Poetry
if: steps.restore-poetry-cache.outputs.cache-hit != 'true'
uses: snok/install-poetry@v1
- name: Set up Python 3.10
uses: actions/setup-python@v5
with:
version: ${{ env.POETRY_VERSION }}
virtualenvs-create: true
installer-parallel: true
python-version: "3.10"
cache: ${{ (github.ref == 'refs/heads/main') && 'pip' || '' }}
cache-dependency-path: pyproject.toml
- name: Save cached Poetry installation
if: |
steps.restore-poetry-cache.outputs.cache-hit != 'true' &&
github.ref_name == 'main'
id: save-poetry-cache
uses: actions/cache/save@v3
with:
path: ~/.local
key: poetry-${{ env.POETRY_VERSION }}
- name: Configure Poetry
run: poetry config virtualenvs.in-project true
#----------------------------------------------
# install dependencies
#----------------------------------------------
# TODO(aliberts): move to gpu runners
- name: Select cpu dependencies # HACK
run: cp -t . .github/poetry/cpu/pyproject.toml .github/poetry/cpu/poetry.lock
- name: Load cached venv
id: restore-dependencies-cache
uses: actions/cache/restore@v3
with:
path: .venv
key: venv-${{ steps.setup-python.outputs.python-version }}-${{ env.POETRY_VERSION }}-${{ hashFiles('**/poetry.lock') }}
- name: Install dependencies
if: steps.restore-dependencies-cache.outputs.cache-hit != 'true'
env:
TMPDIR: ~/tmp
TEMP: ~/tmp
TMP: ~/tmp
run: |
mkdir ~/tmp
poetry install --no-interaction --no-root --all-extras
- name: Save cached venv
if: |
steps.restore-dependencies-cache.outputs.cache-hit != 'true' &&
github.ref_name == 'main'
id: save-dependencies-cache
uses: actions/cache/save@v3
with:
path: .venv
key: venv-${{ steps.setup-python.outputs.python-version }}-${{ env.POETRY_VERSION }}-${{ hashFiles('**/poetry.lock') }}
- name: Install libegl1-mesa-dev (to use MUJOCO_GL=egl)
- name: Install EGL
run: sudo apt-get update && sudo apt-get install -y libegl1-mesa-dev
#----------------------------------------------
# install project
#----------------------------------------------
- name: Install project
run: poetry install --no-interaction --all-extras
#----------------------------------------------
# run tests & coverage
#----------------------------------------------
- name: Run tests
- name: Install pip dependencies
run: |
source .venv/bin/activate
pytest -v --cov=./lerobot --cov-report=xml tests
python -m pip install --upgrade pip
pip install -e ".[test, aloha, xarm, pusht]"
# TODO(aliberts): Link with HF Codecov account
# - name: Upload coverage reports to Codecov with GitHub Action
# uses: codecov/codecov-action@v4
# with:
# files: ./coverage.xml
# verbose: true
- name: Test with pytest
run: pytest -v --cov=./lerobot tests
#----------------------------------------------
# run end-to-end tests
#----------------------------------------------
- name: Test train ACT on ALOHA end-to-end
run: |
source .venv/bin/activate
python lerobot/scripts/train.py \
policy=act \
env=aloha \
wandb.enable=False \
offline_steps=2 \
online_steps=0 \
eval_episodes=1 \
device=cpu \
save_model=true \
save_freq=2 \
policy.n_action_steps=20 \
policy.chunk_size=20 \
policy.batch_size=2 \
hydra.run.dir=tests/outputs/act/
- name: Test eval ACT on ALOHA end-to-end
run: |
source .venv/bin/activate
python lerobot/scripts/eval.py \
--config tests/outputs/act/.hydra/config.yaml \
eval_episodes=1 \
env.episode_length=8 \
device=cpu \
policy.pretrained_model_path=tests/outputs/act/models/2.pt
- name: Test train Diffusion on PushT end-to-end
run: |
source .venv/bin/activate
python lerobot/scripts/train.py \
policy=diffusion \
env=pusht \
wandb.enable=False \
offline_steps=2 \
online_steps=0 \
eval_episodes=1 \
device=cpu \
save_model=true \
save_freq=2 \
policy.batch_size=2 \
hydra.run.dir=tests/outputs/diffusion/
- name: Test eval Diffusion on PushT end-to-end
run: |
source .venv/bin/activate
python lerobot/scripts/eval.py \
--config tests/outputs/diffusion/.hydra/config.yaml \
eval_episodes=1 \
env.episode_length=8 \
device=cpu \
policy.pretrained_model_path=tests/outputs/diffusion/models/2.pt
- name: Test train TDMPC on Simxarm end-to-end
run: |
source .venv/bin/activate
python lerobot/scripts/train.py \
policy=tdmpc \
env=xarm \
wandb.enable=False \
offline_steps=1 \
online_steps=2 \
eval_episodes=1 \
env.episode_length=2 \
device=cpu \
save_model=true \
save_freq=2 \
policy.batch_size=2 \
hydra.run.dir=tests/outputs/tdmpc/
- name: Test eval TDMPC on Simxarm end-to-end
run: |
source .venv/bin/activate
python lerobot/scripts/eval.py \
--config tests/outputs/tdmpc/.hydra/config.yaml \
eval_episodes=1 \
env.episode_length=8 \
device=cpu \
policy.pretrained_model_path=tests/outputs/tdmpc/models/2.pt
- name: Test end-to-end
run: make test-end-to-end