CI nightlies cpu/gpu & cleanup (#75)

2024-04-25 14:58:39 +02:00
parent 659c69a1c0
commit b980c5dd9e
14 changed files with 771 additions and 187 deletions
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -0,0 +1,207 @@
+# Inspired by
+# https://github.com/huggingface/peft/blob/main/.github/workflows/build_docker_images.yml
+name: Nightly Builds
+
+on:
+  workflow_dispatch:  # manual trigger
+  workflow_call:  # lets other workflows invoke this one
+  schedule:
+    - cron: "0 1 * * *"  # daily at 01:00 (GitHub evaluates cron in UTC)
+
+# concurrency:
+#   group: docker-image-builds
+#   cancel-in-progress: false
+
+env:
+  PYTHON_VERSION: "3.10"  # forwarded to both image builds as a build-arg
+#   CI_SLACK_CHANNEL: ${{ secrets.CI_DOCKER_CHANNEL }}
+
+jobs:
+  latest-cpu:  # builds docker/lerobot-cpu/Dockerfile and pushes it to Docker Hub
+    name: "Build CPU"
+    runs-on: ubuntu-latest
+    steps:
+      - name: Cleanup disk
+        run: |
+          sudo df -h
+          # Delete the android and dotnet directories to free disk space;
+          # the du/df calls before and after log how much was reclaimed.
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+          sudo df -h
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Check out code
+        uses: actions/checkout@v4
+
+      - name: Checkout gym-aloha
+        uses: actions/checkout@v4
+        with:
+          repository: huggingface/gym-aloha
+          path: envs/gym-aloha  # placed inside the Docker build context
+          ssh-key: ${{ secrets.SSH_PRIVATE_KEY }}
+
+      - name: Checkout gym-xarm
+        uses: actions/checkout@v4
+        with:
+          repository: huggingface/gym-xarm
+          path: envs/gym-xarm
+          ssh-key: ${{ secrets.SSH_PRIVATE_KEY }}
+
+      - name: Checkout gym-pusht
+        uses: actions/checkout@v4
+        with:
+          repository: huggingface/gym-pusht
+          path: envs/gym-pusht
+          ssh-key: ${{ secrets.SSH_PRIVATE_KEY }}
+
+      # HACK(aliberts): to be removed for release
+      # -----------------------------------------
+      - name: Set up Python ${{ env.PYTHON_VERSION }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}  # single source of truth
+
+      - name: Change envs dependencies as local path
+        run: python .github/scripts/dep_build.py
+      # -----------------------------------------
+
+      - name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+
+      - name: Build and Push CPU
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./docker/lerobot-cpu/Dockerfile
+          push: true
+          tags: huggingface/lerobot-cpu:latest  # explicit; nightly tests pull :latest
+          build-args: PYTHON_VERSION=${{ env.PYTHON_VERSION }}
+
+    #   - name: Post to a Slack channel
+    #     id: slack
+    #     #uses: slackapi/slack-github-action@v1.25.0
+    #     uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
+    #     with:
+    #       # Slack channel id, channel name, or user id to post message.
+    #       # See also: https://api.slack.com/methods/chat.postMessage#channels
+    #       channel-id: ${{ env.CI_SLACK_CHANNEL }}
+    #       # For posting a rich message using Block Kit
+    #       payload: |
+    #         {
+    #           "text": "lerobot-cpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}",
+    #           "blocks": [
+    #             {
+    #               "type": "section",
+    #               "text": {
+    #                 "type": "mrkdwn",
+    #                 "text": "lerobot-cpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}"
+    #               }
+    #             }
+    #           ]
+    #         }
+    #     env:
+    #       SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+
+  latest-cuda:  # builds docker/lerobot-gpu/Dockerfile and pushes it to Docker Hub
+    name: "Build GPU"
+    runs-on: ubuntu-latest
+    steps:
+      - name: Cleanup disk
+        run: |
+          sudo df -h
+          # Delete the android and dotnet directories to free disk space;
+          # the du/df calls before and after log how much was reclaimed.
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+          sudo df -h
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Check out code
+        uses: actions/checkout@v4
+
+      - name: Checkout gym-aloha
+        uses: actions/checkout@v4
+        with:
+          repository: huggingface/gym-aloha
+          path: envs/gym-aloha  # placed inside the Docker build context
+          ssh-key: ${{ secrets.SSH_PRIVATE_KEY }}
+
+      - name: Checkout gym-xarm
+        uses: actions/checkout@v4
+        with:
+          repository: huggingface/gym-xarm
+          path: envs/gym-xarm
+          ssh-key: ${{ secrets.SSH_PRIVATE_KEY }}
+
+      - name: Checkout gym-pusht
+        uses: actions/checkout@v4
+        with:
+          repository: huggingface/gym-pusht
+          path: envs/gym-pusht
+          ssh-key: ${{ secrets.SSH_PRIVATE_KEY }}
+
+      # HACK(aliberts): to be removed for release
+      # -----------------------------------------
+      - name: Set up Python ${{ env.PYTHON_VERSION }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}  # single source of truth
+
+      - name: Change envs dependencies as local path
+        run: python .github/scripts/dep_build.py
+      # -----------------------------------------
+
+      - name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+
+      - name: Build and Push GPU
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./docker/lerobot-gpu/Dockerfile
+          push: true
+          tags: huggingface/lerobot-gpu:latest  # explicit; nightly tests pull :latest
+          build-args: PYTHON_VERSION=${{ env.PYTHON_VERSION }}
+
+      # - name: Post to a Slack channel
+      #   id: slack
+      #   #uses: slackapi/slack-github-action@v1.25.0
+      #   uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
+      #   with:
+      #     # Slack channel id, channel name, or user id to post message.
+      #     # See also: https://api.slack.com/methods/chat.postMessage#channels
+      #     channel-id: ${{ env.CI_SLACK_CHANNEL }}
+      #     # For posting a rich message using Block Kit
+      #     payload: |
+      #       {
+      #         "text": "lerobot-gpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}",
+      #         "blocks": [
+      #           {
+      #             "type": "section",
+      #             "text": {
+      #               "type": "mrkdwn",
+      #               "text": "lerobot-gpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}"
+      #             }
+      #           }
+      #         ]
+      #       }
+      #   env:
+      #     SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
--- a/.github/workflows/nightly-tests.yml
+++ b/.github/workflows/nightly-tests.yml
@@ -0,0 +1,85 @@
+# Inspired by
+# https://github.com/huggingface/peft/blob/main/.github/workflows/nightly.yml
+name: Nightly Tests
+
+on:
+  workflow_dispatch:  # manual trigger
+  schedule:
+    - cron: "0 2 * * *"  # daily at 02:00 (GitHub evaluates cron in UTC)
+
+env:
+  DATA_DIR: tests/data  # workflow-level: applies to every job and step
+  # SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
+
+
+jobs:
+  run_all_tests_cpu:  # runs the test suite inside the lerobot-cpu image
+    name: "Test CPU"
+    strategy:
+      fail-fast: false
+    runs-on: ubuntu-latest
+    container:
+      image: huggingface/lerobot-cpu:latest
+      options: --shm-size "16gb"
+      credentials:
+        username: ${{ secrets.DOCKERHUB_USERNAME }}
+        password: ${{ secrets.DOCKERHUB_PASSWORD }}
+    defaults:
+      run:
+        shell: bash
+        working-directory: /lerobot
+    steps:
+      # DATA_DIR is not repeated per step: the workflow-level `env` already
+      # sets DATA_DIR=tests/data for every step of this job.
+      - name: Tests
+        run: pytest -v --cov=./lerobot --disable-warnings tests
+      # No checkout step: commands run in /lerobot inside the container
+      # (presumably the code baked into the nightly image - confirm).
+      - name: Tests end-to-end
+        run: make test-end-to-end
+
+  run_all_tests_single_gpu:  # runs the test suite inside the lerobot-gpu image
+    name: "Test GPU"
+    strategy:
+      fail-fast: false
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    env:
+      CUDA_VISIBLE_DEVICES: "0"
+      TEST_TYPE: "single_gpu"
+    container:
+      image: huggingface/lerobot-gpu:latest
+      options: --gpus all --shm-size "16gb"
+      credentials:
+        username: ${{ secrets.DOCKERHUB_USERNAME }}
+        password: ${{ secrets.DOCKERHUB_PASSWORD }}
+    defaults:
+      run:
+        shell: bash
+        working-directory: /lerobot
+    steps:
+      - name: Nvidia-smi
+        run: nvidia-smi
+      - name: Tests
+        run: pytest -v --cov=./lerobot --cov-report=xml --disable-warnings tests
+      #   TODO(aliberts): Link with HF Codecov account
+      # - name: Upload coverage reports to Codecov with GitHub Action
+      #   uses: codecov/codecov-action@v4
+      #   with:
+      #     files: ./coverage.xml
+      #     verbose: true
+      - name: Tests end-to-end
+        run: make test-end-to-end
+      - name: Tailscale Wait
+        if: ${{ failure() || runner.debug == '1' }}  # failed run or debug logging on
+        uses: huggingface/tailscale-action@v1
+        with:
+          waitForSSH: true
+          authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
+          slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
+          slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+
+    #   - name: Generate Report
+    #     if: always()
+    #     run: |
+    #       pip install slack_sdk tabulate
+    #       python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
--- a/.github/workflows/test-poetry_DEPRECATED.yml
+++ b/.github/workflows/test-poetry_DEPRECATED.yml
@@ -0,0 +1,120 @@
+# name: Tests poetry
+
+# on:
+#   pull_request:
+#     branches:
+#       - main
+#   push:
+#     branches:
+#       - main
+
+# jobs:
+#   tests:
+#     runs-on: ubuntu-latest
+#     env:
+#       POETRY_VERSION: 1.8.2
+#       DATA_DIR: tests/data
+#       MUJOCO_GL: egl
+#     steps:
+#       #----------------------------------------------
+#       #       check-out repo and set-up python
+#       #----------------------------------------------
+#       - name: Check out repository
+#         uses: actions/checkout@v4
+#         with:
+#           lfs: true
+
+#       - name: Set up python
+#         id: setup-python
+#         uses: actions/setup-python@v5
+#         with:
+#           python-version: '3.10'
+
+#       - name: Add SSH key for installing envs
+#         uses: webfactory/ssh-agent@v0.9.0
+#         with:
+#           ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+
+#       #----------------------------------------------
+#       #         install & configure poetry
+#       #----------------------------------------------
+#       - name: Load cached Poetry installation
+#         id: restore-poetry-cache
+#         uses: actions/cache/restore@v3
+#         with:
+#           path: ~/.local
+#           key: poetry-${{ env.POETRY_VERSION }}
+
+#       - name: Install Poetry
+#         if: steps.restore-poetry-cache.outputs.cache-hit != 'true'
+#         uses: snok/install-poetry@v1
+#         with:
+#           version: ${{ env.POETRY_VERSION }}
+#           virtualenvs-create: true
+#           installer-parallel: true
+
+#       - name: Save cached Poetry installation
+#         if: |
+#           steps.restore-poetry-cache.outputs.cache-hit != 'true' &&
+#           github.ref_name == 'main'
+#         id: save-poetry-cache
+#         uses: actions/cache/save@v3
+#         with:
+#           path: ~/.local
+#           key: poetry-${{ env.POETRY_VERSION }}
+
+#       - name: Configure Poetry
+#         run: poetry config virtualenvs.in-project true
+
+#       #----------------------------------------------
+#       #           install dependencies
+#       #----------------------------------------------
+#       # TODO(aliberts): move to gpu runners
+#       - name: Select cpu dependencies  # HACK
+#         run: cp -t . .github/poetry/cpu/pyproject.toml .github/poetry/cpu/poetry.lock
+
+#       - name: Load cached venv
+#         id: restore-dependencies-cache
+#         uses: actions/cache/restore@v3
+#         with:
+#           path: .venv
+#           key: venv-${{ steps.setup-python.outputs.python-version }}-${{ env.POETRY_VERSION }}-${{ hashFiles('**/poetry.lock') }}
+
+#       - name: Install dependencies
+#         if: steps.restore-dependencies-cache.outputs.cache-hit != 'true'
+#         run: poetry install --no-interaction --no-root --all-extras
+
+#       - name: Save cached venv
+#         if: |
+#             steps.restore-dependencies-cache.outputs.cache-hit != 'true' &&
+#             github.ref_name == 'main'
+#         id: save-dependencies-cache
+#         uses: actions/cache/save@v3
+#         with:
+#           path: .venv
+#           key: venv-${{ steps.setup-python.outputs.python-version }}-${{ env.POETRY_VERSION }}-${{ hashFiles('**/poetry.lock') }}
+
+#       - name: Install EGL
+#         run: sudo apt-get update && sudo apt-get install -y libegl1-mesa-dev
+
+#       #----------------------------------------------
+#       #             install project
+#       #----------------------------------------------
+#       - name: Install project
+#         run: poetry install --no-interaction --all-extras
+
+#       #----------------------------------------------
+#       #            run tests & coverage
+#       #----------------------------------------------
+#       - name: Run tests
+#         run: |
+#           source .venv/bin/activate
+#           pytest -v --cov=./lerobot tests
+
+#       #----------------------------------------------
+#       #            run end-to-end tests
+#       #----------------------------------------------
+#       - name: Tests end-to-end
+#         run: |
+#           source .venv/bin/activate
+#           make test-end-to-end
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -4,210 +4,43 @@ on:
  pull_request:
    branches:
      - main
-    types: [opened, synchronize, reopened, labeled]
  push:
    branches:
      - main

 jobs:
  tests:
-    if: |
-      ${{ github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'CI') }} ||
-      ${{ github.event_name == 'push' }}
    runs-on: ubuntu-latest
    env:
-      POETRY_VERSION: 1.8.2
      DATA_DIR: tests/data
      MUJOCO_GL: egl
    steps:
-      #----------------------------------------------
-      #       check-out repo and set-up python
-      #----------------------------------------------
-      - name: Check out repository
-        uses: actions/checkout@v4
-        with:
-          lfs: true
-
-      - name: Set up python
-        id: setup-python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10'
-
      - name: Add SSH key for installing envs
        uses: webfactory/ssh-agent@v0.9.0
        with:
          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}

-      #----------------------------------------------
-      #         install & configure poetry
-      #----------------------------------------------
-      - name: Load cached Poetry installation
-        id: restore-poetry-cache
-        uses: actions/cache/restore@v3
+      - uses: actions/checkout@v4
        with:
-          path: ~/.local
-          key: poetry-${{ env.POETRY_VERSION }}
+          lfs: true

-      - name: Install Poetry
-        if: steps.restore-poetry-cache.outputs.cache-hit != 'true'
-        uses: snok/install-poetry@v1
+      - name: Set up Python 3.10
+        uses: actions/setup-python@v5
        with:
-          version: ${{ env.POETRY_VERSION }}
-          virtualenvs-create: true
-          installer-parallel: true
+          python-version: "3.10"
+          cache: ${{ (github.ref == 'refs/heads/main') && 'pip' || '' }}
+          cache-dependency-path: pyproject.toml

-      - name: Save cached Poetry installation
-        if: |
-          steps.restore-poetry-cache.outputs.cache-hit != 'true' &&
-          github.ref_name == 'main'
-        id: save-poetry-cache
-        uses: actions/cache/save@v3
-        with:
-          path: ~/.local
-          key: poetry-${{ env.POETRY_VERSION }}
-
-      - name: Configure Poetry
-        run: poetry config virtualenvs.in-project true
-
-      #----------------------------------------------
-      #           install dependencies
-      #----------------------------------------------
-      # TODO(aliberts): move to gpu runners
-      - name: Select cpu dependencies  # HACK
-        run: cp -t . .github/poetry/cpu/pyproject.toml .github/poetry/cpu/poetry.lock
-
-      - name: Load cached venv
-        id: restore-dependencies-cache
-        uses: actions/cache/restore@v3
-        with:
-          path: .venv
-          key: venv-${{ steps.setup-python.outputs.python-version }}-${{ env.POETRY_VERSION }}-${{ hashFiles('**/poetry.lock') }}
-
-      - name: Install dependencies
-        if: steps.restore-dependencies-cache.outputs.cache-hit != 'true'
-        env:
-          TMPDIR: ~/tmp
-          TEMP: ~/tmp
-          TMP: ~/tmp
-        run: |
-          mkdir ~/tmp
-          poetry install --no-interaction --no-root --all-extras
-
-      - name: Save cached venv
-        if: |
-            steps.restore-dependencies-cache.outputs.cache-hit != 'true' &&
-            github.ref_name == 'main'
-        id: save-dependencies-cache
-        uses: actions/cache/save@v3
-        with:
-          path: .venv
-          key: venv-${{ steps.setup-python.outputs.python-version }}-${{ env.POETRY_VERSION }}-${{ hashFiles('**/poetry.lock') }}
-
-      - name: Install libegl1-mesa-dev (to use MUJOCO_GL=egl)
+      - name: Install EGL
        run: sudo apt-get update && sudo apt-get install -y libegl1-mesa-dev

-      #----------------------------------------------
-      #             install project
-      #----------------------------------------------
-      - name: Install project
-        run: poetry install --no-interaction --all-extras
-
-      #----------------------------------------------
-      #            run tests & coverage
-      #----------------------------------------------
-      - name: Run tests
+      - name: Install pip dependencies
        run: |
-          source .venv/bin/activate
-          pytest -v --cov=./lerobot --cov-report=xml tests
+          python -m pip install --upgrade pip
+          pip install -e ".[test, aloha, xarm, pusht]"

-      #   TODO(aliberts): Link with HF Codecov account
-      # - name: Upload coverage reports to Codecov with GitHub Action
-      #   uses: codecov/codecov-action@v4
-      #   with:
-      #     files: ./coverage.xml
-      #     verbose: true
+      - name: Test with pytest
+        run: pytest -v --cov=./lerobot tests

-      #----------------------------------------------
-      #            run end-to-end tests
-      #----------------------------------------------
-      - name: Test train ACT on ALOHA end-to-end
-        run: |
-          source .venv/bin/activate
-          python lerobot/scripts/train.py \
-            policy=act \
-            env=aloha \
-            wandb.enable=False \
-            offline_steps=2 \
-            online_steps=0 \
-            eval_episodes=1 \
-            device=cpu \
-            save_model=true \
-            save_freq=2 \
-            policy.n_action_steps=20 \
-            policy.chunk_size=20 \
-            policy.batch_size=2 \
-            hydra.run.dir=tests/outputs/act/
-
-      - name: Test eval ACT on ALOHA end-to-end
-        run: |
-          source .venv/bin/activate
-          python lerobot/scripts/eval.py \
-            --config tests/outputs/act/.hydra/config.yaml \
-            eval_episodes=1 \
-            env.episode_length=8 \
-            device=cpu \
-            policy.pretrained_model_path=tests/outputs/act/models/2.pt
-
-      - name: Test train Diffusion on PushT end-to-end
-        run: |
-          source .venv/bin/activate
-          python lerobot/scripts/train.py \
-            policy=diffusion \
-            env=pusht \
-            wandb.enable=False \
-            offline_steps=2 \
-            online_steps=0 \
-            eval_episodes=1 \
-            device=cpu \
-            save_model=true \
-            save_freq=2 \
-            policy.batch_size=2 \
-            hydra.run.dir=tests/outputs/diffusion/
-
-      - name: Test eval Diffusion on PushT end-to-end
-        run: |
-          source .venv/bin/activate
-          python lerobot/scripts/eval.py \
-            --config tests/outputs/diffusion/.hydra/config.yaml \
-            eval_episodes=1 \
-            env.episode_length=8 \
-            device=cpu \
-            policy.pretrained_model_path=tests/outputs/diffusion/models/2.pt
-
-      - name: Test train TDMPC on Simxarm end-to-end
-        run: |
-          source .venv/bin/activate
-          python lerobot/scripts/train.py \
-            policy=tdmpc \
-            env=xarm \
-            wandb.enable=False \
-            offline_steps=1 \
-            online_steps=2 \
-            eval_episodes=1 \
-            env.episode_length=2 \
-            device=cpu \
-            save_model=true \
-            save_freq=2 \
-            policy.batch_size=2 \
-            hydra.run.dir=tests/outputs/tdmpc/
-
-      - name: Test eval TDMPC on Simxarm end-to-end
-        run: |
-          source .venv/bin/activate
-          python lerobot/scripts/eval.py \
-            --config tests/outputs/tdmpc/.hydra/config.yaml \
-            eval_episodes=1 \
-            env.episode_length=8 \
-            device=cpu \
-            policy.pretrained_model_path=tests/outputs/tdmpc/models/2.pt
+      - name: Test end-to-end
+        run: make test-end-to-end