refactor(config): Move device & amp args to PreTrainedConfig (#812)

Co-authored-by: Simon Alibert <75076266+aliberts@users.noreply.github.com>
2025-03-06 17:59:28 +01:00
parent 10706ed753
commit 5e9473806c
19 changed files with 62 additions and 136 deletions
--- a/lerobot/configs/eval.py
+++ b/lerobot/configs/eval.py
@@ -18,11 +18,9 @@ from dataclasses import dataclass, field
 from pathlib import Path

 from lerobot.common import envs, policies  # noqa: F401
-from lerobot.common.utils.utils import auto_select_torch_device, is_amp_available, is_torch_device_available
 from lerobot.configs import parser
 from lerobot.configs.default import EvalConfig
 from lerobot.configs.policies import PreTrainedConfig
-from lerobot.configs.train import TrainPipelineConfig


@dataclass
@@ -35,11 +33,6 @@ class EvalPipelineConfig:
    policy: PreTrainedConfig | None = None
    output_dir: Path | None = None
    job_name: str | None = None
-    # TODO(rcadene, aliberts): By default, use device and use_amp values from policy checkpoint.
-    device: str | None = None  # cuda | cpu | mps
-    # `use_amp` determines whether to use Automatic Mixed Precision (AMP) for training and evaluation. With AMP,
-    # automatic gradient scaling is used.
-    use_amp: bool = False
    seed: int | None = 1000

    def __post_init__(self):
@@ -50,27 +43,6 @@ class EvalPipelineConfig:
            self.policy = PreTrainedConfig.from_pretrained(policy_path, cli_overrides=cli_overrides)
            self.policy.pretrained_path = policy_path

-            # When no device or use_amp are given, use the one from training config.
-            if self.device is None or self.use_amp is None:
-                train_cfg = TrainPipelineConfig.from_pretrained(policy_path)
-                if self.device is None:
-                    self.device = train_cfg.device
-                if self.use_amp is None:
-                    self.use_amp = train_cfg.use_amp
-
-            # Automatically switch to available device if necessary
-            if not is_torch_device_available(self.device):
-                auto_device = auto_select_torch_device()
-                logging.warning(f"Device '{self.device}' is not available. Switching to '{auto_device}'.")
-                self.device = auto_device
-
-            # Automatically deactivate AMP if necessary
-            if self.use_amp and not is_amp_available(self.device):
-                logging.warning(
-                    f"Automatic Mixed Precision (amp) is not available on device '{self.device}'. Deactivating AMP."
-                )
-                self.use_amp = False
-
        else:
            logging.warning(
                "No pretrained path was provided, evaluated policy will be built from scratch (random weights)."
@@ -87,11 +59,6 @@ class EvalPipelineConfig:
            eval_dir = f"{now:%Y-%m-%d}/{now:%H-%M-%S}_{self.job_name}"
            self.output_dir = Path("outputs/eval") / eval_dir

-        if self.device is None:
-            raise ValueError("Set one of the following device: cuda, cpu or mps")
-        elif self.device == "cuda" and self.use_amp is None:
-            raise ValueError("Set 'use_amp' to True or False.")
-
    @classmethod
    def __get_path_fields__(cls) -> list[str]:
        """This enables the parser to load config from the policy using `--policy.path=local/dir`"""
--- a/lerobot/configs/policies.py
+++ b/lerobot/configs/policies.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import abc
+import logging
 import os
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -25,6 +26,7 @@ from huggingface_hub.errors import HfHubHTTPError
 from lerobot.common.optim.optimizers import OptimizerConfig
 from lerobot.common.optim.schedulers import LRSchedulerConfig
 from lerobot.common.utils.hub import HubMixin
+from lerobot.common.utils.utils import auto_select_torch_device, is_amp_available, is_torch_device_available
 from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature

 # Generic variable that is either PreTrainedConfig or a subclass thereof
@@ -53,8 +55,24 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC):
    input_features: dict[str, PolicyFeature] = field(default_factory=dict)
    output_features: dict[str, PolicyFeature] = field(default_factory=dict)

+    device: str | None = None  # cuda | cpu | mps
+    # `use_amp` determines whether to use Automatic Mixed Precision (AMP) for training and evaluation. With AMP,
+    # automatic gradient scaling is used.
+    use_amp: bool = False
+
    def __post_init__(self):
+        """Resolve `device` and `use_amp` to values usable on the current machine.
+
+        `pretrained_path` is reset to None. When `device` is unset, or
+        `is_torch_device_available` rejects it, `device` is replaced by the `.type` of
+        the device returned by `auto_select_torch_device`. `use_amp` is then switched
+        off if `is_amp_available` reports no AMP support for the resulting device.
+        """
        self.pretrained_path = None
+        if not self.device or not is_torch_device_available(self.device):
+            auto_device = auto_select_torch_device()
+            # An unset device is the default case: report it as such rather than
+            # claiming that a device named 'None' is unavailable.
+            if self.device:
+                logging.warning(f"Device '{self.device}' is not available. Switching to '{auto_device}'.")
+            else:
+                logging.warning(f"No device specified. Switching to '{auto_device}'.")
+            self.device = auto_device.type
+
+        # Automatically deactivate AMP if necessary
+        if self.use_amp and not is_amp_available(self.device):
+            logging.warning(
+                f"Automatic Mixed Precision (amp) is not available on device '{self.device}'. Deactivating AMP."
+            )
+            self.use_amp = False

    @property
    def type(self) -> str:
--- a/lerobot/configs/train.py
+++ b/lerobot/configs/train.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import datetime as dt
-import logging
 import os
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -26,7 +25,6 @@ from lerobot.common import envs
 from lerobot.common.optim import OptimizerConfig
 from lerobot.common.optim.schedulers import LRSchedulerConfig
 from lerobot.common.utils.hub import HubMixin
-from lerobot.common.utils.utils import auto_select_torch_device, is_amp_available
 from lerobot.configs import parser
 from lerobot.configs.default import DatasetConfig, EvalConfig, WandBConfig
 from lerobot.configs.policies import PreTrainedConfig
@@ -48,10 +46,6 @@ class TrainPipelineConfig(HubMixin):
    # Note that when resuming a run, the default behavior is to use the configuration from the checkpoint,
    # regardless of what's provided with the training command at the time of resumption.
    resume: bool = False
-    device: str | None = None  # cuda | cpu | mp
-    # `use_amp` determines whether to use Automatic Mixed Precision (AMP) for training and evaluation. With AMP,
-    # automatic gradient scaling is used.
-    use_amp: bool = False
    # `seed` is used for training (eg: model initialization, dataset shuffling)
    # AND for the evaluation environments.
    seed: int | None = 1000
@@ -74,18 +68,6 @@ class TrainPipelineConfig(HubMixin):
        self.checkpoint_path = None

    def validate(self):
-        if not self.device:
-            logging.warning("No device specified, trying to infer device automatically")
-            device = auto_select_torch_device()
-            self.device = device.type
-
-        # Automatically deactivate AMP if necessary
-        if self.use_amp and not is_amp_available(self.device):
-            logging.warning(
-                f"Automatic Mixed Precision (amp) is not available on device '{self.device}'. Deactivating AMP."
-            )
-            self.use_amp = False
-
        # HACK: We parse again the cli args here to get the pretrained paths if there was some.
        policy_path = parser.get_path_arg("policy")
        if policy_path: