diff --git a/lerobot/common/policies/smolvla/smolvlm_with_expert.py b/lerobot/common/policies/smolvla/smolvlm_with_expert.py index ce3b5cd0..07eae808 100644 --- a/lerobot/common/policies/smolvla/smolvlm_with_expert.py +++ b/lerobot/common/policies/smolvla/smolvlm_with_expert.py @@ -133,9 +133,7 @@ class SmolVLMWithExpertModel(nn.Module): self.expert_hidden_size = lm_expert_config.hidden_size self.set_requires_grad() - def get_vlm_model( - self, - ): + def get_vlm_model(self): return self.vlm.model def set_requires_grad(self): diff --git a/lerobot/common/robot_devices/control_utils.py b/lerobot/common/robot_devices/control_utils.py index 88fe97ea..13beda8b 100644 --- a/lerobot/common/robot_devices/control_utils.py +++ b/lerobot/common/robot_devices/control_utils.py @@ -109,8 +109,9 @@ def predict_action(observation, policy, device, use_amp): ): # Convert to pytorch format: channel first and float32 in [0,1] with batch dimension for name in observation: + # Skip non-tensor observations (e.g. the textual inputs of VLA-like and multirobot policies) if not isinstance(observation[name], torch.Tensor): - continue # VLA-like and multirobot policies include textual inputs in the observation + continue if "image" in name: observation[name] = observation[name].type(torch.float32) / 255