diff --git a/desktop_env/desktop_env.py b/desktop_env/desktop_env.py
index 096dc54..daba009 100644
--- a/desktop_env/desktop_env.py
+++ b/desktop_env/desktop_env.py
@@ -111,6 +111,7 @@ class DesktopEnv(gym.Env):
             os_type: str = "Ubuntu",
             enable_proxy: bool = False,
             client_password: str = "",
+            eval_model: str = "gpt-5.2-chat-latest"
     ):
         """
         Args:
@@ -127,6 +128,7 @@ class DesktopEnv(gym.Env):
             require_terminal (bool): whether to require terminal output
             os_type (str): operating system type, default to "Ubuntu"
             enable_proxy (bool): whether to enable proxy support, default to False
+            eval_model (str): evaluation model to use, default to "gpt-5.2-chat-latest"
         """
         # Initialize VM manager and vitualization provider
         self.region = region
@@ -179,6 +181,9 @@ class DesktopEnv(gym.Env):
         self.require_a11y_tree = require_a11y_tree
         self.require_terminal = require_terminal
 
+        # Evaluation model
+        self.eval_model = eval_model
+
         # Initialize emulator and controller
         logger.info("Initializing...")
         self._start_emulator()
@@ -425,7 +430,7 @@ class DesktopEnv(gym.Env):
 
         return observation, reward, done, info
 
-    def evaluate(self):
+    def evaluate(self, result_dir: Optional[str] = None):
         """
         Evaluate whether the task is successfully completed.
         """
@@ -448,6 +453,20 @@ class DesktopEnv(gym.Env):
         if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'):
             return 0
 
+        if self.evaluator['func'] == "vllm_eval":
+            logger.info("Preparing vllm_eval metric options...")
+            screenshot_bytes = self.controller.get_screenshot()
+
+            import base64
+            self.metric_options["instruction"] = self.instruction
+            self.metric_options["eval_model"] = self.eval_model
+
+            if result_dir:
+                self.metric_options["result_dir"] = result_dir
+                logger.info(f"Using result_dir for vllm_eval: {result_dir}")
+
+            logger.info(f"Evaluation options prepared: {self.metric_options.keys()}")
+
         if type(self.metric) == list:
             # Multiple metrics to evaluate whether the task is successfully completed
             results = []
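
For reference, a minimal usage sketch of the surface area this diff adds (Python). The task file name and result directory below are placeholders rather than paths from the repository, and the agent loop is elided:

import json

from desktop_env.desktop_env import DesktopEnv

# Placeholder task file; real task configs are JSON files shipped with the benchmark.
with open("task.json") as f:
    task_config = json.load(f)

# eval_model is stored on the env and forwarded to metric_options
# when the task's evaluator function is "vllm_eval".
env = DesktopEnv(eval_model="gpt-5.2-chat-latest")

obs = env.reset(task_config=task_config)
# ... drive the agent with env.step(action) until the episode ends ...

# result_dir is optional; when given, it is passed through to vllm_eval
# via metric_options so the judge can read and write run artifacts there.
score = env.evaluate(result_dir="./results/example_task")

One observation on the new vllm_eval branch: screenshot_bytes and import base64 are never used in it. Presumably the screenshot is meant to be base64-encoded and attached to metric_options for the judge model, but that wiring is not part of this diff.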