feat(env): add eval_model parameter and result_dir support for vllm evaluation
@@ -111,6 +111,7 @@ class DesktopEnv(gym.Env):
         os_type: str = "Ubuntu",
         enable_proxy: bool = False,
         client_password: str = "",
+        eval_model: str = "gpt-5.2-chat-latest"
     ):
         """
         Args:
@@ -127,6 +128,7 @@ class DesktopEnv(gym.Env):
             require_terminal (bool): whether to require terminal output
             os_type (str): operating system type, default to "Ubuntu"
             enable_proxy (bool): whether to enable proxy support, default to False
+            eval_model (str): evaluation model to use, default to "gpt-5.2-chat-latest"
         """
         # Initialize VM manager and virtualization provider
         self.region = region
@@ -179,6 +181,9 @@ class DesktopEnv(gym.Env):
         self.require_a11y_tree = require_a11y_tree
         self.require_terminal = require_terminal
 
+        # Evaluation model
+        self.eval_model = eval_model
+
         # Initialize emulator and controller
         logger.info("Initializing...")
         self._start_emulator()
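The new parameter is stored on the instance at construction time, so the judge model is chosen once per environment. A minimal usage sketch, assuming the constructor arguments not shown in these hunks all have workable defaults (only os_type, enable_proxy, client_password, and eval_model are confirmed by this diff):

```python
# Hypothetical construction; any parameter not visible in the hunks
# above is assumed to default sensibly.
env = DesktopEnv(
    os_type="Ubuntu",
    enable_proxy=False,
    client_password="",
    eval_model="gpt-5.2-chat-latest",  # new: model used by vllm_eval
)
```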
@@ -425,7 +430,7 @@ class DesktopEnv(gym.Env):
 
         return observation, reward, done, info
 
-    def evaluate(self):
+    def evaluate(self, result_dir: Optional[str] = None):
         """
         Evaluate whether the task is successfully completed.
         """
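The new signature references Optional, which does not appear elsewhere in these hunks; assuming the module does not already import it, the commit would also need the standard typing import:

```python
from typing import Optional  # required by the new evaluate() signature
```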
@@ -448,6 +453,20 @@ class DesktopEnv(gym.Env):
         if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'):
             return 0
 
+        if self.evaluator['func'] == "vllm_eval":
+            logger.info("Preparing vllm_eval metric options...")
+            screenshot_bytes = self.controller.get_screenshot()
+
+            import base64
+            self.metric_options["instruction"] = self.instruction
+            self.metric_options["eval_model"] = self.eval_model
+
+            if result_dir:
+                self.metric_options["result_dir"] = result_dir
+                logger.info(f"Using result_dir for vllm_eval: {result_dir}")
+
+            logger.info(f"Evaluation options prepared: {self.metric_options.keys()}")
+
         if type(self.metric) == list:
             # Multiple metrics to evaluate whether the task is successfully completed
             results = []
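On the calling side, a harness can now thread the run's output directory into the vllm_eval judge, which reads it from metric_options["result_dir"]. A sketch of the intended flow, assuming the standard gym-style reset()/step() loop that DesktopEnv extends (reset's task_config argument and the result path are hypothetical; only step's 4-tuple return and evaluate's new parameter are confirmed by this diff):

```python
# Hypothetical driver loop around DesktopEnv.
obs = env.reset(task_config=task_config)  # task_config is assumed, not shown in this diff
for action in agent_actions:
    obs, reward, done, info = env.step(action)  # matches the 4-tuple returned above
    if done:
        break

# New in this commit: pass the run directory so vllm_eval can read
# metric_options["result_dir"] when scoring the trajectory.
score = env.evaluate(result_dir="./results/example_task")
```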