feat(env): add eval_model parameter and result_dir support for vllm evaluation

commit be24e77d93 (parent dd58a1de03)
Author: cui0711
Date:   2026-02-05 16:53:12 +08:00


@@ -111,6 +111,7 @@ class DesktopEnv(gym.Env):
         os_type: str = "Ubuntu",
         enable_proxy: bool = False,
         client_password: str = "",
+        eval_model: str = "gpt-5.2-chat-latest"
     ):
         """
         Args:
@@ -127,6 +128,7 @@ class DesktopEnv(gym.Env):
             require_terminal (bool): whether to require terminal output
             os_type (str): operating system type, default to "Ubuntu"
             enable_proxy (bool): whether to enable proxy support, default to False
+            eval_model (str): evaluation model to use, default to "gpt-5.2-chat-latest"
         """
         # Initialize VM manager and virtualization provider
         self.region = region
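
For orientation, a construction call using the new knob might look like the sketch below; the import path and every argument not visible in this diff are assumptions rather than the repository's confirmed API.

    # Sketch only: import path and non-diff arguments are assumptions.
    from desktop_env.desktop_env import DesktopEnv

    env = DesktopEnv(
        os_type="Ubuntu",
        enable_proxy=False,
        client_password="",
        eval_model="gpt-5.2-chat-latest",  # new: model used by vllm_eval
    )
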
@@ -179,6 +181,9 @@ class DesktopEnv(gym.Env):
         self.require_a11y_tree = require_a11y_tree
         self.require_terminal = require_terminal
+        # Evaluation model
+        self.eval_model = eval_model
         # Initialize emulator and controller
         logger.info("Initializing...")
         self._start_emulator()
@@ -425,7 +430,7 @@ class DesktopEnv(gym.Env):
         return observation, reward, done, info

-    def evaluate(self):
+    def evaluate(self, result_dir: Optional[str] = None):
         """
         Evaluate whether the task is successfully completed.
         """
@@ -448,6 +453,20 @@ class DesktopEnv(gym.Env):
         if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'):
             return 0

+        if self.evaluator['func'] == "vllm_eval":
+            logger.info("Preparing vllm_eval metric options...")
+            screenshot_bytes = self.controller.get_screenshot()
+            import base64
+            self.metric_options["instruction"] = self.instruction
+            self.metric_options["eval_model"] = self.eval_model
+            if result_dir:
+                self.metric_options["result_dir"] = result_dir
+                logger.info(f"Using result_dir for vllm_eval: {result_dir}")
+            logger.info(f"Evaluation options prepared: {self.metric_options.keys()}")

         if type(self.metric) == list:
             # Multiple metrics to evaluate whether the task is successfully completed
             results = []
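
The visible lines capture screenshot_bytes and import base64 but never use them, which suggests the line that attaches the screenshot was lost (the hunk header counts 14 added lines, only 10 appear). A plausible reconstruction, with the option key purely a guess:

    # Assumed continuation of the vllm_eval branch; the "screenshot" key
    # and the encoding step are guesses, not confirmed by the visible diff.
    self.metric_options["screenshot"] = base64.b64encode(screenshot_bytes).decode("utf-8")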