feat(env): add eval_model parameter and result_dir support for vllm evaluation
@@ -111,6 +111,7 @@ class DesktopEnv(gym.Env):
         os_type: str = "Ubuntu",
         enable_proxy: bool = False,
         client_password: str = "",
+        eval_model: str = "gpt-5.2-chat-latest"
     ):
         """
         Args:
@@ -127,6 +128,7 @@ class DesktopEnv(gym.Env):
             require_terminal (bool): whether to require terminal output
             os_type (str): operating system type, default to "Ubuntu"
             enable_proxy (bool): whether to enable proxy support, default to False
+            eval_model (str): evaluation model to use, default to "gpt-5.2-chat-latest"
         """
         # Initialize VM manager and virtualization provider
         self.region = region
@@ -179,6 +181,9 @@ class DesktopEnv(gym.Env):
         self.require_a11y_tree = require_a11y_tree
         self.require_terminal = require_terminal
 
+        # Evaluation model
+        self.eval_model = eval_model
+
         # Initialize emulator and controller
         logger.info("Initializing...")
         self._start_emulator()
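The new parameter is stored on the instance at construction time, so the judge model is chosen once per environment. A minimal usage sketch, assuming the constructor arguments not shown in these hunks all have workable defaults (only os_type, enable_proxy, client_password, and eval_model are confirmed by this diff):

```python
# Hypothetical construction; any parameter not visible in the hunks
# above is assumed to default sensibly.
env = DesktopEnv(
    os_type="Ubuntu",
    enable_proxy=False,
    client_password="",
    eval_model="gpt-5.2-chat-latest",  # new: model used by vllm_eval
)
```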
@@ -425,7 +430,7 @@ class DesktopEnv(gym.Env):
 
         return observation, reward, done, info
 
-    def evaluate(self):
+    def evaluate(self, result_dir: Optional[str] = None):
         """
         Evaluate whether the task is successfully completed.
         """
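The new signature references Optional, which does not appear elsewhere in these hunks; assuming the module does not already import it, the commit would also need the standard typing import:

```python
from typing import Optional  # required by the new evaluate() signature
```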
@@ -448,6 +453,20 @@ class DesktopEnv(gym.Env):
         if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'):
             return 0
 
+        if self.evaluator['func'] == "vllm_eval":
+            logger.info("Preparing vllm_eval metric options...")
+            screenshot_bytes = self.controller.get_screenshot()
+
+            import base64
+            self.metric_options["instruction"] = self.instruction
+            self.metric_options["eval_model"] = self.eval_model
+
+            if result_dir:
+                self.metric_options["result_dir"] = result_dir
+                logger.info(f"Using result_dir for vllm_eval: {result_dir}")
+
+            logger.info(f"Evaluation options prepared: {self.metric_options.keys()}")
+
         if type(self.metric) == list:
             # Multiple metrics to evaluate whether the task is successfully completed
             results = []
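On the calling side, a harness can now thread the run's output directory into the vllm_eval judge, which reads it from metric_options["result_dir"]. A sketch of the intended flow, assuming the standard gym-style reset()/step() loop that DesktopEnv extends (reset's task_config argument and the result path are hypothetical; only step's 4-tuple return and evaluate's new parameter are confirmed by this diff):

```python
# Hypothetical driver loop around DesktopEnv.
obs = env.reset(task_config=task_config)  # task_config is assumed, not shown in this diff
for action in agent_actions:
    obs, reward, done, info = env.step(action)  # matches the 4-tuple returned above
    if done:
        break

# New in this commit: pass the run directory so vllm_eval can read
# metric_options["result_dir"] when scoring the trajectory.
score = env.evaluate(result_dir="./results/example_task")
```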