feat(env): add eval_model parameter and result_dir support for vllm evaluation
This commit is contained in:
@@ -111,6 +111,7 @@ class DesktopEnv(gym.Env):
|
|||||||
os_type: str = "Ubuntu",
|
os_type: str = "Ubuntu",
|
||||||
enable_proxy: bool = False,
|
enable_proxy: bool = False,
|
||||||
client_password: str = "",
|
client_password: str = "",
|
||||||
|
eval_model: str = "gpt-5.2-chat-latest"
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
@@ -127,6 +128,7 @@ class DesktopEnv(gym.Env):
|
|||||||
require_terminal (bool): whether to require terminal output
|
require_terminal (bool): whether to require terminal output
|
||||||
os_type (str): operating system type, default to "Ubuntu"
|
os_type (str): operating system type, default to "Ubuntu"
|
||||||
enable_proxy (bool): whether to enable proxy support, default to False
|
enable_proxy (bool): whether to enable proxy support, default to False
|
||||||
|
eval_model (str): evaluation model to use, default to "gpt-5.2-chat-latest"
|
||||||
"""
|
"""
|
||||||
# Initialize VM manager and virtualization provider
|
# Initialize VM manager and virtualization provider
|
||||||
self.region = region
|
self.region = region
|
||||||
@@ -179,6 +181,9 @@ class DesktopEnv(gym.Env):
|
|||||||
self.require_a11y_tree = require_a11y_tree
|
self.require_a11y_tree = require_a11y_tree
|
||||||
self.require_terminal = require_terminal
|
self.require_terminal = require_terminal
|
||||||
|
|
||||||
|
# Evaluation model
|
||||||
|
self.eval_model = eval_model
|
||||||
|
|
||||||
# Initialize emulator and controller
|
# Initialize emulator and controller
|
||||||
logger.info("Initializing...")
|
logger.info("Initializing...")
|
||||||
self._start_emulator()
|
self._start_emulator()
|
||||||
@@ -425,7 +430,7 @@ class DesktopEnv(gym.Env):
|
|||||||
|
|
||||||
return observation, reward, done, info
|
return observation, reward, done, info
|
||||||
|
|
||||||
def evaluate(self):
|
def evaluate(self, result_dir: Optional[str] = None):
|
||||||
"""
|
"""
|
||||||
Evaluate whether the task is successfully completed.
|
Evaluate whether the task is successfully completed.
|
||||||
"""
|
"""
|
||||||
@@ -448,6 +453,20 @@ class DesktopEnv(gym.Env):
|
|||||||
if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'):
|
if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
if self.evaluator['func'] == "vllm_eval":
|
||||||
|
logger.info("Preparing vllm_eval metric options...")
|
||||||
|
screenshot_bytes = self.controller.get_screenshot()
|
||||||
|
|
||||||
|
import base64
|
||||||
|
self.metric_options["instruction"] = self.instruction
|
||||||
|
self.metric_options["eval_model"] = self.eval_model
|
||||||
|
|
||||||
|
if result_dir:
|
||||||
|
self.metric_options["result_dir"] = result_dir
|
||||||
|
logger.info(f"Using result_dir for vllm_eval: {result_dir}")
|
||||||
|
|
||||||
|
logger.info(f"Evaluation options prepared: {self.metric_options.keys()}")
|
||||||
|
|
||||||
if type(self.metric) == list:
|
if type(self.metric) == list:
|
||||||
# Multiple metrics to evaluate whether the task is successfully completed
|
# Multiple metrics to evaluate whether the task is successfully completed
|
||||||
results = []
|
results = []
|
||||||
|
|||||||
Reference in New Issue
Block a user