feat: 新增科研软件 benchmark 任务数据

- 新增 avogadro/imagej/jade/origin/ovito/pymol/vesta 等科研软件任务 JSON
- 修改 vllm_eval.py,修改图片文件名称为第x步
- desktop_env.py 添加额外数据参数 config 和 metadata
This commit is contained in:
2026-02-25 15:19:36 +08:00
parent 613f55f0da
commit 9899d4a0c7
85 changed files with 4703 additions and 71 deletions

View File

@@ -20,42 +20,42 @@ Metric = Callable[[Any, Any], float]
Getter = Callable[[gym.Env, Dict[str, Any]], Any]
MAX_RETRIES = 5 # Maximum retries for environment setup
def _fix_pyautogui_less_than_bug(command: str) -> str:
"""
Fix PyAutoGUI '<' character bug by converting it to hotkey("shift", ',') calls.
This fixes the known PyAutoGUI issue where typing '<' produces '>' instead.
References:
- https://github.com/asweigart/pyautogui/issues/198
- https://github.com/xlang-ai/OSWorld/issues/257
Args:
command (str): The original pyautogui command
Returns:
str: The fixed command with '<' characters handled properly
"""
# Pattern to match press('<') or press('\u003c') calls
# Pattern to match press('<') or press('\u003c') calls
press_pattern = r'pyautogui\.press\(["\'](?:<|\\u003c)["\']\)'
# Handle press('<') calls
def replace_press_less_than(match):
return 'pyautogui.hotkey("shift", ",")'
# First handle press('<') calls
command = re.sub(press_pattern, replace_press_less_than, command)
# Pattern to match typewrite calls with quoted strings
typewrite_pattern = r'pyautogui\.typewrite\((["\'])(.*?)\1\)'
# Then handle typewrite calls
def process_typewrite_match(match):
quote_char = match.group(1)
content = match.group(2)
# Preprocess: Try to decode Unicode escapes like \u003c to actual '<'
# This handles cases where '<' is represented as escaped Unicode
try:
@@ -65,15 +65,15 @@ def _fix_pyautogui_less_than_bug(command: str) -> str:
except UnicodeDecodeError:
# If decoding fails, proceed with original content to avoid breaking existing logic
pass # English comment: Graceful degradation - fall back to original content if decoding fails
# Check if content contains '<'
if '<' not in content:
return match.group(0)
# Split by '<' and rebuild
parts = content.split('<')
result_parts = []
for i, part in enumerate(parts):
if i == 0:
# First part
@@ -84,11 +84,11 @@ def _fix_pyautogui_less_than_bug(command: str) -> str:
result_parts.append('pyautogui.hotkey("shift", ",")')
if part:
result_parts.append(f"pyautogui.typewrite({quote_char}{part}{quote_char})")
return '; '.join(result_parts)
command = re.sub(typewrite_pattern, process_typewrite_match, command)
return command
@@ -145,12 +145,12 @@ class DesktopEnv(gym.Env):
self.screen_width = screen_size[0]
self.screen_height = screen_size[1]
# Default
# Default
self.server_port = 5000
self.chromium_port = 9222
self.vnc_port = 8006
self.vlc_port = 8080
# Initialize with default (no proxy) provider
self.current_use_proxy = False
self.manager, self.provider = create_vm_manager_and_provider(provider_name, region, use_proxy=False)
@@ -173,7 +173,7 @@ class DesktopEnv(gym.Env):
if provider_name in {"vmware", "virtualbox"} else path_to_vm
else:
self.path_to_vm = self.manager.get_vm_path(os_type=self.os_type, region=region, screen_size=(self.screen_width, self.screen_height))
self.snapshot_name = snapshot_name
self.cache_dir_base: str = cache_dir
# todo: add the logic to get the screen size from the VM
@@ -229,8 +229,8 @@ class DesktopEnv(gym.Env):
# due to the fact it could be changed when implemented by cloud services
path_to_vm = self.provider.revert_to_snapshot(self.path_to_vm, self.snapshot_name)
if path_to_vm and not path_to_vm == self.path_to_vm:
# path_to_vm has to be a new path
# path_to_vm has to be a new path
self.manager.delete_vm(self.path_to_vm, self.region)
self.manager.add_vm(path_to_vm, self.region)
self.manager.occupy_vm(path_to_vm, os.getpid(), self.region)
@@ -245,7 +245,7 @@ class DesktopEnv(gym.Env):
self.provider.stop_emulator(self.path_to_vm)
def reset(self, task_config: Optional[Dict[str, Any]] = None, seed=None, options=None) -> Dict[str, Any]:
# Reset to certain task in OSWorld
logger.info("Resetting environment...")
logger.info("Switching task...")
@@ -258,17 +258,17 @@ class DesktopEnv(gym.Env):
# Only revert to snapshot if environment has been used (step/setup)
# This optimization is especially important for cloud providers like AWS
# where unnecessary snapshot operations are costly and time-consuming
if task_config is not None:
# Only consider task proxy requirement if proxy is enabled at system level
task_use_proxy = task_config.get("proxy", False) and self.enable_proxy
if not self.enable_proxy and task_config.get("proxy", False):
logger.info("Task requires proxy but proxy is disabled at system level, ignoring proxy requirement.")
if task_use_proxy != self.current_use_proxy:
# keep because get_info_from_website depend on this
self.current_use_proxy = task_use_proxy
if self.is_environment_used:
logger.info("Environment has been used, reverting to snapshot: {}...".format(self.snapshot_name))
self._revert_to_snapshot()
@@ -302,7 +302,7 @@ class DesktopEnv(gym.Env):
time.sleep(5)
else:
break
logger.info("Environment setup complete.")
observation = self._get_obs()
@@ -333,7 +333,8 @@ class DesktopEnv(gym.Env):
os.makedirs(self.cache_dir, exist_ok=True)
self.instruction = task_config["instruction"]
self.config = task_config["config"] if "config" in task_config else []
self.metadata = task_config.get("metadata", {})
self._set_evaluator_info(task_config)
def _set_evaluator_info(self, task_config: Dict[str, Any]):
@@ -386,7 +387,7 @@ class DesktopEnv(gym.Env):
def step(self, action, pause=2):
self._step_no += 1
self.action_history.append(action)
# Mark environment as used when step is called
self.is_environment_used = True
@@ -461,12 +462,16 @@ class DesktopEnv(gym.Env):
self.metric_options["instruction"] = self.instruction
self.metric_options["eval_model"] = self.eval_model
# Pass pre-configured environment info and expected steps
self.metric_options["config"] = self.config
self.metric_options["metadata"] = self.metadata
if result_dir:
self.metric_options["result_dir"] = result_dir
logger.info(f"Using result_dir for vllm_eval: {result_dir}")
logger.info(f"Evaluation options prepared: {self.metric_options.keys()}")
if type(self.metric) == list:
# Multiple metrics to evaluate whether the task is successfully completed
results = []

View File

@@ -287,7 +287,7 @@ class UnifiedLLM:
raise ValueError(f"Unsupported provider: {self.provider}")
def _load_screenshots_from_dir(result_dir: str, compress: bool = True, max_size: int = 800, quality: int = 85) -> List[str]:
def _load_screenshots_from_dir(result_dir: str, compress: bool = True, max_size: int = 800, quality: int = 85) -> tuple:
"""
Load all step screenshots from result directory and convert to base64
@@ -298,9 +298,10 @@ def _load_screenshots_from_dir(result_dir: str, compress: bool = True, max_size:
quality: JPEG quality for compression (default: 85)
Returns:
List of base64 encoded screenshot strings
Tuple of (list of base64 encoded screenshot strings, list of short filenames like 'step_1', 'step_2', ...)
"""
screenshots = []
filenames = []
# Find all step screenshot files (e.g., step_1_20240101@120000.png)
pattern = os.path.join(result_dir, "step_*.png")
@@ -308,8 +309,9 @@ def _load_screenshots_from_dir(result_dir: str, compress: bool = True, max_size:
if not screenshot_files:
logger.warning(f"No screenshot files found in {result_dir}")
return screenshots
return screenshots, filenames
import re as _re
for filepath in screenshot_files:
try:
with open(filepath, "rb") as f:
@@ -321,11 +323,16 @@ def _load_screenshots_from_dir(result_dir: str, compress: bool = True, max_size:
img_b64 = _compress_image(img_b64, max_size=max_size, quality=quality)
screenshots.append(img_b64)
# Extract short name like 'step_1' from 'step_1_20240101@120000.png'
basename = os.path.basename(filepath)
match = _re.match(r'(step_\d+)', basename)
short_name = match.group(1) if match else basename
filenames.append(short_name)
except Exception as e:
logger.error(f"Error loading screenshot {filepath}: {e}")
logger.info(f"Loaded {len(screenshots)} screenshots from {result_dir}")
return screenshots
logger.info(f"Loaded {len(screenshots)} screenshots from {result_dir}: {filenames}")
return screenshots, filenames
def vllm_eval(result_state, **options) -> float:
@@ -358,8 +365,10 @@ def vllm_eval(result_state, **options) -> float:
max_image_size = options.get("max_image_size", 800)
image_quality = options.get("image_quality", 85)
screenshot_filenames = [] # Short names like 'step_1', 'step_2', ...
if result_dir and not screenshots:
screenshots = _load_screenshots_from_dir(
screenshots, screenshot_filenames = _load_screenshots_from_dir(
result_dir,
compress=compress_images,
max_size=max_image_size,
@@ -368,6 +377,7 @@ def vllm_eval(result_state, **options) -> float:
logger.info(f"Loaded {len(screenshots)} screenshots from result_dir: {result_dir}")
elif screenshots:
logger.info(f"Using {len(screenshots)} screenshots from options")
screenshot_filenames = [f"step_{i+1}" for i in range(len(screenshots))]
# Compress screenshots if needed
if compress_images:
logger.info("Compressing provided screenshots...")
@@ -375,6 +385,8 @@ def vllm_eval(result_state, **options) -> float:
instruction = options.get("instruction", "")
eval_model = options.get("eval_model", "gpt-4-vision-preview")
config = options.get("config", [])
metadata = options.get("metadata", {})
params = {
"temperature": options.get("temperature", 0.7),
@@ -384,32 +396,91 @@ def vllm_eval(result_state, **options) -> float:
llm = UnifiedLLM(eval_model)
prompt = f"""You are an expert evaluator for desktop environment tasks.
# Build pre-configured environment description from config
preconfig_items = []
for cfg in config:
if cfg.get("type") == "launch":
cmds = cfg.get("parameters", {}).get("command", [])
if cmds:
app_name = os.path.basename(cmds[0]) if cmds else "unknown"
preconfig_items.append(f"Application '{app_name}' was automatically launched before the agent started.")
elif cfg.get("type") == "sleep":
pass # not relevant to scoring
elif cfg.get("type") == "open":
path = cfg.get("parameters", {}).get("path", "")
preconfig_items.append(f"File/URL '{path}' was automatically opened before the agent started.")
preconfig_section = ""
if preconfig_items:
preconfig_desc = "\n".join(f" - {item}" for item in preconfig_items)
preconfig_section = f"""
PRE-CONFIGURED ENVIRONMENT (done BEFORE the agent started, NOT the agent's work):
{preconfig_desc}
IMPORTANT: The above actions were performed automatically as part of environment setup. The agent did NOT perform these actions. Do NOT give ANY credit for them. For example, if the application was pre-launched, the agent merely having the application open is worth 0 points - that was the starting state."""
# Build expected steps section from metadata
expected_steps_section = ""
if metadata.get("steps"):
expected_steps_section = f"""
EXPECTED STEPS for this task (use as reference for what the agent should have done):
{metadata['steps']}
NOTE: Evaluate the screenshots against these expected steps. Only give credit for steps that show VISIBLE evidence of completion BEYOND the pre-configured starting state."""
# Build image list description for the prompt
if screenshot_filenames:
img_list_str = ", ".join(screenshot_filenames)
img_info = f"""\nYou are provided with exactly {len(screenshot_filenames)} screenshots in chronological order: {img_list_str}
The FIRST screenshot is: {screenshot_filenames[0]}
The LAST screenshot (final state): {screenshot_filenames[-1]}
IMPORTANT: Only reference screenshots from the list above. Do NOT reference any screenshot that is not listed."""
else:
img_info = "\nNo screenshots were provided."
prompt = f"""You are a STRICT and RIGOROUS evaluator for desktop environment tasks. Your job is to score ONLY based on concrete, visible evidence of task completion in the screenshots.
Task Instruction: {instruction}
{preconfig_section}
{expected_steps_section}
{img_info}
I will provide you with screenshot(s) showing the current state of the desktop environment. Please analyze the task execution step by step and provide a detailed evaluation.
Analyze ONLY the FINAL screenshot ({screenshot_filenames[-1] if screenshot_filenames else 'N/A'}) to determine the end state, while using earlier screenshots for context.
CRITICAL SCORING RULES:
1. Score ONLY based on what the AGENT actually accomplished. The pre-configured environment (application already launched, files already opened, etc.) is the STARTING STATE and worth 0 points.
2. Score ONLY based on what is ACTUALLY VISIBLE in the screenshots. Do NOT give credit for assumed or potential progress.
3. If the screenshots show NO meaningful action beyond the initial pre-configured state, the score MUST be 0.
4. Do NOT give partial credit for "having the system on", "desktop being visible", "the application being open" (if it was pre-launched), or "the application being installed". These are prerequisites or pre-configured state, NOT progress.
5. Each point must correspond to a SPECIFIC, VERIFIABLE action that was successfully completed BY THE AGENT toward the task goal.
SCORING GUIDE (0-10):
- 0: No progress beyond the pre-configured starting state. If the app was pre-launched, merely having it open is 0. If the screenshots only show the desktop or the initial app state without any agent action, score is 0.
- 1-2: The agent performed one minor action (e.g., clicked on a menu) but did not make meaningful progress toward the task goal.
- 3-4: Some initial steps toward the task have been taken but the task is far from complete.
- 5-6: Significant progress - about half the required steps are completed with visible evidence.
- 7-8: Most steps are completed but the final result is not fully achieved or has minor issues.
- 9: The task is essentially complete with very minor cosmetic differences.
- 10: The task is perfectly and completely finished with clear evidence in the final screenshot.
IMPORTANT: You must respond with ONLY a valid JSON object (no additional text before or after). Use the following exact format:
{{
"steps_analysis": [
{{"step": "Step description", "status": "Success/Fail", "evidence_img": "step_X.png", "reason": "Brief explanation"}},
{{"step": "Another step", "status": "Success/Fail", "evidence_img": "step_Y.png", "reason": "Brief explanation"}}
{{"step": "Step description", "status": "Success/Fail", "evidence_img": "step_X.png", "reason": "Brief explanation of VISIBLE evidence"}},
{{"step": "Another step", "status": "Success/Fail", "evidence_img": "step_Y.png", "reason": "Brief explanation of VISIBLE evidence"}}
],
"final_completion": "True/False",
"score": 0-10
}}
Where:
- "steps_analysis": Array of steps you identified from the screenshots (reference screenshot filenames like step_1.png, step_2.png, etc.)
- "steps_analysis": Array of steps you identified from the screenshots. Each step must cite VISIBLE evidence from a specific screenshot. Do NOT include pre-configured actions as agent steps.
- "status": Either "Success" or "Fail" for each step
- "evidence_img": The screenshot filename that shows evidence for this step (e.g., "step_2.png")
- "reason": Brief explanation of why this step succeeded or failed
- "final_completion": "True" if the overall task is completed, "False" otherwise
- "score": Integer from 0 to 10, where 10 means perfectly completed and 0 means not completed at all
- "reason": Explanation of what is VISUALLY observed in the screenshot as evidence
- "final_completion": "True" ONLY if the overall task is fully completed with clear visual proof, "False" otherwise
- "score": Integer from 0 to 10, following the strict scoring guide above
Remember: Return ONLY the JSON object, no additional text."""
Remember: Return ONLY the JSON object, no additional text. Be STRICT - when in doubt, score LOWER."""
try:
result = llm.generate_with_images(