#!/usr/bin/env python3
|
|
"""
|
|
Thread-safe results logging for OSWorld evaluations.
|
|
Appends task completion results to results.json in real-time.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import time
|
|
import platform
|
|
from pathlib import Path
|
|
from typing import Dict, Any, Optional
|
|
|
|
# File locking is platform-specific: fcntl exists only on Unix-like systems
# (Linux, macOS), so on Windows msvcrt is imported and used instead.
# HAS_FCNTL records which of the two modules is available in this process.
HAS_FCNTL = platform.system() != "Windows"
if HAS_FCNTL:
    import fcntl
else:
    import msvcrt
|
|
|
|
|
|
def extract_domain_from_path(result_path: str) -> str:
    """
    Extract the domain/application name from a task result directory path.

    Expected structure:
    results/{action_space}/{observation_type}/{model}/{domain}/{task_id}/

    The domain is the name of the directory that contains the task
    directory, i.e. the second-to-last path component.

    Args:
        result_path: Path to a task result directory.

    Returns:
        The second-to-last path component, or "unknown" when the path has no
        named parent directory (empty path, a single component, or a task
        directory sitting directly under the filesystem root).
    """
    # `parent.name` is "" for ".", "/" and drive roots, so a path such as
    # "/task_id" yields "unknown" instead of reporting the root separator
    # as the domain.
    domain = Path(result_path).parent.name
    return domain or "unknown"
|
|
|
|
|
|
def _lock_file(f) -> None:
    """Take an exclusive, blocking lock on the open file object *f*."""
    if HAS_FCNTL:
        fcntl.flock(f.fileno(), fcntl.LOCK_EX)
    else:
        # msvcrt locks the byte range that starts at the current OS file
        # position. A file opened in append mode starts at EOF, which moves
        # as the file grows, so always lock byte 0 to make every writer
        # contend for the same region.
        os.lseek(f.fileno(), 0, os.SEEK_SET)
        msvcrt.locking(f.fileno(), msvcrt.LK_LOCK, 1)


def _unlock_file(f) -> None:
    """Release the lock taken by `_lock_file` on the open file object *f*."""
    if HAS_FCNTL:
        fcntl.flock(f.fileno(), fcntl.LOCK_UN)
    else:
        # The unlock has to target the same byte that was locked.
        os.lseek(f.fileno(), 0, os.SEEK_SET)
        msvcrt.locking(f.fileno(), msvcrt.LK_UNLCK, 1)


def append_task_result(
    task_id: str,
    domain: str,
    score: float,
    result_dir: str,
    args: Any,
    error_message: Optional[str] = None
) -> None:
    """
    Append one task result to ``<args.result_dir>/summary/results.json``.

    The file holds a single JSON array. Under an exclusive file lock the
    array is read, the new entry is appended, and the whole array is written
    back. Failures while creating the directory or updating the file are
    reported on stdout and never raised, so logging cannot break the
    evaluation run.

    Args:
        task_id: UUID of the task
        domain: Application domain (chrome, vlc, etc.)
        score: Task score (0.0 or 1.0)
        result_dir: Full path to the task result directory (not used here;
            the summary location is derived from ``args.result_dir``)
        args: Command line arguments object; must expose ``result_dir``
        error_message: Error message if the task failed. Any value other
            than None marks the entry as "error", including an empty string.
    """
    # `is not None` rather than truthiness: an empty message is still an error.
    has_error = error_message is not None

    result_entry = {
        "application": domain,
        "task_id": task_id,
        "status": "error" if has_error else "success",
        "score": score,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    if has_error:
        result_entry["err_message"] = error_message

    # The summary lives next to the per-task results, under the base
    # result directory taken from the command line arguments.
    base_result_dir = Path(args.result_dir)
    summary_dir = base_result_dir / "summary"
    results_file = summary_dir / "results.json"

    try:
        summary_dir.mkdir(parents=True, exist_ok=True)

        # 'a+' creates the file when missing and never truncates on open.
        with open(results_file, 'a+', encoding="utf-8") as f:
            _lock_file(f)
            try:
                f.seek(0)
                content = f.read().strip()

                existing_results = []
                if content:
                    try:
                        parsed = json.loads(content)
                    except json.JSONDecodeError:
                        parsed = None
                    if isinstance(parsed, list):
                        existing_results = parsed
                    else:
                        print(f"⚠️  Existing content of {results_file} is not a JSON array; starting a new list")

                existing_results.append(result_entry)

                # Serialize BEFORE truncating: if the entry cannot be encoded
                # the previously logged results stay untouched on disk.
                payload = json.dumps(existing_results, indent=2) + '\n'

                f.seek(0)
                f.truncate()
                f.write(payload)
                # Push the data out while the lock is still held; otherwise
                # the next writer could read the file before this update
                # lands and overwrite it.
                f.flush()
            finally:
                _unlock_file(f)

        print(f"📝 Logged result: {domain}/{task_id} -> {result_entry['status']} (score: {score})")

    except Exception as e:
        # Don't let logging errors break the main evaluation
        print(f"⚠️  Failed to log result for {task_id}: {e}")
|
|
|
|
|
|
def log_task_completion(example: Dict, result: float, result_dir: str, args: Any) -> None:
    """
    Record a completed task in the shared results file.

    Reads the task id from ``example['id']`` (falling back to 'unknown'),
    derives the domain from ``result_dir`` and forwards everything to
    ``append_task_result`` without an error message.

    Args:
        example: Task configuration dictionary
        result: Task score
        result_dir: Path to task result directory
        args: Command line arguments
    """
    append_task_result(
        example.get('id', 'unknown'),
        extract_domain_from_path(result_dir),
        result,
        result_dir,
        args,
    )
|
|
|
|
|
|
def log_task_error(example: Dict, error_msg: str, result_dir: str, args: Any) -> None:
    """
    Convenience wrapper for logging task errors.

    The task is recorded with a score of 0.0 and the given error message.

    Args:
        example: Task configuration dictionary
        error_msg: Error message. Non-string values (e.g. an exception
            object) are converted with ``str()``; an empty, blank or missing
            message is replaced by "Unknown error".
        result_dir: Path to task result directory
        args: Command line arguments
    """
    task_id = example.get('id', 'unknown')
    domain = extract_domain_from_path(result_dir)

    # append_task_result keys the entry's status off the error message, so
    # an empty or missing message (e.g. str() of an exception raised without
    # arguments) must never be passed through as-is.
    message = "" if error_msg is None else str(error_msg)
    if not message.strip():
        message = "Unknown error"

    append_task_result(task_id, domain, 0.0, result_dir, args, message)