sci-gui-agent-benchmark/lib_results_logger.py

#!/usr/bin/env python3
"""
Thread-safe results logging for OSWorld evaluations.
Appends task completion results to results.json in real-time.
"""

import json
import os
import time
import platform
from pathlib import Path
from typing import Dict, Any, Optional

# Select the file-locking module for the current platform.
# `fcntl` is imported on every system except Windows (e.g. Linux, macOS);
# on Windows `msvcrt` is imported instead.
# HAS_FCNTL records which of the two was imported so that the locking code
# in this module can branch on it.
if platform.system() != "Windows":
    import fcntl
    HAS_FCNTL = True
else:
    import msvcrt
    HAS_FCNTL = False


def extract_domain_from_path(result_path: str) -> str:
    """
    Return the domain/application component of a task result directory.

    The path is expected to look like
    results/{action_space}/{observation_type}/{model}/{domain}/{task_id}/,
    so the domain is the second-to-last path component. Paths with fewer
    than two components yield "unknown".
    """
    segments = Path(result_path).parts
    # The last segment is the task id; the one before it is the domain.
    return segments[-2] if len(segments) > 1 else "unknown"


def _lock_results_file(f: Any) -> None:
    """Acquire an exclusive lock on the open results file *f*."""
    if HAS_FCNTL:
        fcntl.flock(f.fileno(), fcntl.LOCK_EX)
    else:
        # msvcrt locks a byte range that starts at the *current* file
        # position, so pin the position to offset 0: every writer must lock
        # (and later unlock) the very same byte.
        f.seek(0)
        msvcrt.locking(f.fileno(), msvcrt.LK_LOCK, 1)


def _unlock_results_file(f: Any) -> None:
    """Release the lock taken by ``_lock_results_file``."""
    if HAS_FCNTL:
        fcntl.flock(f.fileno(), fcntl.LOCK_UN)
    else:
        # Return to the offset that was locked; unlocking a different byte
        # range raises OSError on Windows.
        f.seek(0)
        msvcrt.locking(f.fileno(), msvcrt.LK_UNLCK, 1)


def _parse_existing_results(content: str, results_file: Path) -> list:
    """
    Return the list of results stored in *content*.

    Empty content yields an empty list. Content that is not a JSON array is
    not silently dropped: it is saved next to *results_file* in a
    ``<name>.corrupt-<timestamp>`` file and a warning is printed, then an
    empty list is returned so logging can continue.
    """
    if not content:
        return []

    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, list):
        return parsed

    backup_file = results_file.with_name(
        f"{results_file.name}.corrupt-{time.strftime('%Y%m%d-%H%M%S')}"
    )
    try:
        # Append mode: two corruptions within the same second must not
        # overwrite each other's backup.
        with open(backup_file, 'a', encoding='utf-8') as backup:
            backup.write(content + '\n')
        print(f"⚠️  Existing results in {results_file} were unreadable; saved a copy to {backup_file}")
    except OSError as backup_error:
        print(f"⚠️  Existing results in {results_file} were unreadable and could not be backed up: {backup_error}")
    return []


def append_task_result(
    task_id: str,
    domain: str,
    score: float,
    result_dir: str,
    args: Any,
    error_message: Optional[str] = None
) -> None:
    """
    Append a task result to <args.result_dir>/summary/results.json.

    The read-modify-write cycle on the JSON array is serialised with an
    exclusive file lock (fcntl.flock, or msvcrt.locking on Windows). Failures
    while writing the log are reported with a printed warning instead of
    being raised, so that logging problems do not abort the evaluation.

    Args:
        task_id: UUID of the task
        domain: Application domain (chrome, vlc, etc.)
        score: Task score (typically 0.0 or 1.0)
        result_dir: Full path to the task result directory (not used by this
            function; kept for interface compatibility)
        args: Command line arguments object; its ``result_dir`` attribute is
            the base directory under which ``summary/results.json`` lives
        error_message: Error message if task failed. Any value other than
            None (including an empty string) marks the entry as an error.
    """
    # An empty message (e.g. str(Exception())) still denotes a failed task,
    # so test against None rather than truthiness.
    has_error = error_message is not None

    result_entry = {
        "application": domain,
        "task_id": task_id,
        "status": "error" if has_error else "success",
        "score": score,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
    }
    if has_error:
        result_entry["err_message"] = error_message

    # Results are aggregated in <base result dir>/summary/results.json
    summary_dir = Path(args.result_dir) / "summary"
    results_file = summary_dir / "results.json"

    try:
        # Creating the directory is part of logging, so it is covered by the
        # same best-effort guard as the write itself.
        summary_dir.mkdir(parents=True, exist_ok=True)

        # 'a+' creates the file when missing without truncating existing data.
        with open(results_file, 'a+', encoding='utf-8') as f:
            _lock_results_file(f)
            try:
                f.seek(0)
                existing_results = _parse_existing_results(f.read().strip(), results_file)
                existing_results.append(result_entry)

                # Serialise BEFORE touching the file: if the entry cannot be
                # encoded, the previously stored results stay intact.
                payload = json.dumps(existing_results, indent=2) + '\n'

                f.seek(0)
                f.truncate()
                f.write(payload)
                # Push the data out while the lock is still held; otherwise
                # another writer could read the truncated file before our
                # buffer is flushed on close and drop earlier entries.
                f.flush()
            finally:
                _unlock_results_file(f)

        print(f"📝 Logged result: {domain}/{task_id} -> {result_entry['status']} (score: {score})")

    except Exception as e:
        # Don't let logging errors break the main evaluation
        print(f"⚠️  Failed to log result for {task_id}: {e}")


def log_task_completion(example: Dict, result: float, result_dir: str, args: Any) -> None:
    """
    Record a completed task in the shared results log.

    The task id is read from ``example['id']`` (falling back to 'unknown')
    and the domain is derived from *result_dir*; no error message is attached.

    Args:
        example: Task configuration dictionary
        result: Task score
        result_dir: Path to task result directory
        args: Command line arguments
    """
    append_task_result(
        task_id=example.get('id', 'unknown'),
        domain=extract_domain_from_path(result_dir),
        score=result,
        result_dir=result_dir,
        args=args,
    )


def log_task_error(example: Dict, error_msg: str, result_dir: str, args: Any) -> None:
    """
    Convenience wrapper for logging task errors.

    The entry is logged with a score of 0.0. The message is normalised to a
    non-empty string first, so that a blank message (for example
    ``str(Exception())``) or a non-string value such as an exception object
    is still recorded as an error and remains JSON-serialisable.

    Args:
        example: Task configuration dictionary
        error_msg: Error message
        result_dir: Path to task result directory
        args: Command line arguments
    """
    task_id = example.get('id', 'unknown')
    domain = extract_domain_from_path(result_dir)

    # Guarantee a non-empty text message: this wrapper always reports a
    # failure, even when the caller has no description of it.
    message = "" if error_msg is None else str(error_msg)
    if not message.strip():
        message = "Unknown error (no message provided)"

    append_task_result(task_id, domain, 0.0, result_dir, args, message)