Files
sci-gui-agent-benchmark/evaluation_examples/extract_instructions_v2.py
lizhanyuan 9899d4a0c7 feat: 新增科研软件 benchmark 任务数据
- 新增 avogadro/imagej/jade/origin/ovito/pymol/vesta 等科研软件任务 JSON
- 修改 vllm_eval.py,修改图片文件名称为第x步
- desktop_env.py 添加额外数据参数 config 和 metadata
2026-02-25 15:19:36 +08:00

566 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import asyncio
import base64
import json
import logging
import os
import re
import shutil
import sys
import tempfile
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import List, Optional

import aiohttp
# Configuration
# Paths are resolved relative to this script so the tool works from any CWD.
SCRIPT_DIR = Path(__file__).parent
PROJECT_ROOT = SCRIPT_DIR.parent
# OpenAI-compatible chat-completions endpoint; override base via OPENAI_BASE_URL.
API_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")
API_URL = f"{API_BASE_URL}/chat/completions"
API_KEY = os.getenv("OPENAI_API_KEY")
MODEL_NAME = os.getenv("EXTRACT_MODEL", "gpt-4o") # Configurable via env var
# Upper bound on simultaneous in-flight API requests (asyncio.Semaphore size).
MAX_CONCURRENT_REQUESTS = 5
# Input folder where PDFs/Docs are stored, organized by software name
# e.g. evaluation_examples/inputs/vesta/tutorial.pdf
INPUT_FOLDER = PROJECT_ROOT / "evaluation_examples" / "inputs"
# Generated per-task JSON files land under EXAMPLES_FOLDER/<software>/.
EXAMPLES_FOLDER = PROJECT_ROOT / "evaluation_examples" / "examples"
# Master index mapping software name -> list of test ids.
TEST_ALL_JSON = PROJECT_ROOT / "evaluation_examples" / "test_all.json"
# Retry configuration
MAX_RETRY_ATTEMPTS = 3
RETRY_DELAY = 5
RETRY_BACKOFF = 2
# Image limit - keep low to avoid 413 payload too large errors
MAX_IMAGES_PER_REQUEST = 20
# Supported file extensions
SUPPORTED_EXTENSIONS = {'.docx', '.doc', '.ppt', '.pptx', '.pdf', '.mp4', '.avi', '.mov', '.mkv'}
# Software-specific launch config and snapshot mapping
# Maps software folder name -> {"snapshot": ..., "config": [...]}
# Each "config" list follows the benchmark's setup-step schema: launch the
# application binary inside the Windows VM, then sleep to let the GUI settle.
# The Windows paths here must match the software install locations in the
# corresponding VM snapshot.
SOFTWARE_CONFIG = {
"avogadro": {
"snapshot": "avogadro",
"config": [
{"type": "launch", "parameters": {"command": ["C:\\Avogadro2\\bin\\avogadro2.exe"]}},
{"type": "sleep", "parameters": {"seconds": 5}}
]
},
"imagej": {
"snapshot": "imagej",
"config": [
{"type": "launch", "parameters": {"command": ["C:\\ImageJ\\ImageJ.exe"]}},
{"type": "sleep", "parameters": {"seconds": 5}}
]
},
"origin": {
"snapshot": "origin",
"config": [
{"type": "launch", "parameters": {"command": ["C:\\OriginLab\\Origin2025b\\Origin64.exe"]}},
{"type": "sleep", "parameters": {"seconds": 5}}
]
},
"ovito": {
"snapshot": "ovito",
"config": [
{"type": "launch", "parameters": {"command": ["C:\\OVITO Basic\\ovito.exe"]}},
{"type": "sleep", "parameters": {"seconds": 5}}
]
},
"pymol": {
"snapshot": "pymol",
"config": [
{"type": "launch", "parameters": {"command": ["C:\\PYMOL\\PyMOLWin.exe"]}},
{"type": "sleep", "parameters": {"seconds": 5}}
]
},
"vesta": {
"snapshot": "vesta",
"config": [
{"type": "launch", "parameters": {"command": ["C:\\VESTA-win64\\VESTA.exe"]}},
{"type": "sleep", "parameters": {"seconds": 5}}
]
},
}
# Default config for unknown software
# Fallback when the input folder name has no entry above: generic snapshot,
# no launch steps (the agent is expected to start the app itself).
DEFAULT_SOFTWARE_CONFIG = {
"snapshot": "snapshot",
"config": []
}
# System prompt sent verbatim with every extraction request. It instructs the
# model (in Chinese, by design — the generated task goals are also Chinese) to
# extract concrete, verifiable GUI tasks and return them as strict JSON with
# "tasks": [{"task_goal", "input_files", "steps"}]. This is a runtime string
# consumed by the LLM; do not translate or reformat it.
SYSTEM_PROMPT = """你是一个科研软件 GUI 自动化测试专家。你的任务是从教程文档中提取出多个**具体的、可执行的、可验证的** GUI 操作任务。
## 核心要求
这些任务将被用于测试 AI Agent 操控桌面软件的能力。每个任务必须足够具体,让 Agent 明确知道要做什么,做完后能通过截图判断是否成功。
## 任务粒度要求(非常重要)
- **每个任务应该是 3-8 步 GUI 操作就能完成的小任务**
- **task_goal 必须包含具体的参数值、文件名、菜单路径等细节**
- **绝对不要写模糊的指令**
### ❌ 错误示例(太模糊):
- "Perform phase identification" — Agent 不知道用哪个文件、选什么参数
- "Export data" — 导出什么格式?保存到哪里?
- "Calculate crystallite size" — 选哪个峰?什么参数?
### ✅ 正确示例(具体可执行):
- "在 ImageJ 中,通过 File → Open 打开桌面上的 cell_image.tif 文件"
- "在 ImageJ 中,使用 Image → Adjust → Threshold 对当前图像进行阈值分割,选择 Default 方法并点击 Apply"
- "在 ImageJ 中,通过 Analyze → Measure 测量当前选区的面积和平均灰度值"
- "在 ImageJ 中,使用 Process → Filters → Gaussian Blur 对图像施加半径为 2.0 像素的高斯模糊"
- "在 Avogadro 2 中,通过 Build → Insert → Molecule 搜索并插入一个 benzene 分子"
- "在 VESTA 中通过 File → Open 打开桌面上的 Si.cif 文件,然后将视角旋转到 [110] 方向"
## 输出格式
返回严格的 JSON 对象:
{
"tasks": [
{
"task_goal": "一句话具体描述要做什么(包含软件名、菜单路径、文件名、参数值等具体信息)。用中文。",
"input_files": ["涉及的文件名列表,如 'sample.raw'。如果不需要输入文件则为空列表 []"],
"steps": "详细的 GUI 操作步骤,带编号,用换行分隔"
}
]
}
## 任务提取规则
1. **独立性**:每个任务都能独立完成(假设软件已打开或从头启动)
2. **具体性**task_goal 中必须包含教程中提到的具体文件名、参数值、菜单名称
3. **可验证性**:完成后应该能从屏幕截图看出任务是否成功(例如:文件已打开、图表已显示、对话框已出现等)
4. **忠实性**:只描述教程中实际出现的操作,不要编造功能
5. **数量**:从一份教程中提取 10-15 个不同的任务,覆盖教程的各个章节。优先选择最常用、最有代表性的操作
6. **软件名称**task_goal 必须以「在 XXX 中,」开头,明确指出软件名称
7. **难度分布**包含简单2-3步、中等4-5步、较难6-8步的任务各占三分之一
"""
# Logging configuration
# Single stdout handler so progress is visible when run under nohup/CI.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.StreamHandler(sys.stdout)
]
)
logger = logging.getLogger(__name__)
# Module-wide ProcessingStats instance, created in main() before any
# process_file() coroutine runs.
stats = None # Will be initialized in main
@dataclass
class ProcessingStats:
    """Mutable counters tracking batch progress across all processed files.

    One instance is shared by every process_file() coroutine; asyncio runs
    them on a single thread, so plain attribute updates are safe here.
    """
    total_files: int = 0       # discovered input files (set once in main)
    completed_files: int = 0   # files that produced (or already had) tasks
    failed_files: int = 0      # files that failed conversion/API/parsing
    retried_files: int = 0     # API retry count (informational)
    generated_tasks: int = 0   # total task JSONs emitted or re-registered
    # default_factory gives each instance its own timestamp/list. The original
    # used None sentinels resolved in __post_init__; a factory is the idiomatic
    # dataclass form and avoids any shared-mutable-default hazard.
    start_time: datetime = field(default_factory=datetime.now)
    failed_list: List[tuple] = field(default_factory=list)

    def __post_init__(self):
        # Tolerate callers that explicitly pass None (the old sentinel API).
        if self.start_time is None:
            self.start_time = datetime.now()
        if self.failed_list is None:
            self.failed_list = []

    def add_completed(self, num_tasks=1):
        """Record one successfully processed file yielding `num_tasks` tasks."""
        self.completed_files += 1
        self.generated_tasks += num_tasks
        self._log_progress()

    def add_failed(self, file_path: str, error: str):
        """Record one failed file together with its error message."""
        self.failed_files += 1
        self.failed_list.append((file_path, error))
        self._log_progress()

    def add_retry(self):
        """Count an API retry (does not affect the progress line)."""
        self.retried_files += 1

    def _log_progress(self):
        # Emit a one-line progress summary after every terminal file event.
        processed = self.completed_files + self.failed_files
        percentage = (processed / self.total_files * 100) if self.total_files > 0 else 0
        # NOTE: the original also computed an elapsed-seconds value that was
        # never used; it has been removed.
        logger.info(f"Progress: {processed}/{self.total_files} ({percentage:.1f}%) | "
                    f"Tasks Gen: {self.generated_tasks} | Failed: {self.failed_files}")
# -----------------------------------------------------------------------------
# Dependency Checks & File Conversion (Copied & Adapted from original script)
# -----------------------------------------------------------------------------
def check_dependencies():
    """Probe optional conversion dependencies; return False when any is absent.

    LibreOffice absence only triggers a warning (it is needed just for
    .doc/.ppt conversion), while missing Python packages are fatal.
    """
    # (importable module, pip package name) pairs required for conversion.
    required = (
        ("pdf2image", "pdf2image"),
        ("PIL", "Pillow"),
        ("cv2", "opencv-python"),
    )
    missing = []
    for module_name, package_name in required:
        try:
            __import__(module_name)
        except ImportError:
            missing.append(package_name)
    if not (shutil.which("soffice") or shutil.which("libreoffice")):
        logger.warning("LibreOffice not detected (needed for .doc/.ppt)")
    if missing:
        logger.error(f"Missing dependencies: {', '.join(missing)}")
        return False
    return True
def convert_pdf_to_images(pdf_path: str) -> List[str]:
    """Render a PDF into base64-encoded JPEG page images.

    Documents with more than MAX_IMAGES_PER_REQUEST pages are sampled evenly
    at reduced DPI/quality to keep the API payload small; smaller documents
    are rendered in full at normal quality. Returns [] on any failure.
    """
    try:
        from pdf2image import convert_from_path
        import io

        def encode_jpeg(image, quality):
            # Serialize a PIL image to an in-memory JPEG, then base64-encode.
            buffer = io.BytesIO()
            image.save(buffer, format='JPEG', quality=quality)
            return base64.b64encode(buffer.getvalue()).decode('utf-8')

        # Cheap very-low-DPI pass whose only purpose is the page count.
        probe = convert_from_path(pdf_path, dpi=36, fmt='jpeg')
        page_count = len(probe)
        del probe

        if page_count > MAX_IMAGES_PER_REQUEST:
            # Large document: sample pages evenly at lower DPI and quality.
            dpi, quality = 100, 80
            stride = page_count / MAX_IMAGES_PER_REQUEST
            pages = [int(stride * i) + 1 for i in range(MAX_IMAGES_PER_REQUEST)]
            logger.info(f"Large PDF ({page_count} pages): sampling {len(pages)} pages at {dpi} DPI")
            encoded = []
            for page_number in pages:
                rendered = convert_from_path(pdf_path, dpi=dpi, fmt='jpeg',
                                             first_page=page_number, last_page=page_number)
                if rendered:
                    encoded.append(encode_jpeg(rendered[0], quality))
            return encoded

        # Small document: render every page at normal quality.
        dpi, quality = 150, 90
        logger.info(f"PDF ({page_count} pages) at {dpi} DPI")
        return [encode_jpeg(page, quality)
                for page in convert_from_path(pdf_path, dpi=dpi, fmt='jpeg')]
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        return []
def convert_office_to_pdf(input_path: str) -> Optional[str]:
    """Convert an Office document (.doc/.docx/.ppt/.pptx) to PDF via LibreOffice.

    Returns the converted PDF's path inside a fresh temporary directory —
    the caller is responsible for removing that directory — or None when
    LibreOffice is unavailable or the conversion fails.
    """
    import subprocess
    # Resolve a LibreOffice binary up front. The original fell back to the
    # literal "libreoffice" even when shutil.which found neither binary
    # (its `if not soffice_cmd` guard was dead code — the name was always a
    # non-empty string), relying on the subprocess failing later.
    soffice_cmd = shutil.which("soffice") or shutil.which("libreoffice")
    if soffice_cmd is None:
        return None
    temp_dir = tempfile.mkdtemp()
    try:
        cmd = [soffice_cmd, "--headless", "--convert-to", "pdf",
               "--outdir", temp_dir, input_path]
        subprocess.run(cmd, capture_output=True, timeout=60)
        pdf_path = os.path.join(temp_dir, Path(input_path).stem + ".pdf")
        if os.path.exists(pdf_path):
            return pdf_path
    except Exception:
        pass
    # Conversion failed: remove the temp dir (the original leaked it here).
    shutil.rmtree(temp_dir, ignore_errors=True)
    return None
def extract_video_frames(video_path: str, num_frames=10) -> List[str]:
    """Sample `num_frames` evenly spaced frames from a video as base64 JPEGs.

    Frames wider than 1280px are downscaled proportionally. Returns [] when
    OpenCV is unavailable, the video is unreadable, or any error occurs.
    """
    try:
        import cv2
    except ImportError:
        return []
    cap = cv2.VideoCapture(video_path)
    try:
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total == 0:
            return []
        # Skip the very start/end by dividing into num_frames + 1 intervals.
        indices = [int(total * i / (num_frames + 1)) for i in range(1, num_frames + 1)]
        frames = []
        for idx in indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if ret:
                h, w = frame.shape[:2]
                if w > 1280:
                    scale = 1280 / w
                    frame = cv2.resize(frame, (1280, int(h * scale)))
                _, buf = cv2.imencode('.jpg', frame)
                frames.append(base64.b64encode(buf).decode('utf-8'))
        return frames
    except Exception:
        return []
    finally:
        # BUG FIX: the original leaked the capture handle on the early
        # `total == 0` return and on any exception; release unconditionally.
        cap.release()
def convert_document_to_images(file_path: str) -> List[str]:
    """Dispatch a tutorial file to the matching converter by extension.

    PDFs are rendered directly; Office documents go through a LibreOffice
    PDF conversion first (the temp dir is cleaned up afterwards); videos are
    frame-sampled. Unsupported extensions yield [].
    """
    suffix = Path(file_path).suffix.lower()
    if suffix == '.pdf':
        return convert_pdf_to_images(file_path)
    if suffix in ('.docx', '.doc', '.ppt', '.pptx'):
        pdf_path = convert_office_to_pdf(file_path)
        if not pdf_path:
            return []
        images = convert_pdf_to_images(pdf_path)
        # Remove the temporary directory created by convert_office_to_pdf.
        shutil.rmtree(os.path.dirname(pdf_path), ignore_errors=True)
        return images
    if suffix in ('.mp4', '.avi', '.mov', '.mkv'):
        return extract_video_frames(file_path)
    return []
# -----------------------------------------------------------------------------
# API Interaction
# -----------------------------------------------------------------------------
async def call_multimodal_api(images_b64: List[str], session: aiohttp.ClientSession) -> tuple[str, bool]:
    """POST tutorial page images to the chat-completions API.

    Returns (model content, True) on success or (error message, False) after
    MAX_RETRY_ATTEMPTS failures. Retries back off exponentially:
    RETRY_DELAY * RETRY_BACKOFF**(attempt-1).
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    content = [{"type": "text", "text": "Analyze these tutorial pages and extract benchmark tasks as JSON."}]
    # Cap images to avoid huge payloads (HTTP 413).
    for img in images_b64[:MAX_IMAGES_PER_REQUEST]:
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{img}"}
        })
    messages.append({"role": "user", "content": content})
    # Headers/payload are identical for every attempt — build them once
    # instead of inside the retry loop as the original did.
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    # Add site specific headers if using openrouter or others if needed
    payload = {
        "model": MODEL_NAME,
        "messages": messages,
        "max_tokens": 4096,
    }
    for attempt in range(1, MAX_RETRY_ATTEMPTS + 1):
        # BUG FIX: RETRY_BACKOFF was defined at module level but never used;
        # retries were a flat RETRY_DELAY. Apply exponential backoff as the
        # constant's existence clearly intends.
        delay = RETRY_DELAY * (RETRY_BACKOFF ** (attempt - 1))
        try:
            async with session.post(API_URL, headers=headers, json=payload, timeout=180) as response:
                if response.status == 200:
                    res_json = await response.json()
                    return res_json['choices'][0]['message']['content'], True
                err = await response.text()
                logger.warning(f"API Error ({response.status}): {err}")
                if attempt >= MAX_RETRY_ATTEMPTS:
                    return f"API Error: {err}", False
                await asyncio.sleep(delay)
        except Exception as e:
            logger.warning(f"Exception: {e}")
            if attempt >= MAX_RETRY_ATTEMPTS:
                return str(e), False
            await asyncio.sleep(delay)
    return "Max retries", False
# -----------------------------------------------------------------------------
# Main Logic
# -----------------------------------------------------------------------------
software_tests = {} # software name -> [test_ids]; flushed by save_test_all_json()
FORCE_REGENERATE = False # Set via --force flag; regenerates already-processed files
async def process_file(file_path: str, session: aiohttp.ClientSession, semaphore: asyncio.Semaphore):
    """Convert one tutorial file to images, extract tasks via the LLM, and
    write one benchmark JSON per task under EXAMPLES_FOLDER/<software>/.

    Concurrency is bounded by `semaphore`. Outcomes are recorded in the
    global `stats`; generated test ids are registered in `software_tests`
    for the test_all.json index.
    """
    def register(test_id: str):
        # Record a test id under its software for the test_all.json index.
        software_tests.setdefault(software_name, []).append(test_id)

    async with semaphore:
        file_path_obj = Path(file_path)
        file_stem = file_path_obj.stem
        # Software name is the first folder under INPUT_FOLDER
        # (e.g. inputs/vesta/tutorial.pdf -> "vesta").
        try:
            rel_path = file_path_obj.relative_to(INPUT_FOLDER)
            software_name = rel_path.parts[0] if len(rel_path.parts) > 1 else "unknown"
        except ValueError:
            software_name = "unknown"
        # Skip files already processed (task1 JSON exists) unless --force.
        existing_task1 = EXAMPLES_FOLDER / software_name / f"{file_stem}_task1.json"
        if existing_task1.exists() and not FORCE_REGENERATE:
            logger.info(f"Skipping (already processed): {file_path_obj.name} → use --force to regenerate")
            # Still register existing tasks so test_all.json stays complete.
            # (Path.glob replaces the original's ad-hoc `import glob as g`.)
            existing_tasks = list((EXAMPLES_FOLDER / software_name).glob(f"{file_stem}_task*.json"))
            for task_file in existing_tasks:
                register(task_file.stem)
            stats.add_completed(num_tasks=len(existing_tasks))
            return
        logger.info(f"Processing: {file_path_obj.name}")
        # 1. Convert document/video to base64 page images.
        images = convert_document_to_images(file_path)
        if not images:
            stats.add_failed(file_path, "No images extracted")
            return
        # 2. Ask the model to extract tasks.
        content, success = await call_multimodal_api(images, session)
        if not success:
            stats.add_failed(file_path, content)
            return
        # 3. Parse the model output (it may wrap the JSON in prose).
        try:
            json_match = re.search(r'\{.*\}', content, re.DOTALL)
            json_str = json_match.group(0) if json_match else content
            api_result = json.loads(json_str)
            tasks = api_result.get("tasks", [])
            if not tasks:
                logger.warning(f"No tasks found in JSON for {file_path}")
                # BUG FIX: the original returned here without touching stats,
                # so the processed counter could never reach total_files.
                stats.add_failed(file_path, "No tasks in model response")
                return
        except json.JSONDecodeError as e:
            stats.add_failed(file_path, f"JSON Parse Error: {e}")
            logger.error(f"Raw content: {content[:200]}...")
            return
        # 4. Emit one benchmark JSON file per extracted task.
        # The launch config is the same for every task — look it up once.
        sw_cfg = SOFTWARE_CONFIG.get(software_name, DEFAULT_SOFTWARE_CONFIG)
        for i, task in enumerate(tasks, 1):
            test_id = f"{file_stem}_task{i}"
            output_file = EXAMPLES_FOLDER / software_name / f"{test_id}.json"
            output_file.parent.mkdir(parents=True, exist_ok=True)
            # Construct the OSWorld/Jade Benchmark Standard JSON.
            task_json = {
                "id": test_id,
                "snapshot": sw_cfg["snapshot"],
                "instruction": task.get("task_goal", ""),
                "source": "custom",
                "config": sw_cfg["config"],
                "trajectory": "trajectories/",
                "related_apps": [software_name],
                "evaluator": {
                    "postconfig": [
                        {
                            "type": "sleep",
                            "parameters": {
                                "seconds": 3
                            }
                        }
                    ],
                    "func": "vllm_eval"
                    # "result" field is NOT needed for vllm_eval
                },
                "proxy": False,
                "fixed_ip": False,
                "possibility_of_env_change": "low",
                "metadata": {
                    "input_files": task.get("input_files", []),
                    "steps": task.get("steps", "")
                }
            }
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(task_json, f, ensure_ascii=False, indent=2)
            register(test_id)
        stats.add_completed(num_tasks=len(tasks))
        logger.info(f"Generated {len(tasks)} tasks for {file_path_obj.name}")
def save_test_all_json():
    """Merge newly generated test ids into test_all.json, and write
    test_custom.json restricted to the software folders present under
    INPUT_FOLDER (useful for running only the custom benchmarks without
    the OSWorld defaults).
    """
    test_all_meta_path = Path(TEST_ALL_JSON)
    existing_data = {}
    if test_all_meta_path.exists():
        try:
            with open(test_all_meta_path, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
        except (OSError, json.JSONDecodeError) as e:
            # Was a bare `except: pass` — at least log why we start fresh
            # instead of silently discarding the existing index.
            logger.warning(f"Could not read existing {test_all_meta_path}: {e}")
    # Merge new tests: de-duplicated and sorted for stable, diffable output.
    for software, test_ids in software_tests.items():
        current_list = existing_data.get(software, [])
        existing_data[software] = sorted(set(current_list + test_ids))
    with open(test_all_meta_path, 'w', encoding='utf-8') as f:
        json.dump(existing_data, f, ensure_ascii=False, indent=2)
    # Determine "our" software set by scanning the input folder's subdirs.
    custom_softwares = set()
    if INPUT_FOLDER.exists():
        for entry in INPUT_FOLDER.iterdir():
            if entry.is_dir():
                custom_softwares.add(entry.name)
    custom_data = {sw: existing_data[sw] for sw in custom_softwares if sw in existing_data}
    test_custom_path = PROJECT_ROOT / "evaluation_examples" / "test_custom.json"
    with open(test_custom_path, 'w', encoding='utf-8') as f:
        json.dump(custom_data, f, ensure_ascii=False, indent=2)
    logger.info(f"Custom test index saved to: {test_custom_path}")
async def main():
    """Entry point: discover tutorial files under INPUT_FOLDER and process
    them concurrently, then write the merged test indexes.

    Pass --force on the command line to regenerate already-processed files.
    """
    global stats, FORCE_REGENERATE
    stats = ProcessingStats()
    FORCE_REGENERATE = "--force" in sys.argv
    # Bail out early when the API key or the input folder is missing.
    if not API_KEY:
        logger.error("OPENAI_API_KEY environment variable not set.")
        return
    if not INPUT_FOLDER.exists():
        logger.warning(f"Input folder {INPUT_FOLDER} does not exist. Creating it.")
        INPUT_FOLDER.mkdir(parents=True, exist_ok=True)
        logger.info(f"Please put software PDF tutorials into subfolders in: {INPUT_FOLDER}")
        return
    # Collect every supported tutorial file beneath INPUT_FOLDER.
    files = [
        os.path.join(root, name)
        for root, _, filenames in os.walk(INPUT_FOLDER)
        for name in filenames
        if Path(name).suffix.lower() in SUPPORTED_EXTENSIONS
    ]
    stats.total_files = len(files)
    logger.info(f"Found {len(files)} files in {INPUT_FOLDER}")
    if not files:
        logger.info("No files to process.")
        return
    # Fan out over one shared HTTP session; the semaphore caps concurrency.
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(process_file(f, session, semaphore) for f in files))
    # Persist the merged test indexes.
    save_test_all_json()
    logger.info("Done.")
# Script entry point: run the async pipeline to completion.
if __name__ == "__main__":
    asyncio.run(main())