import os
import sys
import asyncio
import aiohttp
import base64
import logging
from pathlib import Path
from typing import List, Optional
import tempfile
import shutil
from dataclasses import dataclass
from datetime import datetime
import json

# Configuration
SCRIPT_DIR = Path(__file__).parent
PROJECT_ROOT = SCRIPT_DIR.parent

API_BASE_URL = os.getenv("OPENAI_BASE_URL")
API_URL = f"{API_BASE_URL}/chat/completions" if API_BASE_URL else None
API_KEY = os.getenv("OPENAI_API_KEY")
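# The endpoint and key are read from the environment, e.g. (placeholder values,
# not real credentials):
#   export OPENAI_BASE_URL="https://your-gateway.example.com/v1"
#   export OPENAI_API_KEY="sk-..."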
MODEL_NAME = "gemini-2.5-pro"
MAX_CONCURRENT_REQUESTS = 5
INPUT_FOLDER = "/Users/cuihang/Downloads/test_files"
EXAMPLES_FOLDER = PROJECT_ROOT / "evaluation_examples" / "examples"
TEST_ALL_JSON = PROJECT_ROOT / "evaluation_examples" / "test_all.json"

# Retry configuration
MAX_RETRY_ATTEMPTS = 3
RETRY_DELAY = 5
RETRY_BACKOFF = 2
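# Retries use exponential backoff: delay = RETRY_DELAY * RETRY_BACKOFF ** (attempt - 1).
# With the values above, the script waits 5s after the first failed attempt and 10s
# after the second; the third failure is final.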
# Image limit
MAX_IMAGES_PER_REQUEST = 50

# Supported file extensions
SUPPORTED_EXTENSIONS = {'.docx', '.doc', '.ppt', '.pptx', '.pdf', '.mp4', '.avi', '.mov', '.mkv'}

SYSTEM_PROMPT = """You are an AI assistant that generates precise, executable step-by-step instructions for desktop software operations.

Your task:
Convert the provided document information into precise operation instructions that can be executed step-by-step by an AI agent in a software GUI.

Output requirements (no additional explanatory text):
------------------------------------------------

[Task Goal]
Describe in one sentence the final task result to be achieved in the software.

[Input Files]
Specify the file names, types, and locations involved in this operation.
- If the document provides complete paths, record them as-is
- If only file names are mentioned (e.g., data.xlsx), record the filename and note "complete path not specified in document"
- If no input files are mentioned, write "no input files required"

[Detailed Operation Steps (GUI Level)]
Break down the task into atomic GUI operation steps.
Each step must meet the following conditions:
- Contains only one explicit, indivisible GUI atomic action
- Specifies the menus, panels, buttons, or controls involved
- Specifies the parameter names and option values involved
- Steps are arranged in the actual operation order of the software
- Includes software launch steps (e.g., double-click the desktop icon, launch from the start menu, etc.)

Step format example:
1. Double-click the [Software Name] icon on the desktop to launch the software.
2. Click "File → Open" in the main menu bar.
3. In the file selection dialog, navigate to the specified directory and select file [filename].
4. Click the "Open" button to confirm.
5. ... (and so on)

------------------------------------------------

[Handling Uncertain Information]
- Generate operation steps strictly based on the document content; do not add features or menus that are not mentioned
- If operation steps are unclear or ambiguous, infer them from common software operation flows
- If parameter values in the document are unclear, note "[set according to actual needs]" in the step

[Output Format]
Output in JSON format with the following fields:
{
  "input_files": ["file1", "file2", "..."],
  "task_goal": "...",
  "steps": "A string containing all operation steps, arranged in order, with a numbered prefix for each step, separated by newlines"
}
Note: Output must be strict JSON, with no extra text or explanations."""
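# For illustration only, a well-formed model response would look like the following
# (file name, goal, and steps are hypothetical):
# {
#   "input_files": ["data.xlsx (complete path not specified in document)"],
#   "task_goal": "Create a bar chart from the sales data in data.xlsx.",
#   "steps": "1. Double-click the LibreOffice Calc icon on the desktop...\n2. Click \"File → Open\"...\n3. ..."
# }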

# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)


@dataclass
class ProcessingStats:
    """Processing statistics tracker"""
    total_files: int = 0
    completed_files: int = 0
    failed_files: int = 0
    retried_files: int = 0
    start_time: Optional[datetime] = None
    failed_list: Optional[List[tuple]] = None

    def __post_init__(self):
        if self.start_time is None:
            self.start_time = datetime.now()
        if self.failed_list is None:
            self.failed_list = []

    def add_completed(self):
        self.completed_files += 1
        self._log_progress()

    def add_failed(self, file_path: str, error: str):
        self.failed_files += 1
        self.failed_list.append((file_path, error))
        self._log_progress()

    def add_retry(self):
        self.retried_files += 1

    def _log_progress(self):
        processed = self.completed_files + self.failed_files
        percentage = (processed / self.total_files * 100) if self.total_files > 0 else 0
        elapsed = (datetime.now() - self.start_time).total_seconds()

        if processed > 0:
            avg_time = elapsed / processed
            remaining = (self.total_files - processed) * avg_time
            eta = f"{int(remaining // 60)}m{int(remaining % 60)}s"
        else:
            eta = "calculating..."

        logger.info(f"Progress: {processed}/{self.total_files} ({percentage:.1f}%) | "
                    f"Success: {self.completed_files} | Failed: {self.failed_files} | "
                    f"Retried: {self.retried_files} | ETA: {eta}")

    def print_summary(self):
        elapsed = (datetime.now() - self.start_time).total_seconds()
        logger.info("=" * 60)
        logger.info("Processing Complete")
        logger.info("=" * 60)
        logger.info(f"Total files: {self.total_files}")
        logger.info(f"Success: {self.completed_files}")
        logger.info(f"Failed: {self.failed_files}")
        logger.info(f"Total retries: {self.retried_files}")
        logger.info(f"Total time: {int(elapsed // 60)}m{int(elapsed % 60)}s")

        if self.failed_list:
            logger.info("\nFailed files:")
            for file_path, error in self.failed_list:
                logger.info(f"  - {file_path}")
                logger.info(f"    Error: {error}")

        self._save_report()

    def _save_report(self):
        report = {
            "total_files": self.total_files,
            "completed": self.completed_files,
            "failed": self.failed_files,
            "retries": self.retried_files,
            "start_time": self.start_time.isoformat(),
            "end_time": datetime.now().isoformat(),
            "elapsed_seconds": (datetime.now() - self.start_time).total_seconds(),
            "failed_files": [{"file": f, "error": e} for f, e in self.failed_list]
        }

        report_file = Path(EXAMPLES_FOLDER) / "processing_report.json"
        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2)

        logger.info(f"\nDetailed report saved to: {report_file}")


stats = ProcessingStats()
software_tests = {}


def check_dependencies():
    """Check and prompt for missing dependencies"""
    missing = []

    try:
        import pdf2image
    except ImportError:
        missing.append("pdf2image")

    try:
        import PIL
    except ImportError:
        missing.append("Pillow")

    try:
        import cv2
    except ImportError:
        missing.append("opencv-python or opencv-python-headless")

    if not shutil.which("soffice") and not shutil.which("libreoffice"):
        logger.warning("LibreOffice not detected, cannot convert .doc and .ppt files")
        logger.info("Install: sudo apt-get install libreoffice (Linux) or download from https://www.libreoffice.org/")

    if missing:
        logger.error(f"Missing dependencies: {', '.join(missing)}")
        logger.info(f"Install with: pip install {' '.join(missing)}")
        logger.info("Note: pdf2image also requires poppler")
        logger.info("  - Ubuntu/Debian: sudo apt-get install poppler-utils")
        logger.info("  - macOS: brew install poppler")
        logger.info("  - Windows: download from https://github.com/oschwartz10612/poppler-windows/releases/")
        return False
    return True


def convert_pdf_to_images(pdf_path: str) -> List[str]:
    """Convert PDF to base64-encoded images"""
    try:
        from pdf2image import convert_from_path
        from PIL import Image
        import io

        images = convert_from_path(pdf_path, dpi=150, fmt='jpeg')
        base64_images = []

        for img in images:
            buffer = io.BytesIO()
            img.save(buffer, format='JPEG', quality=100)
            img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
            base64_images.append(img_base64)

        return base64_images
    except Exception as e:
        logger.error(f"PDF conversion failed for {pdf_path}: {str(e)}")
        return []


def convert_office_to_pdf(input_path: str) -> Optional[str]:
    """Convert Office documents to PDF using LibreOffice"""
    try:
        import subprocess

        temp_dir = tempfile.mkdtemp()
        soffice_cmd = "soffice" if shutil.which("soffice") else "libreoffice"

        cmd = [
            soffice_cmd,
            "--headless",
            "--convert-to", "pdf",
            "--outdir", temp_dir,
            input_path
        ]

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)

        if result.returncode == 0:
            pdf_name = Path(input_path).stem + ".pdf"
            pdf_path = os.path.join(temp_dir, pdf_name)

            if os.path.exists(pdf_path):
                return pdf_path

        logger.error(f"LibreOffice conversion failed: {result.stderr}")
        return None

    except Exception as e:
        logger.error(f"Office conversion failed for {input_path}: {str(e)}")
        return None

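# For reference, the conversion above is equivalent to running (paths are placeholders):
#   soffice --headless --convert-to pdf --outdir /tmp/<tempdir> /path/to/input.docx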

def convert_document_to_images(file_path: str) -> List[str]:
    """Convert any supported document to base64-encoded images"""
    file_ext = Path(file_path).suffix.lower()

    if file_ext == '.pdf':
        return convert_pdf_to_images(file_path)

    elif file_ext in ['.docx', '.doc', '.ppt', '.pptx']:
        pdf_path = convert_office_to_pdf(file_path)
        if pdf_path:
            images = convert_pdf_to_images(pdf_path)
            try:
                os.remove(pdf_path)
                os.rmdir(os.path.dirname(pdf_path))
            except OSError:
                pass
            return images
        return []

    elif file_ext in ['.mp4', '.avi', '.mov', '.mkv']:
        return extract_video_frames(file_path)

    return []


def extract_video_frames(video_path: str, num_frames: int = 10) -> List[str]:
    """Extract key frames from video"""
    try:
        import cv2

        cap = cv2.VideoCapture(video_path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        if total_frames == 0:
            cap.release()
            return []

        frame_indices = [int(total_frames * i / (num_frames + 1)) for i in range(1, num_frames + 1)]
        base64_frames = []

        for idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()

            if ret:
                height, width = frame.shape[:2]
                if width > 1280:
                    scale = 1280 / width
                    frame = cv2.resize(frame, (1280, int(height * scale)))

                _, buffer = cv2.imencode('.jpg', frame, [cv2.IMWRITE_JPEG_QUALITY, 85])
                frame_base64 = base64.b64encode(buffer).decode('utf-8')
                base64_frames.append(frame_base64)

        cap.release()
        return base64_frames

    except Exception as e:
        logger.error(f"Video frame extraction failed for {video_path}: {str(e)}")
        return []

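# The sampling in extract_video_frames picks num_frames evenly spaced interior frames:
#   index_i = int(total_frames * i / (num_frames + 1)) for i = 1..num_frames.
# For example, a 3000-frame video with num_frames=10 samples frames
# 272, 545, 818, 1090, 1363, 1636, 1909, 2181, 2454 and 2727.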

async def call_api_single_batch(images_batch: List[str], file_type: str,
                                session: aiohttp.ClientSession, batch_num: int = 0) -> tuple[str, bool, int]:
    """
    Call API to process a single batch of images
    Returns: (content, success, status_code)
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]

    batch_info = f" (batch {batch_num})" if batch_num > 0 else ""
    content = [
        {"type": "text", "text": f"Please analyze the following {file_type} pages/frames{batch_info} and extract the operation workflow:"}
    ]

    for img_b64 in images_batch:
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}
        })

    messages.append({"role": "user", "content": content})

    try:
        headers = {
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json"
        }

        payload = {
            "model": MODEL_NAME,
            "messages": messages,
            "max_tokens": 8192
        }

        async with session.post(API_URL, headers=headers, json=payload, timeout=180) as response:
            status_code = response.status
            if status_code == 200:
                result = await response.json()
                return result['choices'][0]['message']['content'], True, status_code
            else:
                error_text = await response.text()
                return f"[API call failed: {status_code}]\n{error_text}", False, status_code

    except asyncio.TimeoutError:
        return "[API call timeout]", False, 0
    except Exception as e:
        return f"[API call error: {str(e)}]", False, 0

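# The request body built above follows the OpenAI-compatible chat-completions format,
# with each page/frame attached as a base64 data URL, roughly:
#   {"model": MODEL_NAME,
#    "messages": [{"role": "system", "content": SYSTEM_PROMPT},
#                 {"role": "user", "content": [{"type": "text", "text": "..."},
#                                              {"type": "image_url",
#                                               "image_url": {"url": "data:image/jpeg;base64,..."}}]}],
#    "max_tokens": 8192}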

async def call_multimodal_api_with_retry(file_path: str, session: aiohttp.ClientSession) -> tuple[str, bool]:
    """
    Call multimodal API to analyze document images with retry mechanism
    Returns: (content, success)
    """
    images_base64 = convert_document_to_images(file_path)

    if not images_base64:
        error_msg = f"[Document conversion failed: unable to convert {Path(file_path).name} to images]"
        return error_msg, False

    file_type = "video" if Path(file_path).suffix.lower() in ['.mp4', '.avi', '.mov', '.mkv'] else "document"
    total_images = len(images_base64)

    if total_images > MAX_IMAGES_PER_REQUEST:
        images_base64 = images_base64[:MAX_IMAGES_PER_REQUEST]
        total_images = MAX_IMAGES_PER_REQUEST

    for attempt in range(1, MAX_RETRY_ATTEMPTS + 1):
        try:
            content, success, status_code = await call_api_single_batch(images_base64, file_type, session)

            if success:
                return content, True

            if status_code == 413:
                return "[File too large: server refused to process the file]", False

            if attempt < MAX_RETRY_ATTEMPTS:
                delay = RETRY_DELAY * (RETRY_BACKOFF ** (attempt - 1))
                logger.info(f"\nRetry {attempt}/{MAX_RETRY_ATTEMPTS}: {Path(file_path).name} (waiting {delay}s)")
                stats.add_retry()
                await asyncio.sleep(delay)
                continue

            return content, False

        except asyncio.TimeoutError:
            if attempt < MAX_RETRY_ATTEMPTS:
                delay = RETRY_DELAY * (RETRY_BACKOFF ** (attempt - 1))
                logger.info(f"\nRetry {attempt}/{MAX_RETRY_ATTEMPTS}: {Path(file_path).name} (timeout, waiting {delay}s)")
                stats.add_retry()
                await asyncio.sleep(delay)
                continue
            return "[API call timeout]", False

        except Exception as e:
            if attempt < MAX_RETRY_ATTEMPTS:
                delay = RETRY_DELAY * (RETRY_BACKOFF ** (attempt - 1))
                logger.info(f"\nRetry {attempt}/{MAX_RETRY_ATTEMPTS}: {Path(file_path).name} (error, waiting {delay}s)")
                stats.add_retry()
                await asyncio.sleep(delay)
                continue
            return f"[API call error: {str(e)}]", False

    return "[Max retry attempts reached]", False


async def process_file(file_path: str, session: aiohttp.ClientSession,
                       semaphore: asyncio.Semaphore):
    """Process a single file"""
    async with semaphore:
        try:
            content, success = await call_multimodal_api_with_retry(file_path, session)

            file_path_obj = Path(file_path).resolve()
            input_folder_obj = Path(INPUT_FOLDER).resolve()

            try:
                rel_path = file_path_obj.relative_to(input_folder_obj)
                software_name = rel_path.parts[0] if len(rel_path.parts) > 1 else "unknown"
            except ValueError:
                software_name = "unknown"

            file_stem = file_path_obj.stem
            test_id = file_stem
            output_file = Path(EXAMPLES_FOLDER) / software_name / f"{file_stem}.json"
            output_file.parent.mkdir(parents=True, exist_ok=True)

            import re
            match = re.search(r'```json\s*([\s\S]*?)\s*```', content)
            content = match.group(1) if match else content

            if success:
                api_result = json.loads(content)

                data = {
                    "id": test_id,
                    "snapshot": "snapshot",
                    "instruction": api_result.get("steps", ""),
                    "source": "custom",
                    "config": [],
                    "trajectory": "trajectories/",
                    "related_apps": [software_name],
                    "evaluator": {
                        "postconfig": [
                            {
                                "type": "sleep",
                                "parameters": {
                                    "seconds": 3
                                }
                            }
                        ],
                        "func": "vllm_eval"
                    },
                    "proxy": False,
                    "fixed_ip": False,
                    "possibility_of_env_change": "low",
                    "metadata": {
                        "input_files": api_result.get("input_files", []),
                        "task_goal": api_result.get("task_goal", "")
                    }
                }

                if software_name not in software_tests:
                    software_tests[software_name] = []
                software_tests[software_name].append(test_id)

            else:
                data = {
                    "id": test_id,
                    "error": content,
                    "status": "failed"
                }

            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)

            if success:
                stats.add_completed()
            else:
                stats.add_failed(file_path, content)

        except Exception as e:
            error_msg = str(e)
            stats.add_failed(file_path, error_msg)
            logger.error(f"\nError processing {file_path}: {error_msg}")


def find_all_files(input_folder: str) -> List[str]:
    """Recursively find all supported files"""
    all_files = []

    for root, dirs, files in os.walk(input_folder):
        for file in files:
            file_path = os.path.join(root, file)
            if Path(file_path).suffix.lower() in SUPPORTED_EXTENSIONS:
                all_files.append(file_path)

    return all_files


def save_test_all_json():
    """Save aggregated test_all.json"""
    test_all_path = Path(TEST_ALL_JSON)
    if test_all_path.exists():
        with open(test_all_path, 'r', encoding='utf-8') as f:
            existing_data = json.load(f)
    else:
        existing_data = {}

    for software, test_ids in software_tests.items():
        if software in existing_data:
            existing_data[software] = list(set(existing_data[software] + test_ids))
        else:
            existing_data[software] = test_ids

    test_all_path.parent.mkdir(parents=True, exist_ok=True)
    with open(test_all_path, 'w', encoding='utf-8') as f:
        json.dump(existing_data, f, ensure_ascii=False, indent=2)

    logger.info(f"\nTest index updated: {test_all_path}")
    logger.info(f"Software included: {list(existing_data.keys())}")

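# test_all.json maps each software name to the list of test IDs generated for it,
# e.g. (illustrative names only): {"libreoffice_calc": ["monthly_report", "pivot_demo"]}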

async def main():
    """Main function"""
    if not check_dependencies():
        return

    if not Path(INPUT_FOLDER).exists():
        logger.error(f"Input directory does not exist: {INPUT_FOLDER}")
        return

    Path(EXAMPLES_FOLDER).mkdir(parents=True, exist_ok=True)

    logger.info("Scanning files...")
    logger.info(f"Input directory: {INPUT_FOLDER}")
    logger.info(f"Output directory: {EXAMPLES_FOLDER}")
    logger.info(f"Test index file: {TEST_ALL_JSON}\n")

    files = find_all_files(INPUT_FOLDER)
    stats.total_files = len(files)

    logger.info(f"Found {len(files)} files")
    logger.info(f"Configuration: max retries={MAX_RETRY_ATTEMPTS}, concurrency={MAX_CONCURRENT_REQUESTS}")
    logger.info("=" * 60 + "\n")

    if not files:
        logger.warning("No supported files found")
        return

    semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

    async with aiohttp.ClientSession() as session:
        tasks = [
            process_file(file, session, semaphore)
            for file in files
        ]
        await asyncio.gather(*tasks, return_exceptions=True)

    save_test_all_json()
    stats.print_summary()

    logger.info("\nCompleted!")
    logger.info(f"  - Test cases saved to: {EXAMPLES_FOLDER}")
    logger.info(f"  - Test index updated: {TEST_ALL_JSON}")


if __name__ == "__main__":
    asyncio.run(main())