fix: fix multiapps tasks (#231)

* Update JSON example for multi_apps: change snapshot name and specify presenter in instructions for clarity.

* Enhance PDF image comparison in chrome.py by adding existence checks for input files and improving image extraction logic. Introduce image hashing for similarity scoring with a configurable threshold. Update docs.py to support fuzzy matching in DOCX file comparisons, allowing for similarity scoring based on text content. Modify example JSON to enable fuzzy matching option.
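
A minimal standalone sketch of the perceptual-hash scoring idea described above (file names are hypothetical; assumes the `imagehash` and `Pillow` packages are available):

import imagehash
from PIL import Image

# Hash distance at or below this value counts the pair as a match
# (mirrors the threshold of 5 used in compare_pdf_images below).
HASH_THRESHOLD = 5

def image_pair_score(gold_path: str, pred_path: str) -> int:
    """Return 1 if the two images are perceptually similar, else 0."""
    gold_hash = imagehash.phash(Image.open(gold_path))
    pred_hash = imagehash.phash(Image.open(pred_path))
    # Subtracting two ImageHash objects gives their Hamming distance.
    return 1 if gold_hash - pred_hash <= HASH_THRESHOLD else 0

# Example usage with hypothetical extracted page images:
# score = image_pair_score("gold_page1.png", "pred_page1.png")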

---------

Co-authored-by: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
Author: Tianbao Xie
Committed by: GitHub (2025-07-03 16:58:43 +08:00)
Commit: bba367b8bc (parent: e9c657b714)
4 changed files with 91 additions and 37 deletions

File: chrome.py

@@ -2,6 +2,7 @@ import logging
 import os
 import re
 import shutil
+import io
 from itertools import product
 from typing import Any, Dict, List, Union
@@ -172,6 +173,7 @@ import fitz
 from PIL import Image
 from borb.pdf import Document
 from borb.pdf import PDF
+import imagehash
 from pathlib import Path
 import typing
@@ -180,6 +182,9 @@ import typing
 def compare_pdf_images(pdf1_path: str, pdf2_path: str, **kwargs) -> float:
     if not pdf1_path or not pdf2_path:
         return 0.
+    if not all(map(os.path.exists, [pdf1_path, pdf2_path])):
+        logger.warning(f"PDF file does not exist: {pdf1_path} or {pdf2_path}")
+        return 0.

     def extract_images_from_pdf(pdf_path):
         pdf_document = fitz.open(pdf_path)
@@ -187,35 +192,61 @@ def compare_pdf_images(pdf1_path: str, pdf2_path: str, **kwargs) -> float:
         for page_number in range(pdf_document.page_count):
             page = pdf_document[page_number]
-            pixmap = page.get_pixmap()
-            img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
-            images.append(img)
+            for img_index, img in enumerate(page.get_images(full=True)):
+                xref = img[0]
+                base_image = pdf_document.extract_image(xref)
+                image_bytes = base_image["image"]
+                # convert to PIL Image
+                try:
+                    pil_image = Image.open(io.BytesIO(image_bytes))
+                    images.append(pil_image)
+                except Exception as e:
+                    logger.error(f"Failed to process image in {pdf_path} on page {page_number}: {e}")

         return images

-    def fix_pdf(in_path: Path, out_path: Path) -> None:
-        doc: typing.Optional[Document] = None
-        with open(in_path, "rb") as fh:
-            doc = PDF.loads(fh)
-        with open(out_path, "wb") as fh:
-            PDF.dumps(fh, doc)
-
-    fix_pdf(Path(pdf1_path), Path(pdf1_path))
-    fix_pdf(Path(pdf2_path), Path(pdf2_path))
-
-    images1 = extract_images_from_pdf(pdf1_path)
-    images2 = extract_images_from_pdf(pdf2_path)
+    temp_dir = Path(pdf1_path).parent / "temp_pdf_comparison"
+    os.makedirs(temp_dir, exist_ok=True)
+    temp_pdf1 = temp_dir / Path(pdf1_path).name
+    temp_pdf2 = temp_dir / Path(pdf2_path).name
+    shutil.copy(pdf1_path, temp_pdf1)
+    shutil.copy(pdf2_path, temp_pdf2)
+
+    try:
+        images1 = extract_images_from_pdf(str(temp_pdf1))
+        images2 = extract_images_from_pdf(str(temp_pdf2))
+    except Exception as e:
+        logger.error(f"Error extracting images from PDFs: {e}")
+        shutil.rmtree(temp_dir)
+        return 0.
+    finally:
+        shutil.rmtree(temp_dir)

     if len(images1) != len(images2):
+        logger.info(f"Different number of images found. Gold: {len(images1)}, Pred: {len(images2)}")
         return 0.

-    for img1, img2 in zip(images1, images2):
-        if img1.tobytes() != img2.tobytes():
-            return 0.
-    return 1.
+    if not images1:
+        logger.info("No images found in either PDF. Considering it a match.")
+        return 1.0
+
+    hash_threshold = 5
+    total_score = 0
+    for i, (img1, img2) in enumerate(zip(images1, images2)):
+        hash1 = imagehash.phash(img1)
+        hash2 = imagehash.phash(img2)
+        hash_diff = hash1 - hash2
+        logger.info(f"Image {i+1}: Gold hash: {hash1}, Pred hash: {hash2}, Hash difference: {hash_diff}")
+        if hash_diff <= hash_threshold:
+            total_score += 1
+
+    return total_score / len(images1)

 def compare_archive(pred_path: str, gold_path: str, **kwargs) -> float:

File: docs.py

@@ -86,6 +86,7 @@ def compare_docx_files(file1, file2, **options):
     ignore_case = options.get('ignore_case', False)
     ignore_order = options.get('ignore_order', False)
     content_only = options.get('content_only', False)
+    fuzzy_match = options.get('fuzzy_match', False)

     if not file1 or not file2:
         return 0
@@ -151,29 +152,48 @@ def compare_docx_files(file1, file2, **options):
         text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip()

         if ignore_case:
             text1, text2 = text1.lower(), text2.lower()
-        if text1 != text2:
-            return 0
+        if fuzzy_match:
+            similarity = fuzz.ratio(text1, text2) / 100.0
+            return similarity
+        else:
+            if text1 != text2:
+                return 0
     else:
         print("ignore_blanks=false")
         if len(doc1_paragraphs) != len(doc2_paragraphs):
             print(doc1_paragraphs)
             print(doc2_paragraphs)
             print(len(doc1_paragraphs))
             print(len(doc2_paragraphs))
             return 0
         print("in compare")
-        # Compare each paragraph
-        for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
-            if ignore_case:
-                p1, p2 = p1.lower(), p2.lower()
-            if p1 != p2:
-                # show the difference
-                print("=== First Paragraph ===")
-                print(f"\033[92m{repr(p1)}\033[0m")  # Green color for p1, repr() shows hidden chars
-                print("=== Second Paragraph ===")
-                print(f"\033[91m{repr(p2)}\033[0m")  # Red color for p2, repr() shows hidden chars
-                print("=" * 50)  # Clear boundary
-                return 0
+        if fuzzy_match:
+            total_similarity = 0
+            if not doc1_paragraphs:
+                return 1.0
+            for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
+                if ignore_case:
+                    p1, p2 = p1.lower(), p2.lower()
+                total_similarity += fuzz.ratio(p1, p2) / 100.0
+
+            if len(doc1_paragraphs) == 0:
+                return 1.0 if len(doc2_paragraphs) == 0 else 0.0
+
+            avg_similarity = total_similarity / len(doc1_paragraphs)
+            return avg_similarity
+        else:
+            # Compare each paragraph
+            for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
+                if ignore_case:
+                    p1, p2 = p1.lower(), p2.lower()
+                if p1 != p2:
+                    # show the difference
+                    print("=== First Paragraph ===")
+                    print(f"\033[92m{repr(p1)}\033[0m")  # Green color for p1, repr() shows hidden chars
+                    print("=== Second Paragraph ===")
+                    print(f"\033[91m{repr(p2)}\033[0m")  # Red color for p2, repr() shows hidden chars
+                    print("=" * 50)  # Clear boundary
+                    return 0

     return 1
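
For context on the new fuzzy path: `fuzz.ratio` (presumably already imported in docs.py, since the diff adds no new import; assumed here to come from rapidfuzz/fuzzywuzzy) returns a 0-100 similarity, so dividing by 100 yields the 0.0-1.0 score the evaluator expects. A minimal sketch of the per-paragraph averaging with made-up paragraph lists:

from rapidfuzz import fuzz  # assumption: rapidfuzz provides the `fuzz` used in docs.py

def average_paragraph_similarity(gold_paragraphs, pred_paragraphs):
    """Average fuzz.ratio over paired paragraphs, scaled to 0.0-1.0."""
    if not gold_paragraphs:
        return 1.0 if not pred_paragraphs else 0.0
    total = sum(fuzz.ratio(p1, p2) / 100.0
                for p1, p2 in zip(gold_paragraphs, pred_paragraphs))
    return total / len(gold_paragraphs)

# Example with made-up content; near-identical text scores close to 1.0:
# average_paragraph_similarity(["Meeting notes", "Action items"],
#                              ["Meeting Notes", "Action item"])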

File: multi_apps task config JSON (id 82e3c869-49f6-4305-a7ce-f3e64a0618e7)

@@ -1,7 +1,7 @@
 {
     "id": "82e3c869-49f6-4305-a7ce-f3e64a0618e7",
-    "snapshot": "libreoffice_calc",
-    "instruction": "Please sift through the folder with all the event photos taken by our photographer. I need you to extract the photos featuring the presenters and place them in a separate folder named 'presenter'. Then, compress this folder into a zip file named 'presenter.zip' so I can easily share it with others later.",
+    "snapshot": "multi_apps",
+    "instruction": "Please sift through the folder with all the event photos taken by our photographer. I need you to extract the photos featuring the presenters (a.k.a. Tao Yu) and place them in a separate folder named 'presenter'. Then, compress this folder into a zip file named 'presenter.zip' so I can easily share it with others later.",
     "source": "authors",
     "config": [
         {

File: multi_apps task config JSON (DOCX comparison options)

@@ -65,6 +65,9 @@
                     "type": "vm_file",
                     "path": "/home/user/Desktop/notes.docx",
                     "dest": "notes.docx"
+                },
+                "options": {
+                    "fuzzy_match": true
                 }
             },
             "proxy": false