From bba367b8bcc5d4e73f1e5bcfeb502f4b2185dcc7 Mon Sep 17 00:00:00 2001
From: Tianbao Xie <47296835+Timothyxxx@users.noreply.github.com>
Date: Thu, 3 Jul 2025 16:58:43 +0800
Subject: [PATCH] fix: fix multiapps tasks (#231)

* Update a multi_apps JSON example: change the snapshot name from `libreoffice_calc` to `multi_apps`, and name the presenter explicitly in the task instruction for clarity.

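* Enhance PDF image comparison in chrome.py by adding existence checks for the input files and improving the image extraction logic (embedded images are now extracted from each page instead of rendering whole pages to pixmaps). Introduce perceptual image hashing for similarity scoring, using a fixed hash-difference threshold (currently hard-coded to 5); the score is the fraction of image pairs within that threshold. Update docs.py to support fuzzy matching in DOCX file comparisons, allowing for similarity scoring based on text content. Modify an example JSON to enable the fuzzy matching option.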

---------

Co-authored-by: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
---
 desktop_env/evaluators/metrics/chrome.py      | 69 ++++++++++++++-----
 desktop_env/evaluators/metrics/docs.py        | 52 +++++++++-----
 .../82e3c869-49f6-4305-a7ce-f3e64a0618e7.json |  4 +-
 .../aad10cd7-9337-4b62-b704-a857848cedf2.json |  3 +
 4 files changed, 91 insertions(+), 37 deletions(-)

diff --git a/desktop_env/evaluators/metrics/chrome.py b/desktop_env/evaluators/metrics/chrome.py
index cc12e29..494e8cf 100644
--- a/desktop_env/evaluators/metrics/chrome.py
+++ b/desktop_env/evaluators/metrics/chrome.py
@@ -2,6 +2,7 @@ import logging
 import os
 import re
 import shutil
+import io
 from itertools import product
 from typing import Any, Dict, List, Union
 
@@ -172,6 +173,7 @@ import fitz
 from PIL import Image
 from borb.pdf import Document
 from borb.pdf import PDF
+import imagehash
 
 from pathlib import Path
 import typing
@@ -180,6 +182,9 @@ import typing
 def compare_pdf_images(pdf1_path: str, pdf2_path: str, **kwargs) -> float:
     if not pdf1_path or not pdf2_path:
         return 0.
+    if not all(map(os.path.exists, [pdf1_path, pdf2_path])):
+        logger.warning(f"PDF file does not exist: {pdf1_path} or {pdf2_path}")
+        return 0.
 
     def extract_images_from_pdf(pdf_path):
         pdf_document = fitz.open(pdf_path)
@@ -187,35 +192,61 @@ def compare_pdf_images(pdf1_path: str, pdf2_path: str, **kwargs) -> float:
 
         for page_number in range(pdf_document.page_count):
             page = pdf_document[page_number]
-            pixmap = page.get_pixmap()
-
-            img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
-
-            images.append(img)
+            for img_index, img in enumerate(page.get_images(full=True)):
+                xref = img[0]
+                base_image = pdf_document.extract_image(xref)
+                image_bytes = base_image["image"]
+                
+                # convert to PIL Image
+                try:
+                    pil_image = Image.open(io.BytesIO(image_bytes))
+                    images.append(pil_image)
+                except Exception as e:
+                    logger.error(f"Failed to process image in {pdf_path} on page {page_number}: {e}")
 
         return images
+    
+    temp_dir = Path(pdf1_path).parent / "temp_pdf_comparison"
+    os.makedirs(temp_dir, exist_ok=True)
+    
+    temp_pdf1 = temp_dir / Path(pdf1_path).name
+    temp_pdf2 = temp_dir / Path(pdf2_path).name
 
-    def fix_pdf(in_path: Path, out_path: Path) -> None:
-        doc: typing.Optional[Document] = None
-        with open(in_path, "rb") as fh:
-            doc = PDF.loads(fh)
-        with open(out_path, "wb") as fh:
-            PDF.dumps(fh, doc)
+    shutil.copy(pdf1_path, temp_pdf1)
+    shutil.copy(pdf2_path, temp_pdf2)
 
-    fix_pdf(Path(pdf1_path), Path(pdf1_path))
-    fix_pdf(Path(pdf2_path), Path(pdf2_path))
+    try:
+        images1 = extract_images_from_pdf(str(temp_pdf1))
+        images2 = extract_images_from_pdf(str(temp_pdf2))
+    except Exception as e:
+        logger.error(f"Error extracting images from PDFs: {e}")
+        shutil.rmtree(temp_dir)
+        return 0.
+    finally:
+        shutil.rmtree(temp_dir)
 
-    images1 = extract_images_from_pdf(pdf1_path)
-    images2 = extract_images_from_pdf(pdf2_path)
 
     if len(images1) != len(images2):
+        logger.info(f"Different number of images found. Gold: {len(images1)}, Pred: {len(images2)}")
         return 0.
 
-    for img1, img2 in zip(images1, images2):
-        if img1.tobytes() != img2.tobytes():
-            return 0.
+    if not images1:
+        logger.info("No images found in either PDF. Considering it a match.")
+        return 1.0
 
-    return 1.
+    hash_threshold = 5 
+    total_score = 0
+    for i, (img1, img2) in enumerate(zip(images1, images2)):
+        hash1 = imagehash.phash(img1)
+        hash2 = imagehash.phash(img2)
+        hash_diff = hash1 - hash2
+        
+        logger.info(f"Image {i+1}: Gold hash: {hash1}, Pred hash: {hash2}, Hash difference: {hash_diff}")
+
+        if hash_diff <= hash_threshold:
+            total_score +=1
+    
+    return total_score / len(images1)
 
 
 def compare_archive(pred_path: str, gold_path: str, **kwargs) -> float:
diff --git a/desktop_env/evaluators/metrics/docs.py b/desktop_env/evaluators/metrics/docs.py
index 908a387..9397d45 100644
--- a/desktop_env/evaluators/metrics/docs.py
+++ b/desktop_env/evaluators/metrics/docs.py
@@ -86,6 +86,7 @@ def compare_docx_files(file1, file2, **options):
     ignore_case = options.get('ignore_case', False)
     ignore_order = options.get('ignore_order', False)
     content_only = options.get('content_only', False)
+    fuzzy_match = options.get('fuzzy_match', False)
 
     if not file1 or not file2:
         return 0
@@ -151,29 +152,48 @@ def compare_docx_files(file1, file2, **options):
         text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip()
         if ignore_case:
             text1, text2 = text1.lower(), text2.lower()
-        if text1 != text2:
-            return 0
+
+        if fuzzy_match:
+            similarity = fuzz.ratio(text1, text2) / 100.0
+            return similarity
+        else:
+            if text1 != text2:
+                return 0
     else:
-        print("ignore_blanks=false")
         if len(doc1_paragraphs) != len(doc2_paragraphs):
             print(doc1_paragraphs)
             print(doc2_paragraphs)
             print(len(doc1_paragraphs))
             print(len(doc2_paragraphs))
             return 0
-        print("in compare")
-        # Compare each paragraph
-        for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
-            if ignore_case:
-                p1, p2 = p1.lower(), p2.lower()
-            if p1 != p2:
-                # show the difference
-                print("=== First Paragraph ===")
-                print(f"\033[92m{repr(p1)}\033[0m")  # Green color for p1, repr() shows hidden chars
-                print("=== Second Paragraph ===") 
-                print(f"\033[91m{repr(p2)}\033[0m")  # Red color for p2, repr() shows hidden chars
-                print("=" * 50)  # Clear boundary
-                return 0
+        
+        if fuzzy_match:
+            total_similarity = 0
+            if not doc1_paragraphs:
+                return 1.0
+            for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
+                if ignore_case:
+                    p1, p2 = p1.lower(), p2.lower()
+                total_similarity += fuzz.ratio(p1, p2) / 100.0
+            
+            if len(doc1_paragraphs) == 0:
+                return 1.0 if len(doc2_paragraphs) == 0 else 0.0
+
+            avg_similarity = total_similarity / len(doc1_paragraphs)
+            return avg_similarity
+        else:
+            # Compare each paragraph
+            for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
+                if ignore_case:
+                    p1, p2 = p1.lower(), p2.lower()
+                if p1 != p2:
+                    # show the difference
+                    print("=== First Paragraph ===")
+                    print(f"\033[92m{repr(p1)}\033[0m")  # Green color for p1, repr() shows hidden chars
+                    print("=== Second Paragraph ===")
+                    print(f"\033[91m{repr(p2)}\033[0m")  # Red color for p2, repr() shows hidden chars
+                    print("=" * 50)  # Clear boundary
+                    return 0
 
     return 1
 
diff --git a/evaluation_examples/examples/multi_apps/82e3c869-49f6-4305-a7ce-f3e64a0618e7.json b/evaluation_examples/examples/multi_apps/82e3c869-49f6-4305-a7ce-f3e64a0618e7.json
index e60ba1a..deaa36d 100644
--- a/evaluation_examples/examples/multi_apps/82e3c869-49f6-4305-a7ce-f3e64a0618e7.json
+++ b/evaluation_examples/examples/multi_apps/82e3c869-49f6-4305-a7ce-f3e64a0618e7.json
@@ -1,7 +1,7 @@
 {
   "id": "82e3c869-49f6-4305-a7ce-f3e64a0618e7",
-  "snapshot": "libreoffice_calc",
-  "instruction": "Please sift through the folder with all the event photos taken by our photographer. I need you to extract the photos featuring the presenters and place them in a separate folder named 'presenter'. Then, compress this folder into a zip file named 'presenter.zip' so I can easily share it with others later.",
+  "snapshot": "multi_apps",
+  "instruction": "Please sift through the folder with all the event photos taken by our photographer. I need you to extract the photos featuring the presenters (a.k.a. Tao Yu) and place them in a separate folder named 'presenter'. Then, compress this folder into a zip file named 'presenter.zip' so I can easily share it with others later.",
   "source": "authors",
   "config": [
     {
diff --git a/evaluation_examples/examples/multi_apps/aad10cd7-9337-4b62-b704-a857848cedf2.json b/evaluation_examples/examples/multi_apps/aad10cd7-9337-4b62-b704-a857848cedf2.json
index ac60aa1..239bc46 100644
--- a/evaluation_examples/examples/multi_apps/aad10cd7-9337-4b62-b704-a857848cedf2.json
+++ b/evaluation_examples/examples/multi_apps/aad10cd7-9337-4b62-b704-a857848cedf2.json
@@ -65,6 +65,9 @@
       "type": "vm_file",
       "path": "/home/user/Desktop/notes.docx",
       "dest": "notes.docx"
+    },
+    "options": {
+      "fuzzy_match": true
     }
   },
   "proxy": false