fix: fix multiapps tasks (#231)

* Update JSON example for multi_apps: change the snapshot name to `multi_apps` and name the presenter (Tao Yu) in the task instruction for clarity.

* Enhance PDF image comparison in chrome.py by adding existence checks for input files and improving image extraction logic. Introduce perceptual image hashing (pHash) for similarity scoring, counting two images as matching when their hash difference is within a threshold (currently hard-coded to 5). Update docs.py to support fuzzy matching in DOCX file comparisons, returning a similarity score based on text content instead of a strict 0/1 result. Modify the example JSON to enable the fuzzy matching option.

---------

Co-authored-by: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
This commit is contained in:
Tianbao Xie
2025-07-03 16:58:43 +08:00
committed by GitHub
parent e9c657b714
commit bba367b8bc
4 changed files with 91 additions and 37 deletions

View File

@@ -2,6 +2,7 @@ import logging
import os
import re
import shutil
import io
from itertools import product
from typing import Any, Dict, List, Union
@@ -172,6 +173,7 @@ import fitz
from PIL import Image
from borb.pdf import Document
from borb.pdf import PDF
import imagehash
from pathlib import Path
import typing
@@ -180,6 +182,9 @@ import typing
def compare_pdf_images(pdf1_path: str, pdf2_path: str, **kwargs) -> float:
if not pdf1_path or not pdf2_path:
return 0.
if not all(map(os.path.exists, [pdf1_path, pdf2_path])):
logger.warning(f"PDF file does not exist: {pdf1_path} or {pdf2_path}")
return 0.
def extract_images_from_pdf(pdf_path):
pdf_document = fitz.open(pdf_path)
@@ -187,35 +192,61 @@ def compare_pdf_images(pdf1_path: str, pdf2_path: str, **kwargs) -> float:
for page_number in range(pdf_document.page_count):
page = pdf_document[page_number]
pixmap = page.get_pixmap()
img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
images.append(img)
for img_index, img in enumerate(page.get_images(full=True)):
xref = img[0]
base_image = pdf_document.extract_image(xref)
image_bytes = base_image["image"]
# convert to PIL Image
try:
pil_image = Image.open(io.BytesIO(image_bytes))
images.append(pil_image)
except Exception as e:
logger.error(f"Failed to process image in {pdf_path} on page {page_number}: {e}")
return images
temp_dir = Path(pdf1_path).parent / "temp_pdf_comparison"
os.makedirs(temp_dir, exist_ok=True)
temp_pdf1 = temp_dir / Path(pdf1_path).name
temp_pdf2 = temp_dir / Path(pdf2_path).name
def fix_pdf(in_path: Path, out_path: Path) -> None:
doc: typing.Optional[Document] = None
with open(in_path, "rb") as fh:
doc = PDF.loads(fh)
with open(out_path, "wb") as fh:
PDF.dumps(fh, doc)
shutil.copy(pdf1_path, temp_pdf1)
shutil.copy(pdf2_path, temp_pdf2)
fix_pdf(Path(pdf1_path), Path(pdf1_path))
fix_pdf(Path(pdf2_path), Path(pdf2_path))
try:
images1 = extract_images_from_pdf(str(temp_pdf1))
images2 = extract_images_from_pdf(str(temp_pdf2))
except Exception as e:
logger.error(f"Error extracting images from PDFs: {e}")
shutil.rmtree(temp_dir)
return 0.
finally:
shutil.rmtree(temp_dir)
images1 = extract_images_from_pdf(pdf1_path)
images2 = extract_images_from_pdf(pdf2_path)
if len(images1) != len(images2):
logger.info(f"Different number of images found. Gold: {len(images1)}, Pred: {len(images2)}")
return 0.
for img1, img2 in zip(images1, images2):
if img1.tobytes() != img2.tobytes():
return 0.
if not images1:
logger.info("No images found in either PDF. Considering it a match.")
return 1.0
return 1.
hash_threshold = 5
total_score = 0
for i, (img1, img2) in enumerate(zip(images1, images2)):
hash1 = imagehash.phash(img1)
hash2 = imagehash.phash(img2)
hash_diff = hash1 - hash2
logger.info(f"Image {i+1}: Gold hash: {hash1}, Pred hash: {hash2}, Hash difference: {hash_diff}")
if hash_diff <= hash_threshold:
total_score +=1
return total_score / len(images1)
def compare_archive(pred_path: str, gold_path: str, **kwargs) -> float:

View File

@@ -86,6 +86,7 @@ def compare_docx_files(file1, file2, **options):
ignore_case = options.get('ignore_case', False)
ignore_order = options.get('ignore_order', False)
content_only = options.get('content_only', False)
fuzzy_match = options.get('fuzzy_match', False)
if not file1 or not file2:
return 0
@@ -151,29 +152,48 @@ def compare_docx_files(file1, file2, **options):
text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip()
if ignore_case:
text1, text2 = text1.lower(), text2.lower()
if text1 != text2:
return 0
if fuzzy_match:
similarity = fuzz.ratio(text1, text2) / 100.0
return similarity
else:
if text1 != text2:
return 0
else:
print("ignore_blanks=false")
if len(doc1_paragraphs) != len(doc2_paragraphs):
print(doc1_paragraphs)
print(doc2_paragraphs)
print(len(doc1_paragraphs))
print(len(doc2_paragraphs))
return 0
print("in compare")
# Compare each paragraph
for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
if ignore_case:
p1, p2 = p1.lower(), p2.lower()
if p1 != p2:
# show the difference
print("=== First Paragraph ===")
print(f"\033[92m{repr(p1)}\033[0m") # Green color for p1, repr() shows hidden chars
print("=== Second Paragraph ===")
print(f"\033[91m{repr(p2)}\033[0m") # Red color for p2, repr() shows hidden chars
print("=" * 50) # Clear boundary
return 0
if fuzzy_match:
total_similarity = 0
if not doc1_paragraphs:
return 1.0
for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
if ignore_case:
p1, p2 = p1.lower(), p2.lower()
total_similarity += fuzz.ratio(p1, p2) / 100.0
if len(doc1_paragraphs) == 0:
return 1.0 if len(doc2_paragraphs) == 0 else 0.0
avg_similarity = total_similarity / len(doc1_paragraphs)
return avg_similarity
else:
# Compare each paragraph
for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
if ignore_case:
p1, p2 = p1.lower(), p2.lower()
if p1 != p2:
# show the difference
print("=== First Paragraph ===")
print(f"\033[92m{repr(p1)}\033[0m") # Green color for p1, repr() shows hidden chars
print("=== Second Paragraph ===")
print(f"\033[91m{repr(p2)}\033[0m") # Red color for p2, repr() shows hidden chars
print("=" * 50) # Clear boundary
return 0
return 1

View File

@@ -1,7 +1,7 @@
{
"id": "82e3c869-49f6-4305-a7ce-f3e64a0618e7",
"snapshot": "libreoffice_calc",
"instruction": "Please sift through the folder with all the event photos taken by our photographer. I need you to extract the photos featuring the presenters and place them in a separate folder named 'presenter'. Then, compress this folder into a zip file named 'presenter.zip' so I can easily share it with others later.",
"snapshot": "multi_apps",
"instruction": "Please sift through the folder with all the event photos taken by our photographer. I need you to extract the photos featuring the presenters (a.k.a. Tao Yu) and place them in a separate folder named 'presenter'. Then, compress this folder into a zip file named 'presenter.zip' so I can easily share it with others later.",
"source": "authors",
"config": [
{

View File

@@ -65,6 +65,9 @@
"type": "vm_file",
"path": "/home/user/Desktop/notes.docx",
"dest": "notes.docx"
},
"options": {
"fuzzy_match": true
}
},
"proxy": false