fix: fix multiapps tasks (#231)

* Update JSON example for multi_apps: change snapshot name and specify presenter in instructions for clarity.

* Enhance PDF image comparison in chrome.py by adding existence checks for input files and improving image extraction logic. Introduce image hashing for similarity scoring with a configurable threshold. Update docs.py to support fuzzy matching in DOCX file comparisons, allowing for similarity scoring based on text content. Modify example JSON to enable fuzzy matching option.
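
A minimal standalone sketch of the perceptual-hash scoring idea described above (file names are hypothetical; assumes the `imagehash` and `Pillow` packages are available):

import imagehash
from PIL import Image

# Hash distance at or below this value counts the pair as a match
# (mirrors the threshold of 5 used in compare_pdf_images below).
HASH_THRESHOLD = 5

def image_pair_score(gold_path: str, pred_path: str) -> int:
    """Return 1 if the two images are perceptually similar, else 0."""
    gold_hash = imagehash.phash(Image.open(gold_path))
    pred_hash = imagehash.phash(Image.open(pred_path))
    # Subtracting two ImageHash objects gives their Hamming distance.
    return 1 if gold_hash - pred_hash <= HASH_THRESHOLD else 0

# Example usage with hypothetical extracted page images:
# score = image_pair_score("gold_page1.png", "pred_page1.png")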

---------

Co-authored-by: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
Author: Tianbao Xie
Committed by: GitHub (2025-07-03 16:58:43 +08:00)
Commit: bba367b8bc (parent: e9c657b714)
4 changed files with 91 additions and 37 deletions

File: chrome.py

@@ -2,6 +2,7 @@ import logging
 import os
 import re
 import shutil
+import io
 from itertools import product
 from typing import Any, Dict, List, Union
@@ -172,6 +173,7 @@ import fitz
 from PIL import Image
 from borb.pdf import Document
 from borb.pdf import PDF
+import imagehash
 from pathlib import Path
 import typing
@@ -180,6 +182,9 @@ import typing
 def compare_pdf_images(pdf1_path: str, pdf2_path: str, **kwargs) -> float:
     if not pdf1_path or not pdf2_path:
         return 0.
+    if not all(map(os.path.exists, [pdf1_path, pdf2_path])):
+        logger.warning(f"PDF file does not exist: {pdf1_path} or {pdf2_path}")
+        return 0.

     def extract_images_from_pdf(pdf_path):
         pdf_document = fitz.open(pdf_path)
@@ -187,35 +192,61 @@ def compare_pdf_images(pdf1_path: str, pdf2_path: str, **kwargs) -> float:
         for page_number in range(pdf_document.page_count):
             page = pdf_document[page_number]
-            pixmap = page.get_pixmap()
-            img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
-            images.append(img)
+            for img_index, img in enumerate(page.get_images(full=True)):
+                xref = img[0]
+                base_image = pdf_document.extract_image(xref)
+                image_bytes = base_image["image"]
+                # convert to PIL Image
+                try:
+                    pil_image = Image.open(io.BytesIO(image_bytes))
+                    images.append(pil_image)
+                except Exception as e:
+                    logger.error(f"Failed to process image in {pdf_path} on page {page_number}: {e}")

         return images

-    def fix_pdf(in_path: Path, out_path: Path) -> None:
-        doc: typing.Optional[Document] = None
-        with open(in_path, "rb") as fh:
-            doc = PDF.loads(fh)
-        with open(out_path, "wb") as fh:
-            PDF.dumps(fh, doc)
-
-    fix_pdf(Path(pdf1_path), Path(pdf1_path))
-    fix_pdf(Path(pdf2_path), Path(pdf2_path))
-
-    images1 = extract_images_from_pdf(pdf1_path)
-    images2 = extract_images_from_pdf(pdf2_path)
+    temp_dir = Path(pdf1_path).parent / "temp_pdf_comparison"
+    os.makedirs(temp_dir, exist_ok=True)
+    temp_pdf1 = temp_dir / Path(pdf1_path).name
+    temp_pdf2 = temp_dir / Path(pdf2_path).name
+    shutil.copy(pdf1_path, temp_pdf1)
+    shutil.copy(pdf2_path, temp_pdf2)
+
+    try:
+        images1 = extract_images_from_pdf(str(temp_pdf1))
+        images2 = extract_images_from_pdf(str(temp_pdf2))
+    except Exception as e:
+        logger.error(f"Error extracting images from PDFs: {e}")
+        shutil.rmtree(temp_dir)
+        return 0.
+    finally:
+        shutil.rmtree(temp_dir)

     if len(images1) != len(images2):
+        logger.info(f"Different number of images found. Gold: {len(images1)}, Pred: {len(images2)}")
         return 0.

-    for img1, img2 in zip(images1, images2):
-        if img1.tobytes() != img2.tobytes():
-            return 0.
-    return 1.
+    if not images1:
+        logger.info("No images found in either PDF. Considering it a match.")
+        return 1.0
+
+    hash_threshold = 5
+    total_score = 0
+    for i, (img1, img2) in enumerate(zip(images1, images2)):
+        hash1 = imagehash.phash(img1)
+        hash2 = imagehash.phash(img2)
+        hash_diff = hash1 - hash2
+        logger.info(f"Image {i+1}: Gold hash: {hash1}, Pred hash: {hash2}, Hash difference: {hash_diff}")
+        if hash_diff <= hash_threshold:
+            total_score += 1
+
+    return total_score / len(images1)

 def compare_archive(pred_path: str, gold_path: str, **kwargs) -> float:

File: docs.py

@@ -86,6 +86,7 @@ def compare_docx_files(file1, file2, **options):
     ignore_case = options.get('ignore_case', False)
     ignore_order = options.get('ignore_order', False)
     content_only = options.get('content_only', False)
+    fuzzy_match = options.get('fuzzy_match', False)

     if not file1 or not file2:
         return 0
@@ -151,29 +152,48 @@ def compare_docx_files(file1, file2, **options):
         text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip()

         if ignore_case:
             text1, text2 = text1.lower(), text2.lower()
-        if text1 != text2:
-            return 0
+        if fuzzy_match:
+            similarity = fuzz.ratio(text1, text2) / 100.0
+            return similarity
+        else:
+            if text1 != text2:
+                return 0
     else:
         print("ignore_blanks=false")
         if len(doc1_paragraphs) != len(doc2_paragraphs):
             print(doc1_paragraphs)
             print(doc2_paragraphs)
             print(len(doc1_paragraphs))
             print(len(doc2_paragraphs))
             return 0
         print("in compare")
-        # Compare each paragraph
-        for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
-            if ignore_case:
-                p1, p2 = p1.lower(), p2.lower()
-            if p1 != p2:
-                # show the difference
-                print("=== First Paragraph ===")
-                print(f"\033[92m{repr(p1)}\033[0m")  # Green color for p1, repr() shows hidden chars
-                print("=== Second Paragraph ===")
-                print(f"\033[91m{repr(p2)}\033[0m")  # Red color for p2, repr() shows hidden chars
-                print("=" * 50)  # Clear boundary
-                return 0
+        if fuzzy_match:
+            total_similarity = 0
+            if not doc1_paragraphs:
+                return 1.0
+            for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
+                if ignore_case:
+                    p1, p2 = p1.lower(), p2.lower()
+                total_similarity += fuzz.ratio(p1, p2) / 100.0
+
+            if len(doc1_paragraphs) == 0:
+                return 1.0 if len(doc2_paragraphs) == 0 else 0.0
+
+            avg_similarity = total_similarity / len(doc1_paragraphs)
+            return avg_similarity
+        else:
+            # Compare each paragraph
+            for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
+                if ignore_case:
+                    p1, p2 = p1.lower(), p2.lower()
+                if p1 != p2:
+                    # show the difference
+                    print("=== First Paragraph ===")
+                    print(f"\033[92m{repr(p1)}\033[0m")  # Green color for p1, repr() shows hidden chars
+                    print("=== Second Paragraph ===")
+                    print(f"\033[91m{repr(p2)}\033[0m")  # Red color for p2, repr() shows hidden chars
+                    print("=" * 50)  # Clear boundary
+                    return 0

     return 1
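
For context on the new fuzzy path: `fuzz.ratio` (presumably already imported in docs.py, since the diff adds no new import; assumed here to come from rapidfuzz/fuzzywuzzy) returns a 0-100 similarity, so dividing by 100 yields the 0.0-1.0 score the evaluator expects. A minimal sketch of the per-paragraph averaging with made-up paragraph lists:

from rapidfuzz import fuzz  # assumption: rapidfuzz provides the `fuzz` used in docs.py

def average_paragraph_similarity(gold_paragraphs, pred_paragraphs):
    """Average fuzz.ratio over paired paragraphs, scaled to 0.0-1.0."""
    if not gold_paragraphs:
        return 1.0 if not pred_paragraphs else 0.0
    total = sum(fuzz.ratio(p1, p2) / 100.0
                for p1, p2 in zip(gold_paragraphs, pred_paragraphs))
    return total / len(gold_paragraphs)

# Example with made-up content; near-identical text scores close to 1.0:
# average_paragraph_similarity(["Meeting notes", "Action items"],
#                              ["Meeting Notes", "Action item"])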

File: multi_apps task config JSON (id 82e3c869-49f6-4305-a7ce-f3e64a0618e7)

@@ -1,7 +1,7 @@
 {
     "id": "82e3c869-49f6-4305-a7ce-f3e64a0618e7",
-    "snapshot": "libreoffice_calc",
-    "instruction": "Please sift through the folder with all the event photos taken by our photographer. I need you to extract the photos featuring the presenters and place them in a separate folder named 'presenter'. Then, compress this folder into a zip file named 'presenter.zip' so I can easily share it with others later.",
+    "snapshot": "multi_apps",
+    "instruction": "Please sift through the folder with all the event photos taken by our photographer. I need you to extract the photos featuring the presenters (a.k.a. Tao Yu) and place them in a separate folder named 'presenter'. Then, compress this folder into a zip file named 'presenter.zip' so I can easily share it with others later.",
     "source": "authors",
     "config": [
         {

File: multi_apps task config JSON (DOCX comparison options)

@@ -65,6 +65,9 @@
                     "type": "vm_file",
                     "path": "/home/user/Desktop/notes.docx",
                     "dest": "notes.docx"
+                },
+                "options": {
+                    "fuzzy_match": true
                 }
             },
             "proxy": false