fix: fix multiapps tasks (#231)
* Update JSON example for multi_apps: change snapshot name and specify presenter in instructions for clarity.
* Enhance PDF image comparison in chrome.py by adding existence checks for input files and improving image extraction logic. Introduce image hashing for similarity scoring with a configurable threshold. Update docs.py to support fuzzy matching in DOCX file comparisons, allowing for similarity scoring based on text content. Modify example JSON to enable fuzzy matching option.

---------

Co-authored-by: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
This commit is contained in:
@@ -2,6 +2,7 @@ import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import io
|
||||
from itertools import product
|
||||
from typing import Any, Dict, List, Union
|
||||
|
||||
@@ -172,6 +173,7 @@ import fitz
|
||||
from PIL import Image
|
||||
from borb.pdf import Document
|
||||
from borb.pdf import PDF
|
||||
import imagehash
|
||||
|
||||
from pathlib import Path
|
||||
import typing
|
||||
@@ -180,6 +182,9 @@ import typing
|
||||
def compare_pdf_images(pdf1_path: str, pdf2_path: str, **kwargs) -> float:
|
||||
if not pdf1_path or not pdf2_path:
|
||||
return 0.
|
||||
if not all(map(os.path.exists, [pdf1_path, pdf2_path])):
|
||||
logger.warning(f"PDF file does not exist: {pdf1_path} or {pdf2_path}")
|
||||
return 0.
|
||||
|
||||
def extract_images_from_pdf(pdf_path):
|
||||
pdf_document = fitz.open(pdf_path)
|
||||
@@ -187,35 +192,61 @@ def compare_pdf_images(pdf1_path: str, pdf2_path: str, **kwargs) -> float:
|
||||
|
||||
for page_number in range(pdf_document.page_count):
|
||||
page = pdf_document[page_number]
|
||||
pixmap = page.get_pixmap()
|
||||
|
||||
img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
|
||||
|
||||
images.append(img)
|
||||
for img_index, img in enumerate(page.get_images(full=True)):
|
||||
xref = img[0]
|
||||
base_image = pdf_document.extract_image(xref)
|
||||
image_bytes = base_image["image"]
|
||||
|
||||
# convert to PIL Image
|
||||
try:
|
||||
pil_image = Image.open(io.BytesIO(image_bytes))
|
||||
images.append(pil_image)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to process image in {pdf_path} on page {page_number}: {e}")
|
||||
|
||||
return images
|
||||
|
||||
temp_dir = Path(pdf1_path).parent / "temp_pdf_comparison"
|
||||
os.makedirs(temp_dir, exist_ok=True)
|
||||
|
||||
temp_pdf1 = temp_dir / Path(pdf1_path).name
|
||||
temp_pdf2 = temp_dir / Path(pdf2_path).name
|
||||
|
||||
def fix_pdf(in_path: Path, out_path: Path) -> None:
|
||||
doc: typing.Optional[Document] = None
|
||||
with open(in_path, "rb") as fh:
|
||||
doc = PDF.loads(fh)
|
||||
with open(out_path, "wb") as fh:
|
||||
PDF.dumps(fh, doc)
|
||||
shutil.copy(pdf1_path, temp_pdf1)
|
||||
shutil.copy(pdf2_path, temp_pdf2)
|
||||
|
||||
fix_pdf(Path(pdf1_path), Path(pdf1_path))
|
||||
fix_pdf(Path(pdf2_path), Path(pdf2_path))
|
||||
try:
|
||||
images1 = extract_images_from_pdf(str(temp_pdf1))
|
||||
images2 = extract_images_from_pdf(str(temp_pdf2))
|
||||
except Exception as e:
|
||||
logger.error(f"Error extracting images from PDFs: {e}")
|
||||
shutil.rmtree(temp_dir)
|
||||
return 0.
|
||||
finally:
|
||||
shutil.rmtree(temp_dir)
|
||||
|
||||
images1 = extract_images_from_pdf(pdf1_path)
|
||||
images2 = extract_images_from_pdf(pdf2_path)
|
||||
|
||||
if len(images1) != len(images2):
|
||||
logger.info(f"Different number of images found. Gold: {len(images1)}, Pred: {len(images2)}")
|
||||
return 0.
|
||||
|
||||
for img1, img2 in zip(images1, images2):
|
||||
if img1.tobytes() != img2.tobytes():
|
||||
return 0.
|
||||
if not images1:
|
||||
logger.info("No images found in either PDF. Considering it a match.")
|
||||
return 1.0
|
||||
|
||||
return 1.
|
||||
hash_threshold = 5
|
||||
total_score = 0
|
||||
for i, (img1, img2) in enumerate(zip(images1, images2)):
|
||||
hash1 = imagehash.phash(img1)
|
||||
hash2 = imagehash.phash(img2)
|
||||
hash_diff = hash1 - hash2
|
||||
|
||||
logger.info(f"Image {i+1}: Gold hash: {hash1}, Pred hash: {hash2}, Hash difference: {hash_diff}")
|
||||
|
||||
if hash_diff <= hash_threshold:
|
||||
total_score +=1
|
||||
|
||||
return total_score / len(images1)
|
||||
|
||||
|
||||
def compare_archive(pred_path: str, gold_path: str, **kwargs) -> float:
|
||||
|
||||
@@ -86,6 +86,7 @@ def compare_docx_files(file1, file2, **options):
|
||||
ignore_case = options.get('ignore_case', False)
|
||||
ignore_order = options.get('ignore_order', False)
|
||||
content_only = options.get('content_only', False)
|
||||
fuzzy_match = options.get('fuzzy_match', False)
|
||||
|
||||
if not file1 or not file2:
|
||||
return 0
|
||||
@@ -151,29 +152,48 @@ def compare_docx_files(file1, file2, **options):
|
||||
text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip()
|
||||
if ignore_case:
|
||||
text1, text2 = text1.lower(), text2.lower()
|
||||
if text1 != text2:
|
||||
return 0
|
||||
|
||||
if fuzzy_match:
|
||||
similarity = fuzz.ratio(text1, text2) / 100.0
|
||||
return similarity
|
||||
else:
|
||||
if text1 != text2:
|
||||
return 0
|
||||
else:
|
||||
print("ignore_blanks=false")
|
||||
if len(doc1_paragraphs) != len(doc2_paragraphs):
|
||||
print(doc1_paragraphs)
|
||||
print(doc2_paragraphs)
|
||||
print(len(doc1_paragraphs))
|
||||
print(len(doc2_paragraphs))
|
||||
return 0
|
||||
print("in compare")
|
||||
# Compare each paragraph
|
||||
for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
|
||||
if ignore_case:
|
||||
p1, p2 = p1.lower(), p2.lower()
|
||||
if p1 != p2:
|
||||
# show the difference
|
||||
print("=== First Paragraph ===")
|
||||
print(f"\033[92m{repr(p1)}\033[0m") # Green color for p1, repr() shows hidden chars
|
||||
print("=== Second Paragraph ===")
|
||||
print(f"\033[91m{repr(p2)}\033[0m") # Red color for p2, repr() shows hidden chars
|
||||
print("=" * 50) # Clear boundary
|
||||
return 0
|
||||
|
||||
if fuzzy_match:
|
||||
total_similarity = 0
|
||||
if not doc1_paragraphs:
|
||||
return 1.0
|
||||
for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
|
||||
if ignore_case:
|
||||
p1, p2 = p1.lower(), p2.lower()
|
||||
total_similarity += fuzz.ratio(p1, p2) / 100.0
|
||||
|
||||
if len(doc1_paragraphs) == 0:
|
||||
return 1.0 if len(doc2_paragraphs) == 0 else 0.0
|
||||
|
||||
avg_similarity = total_similarity / len(doc1_paragraphs)
|
||||
return avg_similarity
|
||||
else:
|
||||
# Compare each paragraph
|
||||
for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
|
||||
if ignore_case:
|
||||
p1, p2 = p1.lower(), p2.lower()
|
||||
if p1 != p2:
|
||||
# show the difference
|
||||
print("=== First Paragraph ===")
|
||||
print(f"\033[92m{repr(p1)}\033[0m") # Green color for p1, repr() shows hidden chars
|
||||
print("=== Second Paragraph ===")
|
||||
print(f"\033[91m{repr(p2)}\033[0m") # Red color for p2, repr() shows hidden chars
|
||||
print("=" * 50) # Clear boundary
|
||||
return 0
|
||||
|
||||
return 1
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"id": "82e3c869-49f6-4305-a7ce-f3e64a0618e7",
|
||||
"snapshot": "libreoffice_calc",
|
||||
"instruction": "Please sift through the folder with all the event photos taken by our photographer. I need you to extract the photos featuring the presenters and place them in a separate folder named 'presenter'. Then, compress this folder into a zip file named 'presenter.zip' so I can easily share it with others later.",
|
||||
"snapshot": "multi_apps",
|
||||
"instruction": "Please sift through the folder with all the event photos taken by our photographer. I need you to extract the photos featuring the presenters (a.k.a. Tao Yu) and place them in a separate folder named 'presenter'. Then, compress this folder into a zip file named 'presenter.zip' so I can easily share it with others later.",
|
||||
"source": "authors",
|
||||
"config": [
|
||||
{
|
||||
|
||||
@@ -65,6 +65,9 @@
|
||||
"type": "vm_file",
|
||||
"path": "/home/user/Desktop/notes.docx",
|
||||
"dest": "notes.docx"
|
||||
},
|
||||
"options": {
|
||||
"fuzzy_match": true
|
||||
}
|
||||
},
|
||||
"proxy": false
|
||||
|
||||
Reference in New Issue
Block a user