fix: fix multiapps tasks (#231)

* Update JSON example for multi_apps: change snapshot name and specify presenter in instructions for clarity.

* Enhance PDF image comparison in chrome.py by adding existence checks for input files and improving image extraction logic. Introduce image hashing for similarity scoring with a configurable threshold. Update docs.py to support fuzzy matching in DOCX file comparisons, allowing for similarity scoring based on text content. Modify the example JSON to enable the fuzzy matching option.

---------

Co-authored-by: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
This commit is contained in:
Tianbao Xie
2025-07-03 16:58:43 +08:00
committed by GitHub
parent e9c657b714
commit bba367b8bc
4 changed files with 91 additions and 37 deletions

View File

@@ -86,6 +86,7 @@ def compare_docx_files(file1, file2, **options):
ignore_case = options.get('ignore_case', False)
ignore_order = options.get('ignore_order', False)
content_only = options.get('content_only', False)
fuzzy_match = options.get('fuzzy_match', False)
if not file1 or not file2:
return 0
@@ -151,29 +152,48 @@ def compare_docx_files(file1, file2, **options):
text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip()
if ignore_case:
text1, text2 = text1.lower(), text2.lower()
if text1 != text2:
return 0
if fuzzy_match:
similarity = fuzz.ratio(text1, text2) / 100.0
return similarity
else:
if text1 != text2:
return 0
else:
print("ignore_blanks=false")
if len(doc1_paragraphs) != len(doc2_paragraphs):
print(doc1_paragraphs)
print(doc2_paragraphs)
print(len(doc1_paragraphs))
print(len(doc2_paragraphs))
return 0
print("in compare")
# Compare each paragraph
for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
if ignore_case:
p1, p2 = p1.lower(), p2.lower()
if p1 != p2:
# show the difference
print("=== First Paragraph ===")
print(f"\033[92m{repr(p1)}\033[0m") # Green color for p1, repr() shows hidden chars
print("=== Second Paragraph ===")
print(f"\033[91m{repr(p2)}\033[0m") # Red color for p2, repr() shows hidden chars
print("=" * 50) # Clear boundary
return 0
if fuzzy_match:
total_similarity = 0
if not doc1_paragraphs:
return 1.0
for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
if ignore_case:
p1, p2 = p1.lower(), p2.lower()
total_similarity += fuzz.ratio(p1, p2) / 100.0
if len(doc1_paragraphs) == 0:
return 1.0 if len(doc2_paragraphs) == 0 else 0.0
avg_similarity = total_similarity / len(doc1_paragraphs)
return avg_similarity
else:
# Compare each paragraph
for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
if ignore_case:
p1, p2 = p1.lower(), p2.lower()
if p1 != p2:
# show the difference
print("=== First Paragraph ===")
print(f"\033[92m{repr(p1)}\033[0m") # Green color for p1, repr() shows hidden chars
print("=== Second Paragraph ===")
print(f"\033[91m{repr(p2)}\033[0m") # Red color for p2, repr() shows hidden chars
print("=" * 50) # Clear boundary
return 0
return 1