Merge branch 'main' into xiaochuanli/addChromeExtensions

This commit is contained in:
Tianbao Xie
2024-03-08 20:45:49 +08:00
committed by GitHub
109 changed files with 7196 additions and 172 deletions

View File

@@ -11,6 +11,7 @@ from docx.shared import RGBColor
from odf.opendocument import load
from odf.text import P
from odf.text import Span
from rapidfuzz import fuzz
from skimage.color import deltaE_ciede2000
from skimage.color import rgb2lab
@@ -57,6 +58,10 @@ def contains_page_break(docx_file):
def compare_docx_files(file1, file2, **options):
ignore_blanks = options.get('ignore_blanks', True)
ignore_case = options.get('ignore_case', False)
ignore_order = options.get('ignore_order', False)
content_only = options.get('content_only', False)
def get_paragraph_texts_odt(document):
paragraphs = document.getElementsByType(P)
paragraph_texts = []
@@ -79,20 +84,37 @@ def compare_docx_files(file1, file2, **options):
doc2 = Document(file2)
doc1_paragraphs = [p.text for p in doc1.paragraphs]
doc2_paragraphs = [p.text for p in doc2.paragraphs]
if ignore_order:
doc1_paragraphs = sorted(doc1_paragraphs)
doc2_paragraphs = sorted(doc2_paragraphs)
elif file1.endswith('.odt') and file2.endswith('.odt'):
doc1 = load(file1)
doc2 = load(file2)
doc1_paragraphs = get_paragraph_texts_odt(doc1)
doc2_paragraphs = get_paragraph_texts_odt(doc2)
if ignore_order:
doc1_paragraphs = sorted(doc1_paragraphs)
doc2_paragraphs = sorted(doc2_paragraphs)
else:
# Unsupported file types or mismatch
print("Unsupported file types or mismatch between file types.")
return 0
if content_only:
# Compare the content of the documents
text1 = re.sub(r'\s+', ' ', '\n'.join(doc1_paragraphs)).strip()
text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip()
if ignore_case:
text1, text2 = text1.lower(), text2.lower()
similarity = fuzz.ratio(text1, text2) / 100.0
return similarity
# Process and compare documents
if ignore_blanks:
text1 = re.sub(r'\s+', ' ', '\n'.join(doc1_paragraphs)).strip()
text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip()
if ignore_case:
text1, text2 = text1.lower(), text2.lower()
if text1 != text2:
return 0
else:
@@ -106,6 +128,8 @@ def compare_docx_files(file1, file2, **options):
print("in compare")
# Compare each paragraph
for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
if ignore_case:
p1, p2 = p1.lower(), p2.lower()
if p1 != p2:
print(p1)
print(p2)
@@ -157,6 +181,44 @@ def compare_docx_tables(docx_file1, docx_file2):
return 1
from io import BytesIO
from PIL import Image
def compare_docx_images(docx_file1, docx_file2):
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
def extract_images(doc):
images = []
for rel in doc.part.rels.values():
if "image" in rel.reltype:
img_data = rel.target_part.blob
images.append(BytesIO(img_data))
return images
images1 = extract_images(doc1)
images2 = extract_images(doc2)
if len(images1) != len(images2):
return 0
for img1, img2 in zip(images1, images2):
if Image.open(img1).tobytes() != Image.open(img2).tobytes():
return 0
return 1
import pytesseract
def compare_image_text(image_path, rule):
img = Image.open(image_path)
img_text = pytesseract.image_to_string(img)
if rule['type'] == 'text':
return 1 if rule['text'] in img_text else 0
else:
raise ValueError("Unsupported rule type")
def compare_line_spacing(docx_file1, docx_file2):
if not compare_docx_files(docx_file1, docx_file2):
return 0
@@ -263,7 +325,7 @@ def check_tabstops(docx_file1, docx_file2, **kwargs) -> float:
section = doc2.sections[0]
paragraph_width = section.page_width - section.left_margin - section.right_margin
ignore_tabs = lambda x: x.alignment == WD_TAB_ALIGNMENT.CLEAR or (
x.alignment == WD_TAB_ALIGNMENT.LEFT and x.position == 0)
x.alignment == WD_TAB_ALIGNMENT.LEFT and x.position == 0)
minus = .0
for p1, p2 in zip(para1, para2):
# filter CLEAR tabstop and default left-0 tabstop
@@ -566,3 +628,95 @@ def compare_highlighted_text(file1, file2):
return 1
else:
return 0
def compare_references(file1, file2, **options):
reference_indicator = options.get('reference_indicator', 'References')
reference_base_result = options.get('reference_base_result', 0.5)
# Determine file types and load documents
if file1.endswith('.docx') and file2.endswith('.docx'):
doc1 = Document(file1)
doc2 = Document(file2)
doc1_paragraphs = [p.text for p in doc1.paragraphs]
doc2_paragraphs = [p.text for p in doc2.paragraphs]
else:
# Unsupported file types or mismatch
print("Unsupported file types or mismatch between file types.")
return 0
# Find the references section in the paragraphs, find the idx of the last reference_indicator in the paragraph list
ref1_idx = doc1_paragraphs.index(reference_indicator) if reference_indicator in doc1_paragraphs else -1
ref2_idx = doc2_paragraphs.index(reference_indicator) if reference_indicator in doc2_paragraphs else -1
if ref1_idx == -1 and ref2_idx == -1:
return 1
if ref1_idx == -1 or ref2_idx == -1:
return 0
# split the reference section into reference items, and remove the empty string items
ref1 = [p for p in doc1_paragraphs[ref1_idx + 1:] if p.strip()]
ref2 = [p for p in doc2_paragraphs[ref2_idx + 1:] if p.strip()]
# Compare the references
if len(ref1) != len(ref2):
return 0
total_similarity = 0
for r1, r2 in zip(ref1, ref2):
# fuzzy match the references
similarity = fuzz.ratio(r1, r2) / 100.0
total_similarity += similarity
result = total_similarity / len(ref1)
if result >= reference_base_result:
return (result - reference_base_result) / (1 - reference_base_result)
else:
return 0
def compare_answer(file1, file2, **options):
"""This is a specific function to compare the """
# Determine file types and load documents
if file1.endswith('.docx') and file2.endswith('.docx'):
doc1 = Document(file1)
doc2 = Document(file2)
doc1_paragraphs = [p.text for p in doc1.paragraphs]
doc2_paragraphs = [p.text for p in doc2.paragraphs]
else:
# Unsupported file types or mismatch
print("Unsupported file types or mismatch between file types.")
return 0
# Find the references section in the paragraphs, find the idx of the last reference_indicator in the paragraph list
ref1_idx = doc1_paragraphs.index(reference_indicator) if reference_indicator in doc1_paragraphs else -1
ref2_idx = doc2_paragraphs.index(reference_indicator) if reference_indicator in doc2_paragraphs else -1
if ref1_idx == -1 and ref2_idx == -1:
return 1
if ref1_idx == -1 or ref2_idx == -1:
return 0
# split the reference section into reference items, and remove the empty string items
ref1 = [p for p in doc1_paragraphs[ref1_idx + 1:] if p.strip()]
ref2 = [p for p in doc2_paragraphs[ref2_idx + 1:] if p.strip()]
# Compare the references
if len(ref1) != len(ref2):
return 0
total_similarity = 0
for r1, r2 in zip(ref1, ref2):
# fuzzy match the references
similarity = fuzz.ratio(r1, r2) / 100.0
total_similarity += similarity
result = total_similarity / len(ref1)
if result >= reference_base_result:
return (result - reference_base_result) / (1 - reference_base_result)
else:
return 0