merge

2024-03-09 18:53:27 +08:00
parent 5b07ec17bf 2291af394f
commit aae848196b
48 changed files with 2106 additions and 155 deletions
--- a/desktop_env/evaluators/metrics/docs.py
+++ b/desktop_env/evaluators/metrics/docs.py
@@ -3,8 +3,10 @@ import os
 import re
 import xml.etree.ElementTree as ET
 import zipfile
+from io import BytesIO
 from typing import List, Dict, Any

+from PIL import Image
 from docx import Document
 from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_TAB_ALIGNMENT
 from docx.shared import RGBColor
@@ -23,6 +25,9 @@ def find_default_font(config_file_path, rules):
    default_font = None
    expected_font = rules["font_name"]

+    if not config_file_path:
+        return 0
+
    try:
        tree = ET.parse(config_file_path)
        root = tree.getroot()
@@ -42,6 +47,9 @@ def find_default_font(config_file_path, rules):


 def contains_page_break(docx_file):
+    if not docx_file:
+        return 0
+
    doc = Document(docx_file)

    namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
@@ -62,6 +70,9 @@ def compare_docx_files(file1, file2, **options):
    ignore_order = options.get('ignore_order', False)
    content_only = options.get('content_only', False)

+    if not file1 or not file2:
+        return 0
+
    def get_paragraph_texts_odt(document):
        paragraphs = document.getElementsByType(P)
        paragraph_texts = []
@@ -118,20 +129,30 @@ def compare_docx_files(file1, file2, **options):
        if text1 != text2:
            return 0
    else:
+        print("ignore_blanks=false")
        if len(doc1_paragraphs) != len(doc2_paragraphs):
+            print(doc1_paragraphs)
+            print(doc2_paragraphs)
+            print(len(doc1_paragraphs))
+            print(len(doc2_paragraphs))
            return 0
-
+        print("in compare")
        # Compare each paragraph
        for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
            if ignore_case:
                p1, p2 = p1.lower(), p2.lower()
            if p1 != p2:
+                print(p1)
+                print(p2)
                return 0

    return 1


 def compare_init_lines(file1, file2):
+    if not file1 or not file2:
+        return 0
+
    doc1 = Document(file1)
    doc2 = Document(file2)

@@ -149,6 +170,9 @@ def compare_init_lines(file1, file2):


 def compare_docx_tables(docx_file1, docx_file2):
+    if not docx_file1 or not docx_file2:
+        return 0
+
    doc1 = Document(docx_file1)
    doc2 = Document(docx_file2)

@@ -174,11 +198,10 @@ def compare_docx_tables(docx_file1, docx_file2):
    return 1


-from io import BytesIO
-from PIL import Image
-
-
 def compare_docx_images(docx_file1, docx_file2):
+    if not docx_file1 or not docx_file2:
+        return 0
+
    doc1 = Document(docx_file1)
    doc2 = Document(docx_file2)

@@ -212,6 +235,9 @@ def compare_image_text(image_path, rule):


 def compare_line_spacing(docx_file1, docx_file2):
+    if not docx_file1 or not docx_file2:
+        return 0
+
    if not compare_docx_files(docx_file1, docx_file2):
        return 0
    doc1 = Document(docx_file1)
@@ -233,6 +259,9 @@ def compare_line_spacing(docx_file1, docx_file2):


 def compare_insert_equation(docx_file1, docx_file2):
+    if not docx_file1 or not docx_file2:
+        return 0
+
    if not compare_docx_files(docx_file1, docx_file2):
        return 0

@@ -248,6 +277,9 @@ def compare_insert_equation(docx_file1, docx_file2):


 def compare_font_names(docx_file, rules: List[Dict[str, Any]]):
+    if not docx_file:
+        return 0
+
    doc = Document(docx_file)
    expected_font = rules["font_name"]

@@ -260,6 +292,9 @@ def compare_font_names(docx_file, rules: List[Dict[str, Any]]):


 def compare_subscript_contains(docx_file1, docx_file2):
+    if not docx_file1 or not docx_file2:
+        return 0
+
    doc1 = Document(docx_file1)
    doc2 = Document(docx_file2)

@@ -272,6 +307,9 @@ def compare_subscript_contains(docx_file1, docx_file2):


 def has_page_numbers_in_footers(docx_file):
+    if not docx_file:
+        return 0
+
    doc = Document(docx_file)

    for section in doc.sections:
@@ -286,6 +324,9 @@ def has_page_numbers_in_footers(docx_file):


 def is_first_line_centered(docx_file):
+    if not docx_file:
+        return 0
+
    doc = Document(docx_file)
    first_paragraph = doc.paragraphs[0]

@@ -294,11 +335,16 @@ def is_first_line_centered(docx_file):


 def check_file_exists(directory, filename):
+    if not directory or not filename:
+        return 0
    file_path = os.path.join(directory, filename)
    return 1 if os.path.isfile(file_path) else 0


 def check_tabstops(docx_file1, docx_file2, **kwargs) -> float:
+    if not docx_file1 or not docx_file2:
+        return .0
+
    doc1: Document = Document(docx_file1)
    doc2: Document = Document(docx_file2)
    para1 = [p for p in doc1.paragraphs if p.text.strip()]
@@ -334,6 +380,9 @@ def check_tabstops(docx_file1, docx_file2, **kwargs) -> float:


 def compare_contains_image(docx_file1, docx_file2):
+    if not docx_file1 or not docx_file2:
+        return 0
+
    doc1 = Document(docx_file1)
    doc2 = Document(docx_file2)

@@ -346,6 +395,9 @@ def compare_contains_image(docx_file1, docx_file2):


 def evaluate_colored_words_in_tables(file_path1, file_path2, **kwargs):
+    if not file_path1 or not file_path2:
+        return 0
+
    if not compare_docx_files(file_path1, file_path2):
        return 0
    document = Document(file_path1)
@@ -380,6 +432,9 @@ def evaluate_colored_words_in_tables(file_path1, file_path2, **kwargs):


 def check_highlighted_words(file_path1, file_path2):
+    if not file_path1 or not file_path2:
+        return 0
+
    if not compare_docx_files(file_path1, file_path2):
        return 0

@@ -402,6 +457,9 @@ def check_highlighted_words(file_path1, file_path2):


 def evaluate_strike_through_last_paragraph(file_path1, file_path2):
+    if not file_path1 or not file_path2:
+        return 0
+
    if not compare_docx_files(file_path1, file_path2):
        return 0
    document = Document(file_path1)
@@ -418,6 +476,9 @@ def evaluate_strike_through_last_paragraph(file_path1, file_path2):


 def evaluate_conversion(file_path):
+    if not file_path:
+        return 0
+
    document = Document(file_path)

    for table in document.tables:
@@ -437,6 +498,9 @@ def evaluate_conversion(file_path):


 def evaluate_spacing(file_path):
+    if not file_path:
+        return 0
+
    document = Document(file_path)

    # Check line spacing for introduction, body, and conclusion
@@ -450,6 +514,9 @@ def evaluate_spacing(file_path):


 def check_italic_font_size_14(path1, path2):
+    if not path1 or not path2:
+        return 0
+
    if not compare_docx_files(path1, path2):
        return 0
    document = Document(path1)
@@ -463,6 +530,9 @@ def check_italic_font_size_14(path1, path2):


 def evaluate_alignment(docx_path):
+    if not docx_path:
+        return 0
+
    # Load the document
    doc = Document(docx_path)

@@ -492,6 +562,9 @@ def evaluate_alignment(docx_path):


 def get_unique_train_ids(initial_file):  # fixed standard
+    if not initial_file:
+        return set(), 0
+
    doc = Document(initial_file)
    train_ids = set()
    processed_lines = 0
@@ -508,6 +581,9 @@ def get_unique_train_ids(initial_file):  # fixed standard


 def check_no_duplicates(initial_file, processed_file):
+    if not initial_file or not processed_file:
+        return 0
+
    # Open the document
    train_ids_ini, ini_lines = get_unique_train_ids(initial_file)
    doc_processed = Document(processed_file)
@@ -535,6 +611,9 @@ def check_no_duplicates(initial_file, processed_file):


 def compare_docx_lines(file1, file2):
+    if not file1 or not file2:
+        return 0
+
    # Read the text of the document, line by line
    doc1 = Document(file1)
    doc1_lines = [p.text.strip() for p in doc1.paragraphs if p.text.strip()]
@@ -551,8 +630,47 @@ def compare_docx_lines(file1, file2):
        return 0


+def compare_docx_files_and_ignore_new_lines(file1, file2, **options):
+    ignore_blanks = options.get('ignore_blanks', True)
+
+    if not file1 or not file2:
+        return 0
+
+    # Only pairs of '.docx' paths are supported: load both and keep their non-empty paragraph texts
+    if file1.endswith('.docx') and file2.endswith('.docx'):
+        doc1 = Document(file1)
+        doc2 = Document(file2)
+        # Drop paragraphs whose text is exactly '' (whitespace-only paragraphs are kept)
+        doc1 = [p for p in doc1.paragraphs if p.text != '']
+        doc2 = [p for p in doc2.paragraphs if p.text != '']
+        doc1_paragraphs = [p.text for p in doc1]
+        doc2_paragraphs = [p.text for p in doc2]
+    else:
+        # At least one path does not end with '.docx': report it and score 0
+        print("Unsupported file types or mismatch between file types.")
+        return 0
+
+    # ignore_blanks (default True): join paragraphs, collapse each whitespace run to one space, compare once
+    if ignore_blanks:
+        text1 = re.sub(r'\s+', ' ', '\n'.join(doc1_paragraphs)).strip()
+        text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip()
+        if text1 != text2:
+            return 0
+    else:
+        if len(doc1_paragraphs) != len(doc2_paragraphs):
+            return 0
+        # Exact, case-sensitive comparison of the remaining paragraphs, pairwise and in order
+        for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
+            if p1 != p2:
+                return 0
+    return 1
+
+
 # Docx file saved in the ubuntu cannot use this function to compare highlight, don't know why, deprecated
 def compare_highlighted_text(file1, file2):
+    if not file1 or not file2:
+        return 0
+
    def extract_highlighted_text(file_path):
        highlighted_texts = []

@@ -590,6 +708,9 @@ def compare_highlighted_text(file1, file2):


 def compare_references(file1, file2, **options):
+    if not file1 or not file2:
+        return 0
+
    reference_indicator = options.get('reference_indicator', 'References')
    reference_base_result = options.get('reference_base_result', 0.5)

@@ -634,48 +755,3 @@ def compare_references(file1, file2, **options):
        return (result - reference_base_result) / (1 - reference_base_result)
    else:
        return 0
-
-
-def compare_answer(file1, file2, **options):
-    """This is a specific function to compare the """
-    # Determine file types and load documents
-    if file1.endswith('.docx') and file2.endswith('.docx'):
-        doc1 = Document(file1)
-        doc2 = Document(file2)
-        doc1_paragraphs = [p.text for p in doc1.paragraphs]
-        doc2_paragraphs = [p.text for p in doc2.paragraphs]
-    else:
-        # Unsupported file types or mismatch
-        print("Unsupported file types or mismatch between file types.")
-        return 0
-
-    # Find the references section in the paragraphs, find the idx of the last reference_indicator in the paragraph list
-    ref1_idx = doc1_paragraphs.index(reference_indicator) if reference_indicator in doc1_paragraphs else -1
-    ref2_idx = doc2_paragraphs.index(reference_indicator) if reference_indicator in doc2_paragraphs else -1
-
-    if ref1_idx == -1 and ref2_idx == -1:
-        return 1
-
-    if ref1_idx == -1 or ref2_idx == -1:
-        return 0
-
-    # split the reference section into reference items, and remove the empty string items
-    ref1 = [p for p in doc1_paragraphs[ref1_idx + 1:] if p.strip()]
-    ref2 = [p for p in doc2_paragraphs[ref2_idx + 1:] if p.strip()]
-
-    # Compare the references
-
-    if len(ref1) != len(ref2):
-        return 0
-
-    total_similarity = 0
-    for r1, r2 in zip(ref1, ref2):
-        # fuzzy match the references
-        similarity = fuzz.ratio(r1, r2) / 100.0
-        total_similarity += similarity
-
-    result = total_similarity / len(ref1)
-    if result >= reference_base_result:
-        return (result - reference_base_result) / (1 - reference_base_result)
-    else:
-        return 0