Fix bugs in multiple examples

2024-03-10 23:52:29 +08:00
parent 3f19cc5117
commit b3d27f6387
4 changed files with 276 additions and 144 deletions
--- a/desktop_env/evaluators/metrics/docs.py
+++ b/desktop_env/evaluators/metrics/docs.py
@@ -50,7 +50,11 @@ def contains_page_break(docx_file):
    if not docx_file:
        return 0

-    doc = Document(docx_file)
+    try:
+        doc = Document(docx_file)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0

    namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

@@ -91,16 +95,24 @@ def compare_docx_files(file1, file2, **options):

    # Determine file types and load documents
    if file1.endswith('.docx') and file2.endswith('.docx'):
-        doc1 = Document(file1)
-        doc2 = Document(file2)
+        try:
+            doc1 = Document(file1)
+            doc2 = Document(file2)
+        except Exception as e:
+            logger.error(f"Error: {e}")
+            return 0
        doc1_paragraphs = [p.text for p in doc1.paragraphs]
        doc2_paragraphs = [p.text for p in doc2.paragraphs]
        if ignore_order:
            doc1_paragraphs = sorted(doc1_paragraphs)
            doc2_paragraphs = sorted(doc2_paragraphs)
    elif file1.endswith('.odt') and file2.endswith('.odt'):
-        doc1 = load(file1)
-        doc2 = load(file2)
+        try:
+            doc1 = load(file1)
+            doc2 = load(file2)
+        except Exception as e:
+            logger.error(f"Error: {e}")
+            return 0
        doc1_paragraphs = get_paragraph_texts_odt(doc1)
        doc2_paragraphs = get_paragraph_texts_odt(doc2)
        if ignore_order:
@@ -153,8 +165,12 @@ def compare_init_lines(file1, file2):
    if not file1 or not file2:
        return 0

-    doc1 = Document(file1)
-    doc2 = Document(file2)
+    try:
+        doc1 = Document(file1)
+        doc2 = Document(file2)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0

    doc1_paragraphs = [p.text for p in doc1.paragraphs]
    doc2_paragraphs = [p.text for p in doc2.paragraphs]
@@ -173,8 +189,12 @@ def compare_docx_tables(docx_file1, docx_file2):
    if not docx_file1 or not docx_file2:
        return 0

-    doc1 = Document(docx_file1)
-    doc2 = Document(docx_file2)
+    try:
+        doc1 = Document(docx_file1)
+        doc2 = Document(docx_file2)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0

    # get list of tables in docx
    tables1 = doc1.tables
@@ -202,8 +222,12 @@ def compare_docx_images(docx_file1, docx_file2):
    if not docx_file1 or not docx_file2:
        return 0

-    doc1 = Document(docx_file1)
-    doc2 = Document(docx_file2)
+    try:
+        doc1 = Document(docx_file1)
+        doc2 = Document(docx_file2)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0

    def extract_images(doc):
        images = []
@@ -240,8 +264,13 @@ def compare_line_spacing(docx_file1, docx_file2):

    if not compare_docx_files(docx_file1, docx_file2):
        return 0
-    doc1 = Document(docx_file1)
-    doc2 = Document(docx_file2)
+
+    try:
+        doc1 = Document(docx_file1)
+        doc2 = Document(docx_file2)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0

    if len(doc1.paragraphs) != len(doc2.paragraphs):
        return 0
@@ -265,8 +294,12 @@ def compare_insert_equation(docx_file1, docx_file2):
    if not compare_docx_files(docx_file1, docx_file2):
        return 0

-    doc1 = Document(docx_file1)
-    doc2 = Document(docx_file2)
+    try:
+        doc1 = Document(docx_file1)
+        doc2 = Document(docx_file2)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0

    # Compare each paragraph if it contains equation
    for para1, para2 in zip(doc1.paragraphs, doc2.paragraphs):
@@ -280,7 +313,12 @@ def compare_font_names(docx_file, rules: List[Dict[str, Any]]):
    if not docx_file:
        return 0

-    doc = Document(docx_file)
+    try:
+        doc = Document(docx_file)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0
+
    expected_font = rules["font_name"]

    for paragraph in doc.paragraphs:
@@ -295,8 +333,12 @@ def compare_subscript_contains(docx_file1, docx_file2):
    if not docx_file1 or not docx_file2:
        return 0

-    doc1 = Document(docx_file1)
-    doc2 = Document(docx_file2)
+    try:
+        doc1 = Document(docx_file1)
+        doc2 = Document(docx_file2)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0

    for para1, para2 in zip(doc1.paragraphs, doc2.paragraphs):
        for run1, run2 in zip(para1.runs, para2.runs):
@@ -310,7 +352,11 @@ def has_page_numbers_in_footers(docx_file):
    if not docx_file:
        return 0

-    doc = Document(docx_file)
+    try:
+        doc = Document(docx_file)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0

    for section in doc.sections:
        footer = section.footer
@@ -327,7 +373,12 @@ def is_first_line_centered(docx_file):
    if not docx_file:
        return 0

-    doc = Document(docx_file)
+    try:
+        doc = Document(docx_file)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0
+
    first_paragraph = doc.paragraphs[0]

    # check if the first line is center justified
@@ -345,8 +396,13 @@ def check_tabstops(docx_file1, docx_file2, **kwargs) -> float:
    if not docx_file1 or not docx_file2:
        return .0

-    doc1: Document = Document(docx_file1)
-    doc2: Document = Document(docx_file2)
+    try:
+        doc1: Document = Document(docx_file1)
+        doc2: Document = Document(docx_file2)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return .0
+
    para1 = [p for p in doc1.paragraphs if p.text.strip()]
    para2 = [p for p in doc2.paragraphs if p.text.strip()]
    if len(para1) != len(para2): return .0
@@ -383,8 +439,12 @@ def compare_contains_image(docx_file1, docx_file2):
    if not docx_file1 or not docx_file2:
        return 0

-    doc1 = Document(docx_file1)
-    doc2 = Document(docx_file2)
+    try:
+        doc1 = Document(docx_file1)
+        doc2 = Document(docx_file2)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0

    for para1, para2 in zip(doc1.paragraphs, doc2.paragraphs):
        for run1, run2 in zip(para1.runs, para2.runs):
@@ -400,7 +460,13 @@ def evaluate_colored_words_in_tables(file_path1, file_path2, **kwargs):

    if not compare_docx_files(file_path1, file_path2):
        return 0
-    document = Document(file_path1)
+
+    try:
+        document = Document(file_path1)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0
+
    threshold = kwargs.get('threshold', 3.5)

    def _calculate_color_difference(rgb1, rgb2):
@@ -462,7 +528,12 @@ def evaluate_strike_through_last_paragraph(file_path1, file_path2):

    if not compare_docx_files(file_path1, file_path2):
        return 0
-    document = Document(file_path1)
+
+    try:
+        document = Document(file_path1)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0

    # Get the last paragraph
    last_paragraph = document.paragraphs[-1]
@@ -479,7 +550,11 @@ def evaluate_conversion(file_path):
    if not file_path:
        return 0

-    document = Document(file_path)
+    try:
+        document = Document(file_path)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0

    for table in document.tables:
        for row in table.rows:
@@ -501,7 +576,11 @@ def evaluate_spacing(file_path):
    if not file_path:
        return 0

-    document = Document(file_path)
+    try:
+        document = Document(file_path)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0

    # Check line spacing for introduction, body, and conclusion
    introduction_spacing = document.paragraphs[0].paragraph_format.line_spacing
@@ -519,7 +598,13 @@ def check_italic_font_size_14(path1, path2):

    if not compare_docx_files(path1, path2):
        return 0
-    document = Document(path1)
+
+    try:
+        document = Document(path1)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0
+
    for paragraph in document.paragraphs:
        for run in paragraph.runs:
            if run.italic:
@@ -534,7 +619,11 @@ def evaluate_alignment(docx_path):
        return 0

    # Load the document
-    doc = Document(docx_path)
+    try:
+        doc = Document(docx_path)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0

    # Iterate through each paragraph in the document
    for para in doc.paragraphs:
@@ -565,7 +654,12 @@ def get_unique_train_ids(initial_file):  # fixed standard
    if not initial_file:
        return set(), 0

-    doc = Document(initial_file)
+    try:
+        doc = Document(initial_file)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return set(), 0
+
    train_ids = set()
    processed_lines = 0

@@ -586,7 +680,13 @@ def check_no_duplicates(initial_file, processed_file):

    # Open the document
    train_ids_ini, ini_lines = get_unique_train_ids(initial_file)
-    doc_processed = Document(processed_file)
+
+    try:
+        doc_processed = Document(processed_file)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0
+
    train_ids_pro = set()
    processed_lines = 0  # Counter for valid lines processed

@@ -615,10 +715,14 @@ def compare_docx_lines(file1, file2):
        return 0

    # Read the text of the document, line by line
-    doc1 = Document(file1)
-    doc1_lines = [p.text.strip() for p in doc1.paragraphs if p.text.strip()]
+    try:
+        doc1 = Document(file1)
+        doc2 = Document(file2)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        return 0

-    doc2 = Document(file2)
+    doc1_lines = [p.text.strip() for p in doc1.paragraphs if p.text.strip()]
    doc2_lines = [p.text.strip() for p in doc2.paragraphs if p.text.strip()]
    # print(doc1_lines)
    # print(doc2_lines)
@@ -638,8 +742,13 @@ def compare_docx_files_and_ignore_new_lines(file1, file2, **options):

    # Determine file types and load documents
    if file1.endswith('.docx') and file2.endswith('.docx'):
-        doc1 = Document(file1)
-        doc2 = Document(file2)
+        try:
+            doc1 = Document(file1)
+            doc2 = Document(file2)
+        except Exception as e:
+            logger.error(f"Error: {e}")
+            return 0
+
        # First, delete all the blank in paragraphs
        doc1 = [p for p in doc1.paragraphs if p.text != '']
        doc2 = [p for p in doc2.paragraphs if p.text != '']
@@ -716,8 +825,13 @@ def compare_references(file1, file2, **options):

    # Determine file types and load documents
    if file1.endswith('.docx') and file2.endswith('.docx'):
-        doc1 = Document(file1)
-        doc2 = Document(file2)
+        try:
+            doc1 = Document(file1)
+            doc2 = Document(file2)
+        except Exception as e:
+            logger.error(f"Error: {e}")
+            return 0
+
        doc1_paragraphs = [p.text for p in doc1.paragraphs]
        doc2_paragraphs = [p.text for p in doc2.paragraphs]
    else: