Add handling of None (missing) file paths in the doc metrics

2024-03-09 00:16:50 +08:00
parent 62b3b2390d
commit 4de0eff703
1 changed files with 88 additions and 49 deletions
--- a/desktop_env/evaluators/metrics/docs.py
+++ b/desktop_env/evaluators/metrics/docs.py
@@ -3,8 +3,10 @@ import os
 import re
 import xml.etree.ElementTree as ET
 import zipfile
 from io import BytesIO
 from typing import List, Dict, Any
 from PIL import Image
 from docx import Document
 from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_TAB_ALIGNMENT
 from docx.shared import RGBColor
@@ -23,6 +25,9 @@ def find_default_font(config_file_path, rules):
    default_font = None
    expected_font = rules["font_name"]
    if not config_file_path:
        return 0
    try:
        tree = ET.parse(config_file_path)
        root = tree.getroot()
@@ -42,6 +47,9 @@ def find_default_font(config_file_path, rules):
 def contains_page_break(docx_file):
    if not docx_file:
        return 0
    doc = Document(docx_file)
    namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
@@ -62,6 +70,9 @@ def compare_docx_files(file1, file2, **options):
    ignore_order = options.get('ignore_order', False)
    content_only = options.get('content_only', False)
    if not file1 or not file2:
        return 0
    def get_paragraph_texts_odt(document):
        paragraphs = document.getElementsByType(P)
        paragraph_texts = []
@@ -139,6 +150,9 @@ def compare_docx_files(file1, file2, **options):
 def compare_init_lines(file1, file2):
    if not file1 or not file2:
        return 0
    doc1 = Document(file1)
    doc2 = Document(file2)
@@ -156,6 +170,9 @@ def compare_init_lines(file1, file2):
 def compare_docx_tables(docx_file1, docx_file2):
    if not docx_file1 or not docx_file2:
        return 0
    doc1 = Document(docx_file1)
    doc2 = Document(docx_file2)
@@ -181,11 +198,10 @@ def compare_docx_tables(docx_file1, docx_file2):
    return 1
 from io import BytesIO
 from PIL import Image
 def compare_docx_images(docx_file1, docx_file2):
    if not docx_file1 or not docx_file2:
        return 0
    doc1 = Document(docx_file1)
    doc2 = Document(docx_file2)
@@ -211,6 +227,9 @@ import pytesseract
 def compare_image_text(image_path, rule):
    if not image_path:
        return 0
    img = Image.open(image_path)
    img_text = pytesseract.image_to_string(img)
    if rule['type'] == 'text':
@@ -220,6 +239,9 @@ def compare_image_text(image_path, rule):
 def compare_line_spacing(docx_file1, docx_file2):
    if not docx_file1 or not docx_file2:
        return 0
    if not compare_docx_files(docx_file1, docx_file2):
        return 0
    doc1 = Document(docx_file1)
@@ -241,6 +263,9 @@ def compare_line_spacing(docx_file1, docx_file2):
 def compare_insert_equation(docx_file1, docx_file2):
    if not docx_file1 or not docx_file2:
        return 0
    if not compare_docx_files(docx_file1, docx_file2):
        return 0
@@ -256,6 +281,9 @@ def compare_insert_equation(docx_file1, docx_file2):
 def compare_font_names(docx_file, rules: List[Dict[str, Any]]):
    if not docx_file:
        return 0
    doc = Document(docx_file)
    expected_font = rules["font_name"]
@@ -268,6 +296,9 @@ def compare_font_names(docx_file, rules: List[Dict[str, Any]]):
 def compare_subscript_contains(docx_file1, docx_file2):
    if not docx_file1 or not docx_file2:
        return 0
    doc1 = Document(docx_file1)
    doc2 = Document(docx_file2)
@@ -280,6 +311,9 @@ def compare_subscript_contains(docx_file1, docx_file2):
 def has_page_numbers_in_footers(docx_file):
    if not docx_file:
        return 0
    doc = Document(docx_file)
    for section in doc.sections:
@@ -294,6 +328,9 @@ def has_page_numbers_in_footers(docx_file):
 def is_first_line_centered(docx_file):
    if not docx_file:
        return 0
    doc = Document(docx_file)
    first_paragraph = doc.paragraphs[0]
@@ -302,11 +339,16 @@ def is_first_line_centered(docx_file):
 def check_file_exists(directory, filename):
     """Report whether `filename` is a regular file inside `directory`.

     Returns 1 when the joined path points at an existing regular file.
     Returns 0 when it does not, or when either argument is missing/empty.
     """
     if directory and filename:
         candidate = os.path.join(directory, filename)
         if os.path.isfile(candidate):
             return 1
     return 0
 def check_tabstops(docx_file1, docx_file2, **kwargs) -> float:
    if not docx_file1 or not docx_file2:
        return .0
    doc1: Document = Document(docx_file1)
    doc2: Document = Document(docx_file2)
    para1 = [p for p in doc1.paragraphs if p.text.strip()]
@@ -342,6 +384,9 @@ def check_tabstops(docx_file1, docx_file2, **kwargs) -> float:
 def compare_contains_image(docx_file1, docx_file2):
    if not docx_file1 or not docx_file2:
        return 0
    doc1 = Document(docx_file1)
    doc2 = Document(docx_file2)
@@ -354,6 +399,9 @@ def compare_contains_image(docx_file1, docx_file2):
 def evaluate_colored_words_in_tables(file_path1, file_path2, **kwargs):
    if not file_path1 or not file_path2:
        return 0
    if not compare_docx_files(file_path1, file_path2):
        return 0
    document = Document(file_path1)
@@ -388,6 +436,9 @@ def evaluate_colored_words_in_tables(file_path1, file_path2, **kwargs):
 def check_highlighted_words(file_path1, file_path2):
    if not file_path1 or not file_path2:
        return 0
    if not compare_docx_files(file_path1, file_path2):
        return 0
@@ -410,6 +461,9 @@ def check_highlighted_words(file_path1, file_path2):
 def evaluate_strike_through_last_paragraph(file_path1, file_path2):
    if not file_path1 or not file_path2:
        return 0
    if not compare_docx_files(file_path1, file_path2):
        return 0
    document = Document(file_path1)
@@ -426,6 +480,9 @@ def evaluate_strike_through_last_paragraph(file_path1, file_path2):
 def evaluate_conversion(file_path):
    if not file_path:
        return 0
    document = Document(file_path)
    for table in document.tables:
@@ -445,6 +502,9 @@ def evaluate_conversion(file_path):
 def evaluate_spacing(file_path):
    if not file_path:
        return 0
    document = Document(file_path)
    # Check line spacing for introduction, body, and conclusion
@@ -458,6 +518,9 @@ def evaluate_spacing(file_path):
 def check_italic_font_size_14(path1, path2):
    if not path1 or not path2:
        return 0
    if not compare_docx_files(path1, path2):
        return 0
    document = Document(path1)
@@ -471,6 +534,9 @@ def check_italic_font_size_14(path1, path2):
 def evaluate_alignment(docx_path):
    if not docx_path:
        return 0
    # Load the document
    doc = Document(docx_path)
@@ -500,6 +566,9 @@ def evaluate_alignment(docx_path):
 def get_unique_train_ids(initial_file):  # fixed standard
    if not initial_file:
        return set(), 0
    doc = Document(initial_file)
    train_ids = set()
    processed_lines = 0
@@ -516,6 +585,9 @@ def get_unique_train_ids(initial_file):  # fixed standard
 def check_no_duplicates(initial_file, processed_file):
    if not initial_file or not processed_file:
        return 0
    # Open the document
    train_ids_ini, ini_lines = get_unique_train_ids(initial_file)
    doc_processed = Document(processed_file)
@@ -543,6 +615,9 @@ def check_no_duplicates(initial_file, processed_file):
 def compare_docx_lines(file1, file2):
    if not file1 or not file2:
        return 0
    # Read the text of the document, line by line
    doc1 = Document(file1)
    doc1_lines = [p.text.strip() for p in doc1.paragraphs if p.text.strip()]
@@ -562,6 +637,9 @@ def compare_docx_lines(file1, file2):
 def compare_docx_files_and_ignore_new_lines(file1, file2, **options):
    ignore_blanks = options.get('ignore_blanks', True)
    if not file1 or not file2:
        return 0
    # Determine file types and load documents
    if file1.endswith('.docx') and file2.endswith('.docx'):
        doc1 = Document(file1)
@@ -594,6 +672,9 @@ def compare_docx_files_and_ignore_new_lines(file1, file2, **options):
 # Deprecated: docx files saved on Ubuntu cannot have their highlights compared with this function (cause unknown).
 def compare_highlighted_text(file1, file2):
    if not file1 or not file2:
        return 0
    def extract_highlighted_text(file_path):
        highlighted_texts = []
@@ -631,6 +712,9 @@ def compare_highlighted_text(file1, file2):
 def compare_references(file1, file2, **options):
    if not file1 or not file2:
        return 0
    reference_indicator = options.get('reference_indicator', 'References')
    reference_base_result = options.get('reference_base_result', 0.5)
@@ -675,48 +759,3 @@ def compare_references(file1, file2, **options):
        return (result - reference_base_result) / (1 - reference_base_result)
    else:
        return 0
 def compare_answer(file1, file2, **options):
     """Score how closely the reference lists of two .docx files match.

     Each document is read as a list of paragraph texts. The paragraphs that
     follow the first paragraph equal to ``reference_indicator`` are treated
     as the reference items, and the two item lists are compared pairwise
     with a fuzzy string ratio.

     Args:
         file1: Path to the first .docx file.
         file2: Path to the second .docx file.
         **options: Optional settings:
             reference_indicator: Exact paragraph text that opens the
                 reference section (default ``'References'``).
             reference_base_result: Mean similarity that maps to a score
                 of 0; anything below it also scores 0 (default ``0.5``).

     Returns:
         0 if either path is missing, the files are not both ``.docx``, only
         one document has a reference section, the number of reference items
         differs, or the mean similarity is below ``reference_base_result``.
         1 if neither document has a reference section, or both sections are
         empty. Otherwise the mean similarity rescaled from
         ``[reference_base_result, 1]`` to ``[0, 1]``.
     """
     # A missing file (None or empty path) can never match.
     if not file1 or not file2:
         return 0

     # These were previously read without ever being defined (NameError).
     reference_indicator = options.get('reference_indicator', 'References')
     reference_base_result = options.get('reference_base_result', 0.5)

     # Determine file types and load documents
     if file1.endswith('.docx') and file2.endswith('.docx'):
         doc1 = Document(file1)
         doc2 = Document(file2)
         doc1_paragraphs = [p.text for p in doc1.paragraphs]
         doc2_paragraphs = [p.text for p in doc2.paragraphs]
     else:
         # Unsupported file types or mismatch
         print("Unsupported file types or mismatch between file types.")
         return 0

     # Locate the reference section: index of the FIRST paragraph whose text
     # equals reference_indicator, or -1 when there is none.
     ref1_idx = doc1_paragraphs.index(reference_indicator) if reference_indicator in doc1_paragraphs else -1
     ref2_idx = doc2_paragraphs.index(reference_indicator) if reference_indicator in doc2_paragraphs else -1

     if ref1_idx == -1 and ref2_idx == -1:
         return 1
     if ref1_idx == -1 or ref2_idx == -1:
         return 0

     # Split the reference section into reference items, dropping blank ones.
     ref1 = [p for p in doc1_paragraphs[ref1_idx + 1:] if p.strip()]
     ref2 = [p for p in doc2_paragraphs[ref2_idx + 1:] if p.strip()]

     # Compare the references
     if len(ref1) != len(ref2):
         return 0
     if not ref1:
         # Both sections are empty: identical, and avoids dividing by zero.
         return 1

     # Fuzzy-match the references pairwise and average the similarities.
     total_similarity = sum(fuzz.ratio(r1, r2) / 100.0 for r1, r2 in zip(ref1, ref2))
     result = total_similarity / len(ref1)

     if result >= reference_base_result:
         return (result - reference_base_result) / (1 - reference_base_result)
     return 0