diff --git a/desktop_env/evaluators/metrics/docs.py b/desktop_env/evaluators/metrics/docs.py index 3eceed5..706f3ba 100644 --- a/desktop_env/evaluators/metrics/docs.py +++ b/desktop_env/evaluators/metrics/docs.py @@ -3,8 +3,10 @@ import os import re import xml.etree.ElementTree as ET import zipfile +from io import BytesIO from typing import List, Dict, Any +from PIL import Image from docx import Document from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_TAB_ALIGNMENT from docx.shared import RGBColor @@ -23,6 +25,9 @@ def find_default_font(config_file_path, rules): default_font = None expected_font = rules["font_name"] + if not config_file_path: + return 0 + try: tree = ET.parse(config_file_path) root = tree.getroot() @@ -42,6 +47,9 @@ def find_default_font(config_file_path, rules): def contains_page_break(docx_file): + if not docx_file: + return 0 + doc = Document(docx_file) namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'} @@ -62,6 +70,9 @@ def compare_docx_files(file1, file2, **options): ignore_order = options.get('ignore_order', False) content_only = options.get('content_only', False) + if not file1 or not file2: + return 0 + def get_paragraph_texts_odt(document): paragraphs = document.getElementsByType(P) paragraph_texts = [] @@ -139,6 +150,9 @@ def compare_docx_files(file1, file2, **options): def compare_init_lines(file1, file2): + if not file1 or not file2: + return 0 + doc1 = Document(file1) doc2 = Document(file2) @@ -156,6 +170,9 @@ def compare_init_lines(file1, file2): def compare_docx_tables(docx_file1, docx_file2): + if not docx_file1 or not docx_file2: + return 0 + doc1 = Document(docx_file1) doc2 = Document(docx_file2) @@ -181,11 +198,10 @@ def compare_docx_tables(docx_file1, docx_file2): return 1 -from io import BytesIO -from PIL import Image - - def compare_docx_images(docx_file1, docx_file2): + if not docx_file1 or not docx_file2: + return 0 + doc1 = Document(docx_file1) doc2 = Document(docx_file2) @@ -211,6 +227,9 @@ import pytesseract def compare_image_text(image_path, rule): + if not image_path: + return 0 + img = Image.open(image_path) img_text = pytesseract.image_to_string(img) if rule['type'] == 'text': @@ -220,6 +239,9 @@ def compare_image_text(image_path, rule): def compare_line_spacing(docx_file1, docx_file2): + if not docx_file1 or not docx_file2: + return 0 + if not compare_docx_files(docx_file1, docx_file2): return 0 doc1 = Document(docx_file1) @@ -241,6 +263,9 @@ def compare_line_spacing(docx_file1, docx_file2): def compare_insert_equation(docx_file1, docx_file2): + if not docx_file1 or not docx_file2: + return 0 + if not compare_docx_files(docx_file1, docx_file2): return 0 @@ -256,6 +281,9 @@ def compare_insert_equation(docx_file1, docx_file2): def compare_font_names(docx_file, rules: List[Dict[str, Any]]): + if not docx_file: + return 0 + doc = Document(docx_file) expected_font = rules["font_name"] @@ -268,6 +296,9 @@ def compare_font_names(docx_file, rules: List[Dict[str, Any]]): def compare_subscript_contains(docx_file1, docx_file2): + if not docx_file1 or not docx_file2: + return 0 + doc1 = Document(docx_file1) doc2 = Document(docx_file2) @@ -280,6 +311,9 @@ def compare_subscript_contains(docx_file1, docx_file2): def has_page_numbers_in_footers(docx_file): + if not docx_file: + return 0 + doc = Document(docx_file) for section in doc.sections: @@ -294,6 +328,9 @@ def has_page_numbers_in_footers(docx_file): def is_first_line_centered(docx_file): + if not docx_file: + return 0 + doc = Document(docx_file) first_paragraph = doc.paragraphs[0] @@ -302,11 +339,16 @@ def is_first_line_centered(docx_file): def check_file_exists(directory, filename): + if not directory or not filename: + return 0 file_path = os.path.join(directory, filename) return 1 if os.path.isfile(file_path) else 0 def check_tabstops(docx_file1, docx_file2, **kwargs) -> float: + if not docx_file1 or not docx_file2: + return .0 + doc1: Document = Document(docx_file1) doc2: Document = Document(docx_file2) para1 = [p for p in doc1.paragraphs if p.text.strip()] @@ -342,6 +384,9 @@ def check_tabstops(docx_file1, docx_file2, **kwargs) -> float: def compare_contains_image(docx_file1, docx_file2): + if not docx_file1 or not docx_file2: + return 0 + doc1 = Document(docx_file1) doc2 = Document(docx_file2) @@ -354,6 +399,9 @@ def compare_contains_image(docx_file1, docx_file2): def evaluate_colored_words_in_tables(file_path1, file_path2, **kwargs): + if not file_path1 or not file_path2: + return 0 + if not compare_docx_files(file_path1, file_path2): return 0 document = Document(file_path1) @@ -388,6 +436,9 @@ def evaluate_colored_words_in_tables(file_path1, file_path2, **kwargs): def check_highlighted_words(file_path1, file_path2): + if not file_path1 or not file_path2: + return 0 + if not compare_docx_files(file_path1, file_path2): return 0 @@ -410,6 +461,9 @@ def check_highlighted_words(file_path1, file_path2): def evaluate_strike_through_last_paragraph(file_path1, file_path2): + if not file_path1 or not file_path2: + return 0 + if not compare_docx_files(file_path1, file_path2): return 0 document = Document(file_path1) @@ -426,6 +480,9 @@ def evaluate_strike_through_last_paragraph(file_path1, file_path2): def evaluate_conversion(file_path): + if not file_path: + return 0 + document = Document(file_path) for table in document.tables: @@ -445,6 +502,9 @@ def evaluate_conversion(file_path): def evaluate_spacing(file_path): + if not file_path: + return 0 + document = Document(file_path) # Check line spacing for introduction, body, and conclusion @@ -458,6 +518,9 @@ def evaluate_spacing(file_path): def check_italic_font_size_14(path1, path2): + if not path1 or not path2: + return 0 + if not compare_docx_files(path1, path2): return 0 document = Document(path1) @@ -471,6 +534,9 @@ def check_italic_font_size_14(path1, path2): def evaluate_alignment(docx_path): + if not docx_path: + return 0 + # Load the document doc = Document(docx_path) @@ -500,6 +566,9 @@ def evaluate_alignment(docx_path): def get_unique_train_ids(initial_file): # fixed standard + if not initial_file: + return set(), 0 + doc = Document(initial_file) train_ids = set() processed_lines = 0 @@ -516,6 +585,9 @@ def get_unique_train_ids(initial_file): # fixed standard def check_no_duplicates(initial_file, processed_file): + if not initial_file or not processed_file: + return 0 + # Open the document train_ids_ini, ini_lines = get_unique_train_ids(initial_file) doc_processed = Document(processed_file) @@ -543,6 +615,9 @@ def check_no_duplicates(initial_file, processed_file): def compare_docx_lines(file1, file2): + if not file1 or not file2: + return 0 + # Read the text of the document, line by line doc1 = Document(file1) doc1_lines = [p.text.strip() for p in doc1.paragraphs if p.text.strip()] @@ -562,6 +637,9 @@ def compare_docx_lines(file1, file2): def compare_docx_files_and_ignore_new_lines(file1, file2, **options): ignore_blanks = options.get('ignore_blanks', True) + if not file1 or not file2: + return 0 + # Determine file types and load documents if file1.endswith('.docx') and file2.endswith('.docx'): doc1 = Document(file1) @@ -594,6 +672,9 @@ def compare_docx_files_and_ignore_new_lines(file1, file2, **options): # Docx file saved in the ubuntu cannot use this function to compare highlight, don't know why, deprecated def compare_highlighted_text(file1, file2): + if not file1 or not file2: + return 0 + def extract_highlighted_text(file_path): highlighted_texts = [] @@ -631,6 +712,9 @@ def compare_highlighted_text(file1, file2): def compare_references(file1, file2, **options): + if not file1 or not file2: + return 0 + reference_indicator = options.get('reference_indicator', 'References') reference_base_result = options.get('reference_base_result', 0.5) @@ -675,48 +759,3 @@ def compare_references(file1, file2, **options): return (result - reference_base_result) / (1 - reference_base_result) else: return 0 - - -def compare_answer(file1, file2, **options): - """This is a specific function to compare the """ - # Determine file types and load documents - if file1.endswith('.docx') and file2.endswith('.docx'): - doc1 = Document(file1) - doc2 = Document(file2) - doc1_paragraphs = [p.text for p in doc1.paragraphs] - doc2_paragraphs = [p.text for p in doc2.paragraphs] - else: - # Unsupported file types or mismatch - print("Unsupported file types or mismatch between file types.") - return 0 - - # Find the references section in the paragraphs, find the idx of the last reference_indicator in the paragraph list - ref1_idx = doc1_paragraphs.index(reference_indicator) if reference_indicator in doc1_paragraphs else -1 - ref2_idx = doc2_paragraphs.index(reference_indicator) if reference_indicator in doc2_paragraphs else -1 - - if ref1_idx == -1 and ref2_idx == -1: - return 1 - - if ref1_idx == -1 or ref2_idx == -1: - return 0 - - # split the reference section into reference items, and remove the empty string items - ref1 = [p for p in doc1_paragraphs[ref1_idx + 1:] if p.strip()] - ref2 = [p for p in doc2_paragraphs[ref2_idx + 1:] if p.strip()] - - # Compare the references - - if len(ref1) != len(ref2): - return 0 - - total_similarity = 0 - for r1, r2 in zip(ref1, ref2): - # fuzzy match the references - similarity = fuzz.ratio(r1, r2) / 100.0 - total_similarity += similarity - - result = total_similarity / len(ref1) - if result >= reference_base_result: - return (result - reference_base_result) / (1 - reference_base_result) - else: - return 0