Add None-file handling for doc functions

This commit is contained in:
Timothyxxx
2024-03-09 00:16:50 +08:00
parent 62b3b2390d
commit 4de0eff703

View File

@@ -3,8 +3,10 @@ import os
import re
import xml.etree.ElementTree as ET
import zipfile
from io import BytesIO
from typing import List, Dict, Any
from PIL import Image
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_TAB_ALIGNMENT
from docx.shared import RGBColor
@@ -23,6 +25,9 @@ def find_default_font(config_file_path, rules):
default_font = None
expected_font = rules["font_name"]
if not config_file_path:
return 0
try:
tree = ET.parse(config_file_path)
root = tree.getroot()
@@ -42,6 +47,9 @@ def find_default_font(config_file_path, rules):
def contains_page_break(docx_file):
if not docx_file:
return 0
doc = Document(docx_file)
namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
@@ -62,6 +70,9 @@ def compare_docx_files(file1, file2, **options):
ignore_order = options.get('ignore_order', False)
content_only = options.get('content_only', False)
if not file1 or not file2:
return 0
def get_paragraph_texts_odt(document):
paragraphs = document.getElementsByType(P)
paragraph_texts = []
@@ -139,6 +150,9 @@ def compare_docx_files(file1, file2, **options):
def compare_init_lines(file1, file2):
if not file1 or not file2:
return 0
doc1 = Document(file1)
doc2 = Document(file2)
@@ -156,6 +170,9 @@ def compare_init_lines(file1, file2):
def compare_docx_tables(docx_file1, docx_file2):
if not docx_file1 or not docx_file2:
return 0
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
@@ -181,11 +198,10 @@ def compare_docx_tables(docx_file1, docx_file2):
return 1
from io import BytesIO
from PIL import Image
def compare_docx_images(docx_file1, docx_file2):
if not docx_file1 or not docx_file2:
return 0
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
@@ -211,6 +227,9 @@ import pytesseract
def compare_image_text(image_path, rule):
if not image_path:
return 0
img = Image.open(image_path)
img_text = pytesseract.image_to_string(img)
if rule['type'] == 'text':
@@ -220,6 +239,9 @@ def compare_image_text(image_path, rule):
def compare_line_spacing(docx_file1, docx_file2):
if not docx_file1 or not docx_file2:
return 0
if not compare_docx_files(docx_file1, docx_file2):
return 0
doc1 = Document(docx_file1)
@@ -241,6 +263,9 @@ def compare_line_spacing(docx_file1, docx_file2):
def compare_insert_equation(docx_file1, docx_file2):
if not docx_file1 or not docx_file2:
return 0
if not compare_docx_files(docx_file1, docx_file2):
return 0
@@ -256,6 +281,9 @@ def compare_insert_equation(docx_file1, docx_file2):
def compare_font_names(docx_file, rules: List[Dict[str, Any]]):
if not docx_file:
return 0
doc = Document(docx_file)
expected_font = rules["font_name"]
@@ -268,6 +296,9 @@ def compare_font_names(docx_file, rules: List[Dict[str, Any]]):
def compare_subscript_contains(docx_file1, docx_file2):
if not docx_file1 or not docx_file2:
return 0
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
@@ -280,6 +311,9 @@ def compare_subscript_contains(docx_file1, docx_file2):
def has_page_numbers_in_footers(docx_file):
if not docx_file:
return 0
doc = Document(docx_file)
for section in doc.sections:
@@ -294,6 +328,9 @@ def has_page_numbers_in_footers(docx_file):
def is_first_line_centered(docx_file):
if not docx_file:
return 0
doc = Document(docx_file)
first_paragraph = doc.paragraphs[0]
@@ -302,11 +339,16 @@ def is_first_line_centered(docx_file):
def check_file_exists(directory, filename):
    """Report whether ``filename`` is a regular file inside ``directory``.

    Returns 1 when ``os.path.isfile`` accepts the joined path and 0 otherwise.
    A falsy ``directory`` or ``filename`` (e.g. ``None`` or an empty string)
    yields 0 without touching the filesystem.
    """
    if directory and filename:
        candidate = os.path.join(directory, filename)
        if os.path.isfile(candidate):
            return 1
    return 0
def check_tabstops(docx_file1, docx_file2, **kwargs) -> float:
if not docx_file1 or not docx_file2:
return .0
doc1: Document = Document(docx_file1)
doc2: Document = Document(docx_file2)
para1 = [p for p in doc1.paragraphs if p.text.strip()]
@@ -342,6 +384,9 @@ def check_tabstops(docx_file1, docx_file2, **kwargs) -> float:
def compare_contains_image(docx_file1, docx_file2):
if not docx_file1 or not docx_file2:
return 0
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
@@ -354,6 +399,9 @@ def compare_contains_image(docx_file1, docx_file2):
def evaluate_colored_words_in_tables(file_path1, file_path2, **kwargs):
if not file_path1 or not file_path2:
return 0
if not compare_docx_files(file_path1, file_path2):
return 0
document = Document(file_path1)
@@ -388,6 +436,9 @@ def evaluate_colored_words_in_tables(file_path1, file_path2, **kwargs):
def check_highlighted_words(file_path1, file_path2):
if not file_path1 or not file_path2:
return 0
if not compare_docx_files(file_path1, file_path2):
return 0
@@ -410,6 +461,9 @@ def check_highlighted_words(file_path1, file_path2):
def evaluate_strike_through_last_paragraph(file_path1, file_path2):
if not file_path1 or not file_path2:
return 0
if not compare_docx_files(file_path1, file_path2):
return 0
document = Document(file_path1)
@@ -426,6 +480,9 @@ def evaluate_strike_through_last_paragraph(file_path1, file_path2):
def evaluate_conversion(file_path):
if not file_path:
return 0
document = Document(file_path)
for table in document.tables:
@@ -445,6 +502,9 @@ def evaluate_conversion(file_path):
def evaluate_spacing(file_path):
if not file_path:
return 0
document = Document(file_path)
# Check line spacing for introduction, body, and conclusion
@@ -458,6 +518,9 @@ def evaluate_spacing(file_path):
def check_italic_font_size_14(path1, path2):
if not path1 or not path2:
return 0
if not compare_docx_files(path1, path2):
return 0
document = Document(path1)
@@ -471,6 +534,9 @@ def check_italic_font_size_14(path1, path2):
def evaluate_alignment(docx_path):
if not docx_path:
return 0
# Load the document
doc = Document(docx_path)
@@ -500,6 +566,9 @@ def evaluate_alignment(docx_path):
def get_unique_train_ids(initial_file): # fixed standard
if not initial_file:
return set(), 0
doc = Document(initial_file)
train_ids = set()
processed_lines = 0
@@ -516,6 +585,9 @@ def get_unique_train_ids(initial_file): # fixed standard
def check_no_duplicates(initial_file, processed_file):
if not initial_file or not processed_file:
return 0
# Open the document
train_ids_ini, ini_lines = get_unique_train_ids(initial_file)
doc_processed = Document(processed_file)
@@ -543,6 +615,9 @@ def check_no_duplicates(initial_file, processed_file):
def compare_docx_lines(file1, file2):
if not file1 or not file2:
return 0
# Read the text of the document, line by line
doc1 = Document(file1)
doc1_lines = [p.text.strip() for p in doc1.paragraphs if p.text.strip()]
@@ -562,6 +637,9 @@ def compare_docx_lines(file1, file2):
def compare_docx_files_and_ignore_new_lines(file1, file2, **options):
ignore_blanks = options.get('ignore_blanks', True)
if not file1 or not file2:
return 0
# Determine file types and load documents
if file1.endswith('.docx') and file2.endswith('.docx'):
doc1 = Document(file1)
@@ -594,6 +672,9 @@ def compare_docx_files_and_ignore_new_lines(file1, file2, **options):
# Deprecated: this function fails to compare highlights in docx files saved on Ubuntu (cause unknown).
def compare_highlighted_text(file1, file2):
if not file1 or not file2:
return 0
def extract_highlighted_text(file_path):
highlighted_texts = []
@@ -631,6 +712,9 @@ def compare_highlighted_text(file1, file2):
def compare_references(file1, file2, **options):
if not file1 or not file2:
return 0
reference_indicator = options.get('reference_indicator', 'References')
reference_base_result = options.get('reference_base_result', 0.5)
@@ -675,48 +759,3 @@ def compare_references(file1, file2, **options):
return (result - reference_base_result) / (1 - reference_base_result)
else:
return 0
def compare_answer(file1, file2, **options):
    """Score how closely the reference sections of two .docx files match.

    Each document is read as a list of paragraph texts. Everything after the
    first paragraph whose text equals ``reference_indicator`` is treated as
    the reference list; blank items are dropped and the remaining items are
    fuzzy-matched pairwise, in order.

    Args:
        file1: Path to the first .docx file.
        file2: Path to the second .docx file.
        **options:
            reference_indicator: Exact paragraph text that opens the
                reference section. Defaults to ``'References'``.
            reference_base_result: Mean similarity that maps to a score of
                0; higher means are rescaled linearly up to 1. Defaults to
                ``0.5``.

    Returns:
        0 if either path is missing/empty, if the files are not both
        ``.docx``, if only one document has a reference section, if the
        reference counts differ, or if the mean similarity is below
        ``reference_base_result``. 1 if neither document has a reference
        section or both sections are empty. Otherwise
        ``(mean - base) / (1 - base)``.
    """
    # Same missing-file convention as the other checkers in this file.
    if not file1 or not file2:
        return 0

    # Read the tunables from options, mirroring compare_references.
    reference_indicator = options.get('reference_indicator', 'References')
    reference_base_result = options.get('reference_base_result', 0.5)

    # Determine file types and load documents
    if file1.endswith('.docx') and file2.endswith('.docx'):
        doc1 = Document(file1)
        doc2 = Document(file2)
        doc1_paragraphs = [p.text for p in doc1.paragraphs]
        doc2_paragraphs = [p.text for p in doc2.paragraphs]
    else:
        # Unsupported file types or mismatch
        print("Unsupported file types or mismatch between file types.")
        return 0

    # Locate the references heading: index of the first paragraph equal to
    # reference_indicator, or -1 when the document has no such paragraph.
    ref1_idx = doc1_paragraphs.index(reference_indicator) if reference_indicator in doc1_paragraphs else -1
    ref2_idx = doc2_paragraphs.index(reference_indicator) if reference_indicator in doc2_paragraphs else -1

    if ref1_idx == -1 and ref2_idx == -1:
        return 1
    if ref1_idx == -1 or ref2_idx == -1:
        return 0

    # Split the reference section into items, dropping blank paragraphs.
    ref1 = [p for p in doc1_paragraphs[ref1_idx + 1:] if p.strip()]
    ref2 = [p for p in doc2_paragraphs[ref2_idx + 1:] if p.strip()]

    # Compare the references
    if len(ref1) != len(ref2):
        return 0
    if not ref1:
        # Both sections are empty: nothing differs, and averaging over zero
        # items would divide by zero.
        return 1

    # Mean fuzzy similarity over the aligned reference items, in [0, 1].
    total_similarity = sum(fuzz.ratio(r1, r2) / 100.0 for r1, r2 in zip(ref1, ref2))
    result = total_similarity / len(ref1)

    if result < reference_base_result:
        return 0
    if reference_base_result >= 1:
        # The rescaling below would divide by zero; a base this high can
        # only be met by a perfect match.
        return 1
    return (result - reference_base_result) / (1 - reference_base_result)