Modify check_highlighted_words to detect yellow highlights in the document's content.xml

This commit is contained in:
thomasshin
2024-01-28 14:36:28 +08:00
parent 394d6353fd
commit 4b8cab0805
2 changed files with 27 additions and 15 deletions

View File

@@ -1,6 +1,8 @@
import logging
import os
import xml.etree.ElementTree as ET
import zipfile
import re
from typing import List, Dict, Any
from docx import Document
@@ -247,14 +249,24 @@ def evaluate_colored_words_in_tables(file_path1, file_path2):
def check_highlighted_words(file_path1, file_path2):
    """Score whether ``file_path1`` is free of yellow-highlighted text.

    The two documents are first compared with ``compare_docx_files``; a
    falsy result scores 0 immediately. Otherwise ``file_path1`` is opened as
    a zip archive (presumably an .odt file -- confirm against callers) and
    its ``content.xml`` member is searched for a
    ``background-color="#ffff00"`` attribute.

    Args:
        file_path1: Path to the document inspected for highlights.
        file_path2: Path to the document ``file_path1`` is compared against.

    Returns:
        int: 1 if the documents compare equal and no yellow highlight is
        found in ``content.xml``; 0 otherwise.

    Raises:
        FileNotFoundError: If the archive has no ``content.xml`` member.
    """
    if not compare_docx_files(file_path1, file_path2):
        return 0

    # Read content.xml straight out of the archive instead of unpacking the
    # whole archive next to the input file: nothing is left behind on disk
    # and no other archive member is ever written out.
    with zipfile.ZipFile(file_path1, 'r') as zip_ref:
        try:
            content_xml = zip_ref.read('content.xml')
        except KeyError as err:
            # Keep the exception type callers saw when the extracted file
            # was opened from disk and turned out to be missing.
            raise FileNotFoundError(
                "content.xml not found in " + str(file_path1)
            ) from err

    # The pattern is pure ASCII, so it is matched against the raw bytes; this
    # avoids depending on the platform's default text encoding. Only the
    # presence of a match matters, so a single search replaces collecting
    # every match together with its surrounding context.
    yellow_highlight_pattern = re.compile(rb'background-color="#ffff00"[^>]*>')
    if yellow_highlight_pattern.search(content_xml):
        return 0  # Yellow highlight found
    return 1  # No yellow highlight found
def evaluate_strike_through_last_paragraph(file_path1, file_path2):
@@ -415,4 +427,4 @@ def compare_highlighted_text(file1, file2):
doc2_highlighted = extract_highlighted_text(Document(file2))
# Compare the sets of highlighted text to check if they are the same
return set(doc1_highlighted) == set(doc2_highlighted)
return set(doc1_highlighted) == set(doc2_highlighted)