Fix writer examples
@@ -1,13 +1,16 @@
import logging
import os
import xml.etree.ElementTree as ET
import zipfile
import re
import xml.etree.ElementTree as ET
from typing import List, Dict, Any

from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import RGBColor
from odf.opendocument import load
from odf.text import P
from odf.text import Span
import zipfile

logger = logging.getLogger("desktopenv.metric.docs")
@@ -51,12 +54,39 @@ def contains_page_break(docx_file):

def compare_docx_files(file1, file2, ignore_blanks=True):
doc1 = Document(file1)
doc2 = Document(file2)
def get_paragraph_texts_odt(document):
paragraphs = document.getElementsByType(P)
paragraph_texts = []
for paragraph in paragraphs:
text_parts = []
for node in paragraph.childNodes:
if node.nodeType == node.TEXT_NODE:
text_parts.append(node.data)
elif node.nodeType == node.ELEMENT_NODE and node.tagName == 'text:span':
# Assuming direct text content in <text:span>, for simplicity
for child in node.childNodes:
if child.nodeType == child.TEXT_NODE:
text_parts.append(child.data)
paragraph_texts.append(''.join(text_parts))
return paragraph_texts

doc1_paragraphs = [p.text for p in doc1.paragraphs]
doc2_paragraphs = [p.text for p in doc2.paragraphs]
# Determine file types and load documents
if file1.endswith('.docx') and file2.endswith('.docx'):
doc1 = Document(file1)
doc2 = Document(file2)
doc1_paragraphs = [p.text for p in doc1.paragraphs]
doc2_paragraphs = [p.text for p in doc2.paragraphs]
elif file1.endswith('.odt') and file2.endswith('.odt'):
doc1 = load(file1)
doc2 = load(file2)
doc1_paragraphs = get_paragraph_texts_odt(doc1)
doc2_paragraphs = get_paragraph_texts_odt(doc2)
else:
# Unsupported file types or mismatch
print("Unsupported file types or mismatch between file types.")
return 0

# Process and compare documents
if ignore_blanks:
text1 = re.sub(r'\s+', ' ', '\n'.join(doc1_paragraphs)).strip()
text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip()
@@ -64,19 +94,16 @@ def compare_docx_files(file1, file2, ignore_blanks=True):
return 0
else:
if len(doc1_paragraphs) != len(doc2_paragraphs):
# print(len(doc1_paragraphs))
# print(len(doc2_paragraphs))
return 0

# Compare each paragraph
for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
if p1 != p2:
# print(p1)
# print(p2)
return 0

return 1

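A minimal usage sketch of the comparison helper extended above; the file paths are hypothetical and not part of this commit:

# Hypothetical example: each call returns 1 when the paragraph texts match, 0 otherwise.
same_odt = compare_docx_files("gold_report.odt", "agent_output.odt")
same_docx = compare_docx_files("gold_report.docx", "agent_output.docx", ignore_blanks=False)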
def compare_init_lines(file1, file2):
doc1 = Document(file1)
doc2 = Document(file2)
@@ -93,6 +120,7 @@ def compare_init_lines(file1, file2):

return 1

def compare_docx_tables(docx_file1, docx_file2):
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
@@ -217,6 +245,7 @@ def compare_contains_image(docx_file1, docx_file2):
return 0
return 1

# file1 = 'path/to/file1.docx'
# file2 = 'path/to/file2.docx'
@@ -253,26 +282,25 @@ def evaluate_colored_words_in_tables(file_path1, file_path2):

def check_highlighted_words(file_path1, file_path2):
# if not compare_docx_files(file_path1, file_path2):
# return 0

# Extract content.xml from the .odt file
extract_dir = file_path1 + "_extracted"
with zipfile.ZipFile(file_path1, 'r') as zip_ref:
zip_ref.extractall(extract_dir)
content_xml_path = os.path.join(extract_dir, 'content.xml')
with open(content_xml_path, 'r', encoding="utf-8") as file:
content_xml = file.read()

# Check for yellow highlights in the content.xml
yellow_highlight_pattern = re.compile(r'(.{0,50}background-color="#ffff00"[^>]*>.{0,50})')
yellow_highlight_matches = yellow_highlight_pattern.findall(content_xml)

# If yellow highlights are found, require that the documents otherwise match; with no highlights, fall back to the style check below
if yellow_highlight_matches:
if not compare_docx_files(file_path1, file_path2):
return 0
else:
return 1

doc = load(file_path1)
highlighted = False

for span in doc.getElementsByType(Span):
style_name = span.getAttribute('stylename')
if style_name:
for automatic_style in doc.automaticstyles.childNodes:
if automatic_style.getAttribute('name') == style_name:
for property in automatic_style.childNodes:
if property.getAttribute('backgroundcolor') == '#ffff00':
highlighted = True
break
if highlighted:
break

return 0 if highlighted else 1

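A minimal calling sketch for the highlight check above, with illustrative file names that are not from the commit:

# Hypothetical call; per the branches above, 1 means the highlight condition is satisfied, 0 means it is not.
result = check_highlighted_words("edited_document.odt", "reference_document.odt")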
def evaluate_strike_through_last_paragraph(file_path1, file_path2):
@@ -318,9 +346,9 @@ def evaluate_spacing(file_path):
body_spacing = document.paragraphs[1].paragraph_format.line_spacing
conclusion_spacing = document.paragraphs[2].paragraph_format.line_spacing
if (introduction_spacing == 1.0 and body_spacing == 2.0 and conclusion_spacing == 1.5):
return 1
return 1
else:
return 0
return 0

def check_italic_font_size_14(path1, path2):
@@ -358,13 +386,14 @@ def evaluate_alignment(docx_path):
second_part = ' '.join(words[3:])

# Check if the sentence structure matches the pattern: first part + large space/tab + second part
if not (first_part in sentence and second_part in sentence and sentence.find(first_part) < sentence.find(second_part)):
if not (first_part in sentence and second_part in sentence and sentence.find(first_part) < sentence.find(
second_part)):
return 0  # The sentence does not meet the alignment criteria

return 1  # All sentences meet the alignment criteria

def get_unique_train_ids(initial_file): #fixed standard
def get_unique_train_ids(initial_file): # fixed standard
doc = Document(initial_file)
train_ids = set()
processed_lines = 0
@@ -374,11 +403,12 @@ def get_unique_train_ids(initial_file): #fixed standard
if len(line_parts) == 4:
train_id = line_parts[1].strip()
if train_id not in train_ids:
train_ids.add(train_id)
processed_lines += 1
train_ids.add(train_id)
processed_lines += 1

return train_ids, processed_lines

def check_no_duplicates(initial_file, processed_file):
# Open the document
train_ids_ini, ini_lines = get_unique_train_ids(initial_file)
@@ -405,6 +435,7 @@ def check_no_duplicates(initial_file, processed_file):
# No duplicates found and at least one valid line was processed
return 1

def compare_docx_lines(file1, file2):
# Read the text of the document, line by line
doc1 = Document(file1)
@@ -416,21 +447,46 @@ def compare_docx_lines(file1, file2):
# print(doc2_lines)

# Convert the list of lines to sets and compare
return set(doc1_lines) == set(doc2_lines)
if set(doc1_lines) == set(doc2_lines):
return 1
else:
return 0

# Deprecated: a .docx file saved on Ubuntu cannot be compared for highlights with this function (reason unknown)
def compare_highlighted_text(file1, file2):
def extract_highlighted_text(doc):
highlighted_text = []
# Iterate through each run in each paragraph to check for highlight
for paragraph in doc.paragraphs:
for run in paragraph.runs:
if run.font.highlight_color:  # Checks if the run is highlighted
highlighted_text.append(run.text.strip())
return highlighted_text
def extract_highlighted_text(file_path):
highlighted_texts = []

# Open the .docx file as a zip file and read the document.xml
with zipfile.ZipFile(file_path, 'r') as docx:
with docx.open('word/document.xml') as document_xml:
tree = ET.parse(document_xml)
root = tree.getroot()

# Define the namespaces
namespaces = {
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
}

# Find all runs with highlight property
for run in root.findall('.//w:r', namespaces):
highlight = run.find('.//w:highlight', namespaces)
if highlight is not None and highlight.get(
'{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val') != 'none':
text = run.find('.//w:t', namespaces)
if text is not None:
highlighted_texts.append(text.text)

return highlighted_texts

# Read the highlighted text from both documents
doc1_highlighted = extract_highlighted_text(Document(file1))
doc2_highlighted = extract_highlighted_text(Document(file2))
doc1_highlighted = extract_highlighted_text(file1)
doc2_highlighted = extract_highlighted_text(file2)

# Compare the sets of highlighted text to check if they are the same
return set(doc1_highlighted) == set(doc2_highlighted)
if set(doc1_highlighted) == set(doc2_highlighted):
return 1
else:
return 0
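For orientation, a short sketch of the WordprocessingML markup that the XML-based extractor above searches for; the constant names are illustrative only, not part of the commit:

W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
# A highlighted run in word/document.xml has roughly this shape:
#   <w:r><w:rPr><w:highlight w:val="yellow"/></w:rPr><w:t>highlighted text</w:t></w:r>
# The extractor keys on the namespace-qualified attribute below and skips val="none".
HIGHLIGHT_VAL_ATTR = '{' + W_NS + '}val'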