From 1756d3b67244174c4dae07ce514c9f481a0b2ee4 Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Tue, 30 Jan 2024 01:25:30 +0800 Subject: [PATCH] Fix writer examples --- desktop_env/evaluators/metrics/docs.py | 150 ++++++++++++------ .../41c621f7-3544-49e1-af8d-dafd0f834f75.json | 7 +- .../6a33f9b9-0a56-4844-9c3f-96ec3ffb3ba2.json | 4 +- requirements.txt | 2 +- 4 files changed, 108 insertions(+), 55 deletions(-) diff --git a/desktop_env/evaluators/metrics/docs.py b/desktop_env/evaluators/metrics/docs.py index 461a9ad..e66e63f 100644 --- a/desktop_env/evaluators/metrics/docs.py +++ b/desktop_env/evaluators/metrics/docs.py @@ -1,13 +1,16 @@ import logging import os -import xml.etree.ElementTree as ET -import zipfile import re +import xml.etree.ElementTree as ET from typing import List, Dict, Any from docx import Document from docx.enum.text import WD_PARAGRAPH_ALIGNMENT from docx.shared import RGBColor +from odf.opendocument import load +from odf.text import P +from odf.text import Span +import zipfile logger = logging.getLogger("desktopenv.metric.docs") @@ -51,12 +54,39 @@ def contains_page_break(docx_file): def compare_docx_files(file1, file2, ignore_blanks=True): - doc1 = Document(file1) - doc2 = Document(file2) + def get_paragraph_texts_odt(document): + paragraphs = document.getElementsByType(P) + paragraph_texts = [] + for paragraph in paragraphs: + text_parts = [] + for node in paragraph.childNodes: + if node.nodeType == node.TEXT_NODE: + text_parts.append(node.data) + elif node.nodeType == node.ELEMENT_NODE and node.tagName == 'text:span': + # Assuming direct text content in , for simplicity + for child in node.childNodes: + if child.nodeType == child.TEXT_NODE: + text_parts.append(child.data) + paragraph_texts.append(''.join(text_parts)) + return paragraph_texts - doc1_paragraphs = [p.text for p in doc1.paragraphs] - doc2_paragraphs = [p.text for p in doc2.paragraphs] + # Determine file types and load documents + if file1.endswith('.docx') and file2.endswith('.docx'): + doc1 = Document(file1) + doc2 = Document(file2) + doc1_paragraphs = [p.text for p in doc1.paragraphs] + doc2_paragraphs = [p.text for p in doc2.paragraphs] + elif file1.endswith('.odt') and file2.endswith('.odt'): + doc1 = load(file1) + doc2 = load(file2) + doc1_paragraphs = get_paragraph_texts_odt(doc1) + doc2_paragraphs = get_paragraph_texts_odt(doc2) + else: + # Unsupported file types or mismatch + print("Unsupported file types or mismatch between file types.") + return 0 + # Process and compare documents if ignore_blanks: text1 = re.sub(r'\s+', ' ', '\n'.join(doc1_paragraphs)).strip() text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip() @@ -64,19 +94,16 @@ def compare_docx_files(file1, file2, ignore_blanks=True): return 0 else: if len(doc1_paragraphs) != len(doc2_paragraphs): - # print(len(doc1_paragraphs)) - # print(len(doc2_paragraphs)) return 0 # Compare each paragraph for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs): if p1 != p2: - # print(p1) - # print(p2) return 0 return 1 + def compare_init_lines(file1, file2): doc1 = Document(file1) doc2 = Document(file2) @@ -93,6 +120,7 @@ def compare_init_lines(file1, file2): return 1 + def compare_docx_tables(docx_file1, docx_file2): doc1 = Document(docx_file1) doc2 = Document(docx_file2) @@ -217,6 +245,7 @@ def compare_contains_image(docx_file1, docx_file2): return 0 return 1 + # file1 = 'path/to/file1.docx' # file2 = 'path/to/file2.docx' @@ -253,26 +282,25 @@ def evaluate_colored_words_in_tables(file_path1, file_path2): def check_highlighted_words(file_path1, file_path2): - # if not compare_docx_files(file_path1, file_path2): - # return 0 - - # Extract content.xml from the .odt file - extract_dir = file_path1 + "_extracted" - with zipfile.ZipFile(file_path1, 'r') as zip_ref: - zip_ref.extractall(extract_dir) - content_xml_path = os.path.join(extract_dir, 'content.xml') - with open(content_xml_path, 'r', encoding="utf-8") as file: - content_xml = file.read() - - # Check for yellow highlights in the content.xml - yellow_highlight_pattern = re.compile(r'(.{0,50}background-color="#ffff00"[^>]*>.{0,50})') - yellow_highlight_matches = yellow_highlight_pattern.findall(content_xml) - - # Return True if yellow highlights are NOT found, otherwise True - if yellow_highlight_matches: + if not compare_docx_files(file_path1, file_path2): return 0 - else: - return 1 + + doc = load(file_path1) + highlighted = False + + for span in doc.getElementsByType(Span): + style_name = span.getAttribute('stylename') + if style_name: + for automatic_style in doc.automaticstyles.childNodes: + if automatic_style.getAttribute('name') == style_name: + for property in automatic_style.childNodes: + if property.getAttribute('backgroundcolor') == '#ffff00': + highlighted = True + break + if highlighted: + break + + return 0 if highlighted else 1 def evaluate_strike_through_last_paragraph(file_path1, file_path2): @@ -318,9 +346,9 @@ def evaluate_spacing(file_path): body_spacing = document.paragraphs[1].paragraph_format.line_spacing conclusion_spacing = document.paragraphs[2].paragraph_format.line_spacing if (introduction_spacing == 1.0 and body_spacing == 2.0 and conclusion_spacing == 1.5): - return 1 + return 1 else: - return 0 + return 0 def check_italic_font_size_14(path1, path2): @@ -358,13 +386,14 @@ def evaluate_alignment(docx_path): second_part = ' '.join(words[3:]) # Check if the sentence structure matches the pattern: first part + large space/tab + second part - if not (first_part in sentence and second_part in sentence and sentence.find(first_part) < sentence.find(second_part)): + if not (first_part in sentence and second_part in sentence and sentence.find(first_part) < sentence.find( + second_part)): return 0 # The sentence does not meet the alignment criteria return 1 # All sentences meet the alignment criteria -def get_unique_train_ids(initial_file): #fixed standard +def get_unique_train_ids(initial_file): # fixed standard doc = Document(initial_file) train_ids = set() processed_lines = 0 @@ -374,11 +403,12 @@ def get_unique_train_ids(initial_file): #fixed standard if len(line_parts) == 4: train_id = line_parts[1].strip() if train_id not in train_ids: - train_ids.add(train_id) - processed_lines += 1 + train_ids.add(train_id) + processed_lines += 1 return train_ids, processed_lines + def check_no_duplicates(initial_file, processed_file): # Open the document train_ids_ini, ini_lines = get_unique_train_ids(initial_file) @@ -405,6 +435,7 @@ def check_no_duplicates(initial_file, processed_file): # No duplicates found and at least one valid line was processed return 1 + def compare_docx_lines(file1, file2): # Read the text of the document, line by line doc1 = Document(file1) @@ -416,21 +447,46 @@ def compare_docx_lines(file1, file2): # print(doc2_lines) # Convert the list of lines to sets and compare - return set(doc1_lines) == set(doc2_lines) + if set(doc1_lines) == set(doc2_lines): + return 1 + else: + return 0 + +# Docx file saved in the ubuntu cannot use this function to compare highlight, don't know why, deprecated def compare_highlighted_text(file1, file2): - def extract_highlighted_text(doc): - highlighted_text = [] - # Iterate through each run in each paragraph to check for highlight - for paragraph in doc.paragraphs: - for run in paragraph.runs: - if run.font.highlight_color: # Checks if the run is highlighted - highlighted_text.append(run.text.strip()) - return highlighted_text + def extract_highlighted_text(file_path): + highlighted_texts = [] + + # Open the .docx file as a zip file and read the document.xml + with zipfile.ZipFile(file_path, 'r') as docx: + with docx.open('word/document.xml') as document_xml: + tree = ET.parse(document_xml) + root = tree.getroot() + + # Define the namespaces + namespaces = { + 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main', + } + + # Find all runs with highlight property + for run in root.findall('.//w:r', namespaces): + highlight = run.find('.//w:highlight', namespaces) + if highlight is not None and highlight.get( + '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val') != 'none': + text = run.find('.//w:t', namespaces) + if text is not None: + highlighted_texts.append(text.text) + + return highlighted_texts # Read the highlighted text from both documents - doc1_highlighted = extract_highlighted_text(Document(file1)) - doc2_highlighted = extract_highlighted_text(Document(file2)) + doc1_highlighted = extract_highlighted_text(file1) + doc2_highlighted = extract_highlighted_text(file2) # Compare the sets of highlighted text to check if they are the same - return set(doc1_highlighted) == set(doc2_highlighted) \ No newline at end of file + if set(doc1_highlighted) == set(doc2_highlighted): + return 1 + else: + return 0 + diff --git a/evaluation_examples/examples/libreoffice_writer/41c621f7-3544-49e1-af8d-dafd0f834f75.json b/evaluation_examples/examples/libreoffice_writer/41c621f7-3544-49e1-af8d-dafd0f834f75.json index 445cc3f..b59cd82 100644 --- a/evaluation_examples/examples/libreoffice_writer/41c621f7-3544-49e1-af8d-dafd0f834f75.json +++ b/evaluation_examples/examples/libreoffice_writer/41c621f7-3544-49e1-af8d-dafd0f834f75.json @@ -1,7 +1,7 @@ { "id": "41c621f7-3544-49e1-af8d-dafd0f834f75", "snapshot": "libreoffice_writer", - "instruction": "I am adding comments which begin with \"#\" right beside sentences my students have written. I want to make my comments to stand out more, so I am highlighting my comments to yellow. Could you help me on this? It is hard for me to color comments one by one.", + "instruction": "I added annotations starting with a \"#\" next to the sentences that my students wrote. I want my annotations to stand out more, so I highlighted them in yellow. Can you help me with this favor? I'm having a hard time coloring the annotations one by one. By the way, remember to remove the # sign after highlighting the text part. Thanks!", "source": "https://superuser.com/questions/1668018/how-to-auto-format-lines-in-libre-office-writer", "config": [ { @@ -52,10 +52,7 @@ } } ], - "func": [ - "compare_highlighted_text", - "compare_docx_files" - ], + "func": "compare_highlighted_text", "expected": { "type": "cloud_file", "path": "https://drive.usercontent.google.com/download?id=1x2gplRDN1AOnRWiqfqmJqN9JTQ_x4sPu&export=download&authuser=0&confirm=t&uuid=ea6b0a61-fd06-4823-b253-05473c77c192&at=APZUnTUxWWFB_TOKLR9chabErm7b:1706018376493", diff --git a/evaluation_examples/examples/libreoffice_writer/6a33f9b9-0a56-4844-9c3f-96ec3ffb3ba2.json b/evaluation_examples/examples/libreoffice_writer/6a33f9b9-0a56-4844-9c3f-96ec3ffb3ba2.json index 74ff2a1..d876b97 100644 --- a/evaluation_examples/examples/libreoffice_writer/6a33f9b9-0a56-4844-9c3f-96ec3ffb3ba2.json +++ b/evaluation_examples/examples/libreoffice_writer/6a33f9b9-0a56-4844-9c3f-96ec3ffb3ba2.json @@ -9,7 +9,7 @@ "parameters": { "files": [ { - "url": "https://drive.google.com/uc?id=1Ul4mtQ4SpUNLVDlaiJThPprBe6WTGZ2L&export=download", + "url": "https://drive.usercontent.google.com/download?id=1_1uItoxJsp8PgBqaED0U3bOfKxIweNkJ&export=download&authuser=0&confirm=t&uuid=facec3ed-2fd6-49f7-aeda-95feaa38f45c&at=APZUnTU6B0xoLG-kI0V2WZ74oSI6:1706544439742", "path": "Desktop/sample-recruitment-phone-script.odt" } ] @@ -55,7 +55,7 @@ "func": "check_highlighted_words", "expected": { "type": "cloud_file", - "path": "https://drive.google.com/uc?id=12iMkgCYuUyhKUXux96kANLIeud0Wz9ct&export=download", + "path": "https://drive.usercontent.google.com/download?id=1Qxz3EjHuiLa8yZaCr1DJHwEOSd6U-cjY&export=download&authuser=0&confirm=t&uuid=c502680e-51f9-4e4f-90d0-58b123a09b3d&at=APZUnTXN2UX9EuvB1PZoJ5koJwmQ:1706544440969", "dest": "sample-recruitment-phone-script_Gold.odt" }, "result": { diff --git a/requirements.txt b/requirements.txt index b494102..1abecf8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -36,5 +36,5 @@ backoff formulas pydrive fastdtw - +odfpy openai