Fix writer examples

This commit is contained in:
Timothyxxx
2024-01-30 01:25:30 +08:00
parent 343813a29b
commit 1756d3b672
4 changed files with 108 additions and 55 deletions

View File

@@ -1,13 +1,16 @@
import logging import logging
import os import os
import xml.etree.ElementTree as ET
import zipfile
import re import re
import xml.etree.ElementTree as ET
from typing import List, Dict, Any from typing import List, Dict, Any
from docx import Document from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import RGBColor from docx.shared import RGBColor
from odf.opendocument import load
from odf.text import P
from odf.text import Span
import zipfile
logger = logging.getLogger("desktopenv.metric.docs") logger = logging.getLogger("desktopenv.metric.docs")
@@ -51,12 +54,39 @@ def contains_page_break(docx_file):
def compare_docx_files(file1, file2, ignore_blanks=True): def compare_docx_files(file1, file2, ignore_blanks=True):
doc1 = Document(file1) def get_paragraph_texts_odt(document):
doc2 = Document(file2) paragraphs = document.getElementsByType(P)
paragraph_texts = []
for paragraph in paragraphs:
text_parts = []
for node in paragraph.childNodes:
if node.nodeType == node.TEXT_NODE:
text_parts.append(node.data)
elif node.nodeType == node.ELEMENT_NODE and node.tagName == 'text:span':
# Assuming direct text content in <text:span>, for simplicity
for child in node.childNodes:
if child.nodeType == child.TEXT_NODE:
text_parts.append(child.data)
paragraph_texts.append(''.join(text_parts))
return paragraph_texts
doc1_paragraphs = [p.text for p in doc1.paragraphs] # Determine file types and load documents
doc2_paragraphs = [p.text for p in doc2.paragraphs] if file1.endswith('.docx') and file2.endswith('.docx'):
doc1 = Document(file1)
doc2 = Document(file2)
doc1_paragraphs = [p.text for p in doc1.paragraphs]
doc2_paragraphs = [p.text for p in doc2.paragraphs]
elif file1.endswith('.odt') and file2.endswith('.odt'):
doc1 = load(file1)
doc2 = load(file2)
doc1_paragraphs = get_paragraph_texts_odt(doc1)
doc2_paragraphs = get_paragraph_texts_odt(doc2)
else:
# Unsupported file types or mismatch
print("Unsupported file types or mismatch between file types.")
return 0
# Process and compare documents
if ignore_blanks: if ignore_blanks:
text1 = re.sub(r'\s+', ' ', '\n'.join(doc1_paragraphs)).strip() text1 = re.sub(r'\s+', ' ', '\n'.join(doc1_paragraphs)).strip()
text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip() text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip()
@@ -64,19 +94,16 @@ def compare_docx_files(file1, file2, ignore_blanks=True):
return 0 return 0
else: else:
if len(doc1_paragraphs) != len(doc2_paragraphs): if len(doc1_paragraphs) != len(doc2_paragraphs):
# print(len(doc1_paragraphs))
# print(len(doc2_paragraphs))
return 0 return 0
# Compare each paragraph # Compare each paragraph
for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs): for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
if p1 != p2: if p1 != p2:
# print(p1)
# print(p2)
return 0 return 0
return 1 return 1
def compare_init_lines(file1, file2): def compare_init_lines(file1, file2):
doc1 = Document(file1) doc1 = Document(file1)
doc2 = Document(file2) doc2 = Document(file2)
@@ -93,6 +120,7 @@ def compare_init_lines(file1, file2):
return 1 return 1
def compare_docx_tables(docx_file1, docx_file2): def compare_docx_tables(docx_file1, docx_file2):
doc1 = Document(docx_file1) doc1 = Document(docx_file1)
doc2 = Document(docx_file2) doc2 = Document(docx_file2)
@@ -217,6 +245,7 @@ def compare_contains_image(docx_file1, docx_file2):
return 0 return 0
return 1 return 1
# file1 = 'path/to/file1.docx' # file1 = 'path/to/file1.docx'
# file2 = 'path/to/file2.docx' # file2 = 'path/to/file2.docx'
@@ -253,26 +282,25 @@ def evaluate_colored_words_in_tables(file_path1, file_path2):
def check_highlighted_words(file_path1, file_path2): def check_highlighted_words(file_path1, file_path2):
# if not compare_docx_files(file_path1, file_path2): if not compare_docx_files(file_path1, file_path2):
# return 0
# Extract content.xml from the .odt file
extract_dir = file_path1 + "_extracted"
with zipfile.ZipFile(file_path1, 'r') as zip_ref:
zip_ref.extractall(extract_dir)
content_xml_path = os.path.join(extract_dir, 'content.xml')
with open(content_xml_path, 'r', encoding="utf-8") as file:
content_xml = file.read()
# Check for yellow highlights in the content.xml
yellow_highlight_pattern = re.compile(r'(.{0,50}background-color="#ffff00"[^>]*>.{0,50})')
yellow_highlight_matches = yellow_highlight_pattern.findall(content_xml)
# Return 0 if yellow highlights are found, otherwise return 1
if yellow_highlight_matches:
return 0 return 0
else:
return 1 doc = load(file_path1)
highlighted = False
for span in doc.getElementsByType(Span):
style_name = span.getAttribute('stylename')
if style_name:
for automatic_style in doc.automaticstyles.childNodes:
if automatic_style.getAttribute('name') == style_name:
for property in automatic_style.childNodes:
if property.getAttribute('backgroundcolor') == '#ffff00':
highlighted = True
break
if highlighted:
break
return 0 if highlighted else 1
def evaluate_strike_through_last_paragraph(file_path1, file_path2): def evaluate_strike_through_last_paragraph(file_path1, file_path2):
@@ -318,9 +346,9 @@ def evaluate_spacing(file_path):
body_spacing = document.paragraphs[1].paragraph_format.line_spacing body_spacing = document.paragraphs[1].paragraph_format.line_spacing
conclusion_spacing = document.paragraphs[2].paragraph_format.line_spacing conclusion_spacing = document.paragraphs[2].paragraph_format.line_spacing
if (introduction_spacing == 1.0 and body_spacing == 2.0 and conclusion_spacing == 1.5): if (introduction_spacing == 1.0 and body_spacing == 2.0 and conclusion_spacing == 1.5):
return 1 return 1
else: else:
return 0 return 0
def check_italic_font_size_14(path1, path2): def check_italic_font_size_14(path1, path2):
@@ -358,13 +386,14 @@ def evaluate_alignment(docx_path):
second_part = ' '.join(words[3:]) second_part = ' '.join(words[3:])
# Check if the sentence structure matches the pattern: first part + large space/tab + second part # Check if the sentence structure matches the pattern: first part + large space/tab + second part
if not (first_part in sentence and second_part in sentence and sentence.find(first_part) < sentence.find(second_part)): if not (first_part in sentence and second_part in sentence and sentence.find(first_part) < sentence.find(
second_part)):
return 0 # The sentence does not meet the alignment criteria return 0 # The sentence does not meet the alignment criteria
return 1 # All sentences meet the alignment criteria return 1 # All sentences meet the alignment criteria
def get_unique_train_ids(initial_file): #fixed standard def get_unique_train_ids(initial_file): # fixed standard
doc = Document(initial_file) doc = Document(initial_file)
train_ids = set() train_ids = set()
processed_lines = 0 processed_lines = 0
@@ -374,11 +403,12 @@ def get_unique_train_ids(initial_file): #fixed standard
if len(line_parts) == 4: if len(line_parts) == 4:
train_id = line_parts[1].strip() train_id = line_parts[1].strip()
if train_id not in train_ids: if train_id not in train_ids:
train_ids.add(train_id) train_ids.add(train_id)
processed_lines += 1 processed_lines += 1
return train_ids, processed_lines return train_ids, processed_lines
def check_no_duplicates(initial_file, processed_file): def check_no_duplicates(initial_file, processed_file):
# Open the document # Open the document
train_ids_ini, ini_lines = get_unique_train_ids(initial_file) train_ids_ini, ini_lines = get_unique_train_ids(initial_file)
@@ -405,6 +435,7 @@ def check_no_duplicates(initial_file, processed_file):
# No duplicates found and at least one valid line was processed # No duplicates found and at least one valid line was processed
return 1 return 1
def compare_docx_lines(file1, file2): def compare_docx_lines(file1, file2):
# Read the text of the document, line by line # Read the text of the document, line by line
doc1 = Document(file1) doc1 = Document(file1)
@@ -416,21 +447,46 @@ def compare_docx_lines(file1, file2):
# print(doc2_lines) # print(doc2_lines)
# Convert the list of lines to sets and compare # Convert the list of lines to sets and compare
return set(doc1_lines) == set(doc2_lines) if set(doc1_lines) == set(doc2_lines):
return 1
else:
return 0
# DOCX files saved on Ubuntu cannot be compared for highlights with this function (reason unknown) — deprecated
def compare_highlighted_text(file1, file2): def compare_highlighted_text(file1, file2):
def extract_highlighted_text(doc): def extract_highlighted_text(file_path):
highlighted_text = [] highlighted_texts = []
# Iterate through each run in each paragraph to check for highlight
for paragraph in doc.paragraphs: # Open the .docx file as a zip file and read the document.xml
for run in paragraph.runs: with zipfile.ZipFile(file_path, 'r') as docx:
if run.font.highlight_color: # Checks if the run is highlighted with docx.open('word/document.xml') as document_xml:
highlighted_text.append(run.text.strip()) tree = ET.parse(document_xml)
return highlighted_text root = tree.getroot()
# Define the namespaces
namespaces = {
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
}
# Find all runs with highlight property
for run in root.findall('.//w:r', namespaces):
highlight = run.find('.//w:highlight', namespaces)
if highlight is not None and highlight.get(
'{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val') != 'none':
text = run.find('.//w:t', namespaces)
if text is not None:
highlighted_texts.append(text.text)
return highlighted_texts
# Read the highlighted text from both documents # Read the highlighted text from both documents
doc1_highlighted = extract_highlighted_text(Document(file1)) doc1_highlighted = extract_highlighted_text(file1)
doc2_highlighted = extract_highlighted_text(Document(file2)) doc2_highlighted = extract_highlighted_text(file2)
# Compare the sets of highlighted text to check if they are the same # Compare the sets of highlighted text to check if they are the same
return set(doc1_highlighted) == set(doc2_highlighted) if set(doc1_highlighted) == set(doc2_highlighted):
return 1
else:
return 0

View File

@@ -1,7 +1,7 @@
{ {
"id": "41c621f7-3544-49e1-af8d-dafd0f834f75", "id": "41c621f7-3544-49e1-af8d-dafd0f834f75",
"snapshot": "libreoffice_writer", "snapshot": "libreoffice_writer",
"instruction": "I am adding comments which begin with \"#\" right beside sentences my students have written. I want to make my comments to stand out more, so I am highlighting my comments to yellow. Could you help me on this? It is hard for me to color comments one by one.", "instruction": "I added annotations starting with a \"#\" next to the sentences that my students wrote. I want my annotations to stand out more, so I highlighted them in yellow. Can you help me with this favor? I'm having a hard time coloring the annotations one by one. By the way, remember to remove the # sign after highlighting the text part. Thanks!",
"source": "https://superuser.com/questions/1668018/how-to-auto-format-lines-in-libre-office-writer", "source": "https://superuser.com/questions/1668018/how-to-auto-format-lines-in-libre-office-writer",
"config": [ "config": [
{ {
@@ -52,10 +52,7 @@
} }
} }
], ],
"func": [ "func": "compare_highlighted_text",
"compare_highlighted_text",
"compare_docx_files"
],
"expected": { "expected": {
"type": "cloud_file", "type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1x2gplRDN1AOnRWiqfqmJqN9JTQ_x4sPu&export=download&authuser=0&confirm=t&uuid=ea6b0a61-fd06-4823-b253-05473c77c192&at=APZUnTUxWWFB_TOKLR9chabErm7b:1706018376493", "path": "https://drive.usercontent.google.com/download?id=1x2gplRDN1AOnRWiqfqmJqN9JTQ_x4sPu&export=download&authuser=0&confirm=t&uuid=ea6b0a61-fd06-4823-b253-05473c77c192&at=APZUnTUxWWFB_TOKLR9chabErm7b:1706018376493",

View File

@@ -9,7 +9,7 @@
"parameters": { "parameters": {
"files": [ "files": [
{ {
"url": "https://drive.google.com/uc?id=1Ul4mtQ4SpUNLVDlaiJThPprBe6WTGZ2L&export=download", "url": "https://drive.usercontent.google.com/download?id=1_1uItoxJsp8PgBqaED0U3bOfKxIweNkJ&export=download&authuser=0&confirm=t&uuid=facec3ed-2fd6-49f7-aeda-95feaa38f45c&at=APZUnTU6B0xoLG-kI0V2WZ74oSI6:1706544439742",
"path": "Desktop/sample-recruitment-phone-script.odt" "path": "Desktop/sample-recruitment-phone-script.odt"
} }
] ]
@@ -55,7 +55,7 @@
"func": "check_highlighted_words", "func": "check_highlighted_words",
"expected": { "expected": {
"type": "cloud_file", "type": "cloud_file",
"path": "https://drive.google.com/uc?id=12iMkgCYuUyhKUXux96kANLIeud0Wz9ct&export=download", "path": "https://drive.usercontent.google.com/download?id=1Qxz3EjHuiLa8yZaCr1DJHwEOSd6U-cjY&export=download&authuser=0&confirm=t&uuid=c502680e-51f9-4e4f-90d0-58b123a09b3d&at=APZUnTXN2UX9EuvB1PZoJ5koJwmQ:1706544440969",
"dest": "sample-recruitment-phone-script_Gold.odt" "dest": "sample-recruitment-phone-script_Gold.odt"
}, },
"result": { "result": {

View File

@@ -36,5 +36,5 @@ backoff
formulas formulas
pydrive pydrive
fastdtw fastdtw
odfpy
openai openai