Fix writer examples
This commit is contained in:
@@ -1,13 +1,16 @@
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import xml.etree.ElementTree as ET
|
|
||||||
import zipfile
|
|
||||||
import re
|
import re
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
from typing import List, Dict, Any
|
from typing import List, Dict, Any
|
||||||
|
|
||||||
from docx import Document
|
from docx import Document
|
||||||
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
|
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
|
||||||
from docx.shared import RGBColor
|
from docx.shared import RGBColor
|
||||||
|
from odf.opendocument import load
|
||||||
|
from odf.text import P
|
||||||
|
from odf.text import Span
|
||||||
|
import zipfile
|
||||||
|
|
||||||
logger = logging.getLogger("desktopenv.metric.docs")
|
logger = logging.getLogger("desktopenv.metric.docs")
|
||||||
|
|
||||||
@@ -51,12 +54,39 @@ def contains_page_break(docx_file):
|
|||||||
|
|
||||||
|
|
||||||
def compare_docx_files(file1, file2, ignore_blanks=True):
|
def compare_docx_files(file1, file2, ignore_blanks=True):
|
||||||
doc1 = Document(file1)
|
def get_paragraph_texts_odt(document):
|
||||||
doc2 = Document(file2)
|
paragraphs = document.getElementsByType(P)
|
||||||
|
paragraph_texts = []
|
||||||
|
for paragraph in paragraphs:
|
||||||
|
text_parts = []
|
||||||
|
for node in paragraph.childNodes:
|
||||||
|
if node.nodeType == node.TEXT_NODE:
|
||||||
|
text_parts.append(node.data)
|
||||||
|
elif node.nodeType == node.ELEMENT_NODE and node.tagName == 'text:span':
|
||||||
|
# Assuming direct text content in <text:span>, for simplicity
|
||||||
|
for child in node.childNodes:
|
||||||
|
if child.nodeType == child.TEXT_NODE:
|
||||||
|
text_parts.append(child.data)
|
||||||
|
paragraph_texts.append(''.join(text_parts))
|
||||||
|
return paragraph_texts
|
||||||
|
|
||||||
doc1_paragraphs = [p.text for p in doc1.paragraphs]
|
# Determine file types and load documents
|
||||||
doc2_paragraphs = [p.text for p in doc2.paragraphs]
|
if file1.endswith('.docx') and file2.endswith('.docx'):
|
||||||
|
doc1 = Document(file1)
|
||||||
|
doc2 = Document(file2)
|
||||||
|
doc1_paragraphs = [p.text for p in doc1.paragraphs]
|
||||||
|
doc2_paragraphs = [p.text for p in doc2.paragraphs]
|
||||||
|
elif file1.endswith('.odt') and file2.endswith('.odt'):
|
||||||
|
doc1 = load(file1)
|
||||||
|
doc2 = load(file2)
|
||||||
|
doc1_paragraphs = get_paragraph_texts_odt(doc1)
|
||||||
|
doc2_paragraphs = get_paragraph_texts_odt(doc2)
|
||||||
|
else:
|
||||||
|
# Unsupported file types or mismatch
|
||||||
|
print("Unsupported file types or mismatch between file types.")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
# Process and compare documents
|
||||||
if ignore_blanks:
|
if ignore_blanks:
|
||||||
text1 = re.sub(r'\s+', ' ', '\n'.join(doc1_paragraphs)).strip()
|
text1 = re.sub(r'\s+', ' ', '\n'.join(doc1_paragraphs)).strip()
|
||||||
text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip()
|
text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip()
|
||||||
@@ -64,19 +94,16 @@ def compare_docx_files(file1, file2, ignore_blanks=True):
|
|||||||
return 0
|
return 0
|
||||||
else:
|
else:
|
||||||
if len(doc1_paragraphs) != len(doc2_paragraphs):
|
if len(doc1_paragraphs) != len(doc2_paragraphs):
|
||||||
# print(len(doc1_paragraphs))
|
|
||||||
# print(len(doc2_paragraphs))
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
# Compare each paragraph
|
# Compare each paragraph
|
||||||
for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
|
for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
|
||||||
if p1 != p2:
|
if p1 != p2:
|
||||||
# print(p1)
|
|
||||||
# print(p2)
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
def compare_init_lines(file1, file2):
|
def compare_init_lines(file1, file2):
|
||||||
doc1 = Document(file1)
|
doc1 = Document(file1)
|
||||||
doc2 = Document(file2)
|
doc2 = Document(file2)
|
||||||
@@ -93,6 +120,7 @@ def compare_init_lines(file1, file2):
|
|||||||
|
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
def compare_docx_tables(docx_file1, docx_file2):
|
def compare_docx_tables(docx_file1, docx_file2):
|
||||||
doc1 = Document(docx_file1)
|
doc1 = Document(docx_file1)
|
||||||
doc2 = Document(docx_file2)
|
doc2 = Document(docx_file2)
|
||||||
@@ -217,6 +245,7 @@ def compare_contains_image(docx_file1, docx_file2):
|
|||||||
return 0
|
return 0
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
# file1 = 'path/to/file1.docx'
|
# file1 = 'path/to/file1.docx'
|
||||||
# file2 = 'path/to/file2.docx'
|
# file2 = 'path/to/file2.docx'
|
||||||
|
|
||||||
@@ -253,26 +282,25 @@ def evaluate_colored_words_in_tables(file_path1, file_path2):
|
|||||||
|
|
||||||
|
|
||||||
def check_highlighted_words(file_path1, file_path2):
|
def check_highlighted_words(file_path1, file_path2):
|
||||||
# if not compare_docx_files(file_path1, file_path2):
|
if not compare_docx_files(file_path1, file_path2):
|
||||||
# return 0
|
|
||||||
|
|
||||||
# Extract content.xml from the .odt file
|
|
||||||
extract_dir = file_path1 + "_extracted"
|
|
||||||
with zipfile.ZipFile(file_path1, 'r') as zip_ref:
|
|
||||||
zip_ref.extractall(extract_dir)
|
|
||||||
content_xml_path = os.path.join(extract_dir, 'content.xml')
|
|
||||||
with open(content_xml_path, 'r', encoding="utf-8") as file:
|
|
||||||
content_xml = file.read()
|
|
||||||
|
|
||||||
# Check for yellow highlights in the content.xml
|
|
||||||
yellow_highlight_pattern = re.compile(r'(.{0,50}background-color="#ffff00"[^>]*>.{0,50})')
|
|
||||||
yellow_highlight_matches = yellow_highlight_pattern.findall(content_xml)
|
|
||||||
|
|
||||||
# Return 0 if yellow highlights are found, otherwise 1
|
|
||||||
if yellow_highlight_matches:
|
|
||||||
return 0
|
return 0
|
||||||
else:
|
|
||||||
return 1
|
doc = load(file_path1)
|
||||||
|
highlighted = False
|
||||||
|
|
||||||
|
for span in doc.getElementsByType(Span):
|
||||||
|
style_name = span.getAttribute('stylename')
|
||||||
|
if style_name:
|
||||||
|
for automatic_style in doc.automaticstyles.childNodes:
|
||||||
|
if automatic_style.getAttribute('name') == style_name:
|
||||||
|
for property in automatic_style.childNodes:
|
||||||
|
if property.getAttribute('backgroundcolor') == '#ffff00':
|
||||||
|
highlighted = True
|
||||||
|
break
|
||||||
|
if highlighted:
|
||||||
|
break
|
||||||
|
|
||||||
|
return 0 if highlighted else 1
|
||||||
|
|
||||||
|
|
||||||
def evaluate_strike_through_last_paragraph(file_path1, file_path2):
|
def evaluate_strike_through_last_paragraph(file_path1, file_path2):
|
||||||
@@ -318,9 +346,9 @@ def evaluate_spacing(file_path):
|
|||||||
body_spacing = document.paragraphs[1].paragraph_format.line_spacing
|
body_spacing = document.paragraphs[1].paragraph_format.line_spacing
|
||||||
conclusion_spacing = document.paragraphs[2].paragraph_format.line_spacing
|
conclusion_spacing = document.paragraphs[2].paragraph_format.line_spacing
|
||||||
if (introduction_spacing == 1.0 and body_spacing == 2.0 and conclusion_spacing == 1.5):
|
if (introduction_spacing == 1.0 and body_spacing == 2.0 and conclusion_spacing == 1.5):
|
||||||
return 1
|
return 1
|
||||||
else:
|
else:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
def check_italic_font_size_14(path1, path2):
|
def check_italic_font_size_14(path1, path2):
|
||||||
@@ -358,13 +386,14 @@ def evaluate_alignment(docx_path):
|
|||||||
second_part = ' '.join(words[3:])
|
second_part = ' '.join(words[3:])
|
||||||
|
|
||||||
# Check if the sentence structure matches the pattern: first part + large space/tab + second part
|
# Check if the sentence structure matches the pattern: first part + large space/tab + second part
|
||||||
if not (first_part in sentence and second_part in sentence and sentence.find(first_part) < sentence.find(second_part)):
|
if not (first_part in sentence and second_part in sentence and sentence.find(first_part) < sentence.find(
|
||||||
|
second_part)):
|
||||||
return 0 # The sentence does not meet the alignment criteria
|
return 0 # The sentence does not meet the alignment criteria
|
||||||
|
|
||||||
return 1 # All sentences meet the alignment criteria
|
return 1 # All sentences meet the alignment criteria
|
||||||
|
|
||||||
|
|
||||||
def get_unique_train_ids(initial_file): #fixed standard
|
def get_unique_train_ids(initial_file): # fixed standard
|
||||||
doc = Document(initial_file)
|
doc = Document(initial_file)
|
||||||
train_ids = set()
|
train_ids = set()
|
||||||
processed_lines = 0
|
processed_lines = 0
|
||||||
@@ -374,11 +403,12 @@ def get_unique_train_ids(initial_file): #fixed standard
|
|||||||
if len(line_parts) == 4:
|
if len(line_parts) == 4:
|
||||||
train_id = line_parts[1].strip()
|
train_id = line_parts[1].strip()
|
||||||
if train_id not in train_ids:
|
if train_id not in train_ids:
|
||||||
train_ids.add(train_id)
|
train_ids.add(train_id)
|
||||||
processed_lines += 1
|
processed_lines += 1
|
||||||
|
|
||||||
return train_ids, processed_lines
|
return train_ids, processed_lines
|
||||||
|
|
||||||
|
|
||||||
def check_no_duplicates(initial_file, processed_file):
|
def check_no_duplicates(initial_file, processed_file):
|
||||||
# Open the document
|
# Open the document
|
||||||
train_ids_ini, ini_lines = get_unique_train_ids(initial_file)
|
train_ids_ini, ini_lines = get_unique_train_ids(initial_file)
|
||||||
@@ -405,6 +435,7 @@ def check_no_duplicates(initial_file, processed_file):
|
|||||||
# No duplicates found and at least one valid line was processed
|
# No duplicates found and at least one valid line was processed
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
def compare_docx_lines(file1, file2):
|
def compare_docx_lines(file1, file2):
|
||||||
# Read the text of the document, line by line
|
# Read the text of the document, line by line
|
||||||
doc1 = Document(file1)
|
doc1 = Document(file1)
|
||||||
@@ -416,21 +447,46 @@ def compare_docx_lines(file1, file2):
|
|||||||
# print(doc2_lines)
|
# print(doc2_lines)
|
||||||
|
|
||||||
# Convert the list of lines to sets and compare
|
# Convert the list of lines to sets and compare
|
||||||
return set(doc1_lines) == set(doc2_lines)
|
if set(doc1_lines) == set(doc2_lines):
|
||||||
|
return 1
|
||||||
|
else:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Deprecated: this function cannot compare highlights in .docx files saved on Ubuntu (reason unknown)
|
||||||
def compare_highlighted_text(file1, file2):
|
def compare_highlighted_text(file1, file2):
|
||||||
def extract_highlighted_text(doc):
|
def extract_highlighted_text(file_path):
|
||||||
highlighted_text = []
|
highlighted_texts = []
|
||||||
# Iterate through each run in each paragraph to check for highlight
|
|
||||||
for paragraph in doc.paragraphs:
|
# Open the .docx file as a zip file and read the document.xml
|
||||||
for run in paragraph.runs:
|
with zipfile.ZipFile(file_path, 'r') as docx:
|
||||||
if run.font.highlight_color: # Checks if the run is highlighted
|
with docx.open('word/document.xml') as document_xml:
|
||||||
highlighted_text.append(run.text.strip())
|
tree = ET.parse(document_xml)
|
||||||
return highlighted_text
|
root = tree.getroot()
|
||||||
|
|
||||||
|
# Define the namespaces
|
||||||
|
namespaces = {
|
||||||
|
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
|
||||||
|
}
|
||||||
|
|
||||||
|
# Find all runs with highlight property
|
||||||
|
for run in root.findall('.//w:r', namespaces):
|
||||||
|
highlight = run.find('.//w:highlight', namespaces)
|
||||||
|
if highlight is not None and highlight.get(
|
||||||
|
'{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val') != 'none':
|
||||||
|
text = run.find('.//w:t', namespaces)
|
||||||
|
if text is not None:
|
||||||
|
highlighted_texts.append(text.text)
|
||||||
|
|
||||||
|
return highlighted_texts
|
||||||
|
|
||||||
# Read the highlighted text from both documents
|
# Read the highlighted text from both documents
|
||||||
doc1_highlighted = extract_highlighted_text(Document(file1))
|
doc1_highlighted = extract_highlighted_text(file1)
|
||||||
doc2_highlighted = extract_highlighted_text(Document(file2))
|
doc2_highlighted = extract_highlighted_text(file2)
|
||||||
|
|
||||||
# Compare the sets of highlighted text to check if they are the same
|
# Compare the sets of highlighted text to check if they are the same
|
||||||
return set(doc1_highlighted) == set(doc2_highlighted)
|
if set(doc1_highlighted) == set(doc2_highlighted):
|
||||||
|
return 1
|
||||||
|
else:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
{
|
{
|
||||||
"id": "41c621f7-3544-49e1-af8d-dafd0f834f75",
|
"id": "41c621f7-3544-49e1-af8d-dafd0f834f75",
|
||||||
"snapshot": "libreoffice_writer",
|
"snapshot": "libreoffice_writer",
|
||||||
"instruction": "I am adding comments which begin with \"#\" right beside sentences my students have written. I want to make my comments to stand out more, so I am highlighting my comments to yellow. Could you help me on this? It is hard for me to color comments one by one.",
|
"instruction": "I added annotations starting with a \"#\" next to the sentences that my students wrote. I want my annotations to stand out more, so I highlighted them in yellow. Can you help me with this favor? I'm having a hard time coloring the annotations one by one. By the way, remember to remove the # sign after highlighting the text part. Thanks!",
|
||||||
"source": "https://superuser.com/questions/1668018/how-to-auto-format-lines-in-libre-office-writer",
|
"source": "https://superuser.com/questions/1668018/how-to-auto-format-lines-in-libre-office-writer",
|
||||||
"config": [
|
"config": [
|
||||||
{
|
{
|
||||||
@@ -52,10 +52,7 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"func": [
|
"func": "compare_highlighted_text",
|
||||||
"compare_highlighted_text",
|
|
||||||
"compare_docx_files"
|
|
||||||
],
|
|
||||||
"expected": {
|
"expected": {
|
||||||
"type": "cloud_file",
|
"type": "cloud_file",
|
||||||
"path": "https://drive.usercontent.google.com/download?id=1x2gplRDN1AOnRWiqfqmJqN9JTQ_x4sPu&export=download&authuser=0&confirm=t&uuid=ea6b0a61-fd06-4823-b253-05473c77c192&at=APZUnTUxWWFB_TOKLR9chabErm7b:1706018376493",
|
"path": "https://drive.usercontent.google.com/download?id=1x2gplRDN1AOnRWiqfqmJqN9JTQ_x4sPu&export=download&authuser=0&confirm=t&uuid=ea6b0a61-fd06-4823-b253-05473c77c192&at=APZUnTUxWWFB_TOKLR9chabErm7b:1706018376493",
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
"parameters": {
|
"parameters": {
|
||||||
"files": [
|
"files": [
|
||||||
{
|
{
|
||||||
"url": "https://drive.google.com/uc?id=1Ul4mtQ4SpUNLVDlaiJThPprBe6WTGZ2L&export=download",
|
"url": "https://drive.usercontent.google.com/download?id=1_1uItoxJsp8PgBqaED0U3bOfKxIweNkJ&export=download&authuser=0&confirm=t&uuid=facec3ed-2fd6-49f7-aeda-95feaa38f45c&at=APZUnTU6B0xoLG-kI0V2WZ74oSI6:1706544439742",
|
||||||
"path": "Desktop/sample-recruitment-phone-script.odt"
|
"path": "Desktop/sample-recruitment-phone-script.odt"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@@ -55,7 +55,7 @@
|
|||||||
"func": "check_highlighted_words",
|
"func": "check_highlighted_words",
|
||||||
"expected": {
|
"expected": {
|
||||||
"type": "cloud_file",
|
"type": "cloud_file",
|
||||||
"path": "https://drive.google.com/uc?id=12iMkgCYuUyhKUXux96kANLIeud0Wz9ct&export=download",
|
"path": "https://drive.usercontent.google.com/download?id=1Qxz3EjHuiLa8yZaCr1DJHwEOSd6U-cjY&export=download&authuser=0&confirm=t&uuid=c502680e-51f9-4e4f-90d0-58b123a09b3d&at=APZUnTXN2UX9EuvB1PZoJ5koJwmQ:1706544440969",
|
||||||
"dest": "sample-recruitment-phone-script_Gold.odt"
|
"dest": "sample-recruitment-phone-script_Gold.odt"
|
||||||
},
|
},
|
||||||
"result": {
|
"result": {
|
||||||
|
|||||||
@@ -36,5 +36,5 @@ backoff
|
|||||||
formulas
|
formulas
|
||||||
pydrive
|
pydrive
|
||||||
fastdtw
|
fastdtw
|
||||||
|
odfpy
|
||||||
openai
|
openai
|
||||||
|
|||||||
Reference in New Issue
Block a user