Files
sci-gui-agent-benchmark/desktop_env/evaluators/metrics/docs.py

436 lines
14 KiB
Python

import logging
import os
import xml.etree.ElementTree as ET
import zipfile
import re
from typing import List, Dict, Any
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import RGBColor
logger = logging.getLogger("desktopenv.metric.docs")
def find_default_font(config_file_path, rules):
    """Check whether the Writer default font in a registry file is the expected one.

    Args:
        config_file_path: Path (or file object) handed to ``ET.parse``; expected
            to be a LibreOffice registry XML file.
        rules: Mapping whose ``"font_name"`` entry is the expected font name.

    Returns:
        1 if the text of the ``value`` element found under the
        ``/org.openoffice.Office.Writer/DefaultFont`` item's ``Standard``
        property equals ``rules["font_name"]``, otherwise 0. Errors raised
        while reading or parsing the file are logged instead of propagated.
    """
    wanted = rules["font_name"]
    found = None
    # Prefix map so the ``oor:`` attribute names in the queries resolve.
    ns = {'oor': 'http://openoffice.org/2001/registry'}
    item_query = './/item[@oor:path="/org.openoffice.Office.Writer/DefaultFont"]'
    prop_query = './/prop[@oor:name="Standard"]'
    try:
        registry = ET.parse(config_file_path).getroot()
        for item in registry.findall(item_query, ns):
            for prop in item.findall(prop_query, ns):
                for value_node in prop.findall('value', ns):
                    # Later matches overwrite earlier ones: the last value wins.
                    found = value_node.text
    except Exception as e:
        logger.error(f"Error: {e}")
    if found == wanted:
        return 1
    return 0
def contains_page_break(docx_file):
    """Report whether any run in the document's paragraphs holds a page break.

    Args:
        docx_file: Value passed straight to ``Document``.

    Returns:
        1 if a ``w:br`` element whose ``w:type`` attribute equals ``"page"`` is
        found inside a run of any paragraph in ``doc.paragraphs``, else 0.
    """
    w_ns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    type_attr = '{%s}type' % w_ns
    prefix_map = {'w': w_ns}
    for para in Document(docx_file).paragraphs:
        for run in para.runs:
            for line_break in run.element.findall('.//w:br', prefix_map):
                # Only breaks explicitly typed as "page" count.
                if line_break.get(type_attr) == 'page':
                    return 1
    return 0
def compare_docx_files(file1, file2, ignore_blanks=True):
    """Compare the paragraph text of two documents.

    Args:
        file1: First document, passed to ``Document``.
        file2: Second document, passed to ``Document``.
        ignore_blanks: When truthy, each document's paragraphs are joined with
            newlines, every run of whitespace is collapsed to a single space
            and the result is stripped before comparing. When falsy, the two
            paragraph-text lists must have the same length and identical
            entries.

    Returns:
        1 if the texts are considered equal, otherwise 0.
    """
    texts_a = [para.text for para in Document(file1).paragraphs]
    texts_b = [para.text for para in Document(file2).paragraphs]

    if not ignore_blanks:
        # List equality covers both the length check and the pairwise check.
        return 1 if texts_a == texts_b else 0

    def squash(paragraph_texts):
        # Normalise whitespace so blank lines and spacing do not matter.
        return re.sub(r'\s+', ' ', '\n'.join(paragraph_texts)).strip()

    return 1 if squash(texts_a) == squash(texts_b) else 0
def compare_init_lines(file1, file2):
    """Compare two documents paragraph by paragraph over their common length.

    Paragraphs are paired in order with ``zip``, so pairing stops at the end of
    the shorter document and any extra paragraphs in the longer one are not
    examined.

    Args:
        file1: First document, passed to ``Document``.
        file2: Second document, passed to ``Document``.

    Returns:
        1 if every paired paragraph has identical text, otherwise 0.
    """
    left = Document(file1).paragraphs
    right = Document(file2).paragraphs
    paired_texts = zip((p.text for p in left), (p.text for p in right))
    return 1 if all(a == b for a, b in paired_texts) else 0
def compare_docx_tables(docx_file1, docx_file2):
    """Compare the tables of two documents cell by cell.

    Args:
        docx_file1: First document, passed to ``Document``.
        docx_file2: Second document, passed to ``Document``.

    Returns:
        1 if both documents have the same number of tables, each pair of
        tables has the same row and column counts, and every pair of cells has
        identical text; otherwise 0.
    """
    tables_a = Document(docx_file1).tables
    tables_b = Document(docx_file2).tables
    if len(tables_a) != len(tables_b):
        return 0

    for tbl_a, tbl_b in zip(tables_a, tables_b):
        n_rows = len(tbl_a.rows)
        n_cols = len(tbl_a.columns)
        # Shapes must agree before cells can be compared index by index.
        if n_rows != len(tbl_b.rows) or n_cols != len(tbl_b.columns):
            return 0
        mismatch = any(
            tbl_a.cell(r, c).text != tbl_b.cell(r, c).text
            for r in range(n_rows)
            for c in range(n_cols)
        )
        if mismatch:
            return 0
    return 1
def compare_line_spacing(docx_file1, docx_file2):
    """Check that two documents match in text and in per-paragraph line spacing.

    Args:
        docx_file1: First document, passed to ``Document``.
        docx_file2: Second document, passed to ``Document``.

    Returns:
        0 if ``compare_docx_files`` reports a text difference, if the paragraph
        counts differ, or if any paired paragraphs have different
        ``paragraph_format.line_spacing`` values; otherwise 1.
    """
    if not compare_docx_files(docx_file1, docx_file2):
        return 0

    paras_a = Document(docx_file1).paragraphs
    paras_b = Document(docx_file2).paragraphs
    if len(paras_a) != len(paras_b):
        return 0

    spacing_differs = any(
        a.paragraph_format.line_spacing != b.paragraph_format.line_spacing
        for a, b in zip(paras_a, paras_b)
    )
    return 0 if spacing_differs else 1
def compare_insert_equation(docx_file1, docx_file2):
    """Check that both documents contain an embedded object in a matching run.

    Args:
        docx_file1: First document, passed to ``Document``.
        docx_file2: Second document, passed to ``Document``.

    Returns:
        0 if ``compare_docx_files`` reports a text difference. Otherwise 1 as
        soon as a pair of runs (paired by paragraph position, then run
        position) both contain a ``w:object`` element, and 0 if no such pair
        exists.
    """
    if not compare_docx_files(docx_file1, docx_file2):
        return 0

    doc_a = Document(docx_file1)
    doc_b = Document(docx_file2)
    for para_a, para_b in zip(doc_a.paragraphs, doc_b.paragraphs):
        for run_a, run_b in zip(para_a.runs, para_b.runs):
            if not run_a.element.xpath('.//w:object'):
                continue
            if run_b.element.xpath('.//w:object'):
                return 1
    return 0
def compare_font_names(docx_file, rules: Dict[str, Any]):
    """Check that every run in the document's paragraphs uses the expected font.

    Args:
        docx_file: Value passed to ``Document``.
        rules: Mapping whose ``"font_name"`` entry is the expected font name.
            (It is indexed by key, so it is a mapping rather than a list.)

    Returns:
        1 if ``run.font.name`` equals ``rules["font_name"]`` for every run of
        every paragraph in ``doc.paragraphs``, otherwise 0. A document with no
        runs yields 1.
    """
    doc = Document(docx_file)
    expected_font = rules["font_name"]
    # NOTE(review): run.font.name is presumably None when the font is
    # inherited from a style; such runs count as a mismatch unless
    # expected_font is also None. Confirm against the python-docx docs.
    fonts_match = all(
        run.font.name == expected_font
        for paragraph in doc.paragraphs
        for run in paragraph.runs
    )
    return 1 if fonts_match else 0
def compare_subscript_contains(docx_file1, docx_file2):
    """Check that both documents have a subscript run at a matching position.

    Args:
        docx_file1: First document, passed to ``Document``.
        docx_file2: Second document, passed to ``Document``.

    Returns:
        1 if some pair of runs (paired by paragraph position, then run
        position) both have a truthy ``font.subscript``; otherwise 0.
    """
    doc_a = Document(docx_file1)
    doc_b = Document(docx_file2)
    both_subscript = any(
        run_a.font.subscript and run_b.font.subscript
        for para_a, para_b in zip(doc_a.paragraphs, doc_b.paragraphs)
        for run_a, run_b in zip(para_a.runs, para_b.runs)
    )
    return 1 if both_subscript else 0
def has_page_numbers_in_footers(docx_file):
    """Check that every section's footer starts with text containing a digit.

    Only the first paragraph of each footer is inspected; a footer without
    paragraphs is treated as having empty text.

    Args:
        docx_file: Value passed to ``Document``.

    Returns:
        0 if any section's footer is ``None`` or its first paragraph's text
        contains no digit character; otherwise 1.
    """
    for section in Document(docx_file).sections:
        footer = section.footer
        if footer is None:
            return 0
        first_text = footer.paragraphs[0].text if footer.paragraphs else ''
        for ch in first_text:
            if ch.isdigit():
                break
        else:
            # No digit anywhere in the footer text: no page number.
            return 0
    return 1
def is_first_line_centered(docx_file):
    """Check whether the first paragraph of the document is center-aligned.

    Args:
        docx_file: Value passed to ``Document``.

    Returns:
        1 if the first paragraph's ``paragraph_format.alignment`` equals
        ``WD_PARAGRAPH_ALIGNMENT.CENTER``; otherwise 0. A document with no
        paragraphs yields 0 instead of raising ``IndexError``.
    """
    paragraphs = Document(docx_file).paragraphs
    if not paragraphs:
        # Nothing to inspect, so the condition cannot be satisfied.
        return 0
    alignment = paragraphs[0].paragraph_format.alignment
    return 1 if alignment == WD_PARAGRAPH_ALIGNMENT.CENTER else 0
def check_file_exists(directory, filename):
    """Report whether ``filename`` inside ``directory`` is an existing regular file.

    Args:
        directory: Directory path.
        filename: Name joined onto ``directory`` with ``os.path.join``.

    Returns:
        1 if ``os.path.isfile`` is true for the joined path, otherwise 0.
    """
    candidate = os.path.join(directory, filename)
    if os.path.isfile(candidate):
        return 1
    return 0
def compare_contains_image(docx_file1, docx_file2):
    """Check that paired runs agree on whether they contain graphic data.

    Runs are paired by paragraph position, then run position. A run is treated
    as containing an image when the string ``graphicData`` occurs in its XML.

    Args:
        docx_file1: First document, passed to ``Document``.
        docx_file2: Second document, passed to ``Document``.

    Returns:
        0 if any pair of runs disagrees (one has ``graphicData``, the other
        does not); otherwise 1.
    """
    doc_a = Document(docx_file1)
    doc_b = Document(docx_file2)
    for para_a, para_b in zip(doc_a.paragraphs, doc_b.paragraphs):
        for run_a, run_b in zip(para_a.runs, para_b.runs):
            has_image_a = 'graphicData' in run_a._element.xml
            has_image_b = 'graphicData' in run_b._element.xml
            if has_image_a != has_image_b:
                return 0
    return 1
# Example usage:
# file1 = 'path/to/file1.docx'
# file2 = 'path/to/file2.docx'
# print(compare_docx_files(file1, file2))
# Replace 'your_document.docx' with the path to your document
# result = contains_page_break('your_document.docx')
# print(result)
# config_path = "/home/[username]/.config/libreoffice/4/user/registrymodifications.xcu"
# print(find_default_font(config_path, {"font_name": "Ani"}))
def evaluate_colored_words_in_tables(file_path1, file_path2):
    """Check the vowel/consonant colouring of every non-empty run inside tables.

    A run whose first character (lower-cased) is one of ``aeiou`` must be
    coloured ``RGBColor(255, 0, 0)``; every other non-empty run must be
    coloured ``RGBColor(0, 0, 255)``.

    Args:
        file_path1: Document whose tables are checked, passed to ``Document``.
        file_path2: Document whose text must match ``file_path1`` according to
            ``compare_docx_files``.

    Returns:
        0 if the texts differ or any run has the wrong colour; otherwise 1.
    """
    if not compare_docx_files(file_path1, file_path2):
        return 0

    red = RGBColor(255, 0, 0)
    blue = RGBColor(0, 0, 255)

    def table_runs(doc):
        # Walk tables -> rows -> cells -> paragraphs -> runs in document order.
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    for paragraph in cell.paragraphs:
                        for run in paragraph.runs:
                            yield run

    for run in table_runs(Document(file_path1)):
        text = run.text
        if not text:
            continue
        starts_with_vowel = text[0].lower() in 'aeiou'
        required = red if starts_with_vowel else blue
        if run.font.color.rgb != required:
            return 0
    return 1
def check_highlighted_words(file_path1, file_path2):
    """Check that an .odt-style archive contains no yellow background highlight.

    ``content.xml`` is read directly from the ZIP archive in memory, so nothing
    is extracted to disk and no ``<file_path1>_extracted`` directory is left
    behind.

    Args:
        file_path1: Path to the ZIP archive that holds ``content.xml``.
        file_path2: Unused; kept so the signature stays compatible.

    Returns:
        1 if no ``background-color="#ffff00"`` attribute (followed by the end
        of its tag) occurs in ``content.xml``; 0 if at least one occurs.

    Raises:
        FileNotFoundError: If the archive has no ``content.xml`` member.
        zipfile.BadZipFile: If ``file_path1`` is not a valid ZIP archive.
    """
    with zipfile.ZipFile(file_path1, 'r') as archive:
        try:
            raw_content = archive.read('content.xml')
        except KeyError:
            # Same exception type a missing extracted file would have produced.
            raise FileNotFoundError(
                f"content.xml not found in archive: {file_path1}"
            ) from None
    content_xml = raw_content.decode('utf-8')

    # Only presence matters, so the surrounding-context groups are not needed.
    yellow_highlight_pattern = re.compile(r'background-color="#ffff00"[^>]*>')
    return 0 if yellow_highlight_pattern.search(content_xml) else 1
def evaluate_strike_through_last_paragraph(file_path1, file_path2):
    """Check that every run of the last paragraph is struck through.

    Args:
        file_path1: Document to inspect, passed to ``Document``.
        file_path2: Document whose text must match ``file_path1`` according to
            ``compare_docx_files``.

    Returns:
        0 if the texts differ, if the document has no paragraphs, or if any
        run in the last paragraph has a falsy ``font.strike``; otherwise 1.
        A last paragraph without runs yields 1.
    """
    if not compare_docx_files(file_path1, file_path2):
        return 0

    paragraphs = Document(file_path1).paragraphs
    if not paragraphs:
        # No last paragraph exists; report failure instead of raising IndexError.
        return 0

    all_struck = all(run.font.strike for run in paragraphs[-1].runs)
    return 1 if all_struck else 0
def evaluate_conversion(file_path):
    """Check that no run in the document is still entirely upper-case.

    Runs inside tables are examined first, then runs of the paragraphs in
    ``document.paragraphs``.

    Args:
        file_path: Value passed to ``Document``.

    Returns:
        0 if ``run.text.isupper()`` is true for any run; otherwise 1.
    """
    document = Document(file_path)

    def every_run():
        # Table content first, then the paragraphs from document.paragraphs.
        for table in document.tables:
            for row in table.rows:
                for cell in row.cells:
                    for paragraph in cell.paragraphs:
                        yield from paragraph.runs
        for paragraph in document.paragraphs:
            yield from paragraph.runs

    if any(run.text.isupper() for run in every_run()):
        return 0
    return 1
def evaluate_spacing(file_path):
    """Check the line spacing of the first three paragraphs.

    The first, second and third paragraphs (introduction, body, conclusion)
    must have ``paragraph_format.line_spacing`` equal to 1.0, 2.0 and 1.5
    respectively.

    Args:
        file_path: Value passed to ``Document``.

    Returns:
        1 if all three spacings match; otherwise 0. A document with fewer than
        three paragraphs yields 0 instead of raising ``IndexError``.
    """
    paragraphs = Document(file_path).paragraphs
    if len(paragraphs) < 3:
        # Not enough paragraphs to hold introduction, body and conclusion.
        return 0

    introduction_spacing, body_spacing, conclusion_spacing = (
        paragraph.paragraph_format.line_spacing for paragraph in paragraphs[:3]
    )
    if introduction_spacing == 1.0 and body_spacing == 2.0 and conclusion_spacing == 1.5:
        return 1
    return 0
def check_italic_font_size_14(path1, path2):
    """Check that every italic run in the document has a 14 pt font size.

    Args:
        path1: Document to inspect, passed to ``Document``.
        path2: Document whose text must match ``path1`` according to
            ``compare_docx_files``.

    Returns:
        0 if the texts differ, or if any run with a truthy ``italic`` has no
        explicit font size or a size other than 14 pt; otherwise 1.
    """
    if not compare_docx_files(path1, path2):
        return 0

    for paragraph in Document(path1).paragraphs:
        for run in paragraph.runs:
            if not run.italic:
                continue
            size = run.font.size
            if size is None or size.pt != 14:
                return 0
    return 1
def evaluate_alignment(docx_path):
    """Check the "first three words, then the rest" layout of every sentence.

    Each paragraph's text is split on ``.``; each piece is split on whitespace
    into words. Pieces with fewer than three words are skipped. For the others,
    the first three words joined by single spaces and the remaining words
    joined by single spaces must both occur in the piece, and the first
    occurrence of the former must come strictly before the first occurrence of
    the latter.

    Args:
        docx_path: Value passed to ``Document``.

    Returns:
        0 as soon as a piece fails the check; otherwise 1.
    """
    for para in Document(docx_path).paragraphs:
        for sentence in para.text.split('.'):
            tokens = sentence.strip().split()
            if len(tokens) < 3:
                continue
            head = ' '.join(tokens[:3])
            tail = ' '.join(tokens[3:])
            if head not in sentence or tail not in sentence:
                return 0
            # NOTE(review): with exactly three words ``tail`` is '' and
            # ``find('')`` is 0, so this comparison always fails for such a
            # sentence -- confirm that this is intended.
            if sentence.find(head) >= sentence.find(tail):
                return 0
    return 1
def get_unique_train_ids(initial_file): #fixed standard
    """Collect the distinct train ids found in a document's paragraphs.

    A paragraph counts only when splitting its text on ``,`` gives exactly four
    fields; the second field, stripped of surrounding whitespace, is the id.

    Args:
        initial_file: Value passed to ``Document``.

    Returns:
        A tuple ``(train_ids, count)`` where ``train_ids`` is the set of
        distinct ids and ``count`` is how many distinct ids were added.
    """
    seen = set()
    for para in Document(initial_file).paragraphs:
        fields = para.text.split(',')
        if len(fields) != 4:
            continue
        seen.add(fields[1].strip())
    # The counter only ever grew when a new id was added, so it is len(seen).
    return seen, len(seen)
def check_no_duplicates(initial_file, processed_file):
    """Check that the processed document lists each initial train id exactly once.

    Lines are paragraphs whose text splits on ``,`` into exactly four fields
    (``time_HH:MM:SS, train_id, station_id, platform_no``); other paragraphs
    are ignored.

    Args:
        initial_file: Original document, read via ``get_unique_train_ids``.
        processed_file: De-duplicated document, passed to ``Document``.

    Returns:
        0 if a train id repeats in the processed document, or if its set of
        ids or its number of valid lines differs from the initial document's
        distinct ids and their count; otherwise 1.
    """
    expected_ids, expected_count = get_unique_train_ids(initial_file)

    seen = set()
    valid_lines = 0
    for para in Document(processed_file).paragraphs:
        fields = para.text.split(',')
        if len(fields) != 4:
            continue
        train_id = fields[1].strip()
        if train_id in seen:
            # Same id seen twice: the duplicates were not removed.
            return 0
        seen.add(train_id)
        valid_lines += 1

    if seen == expected_ids and valid_lines == expected_count:
        return 1
    return 0
def compare_docx_lines(file1, file2):
    """Compare two documents as unordered sets of non-blank, stripped lines.

    Order and repetition of lines are ignored because the lines are compared
    as sets.

    Args:
        file1: First document, passed to ``Document``.
        file2: Second document, passed to ``Document``.

    Returns:
        ``True`` if both documents yield the same set of lines, else ``False``.
    """
    def nonblank_lines(path):
        stripped = (p.text.strip() for p in Document(path).paragraphs)
        return {line for line in stripped if line}

    return nonblank_lines(file1) == nonblank_lines(file2)
def compare_highlighted_text(file1, file2):
    """Compare the sets of highlighted run texts of two documents.

    A run counts as highlighted when its ``font.highlight_color`` is truthy;
    its text is stripped before being collected. Order and repetition are
    ignored because the texts are compared as sets.

    Args:
        file1: First document, passed to ``Document``.
        file2: Second document, passed to ``Document``.

    Returns:
        ``True`` if both documents yield the same set of highlighted texts,
        else ``False``.
    """
    def highlighted(path):
        return {
            run.text.strip()
            for paragraph in Document(path).paragraphs
            for run in paragraph.runs
            if run.font.highlight_color
        }

    return highlighted(file1) == highlighted(file2)