# sci-gui-agent-benchmark/desktop_env/evaluators/metrics/docs.py

import logging
import os
import xml.etree.ElementTree as ET
import zipfile
import re
from typing import List, Dict, Any

from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import RGBColor

logger = logging.getLogger("desktopenv.metric.docs")


def find_default_font(config_file_path, rules):
    """Check the LibreOffice Writer default font against an expected name.

    Args:
        config_file_path: Path to an XML registry file that is parsed with
            ElementTree.
        rules: Mapping whose ``"font_name"`` entry holds the expected font.

    Returns:
        1 if the last ``value`` found under the ``Standard`` property of the
        Writer ``DefaultFont`` item equals the expected font, otherwise 0.
        Any error raised while parsing or searching is logged; whatever font
        was detected up to that point (``None`` if nothing) is then compared.
    """
    expected_font = rules["font_name"]
    detected_font = None

    # Prefix map for the ``oor:`` attributes used in the registry file.
    ns = {'oor': 'http://openoffice.org/2001/registry'}
    item_query = './/item[@oor:path="/org.openoffice.Office.Writer/DefaultFont"]'
    prop_query = './/prop[@oor:name="Standard"]'

    try:
        root = ET.parse(config_file_path).getroot()
        for item in root.findall(item_query, ns):
            for prop in item.findall(prop_query, ns):
                for value_node in prop.findall('value', ns):
                    # Later matches overwrite earlier ones.
                    detected_font = value_node.text
    except Exception as e:
        logger.error(f"Error: {e}")

    if detected_font == expected_font:
        return 1
    return 0


def contains_page_break(docx_file):
    """Report whether any run in the document body holds a page break.

    Args:
        docx_file: Path (or file-like object) accepted by ``Document``.

    Returns:
        1 as soon as a ``w:br`` element with ``w:type="page"`` is found inside
        a run of a body paragraph, otherwise 0.
    """
    w_ns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    prefix_map = {'w': w_ns}
    # Fully-qualified (Clark notation) name of the ``w:type`` attribute.
    type_attr = '{' + w_ns + '}type'

    for paragraph in Document(docx_file).paragraphs:
        for run in paragraph.runs:
            breaks = run.element.findall('.//w:br', prefix_map)
            if any(br.get(type_attr) == 'page' for br in breaks):
                return 1
    return 0


def compare_docx_files(file1, file2, ignore_blanks=True):
    """Compare the paragraph text of two .docx documents.

    Args:
        file1: First document, as accepted by ``Document``.
        file2: Second document, as accepted by ``Document``.
        ignore_blanks: When true, paragraphs are joined with newlines, every
            run of whitespace is collapsed to one space and the result is
            stripped before comparing. When false, the documents must have
            the same number of paragraphs with identical text in each.

    Returns:
        1 if the documents match under the chosen mode, otherwise 0.
    """
    first_doc = Document(file1)
    second_doc = Document(file2)

    texts_a = [para.text for para in first_doc.paragraphs]
    texts_b = [para.text for para in second_doc.paragraphs]

    if not ignore_blanks:
        # List equality covers both the length check and the pairwise check.
        return 1 if texts_a == texts_b else 0

    def _collapse(paragraph_texts):
        return re.sub(r'\s+', ' ', '\n'.join(paragraph_texts)).strip()

    return 1 if _collapse(texts_a) == _collapse(texts_b) else 0

def compare_init_lines(file1, file2):
    """Compare the leading paragraphs that two .docx documents have in common.

    Paragraphs are paired positionally; pairing stops at the end of the
    shorter document, so extra trailing paragraphs in the longer one are not
    examined.

    Args:
        file1: First document, as accepted by ``Document``.
        file2: Second document, as accepted by ``Document``.

    Returns:
        1 if every paired paragraph has identical text, otherwise 0.
    """
    first_doc = Document(file1)
    second_doc = Document(file2)

    texts_a = [para.text for para in first_doc.paragraphs]
    texts_b = [para.text for para in second_doc.paragraphs]

    same_prefix = all(left == right for left, right in zip(texts_a, texts_b))
    return 1 if same_prefix else 0

def compare_docx_tables(docx_file1, docx_file2):
    """Compare the tables of two .docx documents cell by cell.

    Args:
        docx_file1: First document, as accepted by ``Document``.
        docx_file2: Second document, as accepted by ``Document``.

    Returns:
        1 if both documents have the same number of tables, each pair of
        tables has the same row and column counts, and every cell has the
        same text; otherwise 0.
    """
    left_tables = Document(docx_file1).tables
    right_tables = Document(docx_file2).tables

    if len(left_tables) != len(right_tables):
        return 0

    for left, right in zip(left_tables, right_tables):
        row_count = len(left.rows)
        # Column counts are only consulted when the row counts agree.
        if row_count != len(right.rows) or len(left.columns) != len(right.columns):
            return 0

        col_count = len(left.columns)
        for row_idx in range(row_count):
            for col_idx in range(col_count):
                left_text = left.cell(row_idx, col_idx).text
                right_text = right.cell(row_idx, col_idx).text
                if left_text != right_text:
                    return 0

    return 1


def compare_line_spacing(docx_file1, docx_file2):
    """Check that two .docx documents agree in text and per-paragraph line spacing.

    Args:
        docx_file1: First document, as accepted by ``Document``.
        docx_file2: Second document, as accepted by ``Document``.

    Returns:
        0 if ``compare_docx_files`` rejects the pair, if the paragraph counts
        differ, or if any positional pair of paragraphs has a different
        ``paragraph_format.line_spacing``; otherwise 1.
    """
    if not compare_docx_files(docx_file1, docx_file2):
        return 0

    left_paragraphs = Document(docx_file1).paragraphs
    right_paragraphs = Document(docx_file2).paragraphs

    if len(left_paragraphs) != len(right_paragraphs):
        return 0

    spacing_differs = any(
        left.paragraph_format.line_spacing != right.paragraph_format.line_spacing
        for left, right in zip(left_paragraphs, right_paragraphs)
    )
    return 0 if spacing_differs else 1


def compare_insert_equation(docx_file1, docx_file2):
    """Check that both documents hold an embedded object at a matching run.

    Args:
        docx_file1: First document, as accepted by ``Document``.
        docx_file2: Second document, as accepted by ``Document``.

    Returns:
        0 if ``compare_docx_files`` rejects the pair. Otherwise 1 when some
        positionally paired runs (within positionally paired paragraphs) both
        contain a ``w:object`` element, and 0 when no such pair exists.
    """
    if not compare_docx_files(docx_file1, docx_file2):
        return 0

    left_doc = Document(docx_file1)
    right_doc = Document(docx_file2)

    for left_para, right_para in zip(left_doc.paragraphs, right_doc.paragraphs):
        for left_run, right_run in zip(left_para.runs, right_para.runs):
            # Only look at the second run when the first one has an object.
            if not left_run.element.xpath('.//w:object'):
                continue
            if right_run.element.xpath('.//w:object'):
                return 1
    return 0


def compare_font_names(docx_file, rules: Dict[str, Any]):
    """Check that every run in the document body uses the expected font.

    Args:
        docx_file: Document to inspect, as accepted by ``Document``.
        rules: Mapping whose ``"font_name"`` entry holds the expected font
            name. (It is indexed by string key, so it is a dict, not a list
            of dicts.)

    Returns:
        1 if ``run.font.name`` equals the expected font for every run of
        every body paragraph (including the case of no runs at all),
        otherwise 0.
    """
    doc = Document(docx_file)
    expected_font = rules["font_name"]

    fonts_match = all(
        run.font.name == expected_font
        for paragraph in doc.paragraphs
        for run in paragraph.runs
    )
    return 1 if fonts_match else 0


def compare_subscript_contains(docx_file1, docx_file2):
    """Check that both documents have a subscript run at a matching position.

    Args:
        docx_file1: First document, as accepted by ``Document``.
        docx_file2: Second document, as accepted by ``Document``.

    Returns:
        1 when some positionally paired runs (within positionally paired
        paragraphs) both have a truthy ``font.subscript``, otherwise 0.
    """
    left_doc = Document(docx_file1)
    right_doc = Document(docx_file2)

    for left_para, right_para in zip(left_doc.paragraphs, right_doc.paragraphs):
        for left_run, right_run in zip(left_para.runs, right_para.runs):
            if not left_run.font.subscript:
                continue
            if right_run.font.subscript:
                return 1
    return 0


def has_page_numbers_in_footers(docx_file):
    """Heuristically check that every section footer shows a page number.

    A footer is treated as carrying a page number when the text of its first
    paragraph contains at least one digit character.

    Args:
        docx_file: Document to inspect, as accepted by ``Document``.

    Returns:
        0 if any section has no footer, or a footer whose first paragraph
        (empty text when there are no paragraphs) contains no digit;
        otherwise 1.
    """
    for section in Document(docx_file).sections:
        footer = section.footer
        if footer is None:
            return 0

        footer_paragraphs = footer.paragraphs
        first_text = footer_paragraphs[0].text if footer_paragraphs else ''

        # No digit in the footer text means no visible page number.
        if not any(map(str.isdigit, first_text)):
            return 0
    return 1


def is_first_line_centered(docx_file):
    """Check whether the first paragraph of the document is center-aligned.

    Args:
        docx_file: Document to inspect, as accepted by ``Document``.

    Returns:
        1 if the first paragraph's ``paragraph_format.alignment`` is
        ``WD_PARAGRAPH_ALIGNMENT.CENTER``, otherwise 0. A document without
        any paragraphs yields 0 instead of raising ``IndexError``.
    """
    paragraphs = Document(docx_file).paragraphs
    if not paragraphs:
        # Nothing to inspect, so the requirement cannot be satisfied.
        return 0

    alignment = paragraphs[0].paragraph_format.alignment
    return 1 if alignment == WD_PARAGRAPH_ALIGNMENT.CENTER else 0


def check_file_exists(directory, filename):
    """Return 1 if ``filename`` inside ``directory`` is an existing regular file, else 0."""
    candidate = os.path.join(directory, filename)
    if os.path.isfile(candidate):
        return 1
    return 0


def compare_contains_image(docx_file1, docx_file2):
    """Check that paired runs agree on whether they contain graphic data.

    Args:
        docx_file1: First document, as accepted by ``Document``.
        docx_file2: Second document, as accepted by ``Document``.

    Returns:
        0 as soon as a positionally paired run has ``graphicData`` in its XML
        in one document but not in the other; otherwise 1.
    """
    left_doc = Document(docx_file1)
    right_doc = Document(docx_file2)

    for left_para, right_para in zip(left_doc.paragraphs, right_doc.paragraphs):
        for left_run, right_run in zip(left_para.runs, right_para.runs):
            left_has_graphic = 'graphicData' in left_run._element.xml
            right_has_graphic = 'graphicData' in right_run._element.xml
            # Exactly one side having graphic data is a mismatch.
            if left_has_graphic != right_has_graphic:
                return 0
    return 1

# Example usage (kept for reference):
# file1 = 'path/to/file1.docx'
# file2 = 'path/to/file2.docx'
# print(compare_docx_files(file1, file2))
#
# Replace 'your_document.docx' with the path to your document:
# result = contains_page_break('your_document.docx')
# print(result)
#
# config_path = "/home/[username]/.config/libreoffice/4/user/registrymodifications.xcu"
# print(find_default_font(config_path, {"font_name": "Ani"}))


def evaluate_colored_words_in_tables(file_path1, file_path2):
    """Check the vowel/consonant coloring of table text in ``file_path1``.

    Every non-empty run inside a table cell must be red (255, 0, 0) when its
    first character, lower-cased, is one of ``aeiou``, and blue (0, 0, 255)
    otherwise.

    Args:
        file_path1: Document whose tables are inspected.
        file_path2: Document used only for the ``compare_docx_files`` check.

    Returns:
        0 if ``compare_docx_files`` rejects the pair or any run has the wrong
        color; otherwise 1.
    """
    if not compare_docx_files(file_path1, file_path2):
        return 0

    red = RGBColor(255, 0, 0)
    blue = RGBColor(0, 0, 255)

    for table in Document(file_path1).tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        text = run.text
                        if not text:
                            continue
                        starts_with_vowel = text[0].lower() in 'aeiou'
                        required_color = red if starts_with_vowel else blue
                        if run.font.color.rgb != required_color:
                            return 0

    return 1


def check_highlighted_words(file_path1, file_path2):
    """Check that an .odt document contains no yellow highlighting.

    ``content.xml`` is read straight from the archive, so nothing is
    extracted to disk and no ``<file_path1>_extracted`` directory is left
    behind.

    Args:
        file_path1: Path to the zip-based document to inspect.
        file_path2: Unused; kept so the signature matches other metrics.

    Returns:
        1 if no ``background-color="#ffff00"`` attribute is found in
        ``content.xml``, otherwise 0.

    Raises:
        FileNotFoundError: If the archive has no ``content.xml`` member.
    """
    with zipfile.ZipFile(file_path1, 'r') as archive:
        try:
            raw_content = archive.read('content.xml')
        except KeyError as err:
            # Keep the exception type callers saw when the extracted file
            # could not be opened.
            raise FileNotFoundError(
                f"content.xml not found in {file_path1}"
            ) from err
    content_xml = raw_content.decode("utf-8")

    # Check for yellow highlights in the content.xml
    yellow_highlight_pattern = re.compile(r'(.{0,50}background-color="#ffff00"[^>]*>.{0,50})')

    # Return 1 if yellow highlights are NOT found, otherwise 0
    if yellow_highlight_pattern.search(content_xml):
        return 0
    return 1


def evaluate_strike_through_last_paragraph(file_path1, file_path2):
    """Check that every run of the last paragraph is struck through.

    Args:
        file_path1: Document whose last paragraph is inspected.
        file_path2: Document used only for the ``compare_docx_files`` check.

    Returns:
        0 if ``compare_docx_files`` rejects the pair, if the document has no
        paragraphs (instead of raising ``IndexError``), or if any run in the
        last paragraph lacks a truthy ``font.strike``; otherwise 1.
    """
    if not compare_docx_files(file_path1, file_path2):
        return 0

    paragraphs = Document(file_path1).paragraphs
    if not paragraphs:
        # No last paragraph exists, so the requirement cannot be met.
        return 0

    last_paragraph = paragraphs[-1]
    all_struck = all(run.font.strike for run in last_paragraph.runs)
    return 1 if all_struck else 0


def evaluate_conversion(file_path):
    """Check that no run in the document is still entirely upper-case.

    Runs inside tables are examined first, then runs of body paragraphs.

    Args:
        file_path: Document to inspect, as accepted by ``Document``.

    Returns:
        0 if any run's text satisfies ``str.isupper()``, otherwise 1.
    """
    document = Document(file_path)

    table_runs = (
        run
        for table in document.tables
        for row in table.rows
        for cell in row.cells
        for paragraph in cell.paragraphs
        for run in paragraph.runs
    )
    if any(run.text.isupper() for run in table_runs):
        return 0  # Upper-case text remains inside a table

    body_runs = (
        run
        for paragraph in document.paragraphs
        for run in paragraph.runs
    )
    if any(run.text.isupper() for run in body_runs):
        return 0  # Upper-case text remains in the body

    return 1


def evaluate_spacing(file_path):
    """Check the line spacing of the first three paragraphs.

    The first paragraph (introduction) must have line spacing 1.0, the
    second (body) 2.0 and the third (conclusion) 1.5.

    Args:
        file_path: Document to inspect, as accepted by ``Document``.

    Returns:
        1 if all three spacings match, otherwise 0. A document with fewer
        than three paragraphs yields 0 instead of raising ``IndexError``.
    """
    paragraphs = Document(file_path).paragraphs
    if len(paragraphs) < 3:
        # The introduction/body/conclusion layout cannot be present.
        return 0

    expected_spacings = (1.0, 2.0, 1.5)
    actual_spacings = tuple(
        paragraph.paragraph_format.line_spacing for paragraph in paragraphs[:3]
    )
    return 1 if actual_spacings == expected_spacings else 0


def check_italic_font_size_14(path1, path2):
    """Check that every italic run in ``path1`` has a 14 pt font size.

    Args:
        path1: Document whose runs are inspected.
        path2: Document used only for the ``compare_docx_files`` check.

    Returns:
        0 if ``compare_docx_files`` rejects the pair, or if any run with a
        truthy ``italic`` has no explicit font size or a size other than
        14 pt; otherwise 1.
    """
    if not compare_docx_files(path1, path2):
        return 0

    for paragraph in Document(path1).paragraphs:
        for run in paragraph.runs:
            if not run.italic:
                continue
            size = run.font.size
            if size is None or size.pt != 14:
                return 0
    return 1


def evaluate_alignment(docx_path):
    """Check the "first three words, then the rest" layout of each sentence.

    Each paragraph is split on ``'.'``. Sentences with fewer than three
    whitespace-separated words are skipped. For the others, the first three
    words joined by single spaces and the remaining words joined by single
    spaces must both occur verbatim in the sentence, with the first part
    starting strictly before the second.

    Args:
        docx_path: Document to inspect, as accepted by ``Document``.

    Returns:
        0 as soon as a sentence fails the check, otherwise 1.
    """
    for para in Document(docx_path).paragraphs:
        for sentence in para.text.split('.'):
            tokens = sentence.strip().split()
            if len(tokens) < 3:
                continue  # Too short to be checked

            head = ' '.join(tokens[:3])
            tail = ' '.join(tokens[3:])

            # find() returns -1 exactly when the part is not a substring.
            head_at = sentence.find(head)
            tail_at = sentence.find(tail)
            if head_at == -1 or tail_at == -1 or head_at >= tail_at:
                return 0

    return 1


def get_unique_train_ids(initial_file): #fixed standard
    """Collect the distinct train ids listed in a document.

    A paragraph counts as a record when splitting its text on ``','`` gives
    exactly four fields; the second field, stripped, is the train id.

    Args:
        initial_file: Document to read, as accepted by ``Document``.

    Returns:
        A tuple ``(train_ids, count)`` where ``train_ids`` is the set of
        distinct ids and ``count`` is how many distinct ids were found.
    """
    unique_ids = set()

    for para in Document(initial_file).paragraphs:
        fields = para.text.split(',')
        if len(fields) != 4:
            continue
        unique_ids.add(fields[1].strip())

    # One count per first-seen id, i.e. the size of the set.
    return unique_ids, len(unique_ids)

def check_no_duplicates(initial_file, processed_file):
    """Check that the processed document lists each original train id once.

    Records are paragraphs of the form
    ``time_HH:MM:SS, train_id, station_id, platform_no``; paragraphs that do
    not split into exactly four comma-separated fields are ignored.

    Args:
        initial_file: Original document, as accepted by ``Document``.
        processed_file: De-duplicated document, as accepted by ``Document``.

    Returns:
        0 if a train id occurs twice in the processed document, or if its set
        of ids (or their count) differs from the distinct ids of the initial
        document; otherwise 1.
    """
    expected_ids, expected_count = get_unique_train_ids(initial_file)
    seen_ids = set()

    for para in Document(processed_file).paragraphs:
        fields = para.text.split(',')
        if len(fields) != 4:
            continue

        train_id = fields[1].strip()
        if train_id in seen_ids:
            return 0  # Duplicate found
        seen_ids.add(train_id)

    # Every accepted record added a new id, so the record count is len(seen_ids).
    if seen_ids == expected_ids and len(seen_ids) == expected_count:
        return 1
    return 0

def compare_docx_lines(file1, file2):
    """Compare two documents as unordered sets of non-blank lines.

    Each paragraph's text is stripped; empty results are dropped. Order and
    repetition of lines are ignored.

    Args:
        file1: First document, as accepted by ``Document``.
        file2: Second document, as accepted by ``Document``.

    Returns:
        ``True`` if both documents yield the same set of lines, else
        ``False``.
    """
    def _line_set(path):
        stripped = (para.text.strip() for para in Document(path).paragraphs)
        return {line for line in stripped if line}

    first_lines = _line_set(file1)
    second_lines = _line_set(file2)
    return first_lines == second_lines

def compare_highlighted_text(file1, file2):
    """Compare the highlighted text of two documents as unordered sets.

    A run counts as highlighted when ``run.font.highlight_color`` is truthy;
    its stripped text is collected.

    Args:
        file1: First document, as accepted by ``Document``.
        file2: Second document, as accepted by ``Document``.

    Returns:
        ``True`` if both documents yield the same set of highlighted
        strings, else ``False``.
    """
    def _highlighted_set(path):
        return {
            run.text.strip()
            for paragraph in Document(path).paragraphs
            for run in paragraph.runs
            if run.font.highlight_color
        }

    first_highlighted = _highlighted_set(file1)
    second_highlighted = _highlighted_set(file2)
    return first_highlighted == second_highlighted