Merge branch 'main' of github.com:xlang-ai/DesktopEnv
This commit is contained in:
@@ -3,8 +3,10 @@ import os
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
import zipfile
|
||||
from io import BytesIO
|
||||
from typing import List, Dict, Any
|
||||
|
||||
from PIL import Image
|
||||
from docx import Document
|
||||
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_TAB_ALIGNMENT
|
||||
from docx.shared import RGBColor
|
||||
@@ -23,6 +25,9 @@ def find_default_font(config_file_path, rules):
|
||||
default_font = None
|
||||
expected_font = rules["font_name"]
|
||||
|
||||
if not config_file_path:
|
||||
return 0
|
||||
|
||||
try:
|
||||
tree = ET.parse(config_file_path)
|
||||
root = tree.getroot()
|
||||
@@ -42,6 +47,9 @@ def find_default_font(config_file_path, rules):
|
||||
|
||||
|
||||
def contains_page_break(docx_file):
|
||||
if not docx_file:
|
||||
return 0
|
||||
|
||||
doc = Document(docx_file)
|
||||
|
||||
namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
|
||||
@@ -62,6 +70,9 @@ def compare_docx_files(file1, file2, **options):
|
||||
ignore_order = options.get('ignore_order', False)
|
||||
content_only = options.get('content_only', False)
|
||||
|
||||
if not file1 or not file2:
|
||||
return 0
|
||||
|
||||
def get_paragraph_texts_odt(document):
|
||||
paragraphs = document.getElementsByType(P)
|
||||
paragraph_texts = []
|
||||
@@ -139,6 +150,9 @@ def compare_docx_files(file1, file2, **options):
|
||||
|
||||
|
||||
def compare_init_lines(file1, file2):
|
||||
if not file1 or not file2:
|
||||
return 0
|
||||
|
||||
doc1 = Document(file1)
|
||||
doc2 = Document(file2)
|
||||
|
||||
@@ -156,6 +170,9 @@ def compare_init_lines(file1, file2):
|
||||
|
||||
|
||||
def compare_docx_tables(docx_file1, docx_file2):
|
||||
if not docx_file1 or not docx_file2:
|
||||
return 0
|
||||
|
||||
doc1 = Document(docx_file1)
|
||||
doc2 = Document(docx_file2)
|
||||
|
||||
@@ -181,11 +198,10 @@ def compare_docx_tables(docx_file1, docx_file2):
|
||||
return 1
|
||||
|
||||
|
||||
from io import BytesIO
|
||||
from PIL import Image
|
||||
|
||||
|
||||
def compare_docx_images(docx_file1, docx_file2):
|
||||
if not docx_file1 or not docx_file2:
|
||||
return 0
|
||||
|
||||
doc1 = Document(docx_file1)
|
||||
doc2 = Document(docx_file2)
|
||||
|
||||
@@ -211,6 +227,9 @@ import pytesseract
|
||||
|
||||
|
||||
def compare_image_text(image_path, rule):
|
||||
if not image_path:
|
||||
return 0
|
||||
|
||||
img = Image.open(image_path)
|
||||
img_text = pytesseract.image_to_string(img)
|
||||
if rule['type'] == 'text':
|
||||
@@ -220,6 +239,9 @@ def compare_image_text(image_path, rule):
|
||||
|
||||
|
||||
def compare_line_spacing(docx_file1, docx_file2):
|
||||
if not docx_file1 or not docx_file2:
|
||||
return 0
|
||||
|
||||
if not compare_docx_files(docx_file1, docx_file2):
|
||||
return 0
|
||||
doc1 = Document(docx_file1)
|
||||
@@ -241,6 +263,9 @@ def compare_line_spacing(docx_file1, docx_file2):
|
||||
|
||||
|
||||
def compare_insert_equation(docx_file1, docx_file2):
|
||||
if not docx_file1 or not docx_file2:
|
||||
return 0
|
||||
|
||||
if not compare_docx_files(docx_file1, docx_file2):
|
||||
return 0
|
||||
|
||||
@@ -256,6 +281,9 @@ def compare_insert_equation(docx_file1, docx_file2):
|
||||
|
||||
|
||||
def compare_font_names(docx_file, rules: List[Dict[str, Any]]):
|
||||
if not docx_file:
|
||||
return 0
|
||||
|
||||
doc = Document(docx_file)
|
||||
expected_font = rules["font_name"]
|
||||
|
||||
@@ -268,6 +296,9 @@ def compare_font_names(docx_file, rules: List[Dict[str, Any]]):
|
||||
|
||||
|
||||
def compare_subscript_contains(docx_file1, docx_file2):
|
||||
if not docx_file1 or not docx_file2:
|
||||
return 0
|
||||
|
||||
doc1 = Document(docx_file1)
|
||||
doc2 = Document(docx_file2)
|
||||
|
||||
@@ -280,6 +311,9 @@ def compare_subscript_contains(docx_file1, docx_file2):
|
||||
|
||||
|
||||
def has_page_numbers_in_footers(docx_file):
|
||||
if not docx_file:
|
||||
return 0
|
||||
|
||||
doc = Document(docx_file)
|
||||
|
||||
for section in doc.sections:
|
||||
@@ -294,6 +328,9 @@ def has_page_numbers_in_footers(docx_file):
|
||||
|
||||
|
||||
def is_first_line_centered(docx_file):
|
||||
if not docx_file:
|
||||
return 0
|
||||
|
||||
doc = Document(docx_file)
|
||||
first_paragraph = doc.paragraphs[0]
|
||||
|
||||
@@ -302,11 +339,16 @@ def is_first_line_centered(docx_file):
|
||||
|
||||
|
||||
def check_file_exists(directory, filename):
    """Report whether ``filename`` names a regular file inside ``directory``.

    Returns 1 when the joined path is an existing regular file, otherwise 0.
    A falsy ``directory`` or ``filename`` yields 0 without touching the
    filesystem.
    """
    if directory and filename:
        candidate = os.path.join(directory, filename)
        if os.path.isfile(candidate):
            return 1
    return 0
|
||||
|
||||
|
||||
def check_tabstops(docx_file1, docx_file2, **kwargs) -> float:
|
||||
if not docx_file1 or not docx_file2:
|
||||
return .0
|
||||
|
||||
doc1: Document = Document(docx_file1)
|
||||
doc2: Document = Document(docx_file2)
|
||||
para1 = [p for p in doc1.paragraphs if p.text.strip()]
|
||||
@@ -342,6 +384,9 @@ def check_tabstops(docx_file1, docx_file2, **kwargs) -> float:
|
||||
|
||||
|
||||
def compare_contains_image(docx_file1, docx_file2):
|
||||
if not docx_file1 or not docx_file2:
|
||||
return 0
|
||||
|
||||
doc1 = Document(docx_file1)
|
||||
doc2 = Document(docx_file2)
|
||||
|
||||
@@ -354,6 +399,9 @@ def compare_contains_image(docx_file1, docx_file2):
|
||||
|
||||
|
||||
def evaluate_colored_words_in_tables(file_path1, file_path2, **kwargs):
|
||||
if not file_path1 or not file_path2:
|
||||
return 0
|
||||
|
||||
if not compare_docx_files(file_path1, file_path2):
|
||||
return 0
|
||||
document = Document(file_path1)
|
||||
@@ -388,6 +436,9 @@ def evaluate_colored_words_in_tables(file_path1, file_path2, **kwargs):
|
||||
|
||||
|
||||
def check_highlighted_words(file_path1, file_path2):
|
||||
if not file_path1 or not file_path2:
|
||||
return 0
|
||||
|
||||
if not compare_docx_files(file_path1, file_path2):
|
||||
return 0
|
||||
|
||||
@@ -410,6 +461,9 @@ def check_highlighted_words(file_path1, file_path2):
|
||||
|
||||
|
||||
def evaluate_strike_through_last_paragraph(file_path1, file_path2):
|
||||
if not file_path1 or not file_path2:
|
||||
return 0
|
||||
|
||||
if not compare_docx_files(file_path1, file_path2):
|
||||
return 0
|
||||
document = Document(file_path1)
|
||||
@@ -426,6 +480,9 @@ def evaluate_strike_through_last_paragraph(file_path1, file_path2):
|
||||
|
||||
|
||||
def evaluate_conversion(file_path):
|
||||
if not file_path:
|
||||
return 0
|
||||
|
||||
document = Document(file_path)
|
||||
|
||||
for table in document.tables:
|
||||
@@ -445,6 +502,9 @@ def evaluate_conversion(file_path):
|
||||
|
||||
|
||||
def evaluate_spacing(file_path):
|
||||
if not file_path:
|
||||
return 0
|
||||
|
||||
document = Document(file_path)
|
||||
|
||||
# Check line spacing for introduction, body, and conclusion
|
||||
@@ -458,6 +518,9 @@ def evaluate_spacing(file_path):
|
||||
|
||||
|
||||
def check_italic_font_size_14(path1, path2):
|
||||
if not path1 or not path2:
|
||||
return 0
|
||||
|
||||
if not compare_docx_files(path1, path2):
|
||||
return 0
|
||||
document = Document(path1)
|
||||
@@ -471,6 +534,9 @@ def check_italic_font_size_14(path1, path2):
|
||||
|
||||
|
||||
def evaluate_alignment(docx_path):
|
||||
if not docx_path:
|
||||
return 0
|
||||
|
||||
# Load the document
|
||||
doc = Document(docx_path)
|
||||
|
||||
@@ -500,6 +566,9 @@ def evaluate_alignment(docx_path):
|
||||
|
||||
|
||||
def get_unique_train_ids(initial_file): # fixed standard
|
||||
if not initial_file:
|
||||
return set(), 0
|
||||
|
||||
doc = Document(initial_file)
|
||||
train_ids = set()
|
||||
processed_lines = 0
|
||||
@@ -516,6 +585,9 @@ def get_unique_train_ids(initial_file): # fixed standard
|
||||
|
||||
|
||||
def check_no_duplicates(initial_file, processed_file):
|
||||
if not initial_file or not processed_file:
|
||||
return 0
|
||||
|
||||
# Open the document
|
||||
train_ids_ini, ini_lines = get_unique_train_ids(initial_file)
|
||||
doc_processed = Document(processed_file)
|
||||
@@ -543,6 +615,9 @@ def check_no_duplicates(initial_file, processed_file):
|
||||
|
||||
|
||||
def compare_docx_lines(file1, file2):
|
||||
if not file1 or not file2:
|
||||
return 0
|
||||
|
||||
# Read the text of the document, line by line
|
||||
doc1 = Document(file1)
|
||||
doc1_lines = [p.text.strip() for p in doc1.paragraphs if p.text.strip()]
|
||||
@@ -562,6 +637,9 @@ def compare_docx_lines(file1, file2):
|
||||
def compare_docx_files_and_ignore_new_lines(file1, file2, **options):
|
||||
ignore_blanks = options.get('ignore_blanks', True)
|
||||
|
||||
if not file1 or not file2:
|
||||
return 0
|
||||
|
||||
# Determine file types and load documents
|
||||
if file1.endswith('.docx') and file2.endswith('.docx'):
|
||||
doc1 = Document(file1)
|
||||
@@ -594,6 +672,9 @@ def compare_docx_files_and_ignore_new_lines(file1, file2, **options):
|
||||
|
||||
# Deprecated: for unknown reasons, this function cannot compare highlights in docx files saved on Ubuntu.
|
||||
def compare_highlighted_text(file1, file2):
|
||||
if not file1 or not file2:
|
||||
return 0
|
||||
|
||||
def extract_highlighted_text(file_path):
|
||||
highlighted_texts = []
|
||||
|
||||
@@ -631,6 +712,9 @@ def compare_highlighted_text(file1, file2):
|
||||
|
||||
|
||||
def compare_references(file1, file2, **options):
|
||||
if not file1 or not file2:
|
||||
return 0
|
||||
|
||||
reference_indicator = options.get('reference_indicator', 'References')
|
||||
reference_base_result = options.get('reference_base_result', 0.5)
|
||||
|
||||
@@ -675,48 +759,3 @@ def compare_references(file1, file2, **options):
|
||||
return (result - reference_base_result) / (1 - reference_base_result)
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
def compare_answer(file1, file2, **options):
    """Score how closely the reference sections of two .docx files match.

    The reference section is every non-blank paragraph that follows the first
    paragraph whose text equals the reference indicator.

    Args:
        file1: path of the first .docx file.
        file2: path of the second .docx file.
        **options:
            reference_indicator: paragraph text that opens the reference
                section (default ``'References'``).
            reference_base_result: average similarity at or above which a
                non-zero score is given (default ``0.5``).

    Returns:
        0 if either path is falsy, the files are not both ``.docx``, only one
        file has a reference section, or the sections differ in length.
        1 if neither file has a reference section or both sections are empty.
        Otherwise the average fuzzy similarity rescaled so that
        ``reference_base_result`` maps to 0 and a perfect match maps to 1
        (0 when the average is below the base).
    """
    if not file1 or not file2:
        return 0

    # Same option names and defaults as compare_references
    reference_indicator = options.get('reference_indicator', 'References')
    reference_base_result = options.get('reference_base_result', 0.5)

    # Determine file types and load documents
    if file1.endswith('.docx') and file2.endswith('.docx'):
        doc1 = Document(file1)
        doc2 = Document(file2)
        doc1_paragraphs = [p.text for p in doc1.paragraphs]
        doc2_paragraphs = [p.text for p in doc2.paragraphs]
    else:
        # Unsupported file types or mismatch
        print("Unsupported file types or mismatch between file types.")
        return 0

    # Locate the first paragraph that is exactly the reference indicator
    ref1_idx = doc1_paragraphs.index(reference_indicator) if reference_indicator in doc1_paragraphs else -1
    ref2_idx = doc2_paragraphs.index(reference_indicator) if reference_indicator in doc2_paragraphs else -1

    if ref1_idx == -1 and ref2_idx == -1:
        return 1

    if ref1_idx == -1 or ref2_idx == -1:
        return 0

    # Split the reference section into reference items, dropping blank paragraphs
    ref1 = [p for p in doc1_paragraphs[ref1_idx + 1:] if p.strip()]
    ref2 = [p for p in doc2_paragraphs[ref2_idx + 1:] if p.strip()]

    # Compare the references
    if len(ref1) != len(ref2):
        return 0

    # Two empty reference sections are identical; this also avoids dividing by zero
    if not ref1:
        return 1

    # Fuzzy-match the references pairwise, in order
    total_similarity = sum(fuzz.ratio(r1, r2) / 100.0 for r1, r2 in zip(ref1, ref2))

    result = total_similarity / len(ref1)
    if result >= reference_base_result:
        return (result - reference_base_result) / (1 - reference_base_result)
    else:
        return 0
|
||||
|
||||
@@ -1,29 +1,30 @@
|
||||
import csv
|
||||
import datetime
|
||||
import difflib
|
||||
import functools
|
||||
import json
|
||||
import yaml
|
||||
import logging
|
||||
import operator
|
||||
import re
|
||||
import pdfplumber
|
||||
import sqlite3
|
||||
from numbers import Number
|
||||
from typing import Callable, Any, Union
|
||||
from typing import Dict, List, Pattern
|
||||
import datetime
|
||||
import pandas as pd
|
||||
|
||||
import lxml.etree
|
||||
import pandas as pd
|
||||
import pdfplumber
|
||||
import yaml
|
||||
from docx import Document
|
||||
from lxml.cssselect import CSSSelector
|
||||
from lxml.etree import _Element
|
||||
from rapidfuzz import fuzz
|
||||
from docx import Document
|
||||
import difflib
|
||||
|
||||
from .utils import _match_record, _match_value_to_rule
|
||||
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger("desktopenv.metric.general")
|
||||
|
||||
|
||||
def check_include_exclude(result: str, rules: Dict[str, List[str]]) -> float:
|
||||
if result is None:
|
||||
return 0.
|
||||
@@ -71,6 +72,7 @@ def is_in_list(result, rules) -> float:
|
||||
else:
|
||||
return 0.
|
||||
|
||||
|
||||
def diff_text_file(result: str, expect: str) -> float:
|
||||
if result is None:
|
||||
return 0.
|
||||
@@ -81,6 +83,7 @@ def diff_text_file(result: str, expect: str) -> float:
|
||||
expected_lines: List[str] = f.read().splitlines()
|
||||
return difflib.SequenceMatcher(a=result_lines, b=expected_lines).ratio()
|
||||
|
||||
|
||||
def fuzzy_match(result, rules) -> float:
|
||||
expect = rules["expected"]
|
||||
|
||||
@@ -88,7 +91,7 @@ def fuzzy_match(result, rules) -> float:
|
||||
|
||||
|
||||
def fuzzy_place_math(result_file_path, rules) -> float:
|
||||
expect = rules["expected"] # a list of possible answers
|
||||
expect = rules["expected"] # a list of possible answers
|
||||
# read list.docx, and get all texts out, overlook blank lines, remove blanks before and after each line
|
||||
doc = Document(result_file_path)
|
||||
words_list = []
|
||||
@@ -102,6 +105,7 @@ def fuzzy_place_math(result_file_path, rules) -> float:
|
||||
return 0
|
||||
return 1
|
||||
|
||||
|
||||
def check_csv(result: str, rules: Dict[str, List[Dict[str, str]]]) -> float:
|
||||
"""
|
||||
Args:
|
||||
@@ -209,10 +213,10 @@ def check_accessibility_tree(result: str, rules: List[Dict[str, Any]]) -> float:
|
||||
return 0.
|
||||
|
||||
if "text" in r:
|
||||
match_func: Callable[[str], Number] = functools.partial( operator.eq if r["exact"] \
|
||||
else (lambda a, b: fuzz.ratio(a, b) / 100.)
|
||||
, r["text"]
|
||||
)
|
||||
match_func: Callable[[str], Number] = functools.partial(operator.eq if r["exact"] \
|
||||
else (lambda a, b: fuzz.ratio(a, b) / 100.)
|
||||
, r["text"]
|
||||
)
|
||||
match_score: Number = 0
|
||||
for elm in elements:
|
||||
match_score = max(match_score, match_func(elm.text or None))
|
||||
@@ -285,7 +289,7 @@ def check_json(result: str, rules: Dict[str, List[Dict[str, Union[List[str], str
|
||||
return float(metric)
|
||||
|
||||
|
||||
def check_direct_json_object(result, rules)->float:
|
||||
def check_direct_json_object(result, rules) -> float:
|
||||
"""
|
||||
One of the most commonly used functions to evaluate.
|
||||
Compare two json objects directly.
|
||||
@@ -320,7 +324,11 @@ def check_direct_json_object(result, rules)->float:
|
||||
return 0.
|
||||
return 1.0
|
||||
|
||||
|
||||
def compare_time_in_speedtest_results(speedtest_result_path, time_diff):
|
||||
if not speedtest_result_path:
|
||||
return 0
|
||||
|
||||
# open the speedtest results file(csv)
|
||||
date_col = None
|
||||
with open(speedtest_result_path, 'r') as f:
|
||||
@@ -333,12 +341,17 @@ def compare_time_in_speedtest_results(speedtest_result_path, time_diff):
|
||||
for date in reader[date_col]:
|
||||
date_time = date[-5:]
|
||||
# compare the date time with the current date time, if time diff less than time_diff para, then return true
|
||||
if not abs((datetime.datetime.strptime(date_time, '%H:%M') - datetime.datetime.strptime(now_date_time, '%H:%M')).total_seconds()) / 60 < int(time_diff):
|
||||
return False
|
||||
return True
|
||||
if not abs((datetime.datetime.strptime(date_time, '%H:%M') - datetime.datetime.strptime(now_date_time,
|
||||
'%H:%M')).total_seconds()) / 60 < int(
|
||||
time_diff):
|
||||
return 0
|
||||
return 1
|
||||
|
||||
|
||||
def is_included_all_json_objects(gold_file_path, result_file_path):
|
||||
if not gold_file_path or not result_file_path:
|
||||
return 0
|
||||
|
||||
print("gold_file_path: ")
|
||||
print(gold_file_path)
|
||||
print("result_file_path: ")
|
||||
@@ -350,8 +363,8 @@ def is_included_all_json_objects(gold_file_path, result_file_path):
|
||||
result_json = json.load(fr)
|
||||
for key in gold_json.keys():
|
||||
if key not in result_json.keys() or gold_json[key] != result_json[key]:
|
||||
return False
|
||||
return True
|
||||
return 0
|
||||
return 1
|
||||
|
||||
|
||||
def is_gold_text_included_in_pdf(pdf_file_path, gold_text_path):
|
||||
@@ -373,31 +386,32 @@ def is_gold_text_included_in_pdf(pdf_file_path, gold_text_path):
|
||||
if len(false_list) > 0:
|
||||
print("false_list: ")
|
||||
print(false_list)
|
||||
return False
|
||||
return 0
|
||||
else:
|
||||
return True
|
||||
return 1
|
||||
|
||||
|
||||
def file_contains(file_path, config):
    """Check that a text file contains every expected substring.

    Args:
        file_path: path of the text file to read (the original comment says
            it ends with ``.txt``); a falsy value means there is no file.
        config: mapping whose ``"expected"`` entry is an iterable of
            substrings that must all occur in the file content.

    Returns:
        1 if every expected substring occurs in the file, otherwise 0.
        A falsy ``file_path`` returns 0.
    """
    # A missing result file cannot contain the expected text, so it scores 0
    if not file_path:
        return 0
    with open(file_path, 'r') as f:
        file_text = f.read()
    return 1 if all(text in file_text for text in config["expected"]) else 0
|
||||
|
||||
|
||||
def check_csv_line_number(file_path, line_number):
    """Check that a CSV file has the expected number of rows.

    Args:
        file_path: path of the file to inspect; it must end with ``.csv``.
        line_number: mapping whose ``"expected"`` entry is the required row
            count (anything ``int()`` accepts).

    Returns:
        1 if the number of rows parsed by ``csv.reader`` equals the expected
        count, otherwise 0. A falsy path or a non-``.csv`` suffix returns 0.
    """
    # Check the path is present and has the .csv suffix
    if not file_path or not file_path.endswith('.csv'):
        return 0
    # newline='' lets the csv module handle line endings itself
    with open(file_path, 'r', newline='') as f:
        line_count = sum(1 for _ in csv.reader(f))
    return 1 if line_count == int(line_number["expected"]) else 0
|
||||
|
||||
|
||||
def compare_terminal_and_txt(txt_file_path, terminal_output):
|
||||
@@ -405,7 +419,7 @@ def compare_terminal_and_txt(txt_file_path, terminal_output):
|
||||
with open(txt_file_path, 'r') as f:
|
||||
txt_file_content = f.read()
|
||||
# compare terminal output with txt file content
|
||||
return True if terminal_output == txt_file_content else False
|
||||
return 1 if terminal_output == txt_file_content else 0
|
||||
|
||||
|
||||
def compare_python_pure_text(py_file_path, gold_file_path):
|
||||
@@ -414,13 +428,18 @@ def compare_python_pure_text(py_file_path, gold_file_path):
|
||||
print(py_file_path)
|
||||
print("gold_file_path: ")
|
||||
print(gold_file_path)
|
||||
|
||||
# gold_file_path = gold_file_path.replace('.txt', '.py')
|
||||
def remove_whitespace(text):
|
||||
return ''.join(text.split())
|
||||
|
||||
with open(py_file_path, 'r') as file1:
|
||||
content1 = file1.read()
|
||||
with open(gold_file_path, 'r') as file2:
|
||||
content2 = file2.read()
|
||||
content1_no_whitespace = remove_whitespace(content1)
|
||||
content2_no_whitespace = remove_whitespace(content2)
|
||||
return content1_no_whitespace == content2_no_whitespace
|
||||
if content1_no_whitespace == content2_no_whitespace:
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
||||
Reference in New Issue
Block a user