Robust Evaluation, Blocking File Open, Grader Sensitivity, and LibreOffice Writer Fixes (#217)

* Refactor evaluator structure in LibreOffice Writer example JSON to support multiple expected and result files, enhancing evaluation flexibility.

* Update instance type to t3.large and add VNC access URL logging for allocated VMs, enhancing remote access capabilities.

* Update instance type to t3.large and add VNC access URL logging for allocated VMs, enhancing remote access capabilities.

* Update time format in get_vm_file function to include hours, minutes, and seconds for more precise file naming with time suffix.

* Add more delay for 936321ce-5236-426a-9a20-e0e3c5dc536f; support one more potential solution.

* Enhance SetupController with configurable retry limit and improved error handling for file opening requests. Introduce new function to compare unique training records, and update logging for better debugging. Adjust JSON examples for evaluation to support multiple expected and result files.

* Clean debug code

---------

Co-authored-by: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
This commit is contained in:
Tianbao Xie
2025-06-16 21:37:19 +08:00
committed by GitHub
parent 347238e17e
commit 4e11eafd1d
13 changed files with 523 additions and 135 deletions

View File

@@ -80,18 +80,16 @@ def get_vm_file(env, config: Dict[str, Any]) -> Union[Optional[str], List[Option
returned.
only support for single file now:
time_suffix(bool): optional. defaults to False. if True, append the current time in required format.
time_format(str): optional. defaults to "%Y_%m_%d". format of the time suffix.
time_format(str): optional. defaults to "%Y%m%d_%H%M%S". format of the time suffix.
"""
time_format = "%Y_%m_%d"
time_format = "%Y%m%d_%H%M%S"
if not config.get("multi", False):
paths: List[str] = [config["path"]]
dests: List[str] = [config["dest"]]
if "time_suffix" in config.keys() and config["time_suffix"]:
if "time_format" in config.keys():
time_format = config["time_format"]
# Insert time before . in file type suffix
paths = [p.split(".")[0] + datetime.now().strftime(time_format) + "." + p.split(".")[1] if "." in p else p for p in paths]
dests = [d.split(".")[0] + datetime.now().strftime(time_format) + "." + d.split(".")[1] if "." in d else d for d in dests]
if config.get("time_suffix", False):
time_format = config.get("time_format", time_format)
# Insert time before file extension.
dests = [f"{os.path.splitext(d)[0]}_{datetime.now().strftime(time_format)}{os.path.splitext(d)[1]}" for d in dests]
else:
paths: List[str] = config["path"]
dests: List[str] = config["dest"]

View File

@@ -52,7 +52,8 @@ from .docs import (
compare_docx_files_and_ignore_new_lines,
compare_docx_images,
compare_image_text,
compare_references
compare_references,
compare_unique_train_records
)
from .general import (
check_csv,

View File

@@ -167,8 +167,12 @@ def compare_docx_files(file1, file2, **options):
if ignore_case:
p1, p2 = p1.lower(), p2.lower()
if p1 != p2:
print(p1)
print(p2)
# show the difference
print("=== First Paragraph ===")
print(f"\033[92m{repr(p1)}\033[0m") # Green color for p1, repr() shows hidden chars
print("=== Second Paragraph ===")
print(f"\033[91m{repr(p2)}\033[0m") # Red color for p2, repr() shows hidden chars
print("=" * 50) # Clear boundary
return 0
return 1
@@ -886,3 +890,72 @@ def compare_references(file1, file2, **options):
return (result - reference_base_result) / (1 - reference_base_result)
else:
return 0
def _read_lines_and_train_ids(file_path):
    """Return ``(lines, train_ids)`` parsed from the document at *file_path*.

    ``lines`` is the stripped text of every non-empty paragraph, in document
    order. ``train_ids`` is the stripped second comma-separated field of each
    line that splits into exactly four fields; lines with any other field
    count contribute no id.

    Returns ``(None, None)`` and logs an error if the document cannot be
    opened or parsed.
    """
    try:
        doc = Document(file_path)
        # Strip each paragraph once, then drop the ones that are empty.
        stripped = (p.text.strip() for p in doc.paragraphs)
        lines = [text for text in stripped if text]
        # Split each line once and keep only the four-field records.
        split_lines = (line.split(",") for line in lines)
        train_ids = [fields[1].strip() for fields in split_lines if len(fields) == 4]
        return lines, train_ids
    except Exception as e:
        # Deliberately broad: a grader must score 0 on an unreadable file
        # rather than crash the evaluation run.
        logger.error(f"Error opening or parsing file {file_path}: {e}")
        return None, None


def compare_unique_train_records(processed_file, expected_files, **kwargs):
    """Score a processed document against a gold file and the initial file.

    Args:
        processed_file: Path of the document produced by the task.
        expected_files: List with at least two entries;
            ``expected_files[0]`` is the gold standard file and
            ``expected_files[1]`` is the initial file.
        **kwargs: Accepted for interface compatibility; only logged.

    Returns:
        1 if every check below passes, otherwise 0:

        1. every non-empty line of the processed file also appears in the
           initial file;
        2. the train ids parsed from the processed file contain no duplicates;
        3. the set of processed train ids equals the set of gold train ids;
        4. the processed and gold files have the same number of non-empty
           lines.

        0 is also returned when the arguments are invalid or any of the three
        files cannot be opened or parsed.
    """
    # Argument dumps are diagnostic only, so they go to the debug level and
    # use lazy %-formatting instead of being rendered on every call.
    logger.debug("DEBUG: processed_file type: %s, value: %s", type(processed_file), processed_file)
    logger.debug("DEBUG: expected_files type: %s, value: %s", type(expected_files), expected_files)
    logger.debug("DEBUG: kwargs: %s", kwargs)

    if not processed_file or not isinstance(expected_files, list) or len(expected_files) < 2:
        logger.error("Invalid arguments: processed_file and a list of 2 expected_files are required.")
        return 0

    gold_file = expected_files[0]
    initial_file = expected_files[1]
    if not gold_file or not initial_file:
        logger.error("Gold file or initial file path is missing from expected_files list.")
        return 0

    # Load all three documents; any unreadable file fails the comparison.
    processed_lines, processed_train_ids = _read_lines_and_train_ids(processed_file)
    if processed_lines is None:
        return 0
    gold_lines, gold_train_ids = _read_lines_and_train_ids(gold_file)
    if gold_lines is None:
        return 0
    initial_lines, _ = _read_lines_and_train_ids(initial_file)
    if initial_lines is None:
        return 0

    initial_lines_set = set(initial_lines)
    processed_lines_set = set(processed_lines)

    # 1. Subset check: every processed line must come from the initial file.
    if not processed_lines_set.issubset(initial_lines_set):
        logger.error("Processed file contains lines not present in the initial file.")
        logger.error(f"Extra lines: {processed_lines_set - initial_lines_set}")
        return 0

    # 2. Uniqueness check: no train id may appear twice in the processed file.
    processed_id_set = set(processed_train_ids)
    if len(processed_train_ids) != len(processed_id_set):
        logger.error("Duplicate train_ids found in the processed file.")
        return 0

    # 3. Correctness check: the surviving ids must be exactly the gold ids.
    if processed_id_set != set(gold_train_ids):
        logger.error("Set of train_ids does not match between processed file and gold file.")
        return 0

    # 4. Line count check: guards against extra or missing lines that carry
    #    no parseable train id and so slipped past the id checks.
    if len(processed_lines) != len(gold_lines):
        logger.error("Number of lines does not match between processed file and gold file.")
        return 0

    return 1