Robust Evaluation, Blocking File Open, Grader Sensitivity, and LibreOffice Writer Fixes (#217)

* Refactor evaluator structure in LibreOffice Writer example JSON to support multiple expected and result files, enhancing evaluation flexibility.

* Update instance type to t3.large and add VNC access URL logging for allocated VMs, enhancing remote access capabilities.

* Update instance type to t3.large and add VNC access URL logging for allocated VMs, enhancing remote access capabilities.

* Update time format in get_vm_file function to include hours, minutes, and seconds for more precise file naming with time suffix.

* Add more delay for 936321ce-5236-426a-9a20-e0e3c5dc536f; support one more potential solution.

* Enhance SetupController with a configurable retry limit and improved error handling for file-opening requests. Introduce a new function to compare unique train records, and update logging for better debugging. Adjust JSON examples for evaluation to support multiple expected and result files.

* Clean up debug code

---------

Co-authored-by: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
This commit is contained in:
Tianbao Xie
2025-06-16 21:37:19 +08:00
committed by GitHub
parent 347238e17e
commit 4e11eafd1d
13 changed files with 523 additions and 135 deletions

View File

@@ -27,17 +27,57 @@
"libreoffice_writer"
],
"evaluator": {
"func": "compare_pdfs",
"expected": {
"func": [
"compare_pdfs",
"compare_pdfs",
"compare_pdfs",
"compare_pdfs"
],
"conj": "or",
"expected": [
{
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/4bcb1253-a636-4df4-8cb0-a35c04dfef31/View_Person_Organizational_Summary.pdf",
"dest": "Constitution_Template_With_Guidelines_Gold_1.pdf"
},
{
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/4bcb1253-a636-4df4-8cb0-a35c04dfef31/View_Person_Organizational_Summary.pdf",
"dest": "Constitution_Template_With_Guidelines_Gold_2.pdf"
},
{
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/4bcb1253-a636-4df4-8cb0-a35c04dfef31/View_Person_Organizational_Summary.pdf",
"dest": "Constitution_Template_With_Guidelines_Gold.pdf"
"dest": "Constitution_Template_With_Guidelines_Gold_3.pdf"
},
"result": {
{
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/4bcb1253-a636-4df4-8cb0-a35c04dfef31/View_Person_Organizational_Summary.pdf",
"dest": "Constitution_Template_With_Guidelines_Gold_4.pdf"
}
],
"result": [
{
"type": "vm_file",
"path": "/home/user/Desktop/View_Person_Organizational_Summary.pdf",
"dest": "Constitution_Template_With_Guidelines.pdf"
}
"dest": "Constitution_Template_With_Guidelines_1.pdf"
},
{
"type": "vm_file",
"path": "/home/user/Documents/View_Person_Organizational_Summary.pdf",
"dest": "Constitution_Template_With_Guidelines_2.pdf"
},
{
"type": "vm_file",
"path": "/home/user/Downloads/View_Person_Organizational_Summary.pdf",
"dest": "Constitution_Template_With_Guidelines_3.pdf"
},
{
"type": "vm_file",
"path": "/home/user/View_Person_Organizational_Summary.pdf",
"dest": "Constitution_Template_With_Guidelines_4.pdf"
}
]
},
"proxy": false
}

View File

@@ -38,7 +38,7 @@
"command": [
"python",
"-c",
"import pyautogui; import time; time.sleep(5); pyautogui.press(\"down\", presses=8, interval=0.01); time.sleep(1); pyautogui.scroll(-2)"
"import pyautogui; import time; time.sleep(15); pyautogui.press(\"down\", presses=8, interval=0.01); time.sleep(1); pyautogui.scroll(-2)"
]
}
}
@@ -68,12 +68,12 @@
"command": [
"python",
"-c",
"import pyautogui; import time; pyautogui.hotkey('ctrl', 's'); time.sleep(0.5); pyautogui.press('down'); time.sleep(0.5); pyautogui.press('enter');"
"import pyautogui; import time; time.sleep(1); pyautogui.hotkey('ctrl', 's'); time.sleep(3);"
]
}
}
],
"func": "compare_contains_image",
"func": "compare_docx_images",
"result": {
"type": "vm_file",
"path": "/home/user/Desktop/Viewing_Your_Class_Schedule_and_Textbooks.docx",

View File

@@ -52,7 +52,7 @@
}
}
],
"func": "compare_docx_lines",
"func": "compare_unique_train_records",
"result": {
"type": "vm_file",
"path": "/home/user/Desktop/HK_train_record.docx",
@@ -60,8 +60,16 @@
},
"expected": {
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/6f81754e-285d-4ce0-b59e-af7edb02d108/HK_train_record_Gold.docx",
"dest": "HK_train_record_Gold.docx"
"path": [
"https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/6f81754e-285d-4ce0-b59e-af7edb02d108/HK_train_record_Gold.docx",
"https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/6f81754e-285d-4ce0-b59e-af7edb02d108/HK_train_record.docx"
],
"dest": [
"HK_train_record_Gold.docx",
"HK_train_record_Original.docx"
],
"multi": true,
"gives": [0, 1]
}
},
"proxy": false

View File

@@ -52,20 +52,57 @@
}
}
],
"func": "compare_docx_files",
"expected": {
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/88fe4b2d-3040-4c70-9a70-546a47764b48/CCCH9003_Tutorial_guidelines_Gold.docx",
"dest": "CCCH9003_Tutorial_guidelines_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "/home/user/Desktop/CCCH9003_Tutorial_guidelines.docx",
"dest": "CCCH9003_Tutorial_guidelines.docx"
},
"options": {
"ignore_blanks": false
}
"func": [
"compare_docx_files",
"compare_docx_files",
"compare_docx_files"
],
"conj": "or",
"expected": [
{
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/88fe4b2d-3040-4c70-9a70-546a47764b48/CCCH9003_Tutorial_guidelines_Gold_1.docx",
"dest": "CCCH9003_Tutorial_guidelines_Gold_1.docx"
},
{
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/88fe4b2d-3040-4c70-9a70-546a47764b48/CCCH9003_Tutorial_guidelines_Gold_2.docx",
"dest": "CCCH9003_Tutorial_guidelines_Gold_2.docx"
},
{
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/88fe4b2d-3040-4c70-9a70-546a47764b48/CCCH9003_Tutorial_guidelines_Gold_3.docx",
"dest": "CCCH9003_Tutorial_guidelines_Gold_3.docx"
}
],
"result": [
{
"type": "vm_file",
"path": "/home/user/Desktop/CCCH9003_Tutorial_guidelines.docx",
"dest": "CCCH9003_Tutorial_guidelines.docx"
},
{
"type": "vm_file",
"path": "/home/user/Desktop/CCCH9003_Tutorial_guidelines.docx",
"dest": "CCCH9003_Tutorial_guidelines.docx"
},
{
"type": "vm_file",
"path": "/home/user/Desktop/CCCH9003_Tutorial_guidelines.docx",
"dest": "CCCH9003_Tutorial_guidelines.docx"
}
],
"options": [
{
"ignore_blanks": false
},
{
"ignore_blanks": false
},
{
"ignore_blanks": false
}
]
},
"proxy": false
}

View File

@@ -47,22 +47,40 @@
"command": [
"python",
"-c",
"import pyautogui; import time; pyautogui.hotkey('ctrl', 's'); time.sleep(0.5); "
"import pyautogui; import time; pyautogui.hotkey('ctrl', 's'); time.sleep(2); "
]
}
}
],
"func": "compare_docx_tables",
"expected": {
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/936321ce-5236-426a-9a20-e0e3c5dc536f/Graphemes_Sound_Letter_Patterns_Gold.docx",
"dest": "Graphemes_Sound_Letter_Patterns_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "/home/user/Desktop/Graphemes_Sound_Letter_Patterns.docx",
"dest": "Graphemes_Sound_Letter_Patterns.docx"
}
"func": [
"compare_docx_tables",
"compare_docx_tables"
],
"conj": "or",
"expected": [
{
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/936321ce-5236-426a-9a20-e0e3c5dc536f/Graphemes_Sound_Letter_Patterns_Gold.docx",
"dest": "Graphemes_Sound_Letter_Patterns_Gold.docx"
},
{
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/936321ce-5236-426a-9a20-e0e3c5dc536f/Graphemes_Sound_Letter_Patterns_Gold_2.docx",
"dest": "Graphemes_Sound_Letter_Patterns_Gold_2.docx"
}
],
"result": [
{
"type": "vm_file",
"path": "/home/user/Desktop/Graphemes_Sound_Letter_Patterns.docx",
"dest": "Graphemes_Sound_Letter_Patterns.docx"
},
{
"type": "vm_file",
"path": "/home/user/Desktop/Graphemes_Sound_Letter_Patterns.docx",
"dest": "Graphemes_Sound_Letter_Patterns.docx"
}
]
},
"proxy": false
}