fix(eval): 修复vllm_eval截图排序bug并对齐reeval逻辑
- 修复_load_screenshots_from_dir中截图按字符串排序导致step_9被误判为最终帧的bug,改为数字排序
- 对齐reeval.py的prompt逻辑:明确要求模型优先检查最终截图(STEP 1 EXAMINE FINAL SCREENSHOT FIRST)
- 评估temperature从0.7降至0.2提升一致性
- 新增batch_reeval.py:基于test_final.json批量重评测已有轨迹
- 新增reeval.py:单任务重评测脚本(final-frame-anchored evaluation)
- test_final.json新增avogadro(11题)和origin(8题)
This commit is contained in:
@@ -1,59 +0,0 @@
|
||||
{
|
||||
"id": "Origin_User_Guide_2025b_E_task1",
|
||||
"snapshot": "origin",
|
||||
"instruction": "在 Origin 中通过 Data → Connect to File 导入一个本地 Excel 文件 example.xlsx",
|
||||
"source": "custom",
|
||||
"config": [
|
||||
{
|
||||
"type": "upload_file",
|
||||
"parameters": {
|
||||
"files": [
|
||||
{
|
||||
"local_path": "evaluation_examples/data/origin/example.xlsx",
|
||||
"path": "C:\\Users\\user\\Desktop\\example.xlsx"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "launch",
|
||||
"parameters": {
|
||||
"command": [
|
||||
"C:\\Program Files\\OriginLab\\Origin2025b\\Origin64.exe",
|
||||
"C:\\Users\\user\\Desktop\\example.xlsx"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "sleep",
|
||||
"parameters": {
|
||||
"seconds": 5
|
||||
}
|
||||
}
|
||||
],
|
||||
"trajectory": "trajectories/",
|
||||
"related_apps": [
|
||||
"origin"
|
||||
],
|
||||
"evaluator": {
|
||||
"postconfig": [
|
||||
{
|
||||
"type": "sleep",
|
||||
"parameters": {
|
||||
"seconds": 3
|
||||
}
|
||||
}
|
||||
],
|
||||
"func": "vllm_eval"
|
||||
},
|
||||
"proxy": false,
|
||||
"fixed_ip": false,
|
||||
"possibility_of_env_change": "low",
|
||||
"metadata": {
|
||||
"input_files": [
|
||||
"example.xlsx"
|
||||
],
|
||||
"steps": "1. 单击顶部主菜单栏中的 \"Data\" 菜单。\n2. 在展开的下拉菜单中,将鼠标悬停或单击 \"Connect to File\" 菜单项以展开子菜单。\n3. 在展开的子菜单中,单击选中 \"Excel...\" 选项。\n4. 在弹出的文件选择对话框中,单击选中文件名输入框将光标定位至此。\n5. 在文件名输入框中,输入文字 \"example.xlsx\"。\n6. 单击对话框右下角的 \"Open\"(或\"打开\")按钮。 \n7. 单击新弹出对话框中的 \"OK\" 按钮。",
|
||||
"steps_original": "1. 在 Origin 的主菜单中选择 Data → Connect to File。\n2. 点击 Connect to File 菜单中的按钮。\n3. 选择文件 example.xlsx 并点击 Open。\n4. 数据将被加载到当前的工作表中。"
|
||||
}
|
||||
}
|
||||
@@ -26,7 +26,7 @@
|
||||
],
|
||||
"origin": [
|
||||
"Origin_User_Guide_2025b_E_task2",
|
||||
"Origin_User_Guide_2025b_E_task3",
|
||||
"Origin_User_Guide_2025b_E_task3",
|
||||
"Origin_User_Guide_2025b_E_task4",
|
||||
"Origin_User_Guide_2025b_E_task5",
|
||||
"Origin_User_Guide_2025b_E_task8",
|
||||
@@ -70,4 +70,4 @@
|
||||
"viewports_task10",
|
||||
"viewports_task11"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user