fix(runner): pass result_dir to evaluate and re-enable environment reset

2026-02-05 16:55:49 +08:00
parent ad46acc5f3
commit 63484c7b7b
1 changed file with 8 additions and 9 deletions
--- a/lib_run_single.py
+++ b/lib_run_single.py
@@ -13,8 +13,8 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
    runtime_logger = setup_logger(example, example_result_dir)

    # Reset environment first to get fresh VM IP
-    # env.reset(task_config=example)
-    # logger.info("=======Environment reset completed=======")
+    env.reset(task_config=example)
+    logger.info("=======Environment reset completed=======")

    # # Reset agent with fresh VM IP (for snapshot reverts)
    # try:
@@ -74,8 +74,7 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
                break
        step_idx += 1
    time.sleep(20) # Wait for the environment to settle
-    result = env.evaluate()
-    logger.info("Result: %.2f", result)
+    result = env.evaluate(result_dir=example_result_dir)
    scores.append(result)
    with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
        f.write(f"{result}\n")
@@ -112,7 +111,7 @@ def run_single_example_human(env, example, max_steps, instruction, args, example
        f.write("\n")
    
    # Evaluate the result
-    result = env.evaluate()
+    result = env.evaluate(result_dir=example_result_dir)
    logger.info("Result: %.2f", result)
    scores.append(result)
    with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
@@ -549,7 +548,7 @@ def run_single_example_os_symphony(agent, env, example, max_steps, instruction,
                break
        step_idx += 1
    end_time = time.time()
-    result = float(env.evaluate())
+    result = float(env.evaluate(result_dir=example_result_dir))
    logger.info("Result: %.2f", result)
    scores.append(result)
    with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
@@ -647,7 +646,7 @@ def run_single_example_evocua(agent, env, example, max_steps, instruction, args,
        step_idx += 1
        
    time.sleep(20) # Wait for environment to settle
-    result = env.evaluate()
+    result = env.evaluate(result_dir=example_result_dir)
    logger.info("Result: %.2f", result)
    scores.append(result)