From 63484c7b7b07a0460d3ae7cd3d1f0bc6e570f272 Mon Sep 17 00:00:00 2001
From: cui0711 <1729461967@qq.com>
Date: Thu, 5 Feb 2026 16:55:49 +0800
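Subject: [PATCH] fix(runner): pass result_dir to evaluate and re-enable
 environment reset

Re-enable env.reset(task_config=example) and its completion log message
at the start of run_single_example, and pass
result_dir=example_result_dir to env.evaluate() in run_single_example,
run_single_example_human, run_single_example_os_symphony and
run_single_example_evocua.

Note: this also drops the logger.info("Result: %.2f", result) call that
followed evaluate() in run_single_example; the other three runners still
log the result. Two blank lines in run_single_example are also
re-written (whitespace only).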
---
 lib_run_single.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/lib_run_single.py b/lib_run_single.py
index 2262ea5..591e39e 100644
--- a/lib_run_single.py
+++ b/lib_run_single.py
@@ -13,17 +13,17 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
     runtime_logger = setup_logger(example, example_result_dir)
 
     # Reset environment first to get fresh VM IP
-    # env.reset(task_config=example)
-    # logger.info("=======Environment reset completed=======")
+    env.reset(task_config=example)
+    logger.info("=======Environment reset completed=======")
 
     # # Reset agent with fresh VM IP (for snapshot reverts)
     # try:
     #     agent.reset(runtime_logger, vm_ip=env.vm_ip)
     # except Exception as e:
     #     agent.reset(vm_ip=env.vm_ip)
-    
+
     # time.sleep(10) # Wait for the environment to be ready
-    
+
     # get initial observation
     logger.info("Getting initial observation...")
     obs = env._get_obs() # Get the initial observation
@@ -74,8 +74,7 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
                 break
         step_idx += 1
     time.sleep(20) # Wait for the environment to settle
-    result = env.evaluate()
-    logger.info("Result: %.2f", result)
+    result = env.evaluate(result_dir=example_result_dir)
     scores.append(result)
     with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
         f.write(f"{result}\n")
@@ -112,7 +111,7 @@ def run_single_example_human(env, example, max_steps, instruction, args, example
         f.write("\n")
     
     # Evaluate the result
-    result = env.evaluate()
+    result = env.evaluate(result_dir=example_result_dir)
     logger.info("Result: %.2f", result)
     scores.append(result)
     with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
@@ -549,7 +548,7 @@ def run_single_example_os_symphony(agent, env, example, max_steps, instruction,
                 break
         step_idx += 1
     end_time = time.time()
-    result = float(env.evaluate())
+    result = float(env.evaluate(result_dir=example_result_dir))
     logger.info("Result: %.2f", result)
     scores.append(result)
     with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
@@ -647,7 +646,7 @@ def run_single_example_evocua(agent, env, example, max_steps, instruction, args,
         step_idx += 1
         
     time.sleep(20) # Wait for environment to settle
-    result = env.evaluate()
+    result = env.evaluate(result_dir=example_result_dir)
     logger.info("Result: %.2f", result)
     scores.append(result)