fix(runner): pass result_dir to evaluate and re-enable environment reset

This commit is contained in:
cui0711
2026-02-05 16:55:49 +08:00
parent ad46acc5f3
commit 63484c7b7b

View File

@@ -13,17 +13,17 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
runtime_logger = setup_logger(example, example_result_dir)
# Reset environment first to get fresh VM IP
-# env.reset(task_config=example)
-# logger.info("=======Environment reset completed=======")
+env.reset(task_config=example)
+logger.info("=======Environment reset completed=======")
# # Reset agent with fresh VM IP (for snapshot reverts)
# try:
# agent.reset(runtime_logger, vm_ip=env.vm_ip)
# except Exception as e:
# agent.reset(vm_ip=env.vm_ip)
# time.sleep(10) # Wait for the environment to be ready
# get initial observation
logger.info("Getting initial observation...")
obs = env._get_obs() # Get the initial observation
@@ -74,8 +74,7 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
break
step_idx += 1
time.sleep(20) # Wait for the environment to settle
-result = env.evaluate()
-logger.info("Result: %.2f", result)
+result = env.evaluate(result_dir=example_result_dir)
scores.append(result)
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
f.write(f"{result}\n")
@@ -112,7 +111,7 @@ def run_single_example_human(env, example, max_steps, instruction, args, example
f.write("\n")
# Evaluate the result
-result = env.evaluate()
+result = env.evaluate(result_dir=example_result_dir)
logger.info("Result: %.2f", result)
scores.append(result)
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
@@ -549,7 +548,7 @@ def run_single_example_os_symphony(agent, env, example, max_steps, instruction,
break
step_idx += 1
end_time = time.time()
-result = float(env.evaluate())
+result = float(env.evaluate(result_dir=example_result_dir))
logger.info("Result: %.2f", result)
scores.append(result)
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
@@ -647,7 +646,7 @@ def run_single_example_evocua(agent, env, example, max_steps, instruction, args,
step_idx += 1
time.sleep(20) # Wait for environment to settle
-result = env.evaluate()
+result = env.evaluate(result_dir=example_result_dir)
logger.info("Result: %.2f", result)
scores.append(result)