fix(runner): pass result_dir to evaluate and re-enable environment reset
This commit is contained in:
@@ -13,17 +13,17 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
|
||||
runtime_logger = setup_logger(example, example_result_dir)
|
||||
|
||||
# Reset environment first to get fresh VM IP
|
||||
# env.reset(task_config=example)
|
||||
# logger.info("=======Environment reset completed=======")
|
||||
env.reset(task_config=example)
|
||||
logger.info("=======Environment reset completed=======")
|
||||
|
||||
# # Reset agent with fresh VM IP (for snapshot reverts)
|
||||
# try:
|
||||
# agent.reset(runtime_logger, vm_ip=env.vm_ip)
|
||||
# except Exception as e:
|
||||
# agent.reset(vm_ip=env.vm_ip)
|
||||
|
||||
|
||||
# time.sleep(10) # Wait for the environment to be ready
|
||||
|
||||
|
||||
# get initial observation
|
||||
logger.info("Getting initial observation...")
|
||||
obs = env._get_obs() # Get the initial observation
|
||||
@@ -74,8 +74,7 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
|
||||
break
|
||||
step_idx += 1
|
||||
time.sleep(20) # Wait for the environment to settle
|
||||
result = env.evaluate()
|
||||
logger.info("Result: %.2f", result)
|
||||
result = env.evaluate(result_dir=example_result_dir)
|
||||
scores.append(result)
|
||||
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(f"{result}\n")
|
||||
@@ -112,7 +111,7 @@ def run_single_example_human(env, example, max_steps, instruction, args, example
|
||||
f.write("\n")
|
||||
|
||||
# Evaluate the result
|
||||
result = env.evaluate()
|
||||
result = env.evaluate(result_dir=example_result_dir)
|
||||
logger.info("Result: %.2f", result)
|
||||
scores.append(result)
|
||||
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
||||
@@ -549,7 +548,7 @@ def run_single_example_os_symphony(agent, env, example, max_steps, instruction,
|
||||
break
|
||||
step_idx += 1
|
||||
end_time = time.time()
|
||||
result = float(env.evaluate())
|
||||
result = float(env.evaluate(result_dir=example_result_dir))
|
||||
logger.info("Result: %.2f", result)
|
||||
scores.append(result)
|
||||
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
||||
@@ -647,7 +646,7 @@ def run_single_example_evocua(agent, env, example, max_steps, instruction, args,
|
||||
step_idx += 1
|
||||
|
||||
time.sleep(20) # Wait for environment to settle
|
||||
result = env.evaluate()
|
||||
result = env.evaluate(result_dir=example_result_dir)
|
||||
logger.info("Result: %.2f", result)
|
||||
scores.append(result)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user