fix(runner): pass result_dir to evaluate and re-enable environment reset
This commit is contained in:
@@ -13,8 +13,8 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
|
||||
runtime_logger = setup_logger(example, example_result_dir)
|
||||
|
||||
# Reset environment first to get fresh VM IP
|
||||
- # env.reset(task_config=example)
|
||||
- # logger.info("=======Environment reset completed=======")
|
||||
+ env.reset(task_config=example)
|
||||
+ logger.info("=======Environment reset completed=======")
|
||||
|
||||
# # Reset agent with fresh VM IP (for snapshot reverts)
|
||||
# try:
|
||||
@@ -74,8 +74,7 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
|
||||
break
|
||||
step_idx += 1
|
||||
time.sleep(20) # Wait for the environment to settle
|
||||
- result = env.evaluate()
|
||||
- logger.info("Result: %.2f", result)
|
||||
+ result = env.evaluate(result_dir=example_result_dir)
|
||||
scores.append(result)
|
||||
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(f"{result}\n")
|
||||
@@ -112,7 +111,7 @@ def run_single_example_human(env, example, max_steps, instruction, args, example
|
||||
f.write("\n")
|
||||
|
||||
# Evaluate the result
|
||||
- result = env.evaluate()
|
||||
+ result = env.evaluate(result_dir=example_result_dir)
|
||||
logger.info("Result: %.2f", result)
|
||||
scores.append(result)
|
||||
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
||||
@@ -549,7 +548,7 @@ def run_single_example_os_symphony(agent, env, example, max_steps, instruction,
|
||||
break
|
||||
step_idx += 1
|
||||
end_time = time.time()
|
||||
- result = float(env.evaluate())
|
||||
+ result = float(env.evaluate(result_dir=example_result_dir))
|
||||
logger.info("Result: %.2f", result)
|
||||
scores.append(result)
|
||||
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
||||
@@ -647,7 +646,7 @@ def run_single_example_evocua(agent, env, example, max_steps, instruction, args,
|
||||
step_idx += 1
|
||||
|
||||
time.sleep(20) # Wait for environment to settle
|
||||
- result = env.evaluate()
|
||||
+ result = env.evaluate(result_dir=example_result_dir)
|
||||
logger.info("Result: %.2f", result)
|
||||
scores.append(result)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user