fix(runner): pass result_dir to env.evaluate(), re-enable env.reset(), and drop the result log line in run_single_example
This commit is contained in:
@@ -13,8 +13,8 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
|
|||||||
runtime_logger = setup_logger(example, example_result_dir)
|
runtime_logger = setup_logger(example, example_result_dir)
|
||||||
|
|
||||||
# Reset environment first to get fresh VM IP
|
# Reset environment first to get fresh VM IP
|
||||||
# env.reset(task_config=example)
|
env.reset(task_config=example)
|
||||||
# logger.info("=======Environment reset completed=======")
|
logger.info("=======Environment reset completed=======")
|
||||||
|
|
||||||
# # Reset agent with fresh VM IP (for snapshot reverts)
|
# # Reset agent with fresh VM IP (for snapshot reverts)
|
||||||
# try:
|
# try:
|
||||||
@@ -74,8 +74,7 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
|
|||||||
break
|
break
|
||||||
step_idx += 1
|
step_idx += 1
|
||||||
time.sleep(20) # Wait for the environment to settle
|
time.sleep(20) # Wait for the environment to settle
|
||||||
result = env.evaluate()
|
result = env.evaluate(result_dir=example_result_dir)
|
||||||
logger.info("Result: %.2f", result)
|
|
||||||
scores.append(result)
|
scores.append(result)
|
||||||
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
||||||
f.write(f"{result}\n")
|
f.write(f"{result}\n")
|
||||||
@@ -112,7 +111,7 @@ def run_single_example_human(env, example, max_steps, instruction, args, example
|
|||||||
f.write("\n")
|
f.write("\n")
|
||||||
|
|
||||||
# Evaluate the result
|
# Evaluate the result
|
||||||
result = env.evaluate()
|
result = env.evaluate(result_dir=example_result_dir)
|
||||||
logger.info("Result: %.2f", result)
|
logger.info("Result: %.2f", result)
|
||||||
scores.append(result)
|
scores.append(result)
|
||||||
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
||||||
@@ -549,7 +548,7 @@ def run_single_example_os_symphony(agent, env, example, max_steps, instruction,
|
|||||||
break
|
break
|
||||||
step_idx += 1
|
step_idx += 1
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
result = float(env.evaluate())
|
result = float(env.evaluate(result_dir=example_result_dir))
|
||||||
logger.info("Result: %.2f", result)
|
logger.info("Result: %.2f", result)
|
||||||
scores.append(result)
|
scores.append(result)
|
||||||
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
||||||
@@ -647,7 +646,7 @@ def run_single_example_evocua(agent, env, example, max_steps, instruction, args,
|
|||||||
step_idx += 1
|
step_idx += 1
|
||||||
|
|
||||||
time.sleep(20) # Wait for environment to settle
|
time.sleep(20) # Wait for environment to settle
|
||||||
result = env.evaluate()
|
result = env.evaluate(result_dir=example_result_dir)
|
||||||
logger.info("Result: %.2f", result)
|
logger.info("Result: %.2f", result)
|
||||||
scores.append(result)
|
scores.append(result)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user