Wxy/opencua (#256)

* OpenCUA Agent code base * update url * debug, modify url input
2025-07-14 20:26:39 +08:00
parent 2339db20ca
commit db83b9cb2c
3 changed files with 1308 additions and 0 deletions
--- a/lib_run_single.py
+++ b/lib_run_single.py
@@ -146,6 +146,61 @@ def run_single_example_openaicua(agent, env, example, max_steps, instruction, ar
    result = env.evaluate()
    logger.info("Result: %.2f", result)
    scores.append(result)
+    with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
+        f.write(f"{result}\n")
+    env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
+
+def run_single_example_opencua(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
+    """Run one example with the OpenCUA agent and record its trajectory.
+
+    The agent is queried for actions until the environment reports ``done``,
+    the agent returns no usable action, or ``max_steps`` predictions have
+    been made. After every executed action a screenshot is saved and one
+    JSON line is appended to ``traj.jsonl``. Finally the environment is
+    evaluated, the result is appended to ``scores`` and written to
+    ``result.txt``, and the screen recording is saved as ``recording.mp4``.
+
+    Args:
+        agent: Object exposing ``reset(logger)`` and
+            ``predict(instruction, obs)``; ``predict`` returns a
+            ``(response, actions, info)`` triple.
+        env: Environment exposing ``reset``, ``_get_obs``, ``step``,
+            ``evaluate`` and ``controller.start_recording`` /
+            ``controller.end_recording``.
+        example: Task configuration, passed to ``setup_logger`` and
+            ``env.reset(task_config=...)``.
+        max_steps: Maximum number of ``agent.predict`` calls.
+        instruction: Instruction passed to ``agent.predict``.
+        args: Unused here; kept for signature compatibility with callers.
+        example_result_dir: Directory receiving screenshots, ``traj.jsonl``,
+            ``result.txt`` and ``recording.mp4``.
+        scores: List to which the evaluation result is appended.
+    """
+    runtime_logger = setup_logger(example, example_result_dir)
+    agent.reset(runtime_logger)
+    env.reset(task_config=example)
+    time.sleep(60)  # Wait for the environment to be ready
+    obs = env._get_obs()  # Get the initial observation
+    done = False
+    step_idx = 0
+    traj_path = os.path.join(example_result_dir, "traj.jsonl")
+    env.controller.start_recording()
+    while not done and step_idx < max_steps:
+        response, actions, _ = agent.predict(instruction, obs)
+
+        logger.info("Got Action: %s", actions)
+        # Stop when the agent produced nothing usable: no actions at all, an
+        # empty first action, or a first action that reports an error.
+        if not actions or actions[0] == "" or actions[0].lower().startswith("error"):
+            break
+
+        for action in actions:
+            # Capture the timestamp before executing the action
+            action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
+            logger.info("Step %d: %s", step_idx + 1, action)
+
+            # The observation returned by step() is discarded: a fresh one is
+            # fetched after a short pause, once the action has had time to
+            # take effect.
+            _, reward, done, info = env.step(action)
+            time.sleep(3)
+            obs = env._get_obs()
+
+            logger.info("Action %s executed, reward: %s, done: %s", action, reward, done)
+            # Save screenshot and trajectory information. The file name is
+            # built once so the file on disk and the trajectory record agree.
+            screenshot_file = f"step_{step_idx + 1}_{action_timestamp}.png"
+            with open(os.path.join(example_result_dir, screenshot_file), "wb") as _f:
+                _f.write(obs['screenshot'])
+
+            # Explicit UTF-8 keeps the trajectory file platform-independent
+            # and consistent with how result.txt is written below.
+            with open(traj_path, "a", encoding="utf-8") as f:
+                f.write(json.dumps({
+                    "step_num": step_idx + 1,
+                    "action_timestamp": action_timestamp,
+                    "action": action,
+                    "response": response,
+                    "reward": reward,
+                    "done": done,
+                    "info": info,
+                    "screenshot_file": screenshot_file
+                }))
+                f.write("\n")
+            if done:
+                logger.info("The episode is done.")
+                break
+        step_idx += 1
+
+    result = env.evaluate()
+    logger.info("Result: %.2f", result)
+    scores.append(result)
    with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
        f.write(f"{result}\n")
    env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))