support mano agent (#338)

Co-authored-by: Fei Hu <molanhand@users.noreply.github.com>
2025-09-16 18:10:29 +08:00
parent dc7e46e7aa
commit 7213eca069
3 changed files with 1696 additions and 0 deletions
--- a/lib_run_single.py
+++ b/lib_run_single.py
@@ -267,3 +267,63 @@ def run_single_example_autoglm(agent, env, example, max_steps, instruction, args
    with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
        f.write(f"{result}\n")
    env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
+
+def run_single_example_mano(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
+    runtime_logger = setup_logger(example, example_result_dir)
+    agent.reset(runtime_logger)
+    env.reset(task_config=example)
+    time.sleep(60) # Wait for the environment to be ready
+    obs = env._get_obs() # Get the initial observation
+    done = False
+    step_idx = 0
+    env.controller.start_recording()
+    
+    with open(os.path.join(example_result_dir, f"step_0.png"),
+      "wb") as _f:
+        _f.write(obs['screenshot'])
+    while not done and step_idx < max_steps:
+        response, actions = agent.predict(
+            instruction,
+            obs
+        )
+        if len(actions) > 1:
+            if (("pyautogui.hotkey('shift')" in actions[0] or "pyautogui.hotkey('ctrl')" in actions[0]) 
+                and "pyautogui.click" in actions[1]):
+                hotkey_type = 'shift' if "shift" in actions[0] else 'ctrl'
+                action = f"pyautogui.keyDown('{hotkey_type}')\n{actions[1]}\npyautogui.keyUp('{hotkey_type}')"
+                actions = [action]  
+                
+        for action in actions:
+            # Capture the timestamp before executing the action
+            action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
+            logger.info("Step %d: %s", step_idx + 1, action)
+            obs, reward, done, info = env.step(action, args.sleep_after_execution)
+
+            logger.info("Reward: %.2f", reward)
+            logger.info("Done: %s", done)
+            # Save screenshot and trajectory information
+            with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"),
+                      "wb") as _f:
+                _f.write(obs['screenshot'])
+            with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
+                f.write(json.dumps({
+                    "step_num": step_idx + 1,
+                    "action_timestamp": action_timestamp,
+                    "action": action,
+                    "reward": reward,
+                    "done": done,
+                    "info": info,
+                    "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png",
+                    "response":response
+                }))
+                f.write("\n")
+            if done:
+                logger.info("The episode is done.")
+                break
+        step_idx += 1
+    result = env.evaluate()
+    logger.info("Result: %.2f", result)
+    scores.append(result)
+    with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
+        f.write(f"{result}\n")
+    env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))