Add EvoCUA Support (#401)

* evocua init

* setup max_token

---------

Co-authored-by: xuetaofeng <xuetaofeng@meituan.com>
Co-authored-by: Tianbao Xie <47296835+Timothyxxx@users.noreply.github.com>
This commit is contained in:
xuetf
2025-12-23 20:46:23 +08:00
committed by GitHub
parent 031696e83c
commit 410ec63a89
5 changed files with 1697 additions and 57 deletions

View File

@@ -462,82 +462,102 @@ def run_single_example_uipath(agent, env, example, max_steps, instruction, args,
f.write(f"{result}\n")
env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
from mm_agents.os_symphony.utils.common_utils import draw_coordinates
from mm_agents.os_symphony.utils.process_context import set_current_result_dir
logger = logging.getLogger("desktopenv.experiment")
def run_single_example_os_symphony(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
set_current_result_dir(example_result_dir)
def run_single_example_evocua(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
    """
    Unified run function for EvoCUAAgent (supporting both S1 and S2 modes).

    Runs one benchmark example end to end: resets the environment and agent,
    drives the agent/environment loop for at most ``max_steps`` steps, saves a
    screenshot and a JSONL trajectory entry for every executed action, then
    evaluates the final state and records the score, wall-clock time and a
    screen recording under ``example_result_dir``.

    Args:
        agent: EvoCUA agent exposing ``reset()`` and ``predict()``.
            ``predict`` may return ``(response, actions)`` or the S1-style
            ``(response, actions, info_dict)`` triple; both are handled.
        env: Desktop environment exposing ``reset()``, ``step()``,
            ``_get_obs()``, ``evaluate()`` and a ``controller`` for recording.
        example: Task configuration passed to ``env.reset``.
        max_steps: Maximum number of agent steps before giving up.
        instruction: Natural-language task instruction given to the agent.
        args: Parsed CLI args; only ``sleep_after_execution`` is read here.
        example_result_dir: Directory where logs/screenshots are written.
        scores: Mutable list the final score is appended to (side effect).
    """
    runtime_logger = setup_logger(example, example_result_dir)

    # Reset environment first so the agent reset below can read env state
    # (e.g. env.vm_ip).
    env.reset(task_config=example)

    # Reset agent.  Handle agent reset signature differences if any:
    # newest signature takes (runtime_logger, vm_ip=...), older ones take
    # (runtime_logger) or no arguments at all.
    try:
        agent.reset(runtime_logger, vm_ip=env.vm_ip)
    except Exception:
        try:
            agent.reset(runtime_logger)
        except Exception:
            agent.reset()

    time.sleep(60)  # Wait for the environment to be ready
    obs = env._get_obs()  # Get the initial observation
    done = False
    step_idx = 0
    start_time = time.time()
    env.controller.start_recording()

    while not done and step_idx < max_steps:
        # EvoCUAAgent.predict unified signature: returns (response, actions).
        # It handles both modes internally.
        predict_res = agent.predict(instruction, obs)
        # Check return signature logic
        if len(predict_res) == 3:
            # Compatibility with S1 original signature if agent was updated to match
            response, actions, info_dict = predict_res
        else:
            response, actions = predict_res
            info_dict = {}

        logger.info(f"Step {step_idx + 1} Actions: {actions}")

        # Break if no actions (fail-safe)
        if not actions or (len(actions) == 1 and (actions[0] == "" or "error" in actions[0].lower())):
            # Allow "FAIL" or "DONE" to process through execution loop if agent outputs them as actions
            if not (actions and actions[0] in ["FAIL", "DONE"]):
                logger.warning("No valid actions returned. Breaking loop.")
                break

        for action in actions:
            action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S%f")
            logger.info("Executing action: %s", action)

            # Execute
            obs, reward, done, info = env.step(action, args.sleep_after_execution)
            logger.info("Reward: %.2f", reward)
            logger.info("Done: %s", done)

            # Save screenshot (post-action observation)
            screenshot_file = f"step_{step_idx + 1}_{action_timestamp}.png"
            with open(os.path.join(example_result_dir, screenshot_file), "wb") as _f:
                _f.write(obs['screenshot'])

            # Log Trajectory
            log_entry = {
                "step_num": step_idx + 1,
                "action_timestamp": action_timestamp,
                "action": action,
                "response": response,
                "reward": reward,
                "done": done,
                "info": info,
                "screenshot_file": screenshot_file
            }
            # Add natural language info if available (S1 style)
            if info_dict:
                log_entry["natural_language_action"] = info_dict.get("action")
            with open(os.path.join(example_result_dir, "traj.jsonl"), "a", encoding="utf-8") as f:
                f.write(json.dumps(log_entry, ensure_ascii=False))
                f.write("\n")

            if done:
                logger.info("The episode is done.")
                # NOTE(review): extra settle time before evaluation on early
                # termination — confirm 60s is intentional given the 20s
                # settle below.
                time.sleep(60)
                break
        step_idx += 1

    end_time = time.time()
    time.sleep(20)  # Wait for environment to settle
    result = env.evaluate()
    logger.info("Result: %.2f", result)
    scores.append(result)
    with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
        f.write(f"{result}\n")
    log_task_completion(example, result, example_result_dir, args)
    with open(os.path.join(example_result_dir, "time.txt"), "w", encoding="utf-8") as f:
        f.write(f"{end_time-start_time:.2f}\n")
    env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))