Add AutoGLM-OS agent (#309)

* autoglm-os initialize * clean code * chore: use proxy for download setup * feat(autoglm-os): add parameter to toggle images * fix: use temporary directory for files pulled from the vm to prevent potential collision when running multiple instances of the same task in parallel * update * add client_password * update multienv * fix * fix prompt * fix prompt * fix prompt * fix sys prompt * feat: use proxy in file evaluator * fix client_password * fix note_prompt * fix autoglm agent cmd type * fix * revert: fix: use temporary directory for files pulled from the vm to prevent potential collision when running multiple instances of the same task in parallel reverts commit bab5473eea1de0e61b0e1d68b23ce324a5b0ee57 * feat(autoglm): setup tools * fix(autoglm): remove second time of get a11y tree * add osworld server restart * Revert "add osworld server restart" This reverts commit 7bd9d84122e246ce2a26de0e49c25494244c2b3d. * fix _launch_setup * fix autoglm agent tools & xml tree * fix desktop_env * fix bug for tool name capitalization * fix: always use proxy for setup download * add fail after exceeding max turns * fix(autoglm): avoid adding image to message when screenshot is empty * fix maximize_window * fix maximize_window * fix maximize_window * fix import browsertools module bug * fix task proxy config bug * restore setup * refactor desktop env * restore image in provider * restore file.py * refactor desktop_env * quick fix * refactor desktop_env.step * fix our env reset * add max truns constraint * clean run script * clean lib_run_single.py --------- Co-authored-by: hanyullai <hanyullai@outlook.com> Co-authored-by: JingBh <jingbohao@yeah.net>
2025-08-17 12:08:40 +08:00
parent c833d03a4b
commit aa05f6cc26
26 changed files with 8657 additions and 23 deletions
--- a/lib_run_single.py
+++ b/lib_run_single.py
@@ -208,4 +208,62 @@ def run_single_example_opencua(agent, env, example, max_steps, instruction, args
    scores.append(result)
    with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
        f.write(f"{result}\n")
-    env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
+    env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
+
+def run_single_example_autoglm(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
+    runtime_logger = setup_logger(example, example_result_dir)
+    try:
+        agent.reset(runtime_logger)
+    except Exception as e:
+        agent.reset()
+
+    env.reset(task_config=example)
+    
+    time.sleep(60) # Wait for the environment to be ready
+    obs = env._get_obs() # Get the initial observation
+    done = False
+    step_idx = 0
+    env.controller.start_recording()
+    while not done and step_idx < max_steps:
+        response, actions = agent.predict(
+            instruction,
+            obs
+        )
+        for action in actions:
+            # Capture the timestamp before executing the action
+            action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
+            logger.info("Step %d: %s", step_idx + 1, action)
+            obs, reward, done, info = env.step(action, args.sleep_after_execution)
+
+            logger.info("Reward: %.2f", reward)
+            logger.info("Done: %s", done)
+            # Save screenshot and trajectory information
+            with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"),
+                      "wb") as _f:
+                _f.write(obs['screenshot'])
+            with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
+                f.write(json.dumps({
+                    "step_num": step_idx + 1,
+                    "action_timestamp": action_timestamp,
+                    "action": action,
+                    "response": response,
+                    "reward": reward,
+                    "done": done,
+                    "info": info,
+                    "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png"
+                }))
+                f.write("\n")
+            if done:
+                logger.info("The episode is done.")
+                break
+        
+        if not done: # not completed the task yet
+            env.action_history.append('FAIL')
+            
+        step_idx += 1
+    result = env.evaluate()
+    logger.info("Result: %.2f", result)
+    scores.append(result)
+    with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
+        f.write(f"{result}\n")
+    env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))