Add AutoGLM-OS agent (#309)

* autoglm-os initialize

* clean code

* chore: use proxy for download setup

* feat(autoglm-os): add parameter to toggle images

* fix: use temporary directory for files pulled from the vm to prevent potential collision when running multiple instances of the same task in parallel

* update

* add client_password

* update multienv

* fix

* fix prompt

* fix prompt

* fix prompt

* fix sys prompt

* feat: use proxy in file evaluator

* fix client_password

* fix note_prompt

* fix autoglm agent cmd type

* fix

* revert: fix: use temporary directory for files pulled from the vm to prevent potential collision when running multiple instances of the same task in parallel

reverts commit bab5473eea1de0e61b0e1d68b23ce324a5b0ee57

* feat(autoglm): setup tools

* fix(autoglm): remove the redundant second call to get the a11y tree

* add osworld server restart

* Revert "add osworld server restart"

This reverts commit 7bd9d84122e246ce2a26de0e49c25494244c2b3d.

* fix _launch_setup

* fix autoglm agent tools & xml tree

* fix desktop_env

* fix bug for tool name capitalization

* fix: always use proxy for setup download

* add fail after exceeding max turns

* fix(autoglm): avoid adding image to message when screenshot is empty

* fix maximize_window

* fix maximize_window

* fix maximize_window

* fix import browsertools module bug

* fix task proxy config bug

* restore setup

* refactor desktop env

* restore image in provider

* restore file.py

* refactor desktop_env

* quick fix

* refactor desktop_env.step

* fix our env reset

* add max turns constraint

* clean run script

* clean lib_run_single.py

---------

Co-authored-by: hanyullai <hanyullai@outlook.com>
Co-authored-by: JingBh <jingbohao@yeah.net>
This commit is contained in:
Adam Yanxiao Zhao
2025-08-17 12:08:40 +08:00
committed by GitHub
parent c833d03a4b
commit aa05f6cc26
26 changed files with 8657 additions and 23 deletions

View File

@@ -0,0 +1,100 @@
import re
def parse_line(line):
    """Parse one row of a linearized accessibility tree into a node dict.

    The expected row shape is ``label Google Chrome (191, 13) (104, 17)``:
    a tag, the node text, the center ``(x, y)`` and the size ``(w, h)``.

    Args:
        line: A single row of the linearized tree.

    Returns:
        ``None`` when the row does not match the expected shape; otherwise a
        dict with the keys ``type``, ``text``, ``bbox`` (as
        ``(x1, y1, x2, y2)``), ``center``, ``size`` and ``raw`` (the
        unmodified input line).
    """
    pattern = r"^(\S+)\s+(.+?)\s+\((\d+), (\d+)\)\s+\((\d+), (\d+)\)"
    matched = re.match(pattern, line)
    if matched is None:
        return None
    tag, label = matched.group(1, 2)
    center_x, center_y, width, height = (
        int(digits) for digits in matched.group(3, 4, 5, 6)
    )
    # The row stores the center point; derive the top-left corner with floor
    # halving, then extend by the full size to get the opposite corner.
    left = center_x - width // 2
    top = center_y - height // 2
    return {
        "type": tag,
        "text": label.strip(),
        "bbox": (left, top, left + width, top + height),
        "center": (center_x, center_y),
        "size": (width, height),
        "raw": line,
    }
def iou(box1, box2):
    """Return the intersection-over-union of two axis-aligned boxes.

    Args:
        box1: Box as ``(x1, y1, x2, y2)``.
        box2: Box as ``(x1, y1, x2, y2)``.

    Returns:
        The ratio of the overlap area to the union area as a float. The
        result is ``0.0`` when the boxes do not overlap or when the union
        area is zero (both boxes degenerate).
    """
    # Corners of the overlap rectangle; clamping the extents at zero turns
    # disjoint boxes into an empty intersection instead of a negative one.
    overlap_w = max(0, min(box1[2], box2[2]) - max(box1[0], box2[0]))
    overlap_h = max(0, min(box1[3], box2[3]) - max(box1[1], box2[1]))
    inter_area = overlap_w * overlap_h
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area1 + area2 - inter_area
    if union == 0:
        # Always return a float so callers see one consistent return type.
        return 0.0
    return inter_area / union
def norm_text(s):
    """Normalize text for comparison: lowercase it and drop all whitespace."""
    lowered = s.lower()
    return re.sub(r"\s+", "", lowered)
def text_similarity(a, b):
    """Score how similar two node texts are.

    This is a simple exact-match check: both strings are normalized with
    ``norm_text`` and compared for equality.

    Args:
        a: First text.
        b: Second text.

    Returns:
        ``1.0`` when the normalized texts are identical, otherwise ``0.0``.
        Both outcomes are floats so the return type is consistent.
    """
    return 1.0 if norm_text(a) == norm_text(b) else 0.0
def filter_similar_nodes(linearized_accessibility_tree, iou_thresh=0.2, text_thresh=0.9):
    """Drop near-duplicate rows from a linearized accessibility tree.

    Each non-blank line is parsed with ``parse_line``. Two parsed nodes are
    treated as duplicates when their bounding boxes overlap with an IoU above
    ``iou_thresh`` and their ``text_similarity`` is above ``text_thresh``;
    the earlier row is kept and the later one is removed. Lines that cannot
    be parsed are always kept and never compared against other nodes.

    Args:
        linearized_accessibility_tree: Newline-separated rows of the tree.
        iou_thresh: Minimum (exclusive) box IoU for two nodes to count as
            duplicates. Defaults to ``0.2``.
        text_thresh: Minimum (exclusive) text similarity for two nodes to
            count as duplicates. Defaults to ``0.9``.

    Returns:
        The surviving rows, in their original order, joined with ``"\n"``.
        Blank lines of the input are not included.
    """
    lines = [ln for ln in linearized_accessibility_tree.split("\n") if ln.strip()]
    # ``None`` marks a row that could not be parsed.
    nodes = [parse_line(ln) for ln in lines]

    removed = [False] * len(nodes)
    kept = []
    for i, (raw, node) in enumerate(zip(lines, nodes)):
        if node is None:
            # Unparseable rows are passed through untouched.
            kept.append(raw)
            continue
        if removed[i]:
            continue
        kept.append(raw)
        # Mark every later row that duplicates this one.
        for j in range(i + 1, len(nodes)):
            other = nodes[j]
            if other is None or removed[j]:
                continue
            # Compare text first: it is the cheaper test, so the box
            # arithmetic only runs for rows whose text already matches.
            if (
                text_similarity(node["text"], other["text"]) > text_thresh
                and iou(node["bbox"], other["bbox"]) > iou_thresh
            ):
                removed[j] = True
    return "\n".join(kept)
# Example usage: filter a sample desktop accessibility tree and print it.
if __name__ == "__main__":
    sample_tree = (
        "tag\ttext\tposition (center x & y)\tsize (w & h)\n"
        "icon\t\t(1853, 1001)\t(64, 64)\n"
        "label\tHome\t(1853, 1045)\t(40, 17)\n"
        "label\tActivities\t(49, 13)\t(63, 17)\n"
        "text\tActivities\t(49, 13)\t(63, 17)\n"
        "label\tApr 171704\t(995, 13)\t(117, 27)\n"
        "text\tApr 171704\t(995, 13)\t(87, 18)\n"
        "menu\tSystem\t(1867, 13)\t(106, 27)\n"
        "push-button\tGoogle Chrome\t(35, 65)\t(70, 64)\n"
        "push-button\tThunderbird Mail\t(35, 133)\t(70, 64)\n"
        "push-button\tVisual Studio Code\t(35, 201)\t(70, 64)\n"
        "push-button\tVLC media player\t(35, 269)\t(70, 64)\n"
        "push-button\tLibreOffice Writer\t(35, 337)\t(70, 64)\n"
        "push-button\tLibreOffice Calc\t(35, 405)\t(70, 64)\n"
        "push-button\tLibreOffice Impress\t(35, 473)\t(70, 64)\n"
        "push-button\tGNU Image Manipulation Program\t(35, 541)\t(70, 64)\n"
        "push-button\tFiles\t(35, 609)\t(70, 64)\n"
        "push-button\tUbuntu Software\t(35, 677)\t(70, 64)\n"
        "push-button\tHelp\t(35, 745)\t(70, 64)\n"
        "push-button\tTrash\t(35, 816)\t(70, 64)\n"
        "toggle-button\tShow Applications\t(35, 1045)\t(70, 70)"
    )
    print(filter_similar_nodes(sample_tree))