Files
sci-gui-agent-benchmark/mm_agents/autoglm/prompt/deduplicate_node.py
Adam Yanxiao Zhao aa05f6cc26 Add AutoGLM-OS agent (#309)
* autoglm-os initialize

* clean code

* chore: use proxy for download setup

* feat(autoglm-os): add parameter to toggle images

* fix: use temporary directory for files pulled from the vm to prevent potential collision when running multiple instances of the same task in parallel

* update

* add client_password

* update multienv

* fix

* fix prompt

* fix prompt

* fix prompt

* fix sys prompt

* feat: use proxy in file evaluator

* fix client_password

* fix note_prompt

* fix autoglm agent cmd type

* fix

* revert: fix: use temporary directory for files pulled from the vm to prevent potential collision when running multiple instances of the same task in parallel

reverts commit bab5473eea1de0e61b0e1d68b23ce324a5b0ee57

* feat(autoglm): setup tools

* fix(autoglm): remove second time of get a11y tree

* add osworld server restart

* Revert "add osworld server restart"

This reverts commit 7bd9d84122e246ce2a26de0e49c25494244c2b3d.

* fix _launch_setup

* fix autoglm agent tools & xml tree

* fix desktop_env

* fix bug for tool name capitalization

* fix: always use proxy for setup download

* add fail after exceeding max turns

* fix(autoglm): avoid adding image to message when screenshot is empty

* fix maximize_window

* fix maximize_window

* fix maximize_window

* fix import browsertools module bug

* fix task proxy config bug

* restore setup

* refactor desktop env

* restore image in provider

* restore file.py

* refactor desktop_env

* quick fix

* refactor desktop_env.step

* fix our env reset

* add max truns constraint

* clean run script

* clean lib_run_single.py

---------

Co-authored-by: hanyullai <hanyullai@outlook.com>
Co-authored-by: JingBh <jingbohao@yeah.net>
2025-08-17 12:08:40 +08:00

101 lines
3.8 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
def parse_line(line):
# 解析格式label Google Chrome (191, 13) (104, 17)
pattern = r"^(\S+)\s+(.+?)\s+\((\d+), (\d+)\)\s+\((\d+), (\d+)\)"
m = re.match(pattern, line)
if not m:
return None
node_type, text, cx, cy, w, h = m.groups()
cx, cy, w, h = map(int, (cx, cy, w, h))
# bounding box as (x1, y1, x2, y2)
x1 = cx - w // 2
y1 = cy - h // 2
x2 = x1 + w
y2 = y1 + h
return {
"type": node_type,
"text": text.strip(),
"bbox": (x1, y1, x2, y2),
"center": (cx, cy),
"size": (w, h),
"raw": line,
}
def iou(box1, box2):
# box: (x1, y1, x2, y2)
xi1 = max(box1[0], box2[0])
yi1 = max(box1[1], box2[1])
xi2 = min(box1[2], box2[2])
yi2 = min(box1[3], box2[3])
inter_width = max(0, xi2 - xi1)
inter_height = max(0, yi2 - yi1)
inter_area = inter_width * inter_height
area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
union = area1 + area2 - inter_area
if union == 0:
return 0
return inter_area / union
def norm_text(s):
# 归一化文本:小写、去空格等
return re.sub(r"\s+", "", s.lower())
def text_similarity(a, b):
# 简单判定完全一致为1否则0
na, nb = norm_text(a), norm_text(b)
if na == nb:
return 1.0
else:
return 0
def filter_similar_nodes(linearized_accessibility_tree):
lines = [ln for ln in linearized_accessibility_tree.split("\n") if ln.strip()]
# parse all nodes
nodes = []
for ln in lines:
node = parse_line(ln)
if node:
nodes.append(node)
else:
# 解析不了的保留
nodes.append({"raw": ln, "invalid": True})
filtered = []
removed = [False] * len(nodes)
# 阈值可自行调整
IOU_THRESH = 0.2
TEXT_THRESH = 0.9
for i, ni in enumerate(nodes):
if ni.get("invalid"):
filtered.append(ni["raw"])
continue
if removed[i]:
continue
for j in range(i + 1, len(nodes)):
nj = nodes[j]
if nj.get("invalid"):
continue
iou_val = iou(ni["bbox"], nj["bbox"])
text_sim = text_similarity(ni["text"], nj["text"])
if iou_val > IOU_THRESH and text_sim > TEXT_THRESH:
# 二者极其相似,移除后者
removed[j] = True
# print(f"移除: {nj['raw']} (与 {ni['raw']} 相似度高)")
# 保留未被标记为移除的
if not removed[i]:
filtered.append(ni["raw"])
return "\n".join(filtered)
# 示例用法
if __name__ == "__main__":
linearized_accessibility_tree = "tag\ttext\tposition (center x & y)\tsize (w & h)\nicon\t\t(1853, 1001)\t(64, 64)\nlabel\tHome\t(1853, 1045)\t(40, 17)\nlabel\tActivities\t(49, 13)\t(63, 17)\ntext\tActivities\t(49, 13)\t(63, 17)\nlabel\tApr 171704\t(995, 13)\t(117, 27)\ntext\tApr 171704\t(995, 13)\t(87, 18)\nmenu\tSystem\t(1867, 13)\t(106, 27)\npush-button\tGoogle Chrome\t(35, 65)\t(70, 64)\npush-button\tThunderbird Mail\t(35, 133)\t(70, 64)\npush-button\tVisual Studio Code\t(35, 201)\t(70, 64)\npush-button\tVLC media player\t(35, 269)\t(70, 64)\npush-button\tLibreOffice Writer\t(35, 337)\t(70, 64)\npush-button\tLibreOffice Calc\t(35, 405)\t(70, 64)\npush-button\tLibreOffice Impress\t(35, 473)\t(70, 64)\npush-button\tGNU Image Manipulation Program\t(35, 541)\t(70, 64)\npush-button\tFiles\t(35, 609)\t(70, 64)\npush-button\tUbuntu Software\t(35, 677)\t(70, 64)\npush-button\tHelp\t(35, 745)\t(70, 64)\npush-button\tTrash\t(35, 816)\t(70, 64)\ntoggle-button\tShow Applications\t(35, 1045)\t(70, 70)"
result = filter_similar_nodes(linearized_accessibility_tree)
print(result)