Merge branch 'main' of github.com:xlang-ai/OSWorld

2025-07-10 22:35:42 +00:00
parent 6897e5320d 61f265a082
commit 6f0382c0c2
8 changed files with 35 additions and 10 deletions
--- a/desktop_env/evaluators/metrics/chrome.py
+++ b/desktop_env/evaluators/metrics/chrome.py
@@ -115,6 +115,11 @@ def is_expected_tabs(open_tabs: List[Dict[str, str]], rule: Dict[str, Any]) -> f
    if match_type == "url":
        expected_urls = rule['urls']
        actual_urls = [tab['url'] for tab in open_tabs]
        if not are_lists_equal(expected_urls, actual_urls, compare_urls):
            logger.error("list not match") 
            logger.error(expected_urls)
            logger.error(actual_urls)
            return 0
        return 1 if are_lists_equal(expected_urls, actual_urls, compare_urls) else 0
    else:
        logger.error(f"Unknown type: {match_type}")
@@ -343,7 +348,7 @@ def compare_archive(pred_path: str, gold_path: str, **kwargs) -> float:
    return score / len(pred_files)
-def compare_htmls(html_path1: str, html_path2: str) -> float:
+def compare_htmls(html_path1: str, html_path2: str, **options) -> float:
    """
    Compare two HTML files.
    """
@@ -351,20 +356,33 @@ def compare_htmls(html_path1: str, html_path2: str) -> float:
        soup1 = BeautifulSoup(inf, 'lxml')
    with open(html_path2, 'r', encoding='utf-8') as inf:
        soup2 = BeautifulSoup(inf, 'lxml')
    ignore_sdnum = options.get("ignore_sdnum", None)
    def compare_elements(elem1, elem2):
        if not (isinstance(elem1, Tag) and isinstance(elem2, Tag)):
            if elem1 != elem2:
                logger.info("not the same")
            return elem1 == elem2
        if elem1.name != elem2.name:
            logger.info("html name not match")
            return False
        if elem1.text.strip() != elem2.text.strip():
            logger.info("html text not match")
            return False
        if elem1.attrs != elem2.attrs:
            if ignore_sdnum:
                attrs1 = {k: v for k, v in elem1.attrs.items() if k != 'sdnum'}
                attrs2 = {k: v for k, v in elem2.attrs.items() if k != 'sdnum'}
                return attrs1 == attrs2
            logger.info("html attrs not match")
            logger.info(f"{elem1.attrs}")
            logger.info(f"{elem2.attrs}")
            return False
        return True
    for elem1, elem2 in zip(soup1.recursiveChildGenerator(), soup2.recursiveChildGenerator()):
        if not compare_elements(elem1, elem2):
            logger.info("html not match")
            return .0
    return 1.
--- a/desktop_env/evaluators/metrics/general.py
+++ b/desktop_env/evaluators/metrics/general.py
@@ -213,7 +213,6 @@ _accessibility_ns_map = {
 }
 def check_accessibility_tree(result: str, rules: List[Dict[str, Any]], osname: str = "ubuntu") -> float:
    """
    Args:
--- a/evaluation_examples/examples/multi_apps/ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json
+++ b/evaluation_examples/examples/multi_apps/ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json
@@ -163,7 +163,8 @@
            "hua shan mountain.jpg"
          ]
        },
-        "expect_in_result": true
+        "expect_in_result": true,
        "result_not_list": true
      }
    }
  },
--- a/evaluation_examples/examples/multi_apps/da922383-bfa4-4cd3-bbad-6bebab3d7742.json
+++ b/evaluation_examples/examples/multi_apps/da922383-bfa4-4cd3-bbad-6bebab3d7742.json
@@ -1,7 +1,7 @@
 {
  "id": "da922383-bfa4-4cd3-bbad-6bebab3d7742",
  "snapshot": "multiapps",
-  "instruction": "I browsed a lot of interesting blog articles today. I hope to store these articles in my local designated folder just like zotero stores papers. Please download the blogs opening now in pdf format and save them in their tile to /home/user/Documents/Blog.",
+  "instruction": "I browsed a lot of interesting blog articles today. I hope to store these articles in my local designated folder just like zotero stores papers. Please download the blogs opening now in pdf format and save them in their title to /home/user/Documents/Blog.",
  "source": "authors",
  "config": [
    {
--- a/evaluation_examples/examples/multi_apps/e135df7c-7687-4ac0-a5f0-76b74438b53e.json
+++ b/evaluation_examples/examples/multi_apps/e135df7c-7687-4ac0-a5f0-76b74438b53e.json
@@ -89,7 +89,14 @@
        "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e135df7c-7687-4ac0-a5f0-76b74438b53e/annual-enterprise-survey-2021-financial-year-provisional.html",
        "dest": "annual-enterprise-survey-2021-financial-year-provisional_gold.html"
      }
    ],
    "options": [
      {},
      {
        "ignore_sdnum": true
      }
    ]
  },
  "proxy": true
 }
--- a/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json
+++ b/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json
@@ -36,8 +36,8 @@
  ],
  "evaluator": {
    "func": [
-      "check_structure_sim",
+      "check_structure_sim_resized",
-      "check_structure_sim"
+      "check_structure_sim_resized"
    ],
    "result": [
      {
--- a/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json
+++ b/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json
@@ -10,7 +10,7 @@
        "files": [
          {
            "url": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2/file.xls",
-            "path": "/home/user/cell_search.xlsx"
+            "path": "/home/user/cell_search.xls"
          }
        ]
      }
@@ -47,7 +47,7 @@
    {
      "type": "open",
      "parameters": {
-        "path": "/home/user/cell_search.xlsx"
+        "path": "/home/user/cell_search.xls"
      }
    }
  ],
@@ -65,7 +65,7 @@
      "type": "rule",
      "rules": {
        "expect": {
-          "pattern": "www\\.google\\.com.*?/search\\?q=Nereida&"
+          "pattern": "https?://(www\\.?)?google\\.com/search\\?q=nereida(&|$)"
        }
      }
    }
--- a/main.py
+++ b/main.py
@@ -83,4 +83,4 @@ def human_agent():
 if __name__ == "__main__":
-    human_agent()
+    human_agent()
`@@ -83,4 +83,4 @@ def human_agent():`

| a/main.py | b/main.py |
| --- | --- |
| `if __name__ == "__main__":` | `if __name__ == "__main__":` |
| `human_agent()` | `human_agent()` |