diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..bc0f472
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,19 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python Debugger: Current File with Arguments",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}", // debugs whichever file is currently open in the editor
+            "console": "integratedTerminal",
+            "args": [ // command-line arguments passed to the debugged script
+                "--path_to_vm", "/Users/lxc/Virtual Machines.localized/DesktopEnv-Ubuntu 64-bit Arm.vmwarevm/DesktopEnv-Ubuntu 64-bit Arm.vmx", // NOTE(review): absolute path on one developer's machine - adjust locally
+                "--example_time_limit", "60" // per-example time limit in seconds (run.py's default is 600)
+            ]
+        }
+    ]
+}
\ No newline at end of file
diff --git a/demo.py b/demo.py
new file mode 100644
index 0000000..736adfe
--- /dev/null
+++ b/demo.py
@@ -0,0 +1,22 @@
+"""Demo: bound a blocking call with a SIGALRM-based timeout (Unix only)."""
+import signal
+import time
+
+
+def handler(signo, frame):
+    """Signal handler for SIGALRM: surface the timeout as an exception."""
+    raise RuntimeError("Timeout")
+
+
+signal.signal(signal.SIGALRM, handler)
+
+while True:
+    try:
+        signal.alarm(5)  # arm: SIGALRM is delivered in 5 seconds
+        time.sleep(10)  # stand-in for work that overruns the limit
+        print("Working...")
+    except RuntimeError as e:
+        # Only the timeout raised by handler() is expected here.
+        print(e)
+    finally:
+        signal.alarm(0)  # disarm so a stale alarm cannot fire later
\ No newline at end of file
diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py
index 603ed3c..fee3f37 100644
--- a/desktop_env/envs/desktop_env.py
+++ b/desktop_env/envs/desktop_env.py
@@ -174,7 +174,7 @@ class DesktopEnv(gym.Env):
             if isinstance(self.evaluator["func"], list) \
             else getattr(metrics, self.evaluator["func"])
         self.metric_conj: str = self.evaluator.get("conj", "and")  # take conjunction of multiple metrics
-        if "result" in self.evaluator:
+        if "result" in self.evaluator and len(self.evaluator["result"])>0:
             self.result_getter: Getter = [getattr(getters, "get_{:}".format(res["type"])) for res in
                                           self.evaluator["result"]] \
                 if isinstance(self.evaluator["result"], list) \
@@ -184,7 +184,7 @@ class DesktopEnv(gym.Env):
                 if isinstance(self.metric, list) \
                 else None
 
-        if "expected" in self.evaluator:
+        if "expected" in self.evaluator and len(self.evaluator["expected"])>0:
             self.expected_getter: Getter = [getattr(getters, "get_{:}".format(exp["type"])) if exp else None for exp in
                                             self.evaluator["expected"]] \
                 if isinstance(self.evaluator["expected"], list) \
diff --git a/desktop_env/server/osbench_server.service b/desktop_env/server/osbench_server.service
new file mode 100644
index 0000000..d0fa216
--- /dev/null
+++ b/desktop_env/server/osbench_server.service
@@ -0,0 +1,16 @@
+[Unit]
+Description=OSBench Server
+StartLimitIntervalSec=60
+StartLimitBurst=4
+After=network.target auditd.service
+
+[Service]
+ExecStart=/usr/bin/python3 /home/user/main.py
+User=user
+WorkingDirectory=/home/user
+Restart=on-failure
+RestartSec=1
+Environment="DISPLAY=:1"
+
+[Install]
+WantedBy=graphical.target
diff --git a/desktop_env/server/osbench_server@.service b/desktop_env/server/osbench_server@.service
new file mode 100644
index 0000000..87fc59f
--- /dev/null
+++ b/desktop_env/server/osbench_server@.service
@@ -0,0 +1,16 @@
+[Unit]
+Description=OSBench Server
+StartLimitIntervalSec=60
+StartLimitBurst=4
+After=network.target auditd.service
+
+[Service]
+ExecStart=/usr/bin/python3 /home/user/main.py
+User=user
+WorkingDirectory=/home/user
+Restart=on-failure
+RestartSec=1
+Environment="DISPLAY=%i"
+
+[Install]
+WantedBy=graphical.target
diff --git a/evaluation_examples/examples/libreoffice_calc/2bd59342-0664-4ccb-ba87-79379096cc08.json b/evaluation_examples/examples/libreoffice_calc/2bd59342-0664-4ccb-ba87-79379096cc08.json
index aba58cd..d4bbb32 100644
--- a/evaluation_examples/examples/libreoffice_calc/2bd59342-0664-4ccb-ba87-79379096cc08.json
+++ b/evaluation_examples/examples/libreoffice_calc/2bd59342-0664-4ccb-ba87-79379096cc08.json
@@ -10,10 +10,6 @@
     "libreoffice_calc"
   ],
   "evaluator": {
-    "func": "infeasible",
-    "expected": {
-    },
-    "result": {
-    }
+    "func": "infeasible"
   }
-}
\ No newline at end of file
+}
diff --git a/evaluation_examples/examples/libreoffice_calc/7b802dad-6e0f-4204-9815-d4e3f57627d8.json b/evaluation_examples/examples/libreoffice_calc/7b802dad-6e0f-4204-9815-d4e3f57627d8.json
index 0ebfeaf..46d6e7c 100644
--- a/evaluation_examples/examples/libreoffice_calc/7b802dad-6e0f-4204-9815-d4e3f57627d8.json
+++ b/evaluation_examples/examples/libreoffice_calc/7b802dad-6e0f-4204-9815-d4e3f57627d8.json
@@ -10,10 +10,6 @@
     "libreoffice_calc"
   ],
   "evaluator": {
-    "func": "infeasible",
-    "expected": {
-    },
-    "result": {
-    }
+    "func": "infeasible"
   }
-}
\ No newline at end of file
+}
diff --git a/evaluation_examples/examples/multi_apps/demo.py b/evaluation_examples/examples/multi_apps/demo.py
new file mode 100644
index 0000000..ffa2b85
--- /dev/null
+++ b/evaluation_examples/examples/multi_apps/demo.py
@@ -0,0 +1,19 @@
+import pandas as pd
+
+file_path = "/Users/lxc/Downloads/Speedtest.csv"
+# Find the value in the second cell of the second row of the CSV.
+# with open(file_path, "r") as f:
+#     for i, line in enumerate(f):
+#         if i == 1:
+#             data = line.split(",")[1]
+#             break
+# print(data)
+
+with open(file_path, "r") as f:
+    # header=0: take column names from the first row so the frame can be
+    # indexed by label. With header=None the labels are integers and
+    # reader['TEST_DATE'] raises KeyError. Assumes the CSV has a header row.
+    reader = pd.read_csv(f, sep=',', header=0)
+    # Print every value of the TEST_DATE column, one per line.
+    for data in reader['TEST_DATE']:
+        print(data)
\ No newline at end of file
diff --git a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
index 337b402..34a1d76 100644
--- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
+++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
@@ -55,12 +55,12 @@ def judge_node(node: ET, platform="ubuntu") -> bool:
                      or platform=="windows"\
                         and node.get("{{{:}}}visible".format(state_ns), "false")=="true"\
                       )\
-                    and ( node.get("{{{:}}}enabled".format(state_ns), "false")=="true"\
-                       or node.get("{{{:}}}editable".format(state_ns), "false")=="true"\
-                       or node.get("{{{:}}}expandable".format(state_ns), "false")=="true"\
-                       or node.get("{{{:}}}checkable".format(state_ns), "false")=="true"
-                        )\
-                    and (node.get("name", "") != "" or node.text is not None and len(node.text)>0)
+                  and ( node.get("{{{:}}}enabled".format(state_ns), "false")=="true"\
+                     or node.get("{{{:}}}editable".format(state_ns), "false")=="true"\
+                     or node.get("{{{:}}}expandable".format(state_ns), "false")=="true"\
+                     or node.get("{{{:}}}checkable".format(state_ns), "false")=="true"
+                      )\
+                  and (node.get("name", "") != "" or node.text is not None and len(node.text)>0)
 
     coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(component_ns), "(-1, -1)"))
     sizes: Tuple[int, int] = eval(node.get("{{{:}}}size".format(component_ns), "(-1, -1)"))
diff --git a/mm_agents/agent.py b/mm_agents/agent.py
index 4314c63..7c7b756 100644
--- a/mm_agents/agent.py
+++ b/mm_agents/agent.py
@@ -5,11 +5,13 @@ import os
 import re
 import time
 import uuid
+import openai
 import xml.etree.ElementTree as ET
 from http import HTTPStatus
 from io import BytesIO
 from typing import Dict, List
-
+from google.api_core.exceptions import InvalidArgument
+import backoff
 import dashscope
 import google.generativeai as genai
 import requests
@@ -22,6 +24,8 @@ from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_S
     SYS_PROMPT_IN_SOM_A11Y_OUT_TAG, \
     SYS_PROMPT_SEEACT, ACTION_DESCRIPTION_PROMPT_SEEACT, ACTION_GROUNDING_PROMPT_SEEACT
 
+# todo: cross-check with visualwebarena
+
 logger = logging.getLogger("desktopenv.agent")
 
 
@@ -506,18 +510,25 @@ class PromptAgent:
         try:
             actions = self.parse_actions(response, masks)
             self.thoughts.append(response)
-        except Exception as e:
+        except ValueError as e:
             print("Failed to parse action from response", e)
             actions = None
             self.thoughts.append("")
 
         return actions
 
-    # @backoff.on_exception(
-    #     backoff.expo,
-    #     (Exception),
-    #     max_tries=5
-    # )
+    @backoff.on_exception(
+        backoff.expo,
+        # Retry only on specific, model-related exceptions; extend this tuple as needed.
+        # Never add the generic "Exception" here: any other error must propagate to the
+        # caller, which catches it to ensure each example won't exceed the time limit.
+        (openai.RateLimitError,
+        openai.BadRequestError,
+        openai.InternalServerError,
+        InvalidArgument),
+        max_tries=5
+    )
+
     def call_llm(self, payload):
 
         if self.model.startswith("gpt"):
@@ -525,7 +536,7 @@ class PromptAgent:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"
             }
-            logger.info("Generating content with GPT model: %s", self.model)
+            # logger.info("Generating content with GPT model: %s", self.model)
             response = requests.post(
                 "https://api.openai.com/v1/chat/completions",
                 headers=headers,
diff --git a/run.py b/run.py
index 04aec2c..953c6b7 100644
--- a/run.py
+++ b/run.py
@@ -7,6 +7,7 @@ import json
 import logging
 import os
 import sys
+import signal
 
 from desktop_env.envs.desktop_env import DesktopEnv
 from mm_agents.agent import PromptAgent
@@ -45,6 +46,10 @@ logger.addHandler(sdebug_handler)
 
 logger = logging.getLogger("desktopenv.experiment")
 
+# Make sure each example won't exceed the time limit: SIGALRM is converted into a RuntimeError.
+def handler(signo, frame):  # signal-handler signature: (signal number, current stack frame)
+    raise RuntimeError("Time limit exceeded!")  # handled by the `except RuntimeError` around each example in test()
+signal.signal(signal.SIGALRM, handler)  # registered at import time; the alarm itself is armed per example via signal.alarm()
 
 def config() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
@@ -77,6 +82,7 @@ def config() -> argparse.Namespace:
     # agent config
     parser.add_argument("--max_trajectory_length", type=int, default=3)
     parser.add_argument("--test_config_base_dir", type=str, default="evaluation_examples")
+    parser.add_argument("--example_time_limit", type=int, default=600)
 
     # lm config
     parser.add_argument("--model", type=str, default="gpt-4-vision-preview")
@@ -98,6 +104,7 @@ def test(
 ) -> None:
     scores = []
     max_steps = args.max_steps
+    time_limit = args.example_time_limit
 
     # log args
     logger.info("Args: %s", args)
@@ -119,6 +126,7 @@ def test(
 
     for domain in test_all_meta:
         for example_id in test_all_meta[domain]:
+            # example setting
             config_file = os.path.join(args.test_config_base_dir, f"examples/{domain}/{example_id}.json")
             with open(config_file, "r", encoding="utf-8") as f:
                 example = json.load(f)
@@ -140,82 +148,115 @@ def test(
             )
             os.makedirs(example_result_dir, exist_ok=True)
 
-            agent.reset()
-            obs = env.reset(task_config=example)
-            done = False
-            step_idx = 0
-            env.controller.start_recording()
+            # example start running
+            try:
+                signal.alarm(time_limit)
+                agent.reset()
+                obs = env.reset(task_config=example)
+                done = False
+                step_idx = 0
+                env.controller.start_recording()
 
-            # todo: update max running time for each example, @xiaochuan
-            while not done and step_idx < max_steps:
-                actions = agent.predict(
-                    instruction,
-                    obs
-                )
+                while not done and step_idx < max_steps:
+                    # agent.predict() returns None when the model response
+                    # cannot be parsed; treat that as "no actions this step"
+                    # so a single bad response does not abort the whole run.
+                    actions = agent.predict(instruction, obs) or []
+                    for action in actions:
+                        # Capture the timestamp before executing the action
+                        action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
+                        logger.info("Step %d: %s", step_idx + 1, action)
 
-                for action in actions:
+                        obs, reward, done, info = env.step(action, args.sleep_after_execution)
+
+                        logger.info("Reward: %.2f", reward)
+                        logger.info("Done: %s", done)
+                        logger.info("Info: %s", info)
+
+                        # Save screenshot and trajectory information
+                        with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"),
+                                "wb") as _f:
+                            with open(obs['screenshot'], "rb") as __f:
+                                screenshot = __f.read()
+                            _f.write(screenshot)
+
+                        with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
+                            f.write(json.dumps({
+                                "step_num": step_idx + 1,
+                                "action_timestamp": action_timestamp,
+                                "action": action,
+                                "reward": reward,
+                                "done": done,
+                                "info": info,
+                                "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png"
+                            }))
+                            f.write("\n")
+
+                        if done:
+                            logger.info("The episode is done.")
+                            break
                     step_idx += 1
-                    # Capture the timestamp before executing the action
-                    action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
-                    logger.info("Step %d: %s", step_idx + 1, action)
 
-                    obs, reward, done, info = env.step(action, args.sleep_after_execution)
-
-                    logger.info("Reward: %.2f", reward)
-                    logger.info("Done: %s", done)
-                    logger.info("Info: %s", info)
-
-                    # Save screenshot and trajectory information
-                    with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"),
-                              "wb") as _f:
-                        with open(obs['screenshot'], "rb") as __f:
-                            screenshot = __f.read()
-                        _f.write(screenshot)
-
-                    with open(os.path.join(example_result_dir, "traj.json"), "a") as f:
+                result = env.evaluate()
+                logger.info("Result: %.2f", result)
+                scores.append(result)
+                env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
+            except RuntimeError as e:
+                logger.error(f"Error in example {domain}/{example_id}: {e}")
+                # save info of this example and then continue
+                try:
+                    env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
+                    with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
                         f.write(json.dumps({
-                            "step_num": step_idx + 1,
-                            "action_timestamp": action_timestamp,
-                            "action": action,
-                            "reward": reward,
-                            "done": done,
-                            "info": info,
-                            "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png"
+                            "Error": f"Error in example {domain}/{example_id}: {e}",
+                            "step": step_idx + 1,
                         }))
                         f.write("\n")
-
-                    if done:
-                        logger.info("The episode is done.")
-                        break
-            try:
-                result = env.evaluate()
-            except Exception as e:
-                logger.error(f"Error in evaluating the example {example_id}: {e}")
-                result = 0.0
-            logger.info("Result: %.2f", result)
-            env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
-            scores.append(result)
-            with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
-                f.write(f"{result}\n")
-
+                except Exception as new_e:
+                    with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
+                        f.write(json.dumps({
+                            "Error": f"Error in example {domain}/{example_id}: {e} and {new_e}",
+                            "step": "before start recording",
+                        }) + "\n")
+            finally:
+                signal.alarm(0)  # disarm the per-example alarm so it cannot fire between examples or during env.close()
     env.close()
-    logger.info(f"Average score: {sum(scores) / len(scores)}")
+    logger.info(f"Average score: {sum(scores) / len(scores) if scores else 0.0}")  # scores is empty when no example was scored
 
 
-def get_unfinished(test, result_dir):
-    # todo @xiaochuan
-    pass
+def get_unfinished(action_space, use_model, observation_type, result_dir, total_file_json):
+    """Drop examples that already have a result directory; edits and returns total_file_json."""
 
+    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
+    if not os.path.exists(target_dir):
+        return total_file_json
+
+    # domain name -> set of entries already on disk (set gives O(1) membership tests)
+    finished = {
+        domain: set(os.listdir(os.path.join(target_dir, domain)))
+        for domain in os.listdir(target_dir)
+        if os.path.isdir(os.path.join(target_dir, domain))
+    }
+
+    # NOTE: the caller's dict is updated in place; the __main__ block relies on this.
+    for domain, examples in finished.items():
+        if domain in total_file_json:
+            total_file_json[domain] = [x for x in total_file_json[domain] if x not in examples]
+
+    return total_file_json
 
 if __name__ == '__main__':
     ####### The complete version of the list of examples #######
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
     args = config()
 
-    # test_file_list = get_unfinished(args.test, args.result_dir)
-    # logger.info(f"Total {len(test_file_list)} tasks left")
-
     with open("evaluation_examples/test_all.json", "r", encoding="utf-8") as f:
         test_all_meta = json.load(f)
 
+    # Drop examples that already have a result directory, then log what is left per domain.
+    test_file_list = get_unfinished(args.action_space, args.model, args.observation_type, args.result_dir, test_all_meta)
+    remaining = (f"{domain}: {len(examples)}\n" for domain, examples in test_file_list.items())
+    left_info = "".join(remaining)
+    logger.info(f"Left tasks:\n{left_info}")
+
     test(args, test_all_meta)