From 7d25f902a449b433672ce3fcbcba9c0d73c5e7f7 Mon Sep 17 00:00:00 2001
From: adlsdztony
Date: Fri, 6 Jun 2025 12:55:13 +0000
Subject: [PATCH] refactor&fix: update README and main.py for improved
 configuration and task status handling

---
 monitor/.env      |  4 ++--
 monitor/README.md | 22 ++++++++++++++--------
 monitor/main.py   | 31 +++++++++++++++++++++++++------
 3 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/monitor/.env b/monitor/.env
index 8f8e845..a6bfc16 100644
--- a/monitor/.env
+++ b/monitor/.env
@@ -2,9 +2,9 @@
 # Do not write any secret keys or sensitive information here.
 
 # Monitor configuration
-TASK_CONFIG_PATH=../evaluation_examples/test_all.json
+TASK_CONFIG_PATH=../evaluation_examples/test.json
 EXAMPLES_BASE_PATH=../evaluation_examples/examples
-RESULTS_BASE_PATH=../results_operator_aws
+RESULTS_BASE_PATH=../results
 ACTION_SPACE=pyautogui
 OBSERVATION_TYPE=screenshot
 MODEL_NAME=computer-use-preview
diff --git a/monitor/README.md b/monitor/README.md
index acbe2d3..f529693 100644
--- a/monitor/README.md
+++ b/monitor/README.md
@@ -19,10 +19,13 @@ The monitor can be configured by editing the `.env` file in the monitor director
 
 | Variable | Description | Default Value |
 |----------|-------------|---------------|
-| TASK_CONFIG_PATH | Path to the task configuration JSON file | ../evaluation_examples/test_small.json |
-| EXAMPLES_BASE_PATH | Base path for task example files | ../evaluation_examples/examples |
-| RESULTS_BASE_PATH | Base path for execution results | ../results_operator_aws/pyautogui/screenshot/computer-use-preview |
-| MAX_STEPS | Maximum steps to display for a task | 50 |
+| TASK_CONFIG_PATH | Path to the task configuration file | ../evaluation_examples/test.json |
+| EXAMPLES_BASE_PATH | Base path for example files | ../evaluation_examples/examples |
+| RESULTS_BASE_PATH | Base path for storing results | ../results |
+| ACTION_SPACE | Action space type (e.g., pyautogui, keyboard) | pyautogui |
+| OBSERVATION_TYPE | Type of observation (e.g., screenshot, video) | screenshot |
+| MODEL_NAME | Name of the model to use for task execution | computer-use-preview |
+| MAX_STEPS | Maximum steps to display for a task | 150 |
 | FLASK_PORT | Port for the web server | 80 |
 | FLASK_HOST | Host address for the web server | 0.0.0.0 |
 | FLASK_DEBUG | Enable debug mode (true/false) | false |
@@ -30,13 +33,16 @@ The monitor can be configured by editing the `.env` file in the monitor director
 For example:
 ```bash
 # .env
-TASK_CONFIG_PATH=../evaluation_examples/test_small.json
+TASK_CONFIG_PATH=../evaluation_examples/test.json
 EXAMPLES_BASE_PATH=../evaluation_examples/examples
-RESULTS_BASE_PATH=../results_operator_aws/pyautogui/screenshot/computer-use-preview
-MAX_STEPS=50
+RESULTS_BASE_PATH=../results
+ACTION_SPACE=pyautogui
+OBSERVATION_TYPE=screenshot
+MODEL_NAME=computer-use-preview
+MAX_STEPS=150
 FLASK_PORT=80
 FLASK_HOST=0.0.0.0
-FLASK_DEBUG=false
+FLASK_DEBUG=true
 ```
 
 ## Running with Docker
diff --git a/monitor/main.py b/monitor/main.py
index b2fde45..1657a78 100644
--- a/monitor/main.py
+++ b/monitor/main.py
@@ -12,8 +12,11 @@ from dotenv import load_dotenv
 
 # Load environment variables from .env file
 load_dotenv()
 
-# {task_type}_{task_id}: status_dict
+# {task_type}_{task_id}: (status_dict, timestamp)
+# For "Done" status, we need to verify it for a period to ensure it doesn't change to "Error"
 TASK_STATUS_CACHE = {}
+# Time in seconds to consider "Done" status as stable (default: 30s)
+DONE_STABILITY_PERIOD = int(os.getenv("DONE_STABILITY_PERIOD", "30"))
 
 app = Flask(__name__)
@@ -26,14 +29,14 @@ if MONITOR_IN_DOCKER:
     RESULTS_BASE_PATH = "/app/results"
 else:
     # Load configuration from environment variables
-    TASK_CONFIG_PATH = os.getenv("TASK_CONFIG_PATH", "../evaluation_examples/test_small.json")
+    TASK_CONFIG_PATH = os.getenv("TASK_CONFIG_PATH", "../evaluation_examples/test.json")
     EXAMPLES_BASE_PATH = os.getenv("EXAMPLES_BASE_PATH", "../evaluation_examples/examples")
     RESULTS_BASE_PATH = os.getenv("RESULTS_BASE_PATH", "../results")
 
 ACTION_SPACE=os.getenv("ACTION_SPACE", "pyautogui")
 OBSERVATION_TYPE=os.getenv("OBSERVATION_TYPE", "screenshot")
 MODEL_NAME=os.getenv("MODEL_NAME", "computer-use-preview")
-MAX_STEPS = int(os.getenv("MAX_STEPS", "50"))
+MAX_STEPS = int(os.getenv("MAX_STEPS", "150"))
 
 RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
 
@@ -177,9 +180,24 @@ def get_task_status_brief(task_type, task_id):
     # Generate cache key based on task type and ID
     cache_key = f"{task_type}_{task_id}"
 
-    # Check if the status is already cached
+    # Check if the status is already cached
+    current_time = time.time()
+    last_cache_time = None
     if cache_key in TASK_STATUS_CACHE:
-        return TASK_STATUS_CACHE[cache_key]
+        cached_status, cached_time = TASK_STATUS_CACHE[cache_key]
+        last_cache_time = cached_time
+        # If cached status is "Done", check if it's within the stability period
+        if cached_status["status"].startswith("Done"):
+            # If within stability period, recalculate status to ensure it's correct
+            if current_time - cached_time < DONE_STABILITY_PERIOD:
+                # Status is still in verification period, refresh it
+                pass
+            else:
+                # Status is stable, return from cache
+                return cached_status
+        else:
+            # For non-Done status (like Error), just return from cache
+            return cached_status
 
     result_dir = os.path.join(RESULTS_PATH, task_type, task_id)
 
@@ -293,7 +311,8 @@
 
     # Cache the status if it is done or error
     if status.startswith("Done") or status == "Error":
-        TASK_STATUS_CACHE[cache_key] = status_dict
+        current_time = last_cache_time if last_cache_time else current_time
+        TASK_STATUS_CACHE[cache_key] = (status_dict, current_time)
 
     return status_dict
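
Note on the new status-caching behavior in `monitor/main.py`: a status that has just turned "Done" is not trusted immediately, because a later evaluation step can still flip it to "Error". The status is therefore recomputed on every request until it has stayed "Done" for `DONE_STABILITY_PERIOD` seconds, keeping the timestamp of the first "Done" observation so re-checks do not reset the window, while other terminal statuses such as "Error" are cached and returned at once. The sketch below restates that logic in isolation; `STATUS_CACHE`, `get_status`, and `compute_status` are hypothetical stand-ins for illustration, not the actual names in `monitor/main.py`.

```python
import os
import time

STATUS_CACHE = {}  # key -> (status_dict, first_seen_timestamp)
STABILITY_PERIOD = int(os.getenv("DONE_STABILITY_PERIOD", "30"))

def get_status(key, compute_status):
    """Return a terminal status, re-verifying "Done" for a grace period."""
    now = time.time()
    first_seen = None
    if key in STATUS_CACHE:
        cached, cached_at = STATUS_CACHE[key]
        first_seen = cached_at
        if cached["status"].startswith("Done"):
            if now - cached_at >= STABILITY_PERIOD:
                # "Done" has been stable for the whole window; trust the cache.
                return cached
            # Otherwise fall through and recompute, in case it became "Error".
        else:
            # Non-"Done" terminal statuses (e.g. "Error") are final.
            return cached

    status = compute_status()  # recompute from the results directory
    if status["status"].startswith("Done") or status["status"] == "Error":
        # Reuse the first-seen timestamp so re-checks don't restart the window.
        STATUS_CACHE[key] = (status, first_seen if first_seen else now)
    return status

# Example: the first call caches "Done"; calls within the next 30s recompute
# it, and calls after 30s return it straight from the cache.
print(get_status("chrome_abc123", lambda: {"status": "Done"}))
```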