From 7d25f902a449b433672ce3fcbcba9c0d73c5e7f7 Mon Sep 17 00:00:00 2001
From: adlsdztony
Date: Fri, 6 Jun 2025 12:55:13 +0000
Subject: [PATCH] refactor&fix: update README and main.py for improved
 configuration and task status handling

---
 monitor/.env      |  4 ++--
 monitor/README.md | 22 ++++++++++++++--------
 monitor/main.py   | 31 +++++++++++++++++++++++++------
 3 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/monitor/.env b/monitor/.env
index 8f8e845..a6bfc16 100644
--- a/monitor/.env
+++ b/monitor/.env
@@ -2,9 +2,9 @@
 # Do not write any secret keys or sensitive information here.
 
 # Monitor configuration
-TASK_CONFIG_PATH=../evaluation_examples/test_all.json
+TASK_CONFIG_PATH=../evaluation_examples/test.json
 EXAMPLES_BASE_PATH=../evaluation_examples/examples
-RESULTS_BASE_PATH=../results_operator_aws
+RESULTS_BASE_PATH=../results
 ACTION_SPACE=pyautogui
 OBSERVATION_TYPE=screenshot
 MODEL_NAME=computer-use-preview
diff --git a/monitor/README.md b/monitor/README.md
index acbe2d3..f529693 100644
--- a/monitor/README.md
+++ b/monitor/README.md
@@ -19,10 +19,13 @@ The monitor can be configured by editing the `.env` file in the monitor director
 
 | Variable | Description | Default Value |
 |----------|-------------|---------------|
-| TASK_CONFIG_PATH | Path to the task configuration JSON file | ../evaluation_examples/test_small.json |
-| EXAMPLES_BASE_PATH | Base path for task example files | ../evaluation_examples/examples |
-| RESULTS_BASE_PATH | Base path for execution results | ../results_operator_aws/pyautogui/screenshot/computer-use-preview |
-| MAX_STEPS | Maximum steps to display for a task | 50 |
+| TASK_CONFIG_PATH | Path to the task configuration file | ../evaluation_examples/test.json |
+| EXAMPLES_BASE_PATH | Base path for example files | ../evaluation_examples/examples |
+| RESULTS_BASE_PATH | Base path for storing results | ../results |
+| ACTION_SPACE | Action space type (e.g., pyautogui, keyboard) | pyautogui |
+| OBSERVATION_TYPE | Type of observation (e.g., screenshot, video) | screenshot |
+| MODEL_NAME | Name of the model to use for task execution | computer-use-preview |
+| MAX_STEPS | Maximum steps to display for a task | 150 |
 | FLASK_PORT | Port for the web server | 80 |
 | FLASK_HOST | Host address for the web server | 0.0.0.0 |
 | FLASK_DEBUG | Enable debug mode (true/false) | false |
@@ -30,13 +33,16 @@ The monitor can be configured by editing the `.env` file in the monitor director
 For example:
 ```bash
 # .env
-TASK_CONFIG_PATH=../evaluation_examples/test_small.json
+TASK_CONFIG_PATH=../evaluation_examples/test.json
 EXAMPLES_BASE_PATH=../evaluation_examples/examples
-RESULTS_BASE_PATH=../results_operator_aws/pyautogui/screenshot/computer-use-preview
-MAX_STEPS=50
+RESULTS_BASE_PATH=../results
+ACTION_SPACE=pyautogui
+OBSERVATION_TYPE=screenshot
+MODEL_NAME=computer-use-preview
+MAX_STEPS=150
 FLASK_PORT=80
 FLASK_HOST=0.0.0.0
-FLASK_DEBUG=false
+FLASK_DEBUG=true
 ```
 
 ## Running with Docker
diff --git a/monitor/main.py b/monitor/main.py
index b2fde45..1657a78 100644
--- a/monitor/main.py
+++ b/monitor/main.py
@@ -12,8 +12,11 @@ from dotenv import load_dotenv
 
 # Load environment variables from .env file
 load_dotenv()
 
-# {task_type}_{task_id}: status_dict
+# {task_type}_{task_id}: (status_dict, timestamp)
+# For "Done" status, we need to verify it for a period to ensure it doesn't change to "Error"
 TASK_STATUS_CACHE = {}
+# Time in seconds to consider "Done" status as stable (default: 30s)
+DONE_STABILITY_PERIOD = int(os.getenv("DONE_STABILITY_PERIOD", "30"))
 
 app = Flask(__name__)
@@ -26,14 +29,14 @@ if MONITOR_IN_DOCKER:
     RESULTS_BASE_PATH = "/app/results"
 else:
     # Load configuration from environment variables
-    TASK_CONFIG_PATH = os.getenv("TASK_CONFIG_PATH", "../evaluation_examples/test_small.json")
+    TASK_CONFIG_PATH = os.getenv("TASK_CONFIG_PATH", "../evaluation_examples/test.json")
     EXAMPLES_BASE_PATH = os.getenv("EXAMPLES_BASE_PATH", "../evaluation_examples/examples")
     RESULTS_BASE_PATH = os.getenv("RESULTS_BASE_PATH", "../results")
 
 ACTION_SPACE=os.getenv("ACTION_SPACE", "pyautogui")
 OBSERVATION_TYPE=os.getenv("OBSERVATION_TYPE", "screenshot")
 MODEL_NAME=os.getenv("MODEL_NAME", "computer-use-preview")
-MAX_STEPS = int(os.getenv("MAX_STEPS", "50"))
+MAX_STEPS = int(os.getenv("MAX_STEPS", "150"))
 
 RESULTS_PATH = os.path.join(RESULTS_BASE_PATH, ACTION_SPACE, OBSERVATION_TYPE, MODEL_NAME)
 
@@ -177,9 +180,24 @@ def get_task_status_brief(task_type, task_id):
     # Generate cache key based on task type and ID
     cache_key = f"{task_type}_{task_id}"
 
-    # Check if the status is already cached
+    # Check if the status is already cached
+    current_time = time.time()
+    last_cache_time = None
     if cache_key in TASK_STATUS_CACHE:
-        return TASK_STATUS_CACHE[cache_key]
+        cached_status, cached_time = TASK_STATUS_CACHE[cache_key]
+        last_cache_time = cached_time
+        # If cached status is "Done", check if it's within the stability period
+        if cached_status["status"].startswith("Done"):
+            # If within stability period, recalculate status to ensure it's correct
+            if current_time - cached_time < DONE_STABILITY_PERIOD:
+                # Status is still in verification period, refresh it
+                pass
+            else:
+                # Status is stable, return from cache
+                return cached_status
+        else:
+            # For non-Done status (like Error), just return from cache
+            return cached_status
 
     result_dir = os.path.join(RESULTS_PATH, task_type, task_id)
 
@@ -293,7 +311,8 @@
 
     # Cache the status if it is done or error
     if status.startswith("Done") or status == "Error":
-        TASK_STATUS_CACHE[cache_key] = status_dict
+        current_time = last_cache_time if last_cache_time else current_time
+        TASK_STATUS_CACHE[cache_key] = (status_dict, current_time)
 
     return status_dict
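
Note on the new status-caching behavior in `monitor/main.py`: a status that has just turned "Done" is not trusted immediately, because a later evaluation step can still flip it to "Error". The status is therefore recomputed on every request until it has stayed "Done" for `DONE_STABILITY_PERIOD` seconds, keeping the timestamp of the first "Done" observation so re-checks do not reset the window, while other terminal statuses such as "Error" are cached and returned at once. The sketch below restates that logic in isolation; `STATUS_CACHE`, `get_status`, and `compute_status` are hypothetical stand-ins for illustration, not the actual names in `monitor/main.py`.

```python
import os
import time

STATUS_CACHE = {}  # key -> (status_dict, first_seen_timestamp)
STABILITY_PERIOD = int(os.getenv("DONE_STABILITY_PERIOD", "30"))

def get_status(key, compute_status):
    """Return a terminal status, re-verifying "Done" for a grace period."""
    now = time.time()
    first_seen = None
    if key in STATUS_CACHE:
        cached, cached_at = STATUS_CACHE[key]
        first_seen = cached_at
        if cached["status"].startswith("Done"):
            if now - cached_at >= STABILITY_PERIOD:
                # "Done" has been stable for the whole window; trust the cache.
                return cached
            # Otherwise fall through and recompute, in case it became "Error".
        else:
            # Non-"Done" terminal statuses (e.g. "Error") are final.
            return cached

    status = compute_status()  # recompute from the results directory
    if status["status"].startswith("Done") or status["status"] == "Error":
        # Reuse the first-seen timestamp so re-checks don't restart the window.
        STATUS_CACHE[key] = (status, first_seen if first_seen else now)
    return status

# Example: the first call caches "Done"; calls within the next 30s recompute
# it, and calls after 30s return it straight from the cache.
print(get_status("chrome_abc123", lambda: {"status": "Done"}))
```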