From 2d5439d062101beb17577e969d09fb1d5fbd286f Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Tue, 10 Jun 2025 13:32:24 +0000 Subject: [PATCH] edit pub eval readme --- PUBLIC_EVALUATION_GUIDELINE.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/PUBLIC_EVALUATION_GUIDELINE.md b/PUBLIC_EVALUATION_GUIDELINE.md index c0375f4..bff9888 100644 --- a/PUBLIC_EVALUATION_GUIDELINE.md +++ b/PUBLIC_EVALUATION_GUIDELINE.md @@ -1,5 +1,6 @@ # Public Evaluation Platform User Guide + We have built an AWS-based platform for large-scale parallel evaluation of OSWorld tasks. The system follows a Host-Client architecture: - **Host Instance**: The central controller that stores code, configurations, and manages task execution. @@ -7,6 +8,7 @@ We have built an AWS-based platform for large-scale parallel evaluation of OSWor All instances use a preconfigured AMI to ensure a consistent environment. + ## 1. Platform Deployment & Connection ### 1.1 Launch the Host Instance @@ -78,6 +80,7 @@ In the **Access keys** section, click **"Create access key"** to generate your o pubeval5 + ## 2. Environment Setup ### 2.1 Google Drive Integration @@ -117,6 +120,7 @@ export AWS_SUBNET_ID="subnet-0a4b0c5b8f6066712" export AWS_SECURITY_GROUP_ID="sg-08a53433e9b4abde6" ``` + ## 3. Running Evaluations Use the `run_multienv_xxx.py` scripts to launch tasks in parallel. @@ -143,6 +147,7 @@ Key Parameters: - `--test_all_meta_path`: Path to the test set metadata - `--region`: AWS region + ## 4. Viewing Results ### 4.1 Web Monitoring Tool