From 4c685bed99f1d4fb855aa3d98a4ed5987af7bb6f Mon Sep 17 00:00:00 2001 From: Timothyxxx Date: Wed, 1 Oct 2025 06:56:33 +0000 Subject: [PATCH] Update run_maestro.py to run in headless mode with a single environment and specify result directory. Adjust default TTL for AWS instances from 60 to 180 minutes in config.py. Enhance AWSProvider to handle missing security groups, subnet IDs, and instance types with fallbacks, and improve termination logic to skip already terminated instances while logging relevant information. --- desktop_env/providers/aws/config.py | 2 +- desktop_env/providers/aws/provider.py | 51 +++++++++++++++++++++++---- run_maestro.py | 2 +- 3 files changed, 47 insertions(+), 8 deletions(-) diff --git a/desktop_env/providers/aws/config.py b/desktop_env/providers/aws/config.py index 5c43838..ef24881 100644 --- a/desktop_env/providers/aws/config.py +++ b/desktop_env/providers/aws/config.py @@ -3,7 +3,7 @@ import os # Default TTL minutes for instance auto-termination (cloud-side scheduler) # Can be overridden via environment variable DEFAULT_TTL_MINUTES -DEFAULT_TTL_MINUTES: int = int(os.getenv("DEFAULT_TTL_MINUTES", "60")) +DEFAULT_TTL_MINUTES: int = int(os.getenv("DEFAULT_TTL_MINUTES", "180")) # Master switch for TTL feature ENABLE_TTL: bool = os.getenv("ENABLE_TTL", "true").lower() == "true" diff --git a/desktop_env/providers/aws/provider.py b/desktop_env/providers/aws/provider.py index 124e142..d5ad06e 100644 --- a/desktop_env/providers/aws/provider.py +++ b/desktop_env/providers/aws/provider.py @@ -108,13 +108,52 @@ class AWSProvider(Provider): # Step 1: Retrieve the original instance details instance_details = ec2_client.describe_instances(InstanceIds=[path_to_vm]) instance = instance_details['Reservations'][0]['Instances'][0] - security_groups = [sg['GroupId'] for sg in instance['SecurityGroups']] - subnet_id = instance['SubnetId'] - instance_type = instance['InstanceType'] + # Resolve security groups with fallbacks + security_groups = [sg['GroupId'] for sg in instance.get('SecurityGroups', []) if 'GroupId' in sg] + if not security_groups: + env_sg = os.getenv('AWS_SECURITY_GROUP_ID') + if env_sg: + security_groups = [env_sg] + logger.info("SecurityGroups missing on instance; using AWS_SECURITY_GROUP_ID from env") + else: + raise ValueError("No security groups found on instance and AWS_SECURITY_GROUP_ID not set") + + # Resolve subnet with fallbacks + subnet_id = instance.get('SubnetId') + if not subnet_id: + nis = instance.get('NetworkInterfaces', []) or [] + if nis and isinstance(nis, list): + for ni in nis: + if isinstance(ni, dict) and ni.get('SubnetId'): + subnet_id = ni.get('SubnetId') + break + if not subnet_id: + env_subnet = os.getenv('AWS_SUBNET_ID') + if env_subnet: + subnet_id = env_subnet + logger.info("SubnetId missing on instance; using AWS_SUBNET_ID from env") + else: + raise ValueError("SubnetId not available on instance, NetworkInterfaces, or environment") + + # Resolve instance type with fallbacks + instance_type = instance.get('InstanceType') or os.getenv('AWS_INSTANCE_TYPE') or 't3.large' + if instance.get('InstanceType') is None: + logger.info(f"InstanceType missing on instance; using '{instance_type}' from env/default") - # Step 2: Terminate the old instance - ec2_client.terminate_instances(InstanceIds=[path_to_vm]) - logger.info(f"Old instance {path_to_vm} has been terminated.") + # Step 2: Terminate the old instance (skip if already terminated/shutting-down) + state = (instance.get('State') or {}).get('Name') + if state in ['shutting-down', 'terminated']: + logger.info(f"Old instance {path_to_vm} is already in state '{state}', skipping termination.") + else: + try: + ec2_client.terminate_instances(InstanceIds=[path_to_vm]) + logger.info(f"Old instance {path_to_vm} has been terminated.") + except ClientError as e: + error_code = getattr(getattr(e, 'response', {}), 'get', lambda *_: None)('Error', {}).get('Code') if hasattr(e, 'response') else None + if error_code in ['InvalidInstanceID.NotFound', 'IncorrectInstanceState']: + logger.info(f"Ignore termination error for {path_to_vm}: {error_code}") + else: + raise # Step 3: Launch a new instance from the snapshot(AMI) with performance optimization logger.info(f"Launching a new instance from AMI {snapshot_name}...") diff --git a/run_maestro.py b/run_maestro.py index 5366ad8..b411112 100644 --- a/run_maestro.py +++ b/run_maestro.py @@ -544,7 +544,7 @@ def get_unfinished( if __name__ == "__main__": """ - xvfb-run -a python run_maestro.py --test_all_meta_path evaluation_examples/test_nogdrive.json --num_envs 15 + xvfb-run -a python run_maestro.py --test_all_meta_path evaluation_examples/test_nogdrive.json --num_envs 1 --headless --result_dir ./results_maestro_debug """ os.environ["TOKENIZERS_PARALLELISM"] = "false"