diff --git a/desktop_env/providers/aws/config.py b/desktop_env/providers/aws/config.py index 5c43838..ef24881 100644 --- a/desktop_env/providers/aws/config.py +++ b/desktop_env/providers/aws/config.py @@ -3,7 +3,7 @@ import os # Default TTL minutes for instance auto-termination (cloud-side scheduler) # Can be overridden via environment variable DEFAULT_TTL_MINUTES -DEFAULT_TTL_MINUTES: int = int(os.getenv("DEFAULT_TTL_MINUTES", "60")) +DEFAULT_TTL_MINUTES: int = int(os.getenv("DEFAULT_TTL_MINUTES", "180")) # Master switch for TTL feature ENABLE_TTL: bool = os.getenv("ENABLE_TTL", "true").lower() == "true" diff --git a/desktop_env/providers/aws/provider.py b/desktop_env/providers/aws/provider.py index 124e142..d5ad06e 100644 --- a/desktop_env/providers/aws/provider.py +++ b/desktop_env/providers/aws/provider.py @@ -108,13 +108,52 @@ class AWSProvider(Provider): # Step 1: Retrieve the original instance details instance_details = ec2_client.describe_instances(InstanceIds=[path_to_vm]) instance = instance_details['Reservations'][0]['Instances'][0] - security_groups = [sg['GroupId'] for sg in instance['SecurityGroups']] - subnet_id = instance['SubnetId'] - instance_type = instance['InstanceType'] + # Resolve security groups with fallbacks + security_groups = [sg['GroupId'] for sg in instance.get('SecurityGroups', []) if 'GroupId' in sg] + if not security_groups: + env_sg = os.getenv('AWS_SECURITY_GROUP_ID') + if env_sg: + security_groups = [env_sg] + logger.info("SecurityGroups missing on instance; using AWS_SECURITY_GROUP_ID from env") + else: + raise ValueError("No security groups found on instance and AWS_SECURITY_GROUP_ID not set") + + # Resolve subnet with fallbacks + subnet_id = instance.get('SubnetId') + if not subnet_id: + nis = instance.get('NetworkInterfaces', []) or [] + if nis and isinstance(nis, list): + for ni in nis: + if isinstance(ni, dict) and ni.get('SubnetId'): + subnet_id = ni.get('SubnetId') + break + if not subnet_id: + env_subnet = os.getenv('AWS_SUBNET_ID') + if env_subnet: + subnet_id = env_subnet + logger.info("SubnetId missing on instance; using AWS_SUBNET_ID from env") + else: + raise ValueError("SubnetId not available on instance, NetworkInterfaces, or environment") + + # Resolve instance type with fallbacks + instance_type = instance.get('InstanceType') or os.getenv('AWS_INSTANCE_TYPE') or 't3.large' + if instance.get('InstanceType') is None: + logger.info(f"InstanceType missing on instance; using '{instance_type}' from env/default") - # Step 2: Terminate the old instance - ec2_client.terminate_instances(InstanceIds=[path_to_vm]) - logger.info(f"Old instance {path_to_vm} has been terminated.") + # Step 2: Terminate the old instance (skip if already terminated/shutting-down) + state = (instance.get('State') or {}).get('Name') + if state in ['shutting-down', 'terminated']: + logger.info(f"Old instance {path_to_vm} is already in state '{state}', skipping termination.") + else: + try: + ec2_client.terminate_instances(InstanceIds=[path_to_vm]) + logger.info(f"Old instance {path_to_vm} has been terminated.") + except ClientError as e: + error_code = getattr(getattr(e, 'response', {}), 'get', lambda *_: None)('Error', {}).get('Code') if hasattr(e, 'response') else None + if error_code in ['InvalidInstanceID.NotFound', 'IncorrectInstanceState']: + logger.info(f"Ignore termination error for {path_to_vm}: {error_code}") + else: + raise # Step 3: Launch a new instance from the snapshot(AMI) with performance optimization logger.info(f"Launching a new instance from AMI {snapshot_name}...") diff --git a/run_maestro.py b/run_maestro.py index 5366ad8..b411112 100644 --- a/run_maestro.py +++ b/run_maestro.py @@ -544,7 +544,7 @@ def get_unfinished( if __name__ == "__main__": """ - xvfb-run -a python run_maestro.py --test_all_meta_path evaluation_examples/test_nogdrive.json --num_envs 15 + xvfb-run -a python run_maestro.py --test_all_meta_path evaluation_examples/test_nogdrive.json --num_envs 1 --headless --result_dir ./results_maestro_debug """ os.environ["TOKENIZERS_PARALLELISM"] = "false"