Update the example command in run_maestro.py's docstring to run in headless mode with a single environment and an explicit result directory. Raise the default TTL for AWS instances from 60 to 180 minutes in config.py. Enhance AWSProvider to handle missing security groups, subnet IDs, and instance types with fallbacks, and improve the termination logic to skip instances that are already terminated or shutting down, while logging relevant information.

This commit is contained in:
Timothyxxx
2025-10-01 06:56:33 +00:00
parent 3a4b67304f
commit 4c685bed99
3 changed files with 47 additions and 8 deletions

View File

@@ -3,7 +3,7 @@ import os
# Default TTL minutes for instance auto-termination (cloud-side scheduler)
# Can be overridden via environment variable DEFAULT_TTL_MINUTES
DEFAULT_TTL_MINUTES: int = int(os.getenv("DEFAULT_TTL_MINUTES", "60"))
DEFAULT_TTL_MINUTES: int = int(os.getenv("DEFAULT_TTL_MINUTES", "180"))
# Master switch for TTL feature
ENABLE_TTL: bool = os.getenv("ENABLE_TTL", "true").lower() == "true"

View File

@@ -108,13 +108,52 @@ class AWSProvider(Provider):
# Step 1: Retrieve the original instance details
instance_details = ec2_client.describe_instances(InstanceIds=[path_to_vm])
instance = instance_details['Reservations'][0]['Instances'][0]
security_groups = [sg['GroupId'] for sg in instance['SecurityGroups']]
subnet_id = instance['SubnetId']
instance_type = instance['InstanceType']
# Resolve security groups with fallbacks
security_groups = [sg['GroupId'] for sg in instance.get('SecurityGroups', []) if 'GroupId' in sg]
if not security_groups:
env_sg = os.getenv('AWS_SECURITY_GROUP_ID')
if env_sg:
security_groups = [env_sg]
logger.info("SecurityGroups missing on instance; using AWS_SECURITY_GROUP_ID from env")
else:
raise ValueError("No security groups found on instance and AWS_SECURITY_GROUP_ID not set")
# Resolve subnet with fallbacks
subnet_id = instance.get('SubnetId')
if not subnet_id:
nis = instance.get('NetworkInterfaces', []) or []
if nis and isinstance(nis, list):
for ni in nis:
if isinstance(ni, dict) and ni.get('SubnetId'):
subnet_id = ni.get('SubnetId')
break
if not subnet_id:
env_subnet = os.getenv('AWS_SUBNET_ID')
if env_subnet:
subnet_id = env_subnet
logger.info("SubnetId missing on instance; using AWS_SUBNET_ID from env")
else:
raise ValueError("SubnetId not available on instance, NetworkInterfaces, or environment")
# Resolve instance type with fallbacks
instance_type = instance.get('InstanceType') or os.getenv('AWS_INSTANCE_TYPE') or 't3.large'
if instance.get('InstanceType') is None:
logger.info(f"InstanceType missing on instance; using '{instance_type}' from env/default")
# Step 2: Terminate the old instance
ec2_client.terminate_instances(InstanceIds=[path_to_vm])
logger.info(f"Old instance {path_to_vm} has been terminated.")
# Step 2: Terminate the old instance (skip if already terminated/shutting-down)
state = (instance.get('State') or {}).get('Name')
if state in ['shutting-down', 'terminated']:
logger.info(f"Old instance {path_to_vm} is already in state '{state}', skipping termination.")
else:
try:
ec2_client.terminate_instances(InstanceIds=[path_to_vm])
logger.info(f"Old instance {path_to_vm} has been terminated.")
except ClientError as e:
error_code = getattr(getattr(e, 'response', {}), 'get', lambda *_: None)('Error', {}).get('Code') if hasattr(e, 'response') else None
if error_code in ['InvalidInstanceID.NotFound', 'IncorrectInstanceState']:
logger.info(f"Ignore termination error for {path_to_vm}: {error_code}")
else:
raise
# Step 3: Launch a new instance from the snapshot(AMI) with performance optimization
logger.info(f"Launching a new instance from AMI {snapshot_name}...")

View File

@@ -544,7 +544,7 @@ def get_unfinished(
if __name__ == "__main__":
"""
xvfb-run -a python run_maestro.py --test_all_meta_path evaluation_examples/test_nogdrive.json --num_envs 15
xvfb-run -a python run_maestro.py --test_all_meta_path evaluation_examples/test_nogdrive.json --num_envs 1 --headless --result_dir ./results_maestro_debug
"""
os.environ["TOKENIZERS_PARALLELISM"] = "false"