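Update the usage example in run_maestro.py to run in headless mode with a single environment and an explicit result directory. Raise the default TTL for AWS instances from 60 to 180 minutes in config.py. Enhance AWSProvider to fall back when an instance's security groups, subnet ID, or instance type are missing (using AWS_SECURITY_GROUP_ID, AWS_SUBNET_ID, and AWS_INSTANCE_TYPE from the environment, with `t3.large` as the final instance-type default), and improve the termination logic to skip instances that are already terminated or shutting down while logging the relevant information.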
This commit is contained in:
@@ -3,7 +3,7 @@ import os
|
||||
|
||||
# Default TTL minutes for instance auto-termination (cloud-side scheduler)
|
||||
# Can be overridden via environment variable DEFAULT_TTL_MINUTES
|
||||
DEFAULT_TTL_MINUTES: int = int(os.getenv("DEFAULT_TTL_MINUTES", "60"))
|
||||
DEFAULT_TTL_MINUTES: int = int(os.getenv("DEFAULT_TTL_MINUTES", "180"))
|
||||
|
||||
# Master switch for TTL feature
|
||||
ENABLE_TTL: bool = os.getenv("ENABLE_TTL", "true").lower() == "true"
|
||||
|
||||
@@ -108,13 +108,52 @@ class AWSProvider(Provider):
|
||||
# Step 1: Retrieve the original instance details
|
||||
instance_details = ec2_client.describe_instances(InstanceIds=[path_to_vm])
|
||||
instance = instance_details['Reservations'][0]['Instances'][0]
|
||||
security_groups = [sg['GroupId'] for sg in instance['SecurityGroups']]
|
||||
subnet_id = instance['SubnetId']
|
||||
instance_type = instance['InstanceType']
|
||||
# Resolve security groups with fallbacks
|
||||
security_groups = [sg['GroupId'] for sg in instance.get('SecurityGroups', []) if 'GroupId' in sg]
|
||||
if not security_groups:
|
||||
env_sg = os.getenv('AWS_SECURITY_GROUP_ID')
|
||||
if env_sg:
|
||||
security_groups = [env_sg]
|
||||
logger.info("SecurityGroups missing on instance; using AWS_SECURITY_GROUP_ID from env")
|
||||
else:
|
||||
raise ValueError("No security groups found on instance and AWS_SECURITY_GROUP_ID not set")
|
||||
|
||||
# Resolve subnet with fallbacks
|
||||
subnet_id = instance.get('SubnetId')
|
||||
if not subnet_id:
|
||||
nis = instance.get('NetworkInterfaces', []) or []
|
||||
if nis and isinstance(nis, list):
|
||||
for ni in nis:
|
||||
if isinstance(ni, dict) and ni.get('SubnetId'):
|
||||
subnet_id = ni.get('SubnetId')
|
||||
break
|
||||
if not subnet_id:
|
||||
env_subnet = os.getenv('AWS_SUBNET_ID')
|
||||
if env_subnet:
|
||||
subnet_id = env_subnet
|
||||
logger.info("SubnetId missing on instance; using AWS_SUBNET_ID from env")
|
||||
else:
|
||||
raise ValueError("SubnetId not available on instance, NetworkInterfaces, or environment")
|
||||
|
||||
# Resolve instance type with fallbacks
|
||||
instance_type = instance.get('InstanceType') or os.getenv('AWS_INSTANCE_TYPE') or 't3.large'
|
||||
if instance.get('InstanceType') is None:
|
||||
logger.info(f"InstanceType missing on instance; using '{instance_type}' from env/default")
|
||||
|
||||
# Step 2: Terminate the old instance
|
||||
ec2_client.terminate_instances(InstanceIds=[path_to_vm])
|
||||
logger.info(f"Old instance {path_to_vm} has been terminated.")
|
||||
# Step 2: Terminate the old instance (skip if already terminated/shutting-down)
|
||||
state = (instance.get('State') or {}).get('Name')
|
||||
if state in ['shutting-down', 'terminated']:
|
||||
logger.info(f"Old instance {path_to_vm} is already in state '{state}', skipping termination.")
|
||||
else:
|
||||
try:
|
||||
ec2_client.terminate_instances(InstanceIds=[path_to_vm])
|
||||
logger.info(f"Old instance {path_to_vm} has been terminated.")
|
||||
except ClientError as e:
|
||||
error_code = getattr(getattr(e, 'response', {}), 'get', lambda *_: None)('Error', {}).get('Code') if hasattr(e, 'response') else None
|
||||
if error_code in ['InvalidInstanceID.NotFound', 'IncorrectInstanceState']:
|
||||
logger.info(f"Ignore termination error for {path_to_vm}: {error_code}")
|
||||
else:
|
||||
raise
|
||||
|
||||
# Step 3: Launch a new instance from the snapshot (AMI) with performance optimization
|
||||
logger.info(f"Launching a new instance from AMI {snapshot_name}...")
|
||||
|
||||
@@ -544,7 +544,7 @@ def get_unfinished(
|
||||
|
||||
if __name__ == "__main__":
|
||||
"""
|
||||
xvfb-run -a python run_maestro.py --test_all_meta_path evaluation_examples/test_nogdrive.json --num_envs 15
|
||||
xvfb-run -a python run_maestro.py --test_all_meta_path evaluation_examples/test_nogdrive.json --num_envs 1 --headless --result_dir ./results_maestro_debug
|
||||
"""
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
Reference in New Issue
Block a user