From 3a96fd5046e3a0d93c8d0838cb863a876b1eb577 Mon Sep 17 00:00:00 2001 From: Timothyxxx Date: Mon, 18 Aug 2025 17:30:49 +0000 Subject: [PATCH] Add TTL configuration for AWS instance management - Introduced a new config module to manage TTL settings for EC2 instances, allowing for auto-termination based on environment variables. - Updated the AWSProvider and manager to utilize the new TTL settings, including scheduling instance termination via EventBridge Scheduler. - Added utility functions for resolving the scheduler role ARN and creating termination schedules, ensuring robust error handling and logging. - Maintained existing code logic while integrating new features for improved instance lifecycle management. --- desktop_env/providers/aws/config.py | 21 ++++ desktop_env/providers/aws/manager.py | 26 ++++- desktop_env/providers/aws/provider.py | 50 ++++++++- desktop_env/providers/aws/scheduler_utils.py | 107 +++++++++++++++++++ 4 files changed, 197 insertions(+), 7 deletions(-) create mode 100644 desktop_env/providers/aws/config.py create mode 100644 desktop_env/providers/aws/scheduler_utils.py diff --git a/desktop_env/providers/aws/config.py b/desktop_env/providers/aws/config.py new file mode 100644 index 0000000..5c43838 --- /dev/null +++ b/desktop_env/providers/aws/config.py @@ -0,0 +1,21 @@ +import os + + +# Default TTL minutes for instance auto-termination (cloud-side scheduler) +# Can be overridden via environment variable DEFAULT_TTL_MINUTES +DEFAULT_TTL_MINUTES: int = int(os.getenv("DEFAULT_TTL_MINUTES", "60")) + +# Master switch for TTL feature +ENABLE_TTL: bool = os.getenv("ENABLE_TTL", "true").lower() == "true" + +# EventBridge Scheduler role ARN for scheduling EC2 termination +AWS_SCHEDULER_ROLE_ARN: str = os.getenv("AWS_SCHEDULER_ROLE_ARN", "").strip() + + +def compute_ttl_seconds(ttl_minutes: int) -> int: + try: + return max(0, int(ttl_minutes) * 60) + except Exception: + return 0 + + diff --git a/desktop_env/providers/aws/manager.py b/desktop_env/providers/aws/manager.py index e4522ce..09d5172 100644 --- a/desktop_env/providers/aws/manager.py +++ b/desktop_env/providers/aws/manager.py @@ -1,9 +1,13 @@ import os -from filelock import FileLock import boto3 import logging import dotenv import signal +from datetime import datetime, timedelta, timezone + +# TTL configuration +from desktop_env.providers.aws.config import ENABLE_TTL, DEFAULT_TTL_MINUTES, AWS_SCHEDULER_ROLE_ARN +from desktop_env.providers.aws.scheduler_utils import schedule_instance_termination INSTANCE_TYPE = "t3.medium" @@ -92,12 +96,20 @@ def _allocate_vm(region=DEFAULT_REGION, screen_size=(1920, 1080)): if not os.getenv('AWS_SUBNET_ID'): raise ValueError("AWS_SUBNET_ID is not set in the environment variables.") + # TTL configuration (cloud-init removed; use cloud-side scheduler only) + ttl_enabled = ENABLE_TTL + ttl_minutes = DEFAULT_TTL_MINUTES + ttl_seconds = max(0, int(ttl_minutes) * 60) + eta_utc = datetime.now(timezone.utc) + timedelta(seconds=ttl_seconds) + logger.info(f"TTL config: minutes={ttl_minutes}, seconds={ttl_seconds}, ETA(UTC)={eta_utc.isoformat()}") + run_instances_params = { "MaxCount": 1, "MinCount": 1, "ImageId": ami_id, "InstanceType": INSTANCE_TYPE, "EbsOptimized": True, + "InstanceInitiatedShutdownBehavior": "terminate", "NetworkInterfaces": [ { "SubnetId": os.getenv('AWS_SUBNET_ID'), @@ -124,12 +136,20 @@ def _allocate_vm(region=DEFAULT_REGION, screen_size=(1920, 1080)): response = ec2_client.run_instances(**run_instances_params) instance_id = response['Instances'][0]['InstanceId'] - + + # Create TTL schedule immediately after instance is created, to survive early interruptions + try: + # Always attempt; helper resolves ARN via env or role name + if ttl_enabled: + schedule_instance_termination(region, instance_id, ttl_seconds, AWS_SCHEDULER_ROLE_ARN, logger) + except Exception as e: + logger.warning(f"Failed to create EventBridge Scheduler for {instance_id}: {e}") + waiter = ec2_client.get_waiter('instance_running') logger.info(f"Waiting for instance {instance_id} to be running...") waiter.wait(InstanceIds=[instance_id]) logger.info(f"Instance {instance_id} is ready.") - + try: instance_details = ec2_client.describe_instances(InstanceIds=[instance_id]) instance = instance_details['Reservations'][0]['Instances'][0] diff --git a/desktop_env/providers/aws/provider.py b/desktop_env/providers/aws/provider.py index 44de796..124e142 100644 --- a/desktop_env/providers/aws/provider.py +++ b/desktop_env/providers/aws/provider.py @@ -2,11 +2,14 @@ import boto3 from botocore.exceptions import ClientError import logging - -from desktop_env.providers.base import Provider -from datetime import datetime +import os import time +from datetime import datetime, timedelta, timezone +from desktop_env.providers.base import Provider +# TTL configuration +from desktop_env.providers.aws.config import ENABLE_TTL, DEFAULT_TTL_MINUTES, AWS_SCHEDULER_ROLE_ARN +from desktop_env.providers.aws.scheduler_utils import schedule_instance_termination logger = logging.getLogger("desktopenv.providers.aws.AWSProvider") logger.setLevel(logging.INFO) @@ -116,12 +119,18 @@ class AWSProvider(Provider): # Step 3: Launch a new instance from the snapshot(AMI) with performance optimization logger.info(f"Launching a new instance from AMI {snapshot_name}...") + # TTL configuration follows the same env flags as allocation (centralized) + enable_ttl = ENABLE_TTL + default_ttl_minutes = DEFAULT_TTL_MINUTES + ttl_seconds = max(0, default_ttl_minutes * 60) + run_instances_params = { "MaxCount": 1, "MinCount": 1, "ImageId": snapshot_name, "InstanceType": instance_type, "EbsOptimized": True, + "InstanceInitiatedShutdownBehavior": "terminate", "NetworkInterfaces": [ { "SubnetId": subnet_id, @@ -151,7 +160,40 @@ class AWSProvider(Provider): ec2_client.get_waiter('instance_running').wait(InstanceIds=[new_instance_id]) logger.info(f"Instance {new_instance_id} is ready.") - + # Schedule cloud-side termination via EventBridge Scheduler (auto-resolve role ARN) + try: + if enable_ttl: + schedule_instance_termination(self.region, new_instance_id, ttl_seconds, AWS_SCHEDULER_ROLE_ARN, logger) + except Exception as e: + logger.warning(f"Failed to create EventBridge Scheduler for {new_instance_id}: {e}") + + # Schedule cloud-side termination via EventBridge Scheduler (same as allocation path) + try: + if enable_ttl and os.getenv('AWS_SCHEDULER_ROLE_ARN'): + scheduler_client = boto3.client('scheduler', region_name=self.region) + schedule_name = f"osworld-ttl-{new_instance_id}-{int(time.time())}" + eta_scheduler = datetime.now(timezone.utc) + timedelta(seconds=ttl_seconds) + schedule_expression = f"at({eta_scheduler.strftime('%Y-%m-%dT%H:%M:%S')})" + target_arn = "arn:aws:scheduler:::aws-sdk:ec2:terminateInstances" + input_payload = '{"InstanceIds":["' + new_instance_id + '"]}' + scheduler_client.create_schedule( + Name=schedule_name, + ScheduleExpression=schedule_expression, + FlexibleTimeWindow={"Mode": "OFF"}, + Target={ + "Arn": target_arn, + "RoleArn": os.getenv('AWS_SCHEDULER_ROLE_ARN'), + "Input": input_payload + }, + State='ENABLED', + Description=f"OSWorld TTL terminate for {new_instance_id}" + ) + logger.info(f"Scheduled EC2 termination via EventBridge Scheduler for snapshot revert: name={schedule_name}, when={eta_scheduler.isoformat()} (UTC)") + else: + logger.info("TTL enabled but AWS_SCHEDULER_ROLE_ARN not set; skipping scheduler for snapshot revert.") + except Exception as e: + logger.warning(f"Failed to create EventBridge Scheduler for {new_instance_id}: {e}") + try: instance_details = ec2_client.describe_instances(InstanceIds=[new_instance_id]) instance = instance_details['Reservations'][0]['Instances'][0] diff --git a/desktop_env/providers/aws/scheduler_utils.py b/desktop_env/providers/aws/scheduler_utils.py new file mode 100644 index 0000000..0471157 --- /dev/null +++ b/desktop_env/providers/aws/scheduler_utils.py @@ -0,0 +1,107 @@ +import os +import time +import json +from datetime import datetime, timedelta, timezone +import boto3 +from botocore.exceptions import ClientError + + +def _resolve_scheduler_role_arn(logger) -> str: + # 1) Explicit env takes precedence + role_arn = os.getenv('AWS_SCHEDULER_ROLE_ARN', '').strip() + if role_arn: + return role_arn + + # 2) Derive from role name + account id + role_name = os.getenv('AWS_SCHEDULER_ROLE_NAME', 'osworld-scheduler-ec2-terminate').strip() + try: + sts = boto3.client('sts') + account_id = sts.get_caller_identity()['Account'] + derived_arn = f"arn:aws:iam::{account_id}:role/{role_name}" + iam = boto3.client('iam') + try: + iam.get_role(RoleName=role_name) + logger.info(f"Derived AWS_SCHEDULER_ROLE_ARN={derived_arn} from role name '{role_name}'") + return derived_arn + except ClientError as e: + auto_create = os.getenv('AWS_AUTO_CREATE_SCHEDULER_ROLE', 'true').lower() == 'true' + if not auto_create: + logger.warning(f"Scheduler role '{role_name}' not found and auto-create disabled: {e}") + return '' + # Attempt to create role + try: + trust_policy = { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": {"Service": "scheduler.amazonaws.com"}, + "Action": "sts:AssumeRole" + } + ] + } + iam.create_role( + RoleName=role_name, + AssumeRolePolicyDocument=json.dumps(trust_policy) + ) + # Attach minimal inline policy + inline_policy = { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": ["ec2:TerminateInstances", "ec2:DescribeInstances"], + "Resource": "*" + } + ] + } + iam.put_role_policy( + RoleName=role_name, + PolicyName=f"{role_name}-inline", + PolicyDocument=json.dumps(inline_policy) + ) + # Small wait for IAM propagation + time.sleep(3) + logger.info(f"Auto-created scheduler role '{role_name}'. Using {derived_arn}") + return derived_arn + except ClientError as ce: + logger.warning(f"Failed to auto-create scheduler role '{role_name}': {ce}") + return '' + except Exception as e: + logger.warning(f"Failed to resolve Scheduler Role ARN: {e}") + return '' + + +def schedule_instance_termination(region: str, instance_id: str, ttl_seconds: int, role_arn: str, logger) -> None: + if not role_arn: + role_arn = _resolve_scheduler_role_arn(logger) + if not role_arn: + logger.info("Scheduler role ARN not available; skipping TTL schedule creation.") + return + scheduler_client = boto3.client('scheduler', region_name=region) + schedule_name = f"osworld-ttl-{instance_id}-{int(time.time())}" + eta_scheduler = datetime.now(timezone.utc) + timedelta(seconds=ttl_seconds) + # EventBridge Scheduler expects RFC3339 without trailing Z for 'at()' when region-local is fine + schedule_expression = f"at({eta_scheduler.strftime('%Y-%m-%dT%H:%M:%S')})" + target_arn = "arn:aws:scheduler:::aws-sdk:ec2:terminateInstances" + input_payload = '{"InstanceIds":["' + instance_id + '"]}' + + scheduler_client.create_schedule( + Name=schedule_name, + ScheduleExpression=schedule_expression, + FlexibleTimeWindow={"Mode": "OFF"}, + ActionAfterCompletion='DELETE', + Target={ + "Arn": target_arn, + "RoleArn": role_arn, + "Input": input_payload + }, + State='ENABLED', + Description=f"OSWorld TTL terminate for {instance_id}" + ) + + logger.info( + f"Scheduled EC2 termination via EventBridge Scheduler: name={schedule_name}, when={eta_scheduler.isoformat()} (UTC)" + ) + +