Add TTL configuration for AWS instance management
- Introduced a new config module to manage TTL settings for EC2 instances, allowing for auto-termination based on environment variables. - Updated the AWSProvider and manager to utilize the new TTL settings, including scheduling instance termination via EventBridge Scheduler. - Added utility functions for resolving the scheduler role ARN and creating termination schedules, ensuring robust error handling and logging. - Maintained existing code logic while integrating new features for improved instance lifecycle management.
This commit is contained in:
21
desktop_env/providers/aws/config.py
Normal file
21
desktop_env/providers/aws/config.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import os
|
||||
|
||||
|
||||
# Default TTL minutes for instance auto-termination (cloud-side scheduler)
|
||||
# Can be overridden via environment variable DEFAULT_TTL_MINUTES
|
||||
DEFAULT_TTL_MINUTES: int = int(os.getenv("DEFAULT_TTL_MINUTES", "60"))
|
||||
|
||||
# Master switch for TTL feature
|
||||
ENABLE_TTL: bool = os.getenv("ENABLE_TTL", "true").lower() == "true"
|
||||
|
||||
# EventBridge Scheduler role ARN for scheduling EC2 termination
|
||||
AWS_SCHEDULER_ROLE_ARN: str = os.getenv("AWS_SCHEDULER_ROLE_ARN", "").strip()
|
||||
|
||||
|
||||
def compute_ttl_seconds(ttl_minutes: int) -> int:
|
||||
try:
|
||||
return max(0, int(ttl_minutes) * 60)
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
import os
|
||||
from filelock import FileLock
|
||||
import boto3
|
||||
import logging
|
||||
import dotenv
|
||||
import signal
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
# TTL configuration
|
||||
from desktop_env.providers.aws.config import ENABLE_TTL, DEFAULT_TTL_MINUTES, AWS_SCHEDULER_ROLE_ARN
|
||||
from desktop_env.providers.aws.scheduler_utils import schedule_instance_termination
|
||||
|
||||
|
||||
INSTANCE_TYPE = "t3.medium"
|
||||
@@ -92,12 +96,20 @@ def _allocate_vm(region=DEFAULT_REGION, screen_size=(1920, 1080)):
|
||||
if not os.getenv('AWS_SUBNET_ID'):
|
||||
raise ValueError("AWS_SUBNET_ID is not set in the environment variables.")
|
||||
|
||||
# TTL configuration (cloud-init removed; use cloud-side scheduler only)
|
||||
ttl_enabled = ENABLE_TTL
|
||||
ttl_minutes = DEFAULT_TTL_MINUTES
|
||||
ttl_seconds = max(0, int(ttl_minutes) * 60)
|
||||
eta_utc = datetime.now(timezone.utc) + timedelta(seconds=ttl_seconds)
|
||||
logger.info(f"TTL config: minutes={ttl_minutes}, seconds={ttl_seconds}, ETA(UTC)={eta_utc.isoformat()}")
|
||||
|
||||
run_instances_params = {
|
||||
"MaxCount": 1,
|
||||
"MinCount": 1,
|
||||
"ImageId": ami_id,
|
||||
"InstanceType": INSTANCE_TYPE,
|
||||
"EbsOptimized": True,
|
||||
"InstanceInitiatedShutdownBehavior": "terminate",
|
||||
"NetworkInterfaces": [
|
||||
{
|
||||
"SubnetId": os.getenv('AWS_SUBNET_ID'),
|
||||
@@ -124,12 +136,20 @@ def _allocate_vm(region=DEFAULT_REGION, screen_size=(1920, 1080)):
|
||||
|
||||
response = ec2_client.run_instances(**run_instances_params)
|
||||
instance_id = response['Instances'][0]['InstanceId']
|
||||
|
||||
|
||||
# Create TTL schedule immediately after instance is created, to survive early interruptions
|
||||
try:
|
||||
# Always attempt; helper resolves ARN via env or role name
|
||||
if ttl_enabled:
|
||||
schedule_instance_termination(region, instance_id, ttl_seconds, AWS_SCHEDULER_ROLE_ARN, logger)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to create EventBridge Scheduler for {instance_id}: {e}")
|
||||
|
||||
waiter = ec2_client.get_waiter('instance_running')
|
||||
logger.info(f"Waiting for instance {instance_id} to be running...")
|
||||
waiter.wait(InstanceIds=[instance_id])
|
||||
logger.info(f"Instance {instance_id} is ready.")
|
||||
|
||||
|
||||
try:
|
||||
instance_details = ec2_client.describe_instances(InstanceIds=[instance_id])
|
||||
instance = instance_details['Reservations'][0]['Instances'][0]
|
||||
|
||||
@@ -2,11 +2,14 @@ import boto3
|
||||
from botocore.exceptions import ClientError
|
||||
|
||||
import logging
|
||||
|
||||
from desktop_env.providers.base import Provider
|
||||
from datetime import datetime
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from desktop_env.providers.base import Provider
|
||||
|
||||
# TTL configuration
|
||||
from desktop_env.providers.aws.config import ENABLE_TTL, DEFAULT_TTL_MINUTES, AWS_SCHEDULER_ROLE_ARN
|
||||
from desktop_env.providers.aws.scheduler_utils import schedule_instance_termination
|
||||
|
||||
logger = logging.getLogger("desktopenv.providers.aws.AWSProvider")
|
||||
logger.setLevel(logging.INFO)
|
||||
@@ -116,12 +119,18 @@ class AWSProvider(Provider):
|
||||
# Step 3: Launch a new instance from the snapshot(AMI) with performance optimization
|
||||
logger.info(f"Launching a new instance from AMI {snapshot_name}...")
|
||||
|
||||
# TTL configuration follows the same env flags as allocation (centralized)
|
||||
enable_ttl = ENABLE_TTL
|
||||
default_ttl_minutes = DEFAULT_TTL_MINUTES
|
||||
ttl_seconds = max(0, default_ttl_minutes * 60)
|
||||
|
||||
run_instances_params = {
|
||||
"MaxCount": 1,
|
||||
"MinCount": 1,
|
||||
"ImageId": snapshot_name,
|
||||
"InstanceType": instance_type,
|
||||
"EbsOptimized": True,
|
||||
"InstanceInitiatedShutdownBehavior": "terminate",
|
||||
"NetworkInterfaces": [
|
||||
{
|
||||
"SubnetId": subnet_id,
|
||||
@@ -151,7 +160,40 @@ class AWSProvider(Provider):
|
||||
ec2_client.get_waiter('instance_running').wait(InstanceIds=[new_instance_id])
|
||||
|
||||
logger.info(f"Instance {new_instance_id} is ready.")
|
||||
|
||||
# Schedule cloud-side termination via EventBridge Scheduler (auto-resolve role ARN)
|
||||
try:
|
||||
if enable_ttl:
|
||||
schedule_instance_termination(self.region, new_instance_id, ttl_seconds, AWS_SCHEDULER_ROLE_ARN, logger)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to create EventBridge Scheduler for {new_instance_id}: {e}")
|
||||
|
||||
# Schedule cloud-side termination via EventBridge Scheduler (same as allocation path)
|
||||
try:
|
||||
if enable_ttl and os.getenv('AWS_SCHEDULER_ROLE_ARN'):
|
||||
scheduler_client = boto3.client('scheduler', region_name=self.region)
|
||||
schedule_name = f"osworld-ttl-{new_instance_id}-{int(time.time())}"
|
||||
eta_scheduler = datetime.now(timezone.utc) + timedelta(seconds=ttl_seconds)
|
||||
schedule_expression = f"at({eta_scheduler.strftime('%Y-%m-%dT%H:%M:%S')})"
|
||||
target_arn = "arn:aws:scheduler:::aws-sdk:ec2:terminateInstances"
|
||||
input_payload = '{"InstanceIds":["' + new_instance_id + '"]}'
|
||||
scheduler_client.create_schedule(
|
||||
Name=schedule_name,
|
||||
ScheduleExpression=schedule_expression,
|
||||
FlexibleTimeWindow={"Mode": "OFF"},
|
||||
Target={
|
||||
"Arn": target_arn,
|
||||
"RoleArn": os.getenv('AWS_SCHEDULER_ROLE_ARN'),
|
||||
"Input": input_payload
|
||||
},
|
||||
State='ENABLED',
|
||||
Description=f"OSWorld TTL terminate for {new_instance_id}"
|
||||
)
|
||||
logger.info(f"Scheduled EC2 termination via EventBridge Scheduler for snapshot revert: name={schedule_name}, when={eta_scheduler.isoformat()} (UTC)")
|
||||
else:
|
||||
logger.info("TTL enabled but AWS_SCHEDULER_ROLE_ARN not set; skipping scheduler for snapshot revert.")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to create EventBridge Scheduler for {new_instance_id}: {e}")
|
||||
|
||||
try:
|
||||
instance_details = ec2_client.describe_instances(InstanceIds=[new_instance_id])
|
||||
instance = instance_details['Reservations'][0]['Instances'][0]
|
||||
|
||||
107
desktop_env/providers/aws/scheduler_utils.py
Normal file
107
desktop_env/providers/aws/scheduler_utils.py
Normal file
@@ -0,0 +1,107 @@
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
from datetime import datetime, timedelta, timezone
|
||||
import boto3
|
||||
from botocore.exceptions import ClientError
|
||||
|
||||
|
||||
def _resolve_scheduler_role_arn(logger) -> str:
|
||||
# 1) Explicit env takes precedence
|
||||
role_arn = os.getenv('AWS_SCHEDULER_ROLE_ARN', '').strip()
|
||||
if role_arn:
|
||||
return role_arn
|
||||
|
||||
# 2) Derive from role name + account id
|
||||
role_name = os.getenv('AWS_SCHEDULER_ROLE_NAME', 'osworld-scheduler-ec2-terminate').strip()
|
||||
try:
|
||||
sts = boto3.client('sts')
|
||||
account_id = sts.get_caller_identity()['Account']
|
||||
derived_arn = f"arn:aws:iam::{account_id}:role/{role_name}"
|
||||
iam = boto3.client('iam')
|
||||
try:
|
||||
iam.get_role(RoleName=role_name)
|
||||
logger.info(f"Derived AWS_SCHEDULER_ROLE_ARN={derived_arn} from role name '{role_name}'")
|
||||
return derived_arn
|
||||
except ClientError as e:
|
||||
auto_create = os.getenv('AWS_AUTO_CREATE_SCHEDULER_ROLE', 'true').lower() == 'true'
|
||||
if not auto_create:
|
||||
logger.warning(f"Scheduler role '{role_name}' not found and auto-create disabled: {e}")
|
||||
return ''
|
||||
# Attempt to create role
|
||||
try:
|
||||
trust_policy = {
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Principal": {"Service": "scheduler.amazonaws.com"},
|
||||
"Action": "sts:AssumeRole"
|
||||
}
|
||||
]
|
||||
}
|
||||
iam.create_role(
|
||||
RoleName=role_name,
|
||||
AssumeRolePolicyDocument=json.dumps(trust_policy)
|
||||
)
|
||||
# Attach minimal inline policy
|
||||
inline_policy = {
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": ["ec2:TerminateInstances", "ec2:DescribeInstances"],
|
||||
"Resource": "*"
|
||||
}
|
||||
]
|
||||
}
|
||||
iam.put_role_policy(
|
||||
RoleName=role_name,
|
||||
PolicyName=f"{role_name}-inline",
|
||||
PolicyDocument=json.dumps(inline_policy)
|
||||
)
|
||||
# Small wait for IAM propagation
|
||||
time.sleep(3)
|
||||
logger.info(f"Auto-created scheduler role '{role_name}'. Using {derived_arn}")
|
||||
return derived_arn
|
||||
except ClientError as ce:
|
||||
logger.warning(f"Failed to auto-create scheduler role '{role_name}': {ce}")
|
||||
return ''
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to resolve Scheduler Role ARN: {e}")
|
||||
return ''
|
||||
|
||||
|
||||
def schedule_instance_termination(region: str, instance_id: str, ttl_seconds: int, role_arn: str, logger) -> None:
|
||||
if not role_arn:
|
||||
role_arn = _resolve_scheduler_role_arn(logger)
|
||||
if not role_arn:
|
||||
logger.info("Scheduler role ARN not available; skipping TTL schedule creation.")
|
||||
return
|
||||
scheduler_client = boto3.client('scheduler', region_name=region)
|
||||
schedule_name = f"osworld-ttl-{instance_id}-{int(time.time())}"
|
||||
eta_scheduler = datetime.now(timezone.utc) + timedelta(seconds=ttl_seconds)
|
||||
# EventBridge Scheduler expects RFC3339 without trailing Z for 'at()' when region-local is fine
|
||||
schedule_expression = f"at({eta_scheduler.strftime('%Y-%m-%dT%H:%M:%S')})"
|
||||
target_arn = "arn:aws:scheduler:::aws-sdk:ec2:terminateInstances"
|
||||
input_payload = '{"InstanceIds":["' + instance_id + '"]}'
|
||||
|
||||
scheduler_client.create_schedule(
|
||||
Name=schedule_name,
|
||||
ScheduleExpression=schedule_expression,
|
||||
FlexibleTimeWindow={"Mode": "OFF"},
|
||||
ActionAfterCompletion='DELETE',
|
||||
Target={
|
||||
"Arn": target_arn,
|
||||
"RoleArn": role_arn,
|
||||
"Input": input_payload
|
||||
},
|
||||
State='ENABLED',
|
||||
Description=f"OSWorld TTL terminate for {instance_id}"
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Scheduled EC2 termination via EventBridge Scheduler: name={schedule_name}, when={eta_scheduler.isoformat()} (UTC)"
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user