feat&fix: enhance error handling during environment initialization and VM allocation

2025-06-03 13:38:47 +00:00
parent e363da2fd7
commit 8d54d4302f
3 changed files with 277 additions and 95 deletions
--- a/desktop_env/desktop_env.py
+++ b/desktop_env/desktop_env.py
@@ -71,29 +71,39 @@ class DesktopEnv(gym.Env):
        else:

            self.path_to_vm = self.manager.get_vm_path(os_type=self.os_type, region=region)
+        try:
+            self.snapshot_name = snapshot_name
+            self.cache_dir_base: str = cache_dir
+            # todo: add the logic to get the screen size from the VM
+            self.headless = headless
+            self.require_a11y_tree = require_a11y_tree
+            self.require_terminal = require_terminal

-        self.snapshot_name = snapshot_name
-        self.cache_dir_base: str = cache_dir
-        # todo: add the logic to get the screen size from the VM
-        self.headless = headless
-        self.require_a11y_tree = require_a11y_tree
-        self.require_terminal = require_terminal
+            # Initialize emulator and controller
+            if provider_name != "docker": # Check if this is applicable to other VM providers
+                logger.info("Initializing...")
+                self._start_emulator()

-        # Initialize emulator and controller
-        if provider_name != "docker": # Check if this is applicable to other VM providers
-            logger.info("Initializing...")
-            self._start_emulator()
+            # mode: human or machine
+            self.instruction = None
+            assert action_space in ["computer_13", "pyautogui"]
+            self.action_space = action_space  # todo: refactor it to the ActType

-        # mode: human or machine
-        self.instruction = None
-        assert action_space in ["computer_13", "pyautogui"]
-        self.action_space = action_space  # todo: refactor it to the ActType
-
-        # episodic stuffs, like counters, will be updated or reset
-        # when calling self.reset()
-        self._traj_no: int = -1
-        self._step_no: int = 0
-        self.action_history: List[Dict[str, any]] = []
+            # episodic stuffs, like counters, will be updated or reset
+            # when calling self.reset()
+            self._traj_no: int = -1
+            self._step_no: int = 0
+            self.action_history: List[Dict[str, any]] = []
+        except Exception as e:
+            logger.error(f"Failed to initialize DesktopEnv: {e}")
+            # If initialization fails, we should clean up the VM
+            try:
+                self.close()
+                self.manager.delete_vm(self.path_to_vm, self.region)
+                logger.info(f"Cleaned up VM {self.path_to_vm}.")
+            except Exception as cleanup_error:
+                logger.error(f"Failed to clean up VM {self.path_to_vm}: {cleanup_error}")
+            raise

    def _start_emulator(self):
        # Power on the virtual machine
--- a/desktop_env/providers/aws/manager.py
+++ b/desktop_env/providers/aws/manager.py
@@ -57,11 +57,22 @@ def _allocate_vm(region=DEFAULT_REGION):
    }

    ec2_client = boto3.client('ec2', region_name=region)
-    response = ec2_client.run_instances(**run_instances_params)
-    instance_id = response['Instances'][0]['InstanceId']
-    logger.info(f"Waiting for instance {instance_id} to be running...")
-    ec2_client.get_waiter('instance_running').wait(InstanceIds=[instance_id])
-    logger.info(f"Instance {instance_id} is ready.")
+    instance_id = None  # stays None when run_instances itself fails, so cleanup can tell
+    try:
+        response = ec2_client.run_instances(**run_instances_params)
+        instance_id = response['Instances'][0]['InstanceId']
+        logger.info(f"Waiting for instance {instance_id} to be running...")
+        ec2_client.get_waiter('instance_running').wait(InstanceIds=[instance_id])
+        logger.info(f"Instance {instance_id} is ready.")
+    except Exception as e:
+        logger.error(f"Failed to allocate VM in region {region}: {str(e)}")
+        try:  # best-effort cleanup; it must never replace the original error raised below
+            if instance_id is not None:  # only terminate an instance that was actually created
+                ec2_client.terminate_instances(InstanceIds=[instance_id])
+                logger.info(f"Terminated instance {instance_id} due to allocation failure.")
+        except Exception as cleanup_error:
+            logger.error(f"May fail to clean up instance {instance_id}: {str(cleanup_error)}")
+        raise

    return instance_id