From 347238e17e1bdbed6e05aadc6ecebbcacbcc950c Mon Sep 17 00:00:00 2001 From: Kaixin Li Date: Mon, 16 Jun 2025 02:40:40 +0800 Subject: [PATCH 1/7] Get VM IP again when getting screenshot fails (#215) In rare cases, the IP of the VM changes after it launches. We can get the IP every time we retry to ensure the correct connection. --- desktop_env/providers/vmware/manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/desktop_env/providers/vmware/manager.py b/desktop_env/providers/vmware/manager.py index 78e11d3..f3384fd 100644 --- a/desktop_env/providers/vmware/manager.py +++ b/desktop_env/providers/vmware/manager.py @@ -256,6 +256,8 @@ def _install_vm(vm_name, vms_dir, downloaded_file_name, os_type, original_vm_nam # Try downloading the screenshot until successful while not download_screenshot(vm_ip): + # Try to get the IP again in case it has changed + vm_ip = get_vm_ip(vm_path) logger.info("Check whether the virtual machine is ready...") logger.info("Virtual machine is ready. Start to make a snapshot on the virtual machine. It would take a while...") From 4e11eafd1da94bee3120b0ac319f2b0e2a1f492f Mon Sep 17 00:00:00 2001 From: Tianbao Xie <47296835+Timothyxxx@users.noreply.github.com> Date: Mon, 16 Jun 2025 21:37:19 +0800 Subject: [PATCH 2/7] Robust Evaluation, Blocking File Open, Grader Sensitivity, and LibreOffice Writer Fixes (#217) * Refactor evaluator structure in LibreOffice Writer example JSON to support multiple expected and result files, enhancing evaluation flexibility. * Update instance type to t3.large and add VNC access URL logging for allocated VMs, enhancing remote access capabilities. * Update instance type to t3.large and add VNC access URL logging for allocated VMs, enhancing remote access capabilities. * Update time format in get_vm_file function to include hours, minutes, and seconds for more precise file naming with time suffix. * More delay for 936321ce-5236-426a-9a20-e0e3c5dc536f; support one more potential solutions. * Enhance SetupController with configurable retry limit and improved error handling for file opening requests. Introduce new function to compare unique training records, and update logging for better debugging. Adjust JSON examples for evaluation to support multiple expected and result files. 
* Clean debug code --------- Co-authored-by: yuanmengqi --- desktop_env/controllers/setup.py | 21 +-- desktop_env/evaluators/getters/file.py | 14 +- desktop_env/evaluators/metrics/__init__.py | 3 +- desktop_env/evaluators/metrics/docs.py | 77 +++++++++- desktop_env/providers/aws/manager.py | 110 ++++++++++----- desktop_env/providers/aws/provider.py | 66 ++++++--- .../providers/aws/provider_with_proxy.py | 56 ++++++-- desktop_env/server/main.py | 132 +++++++++++++++--- .../4bcb1253-a636-4df4-8cb0-a35c04dfef31.json | 52 ++++++- .../6ada715d-3aae-4a32-a6a7-429b2e43fb93.json | 6 +- .../6f81754e-285d-4ce0-b59e-af7edb02d108.json | 14 +- .../88fe4b2d-3040-4c70-9a70-546a47764b48.json | 65 +++++++-- .../936321ce-5236-426a-9a20-e0e3c5dc536f.json | 42 ++++-- 13 files changed, 523 insertions(+), 135 deletions(-) diff --git a/desktop_env/controllers/setup.py b/desktop_env/controllers/setup.py index 4373600..6728370 100644 --- a/desktop_env/controllers/setup.py +++ b/desktop_env/controllers/setup.py @@ -36,6 +36,8 @@ FILE_PATH = os.path.dirname(os.path.abspath(__file__)) init_proxy_pool(PROXY_CONFIG_FILE) # initialize the global proxy pool +MAX_RETRIES = 20 + class SetupController: def __init__(self, vm_ip: str, server_port: int = 5000, chromium_port: int = 9222, vlc_port: int = 8080, cache_dir: str = "cache"): self.vm_ip: str = vm_ip @@ -64,16 +66,16 @@ class SetupController: # make sure connection can be established logger.info(f"try to connect {self.http_server}") retry = 0 - while retry < 50: + while retry < MAX_RETRIES: try: _ = requests.get(self.http_server + "/terminal") break except: time.sleep(5) retry += 1 - logger.info(f"retry: {retry}/50") + logger.info(f"retry: {retry}/{MAX_RETRIES}") - if retry == 50: + if retry == MAX_RETRIES: return False @@ -219,13 +221,14 @@ class SetupController: # send request to server to open file try: - response = requests.post(self.http_server + "/setup" + "/open_file", headers=headers, data=payload) - if response.status_code == 200: - logger.info("Command executed successfully: %s", response.text) - else: - logger.error("Failed to open file. Status code: %s", response.text) + # The server-side call is now blocking and can take time. + # We set a timeout that is slightly longer than the server's timeout (1800s). + response = requests.post(self.http_server + "/setup" + "/open_file", headers=headers, data=payload, timeout=1810) + response.raise_for_status() # This will raise an exception for 4xx and 5xx status codes + logger.info("Command executed successfully: %s", response.text) except requests.exceptions.RequestException as e: - logger.error("An error occurred while trying to send the request: %s", e) + logger.error(f"Failed to open file '{path}'. An error occurred while trying to send the request or the server responded with an error: {e}") + raise Exception(f"Failed to open file '{path}'. An error occurred while trying to send the request or the server responded with an error: {e}") from e def _launch_setup(self, command: Union[str, List[str]], shell: bool = False): if not command: diff --git a/desktop_env/evaluators/getters/file.py b/desktop_env/evaluators/getters/file.py index 9171e7d..f4ab03a 100644 --- a/desktop_env/evaluators/getters/file.py +++ b/desktop_env/evaluators/getters/file.py @@ -80,18 +80,16 @@ def get_vm_file(env, config: Dict[str, Any]) -> Union[Optional[str], List[Option returned. only support for single file now: time_suffix(bool): optional. defaults to False. if True, append the current time in required format. 
- time_format(str): optional. defaults to "%Y_%m_%d". format of the time suffix. + time_format(str): optional. defaults to "%Y%m%d_%H%M%S". format of the time suffix. """ - time_format = "%Y_%m_%d" + time_format = "%Y%m%d_%H%M%S" if not config.get("multi", False): paths: List[str] = [config["path"]] dests: List[str] = [config["dest"]] - if "time_suffix" in config.keys() and config["time_suffix"]: - if "time_format" in config.keys(): - time_format = config["time_format"] - # Insert time before . in file type suffix - paths = [p.split(".")[0] + datetime.now().strftime(time_format) + "." + p.split(".")[1] if "." in p else p for p in paths] - dests = [d.split(".")[0] + datetime.now().strftime(time_format) + "." + d.split(".")[1] if "." in d else d for d in dests] + if config.get("time_suffix", False): + time_format = config.get("time_format", time_format) + # Insert time before file extension. + dests = [f"{os.path.splitext(d)[0]}_{datetime.now().strftime(time_format)}{os.path.splitext(d)[1]}" for d in dests] else: paths: List[str] = config["path"] dests: List[str] = config["dest"] diff --git a/desktop_env/evaluators/metrics/__init__.py b/desktop_env/evaluators/metrics/__init__.py index 19a450d..79cd248 100644 --- a/desktop_env/evaluators/metrics/__init__.py +++ b/desktop_env/evaluators/metrics/__init__.py @@ -52,7 +52,8 @@ from .docs import ( compare_docx_files_and_ignore_new_lines, compare_docx_images, compare_image_text, - compare_references + compare_references, + compare_unique_train_records ) from .general import ( check_csv, diff --git a/desktop_env/evaluators/metrics/docs.py b/desktop_env/evaluators/metrics/docs.py index 81b3dc0..908a387 100644 --- a/desktop_env/evaluators/metrics/docs.py +++ b/desktop_env/evaluators/metrics/docs.py @@ -167,8 +167,12 @@ def compare_docx_files(file1, file2, **options): if ignore_case: p1, p2 = p1.lower(), p2.lower() if p1 != p2: - print(p1) - print(p2) + # show the difference + print("=== First Paragraph ===") + print(f"\033[92m{repr(p1)}\033[0m") # Green color for p1, repr() shows hidden chars + print("=== Second Paragraph ===") + print(f"\033[91m{repr(p2)}\033[0m") # Red color for p2, repr() shows hidden chars + print("=" * 50) # Clear boundary return 0 return 1 @@ -886,3 +890,72 @@ def compare_references(file1, file2, **options): return (result - reference_base_result) / (1 - reference_base_result) else: return 0 + + +def compare_unique_train_records(processed_file, expected_files, **kwargs): + """ + Compares the processed file with a list of expected files containing the + gold standard and the initial document. + expected_files[0] should be the gold standard file. + expected_files[1] should be the initial file. 
+ """ + # Debug logging to understand what we're actually receiving + logger.info(f"DEBUG: processed_file type: {type(processed_file)}, value: {processed_file}") + logger.info(f"DEBUG: expected_files type: {type(expected_files)}, value: {expected_files}") + logger.info(f"DEBUG: kwargs: {kwargs}") + + if not processed_file or not isinstance(expected_files, list) or len(expected_files) < 2: + logger.error("Invalid arguments: processed_file and a list of 2 expected_files are required.") + return 0 + + gold_file = expected_files[0] + initial_file = expected_files[1] + + if not gold_file or not initial_file: + logger.error("Gold file or initial file path is missing from expected_files list.") + return 0 + + # Helper function to get lines and IDs from a file + def get_lines_and_ids_from_file(file_path): + try: + doc = Document(file_path) + lines = [p.text.strip() for p in doc.paragraphs if p.text.strip()] + train_ids = [line.split(',')[1].strip() for line in lines if len(line.split(',')) == 4] + return lines, train_ids + except Exception as e: + logger.error(f"Error opening or parsing file {file_path}: {e}") + return None, None + + # Get data from all three files + processed_lines, processed_train_ids = get_lines_and_ids_from_file(processed_file) + if processed_lines is None: return 0 + + gold_lines, gold_train_ids = get_lines_and_ids_from_file(gold_file) + if gold_lines is None: return 0 + + initial_lines, _ = get_lines_and_ids_from_file(initial_file) + if initial_lines is None: return 0 + initial_lines_set = set(initial_lines) + + # 1. Subset Check: Ensure every processed line was in the initial file + if not set(processed_lines).issubset(initial_lines_set): + logger.error("Processed file contains lines not present in the initial file.") + logger.error(f"Extra lines: {set(processed_lines) - initial_lines_set}") + return 0 + + # 2. Uniqueness Check: Check for duplicates within the processed file + if len(processed_train_ids) != len(set(processed_train_ids)): + logger.error("Duplicate train_ids found in the processed file.") + return 0 + + # 3. Correctness Check: Compare the set of train_ids + if set(processed_train_ids) != set(gold_train_ids): + logger.error("Set of train_ids does not match between processed file and gold file.") + return 0 + + # 4. Line count check + if len(processed_lines) != len(gold_lines): + logger.error("Number of lines does not match between processed file and gold file.") + return 0 + + return 1 diff --git a/desktop_env/providers/aws/manager.py b/desktop_env/providers/aws/manager.py index 6e6dafb..1502083 100644 --- a/desktop_env/providers/aws/manager.py +++ b/desktop_env/providers/aws/manager.py @@ -5,6 +5,9 @@ import psutil import logging import dotenv import signal + +INSTANCE_TYPE = "t3.large" + # Load environment variables from .env file dotenv.load_dotenv() @@ -31,37 +34,17 @@ logger.setLevel(logging.INFO) DEFAULT_REGION = "us-east-1" # todo: Add doc for the configuration of image, security group and network interface # todo: public the AMI images -# ami-05e7d7bd279ea4f14 IMAGE_ID_MAP = { - "us-east-1": "ami-00674d875de9addc1", + "us-east-1": "ami-03a22c6e501415fb1", "ap-east-1": "ami-0c092a5b8be4116f5", } -INSTANCE_TYPE = "t3.medium" def _allocate_vm(region=DEFAULT_REGION): if region not in IMAGE_ID_MAP: raise ValueError(f"Region {region} is not supported. 
Supported regions are: {list(IMAGE_ID_MAP.keys())}")
 
-    run_instances_params = {
-        "MaxCount": 1,
-        "MinCount": 1,
-        "ImageId": IMAGE_ID_MAP[region],
-        "InstanceType": INSTANCE_TYPE,
-        "EbsOptimized": True,
-        "NetworkInterfaces": [
-            {
-                "SubnetId": os.getenv('AWS_SUBNET_ID'),
-                "AssociatePublicIpAddress": True,
-                "DeviceIndex": 0,
-                "Groups": [
-                    os.getenv('AWS_SECURITY_GROUP_ID')
-                ]
-            }
-        ]
-    }
-
     ec2_client = boto3.client('ec2', region_name=region)
     instance_id = None
     original_sigint_handler = signal.getsignal(signal.SIGINT)
@@ -94,26 +77,64 @@ def _allocate_vm(region=DEFAULT_REGION):
         signal.signal(signal.SIGINT, signal_handler)
         signal.signal(signal.SIGTERM, signal_handler)
 
+        if not os.getenv('AWS_SECURITY_GROUP_ID'):
+            raise ValueError("AWS_SECURITY_GROUP_ID is not set in the environment variables.")
+        if not os.getenv('AWS_SUBNET_ID'):
+            raise ValueError("AWS_SUBNET_ID is not set in the environment variables.")
+
+        run_instances_params = {
+            "MaxCount": 1,
+            "MinCount": 1,
+            "ImageId": IMAGE_ID_MAP[region],
+            "InstanceType": INSTANCE_TYPE,
+            "EbsOptimized": True,
+            "NetworkInterfaces": [
+                {
+                    "SubnetId": os.getenv('AWS_SUBNET_ID'),
+                    "AssociatePublicIpAddress": True,
+                    "DeviceIndex": 0,
+                    "Groups": [
+                        os.getenv('AWS_SECURITY_GROUP_ID')
+                    ]
+                }
+            ]
+        }
+
         response = ec2_client.run_instances(**run_instances_params)
         instance_id = response['Instances'][0]['InstanceId']
+
+        waiter = ec2_client.get_waiter('instance_running')
         logger.info(f"Waiting for instance {instance_id} to be running...")
-        ec2_client.get_waiter('instance_running').wait(InstanceIds=[instance_id])
+        waiter.wait(InstanceIds=[instance_id])
         logger.info(f"Instance {instance_id} is ready.")
+
+        # Get and display the VNC access URL
+        try:
+            instance_details = ec2_client.describe_instances(InstanceIds=[instance_id])
+            instance = instance_details['Reservations'][0]['Instances'][0]
+            public_ip = instance.get('PublicIpAddress', '')
+            if public_ip:
+                vnc_url = f"http://{public_ip}:5910/vnc.html"
+                logger.info("="*80)
+                logger.info(f"🖥️ VNC Web Access URL: {vnc_url}")
+                logger.info(f"📡 Public IP: {public_ip}")
+                logger.info(f"🆔 Instance ID: {instance_id}")
+                logger.info("="*80)
+                print(f"\n🌐 VNC Web Access URL: {vnc_url}")
+                print(f"📍 Please open the above address in the browser for remote desktop access\n")
+        except Exception as e:
+            logger.warning(f"Failed to get VNC address for instance {instance_id}: {e}")
     except KeyboardInterrupt:
         logger.warning("VM allocation interrupted by user (SIGINT).")
-        raise
-    except SystemExit:
-        logger.warning("VM allocation terminated by parent process (SIGTERM).")
+        if instance_id:
+            logger.info(f"Terminating instance {instance_id} due to interruption.")
+            ec2_client.terminate_instances(InstanceIds=[instance_id])
         raise
     except Exception as e:
-        logger.error(f"Failed to allocate VM in region {region}: {str(e)}")
-        # try to clean up any resources that were created
-        try:
-            if instance_id:
-                ec2_client.terminate_instances(InstanceIds=[instance_id])
-                logger.info(f"Terminated instance {instance_id} due to allocation failure.")
-        except Exception as cleanup_error:
-            logger.error(f"May fail to clean up instance {instance_id}: {str(cleanup_error)}")
+        logger.error(f"Failed to allocate VM: {e}", exc_info=True)
+        if instance_id:
+            logger.info(f"Terminating instance {instance_id} due to an error.")
+            ec2_client.terminate_instances(InstanceIds=[instance_id])
         raise
     finally:
         # Restore original signal handlers
@@ -153,6 +174,27 @@ def _allocate_vm_with_proxy(region=DEFAULT_REGION, proxy_config_file=None):
         subnet_id=os.getenv('AWS_SUBNET_ID')
     )
 
+    try:
+        ec2_client = boto3.client('ec2',
region_name=region) + instance_details = ec2_client.describe_instances(InstanceIds=[instance_id]) + instance = instance_details['Reservations'][0]['Instances'][0] + public_ip = instance.get('PublicIpAddress', '') + if public_ip: + vnc_url = f"http://{public_ip}:5910/vnc.html" + logger.info("="*80) + logger.info(f"🖥️ VNC Web Access URL: {vnc_url}") + logger.info(f"📡 Public IP: {public_ip}") + logger.info(f"🆔 Instance ID: {instance_id}") + if current_proxy: + logger.info(f"🌐 Proxy: {current_proxy.host}:{current_proxy.port}") + logger.info("="*80) + print(f"\n🌐 VNC Web Access URL: {vnc_url}") + if current_proxy: + print(f"🔄 Current Proxy: {current_proxy.host}:{current_proxy.port}") + print(f"📍 Please open the above address in the browser for remote desktop access\n") + except Exception as e: + logger.warning(f"Failed to get VNC address for proxy instance {instance_id}: {e}") + return instance_id @@ -213,4 +255,4 @@ class AWSVMManager(VMManager): else: logger.info("Allocating a new VM in region: {}".format(region)) new_vm_path = _allocate_vm(region) - return new_vm_path + return new_vm_path \ No newline at end of file diff --git a/desktop_env/providers/aws/provider.py b/desktop_env/providers/aws/provider.py index 882710e..d2a87c8 100644 --- a/desktop_env/providers/aws/provider.py +++ b/desktop_env/providers/aws/provider.py @@ -63,10 +63,24 @@ class AWSProvider(Provider): for reservation in response['Reservations']: for instance in reservation['Instances']: private_ip_address = instance.get('PrivateIpAddress', '') + public_ip_address = instance.get('PublicIpAddress', '') + + if public_ip_address: + vnc_url = f"http://{public_ip_address}:5910/vnc.html" + logger.info("="*80) + logger.info(f"🖥️ VNC Web Access URL: {vnc_url}") + logger.info(f"📡 Public IP: {public_ip_address}") + logger.info(f"🏠 Private IP: {private_ip_address}") + logger.info("="*80) + print(f"\n🌐 VNC Web Access URL: {vnc_url}") + print(f"📍 Please open the above address in the browser for remote desktop access\n") + else: + logger.warning("No public IP address available for VNC access") + return private_ip_address return '' # Return an empty string if no IP address is found except ClientError as e: - logger.error(f"Failed to retrieve private IP address for the instance {path_to_vm}: {str(e)}") + logger.error(f"Failed to retrieve IP address for the instance {path_to_vm}: {str(e)}") raise def save_state(self, path_to_vm: str, snapshot_name: str): @@ -74,7 +88,7 @@ class AWSProvider(Provider): ec2_client = boto3.client('ec2', region_name=self.region) try: - image_response = ec2_client.create_image(InstanceId=path_to_vm, ImageId=snapshot_name) + image_response = ec2_client.create_image(InstanceId=path_to_vm, Name=snapshot_name) image_id = image_response['ImageId'] logger.info(f"AMI {image_id} created successfully from instance {path_to_vm}.") return image_id @@ -83,7 +97,7 @@ class AWSProvider(Provider): raise def revert_to_snapshot(self, path_to_vm: str, snapshot_name: str): - logger.info(f"Reverting AWS VM to snapshot: {snapshot_name}...") + logger.info(f"Reverting AWS VM to snapshot AMI: {snapshot_name}...") ec2_client = boto3.client('ec2', region_name=self.region) try: @@ -93,23 +107,21 @@ class AWSProvider(Provider): security_groups = [sg['GroupId'] for sg in instance['SecurityGroups']] subnet_id = instance['SubnetId'] instance_type = instance['InstanceType'] - instance_snapshot = instance_details['Reservations'][0]['Instances'][0]['ImageId'] - + # Step 2: Terminate the old instance 
ec2_client.terminate_instances(InstanceIds=[path_to_vm]) logger.info(f"Old instance {path_to_vm} has been terminated.") - # Step 3: Launch a new instance from the snapshot - logger.info(f"Launching a new instance from snapshot {instance_snapshot}...") - - - new_instance = ec2_client.run_instances( - MaxCount = 1, - MinCount = 1, - ImageId = instance_snapshot, - InstanceType = instance_type, - EbsOptimized = True, - NetworkInterfaces = [ + # Step 3: Launch a new instance from the snapshot(AMI) with performance optimization + logger.info(f"Launching a new instance from AMI {snapshot_name}...") + + run_instances_params = { + "MaxCount": 1, + "MinCount": 1, + "ImageId": snapshot_name, + "InstanceType": instance_type, + "EbsOptimized": True, + "NetworkInterfaces": [ { "SubnetId": subnet_id, "AssociatePublicIpAddress": True, @@ -117,13 +129,31 @@ class AWSProvider(Provider): "Groups": security_groups } ] - ) + } + + new_instance = ec2_client.run_instances(**run_instances_params) new_instance_id = new_instance['Instances'][0]['InstanceId'] - logger.info(f"New instance {new_instance_id} launched from snapshot {snapshot_name}.") + logger.info(f"New instance {new_instance_id} launched from AMI {snapshot_name}.") logger.info(f"Waiting for instance {new_instance_id} to be running...") ec2_client.get_waiter('instance_running').wait(InstanceIds=[new_instance_id]) logger.info(f"Instance {new_instance_id} is ready.") + + try: + instance_details = ec2_client.describe_instances(InstanceIds=[new_instance_id]) + instance = instance_details['Reservations'][0]['Instances'][0] + public_ip = instance.get('PublicIpAddress', '') + if public_ip: + vnc_url = f"http://{public_ip}:5910/vnc.html" + logger.info("="*80) + logger.info(f"🖥️ New Instance VNC Web Access URL: {vnc_url}") + logger.info(f"📡 Public IP: {public_ip}") + logger.info(f"🆔 New Instance ID: {new_instance_id}") + logger.info("="*80) + print(f"\n🌐 New Instance VNC Web Access URL: {vnc_url}") + print(f"📍 Please open the above address in the browser for remote desktop access\n") + except Exception as e: + logger.warning(f"Failed to get VNC address for new instance {new_instance_id}: {e}") return new_instance_id diff --git a/desktop_env/providers/aws/provider_with_proxy.py b/desktop_env/providers/aws/provider_with_proxy.py index d7cfa0e..2ffb7c0 100644 --- a/desktop_env/providers/aws/provider_with_proxy.py +++ b/desktop_env/providers/aws/provider_with_proxy.py @@ -163,16 +163,34 @@ echo "$(date): Configured proxy {self.current_proxy.host}:{self.current_proxy.po logger.info(f"Created new instance {instance_id} with proxy configuration") - # 等待实例运行 logger.info(f"Waiting for instance {instance_id} to be running...") ec2_client.get_waiter('instance_running').wait(InstanceIds=[instance_id]) logger.info(f"Instance {instance_id} is ready.") + + try: + instance_details = ec2_client.describe_instances(InstanceIds=[instance_id]) + instance = instance_details['Reservations'][0]['Instances'][0] + public_ip = instance.get('PublicIpAddress', '') + if public_ip: + vnc_url = f"http://{public_ip}:5910/vnc.html" + logger.info("="*80) + logger.info(f"🖥️ VNC Web Access URL: {vnc_url}") + logger.info(f"📡 Public IP: {public_ip}") + logger.info(f"🆔 Instance ID: {instance_id}") + if self.current_proxy: + logger.info(f"🌐 Proxy: {self.current_proxy.host}:{self.current_proxy.port}") + logger.info("="*80) + print(f"\n🌐 VNC Web Access URL: {vnc_url}") + if self.current_proxy: + print(f"🔄 Current Proxy: {self.current_proxy.host}:{self.current_proxy.port}") + print(f"📍 Please open the 
above address in the browser for remote desktop access\n") + except Exception as e: + logger.warning(f"Failed to get VNC address for instance {instance_id}: {e}") return instance_id except ClientError as e: logger.error(f"Failed to create instance with proxy: {str(e)}") - # 如果当前代理失败,尝试轮换代理 if self.current_proxy: proxy_pool = get_global_proxy_pool() proxy_pool.mark_proxy_failed(self.current_proxy) @@ -188,10 +206,28 @@ echo "$(date): Configured proxy {self.current_proxy.host}:{self.current_proxy.po for reservation in response['Reservations']: for instance in reservation['Instances']: private_ip_address = instance.get('PrivateIpAddress', '') + public_ip_address = instance.get('PublicIpAddress', '') + + if public_ip_address: + vnc_url = f"http://{public_ip_address}:5910/vnc.html" + logger.info("="*80) + logger.info(f"🖥️ VNC Web Access URL: {vnc_url}") + logger.info(f"📡 Public IP: {public_ip_address}") + logger.info(f"🏠 Private IP: {private_ip_address}") + if self.current_proxy: + logger.info(f"🌐 Proxy: {self.current_proxy.host}:{self.current_proxy.port}") + logger.info("="*80) + print(f"\n🌐 VNC Web Access URL: {vnc_url}") + if self.current_proxy: + print(f"🔄 Current Proxy: {self.current_proxy.host}:{self.current_proxy.port}") + print(f"📍 Please open the above address in the browser for remote desktop access\n") + else: + logger.warning("No public IP address available for VNC access") + return private_ip_address return '' except ClientError as e: - logger.error(f"Failed to retrieve private IP address for the instance {path_to_vm}: {str(e)}") + logger.error(f"Failed to retrieve IP address for the instance {path_to_vm}: {str(e)}") raise def save_state(self, path_to_vm: str, snapshot_name: str): @@ -212,24 +248,28 @@ echo "$(date): Configured proxy {self.current_proxy.host}:{self.current_proxy.po ec2_client = boto3.client('ec2', region_name=self.region) try: - # 获取原实例详情 + # Get original instance details for config. instance_details = ec2_client.describe_instances(InstanceIds=[path_to_vm]) instance = instance_details['Reservations'][0]['Instances'][0] security_groups = [sg['GroupId'] for sg in instance['SecurityGroups']] subnet_id = instance['SubnetId'] instance_type = instance['InstanceType'] - # 终止旧实例 + # Terminate the old instance. This is a non-blocking call. 
+ logger.info(f"Initiating termination for old instance {path_to_vm}...") ec2_client.terminate_instances(InstanceIds=[path_to_vm]) - logger.info(f"Old instance {path_to_vm} has been terminated.") + logger.info(f"Old instance {path_to_vm} termination initiated.") - # 轮换到新的代理 + # Rotate to a new proxy self._rotate_proxy() - # 创建新实例 + # Create a new instance new_instance_id = self.create_instance_with_proxy( snapshot_name, instance_type, security_groups, subnet_id ) + + # Note: VNC address is displayed within create_instance_with_proxy + logger.info(f"Successfully launched new instance {new_instance_id} for revert.") return new_instance_id diff --git a/desktop_env/server/main.py b/desktop_env/server/main.py index 77ed4d4..de8009d 100644 --- a/desktop_env/server/main.py +++ b/desktop_env/server/main.py @@ -4,6 +4,7 @@ import platform import shlex import json import subprocess, signal +import time from pathlib import Path from typing import Any, Optional, Sequence from typing import List, Dict, Tuple, Literal @@ -65,6 +66,8 @@ app = Flask(__name__) pyautogui.PAUSE = 0 pyautogui.DARWIN_CATCH_UP_TIME = 0 +TIMEOUT = 1800 # seconds + logger = app.logger recording_process = None # fixme: this is a temporary solution for recording, need to be changed to support multiple-process recording_path = "/tmp/recording.mp4" @@ -202,8 +205,8 @@ def capture_screen_with_cursor(): pos = (round(pos_win[0]*ratio - hotspotx), round(pos_win[1]*ratio - hotspoty)) img.paste(cursor, pos, cursor) - except: - pass + except Exception as e: + logger.warning(f"Failed to capture cursor on Windows, screenshot will not have a cursor. Error: {e}") img.save(file_path) elif user_platform == "Linux": @@ -1124,18 +1127,72 @@ def open_file(): if not path: return "Path not supplied!", 400 - path = Path(os.path.expandvars(os.path.expanduser(path))) + path_obj = Path(os.path.expandvars(os.path.expanduser(path))) - if not path.exists(): - return f"File not found: {path}", 404 + if not path_obj.exists(): + return f"File not found: {path_obj}", 404 try: if platform.system() == "Windows": - os.startfile(path) + os.startfile(path_obj) else: open_cmd: str = "open" if platform.system() == "Darwin" else "xdg-open" - subprocess.Popen([open_cmd, str(path)]) - return "File opened successfully" + subprocess.Popen([open_cmd, str(path_obj)]) + + # Wait for the file to open + file_name = path_obj.name + # Some apps don't include the extension in the title + file_name_without_ext, _ = os.path.splitext(file_name) + + start_time = time.time() + window_found = False + + while time.time() - start_time < TIMEOUT: + os_name = platform.system() + if os_name in ['Windows', 'Darwin']: + import pygetwindow as gw + # Check for window title containing file name or file name without extension + windows = gw.getWindowsWithTitle(file_name) + if not windows: + windows = gw.getWindowsWithTitle(file_name_without_ext) + + if windows: + # To be more specific, we can try to activate it + windows[0].activate() + window_found = True + break + elif os_name == 'Linux': + try: + # Using wmctrl to list windows and check if any window title contains the filename + result = subprocess.run(['wmctrl', '-l'], capture_output=True, text=True, check=True) + window_list = result.stdout.strip().split('\n') + if not result.stdout.strip(): + pass # No windows, just continue waiting + else: + for window in window_list: + if file_name in window or file_name_without_ext in window: + # a window is found, now activate it + window_id = window.split()[0] + subprocess.run(['wmctrl', '-i', '-a', 
window_id], check=True)
+                                window_found = True
+                                break
+                        if window_found:
+                            break
+                except (subprocess.CalledProcessError, FileNotFoundError):
+                    # wmctrl might not be installed or the window manager isn't ready.
+                    # We just log it once and let the main loop retry.
+                    if 'wmctrl_failed_once' not in locals():
+                        logger.warning("wmctrl command is not ready, will keep retrying...")
+                        wmctrl_failed_once = True
+                    pass # Let the outer loop retry
+
+            time.sleep(1)
+
+        if window_found:
+            return "File opened and window activated successfully"
+        else:
+            return f"Failed to find window for {file_name} within {TIMEOUT} seconds.", 500
+
     except Exception as e:
         return f"Failed to open {path}. Error: {e}", 500
 
@@ -1258,37 +1315,78 @@ def close_window():
 @app.route('/start_recording', methods=['POST'])
 def start_recording():
     global recording_process
-    if recording_process:
+    if recording_process and recording_process.poll() is None:
         return jsonify({'status': 'error', 'message': 'Recording is already in progress.'}), 400
 
+    # Clean up previous recording if it exists
+    if os.path.exists(recording_path):
+        try:
+            os.remove(recording_path)
+        except OSError as e:
+            logger.error(f"Error removing old recording file: {e}")
+            return jsonify({'status': 'error', 'message': f'Failed to remove old recording file: {e}'}), 500
+
     d = display.Display()
     screen_width = d.screen().width_in_pixels
     screen_height = d.screen().height_in_pixels
 
     start_command = f"ffmpeg -y -f x11grab -draw_mouse 1 -s {screen_width}x{screen_height} -i :0.0 -c:v libx264 -r 30 {recording_path}"
-    recording_process = subprocess.Popen(shlex.split(start_command), stdout=subprocess.DEVNULL,
-                                         stderr=subprocess.DEVNULL)
+    # Use stderr=PIPE to capture potential errors from ffmpeg
+    recording_process = subprocess.Popen(shlex.split(start_command),
+                                         stdout=subprocess.DEVNULL,
+                                         stderr=subprocess.PIPE,
+                                         text=True # To get stderr as string
+                                         )
 
-    return jsonify({'status': 'success', 'message': 'Started recording.'})
+    # Wait a couple of seconds to see if ffmpeg starts successfully
+    try:
+        # Wait for 2 seconds. If ffmpeg exits within this time, it's an error.
+        recording_process.wait(timeout=2)
+        # If wait() returns, it means the process has terminated.
+        error_output = recording_process.stderr.read()
+        return jsonify({
+            'status': 'error',
+            'message': f'Failed to start recording. ffmpeg terminated unexpectedly. Error: {error_output}'
+        }), 500
+    except subprocess.TimeoutExpired:
+        # This is the expected outcome: the process is still running after 2 seconds.
+        return jsonify({'status': 'success', 'message': 'Started recording successfully.'})
 
 
 @app.route('/end_recording', methods=['POST'])
 def end_recording():
     global recording_process
-    if not recording_process:
+    if not recording_process or recording_process.poll() is not None:
+        recording_process = None # Clean up stale process object
        return jsonify({'status': 'error', 'message': 'No recording in progress to stop.'}), 400
 
+    error_output = ""
+    try:
+        # Send SIGINT for a graceful shutdown, allowing ffmpeg to finalize the file.
         recording_process.send_signal(signal.SIGINT)
-    recording_process.wait()
+        # Wait for ffmpeg to terminate. communicate() gets output and waits.
+        _, error_output = recording_process.communicate(timeout=15)
+    except subprocess.TimeoutExpired:
+        logger.error("ffmpeg did not respond to SIGINT, killing the process.")
+        recording_process.kill()
+        # After killing, communicate to get any remaining output.
+ _, error_output = recording_process.communicate() recording_process = None + return jsonify({ + 'status': 'error', + 'message': f'Recording process was unresponsive and had to be killed. Stderr: {error_output}' + }), 500 - # return recording video file - if os.path.exists(recording_path): + recording_process = None # Clear the process from global state + + # Check if the recording file was created and is not empty. + if os.path.exists(recording_path) and os.path.getsize(recording_path) > 0: return send_file(recording_path, as_attachment=True) else: - return abort(404, description="Recording failed") + logger.error(f"Recording failed. The output file is missing or empty. ffmpeg stderr: {error_output}") + return abort(500, description=f"Recording failed. The output file is missing or empty. ffmpeg stderr: {error_output}") if __name__ == '__main__': diff --git a/evaluation_examples/examples/libreoffice_writer/4bcb1253-a636-4df4-8cb0-a35c04dfef31.json b/evaluation_examples/examples/libreoffice_writer/4bcb1253-a636-4df4-8cb0-a35c04dfef31.json index d192413..dd20e95 100644 --- a/evaluation_examples/examples/libreoffice_writer/4bcb1253-a636-4df4-8cb0-a35c04dfef31.json +++ b/evaluation_examples/examples/libreoffice_writer/4bcb1253-a636-4df4-8cb0-a35c04dfef31.json @@ -27,17 +27,57 @@ "libreoffice_writer" ], "evaluator": { - "func": "compare_pdfs", - "expected": { + "func": [ + "compare_pdfs", + "compare_pdfs", + "compare_pdfs", + "compare_pdfs" + ], + "conj": "or", + "expected": [ + { + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/4bcb1253-a636-4df4-8cb0-a35c04dfef31/View_Person_Organizational_Summary.pdf", + "dest": "Constitution_Template_With_Guidelines_Gold_1.pdf" + }, + { + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/4bcb1253-a636-4df4-8cb0-a35c04dfef31/View_Person_Organizational_Summary.pdf", + "dest": "Constitution_Template_With_Guidelines_Gold_2.pdf" + }, + { "type": "cloud_file", "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/4bcb1253-a636-4df4-8cb0-a35c04dfef31/View_Person_Organizational_Summary.pdf", - "dest": "Constitution_Template_With_Guidelines_Gold.pdf" + "dest": "Constitution_Template_With_Guidelines_Gold_3.pdf" }, - "result": { + { + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/4bcb1253-a636-4df4-8cb0-a35c04dfef31/View_Person_Organizational_Summary.pdf", + "dest": "Constitution_Template_With_Guidelines_Gold_4.pdf" + } + ], + "result": [ + { "type": "vm_file", "path": "/home/user/Desktop/View_Person_Organizational_Summary.pdf", - "dest": "Constitution_Template_With_Guidelines.pdf" - } + "dest": "Constitution_Template_With_Guidelines_1.pdf" + }, + { + "type": "vm_file", + "path": "/home/user/Documents/View_Person_Organizational_Summary.pdf", + "dest": "Constitution_Template_With_Guidelines_2.pdf" + }, + { + "type": "vm_file", + "path": "/home/user/Downloads/View_Person_Organizational_Summary.pdf", + "dest": "Constitution_Template_With_Guidelines_3.pdf" + }, + { + "type": "vm_file", + "path": "/home/user/View_Person_Organizational_Summary.pdf", + "dest": "Constitution_Template_With_Guidelines_4.pdf" + } + ] }, "proxy": false } \ No newline at end of file diff --git a/evaluation_examples/examples/libreoffice_writer/6ada715d-3aae-4a32-a6a7-429b2e43fb93.json 
b/evaluation_examples/examples/libreoffice_writer/6ada715d-3aae-4a32-a6a7-429b2e43fb93.json index eac54d7..49a817a 100644 --- a/evaluation_examples/examples/libreoffice_writer/6ada715d-3aae-4a32-a6a7-429b2e43fb93.json +++ b/evaluation_examples/examples/libreoffice_writer/6ada715d-3aae-4a32-a6a7-429b2e43fb93.json @@ -38,7 +38,7 @@ "command": [ "python", "-c", - "import pyautogui; import time; time.sleep(5); pyautogui.press(\"down\", presses=8, interval=0.01); time.sleep(1); pyautogui.scroll(-2)" + "import pyautogui; import time; time.sleep(15); pyautogui.press(\"down\", presses=8, interval=0.01); time.sleep(1); pyautogui.scroll(-2)" ] } } @@ -68,12 +68,12 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.hotkey('ctrl', 's'); time.sleep(0.5); pyautogui.press('down'); time.sleep(0.5); pyautogui.press('enter');" + "import pyautogui; import time; time.sleep(1); pyautogui.hotkey('ctrl', 's'); time.sleep(3);" ] } } ], - "func": "compare_contains_image", + "func": "compare_docx_images", "result": { "type": "vm_file", "path": "/home/user/Desktop/Viewing_Your_Class_Schedule_and_Textbooks.docx", diff --git a/evaluation_examples/examples/libreoffice_writer/6f81754e-285d-4ce0-b59e-af7edb02d108.json b/evaluation_examples/examples/libreoffice_writer/6f81754e-285d-4ce0-b59e-af7edb02d108.json index bb6bbb0..cb632a4 100644 --- a/evaluation_examples/examples/libreoffice_writer/6f81754e-285d-4ce0-b59e-af7edb02d108.json +++ b/evaluation_examples/examples/libreoffice_writer/6f81754e-285d-4ce0-b59e-af7edb02d108.json @@ -52,7 +52,7 @@ } } ], - "func": "compare_docx_lines", + "func": "compare_unique_train_records", "result": { "type": "vm_file", "path": "/home/user/Desktop/HK_train_record.docx", @@ -60,8 +60,16 @@ }, "expected": { "type": "cloud_file", - "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/6f81754e-285d-4ce0-b59e-af7edb02d108/HK_train_record_Gold.docx", - "dest": "HK_train_record_Gold.docx" + "path": [ + "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/6f81754e-285d-4ce0-b59e-af7edb02d108/HK_train_record_Gold.docx", + "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/6f81754e-285d-4ce0-b59e-af7edb02d108/HK_train_record.docx" + ], + "dest": [ + "HK_train_record_Gold.docx", + "HK_train_record_Original.docx" + ], + "multi": true, + "gives": [0, 1] } }, "proxy": false diff --git a/evaluation_examples/examples/libreoffice_writer/88fe4b2d-3040-4c70-9a70-546a47764b48.json b/evaluation_examples/examples/libreoffice_writer/88fe4b2d-3040-4c70-9a70-546a47764b48.json index 546d957..2bcf614 100644 --- a/evaluation_examples/examples/libreoffice_writer/88fe4b2d-3040-4c70-9a70-546a47764b48.json +++ b/evaluation_examples/examples/libreoffice_writer/88fe4b2d-3040-4c70-9a70-546a47764b48.json @@ -52,20 +52,57 @@ } } ], - "func": "compare_docx_files", - "expected": { - "type": "cloud_file", - "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/88fe4b2d-3040-4c70-9a70-546a47764b48/CCCH9003_Tutorial_guidelines_Gold.docx", - "dest": "CCCH9003_Tutorial_guidelines_Gold.docx" - }, - "result": { - "type": "vm_file", - "path": "/home/user/Desktop/CCCH9003_Tutorial_guidelines.docx", - "dest": "CCCH9003_Tutorial_guidelines.docx" - }, - "options": { - "ignore_blanks": false - } + "func": [ + "compare_docx_files", + "compare_docx_files", + "compare_docx_files" + ], + "conj": "or", + "expected": [ + { 
+ "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/88fe4b2d-3040-4c70-9a70-546a47764b48/CCCH9003_Tutorial_guidelines_Gold_1.docx", + "dest": "CCCH9003_Tutorial_guidelines_Gold_1.docx" + }, + { + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/88fe4b2d-3040-4c70-9a70-546a47764b48/CCCH9003_Tutorial_guidelines_Gold_2.docx", + "dest": "CCCH9003_Tutorial_guidelines_Gold_2.docx" + }, + { + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/88fe4b2d-3040-4c70-9a70-546a47764b48/CCCH9003_Tutorial_guidelines_Gold_3.docx", + "dest": "CCCH9003_Tutorial_guidelines_Gold_3.docx" + } + ], + "result": [ + { + "type": "vm_file", + "path": "/home/user/Desktop/CCCH9003_Tutorial_guidelines.docx", + "dest": "CCCH9003_Tutorial_guidelines.docx" + }, + { + "type": "vm_file", + "path": "/home/user/Desktop/CCCH9003_Tutorial_guidelines.docx", + "dest": "CCCH9003_Tutorial_guidelines.docx" + }, + { + "type": "vm_file", + "path": "/home/user/Desktop/CCCH9003_Tutorial_guidelines.docx", + "dest": "CCCH9003_Tutorial_guidelines.docx" + } + ], + "options": [ + { + "ignore_blanks": false + }, + { + "ignore_blanks": false + }, + { + "ignore_blanks": false + } + ] }, "proxy": false } \ No newline at end of file diff --git a/evaluation_examples/examples/libreoffice_writer/936321ce-5236-426a-9a20-e0e3c5dc536f.json b/evaluation_examples/examples/libreoffice_writer/936321ce-5236-426a-9a20-e0e3c5dc536f.json index 9abfd48..e7143c5 100644 --- a/evaluation_examples/examples/libreoffice_writer/936321ce-5236-426a-9a20-e0e3c5dc536f.json +++ b/evaluation_examples/examples/libreoffice_writer/936321ce-5236-426a-9a20-e0e3c5dc536f.json @@ -47,22 +47,40 @@ "command": [ "python", "-c", - "import pyautogui; import time; pyautogui.hotkey('ctrl', 's'); time.sleep(0.5); " + "import pyautogui; import time; pyautogui.hotkey('ctrl', 's'); time.sleep(2); " ] } } ], - "func": "compare_docx_tables", - "expected": { - "type": "cloud_file", - "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/936321ce-5236-426a-9a20-e0e3c5dc536f/Graphemes_Sound_Letter_Patterns_Gold.docx", - "dest": "Graphemes_Sound_Letter_Patterns_Gold.docx" - }, - "result": { - "type": "vm_file", - "path": "/home/user/Desktop/Graphemes_Sound_Letter_Patterns.docx", - "dest": "Graphemes_Sound_Letter_Patterns.docx" - } + "func": [ + "compare_docx_tables", + "compare_docx_tables" + ], + "conj": "or", + "expected": [ + { + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/936321ce-5236-426a-9a20-e0e3c5dc536f/Graphemes_Sound_Letter_Patterns_Gold.docx", + "dest": "Graphemes_Sound_Letter_Patterns_Gold.docx" + }, + { + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/936321ce-5236-426a-9a20-e0e3c5dc536f/Graphemes_Sound_Letter_Patterns_Gold_2.docx", + "dest": "Graphemes_Sound_Letter_Patterns_Gold_2.docx" + } + ], + "result": [ + { + "type": "vm_file", + "path": "/home/user/Desktop/Graphemes_Sound_Letter_Patterns.docx", + "dest": "Graphemes_Sound_Letter_Patterns.docx" + }, + { + "type": "vm_file", + "path": "/home/user/Desktop/Graphemes_Sound_Letter_Patterns.docx", + "dest": "Graphemes_Sound_Letter_Patterns.docx" + } + ] }, "proxy": false } \ No 
newline at end of file From 3d8f1779a21aaf388d1e7d60c269db1a804a3b55 Mon Sep 17 00:00:00 2001 From: Zilong Zhou Date: Tue, 17 Jun 2025 18:39:42 +0800 Subject: [PATCH 3/7] feat: use SSD with high throughput (#218) --- desktop_env/providers/aws/manager.py | 11 ++++++++++- desktop_env/providers/aws/provider.py | 9 +++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/desktop_env/providers/aws/manager.py b/desktop_env/providers/aws/manager.py index 1502083..0b24b0f 100644 --- a/desktop_env/providers/aws/manager.py +++ b/desktop_env/providers/aws/manager.py @@ -6,7 +6,7 @@ import logging import dotenv import signal -INSTANCE_TYPE = "t3.large" +INSTANCE_TYPE = "t3.medium" # Load environment variables from .env file dotenv.load_dotenv() @@ -97,6 +97,15 @@ def _allocate_vm(region=DEFAULT_REGION): os.getenv('AWS_SECURITY_GROUP_ID') ] } + ], + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/sda1", + "Ebs": { + 'VolumeType': 'io2', + 'Iops': 16000, + } + } ] } diff --git a/desktop_env/providers/aws/provider.py b/desktop_env/providers/aws/provider.py index d2a87c8..284ef2a 100644 --- a/desktop_env/providers/aws/provider.py +++ b/desktop_env/providers/aws/provider.py @@ -128,6 +128,15 @@ class AWSProvider(Provider): "DeviceIndex": 0, "Groups": security_groups } + ], + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/sda1", + "Ebs": { + 'VolumeType': 'io2', + 'Iops': 16000, + } + } ] } From 634e1c3d6ffa1037c5a92188743e6fa87175cf51 Mon Sep 17 00:00:00 2001 From: Zilong Zhou Date: Tue, 24 Jun 2025 15:35:38 +0800 Subject: [PATCH 4/7] Reduce the startup time of the software on AWS from one minute to five seconds. (#221) * feat: use SSD with high throughput * fix&refactor: update AMI ID and change EBS volume type to gp3 with adjusted IOPS and throughput --- desktop_env/providers/aws/manager.py | 9 ++++++--- desktop_env/providers/aws/provider.py | 7 +++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/desktop_env/providers/aws/manager.py b/desktop_env/providers/aws/manager.py index 0b24b0f..bcc89c5 100644 --- a/desktop_env/providers/aws/manager.py +++ b/desktop_env/providers/aws/manager.py @@ -35,7 +35,7 @@ DEFAULT_REGION = "us-east-1" # todo: Add doc for the configuration of image, security group and network interface # todo: public the AMI images IMAGE_ID_MAP = { - "us-east-1": "ami-03a22c6e501415fb1", + "us-east-1": "ami-025b3f907e5c1b805", "ap-east-1": "ami-0c092a5b8be4116f5", } @@ -102,8 +102,11 @@ def _allocate_vm(region=DEFAULT_REGION): { "DeviceName": "/dev/sda1", "Ebs": { - 'VolumeType': 'io2', - 'Iops': 16000, + # "VolumeInitializationRate": 300 + "VolumeSize": 30, # Size in GB + "VolumeType": "gp3", # General Purpose SSD + "Throughput": 1000, + "Iops": 4000 # Adjust IOPS as needed } } ] diff --git a/desktop_env/providers/aws/provider.py b/desktop_env/providers/aws/provider.py index 284ef2a..d2c034e 100644 --- a/desktop_env/providers/aws/provider.py +++ b/desktop_env/providers/aws/provider.py @@ -133,8 +133,11 @@ class AWSProvider(Provider): { "DeviceName": "/dev/sda1", "Ebs": { - 'VolumeType': 'io2', - 'Iops': 16000, + # "VolumeInitializationRate": 300 + "VolumeSize": 30, # Size in GB + "VolumeType": "gp3", # General Purpose SSD + "Throughput": 1000, + "Iops": 4000 # Adjust IOPS as needed } } ] From 48ac57697a3b839f7a644705a259327bbcd50248 Mon Sep 17 00:00:00 2001 From: MillanK <51349692+MillanK0817@users.noreply.github.com> Date: Tue, 24 Jun 2025 17:08:09 +0800 Subject: [PATCH 5/7] VSCode fix (#222) --- desktop_env/evaluators/metrics/vscode.py | 20 
+++++++++++-------- .../276cc624-87ea-4f08-ab93-f770e3790175.json | 2 +- .../7c4cc09e-7a92-40dd-8338-b2286535c4ed.json | 2 +- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/desktop_env/evaluators/metrics/vscode.py b/desktop_env/evaluators/metrics/vscode.py index d207aae..82129f6 100644 --- a/desktop_env/evaluators/metrics/vscode.py +++ b/desktop_env/evaluators/metrics/vscode.py @@ -58,16 +58,20 @@ def check_json_settings(actual: str, expected: str, **options) -> float: if not actual: return 0. - with open(actual, 'r') as f: - data = json.load(f) + try: + with open(actual, 'r') as f: + data = json.load(f) + except Exception as e: + return 0.0 expect = expected['expected'] - data_copy = copy.deepcopy(data) - data_copy.update(expect) - if data == data_copy: - return 1.0 - else: - return 0.0 + + # Check if all expected key-value pairs are in the actual data + for key, value in expect.items(): + if key not in data or data[key] != value: + return 0.0 + + return 1.0 def compare_text_file(actual: str, expected: str, **options) -> float: diff --git a/evaluation_examples/examples/vs_code/276cc624-87ea-4f08-ab93-f770e3790175.json b/evaluation_examples/examples/vs_code/276cc624-87ea-4f08-ab93-f770e3790175.json index d35ba53..bc421a9 100644 --- a/evaluation_examples/examples/vs_code/276cc624-87ea-4f08-ab93-f770e3790175.json +++ b/evaluation_examples/examples/vs_code/276cc624-87ea-4f08-ab93-f770e3790175.json @@ -1,7 +1,7 @@ { "id": "276cc624-87ea-4f08-ab93-f770e3790175", "snapshot": "vscode", - "instruction": "Please help me set the current user's line length to 50 characters in VS Code.", + "instruction": "Please help me set the current user's line length for code wrapping to 50 characters in VS Code.", "source": "https://www.quora.com/unanswered/How-do-you-set-the-line-length-in-Visual-Studio-Code", "config": [ { diff --git a/evaluation_examples/examples/vs_code/7c4cc09e-7a92-40dd-8338-b2286535c4ed.json b/evaluation_examples/examples/vs_code/7c4cc09e-7a92-40dd-8338-b2286535c4ed.json index f25886a..8345b1b 100644 --- a/evaluation_examples/examples/vs_code/7c4cc09e-7a92-40dd-8338-b2286535c4ed.json +++ b/evaluation_examples/examples/vs_code/7c4cc09e-7a92-40dd-8338-b2286535c4ed.json @@ -1,7 +1,7 @@ { "id": "7c4cc09e-7a92-40dd-8338-b2286535c4ed", "snapshot": "vscode", - "instruction": "Please help me change the display language of VS Code to \"Arabic\".", + "instruction": "Please help me change the display language of VS Code to \"Arabic\" without using any extensions.", "source": "", "config": [ { From 0cc93543a81903720ac2ff8fae4ea835a5de2312 Mon Sep 17 00:00:00 2001 From: Tianbao Xie <47296835+Timothyxxx@users.noreply.github.com> Date: Sat, 28 Jun 2025 00:45:53 +0800 Subject: [PATCH 6/7] Environment is_used flag; OS domain fix (#219) * Refactor evaluator structure in LibreOffice Writer example JSON to support multiple expected and result files, enhancing evaluation flexibility. * Update instance type to t3.large and add VNC access URL logging for allocated VMs, enhancing remote access capabilities. * Update instance type to t3.large and add VNC access URL logging for allocated VMs, enhancing remote access capabilities. * Update time format in get_vm_file function to include hours, minutes, and seconds for more precise file naming with time suffix. * More delay for 936321ce-5236-426a-9a20-e0e3c5dc536f; support one more potential solutions. * Enhance SetupController with configurable retry limit and improved error handling for file opening requests. 
Introduce new function to compare unique training records, and update logging for better debugging. Adjust JSON examples for evaluation to support multiple expected and result files. * Clean debug code * Enhance DesktopEnv to track environment usage for optimized snapshot management. Introduce is_environment_used flag to determine if a snapshot revert is necessary based on provider type. Update setup and step methods to mark environment usage appropriately. Add new execute_with_verification method in SetupController for command execution with result verification, improving reliability. Change AWS instance type to m5.large for better performance and update AMI ID for compatibility. Update file opening logic in main.py to handle both file paths and application commands more effectively. --------- Co-authored-by: yuanmengqi --- desktop_env/controllers/setup.py | 51 ++++++ desktop_env/desktop_env.py | 42 ++++- desktop_env/providers/aws/manager.py | 3 +- desktop_env/server/main.py | 147 ++++++++++++++++-- .../4127319a-8b79-4410-b58a-7a151e15f3d7.json | 2 +- 5 files changed, 224 insertions(+), 21 deletions(-) diff --git a/desktop_env/controllers/setup.py b/desktop_env/controllers/setup.py index 6728370..3a1d11a 100644 --- a/desktop_env/controllers/setup.py +++ b/desktop_env/controllers/setup.py @@ -305,6 +305,57 @@ class SetupController: if not terminates: time.sleep(0.3) + def _execute_with_verification_setup( + self, + command: List[str], + verification: Dict[str, Any] = None, + max_wait_time: int = 10, + check_interval: float = 1.0, + shell: bool = False + ): + """Execute command with verification of results + + Args: + command: Command to execute + verification: Dict with verification criteria: + - window_exists: Check if window with this name exists + - command_success: Execute this command and check if it succeeds + max_wait_time: Maximum time to wait for verification + check_interval: Time between verification checks + shell: Whether to use shell + """ + if not command: + raise Exception("Empty command to launch.") + + verification = verification or {} + + payload = json.dumps({ + "command": command, + "shell": shell, + "verification": verification, + "max_wait_time": max_wait_time, + "check_interval": check_interval + }) + headers = {"Content-Type": "application/json"} + + try: + response = requests.post(self.http_server + "/setup" + "/execute_with_verification", + headers=headers, data=payload, timeout=max_wait_time + 10) + if response.status_code == 200: + result = response.json() + logger.info("Command executed and verified successfully: %s -> %s" + , " ".join(command) if isinstance(command, list) else command + , response.text + ) + return result + else: + logger.error("Failed to execute with verification. 
Status code: %s", response.text) + raise Exception(f"Command verification failed: {response.text}") + except requests.exceptions.RequestException as e: + logger.error("An error occurred while trying to send the request: %s", e) + traceback.print_exc() + raise Exception(f"Request failed: {e}") + def _command_setup(self, command: List[str], **kwargs): self._execute_setup(command, **kwargs) diff --git a/desktop_env/desktop_env.py b/desktop_env/desktop_env.py index 72bea71..4c8cc48 100644 --- a/desktop_env/desktop_env.py +++ b/desktop_env/desktop_env.py @@ -18,7 +18,7 @@ logger = logging.getLogger("desktopenv.env") Metric = Callable[[Any, Any], float] Getter = Callable[[gym.Env, Dict[str, Any]], Any] -MAX_RETRIES = 5 +MAX_RETRIES = 5 # Maximum retries for environment setup class DesktopEnv(gym.Env): """ @@ -72,6 +72,16 @@ class DesktopEnv(gym.Env): self.os_type = os_type + # Track whether environment has been used (step/setup) to optimize snapshot revert + # docker, aws, gcp, azure are always unused as the emulator starts from a clean state + # vmware, virtualbox are always used as the emulator starts from a dirty state + if self.provider_name in {"docker", "aws", "gcp", "azure"}: + self.is_environment_used = False + elif self.provider_name in {"vmware", "virtualbox"}: + self.is_environment_used = True + else: + raise ValueError(f"Invalid provider name: {self.provider_name}") + # Initialize environment variables if path_to_vm: self.path_to_vm = os.path.abspath(os.path.expandvars(os.path.expanduser(path_to_vm))) \ @@ -190,11 +200,19 @@ class DesktopEnv(gym.Env): logger.info("Using regular AWS provider.") - logger.info("Reverting to snapshot to {}...".format(self.snapshot_name)) - self._revert_to_snapshot() - logger.info("Starting emulator...") - self._start_emulator() - logger.info("Emulator started.") + # Only revert to snapshot if environment has been used (step/setup) + # This optimization is especially important for cloud providers like AWS + # where unnecessary snapshot operations are costly and time-consuming + if self.is_environment_used: + logger.info("Environment has been used, reverting to snapshot {}...".format(self.snapshot_name)) + self._revert_to_snapshot() + logger.info("Starting emulator...") + self._start_emulator() + logger.info("Emulator started.") + # Reset the usage flag after reverting + self.is_environment_used = False + else: + logger.info("Environment is clean, skipping snapshot revert (provider: {}).".format(self.provider_name)) if task_config is not None: self._set_task_info(task_config) @@ -202,6 +220,9 @@ class DesktopEnv(gym.Env): logger.info("Setting up environment...") success = self.setup_controller.setup(self.config) if success: + # Mark environment as used when setup is successfully executed + if self.config: # Only mark as used if there were actual setup operations + self.is_environment_used = True break else: logger.error( @@ -300,6 +321,9 @@ class DesktopEnv(gym.Env): def step(self, action, pause=2): self._step_no += 1 self.action_history.append(action) + + # Mark environment as used when step is called + self.is_environment_used = True reward = 0 # todo: Define reward calculation for each example done = False # todo: Define episode termination condition for each example @@ -336,7 +360,11 @@ class DesktopEnv(gym.Env): Evaluate whether the task is successfully completed. 
""" - self.setup_controller.setup(self.evaluator.get("postconfig", [])) + postconfig = self.evaluator.get("postconfig", []) + self.setup_controller.setup(postconfig) + # Mark environment as used if there were postconfig setup operations + if postconfig: + self.is_environment_used = True if self.evaluator['func'] == "infeasible": if len(self.action_history) > 0 and self.action_history[-1] == "FAIL": diff --git a/desktop_env/providers/aws/manager.py b/desktop_env/providers/aws/manager.py index bcc89c5..076c182 100644 --- a/desktop_env/providers/aws/manager.py +++ b/desktop_env/providers/aws/manager.py @@ -6,7 +6,8 @@ import logging import dotenv import signal -INSTANCE_TYPE = "t3.medium" +# Using m5.large for better storage I/O performance to match io2 capabilities +INSTANCE_TYPE = "m5.large" # Load environment variables from .env file dotenv.load_dotenv() diff --git a/desktop_env/server/main.py b/desktop_env/server/main.py index de8009d..c39943e 100644 --- a/desktop_env/server/main.py +++ b/desktop_env/server/main.py @@ -117,6 +117,113 @@ def execute_command(): }), 500 +@app.route('/setup/execute_with_verification', methods=['POST']) +@app.route('/execute_with_verification', methods=['POST']) +def execute_command_with_verification(): + """Execute command and verify the result based on provided verification criteria""" + data = request.json + shell = data.get('shell', False) + command = data.get('command', "" if shell else []) + verification = data.get('verification', {}) + max_wait_time = data.get('max_wait_time', 10) # Maximum wait time in seconds + check_interval = data.get('check_interval', 1) # Check interval in seconds + + if isinstance(command, str) and not shell: + command = shlex.split(command) + + # Expand user directory + for i, arg in enumerate(command): + if arg.startswith("~/"): + command[i] = os.path.expanduser(arg) + + # Execute the main command + try: + if platform_name == "Windows": + flags = subprocess.CREATE_NO_WINDOW + else: + flags = 0 + result = subprocess.run( + command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=shell, + text=True, + timeout=120, + creationflags=flags, + ) + + # If no verification is needed, return immediately + if not verification: + return jsonify({ + 'status': 'success', + 'output': result.stdout, + 'error': result.stderr, + 'returncode': result.returncode + }) + + # Wait and verify the result + import time + start_time = time.time() + while time.time() - start_time < max_wait_time: + verification_passed = True + + # Check window existence if specified + if 'window_exists' in verification: + window_name = verification['window_exists'] + try: + if platform_name == 'Linux': + wmctrl_result = subprocess.run(['wmctrl', '-l'], + capture_output=True, text=True, check=True) + if window_name.lower() not in wmctrl_result.stdout.lower(): + verification_passed = False + elif platform_name in ['Windows', 'Darwin']: + import pygetwindow as gw + windows = gw.getWindowsWithTitle(window_name) + if not windows: + verification_passed = False + except Exception: + verification_passed = False + + # Check command execution if specified + if 'command_success' in verification: + verify_cmd = verification['command_success'] + try: + verify_result = subprocess.run(verify_cmd, shell=True, + capture_output=True, text=True, timeout=5) + if verify_result.returncode != 0: + verification_passed = False + except Exception: + verification_passed = False + + if verification_passed: + return jsonify({ + 'status': 'success', + 'output': result.stdout, + 'error': 
result.stderr, + 'returncode': result.returncode, + 'verification': 'passed', + 'wait_time': time.time() - start_time + }) + + time.sleep(check_interval) + + # Verification failed + return jsonify({ + 'status': 'verification_failed', + 'output': result.stdout, + 'error': result.stderr, + 'returncode': result.returncode, + 'verification': 'failed', + 'wait_time': max_wait_time + }), 500 + + except Exception as e: + return jsonify({ + 'status': 'error', + 'message': str(e) + }), 500 + + def _get_machine_architecture() -> str: """ Get the machine architecture, e.g., x86_64, arm64, aarch64, i386, etc. """ @@ -1129,20 +1236,36 @@ def open_file(): path_obj = Path(os.path.expandvars(os.path.expanduser(path))) - if not path_obj.exists(): - return f"File not found: {path_obj}", 404 + # Check if it's a file path that exists + is_file_path = path_obj.exists() + + # If it's not a file path, treat it as an application name/command + if not is_file_path: + # Check if it's a valid command by trying to find it in PATH + import shutil + if not shutil.which(path): + return f"Application/file not found: {path}", 404 try: - if platform.system() == "Windows": - os.startfile(path_obj) + if is_file_path: + # Handle file opening + if platform.system() == "Windows": + os.startfile(path_obj) + else: + open_cmd: str = "open" if platform.system() == "Darwin" else "xdg-open" + subprocess.Popen([open_cmd, str(path_obj)]) + file_name = path_obj.name + file_name_without_ext, _ = os.path.splitext(file_name) else: - open_cmd: str = "open" if platform.system() == "Darwin" else "xdg-open" - subprocess.Popen([open_cmd, str(path_obj)]) + # Handle application launching + if platform.system() == "Windows": + subprocess.Popen([path]) + else: + subprocess.Popen([path]) + file_name = path + file_name_without_ext = path - # Wait for the file to open - file_name = path_obj.name - # Some apps don't include the extension in the title - file_name_without_ext, _ = os.path.splitext(file_name) + # Wait for the file/application to open start_time = time.time() window_found = False @@ -1365,7 +1488,7 @@ def end_recording(): error_output = "" try: # Send SIGINT for a graceful shutdown, allowing ffmpeg to finalize the file. - recording_process.send_signal(signal.SIGINT) + recording_process.send_signal(signal.SIGINT) # Wait for ffmpeg to terminate. communicate() gets output and waits. _, error_output = recording_process.communicate(timeout=15) except subprocess.TimeoutExpired: @@ -1373,7 +1496,7 @@ def end_recording(): recording_process.kill() # After killing, communicate to get any remaining output. _, error_output = recording_process.communicate() - recording_process = None + recording_process = None return jsonify({ 'status': 'error', 'message': f'Recording process was unresponsive and had to be killed. 
Stderr: {error_output}' diff --git a/evaluation_examples/examples/os/4127319a-8b79-4410-b58a-7a151e15f3d7.json b/evaluation_examples/examples/os/4127319a-8b79-4410-b58a-7a151e15f3d7.json index 36be485..eb73a32 100644 --- a/evaluation_examples/examples/os/4127319a-8b79-4410-b58a-7a151e15f3d7.json +++ b/evaluation_examples/examples/os/4127319a-8b79-4410-b58a-7a151e15f3d7.json @@ -50,7 +50,7 @@ "type": "rule", "rules": { "include": [ - "54\n" + "54" ], "exclude": [] } From 30138c5db1fb04d723acbe01893642fdc3d12fd1 Mon Sep 17 00:00:00 2001 From: Tianbao Xie <47296835+Timothyxxx@users.noreply.github.com> Date: Sun, 29 Jun 2025 20:18:44 +0800 Subject: [PATCH 7/7] VLC fix (#224) * Enhance SetupController with improved logging and error handling during setup and file upload processes. Update instance type to t3.xlarge and AMI ID for AWS configuration. Add download progress logging and exception handling for better debugging. * Enhance VLC status evaluation by adding multiple paths for file and URL information extraction, improving robustness against varying VLC XML structures. Implement detailed logging for better debugging and error handling in case of mismatches or missing data. Update example JSON for VLC evaluation to use a valid HLS stream URL. * Improve audio comparison robustness in VLC evaluator by adding error handling for audio file loading and extraction. Implement detailed logging for empty or corrupt files, and normalize DTW distance calculation for more accurate similarity scoring. Remove deprecated audio fingerprint comparison function. --------- Co-authored-by: yuanmengqi --- desktop_env/controllers/setup.py | 49 +++-- desktop_env/evaluators/metrics/vlc.py | 180 ++++++++++++++---- desktop_env/providers/aws/manager.py | 6 +- desktop_env/server/main.py | 66 ++++++- .../bba3381f-b5eb-4439-bd9e-80c22218d5a7.json | 6 +- 5 files changed, 245 insertions(+), 62 deletions(-) diff --git a/desktop_env/controllers/setup.py b/desktop_env/controllers/setup.py index 3a1d11a..65b4a8a 100644 --- a/desktop_env/controllers/setup.py +++ b/desktop_env/controllers/setup.py @@ -79,7 +79,7 @@ class SetupController: return False - for cfg in config: + for i, cfg in enumerate(config): config_type: str = cfg["type"] parameters: Dict[str, Any] = cfg["parameters"] @@ -87,10 +87,17 @@ class SetupController: # protocol setup_function: str = "_{:}_setup".format(config_type) assert hasattr(self, setup_function), f'Setup controller cannot find init function {setup_function}' - logger.info(f"call function {setup_function}") - getattr(self, setup_function)(**parameters) - - logger.info("SETUP: %s(%s)", setup_function, str(parameters)) + + try: + logger.info(f"Executing setup step {i+1}/{len(config)}: {setup_function}") + logger.debug(f"Setup parameters: {parameters}") + getattr(self, setup_function)(**parameters) + logger.info(f"SETUP COMPLETED: {setup_function}({str(parameters)})") + except Exception as e: + logger.error(f"SETUP FAILED at step {i+1}/{len(config)}: {setup_function}({str(parameters)})") + logger.error(f"Error details: {e}") + logger.error(f"Traceback: {traceback.format_exc()}") + raise Exception(f"Setup step {i+1} failed: {setup_function} - {e}") from e return True @@ -113,25 +120,41 @@ class SetupController: raise Exception(f"Setup Download - Invalid URL ({url}) or path ({path}).") if not os.path.exists(cache_path): + logger.info(f"Cache file not found, downloading from {url} to {cache_path}") max_retries = 3 downloaded = False e = None for i in range(max_retries): try: - response = requests.get(url, 
stream=True) + logger.info(f"Download attempt {i+1}/{max_retries} for {url}") + response = requests.get(url, stream=True, timeout=300) # Add 5 minute timeout response.raise_for_status() + + # Get file size if available + total_size = int(response.headers.get('content-length', 0)) + if total_size > 0: + logger.info(f"File size: {total_size / (1024*1024):.2f} MB") + downloaded_size = 0 with open(cache_path, 'wb') as f: for chunk in response.iter_content(chunk_size=8192): if chunk: f.write(chunk) - logger.info("File downloaded successfully") + downloaded_size += len(chunk) + if total_size > 0 and downloaded_size % (1024*1024) == 0: # Log every MB + progress = (downloaded_size / total_size) * 100 + logger.info(f"Download progress: {progress:.1f}%") + + logger.info(f"File downloaded successfully to {cache_path} ({downloaded_size / (1024*1024):.2f} MB)") downloaded = True break except requests.RequestException as e: logger.error( f"Failed to download {url} caused by {e}. Retrying... ({max_retries - i - 1} attempts left)") + # Clean up partial download + if os.path.exists(cache_path): + os.remove(cache_path) if not downloaded: raise requests.RequestException(f"Failed to download {url}. No retries left.") @@ -144,14 +167,18 @@ class SetupController: # send request to server to upload file try: + logger.info(f"Uploading {os.path.basename(path)} to VM at {path}") logger.debug("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/upload") - response = requests.post(self.http_server + "/setup" + "/upload", headers=headers, data=form) + response = requests.post(self.http_server + "/setup" + "/upload", headers=headers, data=form, timeout=600) # 10 minute timeout for upload if response.status_code == 200: - logger.info("Command executed successfully: %s", response.text) + logger.info(f"File uploaded successfully: {path}") + logger.debug("Upload response: %s", response.text) else: - logger.error("Failed to upload file. Status code: %s", response.text) + logger.error(f"Failed to upload file {path}. 
Status code: {response.status_code}, Response: {response.text}") + raise requests.RequestException(f"Upload failed with status {response.status_code}") except requests.exceptions.RequestException as e: - logger.error("An error occurred while trying to send the request: %s", e) + logger.error(f"An error occurred while trying to upload {path}: {e}") + raise def _upload_file_setup(self, files: List[Dict[str, str]]): """ diff --git a/desktop_env/evaluators/metrics/vlc.py b/desktop_env/evaluators/metrics/vlc.py index 5af7b50..bb8e5de 100644 --- a/desktop_env/evaluators/metrics/vlc.py +++ b/desktop_env/evaluators/metrics/vlc.py @@ -3,6 +3,7 @@ import os import subprocess from typing import Dict from xml.etree import ElementTree +from urllib.parse import urlparse import acoustid import cv2 @@ -26,15 +27,108 @@ def is_vlc_playing(actual_status_path: str, rule: Dict[str, str]) -> float: tree = ElementTree.fromstring(actual_status) status = tree.find('state').text + logger.info(f"VLC Status: {status}") if status == 'playing': if rule['type'] == 'file_name': - file_info = tree.find('information/category[@name="meta"]/info[@name="filename"]').text + # Try multiple possible paths for file information in VLC XML + file_paths = [ + 'information/category[@name="meta"]/info[@name="filename"]', + 'information/category[@name="meta"]/info[@name="title"]', + 'information/category[@name="meta"]/info[@name="uri"]', + 'information/category[@name="meta"]/info[@name="location"]', + 'information/category[@name="meta"]/info[@name="name"]' + ] + + file_info = None + for path in file_paths: + element = tree.find(path) + if element is not None and element.text: + file_info = element.text + break + if file_info: - return 1 if file_info.endswith(rule['file_name']) else 0 + expected_filename = rule['file_name'] + + # Method 1: Direct filename match (most precise) + actual_basename = os.path.basename(file_info) + if actual_basename == expected_filename: + return 1 + + # Method 2: Endswith match (for backward compatibility) + if file_info.endswith(expected_filename): + return 1 + + # Method 3: For paths, check if expected filename is in the path + if expected_filename in file_info: + # Additional check to avoid false positives + # Make sure it's actually the filename, not just part of a path + if file_info.endswith('/' + expected_filename) or file_info.endswith('\\' + expected_filename): + return 1 + + logger.warning(f"File name mismatch - Expected: {expected_filename}, Found: {file_info}") + return 0 + else: + logger.warning(f"Could not find file information in VLC status XML for rule: {rule}") + return 0 elif rule['type'] == 'url': - file_info = tree.find('information/category[@name="meta"]/info[@name="url"]').text + # Try multiple possible paths for URL information in VLC XML + url_paths = [ + 'information/category[@name="meta"]/info[@name="url"]', + 'information/category[@name="meta"]/info[@name="URI"]', + 'information/category[@name="meta"]/info[@name="location"]', + 'information/category[@name="meta"]/info[@name="title"]', # Sometimes URL is in title for streams + 'information/category[@name="meta"]/info[@name="filename"]', # Sometimes URL is in filename for streams + 'information/category[@name="Stream 0"]/info[@name="Codec"]', # Try stream info + 'information/category[@name="Stream 0"]/info[@name="Type"]', + 'information/category[@name="Stream 0"]/info[@name="Language"]' + ] + + file_info = None + logger.debug(f"Looking for URL: {rule['url']}") + + for path in url_paths: + element = tree.find(path) + if element is 
not None and element.text: + file_info = element.text + logger.debug(f"Found URL info at '{path}': {file_info}") + break + if file_info: - return 1 if file_info.endswith(rule['url']) else 0 + # For URL comparison, check if the rule URL is contained in the file_info + # This handles cases where VLC might show a longer or modified URL + expected_url = rule['url'] + + # Method 1: Direct URL match + if expected_url in file_info or file_info.endswith(expected_url): + return 1 + + # Method 2: For HLS streams, VLC often shows just the filename instead of full URL + # Check if the file_info matches the filename part of the expected URL + try: + expected_parsed = urlparse(expected_url) + expected_filename = os.path.basename(expected_parsed.path) + + # If VLC shows just the filename (common for HLS streams) + if file_info == expected_filename: + logger.info(f"URL filename match - Expected URL: {expected_url}, VLC shows filename: {file_info}") + return 1 + + # Method 3: Check if both are URLs from the same domain and similar path + if '://' in file_info: # file_info is also a URL + actual_parsed = urlparse(file_info) + # Same domain and similar path structure + if (expected_parsed.netloc == actual_parsed.netloc and + expected_parsed.path in actual_parsed.path): + return 1 + except Exception as e: + logger.debug(f"URL parsing error: {e}") + pass + + logger.warning(f"URL mismatch - Expected: {expected_url}, Found: {file_info}") + return 0 + else: + logger.warning(f"Could not find URL information in VLC status XML for rule: {rule}") + return 0 else: logger.error(f"Unknown type: {rule['type']}") return 0 @@ -130,33 +224,67 @@ def compare_audios(audio_path_1, audio_path_2): Compare two audio files and return a similarity score in the range [0, 1]. audio_path_1, audio_path_2: paths to the audio files to compare """ - # Example Usage: # similarity = compare_audios_simple('path_to_audio1.mp3', 'path_to_audio2.mp3') # print(f'Similarity Score: {similarity}') - # Convert to common format if necessary and load audio if not audio_path_1 or not audio_path_2: return 0 - # Load the audio files and extract MFCC features - y1, sr1 = librosa.load(audio_path_1) - mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1) + y1, y2 = None, None + try: + y1, sr1 = librosa.load(audio_path_1) + except Exception: + logger.warning(f"Could not load audio from {os.path.basename(audio_path_1)}. It might be empty or corrupt.") - y2, sr2 = librosa.load(audio_path_2) - mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2) + try: + y2, sr2 = librosa.load(audio_path_2) + except Exception: + logger.warning(f"Could not load audio from {os.path.basename(audio_path_2)}. It might be empty or corrupt.") + + # Handle cases where one or both audio files are empty or corrupt. + is_y1_bad = (y1 is None) or (y1.shape[0] == 0) + is_y2_bad = (y2 is None) or (y2.shape[0] == 0) + + if is_y1_bad and is_y2_bad: + logger.info("Both audio files are empty or corrupt. Considering them perfectly similar.") + return 1.0 + + if is_y1_bad or is_y2_bad: + logger.warning(f"One audio file is empty/corrupt, the other is not. 
Similarity is 0.") + return 0.0 + + try: + logger.info(f"Audio 1 ({os.path.basename(audio_path_1)}): sr={sr1}, len={len(y1)}") + logger.info(f"Audio 2 ({os.path.basename(audio_path_2)}): sr={sr2}, len={len(y2)}") + + # Extract MFCC features + mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1) + mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2) + except Exception as e: + logger.error(f"Error during MFCC extraction: {e}") + return 0.0 # Normalize the MFCC features mfcc1 = librosa.util.normalize(mfcc1, axis=1) mfcc2 = librosa.util.normalize(mfcc2, axis=1) + logger.info(f"MFCCs normalized.") # Define a lambda function to compute cosine distance dist_func = lambda x, y: cosine(x, y) # Use the DTW algorithm to find the best alignment path distance, path = fastdtw(mfcc1.T, mfcc2.T, dist=dist_func) + logger.info(f"DTW distance: {distance:.4f}, Path length: {len(path)}") - # Calculate the similarity score, here we use 1/(1+distance) to convert distance to a similarity score - similarity = 1 / (1 + distance) + # Normalize the DTW distance by the length of the alignment path. + if len(path) == 0: + normalized_distance = np.inf + else: + normalized_distance = distance / len(path) + logger.info(f"Normalized DTW distance: {normalized_distance:.4f}") + + # Convert the normalized distance to a similarity score using an exponential decay function. + similarity = np.exp(-normalized_distance) return similarity @@ -204,32 +332,6 @@ def compare_videos(video_path1, video_path2, max_frames_to_check=100, threshold= return 1. -def are_audio_files_similar(mp3_file_path, mp4_file_path): - # Extract audio fingerprint from MP3 file - mp3_fingerprint, mp3_duration = acoustid.fingerprint_file(mp3_file_path) - - # Extract the audio stream from the MP4 file - mp4_audio_path = os.path.splitext(mp4_file_path)[0] + '_extracted.mp3' - try: - subprocess.run(["ffmpeg", "-i", mp4_file_path, "-vn", "-ar", "44100", "-ac", "2", "-ab", "192k", "-f", "mp3", - mp4_audio_path], check=True) - except subprocess.CalledProcessError as e: - print(f"An error occurred during audio extraction from MP4: {e}") - return 0. - - # Extract audio fingerprint from the extracted audio - mp4_fingerprint, mp4_duration = acoustid.fingerprint_file(mp4_audio_path) - - # Clean up temporary extracted audio file - os.remove(mp4_audio_path) - - # Compare fingerprints (rudimentary comparison) - if mp3_duration >= mp4_duration and mp3_fingerprint == mp4_fingerprint: - return 1. - - return 0. 
- - def check_qt_bgcone(actual_config_path, rule): with open(actual_config_path, 'rb') as file: config_file = file.read().decode('utf-8') diff --git a/desktop_env/providers/aws/manager.py b/desktop_env/providers/aws/manager.py index 076c182..15875d1 100644 --- a/desktop_env/providers/aws/manager.py +++ b/desktop_env/providers/aws/manager.py @@ -6,8 +6,8 @@ import logging import dotenv import signal -# Using m5.large for better storage I/O performance to match io2 capabilities -INSTANCE_TYPE = "m5.large" + +INSTANCE_TYPE = "t3.xlarge" # Load environment variables from .env file dotenv.load_dotenv() @@ -36,7 +36,7 @@ DEFAULT_REGION = "us-east-1" # todo: Add doc for the configuration of image, security group and network interface # todo: public the AMI images IMAGE_ID_MAP = { - "us-east-1": "ami-025b3f907e5c1b805", + "us-east-1": "ami-0cae20d2680c939d4", "ap-east-1": "ami-0c092a5b8be4116f5", } diff --git a/desktop_env/server/main.py b/desktop_env/server/main.py index c39943e..606d0ac 100644 --- a/desktop_env/server/main.py +++ b/desktop_env/server/main.py @@ -1135,11 +1135,21 @@ def get_file(): return jsonify({"error": "file_path is required"}), 400 try: + # Check if the file exists and get its size + if not os.path.exists(file_path): + return jsonify({"error": "File not found"}), 404 + + file_size = os.path.getsize(file_path) + logger.info(f"Serving file: {file_path} ({file_size} bytes)") + # Check if the file exists and send it to the user return send_file(file_path, as_attachment=True) except FileNotFoundError: # If the file is not found, return a 404 error return jsonify({"error": "File not found"}), 404 + except Exception as e: + logger.error(f"Error serving file {file_path}: {e}") + return jsonify({"error": f"Failed to serve file: {str(e)}"}), 500 @app.route("/setup/upload", methods=["POST"]) @@ -1148,8 +1158,27 @@ def upload_file(): if 'file_path' in request.form and 'file_data' in request.files: file_path = os.path.expandvars(os.path.expanduser(request.form['file_path'])) file = request.files["file_data"] - file.save(file_path) - return "File Uploaded" + + try: + # Ensure target directory exists + os.makedirs(os.path.dirname(file_path), exist_ok=True) + + # Save file and get size for verification + file.save(file_path) + uploaded_size = os.path.getsize(file_path) + + logger.info(f"File uploaded successfully: {file_path} ({uploaded_size} bytes)") + return f"File Uploaded: {uploaded_size} bytes" + + except Exception as e: + logger.error(f"Error uploading file to {file_path}: {e}") + # Clean up partial file if it exists + if os.path.exists(file_path): + try: + os.remove(file_path) + except: + pass + return jsonify({"error": f"Failed to upload file: {str(e)}"}), 500 else: return jsonify({"error": "file_path and file_data are required"}), 400 @@ -1208,20 +1237,45 @@ def download_file(): max_retries = 3 error: Optional[Exception] = None + for i in range(max_retries): try: - response = requests.get(url, stream=True) + logger.info(f"Download attempt {i+1}/{max_retries} for {url}") + response = requests.get(url, stream=True, timeout=300) response.raise_for_status() + + # Get expected file size if available + total_size = int(response.headers.get('content-length', 0)) + if total_size > 0: + logger.info(f"Expected file size: {total_size / (1024*1024):.2f} MB") + downloaded_size = 0 with open(path, 'wb') as f: for chunk in response.iter_content(chunk_size=8192): if chunk: f.write(chunk) - return "File downloaded successfully" + downloaded_size += len(chunk) + if total_size > 0 and downloaded_size 
% (1024*1024) == 0: # Log every MB + progress = (downloaded_size / total_size) * 100 + logger.info(f"Download progress: {progress:.1f}%") + + # Verify download completeness + actual_size = os.path.getsize(path) + if total_size > 0 and actual_size != total_size: + raise Exception(f"Download incomplete. Expected {total_size} bytes, got {actual_size} bytes") + + logger.info(f"File downloaded successfully: {path} ({actual_size} bytes)") + return f"File downloaded successfully: {actual_size} bytes" - except requests.RequestException as e: + except (requests.RequestException, Exception) as e: error = e - logger.error(f"Failed to download {url}. Retrying... ({max_retries - i - 1} attempts left)") + logger.error(f"Failed to download {url}: {e}. Retrying... ({max_retries - i - 1} attempts left)") + # Clean up partial download + if path.exists(): + try: + path.unlink() + except: + pass return f"Failed to download {url}. No retries left. Error: {error}", 500 diff --git a/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json b/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json index 90440f6..f4b85b6 100644 --- a/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json +++ b/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json @@ -1,8 +1,8 @@ { "id": "bba3381f-b5eb-4439-bd9e-80c22218d5a7", "snapshot": "base_setup", - "instruction": "Can you start streaming the video from this link for me? https://www.youtube.com/watch?v=pgBsyTKAwLw", - "source": "https://www.quora.com/How-do-I-play-online-videos-using-the-VLC-media-player", + "instruction": "Can you start streaming the video from this link for me? https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8", + "source": "https://developer.apple.com/streaming/examples/ - Apple HLS test streams for developers", "config": [ { "type": "launch", @@ -32,7 +32,7 @@ "type": "rule", "rules": { "type": "url", - "url": "https://www.youtube.com/watch?v=pgBsyTKAwLw" + "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8" } }, "result": {
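For reference, a minimal sketch (illustrative only, not part of the patch) of how the filename fallback added to is_vlc_playing accepts this HLS example even when VLC's status XML reports only the playlist name rather than the full URL; the filename-only report below is an assumed example value:

import os
from urllib.parse import urlparse

EXPECTED_URL = "https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8"

def url_matches(expected: str, shown: str) -> bool:
    # Direct match first (full URL or suffix), then the HLS fallback:
    # VLC often shows just the playlist filename for streamed media.
    if expected in shown or shown.endswith(expected):
        return True
    return shown == os.path.basename(urlparse(expected).path)

print(url_matches(EXPECTED_URL, EXPECTED_URL))   # True (full URL reported)
print(url_matches(EXPECTED_URL, "master.m3u8"))  # True (filename-only report)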