Robust Evaluation, Blocking File Open, Grader Sensitivity, and LibreOffice Writer Fixes (#217)

* Refactor evaluator structure in LibreOffice Writer example JSON to support multiple expected and result files, enhancing evaluation flexibility.

* Update instance type to t3.large and add VNC access URL logging for allocated VMs, enhancing remote access capabilities.

* Update the time format in get_vm_file to include hours, minutes, and seconds for more precise file naming with a time suffix.

* Add more delay for 936321ce-5236-426a-9a20-e0e3c5dc536f; support one more potential solution.

* Enhance SetupController with a configurable retry limit and improved error handling for file-opening requests. Introduce a new function to compare unique train records, and update logging for better debugging. Adjust JSON examples for evaluation to support multiple expected and result files.

* Clean debug code

---------

Co-authored-by: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
Author: Tianbao Xie
Date: 2025-06-16 21:37:19 +08:00
Committer: GitHub
Parent: 347238e17e
Commit: 4e11eafd1d
13 changed files with 523 additions and 135 deletions

View File

@@ -36,6 +36,8 @@ FILE_PATH = os.path.dirname(os.path.abspath(__file__))
init_proxy_pool(PROXY_CONFIG_FILE) # initialize the global proxy pool
MAX_RETRIES = 20
class SetupController:
def __init__(self, vm_ip: str, server_port: int = 5000, chromium_port: int = 9222, vlc_port: int = 8080, cache_dir: str = "cache"):
self.vm_ip: str = vm_ip
@@ -64,16 +66,16 @@ class SetupController:
# make sure connection can be established
logger.info(f"try to connect {self.http_server}")
retry = 0
while retry < 50:
while retry < MAX_RETRIES:
try:
_ = requests.get(self.http_server + "/terminal")
break
except:
time.sleep(5)
retry += 1
logger.info(f"retry: {retry}/50")
logger.info(f"retry: {retry}/{MAX_RETRIES}")
if retry == 50:
if retry == MAX_RETRIES:
return False
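For reference, the connection check now amounts to the following pattern (a minimal standalone sketch, assuming the VM-side server exposes GET /terminal; the helper name and delay are illustrative):
import time
import requests

MAX_RETRIES = 20  # configurable retry limit, replacing the hard-coded 50

def wait_for_server(http_server: str, delay: float = 5.0) -> bool:
    # Poll the VM-side server until it answers or the retry budget is exhausted.
    for retry in range(1, MAX_RETRIES + 1):
        try:
            requests.get(http_server + "/terminal", timeout=10)
            return True
        except requests.exceptions.RequestException:
            time.sleep(delay)
            print(f"retry: {retry}/{MAX_RETRIES}")
    return False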
@@ -219,13 +221,14 @@ class SetupController:
# send request to server to open file
try:
response = requests.post(self.http_server + "/setup" + "/open_file", headers=headers, data=payload)
if response.status_code == 200:
logger.info("Command executed successfully: %s", response.text)
else:
logger.error("Failed to open file. Status code: %s", response.text)
# The server-side call is now blocking and can take time.
# We set a timeout that is slightly longer than the server's timeout (1800s).
response = requests.post(self.http_server + "/setup" + "/open_file", headers=headers, data=payload, timeout=1810)
response.raise_for_status() # This will raise an exception for 4xx and 5xx status codes
logger.info("Command executed successfully: %s", response.text)
except requests.exceptions.RequestException as e:
logger.error("An error occurred while trying to send the request: %s", e)
logger.error(f"Failed to open file '{path}'. An error occurred while trying to send the request or the server responded with an error: {e}")
raise Exception(f"Failed to open file '{path}'. An error occurred while trying to send the request or the server responded with an error: {e}") from e
def _launch_setup(self, command: Union[str, List[str]], shell: bool = False):
if not command:
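The client side of the blocking open_file request can be exercised in isolation with a sketch like this (hedged: the 1800 s server budget comes from this diff; server_url, the payload shape, and the helper name are illustrative):
import requests

SERVER_OPEN_TIMEOUT = 1800                      # server-side budget (seconds)
CLIENT_TIMEOUT = SERVER_OPEN_TIMEOUT + 10       # wait slightly longer than the server

def open_file_on_vm(server_url: str, path: str) -> str:
    # Ask the VM-side server to open a file and block until it reports success or failure.
    response = requests.post(
        f"{server_url}/setup/open_file",
        json={"path": path},                    # assumed payload shape
        timeout=CLIENT_TIMEOUT,
    )
    response.raise_for_status()                 # surface 4xx/5xx responses as exceptions
    return response.text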

View File

@@ -80,18 +80,16 @@ def get_vm_file(env, config: Dict[str, Any]) -> Union[Optional[str], List[Option
returned.
only support for single file now:
time_suffix(bool): optional. defaults to False. if True, append the current time in required format.
time_format(str): optional. defaults to "%Y_%m_%d". format of the time suffix.
time_format(str): optional. defaults to "%Y%m%d_%H%M%S". format of the time suffix.
"""
time_format = "%Y_%m_%d"
time_format = "%Y%m%d_%H%M%S"
if not config.get("multi", False):
paths: List[str] = [config["path"]]
dests: List[str] = [config["dest"]]
if "time_suffix" in config.keys() and config["time_suffix"]:
if "time_format" in config.keys():
time_format = config["time_format"]
# Insert time before . in file type suffix
paths = [p.split(".")[0] + datetime.now().strftime(time_format) + "." + p.split(".")[1] if "." in p else p for p in paths]
dests = [d.split(".")[0] + datetime.now().strftime(time_format) + "." + d.split(".")[1] if "." in d else d for d in dests]
if config.get("time_suffix", False):
time_format = config.get("time_format", time_format)
# Insert time before file extension.
dests = [f"{os.path.splitext(d)[0]}_{datetime.now().strftime(time_format)}{os.path.splitext(d)[1]}" for d in dests]
else:
paths: List[str] = config["path"]
dests: List[str] = config["dest"]
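With this change the suffix is inserted before the file extension of the local destination name instead of being spliced at the first dot; a minimal illustration (file names are made up):
import os
from datetime import datetime

def with_time_suffix(dest: str, time_format: str = "%Y%m%d_%H%M%S") -> str:
    # e.g. "HK_train_record.docx" -> "HK_train_record_20250616_213719.docx"
    stem, ext = os.path.splitext(dest)
    return f"{stem}_{datetime.now().strftime(time_format)}{ext}"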

View File

@@ -52,7 +52,8 @@ from .docs import (
compare_docx_files_and_ignore_new_lines,
compare_docx_images,
compare_image_text,
compare_references
compare_references,
compare_unique_train_records
)
from .general import (
check_csv,

View File

@@ -167,8 +167,12 @@ def compare_docx_files(file1, file2, **options):
if ignore_case:
p1, p2 = p1.lower(), p2.lower()
if p1 != p2:
print(p1)
print(p2)
# show the difference
print("=== First Paragraph ===")
print(f"\033[92m{repr(p1)}\033[0m") # Green color for p1, repr() shows hidden chars
print("=== Second Paragraph ===")
print(f"\033[91m{repr(p2)}\033[0m") # Red color for p2, repr() shows hidden chars
print("=" * 50) # Clear boundary
return 0
return 1
@@ -886,3 +890,72 @@ def compare_references(file1, file2, **options):
return (result - reference_base_result) / (1 - reference_base_result)
else:
return 0
def compare_unique_train_records(processed_file, expected_files, **kwargs):
"""
Compares the processed file with a list of expected files containing the
gold standard and the initial document.
expected_files[0] should be the gold standard file.
expected_files[1] should be the initial file.
"""
# Debug logging to understand what we're actually receiving
logger.info(f"DEBUG: processed_file type: {type(processed_file)}, value: {processed_file}")
logger.info(f"DEBUG: expected_files type: {type(expected_files)}, value: {expected_files}")
logger.info(f"DEBUG: kwargs: {kwargs}")
if not processed_file or not isinstance(expected_files, list) or len(expected_files) < 2:
logger.error("Invalid arguments: processed_file and a list of 2 expected_files are required.")
return 0
gold_file = expected_files[0]
initial_file = expected_files[1]
if not gold_file or not initial_file:
logger.error("Gold file or initial file path is missing from expected_files list.")
return 0
# Helper function to get lines and IDs from a file
def get_lines_and_ids_from_file(file_path):
try:
doc = Document(file_path)
lines = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
train_ids = [line.split(',')[1].strip() for line in lines if len(line.split(',')) == 4]
return lines, train_ids
except Exception as e:
logger.error(f"Error opening or parsing file {file_path}: {e}")
return None, None
# Get data from all three files
processed_lines, processed_train_ids = get_lines_and_ids_from_file(processed_file)
if processed_lines is None: return 0
gold_lines, gold_train_ids = get_lines_and_ids_from_file(gold_file)
if gold_lines is None: return 0
initial_lines, _ = get_lines_and_ids_from_file(initial_file)
if initial_lines is None: return 0
initial_lines_set = set(initial_lines)
# 1. Subset Check: Ensure every processed line was in the initial file
if not set(processed_lines).issubset(initial_lines_set):
logger.error("Processed file contains lines not present in the initial file.")
logger.error(f"Extra lines: {set(processed_lines) - initial_lines_set}")
return 0
# 2. Uniqueness Check: Check for duplicates within the processed file
if len(processed_train_ids) != len(set(processed_train_ids)):
logger.error("Duplicate train_ids found in the processed file.")
return 0
# 3. Correctness Check: Compare the set of train_ids
if set(processed_train_ids) != set(gold_train_ids):
logger.error("Set of train_ids does not match between processed file and gold file.")
return 0
# 4. Line count check
if len(processed_lines) != len(gold_lines):
logger.error("Number of lines does not match between processed file and gold file.")
return 0
return 1
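A hedged usage sketch, assuming the cached expected files are handed to the metric in the order declared in the task JSON (gold standard first, initial document second); the paths are illustrative:
# Returns 1 only if the processed document keeps only lines from the initial file,
# contains no duplicate train_ids, and matches the gold file's train_id set and line count.
score = compare_unique_train_records(
    "cache/HK_train_record.docx",               # file pulled back from the VM
    [
        "cache/HK_train_record_Gold.docx",      # expected_files[0]: gold standard
        "cache/HK_train_record_Original.docx",  # expected_files[1]: initial document
    ],
)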

View File

@@ -5,6 +5,9 @@ import psutil
import logging
import dotenv
import signal
INSTANCE_TYPE = "t3.large"
# Load environment variables from .env file
dotenv.load_dotenv()
@@ -31,37 +34,17 @@ logger.setLevel(logging.INFO)
DEFAULT_REGION = "us-east-1"
# todo: Add doc for the configuration of image, security group and network interface
# todo: public the AMI images
# ami-05e7d7bd279ea4f14
IMAGE_ID_MAP = {
"us-east-1": "ami-00674d875de9addc1",
"us-east-1": "ami-03a22c6e501415fb1",
"ap-east-1": "ami-0c092a5b8be4116f5",
}
INSTANCE_TYPE = "t3.medium"
def _allocate_vm(region=DEFAULT_REGION):
if region not in IMAGE_ID_MAP:
raise ValueError(f"Region {region} is not supported. Supported regions are: {list(IMAGE_ID_MAP.keys())}")
run_instances_params = {
"MaxCount": 1,
"MinCount": 1,
"ImageId": IMAGE_ID_MAP[region],
"InstanceType": INSTANCE_TYPE,
"EbsOptimized": True,
"NetworkInterfaces": [
{
"SubnetId": os.getenv('AWS_SUBNET_ID'),
"AssociatePublicIpAddress": True,
"DeviceIndex": 0,
"Groups": [
os.getenv('AWS_SECURITY_GROUP_ID')
]
}
]
}
ec2_client = boto3.client('ec2', region_name=region)
instance_id = None
original_sigint_handler = signal.getsignal(signal.SIGINT)
@@ -94,26 +77,64 @@ def _allocate_vm(region=DEFAULT_REGION):
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
if not os.getenv('AWS_SECURITY_GROUP_ID'):
raise ValueError("AWS_SECURITY_GROUP_ID is not set in the environment variables.")
if not os.getenv('AWS_SUBNET_ID'):
raise ValueError("AWS_SUBNET_ID is not set in the environment variables.")
run_instances_params = {
"MaxCount": 1,
"MinCount": 1,
"ImageId": IMAGE_ID_MAP[region],
"InstanceType": INSTANCE_TYPE,
"EbsOptimized": True,
"NetworkInterfaces": [
{
"SubnetId": os.getenv('AWS_SUBNET_ID'),
"AssociatePublicIpAddress": True,
"DeviceIndex": 0,
"Groups": [
os.getenv('AWS_SECURITY_GROUP_ID')
]
}
]
}
response = ec2_client.run_instances(**run_instances_params)
instance_id = response['Instances'][0]['InstanceId']
waiter = ec2_client.get_waiter('instance_running')
logger.info(f"Waiting for instance {instance_id} to be running...")
ec2_client.get_waiter('instance_running').wait(InstanceIds=[instance_id])
waiter.wait(InstanceIds=[instance_id])
logger.info(f"Instance {instance_id} is ready.")
# Get and display the VNC access address
try:
instance_details = ec2_client.describe_instances(InstanceIds=[instance_id])
instance = instance_details['Reservations'][0]['Instances'][0]
public_ip = instance.get('PublicIpAddress', '')
if public_ip:
vnc_url = f"http://{public_ip}:5910/vnc.html"
logger.info("="*80)
logger.info(f"🖥️ VNC Web Access URL: {vnc_url}")
logger.info(f"📡 Public IP: {public_ip}")
logger.info(f"🆔 Instance ID: {instance_id}")
logger.info("="*80)
print(f"\n🌐 VNC访问地址: {vnc_url}")
print(f"📍 请在浏览器中打开上述地址进行远程桌面访问\n")
except Exception as e:
logger.warning(f"Failed to get VNC address for instance {instance_id}: {e}")
except KeyboardInterrupt:
logger.warning("VM allocation interrupted by user (SIGINT).")
raise
except SystemExit:
logger.warning("VM allocation terminated by parent process (SIGTERM).")
if instance_id:
logger.info(f"Terminating instance {instance_id} due to interruption.")
ec2_client.terminate_instances(InstanceIds=[instance_id])
raise
except Exception as e:
logger.error(f"Failed to allocate VM in region {region}: {str(e)}")
# try to clean up any resources that were created
try:
if instance_id:
ec2_client.terminate_instances(InstanceIds=[instance_id])
logger.info(f"Terminated instance {instance_id} due to allocation failure.")
except Exception as cleanup_error:
logger.error(f"May fail to clean up instance {instance_id}: {str(cleanup_error)}")
logger.error(f"Failed to allocate VM: {e}", exc_info=True)
if instance_id:
logger.info(f"Terminating instance {instance_id} due to an error.")
ec2_client.terminate_instances(InstanceIds=[instance_id])
raise
finally:
# Restore original signal handlers
@@ -153,6 +174,27 @@ def _allocate_vm_with_proxy(region=DEFAULT_REGION, proxy_config_file=None):
subnet_id=os.getenv('AWS_SUBNET_ID')
)
try:
ec2_client = boto3.client('ec2', region_name=region)
instance_details = ec2_client.describe_instances(InstanceIds=[instance_id])
instance = instance_details['Reservations'][0]['Instances'][0]
public_ip = instance.get('PublicIpAddress', '')
if public_ip:
vnc_url = f"http://{public_ip}:5910/vnc.html"
logger.info("="*80)
logger.info(f"🖥️ VNC Web Access URL: {vnc_url}")
logger.info(f"📡 Public IP: {public_ip}")
logger.info(f"🆔 Instance ID: {instance_id}")
if current_proxy:
logger.info(f"🌐 Proxy: {current_proxy.host}:{current_proxy.port}")
logger.info("="*80)
print(f"\n🌐 VNC Web Access URL: {vnc_url}")
if current_proxy:
print(f"🔄 Current Proxy: {current_proxy.host}:{current_proxy.port}")
print(f"📍 Please open the above address in the browser for remote desktop access\n")
except Exception as e:
logger.warning(f"Failed to get VNC address for proxy instance {instance_id}: {e}")
return instance_id
@@ -213,4 +255,4 @@ class AWSVMManager(VMManager):
else:
logger.info("Allocating a new VM in region: {}".format(region))
new_vm_path = _allocate_vm(region)
return new_vm_path
return new_vm_path

View File

@@ -63,10 +63,24 @@ class AWSProvider(Provider):
for reservation in response['Reservations']:
for instance in reservation['Instances']:
private_ip_address = instance.get('PrivateIpAddress', '')
public_ip_address = instance.get('PublicIpAddress', '')
if public_ip_address:
vnc_url = f"http://{public_ip_address}:5910/vnc.html"
logger.info("="*80)
logger.info(f"🖥️ VNC Web Access URL: {vnc_url}")
logger.info(f"📡 Public IP: {public_ip_address}")
logger.info(f"🏠 Private IP: {private_ip_address}")
logger.info("="*80)
print(f"\n🌐 VNC Web Access URL: {vnc_url}")
print(f"📍 Please open the above address in the browser for remote desktop access\n")
else:
logger.warning("No public IP address available for VNC access")
return private_ip_address
return '' # Return an empty string if no IP address is found
except ClientError as e:
logger.error(f"Failed to retrieve private IP address for the instance {path_to_vm}: {str(e)}")
logger.error(f"Failed to retrieve IP address for the instance {path_to_vm}: {str(e)}")
raise
def save_state(self, path_to_vm: str, snapshot_name: str):
@@ -74,7 +88,7 @@ class AWSProvider(Provider):
ec2_client = boto3.client('ec2', region_name=self.region)
try:
image_response = ec2_client.create_image(InstanceId=path_to_vm, ImageId=snapshot_name)
image_response = ec2_client.create_image(InstanceId=path_to_vm, Name=snapshot_name)
image_id = image_response['ImageId']
logger.info(f"AMI {image_id} created successfully from instance {path_to_vm}.")
return image_id
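The fix replaces the invalid ImageId= keyword with Name=, which is what boto3's create_image expects; a minimal standalone sketch (region, instance ID, and AMI name are placeholders):
import boto3

ec2_client = boto3.client("ec2", region_name="us-east-1")

# Create an AMI from a running instance; Name is required, ImageId is not an accepted argument here.
image_response = ec2_client.create_image(InstanceId="i-0123456789abcdef0", Name="osworld-snapshot-1")
image_id = image_response["ImageId"]

# Optionally block until the AMI is usable before reverting to it.
ec2_client.get_waiter("image_available").wait(ImageIds=[image_id])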
@@ -83,7 +97,7 @@ class AWSProvider(Provider):
raise
def revert_to_snapshot(self, path_to_vm: str, snapshot_name: str):
logger.info(f"Reverting AWS VM to snapshot: {snapshot_name}...")
logger.info(f"Reverting AWS VM to snapshot AMI: {snapshot_name}...")
ec2_client = boto3.client('ec2', region_name=self.region)
try:
@@ -93,23 +107,21 @@ class AWSProvider(Provider):
security_groups = [sg['GroupId'] for sg in instance['SecurityGroups']]
subnet_id = instance['SubnetId']
instance_type = instance['InstanceType']
instance_snapshot = instance_details['Reservations'][0]['Instances'][0]['ImageId']
# Step 2: Terminate the old instance
ec2_client.terminate_instances(InstanceIds=[path_to_vm])
logger.info(f"Old instance {path_to_vm} has been terminated.")
# Step 3: Launch a new instance from the snapshot
logger.info(f"Launching a new instance from snapshot {instance_snapshot}...")
new_instance = ec2_client.run_instances(
MaxCount = 1,
MinCount = 1,
ImageId = instance_snapshot,
InstanceType = instance_type,
EbsOptimized = True,
NetworkInterfaces = [
# Step 3: Launch a new instance from the snapshot(AMI) with performance optimization
logger.info(f"Launching a new instance from AMI {snapshot_name}...")
run_instances_params = {
"MaxCount": 1,
"MinCount": 1,
"ImageId": snapshot_name,
"InstanceType": instance_type,
"EbsOptimized": True,
"NetworkInterfaces": [
{
"SubnetId": subnet_id,
"AssociatePublicIpAddress": True,
@@ -117,13 +129,31 @@ class AWSProvider(Provider):
"Groups": security_groups
}
]
)
}
new_instance = ec2_client.run_instances(**run_instances_params)
new_instance_id = new_instance['Instances'][0]['InstanceId']
logger.info(f"New instance {new_instance_id} launched from snapshot {snapshot_name}.")
logger.info(f"New instance {new_instance_id} launched from AMI {snapshot_name}.")
logger.info(f"Waiting for instance {new_instance_id} to be running...")
ec2_client.get_waiter('instance_running').wait(InstanceIds=[new_instance_id])
logger.info(f"Instance {new_instance_id} is ready.")
try:
instance_details = ec2_client.describe_instances(InstanceIds=[new_instance_id])
instance = instance_details['Reservations'][0]['Instances'][0]
public_ip = instance.get('PublicIpAddress', '')
if public_ip:
vnc_url = f"http://{public_ip}:5910/vnc.html"
logger.info("="*80)
logger.info(f"🖥️ New Instance VNC Web Access URL: {vnc_url}")
logger.info(f"📡 Public IP: {public_ip}")
logger.info(f"🆔 New Instance ID: {new_instance_id}")
logger.info("="*80)
print(f"\n🌐 New Instance VNC Web Access URL: {vnc_url}")
print(f"📍 Please open the above address in the browser for remote desktop access\n")
except Exception as e:
logger.warning(f"Failed to get VNC address for new instance {new_instance_id}: {e}")
return new_instance_id

View File

@@ -163,16 +163,34 @@ echo "$(date): Configured proxy {self.current_proxy.host}:{self.current_proxy.po
logger.info(f"Created new instance {instance_id} with proxy configuration")
# Wait for the instance to be running
logger.info(f"Waiting for instance {instance_id} to be running...")
ec2_client.get_waiter('instance_running').wait(InstanceIds=[instance_id])
logger.info(f"Instance {instance_id} is ready.")
try:
instance_details = ec2_client.describe_instances(InstanceIds=[instance_id])
instance = instance_details['Reservations'][0]['Instances'][0]
public_ip = instance.get('PublicIpAddress', '')
if public_ip:
vnc_url = f"http://{public_ip}:5910/vnc.html"
logger.info("="*80)
logger.info(f"🖥️ VNC Web Access URL: {vnc_url}")
logger.info(f"📡 Public IP: {public_ip}")
logger.info(f"🆔 Instance ID: {instance_id}")
if self.current_proxy:
logger.info(f"🌐 Proxy: {self.current_proxy.host}:{self.current_proxy.port}")
logger.info("="*80)
print(f"\n🌐 VNC Web Access URL: {vnc_url}")
if self.current_proxy:
print(f"🔄 Current Proxy: {self.current_proxy.host}:{self.current_proxy.port}")
print(f"📍 Please open the above address in the browser for remote desktop access\n")
except Exception as e:
logger.warning(f"Failed to get VNC address for instance {instance_id}: {e}")
return instance_id
except ClientError as e:
logger.error(f"Failed to create instance with proxy: {str(e)}")
# If the current proxy failed, try rotating to a new proxy
if self.current_proxy:
proxy_pool = get_global_proxy_pool()
proxy_pool.mark_proxy_failed(self.current_proxy)
@@ -188,10 +206,28 @@ echo "$(date): Configured proxy {self.current_proxy.host}:{self.current_proxy.po
for reservation in response['Reservations']:
for instance in reservation['Instances']:
private_ip_address = instance.get('PrivateIpAddress', '')
public_ip_address = instance.get('PublicIpAddress', '')
if public_ip_address:
vnc_url = f"http://{public_ip_address}:5910/vnc.html"
logger.info("="*80)
logger.info(f"🖥️ VNC Web Access URL: {vnc_url}")
logger.info(f"📡 Public IP: {public_ip_address}")
logger.info(f"🏠 Private IP: {private_ip_address}")
if self.current_proxy:
logger.info(f"🌐 Proxy: {self.current_proxy.host}:{self.current_proxy.port}")
logger.info("="*80)
print(f"\n🌐 VNC Web Access URL: {vnc_url}")
if self.current_proxy:
print(f"🔄 Current Proxy: {self.current_proxy.host}:{self.current_proxy.port}")
print(f"📍 Please open the above address in the browser for remote desktop access\n")
else:
logger.warning("No public IP address available for VNC access")
return private_ip_address
return ''
except ClientError as e:
logger.error(f"Failed to retrieve private IP address for the instance {path_to_vm}: {str(e)}")
logger.error(f"Failed to retrieve IP address for the instance {path_to_vm}: {str(e)}")
raise
def save_state(self, path_to_vm: str, snapshot_name: str):
@@ -212,24 +248,28 @@ echo "$(date): Configured proxy {self.current_proxy.host}:{self.current_proxy.po
ec2_client = boto3.client('ec2', region_name=self.region)
try:
# 获取原实例详情
# Get original instance details for config.
instance_details = ec2_client.describe_instances(InstanceIds=[path_to_vm])
instance = instance_details['Reservations'][0]['Instances'][0]
security_groups = [sg['GroupId'] for sg in instance['SecurityGroups']]
subnet_id = instance['SubnetId']
instance_type = instance['InstanceType']
# 终止旧实例
# Terminate the old instance. This is a non-blocking call.
logger.info(f"Initiating termination for old instance {path_to_vm}...")
ec2_client.terminate_instances(InstanceIds=[path_to_vm])
logger.info(f"Old instance {path_to_vm} has been terminated.")
logger.info(f"Old instance {path_to_vm} termination initiated.")
# 轮换到新的代理
# Rotate to a new proxy
self._rotate_proxy()
# 创建新实例
# Create a new instance
new_instance_id = self.create_instance_with_proxy(
snapshot_name, instance_type, security_groups, subnet_id
)
# Note: VNC address is displayed within create_instance_with_proxy
logger.info(f"Successfully launched new instance {new_instance_id} for revert.")
return new_instance_id

View File

@@ -4,6 +4,7 @@ import platform
import shlex
import json
import subprocess, signal
import time
from pathlib import Path
from typing import Any, Optional, Sequence
from typing import List, Dict, Tuple, Literal
@@ -65,6 +66,8 @@ app = Flask(__name__)
pyautogui.PAUSE = 0
pyautogui.DARWIN_CATCH_UP_TIME = 0
TIMEOUT = 1800 # seconds
logger = app.logger
recording_process = None # fixme: this is a temporary solution for recording, need to be changed to support multiple-process
recording_path = "/tmp/recording.mp4"
@@ -202,8 +205,8 @@ def capture_screen_with_cursor():
pos = (round(pos_win[0]*ratio - hotspotx), round(pos_win[1]*ratio - hotspoty))
img.paste(cursor, pos, cursor)
except:
pass
except Exception as e:
logger.warning(f"Failed to capture cursor on Windows, screenshot will not have a cursor. Error: {e}")
img.save(file_path)
elif user_platform == "Linux":
@@ -1124,18 +1127,72 @@ def open_file():
if not path:
return "Path not supplied!", 400
path = Path(os.path.expandvars(os.path.expanduser(path)))
path_obj = Path(os.path.expandvars(os.path.expanduser(path)))
if not path.exists():
return f"File not found: {path}", 404
if not path_obj.exists():
return f"File not found: {path_obj}", 404
try:
if platform.system() == "Windows":
os.startfile(path)
os.startfile(path_obj)
else:
open_cmd: str = "open" if platform.system() == "Darwin" else "xdg-open"
subprocess.Popen([open_cmd, str(path)])
return "File opened successfully"
subprocess.Popen([open_cmd, str(path_obj)])
# Wait for the file to open
file_name = path_obj.name
# Some apps don't include the extension in the title
file_name_without_ext, _ = os.path.splitext(file_name)
start_time = time.time()
window_found = False
while time.time() - start_time < TIMEOUT:
os_name = platform.system()
if os_name in ['Windows', 'Darwin']:
import pygetwindow as gw
# Check for window title containing file name or file name without extension
windows = gw.getWindowsWithTitle(file_name)
if not windows:
windows = gw.getWindowsWithTitle(file_name_without_ext)
if windows:
# To be more specific, we can try to activate it
windows[0].activate()
window_found = True
break
elif os_name == 'Linux':
try:
# Using wmctrl to list windows and check if any window title contains the filename
result = subprocess.run(['wmctrl', '-l'], capture_output=True, text=True, check=True)
window_list = result.stdout.strip().split('\n')
if not result.stdout.strip():
pass # No windows, just continue waiting
else:
for window in window_list:
if file_name in window or file_name_without_ext in window:
# a window is found, now activate it
window_id = window.split()[0]
subprocess.run(['wmctrl', '-i', '-a', window_id], check=True)
window_found = True
break
if window_found:
break
except (subprocess.CalledProcessError, FileNotFoundError):
# wmctrl might not be installed or the window manager isn't ready.
# We just log it once and let the main loop retry.
if 'wmctrl_failed_once' not in locals():
logger.warning("wmctrl command is not ready, will keep retrying...")
wmctrl_failed_once = True
pass # Let the outer loop retry
time.sleep(1)
if window_found:
return "File opened and window activated successfully"
else:
return f"Failed to find window for {file_name} within {timeout} seconds.", 500
except Exception as e:
return f"Failed to open {path}. Error: {e}", 500
@@ -1258,37 +1315,78 @@ def close_window():
@app.route('/start_recording', methods=['POST'])
def start_recording():
global recording_process
if recording_process:
if recording_process and recording_process.poll() is None:
return jsonify({'status': 'error', 'message': 'Recording is already in progress.'}), 400
# Clean up previous recording if it exists
if os.path.exists(recording_path):
try:
os.remove(recording_path)
except OSError as e:
logger.error(f"Error removing old recording file: {e}")
return jsonify({'status': 'error', 'message': f'Failed to remove old recording file: {e}'}), 500
d = display.Display()
screen_width = d.screen().width_in_pixels
screen_height = d.screen().height_in_pixels
start_command = f"ffmpeg -y -f x11grab -draw_mouse 1 -s {screen_width}x{screen_height} -i :0.0 -c:v libx264 -r 30 {recording_path}"
recording_process = subprocess.Popen(shlex.split(start_command), stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL)
# Use stderr=PIPE to capture potential errors from ffmpeg
recording_process = subprocess.Popen(shlex.split(start_command),
stdout=subprocess.DEVNULL,
stderr=subprocess.PIPE,
text=True # To get stderr as string
)
return jsonify({'status': 'success', 'message': 'Started recording.'})
# Wait a couple of seconds to see if ffmpeg starts successfully
try:
# Wait for 2 seconds. If ffmpeg exits within this time, it's an error.
recording_process.wait(timeout=2)
# If wait() returns, it means the process has terminated.
error_output = recording_process.stderr.read()
return jsonify({
'status': 'error',
'message': f'Failed to start recording. ffmpeg terminated unexpectedly. Error: {error_output}'
}), 500
except subprocess.TimeoutExpired:
# This is the expected outcome: the process is still running after 2 seconds.
return jsonify({'status': 'success', 'message': 'Started recording successfully.'})
@app.route('/end_recording', methods=['POST'])
def end_recording():
global recording_process
if not recording_process:
if not recording_process or recording_process.poll() is not None:
recording_process = None # Clean up stale process object
return jsonify({'status': 'error', 'message': 'No recording in progress to stop.'}), 400
error_output = ""
try:
# Send SIGINT for a graceful shutdown, allowing ffmpeg to finalize the file.
recording_process.send_signal(signal.SIGINT)
recording_process.wait()
# Wait for ffmpeg to terminate. communicate() gets output and waits.
_, error_output = recording_process.communicate(timeout=15)
except subprocess.TimeoutExpired:
logger.error("ffmpeg did not respond to SIGINT, killing the process.")
recording_process.kill()
# After killing, communicate to get any remaining output.
_, error_output = recording_process.communicate()
recording_process = None
return jsonify({
'status': 'error',
'message': f'Recording process was unresponsive and had to be killed. Stderr: {error_output}'
}), 500
# return recording video file
if os.path.exists(recording_path):
recording_process = None # Clear the process from global state
# Check if the recording file was created and is not empty.
if os.path.exists(recording_path) and os.path.getsize(recording_path) > 0:
return send_file(recording_path, as_attachment=True)
else:
return abort(404, description="Recording failed")
logger.error(f"Recording failed. The output file is missing or empty. ffmpeg stderr: {error_output}")
return abort(500, description=f"Recording failed. The output file is missing or empty. ffmpeg stderr: {error_output}")
if __name__ == '__main__':
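The ffmpeg handshake introduced here can be summarised by the following sketch (the output path, display, and geometry are placeholders; the 2 s startup probe and the SIGINT shutdown mirror the diff):
import shlex
import signal
import subprocess

def start_recording(path="/tmp/recording.mp4", size="1920x1080"):
    cmd = f"ffmpeg -y -f x11grab -draw_mouse 1 -s {size} -i :0.0 -c:v libx264 -r 30 {path}"
    proc = subprocess.Popen(shlex.split(cmd), stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
    try:
        proc.wait(timeout=2)                     # if ffmpeg exits this quickly, it failed to start
        raise RuntimeError(proc.stderr.read())
    except subprocess.TimeoutExpired:
        return proc                              # still running after 2 s: recording has started

def stop_recording(proc):
    proc.send_signal(signal.SIGINT)              # graceful stop so ffmpeg finalises the file
    try:
        proc.communicate(timeout=15)
    except subprocess.TimeoutExpired:
        proc.kill()                              # last resort if ffmpeg ignores SIGINT
        proc.communicate()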

View File

@@ -27,17 +27,57 @@
"libreoffice_writer"
],
"evaluator": {
"func": "compare_pdfs",
"expected": {
"func": [
"compare_pdfs",
"compare_pdfs",
"compare_pdfs",
"compare_pdfs"
],
"conj": "or",
"expected": [
{
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/4bcb1253-a636-4df4-8cb0-a35c04dfef31/View_Person_Organizational_Summary.pdf",
"dest": "Constitution_Template_With_Guidelines_Gold_1.pdf"
},
{
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/4bcb1253-a636-4df4-8cb0-a35c04dfef31/View_Person_Organizational_Summary.pdf",
"dest": "Constitution_Template_With_Guidelines_Gold_2.pdf"
},
{
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/4bcb1253-a636-4df4-8cb0-a35c04dfef31/View_Person_Organizational_Summary.pdf",
"dest": "Constitution_Template_With_Guidelines_Gold.pdf"
"dest": "Constitution_Template_With_Guidelines_Gold_3.pdf"
},
"result": {
{
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/4bcb1253-a636-4df4-8cb0-a35c04dfef31/View_Person_Organizational_Summary.pdf",
"dest": "Constitution_Template_With_Guidelines_Gold_4.pdf"
}
],
"result": [
{
"type": "vm_file",
"path": "/home/user/Desktop/View_Person_Organizational_Summary.pdf",
"dest": "Constitution_Template_With_Guidelines.pdf"
}
"dest": "Constitution_Template_With_Guidelines_1.pdf"
},
{
"type": "vm_file",
"path": "/home/user/Documents/View_Person_Organizational_Summary.pdf",
"dest": "Constitution_Template_With_Guidelines_2.pdf"
},
{
"type": "vm_file",
"path": "/home/user/Downloads/View_Person_Organizational_Summary.pdf",
"dest": "Constitution_Template_With_Guidelines_3.pdf"
},
{
"type": "vm_file",
"path": "/home/user/View_Person_Organizational_Summary.pdf",
"dest": "Constitution_Template_With_Guidelines_4.pdf"
}
]
},
"proxy": false
}
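Under an "or" conjunction the task passes as soon as any (expected, result) pair scores 1, so saving the PDF to any of the four candidate locations is accepted; a hedged sketch of that aggregation (the real logic lives in the evaluation harness, this is only an illustration):
def aggregate(scores, conj="and"):
    # Combine per-pair metric scores: "or" accepts the best candidate, "and" requires all of them.
    return max(scores) if conj == "or" else min(scores)

# e.g. four compare_pdfs scores, one per candidate save location on the VM:
# aggregate([0, 0, 1, 0], conj="or") -> 1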

View File

@@ -38,7 +38,7 @@
"command": [
"python",
"-c",
"import pyautogui; import time; time.sleep(5); pyautogui.press(\"down\", presses=8, interval=0.01); time.sleep(1); pyautogui.scroll(-2)"
"import pyautogui; import time; time.sleep(15); pyautogui.press(\"down\", presses=8, interval=0.01); time.sleep(1); pyautogui.scroll(-2)"
]
}
}
@@ -68,12 +68,12 @@
"command": [
"python",
"-c",
"import pyautogui; import time; pyautogui.hotkey('ctrl', 's'); time.sleep(0.5); pyautogui.press('down'); time.sleep(0.5); pyautogui.press('enter');"
"import pyautogui; import time; time.sleep(1); pyautogui.hotkey('ctrl', 's'); time.sleep(3);"
]
}
}
],
"func": "compare_contains_image",
"func": "compare_docx_images",
"result": {
"type": "vm_file",
"path": "/home/user/Desktop/Viewing_Your_Class_Schedule_and_Textbooks.docx",

View File

@@ -52,7 +52,7 @@
}
}
],
"func": "compare_docx_lines",
"func": "compare_unique_train_records",
"result": {
"type": "vm_file",
"path": "/home/user/Desktop/HK_train_record.docx",
@@ -60,8 +60,16 @@
},
"expected": {
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/6f81754e-285d-4ce0-b59e-af7edb02d108/HK_train_record_Gold.docx",
"dest": "HK_train_record_Gold.docx"
"path": [
"https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/6f81754e-285d-4ce0-b59e-af7edb02d108/HK_train_record_Gold.docx",
"https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/6f81754e-285d-4ce0-b59e-af7edb02d108/HK_train_record.docx"
],
"dest": [
"HK_train_record_Gold.docx",
"HK_train_record_Original.docx"
],
"multi": true,
"gives": [0, 1]
}
},
"proxy": false

View File

@@ -52,20 +52,57 @@
}
}
],
"func": "compare_docx_files",
"expected": {
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/88fe4b2d-3040-4c70-9a70-546a47764b48/CCCH9003_Tutorial_guidelines_Gold.docx",
"dest": "CCCH9003_Tutorial_guidelines_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "/home/user/Desktop/CCCH9003_Tutorial_guidelines.docx",
"dest": "CCCH9003_Tutorial_guidelines.docx"
},
"options": {
"ignore_blanks": false
}
"func": [
"compare_docx_files",
"compare_docx_files",
"compare_docx_files"
],
"conj": "or",
"expected": [
{
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/88fe4b2d-3040-4c70-9a70-546a47764b48/CCCH9003_Tutorial_guidelines_Gold_1.docx",
"dest": "CCCH9003_Tutorial_guidelines_Gold_1.docx"
},
{
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/88fe4b2d-3040-4c70-9a70-546a47764b48/CCCH9003_Tutorial_guidelines_Gold_2.docx",
"dest": "CCCH9003_Tutorial_guidelines_Gold_2.docx"
},
{
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/88fe4b2d-3040-4c70-9a70-546a47764b48/CCCH9003_Tutorial_guidelines_Gold_3.docx",
"dest": "CCCH9003_Tutorial_guidelines_Gold_3.docx"
}
],
"result": [
{
"type": "vm_file",
"path": "/home/user/Desktop/CCCH9003_Tutorial_guidelines.docx",
"dest": "CCCH9003_Tutorial_guidelines.docx"
},
{
"type": "vm_file",
"path": "/home/user/Desktop/CCCH9003_Tutorial_guidelines.docx",
"dest": "CCCH9003_Tutorial_guidelines.docx"
},
{
"type": "vm_file",
"path": "/home/user/Desktop/CCCH9003_Tutorial_guidelines.docx",
"dest": "CCCH9003_Tutorial_guidelines.docx"
}
],
"options": [
{
"ignore_blanks": false
},
{
"ignore_blanks": false
},
{
"ignore_blanks": false
}
]
},
"proxy": false
}

View File

@@ -47,22 +47,40 @@
"command": [
"python",
"-c",
"import pyautogui; import time; pyautogui.hotkey('ctrl', 's'); time.sleep(0.5); "
"import pyautogui; import time; pyautogui.hotkey('ctrl', 's'); time.sleep(2); "
]
}
}
],
"func": "compare_docx_tables",
"expected": {
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/936321ce-5236-426a-9a20-e0e3c5dc536f/Graphemes_Sound_Letter_Patterns_Gold.docx",
"dest": "Graphemes_Sound_Letter_Patterns_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "/home/user/Desktop/Graphemes_Sound_Letter_Patterns.docx",
"dest": "Graphemes_Sound_Letter_Patterns.docx"
}
"func": [
"compare_docx_tables",
"compare_docx_tables"
],
"conj": "or",
"expected": [
{
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/936321ce-5236-426a-9a20-e0e3c5dc536f/Graphemes_Sound_Letter_Patterns_Gold.docx",
"dest": "Graphemes_Sound_Letter_Patterns_Gold.docx"
},
{
"type": "cloud_file",
"path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/936321ce-5236-426a-9a20-e0e3c5dc536f/Graphemes_Sound_Letter_Patterns_Gold_2.docx",
"dest": "Graphemes_Sound_Letter_Patterns_Gold_2.docx"
}
],
"result": [
{
"type": "vm_file",
"path": "/home/user/Desktop/Graphemes_Sound_Letter_Patterns.docx",
"dest": "Graphemes_Sound_Letter_Patterns.docx"
},
{
"type": "vm_file",
"path": "/home/user/Desktop/Graphemes_Sound_Letter_Patterns.docx",
"dest": "Graphemes_Sound_Letter_Patterns.docx"
}
]
},
"proxy": false
}