From 30138c5db1fb04d723acbe01893642fdc3d12fd1 Mon Sep 17 00:00:00 2001
From: Tianbao Xie <47296835+Timothyxxx@users.noreply.github.com>
Date: Sun, 29 Jun 2025 20:18:44 +0800
Subject: [PATCH] VLC fix (#224)

* Enhance SetupController with improved logging and error handling during setup and file upload processes. Update instance type to t3.xlarge and AMI ID for AWS configuration. Add download progress logging and exception handling for better debugging.

* Enhance VLC status evaluation by adding multiple paths for file and URL information extraction, improving robustness against varying VLC XML structures. Implement detailed logging for better debugging and error handling in case of mismatches or missing data. Update example JSON for VLC evaluation to use a valid HLS stream URL.

* Improve audio comparison robustness in VLC evaluator by adding error handling for audio file loading and extraction. Implement detailed logging for empty or corrupt files, and normalize DTW distance calculation for more accurate similarity scoring. Remove deprecated audio fingerprint comparison function.

---------

Co-authored-by: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
---
 desktop_env/controllers/setup.py              |  49 +++--
 desktop_env/evaluators/metrics/vlc.py         | 180 ++++++++++++++----
 desktop_env/providers/aws/manager.py          |   6 +-
 desktop_env/server/main.py                    |  66 ++++++-
 .../bba3381f-b5eb-4439-bd9e-80c22218d5a7.json |   6 +-
 5 files changed, 245 insertions(+), 62 deletions(-)

diff --git a/desktop_env/controllers/setup.py b/desktop_env/controllers/setup.py
index 3a1d11a..65b4a8a 100644
--- a/desktop_env/controllers/setup.py
+++ b/desktop_env/controllers/setup.py
@@ -79,7 +79,7 @@ class SetupController:
                 return False
                 
 
-        for cfg in config:
+        for i, cfg in enumerate(config):
             config_type: str = cfg["type"]
             parameters: Dict[str, Any] = cfg["parameters"]
 
@@ -87,10 +87,17 @@ class SetupController:
             # protocol
             setup_function: str = "_{:}_setup".format(config_type)
             assert hasattr(self, setup_function), f'Setup controller cannot find init function {setup_function}'
-            logger.info(f"call function {setup_function}")
-            getattr(self, setup_function)(**parameters)
-
-            logger.info("SETUP: %s(%s)", setup_function, str(parameters))
+            
+            try:
+                logger.info(f"Executing setup step {i+1}/{len(config)}: {setup_function}")
+                logger.debug(f"Setup parameters: {parameters}")
+                getattr(self, setup_function)(**parameters)
+                logger.info(f"SETUP COMPLETED: {setup_function}({str(parameters)})")
+            except Exception as e:
+                logger.error(f"SETUP FAILED at step {i+1}/{len(config)}: {setup_function}({str(parameters)})")
+                logger.error(f"Error details: {e}")
+                logger.error(f"Traceback: {traceback.format_exc()}")
+                raise Exception(f"Setup step {i+1} failed: {setup_function} - {e}") from e
         
         return True
 
@@ -113,25 +120,41 @@ class SetupController:
                 raise Exception(f"Setup Download - Invalid URL ({url}) or path ({path}).")
 
             if not os.path.exists(cache_path):
+                logger.info(f"Cache file not found, downloading from {url} to {cache_path}")
                 max_retries = 3
                 downloaded = False
                 e = None
                 for i in range(max_retries):
                     try:
-                        response = requests.get(url, stream=True)
+                        logger.info(f"Download attempt {i+1}/{max_retries} for {url}")
+                        response = requests.get(url, stream=True, timeout=300)  # Add 5 minute timeout
                         response.raise_for_status()
+                        
+                        # Get file size if available
+                        total_size = int(response.headers.get('content-length', 0))
+                        if total_size > 0:
+                            logger.info(f"File size: {total_size / (1024*1024):.2f} MB")
 
+                        downloaded_size = 0
                         with open(cache_path, 'wb') as f:
                             for chunk in response.iter_content(chunk_size=8192):
                                 if chunk:
                                     f.write(chunk)
-                        logger.info("File downloaded successfully")
+                                    downloaded_size += len(chunk)
+                                    if total_size > 0 and downloaded_size % (1024*1024) == 0:  # Log every MB
+                                        progress = (downloaded_size / total_size) * 100
+                                        logger.info(f"Download progress: {progress:.1f}%")
+                        
+                        logger.info(f"File downloaded successfully to {cache_path} ({downloaded_size / (1024*1024):.2f} MB)")
                         downloaded = True
                         break
 
                     except requests.RequestException as e:
                         logger.error(
                             f"Failed to download {url} caused by {e}. Retrying... ({max_retries - i - 1} attempts left)")
+                        # Clean up partial download
+                        if os.path.exists(cache_path):
+                            os.remove(cache_path)
                 if not downloaded:
                     raise requests.RequestException(f"Failed to download {url}. No retries left.")
 
@@ -144,14 +167,18 @@ class SetupController:
 
             # send request to server to upload file
             try:
+                logger.info(f"Uploading {os.path.basename(path)} to VM at {path}")
                 logger.debug("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/upload")
-                response = requests.post(self.http_server + "/setup" + "/upload", headers=headers, data=form)
+                response = requests.post(self.http_server + "/setup" + "/upload", headers=headers, data=form, timeout=600)  # 10 minute timeout for upload
                 if response.status_code == 200:
-                    logger.info("Command executed successfully: %s", response.text)
+                    logger.info(f"File uploaded successfully: {path}")
+                    logger.debug("Upload response: %s", response.text)
                 else:
-                    logger.error("Failed to upload file. Status code: %s", response.text)
+                    logger.error(f"Failed to upload file {path}. Status code: {response.status_code}, Response: {response.text}")
+                    raise requests.RequestException(f"Upload failed with status {response.status_code}")
             except requests.exceptions.RequestException as e:
-                logger.error("An error occurred while trying to send the request: %s", e)
+                logger.error(f"An error occurred while trying to upload {path}: {e}")
+                raise
 
     def _upload_file_setup(self, files: List[Dict[str, str]]):
         """
diff --git a/desktop_env/evaluators/metrics/vlc.py b/desktop_env/evaluators/metrics/vlc.py
index 5af7b50..bb8e5de 100644
--- a/desktop_env/evaluators/metrics/vlc.py
+++ b/desktop_env/evaluators/metrics/vlc.py
@@ -3,6 +3,7 @@ import os
 import subprocess
 from typing import Dict
 from xml.etree import ElementTree
+from urllib.parse import urlparse
 
 import acoustid
 import cv2
@@ -26,15 +27,108 @@ def is_vlc_playing(actual_status_path: str, rule: Dict[str, str]) -> float:
 
     tree = ElementTree.fromstring(actual_status)
     status = tree.find('state').text
+    logger.info(f"VLC Status: {status}")
     if status == 'playing':
         if rule['type'] == 'file_name':
-            file_info = tree.find('information/category[@name="meta"]/info[@name="filename"]').text
+            # Try multiple possible paths for file information in VLC XML
+            file_paths = [
+                'information/category[@name="meta"]/info[@name="filename"]',
+                'information/category[@name="meta"]/info[@name="title"]',
+                'information/category[@name="meta"]/info[@name="uri"]',
+                'information/category[@name="meta"]/info[@name="location"]',
+                'information/category[@name="meta"]/info[@name="name"]'
+            ]
+            
+            file_info = None
+            for path in file_paths:
+                element = tree.find(path)
+                if element is not None and element.text:
+                    file_info = element.text
+                    break
+            
             if file_info:
-                return 1 if file_info.endswith(rule['file_name']) else 0
+                expected_filename = rule['file_name']
+                
+                # Method 1: Direct filename match (most precise)
+                actual_basename = os.path.basename(file_info)
+                if actual_basename == expected_filename:
+                    return 1
+                
+                # Method 2: Endswith match (for backward compatibility)
+                if file_info.endswith(expected_filename):
+                    return 1
+                
+                # Method 3: For paths, check if expected filename is in the path
+                if expected_filename in file_info:
+                    # Additional check to avoid false positives
+                    # Make sure it's actually the filename, not just part of a path
+                    if file_info.endswith('/' + expected_filename) or file_info.endswith('\\' + expected_filename):
+                        return 1
+                
+                logger.warning(f"File name mismatch - Expected: {expected_filename}, Found: {file_info}")
+                return 0
+            else:
+                logger.warning(f"Could not find file information in VLC status XML for rule: {rule}")
+                return 0
         elif rule['type'] == 'url':
-            file_info = tree.find('information/category[@name="meta"]/info[@name="url"]').text
+            # Try multiple possible paths for URL information in VLC XML
+            url_paths = [
+                'information/category[@name="meta"]/info[@name="url"]',
+                'information/category[@name="meta"]/info[@name="URI"]', 
+                'information/category[@name="meta"]/info[@name="location"]',
+                'information/category[@name="meta"]/info[@name="title"]',  # Sometimes URL is in title for streams
+                'information/category[@name="meta"]/info[@name="filename"]',  # Sometimes URL is in filename for streams
+                'information/category[@name="Stream 0"]/info[@name="Codec"]',  # Try stream info
+                'information/category[@name="Stream 0"]/info[@name="Type"]',
+                'information/category[@name="Stream 0"]/info[@name="Language"]'
+            ]
+            
+            file_info = None
+            logger.debug(f"Looking for URL: {rule['url']}")
+            
+            for path in url_paths:
+                element = tree.find(path)
+                if element is not None and element.text:
+                    file_info = element.text
+                    logger.debug(f"Found URL info at '{path}': {file_info}")
+                    break
+            
             if file_info:
-                return 1 if file_info.endswith(rule['url']) else 0
+                # For URL comparison, check if the rule URL is contained in the file_info
+                # This handles cases where VLC might show a longer or modified URL
+                expected_url = rule['url']
+                
+                # Method 1: Direct URL match
+                if expected_url in file_info or file_info.endswith(expected_url):
+                    return 1
+                
+                # Method 2: For HLS streams, VLC often shows just the filename instead of full URL
+                # Check if the file_info matches the filename part of the expected URL
+                try:
+                    expected_parsed = urlparse(expected_url)
+                    expected_filename = os.path.basename(expected_parsed.path)
+                    
+                    # If VLC shows just the filename (common for HLS streams)
+                    if file_info == expected_filename:
+                        logger.info(f"URL filename match - Expected URL: {expected_url}, VLC shows filename: {file_info}")
+                        return 1
+                    
+                    # Method 3: Check if both are URLs from the same domain and similar path
+                    if '://' in file_info:  # file_info is also a URL
+                        actual_parsed = urlparse(file_info)
+                        # Same domain and similar path structure
+                        if (expected_parsed.netloc == actual_parsed.netloc and 
+                            expected_parsed.path in actual_parsed.path):
+                            return 1
+                except Exception as e:
+                    logger.debug(f"URL parsing error: {e}")
+                    pass
+                
+                logger.warning(f"URL mismatch - Expected: {expected_url}, Found: {file_info}")
+                return 0
+            else:
+                logger.warning(f"Could not find URL information in VLC status XML for rule: {rule}")
+                return 0
         else:
             logger.error(f"Unknown type: {rule['type']}")
             return 0
@@ -130,33 +224,67 @@ def compare_audios(audio_path_1, audio_path_2):
     Compare two audio files and return a similarity score in the range [0, 1].
     audio_path_1, audio_path_2: paths to the audio files to compare
     """
-    # Example Usage:
     # similarity = compare_audios_simple('path_to_audio1.mp3', 'path_to_audio2.mp3')
     # print(f'Similarity Score: {similarity}')
 
-    # Convert to common format if necessary and load audio
     if not audio_path_1 or not audio_path_2:
         return 0
 
-    # Load the audio files and extract MFCC features
-    y1, sr1 = librosa.load(audio_path_1)
-    mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1)
+    y1, y2 = None, None
+    try:
+        y1, sr1 = librosa.load(audio_path_1)
+    except Exception:
+        logger.warning(f"Could not load audio from {os.path.basename(audio_path_1)}. It might be empty or corrupt.")
 
-    y2, sr2 = librosa.load(audio_path_2)
-    mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2)
+    try:
+        y2, sr2 = librosa.load(audio_path_2)
+    except Exception:
+        logger.warning(f"Could not load audio from {os.path.basename(audio_path_2)}. It might be empty or corrupt.")
+
+    # Handle cases where one or both audio files are empty or corrupt.
+    is_y1_bad = (y1 is None) or (y1.shape[0] == 0)
+    is_y2_bad = (y2 is None) or (y2.shape[0] == 0)
+
+    if is_y1_bad and is_y2_bad:
+        logger.info("Both audio files are empty or corrupt. Considering them perfectly similar.")
+        return 1.0
+    
+    if is_y1_bad or is_y2_bad:
+        logger.warning(f"One audio file is empty/corrupt, the other is not. Similarity is 0.")
+        return 0.0
+
+    try:
+        logger.info(f"Audio 1 ({os.path.basename(audio_path_1)}): sr={sr1}, len={len(y1)}")
+        logger.info(f"Audio 2 ({os.path.basename(audio_path_2)}): sr={sr2}, len={len(y2)}")
+
+        # Extract MFCC features
+        mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1)
+        mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2)
+    except Exception as e:
+        logger.error(f"Error during MFCC extraction: {e}")
+        return 0.0
 
     # Normalize the MFCC features
     mfcc1 = librosa.util.normalize(mfcc1, axis=1)
     mfcc2 = librosa.util.normalize(mfcc2, axis=1)
+    logger.info(f"MFCCs normalized.")
 
     # Define a lambda function to compute cosine distance
     dist_func = lambda x, y: cosine(x, y)
 
     # Use the DTW algorithm to find the best alignment path
     distance, path = fastdtw(mfcc1.T, mfcc2.T, dist=dist_func)
+    logger.info(f"DTW distance: {distance:.4f}, Path length: {len(path)}")
 
-    # Calculate the similarity score, here we use 1/(1+distance) to convert distance to a similarity score
-    similarity = 1 / (1 + distance)
+    # Normalize the DTW distance by the length of the alignment path.
+    if len(path) == 0:
+        normalized_distance = np.inf
+    else:
+        normalized_distance = distance / len(path)
+    logger.info(f"Normalized DTW distance: {normalized_distance:.4f}")
+    
+    # Convert the normalized distance to a similarity score using an exponential decay function.
+    similarity = np.exp(-normalized_distance)
 
     return similarity
 
@@ -204,32 +332,6 @@ def compare_videos(video_path1, video_path2, max_frames_to_check=100, threshold=
     return 1.
 
 
-def are_audio_files_similar(mp3_file_path, mp4_file_path):
-    # Extract audio fingerprint from MP3 file
-    mp3_fingerprint, mp3_duration = acoustid.fingerprint_file(mp3_file_path)
-
-    # Extract the audio stream from the MP4 file
-    mp4_audio_path = os.path.splitext(mp4_file_path)[0] + '_extracted.mp3'
-    try:
-        subprocess.run(["ffmpeg", "-i", mp4_file_path, "-vn", "-ar", "44100", "-ac", "2", "-ab", "192k", "-f", "mp3",
-                        mp4_audio_path], check=True)
-    except subprocess.CalledProcessError as e:
-        print(f"An error occurred during audio extraction from MP4: {e}")
-        return 0.
-
-    # Extract audio fingerprint from the extracted audio
-    mp4_fingerprint, mp4_duration = acoustid.fingerprint_file(mp4_audio_path)
-
-    # Clean up temporary extracted audio file
-    os.remove(mp4_audio_path)
-
-    # Compare fingerprints (rudimentary comparison)
-    if mp3_duration >= mp4_duration and mp3_fingerprint == mp4_fingerprint:
-        return 1.
-
-    return 0.
-
-
 def check_qt_bgcone(actual_config_path, rule):
     with open(actual_config_path, 'rb') as file:
         config_file = file.read().decode('utf-8')
diff --git a/desktop_env/providers/aws/manager.py b/desktop_env/providers/aws/manager.py
index 076c182..15875d1 100644
--- a/desktop_env/providers/aws/manager.py
+++ b/desktop_env/providers/aws/manager.py
@@ -6,8 +6,8 @@ import logging
 import dotenv
 import signal
 
-# Using m5.large for better storage I/O performance to match io2 capabilities
-INSTANCE_TYPE = "m5.large"
+
+INSTANCE_TYPE = "t3.xlarge" 
 
 # Load environment variables from .env file
 dotenv.load_dotenv()
@@ -36,7 +36,7 @@ DEFAULT_REGION = "us-east-1"
 # todo: Add doc for the configuration of image, security group and network interface
 # todo: public the AMI images
 IMAGE_ID_MAP = {
-    "us-east-1": "ami-025b3f907e5c1b805",
+    "us-east-1": "ami-0cae20d2680c939d4",
     "ap-east-1": "ami-0c092a5b8be4116f5",
 }
 
diff --git a/desktop_env/server/main.py b/desktop_env/server/main.py
index c39943e..606d0ac 100644
--- a/desktop_env/server/main.py
+++ b/desktop_env/server/main.py
@@ -1135,11 +1135,21 @@ def get_file():
         return jsonify({"error": "file_path is required"}), 400
 
     try:
+        # Check if the file exists and get its size
+        if not os.path.exists(file_path):
+            return jsonify({"error": "File not found"}), 404
+        
+        file_size = os.path.getsize(file_path)
+        logger.info(f"Serving file: {file_path} ({file_size} bytes)")
+        
         # Check if the file exists and send it to the user
         return send_file(file_path, as_attachment=True)
     except FileNotFoundError:
         # If the file is not found, return a 404 error
         return jsonify({"error": "File not found"}), 404
+    except Exception as e:
+        logger.error(f"Error serving file {file_path}: {e}")
+        return jsonify({"error": f"Failed to serve file: {str(e)}"}), 500
 
 
 @app.route("/setup/upload", methods=["POST"])
@@ -1148,8 +1158,27 @@ def upload_file():
     if 'file_path' in request.form and 'file_data' in request.files:
         file_path = os.path.expandvars(os.path.expanduser(request.form['file_path']))
         file = request.files["file_data"]
-        file.save(file_path)
-        return "File Uploaded"
+        
+        try:
+            # Ensure target directory exists
+            os.makedirs(os.path.dirname(file_path), exist_ok=True)
+            
+            # Save file and get size for verification
+            file.save(file_path)
+            uploaded_size = os.path.getsize(file_path)
+            
+            logger.info(f"File uploaded successfully: {file_path} ({uploaded_size} bytes)")
+            return f"File Uploaded: {uploaded_size} bytes"
+            
+        except Exception as e:
+            logger.error(f"Error uploading file to {file_path}: {e}")
+            # Clean up partial file if it exists
+            if os.path.exists(file_path):
+                try:
+                    os.remove(file_path)
+                except:
+                    pass
+            return jsonify({"error": f"Failed to upload file: {str(e)}"}), 500
     else:
         return jsonify({"error": "file_path and file_data are required"}), 400
 
@@ -1208,20 +1237,45 @@ def download_file():
 
     max_retries = 3
     error: Optional[Exception] = None
+    
     for i in range(max_retries):
         try:
-            response = requests.get(url, stream=True)
+            logger.info(f"Download attempt {i+1}/{max_retries} for {url}")
+            response = requests.get(url, stream=True, timeout=300)
             response.raise_for_status()
+            
+            # Get expected file size if available
+            total_size = int(response.headers.get('content-length', 0))
+            if total_size > 0:
+                logger.info(f"Expected file size: {total_size / (1024*1024):.2f} MB")
 
+            downloaded_size = 0
             with open(path, 'wb') as f:
                 for chunk in response.iter_content(chunk_size=8192):
                     if chunk:
                         f.write(chunk)
-            return "File downloaded successfully"
+                        downloaded_size += len(chunk)
+                        if total_size > 0 and downloaded_size % (1024*1024) == 0:  # Log every MB
+                            progress = (downloaded_size / total_size) * 100
+                            logger.info(f"Download progress: {progress:.1f}%")
+            
+            # Verify download completeness
+            actual_size = os.path.getsize(path)
+            if total_size > 0 and actual_size != total_size:
+                raise Exception(f"Download incomplete. Expected {total_size} bytes, got {actual_size} bytes")
+            
+            logger.info(f"File downloaded successfully: {path} ({actual_size} bytes)")
+            return f"File downloaded successfully: {actual_size} bytes"
 
-        except requests.RequestException as e:
+        except (requests.RequestException, Exception) as e:
             error = e
-            logger.error(f"Failed to download {url}. Retrying... ({max_retries - i - 1} attempts left)")
+            logger.error(f"Failed to download {url}: {e}. Retrying... ({max_retries - i - 1} attempts left)")
+            # Clean up partial download
+            if path.exists():
+                try:
+                    path.unlink()
+                except:
+                    pass
 
     return f"Failed to download {url}. No retries left. Error: {error}", 500
 
diff --git a/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json b/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json
index 90440f6..f4b85b6 100644
--- a/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json
+++ b/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json
@@ -1,8 +1,8 @@
 {
   "id": "bba3381f-b5eb-4439-bd9e-80c22218d5a7",
   "snapshot": "base_setup",
-  "instruction": "Can you start streaming the video from this link for me? https://www.youtube.com/watch?v=pgBsyTKAwLw",
-  "source": "https://www.quora.com/How-do-I-play-online-videos-using-the-VLC-media-player",
+  "instruction": "Can you start streaming the video from this link for me? https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8",
+  "source": "https://developer.apple.com/streaming/examples/ - Apple HLS test streams for developers",
   "config": [
     {
       "type": "launch",
@@ -32,7 +32,7 @@
       "type": "rule",
       "rules": {
         "type": "url",
-        "url": "https://www.youtube.com/watch?v=pgBsyTKAwLw"
+        "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8"
       }
     },
     "result": {