# sci-gui-agent-benchmark/desktop_env/evaluators/metrics/vlc.py

import logging
import os
import subprocess
from typing import Dict
from xml.etree import ElementTree

import acoustid
import cv2
import imagehash
from skimage.metrics import structural_similarity as ssim
import librosa
from PIL import Image
import numpy as np

logger = logging.getLogger("desktopenv.metrics.vlc")


def is_vlc_playing(actual_status_path: str, rule: Dict[str, str]) -> float:
    """
    Checks if VLC is currently playing the expected file or URL.

    Args:
        actual_status_path: Path to a file containing VLC's status XML,
            encoded as UTF-8.
        rule: Expected media. ``rule['type']`` must be ``'file_name'`` or
            ``'url'``; the expected value is read from the key of the same
            name (``rule['file_name']`` or ``rule['url']``).

    Returns:
        1 if the ``state`` element is ``playing`` and the reported
        filename/url ends with the expected value, otherwise 0. A status
        document that lacks the ``state`` element or the relevant ``info``
        element (or has it empty) also yields 0.
    """
    with open(actual_status_path, 'rb') as file:
        actual_status = file.read().decode('utf-8')

    tree = ElementTree.fromstring(actual_status)

    # findtext() yields None for a missing element, so an incomplete status
    # document is scored 0 instead of raising AttributeError on `.text`.
    if tree.findtext('state') != 'playing':
        return 0

    rule_type = rule['type']
    if rule_type == 'file_name':
        info_name = 'filename'
    elif rule_type == 'url':
        info_name = 'url'
    else:
        logger.error(f"Unknown type: {rule['type']}")
        return 0

    file_info = tree.findtext(
        f'information/category[@name="meta"]/info[@name="{info_name}"]'
    )
    # Missing or empty metadata cannot match anything.
    if not file_info:
        return 0

    return 1 if file_info.endswith(rule[rule_type]) else 0


def is_vlc_recordings_folder(actual_config_path: str, rule: Dict[str, str]) -> float:
    """
    Checks if VLC's recording folder is set to the expected value.

    The configuration file is scanned line by line; comment lines (starting
    with ``#``) and blank lines are ignored. The first ``key=value`` line
    whose key names the recording directory decides the result.

    Args:
        actual_config_path: Path to the VLC configuration file (UTF-8).
        rule: Must contain ``'recording_file_path'``, the expected directory.

    Returns:
        1 if the configured recording directory equals the expected one,
        otherwise 0 (different value, key absent, or file unreadable).
    """
    expected_recording_file_path = rule['recording_file_path']

    # 'recorded_files_path' is the key this check has always looked for;
    # 'input-record-path' is the option name VLC itself uses for the record
    # directory, so a real vlcrc is matched as well.
    recording_keys = ('recorded_files_path', 'input-record-path')

    try:
        with open(actual_config_path, 'rb') as file:
            config_file = file.read().decode('utf-8')
    except FileNotFoundError:
        logger.error("VLC configuration file not found.")
        return 0
    except (OSError, UnicodeDecodeError) as e:
        logger.error(f"An error occurred: {e}")
        return 0

    # splitlines() walks real lines; iterating the string itself would walk
    # single characters and never see a whole setting.
    for raw_line in config_file.splitlines():
        line = raw_line.strip()
        # Skip comments and empty lines
        if not line or line.startswith('#'):
            continue
        # Split on the first '=' only so values containing '=' stay intact
        key, separator, value = line.partition('=')
        if separator and key.strip() in recording_keys:
            return 1 if value.strip() == expected_recording_file_path else 0

    # The configuration key was not found in the file
    return 0


def is_vlc_fullscreen(actual_window_size, screen_size):
    """
    Checks if the VLC window covers the whole screen.

    Args:
        actual_window_size: Mapping with ``'width'`` and ``'height'`` of the
            VLC window, or None when the size could not be obtained.
        screen_size: Mapping with ``'width'`` and ``'height'`` of the screen,
            or None when the size could not be obtained.

    Returns:
        1 if both width and height match exactly, otherwise 0.
    """
    # Without both measurements the window cannot be confirmed as fullscreen;
    # score 0 rather than failing with a TypeError on None.
    if actual_window_size is None or screen_size is None:
        return 0

    same_width = actual_window_size['width'] == screen_size['width']
    if same_width and actual_window_size['height'] == screen_size['height']:
        return 1
    return 0


def compare_images(image1_path, image2_path):
    """
    Return the structural similarity (SSIM) of two image files.

    Both images are loaded as greyscale and resized to a common size before
    the comparison. The common size is ``min()`` of the two ``size`` tuples,
    i.e. the tuples are compared element by element and the smaller one is
    used as-is for both images.

    Example:
        score = compare_images('path_to_image1', 'path_to_image2')
        print("Similarity score:", score)
    """
    grey_first = Image.open(image1_path).convert('L')
    grey_second = Image.open(image2_path).convert('L')

    # Pick one of the two original sizes as the shared comparison size
    target_size = min(grey_first.size, grey_second.size)

    # Bring each image to the shared size and turn it into a pixel array
    pixels_first, pixels_second = (
        np.array(picture.resize(target_size, Image.Resampling.LANCZOS))
        for picture in (grey_first, grey_second)
    )

    return ssim(pixels_first, pixels_second)


def compare_audios(audio_path_1, audio_path_2, max_distance=1000):
    """
    Compare two audio files and return a similarity score in the range [0, 1].

    MFCC features are extracted from both files and aligned with dynamic
    time warping; the resulting cost is scaled by ``max_distance`` and
    turned into a similarity.

    Args:
        audio_path_1, audio_path_2: paths to the audio files to compare
        max_distance: an empirically determined maximum expected DTW distance;
            costs at or above it map to a similarity of 0

    Returns:
        ``1 - min(cost / max_distance, 1)``.

    Example:
        similarity = compare_audios('path_to_audio1.mp3', 'path_to_audio2.mp3')
        print(f'Similarity Score: {similarity}')
    """
    # Load audio
    y1, sr1 = librosa.load(audio_path_1)
    y2, sr2 = librosa.load(audio_path_2)

    # Extract MFCC features
    mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1)
    mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2)

    # librosa.sequence.dtw takes (n_features, n_frames) matrices that share
    # the feature axis, which is the layout mfcc() already returns. Passing
    # the transposes puts the frame count on the shared axis, so clips of
    # different length are rejected.
    distance, _ = librosa.sequence.dtw(mfcc1, mfcc2)

    # Normalize distance to get a similarity score.
    # NOTE(review): this averages the whole accumulated-cost matrix rather
    # than reading the final alignment cost; max_distance has to be
    # calibrated against that quantity -- confirm this is intended.
    normalized_distance = np.mean(distance) / max_distance
    similarity_score = 1 - min(normalized_distance, 1)  # Ensure the score is within [0, 1]

    return similarity_score


def compare_videos(video_path1, video_path2, max_frames_to_check=100, threshold=5):
    """
    Check whether two video files show the same content.

    Frames are read in lockstep from both files and compared by perceptual
    hash. At most ``max_frames_to_check`` frame pairs are examined.

    Args:
        video_path1, video_path2: paths to the videos to compare
        max_frames_to_check: upper bound on the number of frame pairs read
        threshold: used twice -- a frame pair counts as a mismatch when its
            hash difference exceeds it, and the videos count as different
            once the number of mismatching pairs exceeds it

    Returns:
        True if the videos appear to be the same, False otherwise. If a
        video ends before the frame limit, the result is True only when
        both ended on the same read. False is also returned when either
        file cannot be opened.
    """
    # Open both video files
    cap1 = cv2.VideoCapture(video_path1)
    cap2 = cv2.VideoCapture(video_path2)

    try:
        # Two unreadable inputs must not be reported as matching videos
        if not cap1.isOpened() or not cap2.isOpened():
            logger.error("Could not open one of the videos to compare.")
            return False

        frames_checked = 0
        mismatch_count = 0

        while frames_checked < max_frames_to_check:
            # Read frames from both videos
            ret1, frame1 = cap1.read()
            ret2, frame2 = cap2.read()

            # If a video ends, then check if both ended to confirm they are of the same length
            if not ret1 or not ret2:
                return ret1 == ret2

            # Convert frames to PIL Images
            frame1 = Image.fromarray(cv2.cvtColor(frame1, cv2.COLOR_BGR2RGB))
            frame2 = Image.fromarray(cv2.cvtColor(frame2, cv2.COLOR_BGR2RGB))

            # Compute the perceptual hash for each frame
            hash1 = imagehash.phash(frame1)
            hash2 = imagehash.phash(frame2)

            # Increment the frames checked
            frames_checked += 1

            # Compute the difference in the hashes
            if hash1 - hash2 > threshold:
                mismatch_count += 1
                # Too many differing frames: the videos are not the same
                if mismatch_count > threshold:
                    return False

        # If we reach here, the content appears to be the same
        return True
    finally:
        # Always give the capture handles back, whichever path returned
        cap1.release()
        cap2.release()


def are_audio_files_similar(mp3_file_path, mp4_file_path):
    """
    Check whether an MP3 file matches the audio track of an MP4 file.

    The MP4's audio is extracted with ffmpeg into a temporary
    ``<mp4 name>_extracted.mp3`` next to the MP4, both files are
    fingerprinted with acoustid, and the temporary file is removed again.

    Args:
        mp3_file_path: path to the MP3 file
        mp4_file_path: path to the MP4 file whose audio is extracted

    Returns:
        True if the MP3 is at least as long as the extracted audio and both
        fingerprints are identical; False otherwise, including when the
        audio extraction fails.
    """
    # Extract audio fingerprint from MP3 file
    mp3_fingerprint, mp3_duration = acoustid.fingerprint_file(mp3_file_path)

    # Extract the audio stream from the MP4 file
    mp4_audio_path = os.path.splitext(mp4_file_path)[0] + '_extracted.mp3'
    try:
        try:
            # "-y" overwrites a stale file left by an earlier run instead of
            # letting ffmpeg stop to ask for confirmation
            subprocess.run(["ffmpeg", "-y", "-i", mp4_file_path, "-vn", "-ar", "44100", "-ac", "2", "-ab", "192k",
                            "-f", "mp3", mp4_audio_path], check=True)
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            # FileNotFoundError covers a missing ffmpeg executable
            logger.error(f"An error occurred during audio extraction from MP4: {e}")
            return False

        # Extract audio fingerprint from the extracted audio
        mp4_fingerprint, mp4_duration = acoustid.fingerprint_file(mp4_audio_path)
    finally:
        # Clean up the temporary extracted audio file on every exit path
        if os.path.exists(mp4_audio_path):
            os.remove(mp4_audio_path)

    # Compare fingerprints (rudimentary comparison)
    return mp3_duration >= mp4_duration and mp3_fingerprint == mp4_fingerprint