Finish Impress v2 loading; some pre-processing still left to do

2024-01-23 15:29:23 +08:00
parent 177bae1e92
commit 42b433dce8
8 changed files with 184 additions and 77 deletions
--- a/desktop_env/evaluators/metrics/vlc.py
+++ b/desktop_env/evaluators/metrics/vlc.py
@@ -10,6 +10,8 @@ import imagehash
 import librosa
 import numpy as np
 from PIL import Image
+from fastdtw import fastdtw
+from scipy.spatial.distance import cosine
 from skimage.metrics import structural_similarity as ssim

 logger = logging.getLogger("desktopenv.metrics.vlc")
@@ -111,11 +113,10 @@ def compare_images(image1_path, image2_path):
    return similarity_index


-def compare_audios(audio_path_1, audio_path_2, max_distance=1000):
+def compare_audios(audio_path_1, audio_path_2):
    """
    Compare two audio files and return a similarity score in the range [0, 1].
    audio_path_1, audio_path_2: paths to the audio files to compare
-    max_distance: an empirically determined maximum expected DTW distance
    """
    # Example Usage:
    # similarity = compare_audios('path_to_audio1.mp3', 'path_to_audio2.mp3')
@@ -125,21 +126,31 @@ def compare_audios(audio_path_1, audio_path_2, max_distance=1000):
    if not audio_path_1 or not audio_path_2:
        return 0

+    # Load the audio files and extract MFCC features
    y1, sr1 = librosa.load(audio_path_1)
-    y2, sr2 = librosa.load(audio_path_2)
-
-    # Extract MFCC features
    mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1)
+
+    y2, sr2 = librosa.load(audio_path_2)
    mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2)

-    # Compute Dynamic Time Warping distance
-    distance, path = librosa.sequence.dtw(mfcc1.T, mfcc2.T)
+    # Normalize the MFCC features
+    mfcc1 = librosa.util.normalize(mfcc1, axis=1)
+    mfcc2 = librosa.util.normalize(mfcc2, axis=1)

-    # Normalize distance to get a similarity score
-    normalized_distance = np.mean(distance) / max_distance
-    similarity_score = 1 - min(normalized_distance, 1)  # Ensure the score is within [0, 1]
+    # Define a lambda function to compute cosine distance
+    dist_func = lambda x, y: cosine(x, y)

-    return similarity_score
+    # Use the DTW algorithm to find the best alignment path
+    distance, path = fastdtw(mfcc1.T, mfcc2.T, dist=dist_func)
+
+    # Convert the accumulated DTW distance into a similarity score using 1 / (1 + distance)
+    similarity = 1 / (1 + distance)
+
+    return similarity
+
+
+def compare_audios_by_dl_model(audio_path_1, audio_path_2):
+    pass


 def compare_videos(video_path1, video_path2, max_frames_to_check=100, threshold=5):