Merge remote-tracking branch 'origin/main'

# Conflicts: # mm_agents/gpt_4v_agent.py
2024-02-02 14:37:23 +08:00
parent 068c6f5769 3184f091a8
commit e07a3d52ce
25 changed files with 1141 additions and 46 deletions
--- a/desktop_env/controllers/setup.py
+++ b/desktop_env/controllers/setup.py
@@ -7,6 +7,7 @@ import uuid
 import tempfile
 from typing import Any, Union, Optional
 from typing import Dict, List
+import os

 import requests
 from pydrive.auth import GoogleAuth
@@ -114,6 +115,7 @@ class SetupController:
            if not os.path.exists(cache_path):
                max_retries = 3
                downloaded = False
+                e = None
                for i in range(max_retries):
                    try:
                        response = requests.get(url, stream=True)
@@ -128,7 +130,7 @@ class SetupController:
                        break

                    except requests.RequestException as e:
-                        logger.error(f"Failed to download {url}. Retrying... ({max_retries - i - 1} attempts left)")
+                        logger.error(f"Failed to download {url} caused by {e}. Retrying... ({max_retries - i - 1} attempts left)")
                if not downloaded:
                    raise requests.RequestException(f"Failed to download {url}. No retries left. Error: {e}")

@@ -344,39 +346,49 @@ class SetupController:
        port = 9222  # fixme: this port is hard-coded, need to be changed from config file

        remote_debugging_url = f"http://{host}:{port}"
-        with sync_playwright() as p:
+        logger.info("Connect to Chrome @: %s", remote_debugging_url)
+        logger.debug("PLAYWRIGHT ENV: %s", repr(os.environ))
+        for attempt in range(15):
+            if attempt>0:
+                time.sleep(5)
+
            browser = None
-            for attempt in range(15):
+            with sync_playwright() as p:
                try:
                    browser = p.chromium.connect_over_cdp(remote_debugging_url)
-                    break
+                    #break
                except Exception as e:
                    if attempt < 14:
                        logger.error(f"Attempt {attempt + 1}: Failed to connect, retrying. Error: {e}")
-                        time.sleep(1)
+                        #time.sleep(10)
+                        continue
                    else:
                        logger.error(f"Failed to connect after multiple attempts: {e}")
                        raise e

-            if not browser:
-                return
+                if not browser:
+                    return

-            for i, url in enumerate(urls_to_open):
-                # Use the first context (which should be the only one if using default profile)
-                if i == 0:
-                    context = browser.contexts[0]
+                logger.info("Opening %s...", urls_to_open)
+                for i, url in enumerate(urls_to_open):
+                    # Use the first context (which should be the only one if using default profile)
+                    if i == 0:
+                        context = browser.contexts[0]

-                page = context.new_page()  # Create a new page (tab) within the existing context
-                page.goto(url, timeout=60000)
-                logger.info(f"Opened tab {i + 1}: {url}")
+                    page = context.new_page()  # Create a new page (tab) within the existing context
+                    try:
+                        page.goto(url, timeout=60000)
+                    except:
+                        logger.warning("Opening %s exceeds time limit", url) # only for human test
+                    logger.info(f"Opened tab {i + 1}: {url}")

-                if i == 0:
-                    # clear the default tab
-                    default_page = context.pages[0]
-                    default_page.close()
+                    if i == 0:
+                        # clear the default tab
+                        default_page = context.pages[0]
+                        default_page.close()

-            # Do not close the context or browser; they will remain open after script ends
-            return browser, context
+                # Do not close the context or browser; they will remain open after script ends
+                return browser, context

    def _chrome_close_tabs_setup(self, urls_to_close: List[str]):
        time.sleep(5)  # Wait for Chrome to finish launching
@@ -552,4 +564,4 @@ class SetupController:
            else:
                raise NotImplementedError

-            return browser, context
+            return browser, context
--- a/desktop_env/evaluators/README.md
+++ b/desktop_env/evaluators/README.md
@@ -191,7 +191,7 @@ To enable and use the HTTP interface in VLC Media Player for remote control and
 #### 4. Configure Lua HTTP

 - Expand the `Main interfaces` node and select `Lua`.
- Under `Lua HTTP`, set a password in the `Lua HTTP` section. This password will be required to access the HTTP interface.
+- Under `Lua HTTP`, set the password to `password`. This password will be required to access the HTTP interface.

 #### 5. Save and Restart VLC

@@ -217,4 +217,4 @@ pip install opencv-python-headless Pillow imagehash
 - If the port is in use by another application, you may change the port number in VLC's settings.

 ## GIMP
-Click on the "Keep" of the image loading pop-up.
+Click "Keep" in the image loading pop-up.
--- a/desktop_env/evaluators/metrics/init.py
+++ b/desktop_env/evaluators/metrics/init.py
@@ -43,6 +43,7 @@ from .docs import (
    compare_highlighted_text,
    is_first_line_centered,
    check_file_exists,
+    check_tabstops,
    compare_contains_image
 )
 from .general import (
--- a/desktop_env/evaluators/metrics/docs.py
+++ b/desktop_env/evaluators/metrics/docs.py
@@ -6,11 +6,13 @@ import zipfile
 from typing import List, Dict, Any

 from docx import Document
-from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
+from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_TAB_ALIGNMENT
 from docx.shared import RGBColor
 from odf.opendocument import load
 from odf.text import P
 from odf.text import Span
+from skimage.color import deltaE_ciede2000
+from skimage.color import rgb2lab

 logger = logging.getLogger("desktopenv.metric.docs")

@@ -141,7 +143,7 @@ def compare_docx_tables(docx_file1, docx_file2):
        # Compare each cell
        for i in range(len(table1.rows)):
            for j in range(len(table1.columns)):
-                if table1.cell(i, j).text != table2.cell(i, j).text:
+                if table1.cell(i, j).text.strip() != table2.cell(i, j).text.strip():
                    return 0

    return 1
@@ -234,6 +236,40 @@ def check_file_exists(directory, filename):
    return 1 if os.path.isfile(file_path) else 0


+def check_tabstops(docx_file1, docx_file2, **kwargs) -> float:
+    """Score in [0, 1] how closely the tab stops of docx_file1 match those of docx_file2.
+
+    Optional kwargs: 'word_number_split_by_tabstop' is the word count required in one tab-separated
+    segment of every non-empty paragraph of docx_file1; 'index' picks that segment (default 0).
+    Returns .0 on any count/alignment mismatch, else 1 minus the mean per-paragraph position gap,
+    taken relative to page width minus side margins of docx_file2's first section (floored at .0).
+    """
+    doc1: Document = Document(docx_file1)
+    doc2: Document = Document(docx_file2)
+    para1 = [p for p in doc1.paragraphs if p.text.strip()]
+    para2 = [p for p in doc2.paragraphs if p.text.strip()]
+    if len(para1) != len(para2): return .0
+    if not para1: return 1.0  # no text paragraphs on either side; avoids dividing by zero below
+    number = kwargs.get('word_number_split_by_tabstop', None)
+    if number is not None:
+        index = kwargs.get('index', 0)
+        for p1 in para1:
+            splits = p1.text.split('\t')
+            if not -len(splits) <= index < len(splits): return .0  # no such segment (was an IndexError)
+            if len(splits[index].split()) != number: return .0
+    section = doc2.sections[0]
+    paragraph_width = section.page_width - section.left_margin - section.right_margin
+    # CLEAR tab stops and the default left tab stop at position 0 are not compared
+    ignore_tabs = lambda x: x.alignment == WD_TAB_ALIGNMENT.CLEAR or (x.alignment == WD_TAB_ALIGNMENT.LEFT and x.position == 0)
+    minus = .0
+    for p1, p2 in zip(para1, para2):
+        tabs1 = [tst for tst in p1.paragraph_format.tab_stops if not ignore_tabs(tst)]
+        tabs2 = [tst for tst in p2.paragraph_format.tab_stops if not ignore_tabs(tst)]
+        if len(tabs1) != len(tabs2) or any(t1.alignment != t2.alignment for t1, t2 in zip(tabs1, tabs2)): return .0
+        minus += sum(abs(t1.position - t2.position) for t1, t2 in zip(tabs1, tabs2)) / paragraph_width
+    return max(.0, 1 - minus / len(para1))
+
+
 def compare_contains_image(docx_file1, docx_file2):
    doc1 = Document(docx_file1)
    doc2 = Document(docx_file2)
@@ -258,10 +294,18 @@ def compare_contains_image(docx_file1, docx_file2):
 # print(find_default_font("Ani", config_path))


-def evaluate_colored_words_in_tables(file_path1, file_path2):
+def evaluate_colored_words_in_tables(file_path1, file_path2, **kwargs):
    if not compare_docx_files(file_path1, file_path2):
        return 0
    document = Document(file_path1)
+    threshold = kwargs.get('threshold', 3.5)
+
+    def _calculate_color_difference(rgb1, rgb2):
+        # CIEDE2000 distance between two 0-255 RGB triples; an unset colour (None) never matches.
+        if rgb1 is None or rgb2 is None: return float('inf')
+        lab1 = rgb2lab([channel / 255.0 for channel in rgb1])
+        lab2 = rgb2lab([channel / 255.0 for channel in rgb2])
+        return deltaE_ciede2000(lab1, lab2)

    for table in document.tables:
        # Iterate through rows and cells in the table
@@ -273,9 +317,9 @@ def evaluate_colored_words_in_tables(file_path1, file_path2):
                        if word:
                            first_letter = word[0].lower()

-                            if first_letter in 'aeiou' and run.font.color.rgb != RGBColor(255, 0, 0):
+                            if first_letter in 'aeiou' and _calculate_color_difference(run.font.color.rgb, RGBColor(255, 0, 0)) > threshold:
                                return 0  # Vowel-colored words should be red
-                            elif first_letter not in 'aeiou' and run.font.color.rgb != RGBColor(0, 0, 255):
+                            elif first_letter not in 'aeiou' and _calculate_color_difference(run.font.color.rgb, RGBColor(0, 0, 255)) > threshold:
                                return 0  # Non-vowel-colored words should be blue

    return 1  # All words in tables are correctly colored
--- a/desktop_env/server/main.py
+++ b/desktop_env/server/main.py
@@ -2,7 +2,7 @@ import ctypes
 import os
 import platform
 import shlex
-import subprocess
+import subprocess, signal
 from pathlib import Path
 from typing import Any, Optional
 from typing import List, Dict, Tuple
@@ -997,7 +997,7 @@ def start_recording():

    start_command = f"ffmpeg -y -f x11grab -draw_mouse 1 -s {screen_width}x{screen_height} -i :0.0 -c:v libx264 -r 30 {recording_path}"

-    recording_process = subprocess.Popen(shlex.split(start_command), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    recording_process = subprocess.Popen(shlex.split(start_command), stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    return jsonify({'status': 'success', 'message': 'Started recording.'})

@@ -1009,10 +1009,8 @@ def end_recording():
    if not recording_process:
        return jsonify({'status': 'error', 'message': 'No recording in progress to stop.'}), 400

-    recording_process.terminate()
+    recording_process.send_signal(signal.SIGINT)
    recording_process.wait()
-    # return_code = recording_process.returncode
-    output, error = recording_process.communicate()
    recording_process = None

    # return recording video file