feat(server): add cross-platform support and improve screenshot handling

2026-01-30 16:27:49 +08:00
parent 788b248dbc
commit 308282e830
1 changed files with 175 additions and 22 deletions
--- a/desktop_env/server/main.py
+++ b/desktop_env/server/main.py
@@ -4,25 +4,27 @@ import platform
 import shlex
 import json
 import subprocess, signal
+import sys
 import time
 from pathlib import Path
 from typing import Any, Optional, Sequence
 from typing import List, Dict, Tuple, Literal
 import concurrent.futures

-import Xlib
 import lxml.etree
 import pyautogui
 import requests
 import re
 from PIL import Image, ImageGrab
-from Xlib import display, X
 from flask import Flask, request, jsonify, send_file, abort  # , send_from_directory
 from lxml.etree import _Element

 platform_name: str = platform.system()

 if platform_name == "Linux":
+    import Xlib
+    from Xlib import display, X
+    from pyxcursor import Xcursor
    import pyatspi
    from pyatspi import Accessible, StateType, STATE_SHOWING
    from pyatspi import Action as ATAction
@@ -39,9 +41,14 @@ elif platform_name == "Windows":
    import win32ui, win32gui

    Accessible = Any
+    Xlib = None
+    display = None
+    X = None
+    Xcursor = None

 elif platform_name == "Darwin":
    import plistlib
+    from pyxcursor import Xcursor

    import AppKit
    import ApplicationServices
@@ -51,13 +58,16 @@ elif platform_name == "Darwin":

    Accessible = Any
    BaseWrapper = Any
+    Xlib = None

 else:
    # Platform not supported
    Accessible = None
    BaseWrapper = Any
-
-from pyxcursor import Xcursor
+    Xlib = None
+    display = None
+    X = None
+    Xcursor = None

 # todo: need to reformat and organize this whole file

@@ -89,6 +99,10 @@ def execute_command():
        if arg.startswith("~/"):
            command[i] = os.path.expanduser(arg)

+    # Replace 'python' with sys.executable to use the same Python interpreter as the server
+    if len(command) > 0 and command[0] in ['python', 'python3', 'python.exe', 'python3.exe']:
+        command[0] = sys.executable
+
    # Execute the command without any safety checks.
    try:
        if platform_name == "Windows":
@@ -262,15 +276,12 @@ def launch_app():

@app.route('/screenshot', methods=['GET'])
 def capture_screen_with_cursor():
-    # fixme: when running on virtual machines, the cursor is not captured, don't know why
-
    file_path = os.path.join(os.path.dirname(__file__), "screenshots", "screenshot.png")
    user_platform = platform.system()

    # Ensure the screenshots directory exists
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

-    # fixme: This is a temporary fix for the cursor not being captured on Windows and Linux
    if user_platform == "Windows":
        def get_cursor():
            hcursor = win32gui.GetCursorInfo()[1]
@@ -303,19 +314,53 @@ def capture_screen_with_cursor():

        ratio = ctypes.windll.shcore.GetScaleFactorForDevice(0) / 100

+        # get logical screen size
+        user32 = ctypes.windll.user32
+        logical_width = user32.GetSystemMetrics(0)
+        logical_height = user32.GetSystemMetrics(1)
+
+        # ===== Key fix: get cursor position before taking screenshot =====
+        # win32gui.GetCursorPos() returns logical coordinates (consistent with pyautogui)
+        pos_win = win32gui.GetCursorPos()
+        logger.info(f"Cursor position (logical coordinates): {pos_win}")
+
+        # Take screenshot immediately to reduce time difference
        img = ImageGrab.grab(bbox=None, include_layered_windows=True)
+        # =============================================
+
+        # ===== DPI scaling fix =====
+        if ratio != 1.0:
+            physical_width, physical_height = img.size
+            logger.info(f"Detected DPI scaling: {ratio}x ({ratio*100}%)")
+            logger.info(f"Physical screenshot size: {physical_width}x{physical_height}")
+            logger.info(f"Logical resolution: {logical_width}x{logical_height}")
+            logger.info(f"Resizing screenshot to match logical resolution...")
+            img = img.resize((logical_width, logical_height), Image.Resampling.LANCZOS)
+            logger.info(f"Screenshot resized to: {img.size}")
+        # ==========================

        try:
            cursor, (hotspotx, hotspoty) = get_cursor()

-            pos_win = win32gui.GetCursorPos()
-            pos = (round(pos_win[0]*ratio - hotspotx), round(pos_win[1]*ratio - hotspoty))
+            # ===== Cursor position handling =====
+            # win32gui.GetCursorPos() and pyautogui both use logical coordinates
+            # The screenshot has been resized to logical resolution, so use directly
+            logical_cursor_x = pos_win[0]
+            logical_cursor_y = pos_win[1]
+
+            pos = (logical_cursor_x - hotspotx, logical_cursor_y - hotspoty)
+
+            logger.info(f"Cursor position (logical coordinates): ({logical_cursor_x}, {logical_cursor_y})")
+            logger.info(f"Hotspot offset: ({hotspotx}, {hotspoty})")
+            logger.info(f"Final paste position: {pos}")
+            # ===================================

            img.paste(cursor, pos, cursor)
        except Exception as e:
-            logger.warning(f"Failed to capture cursor on Windows, screenshot will not have a cursor. Error: {e}")
+            logger.warning(f"Failed to capture cursor on Windows, screenshot will not include cursor. Error: {e}")

        img.save(file_path)
+       
    elif user_platform == "Linux":
        cursor_obj = Xcursor()
        imgarray = cursor_obj.getCursorImageArrayFast()
@@ -324,17 +369,19 @@ def capture_screen_with_cursor():
        cursor_x, cursor_y = pyautogui.position()
        screenshot.paste(cursor_img, (cursor_x, cursor_y), cursor_img)
        screenshot.save(file_path)
+        
    elif user_platform == "Darwin":  # (Mac OS)
-        # Use the screencapture utility to capture the screen with the cursor
        subprocess.run(["screencapture", "-C", file_path])
+        
    else:
        logger.warning(f"The platform you're using ({user_platform}) is not currently supported")

    return send_file(file_path, mimetype='image/png')


+
 def _has_active_terminal(desktop: Accessible) -> bool:
-    """ A quick check whether the terminal window is open and active.
+    """ A quick check whether the terminal window is open and active (Linux only).
    """
    for app in desktop:
        if app.getRoleName() == "application" and app.name == "gnome-terminal-server":
@@ -344,6 +391,87 @@ def _has_active_terminal(desktop: Accessible) -> bool:
    return False


+def _get_windows_terminal_output() -> Optional[str]:
+    """Return the text of the first visible Windows terminal window that has any.
+
+    The windows returned by pywinauto's UIA ``Desktop`` are filtered by
+    terminal-like substrings in their element name.  For each match, the
+    first non-empty group among its Edit, Document and Text descendants is
+    scanned for text; the window text itself is the fallback, unless it is
+    only one of a few well-known bare titles.
+
+    Returns:
+        The right-stripped text, or ``None`` when no window yields text or
+        when pywinauto cannot be imported or raises.
+    """
+    # Lower-case substrings that mark an element name as a terminal window.
+    terminal_markers = (
+        "windowsterminal.exe",  # Windows Terminal
+        "powershell.exe",       # PowerShell
+        "pwsh.exe",             # PowerShell Core
+        "cmd.exe",              # Command Prompt
+        "conhost.exe",          # Console Host
+        "terminal", "powershell", "command prompt", "cmd",
+    )
+    # Window texts that are only a title and carry no terminal output.
+    bare_titles = {
+        'Windows PowerShell', 'Command Prompt', 'PowerShell',
+        'Administrator: Windows PowerShell',
+    }
+
+    try:
+        # Imported here so the module still loads where pywinauto is missing.
+        from pywinauto import Desktop
+
+        desktop = Desktop(backend="uia")
+        for window in desktop.windows():
+            try:
+                # Skip windows that are hidden or minimized.
+                if not window.is_visible() or window.is_minimized():
+                    continue
+
+                # element_info.name is the element's name, not a process name.
+                window_name = (window.element_info.name or "").lower()
+                if not any(marker in window_name for marker in terminal_markers):
+                    continue
+
+                try:
+                    # Use the first control type that has any descendants.
+                    text_controls = []
+                    for control_type in ("Edit", "Document", "Text"):
+                        text_controls = window.descendants(control_type=control_type)
+                        if text_controls:
+                            break
+
+                    for control in text_controls:
+                        try:
+                            text = control.window_text()
+                            if text and text.strip():
+                                return text.rstrip()
+                        except Exception:
+                            # One unreadable control must not hide the others;
+                            # KeyboardInterrupt/SystemExit still propagate.
+                            continue
+
+                    # No control produced text: fall back to the window text.
+                    window_text = window.window_text()
+                    if (
+                        window_text
+                        and window_text.strip()
+                        and window_text not in bare_titles
+                    ):
+                        return window_text.rstrip()
+                except Exception as e:
+                    logger.debug(f"Error getting text from window {window_name}: {e}")
+            except Exception as e:
+                logger.debug(f"Error processing window: {e}")
+
+        return None
+    except Exception as e:
+        logger.error(f"Error in _get_windows_terminal_output: {e}")
+        return None
+
+
@app.route('/terminal', methods=['GET'])
 def get_terminal_output():
    user_platform = platform.system()
@@ -358,8 +486,10 @@ def get_terminal_output():
                xpath = '//application[@name="gnome-terminal-server"]/frame[@st:active="true"]//terminal[@st:focused="true"]'
                terminals: List[_Element] = desktop_xml.xpath(xpath, namespaces=_accessibility_ns_map_ubuntu)
                output = terminals[0].text.rstrip() if len(terminals) == 1 else None
-        else:  # windows and macos platform is not implemented currently
-            # raise NotImplementedError
+        elif user_platform == "Windows":
+            output = _get_windows_terminal_output()
+            logger.debug(f"Terminal output retrieved: {output}")
+        else:  # macOS platform is not implemented currently
            return "Currently not implemented for platform {:}.".format(platform.platform()), 500
        return jsonify({"output": output, "status": "success"})
    except Exception as e:
@@ -989,6 +1119,9 @@ def get_window_size():
    else:
        return jsonify({"error": "app_class_name is required"}), 400

+    if platform_name != "Linux":
+        return jsonify({"error": "window_size is only supported on Linux"}), 501
+
    d = display.Display()
    root = d.screen().root
    window_ids = root.get_full_property(d.intern_atom('_NET_CLIENT_LIST'), X.AnyPropertyType).value
@@ -1505,11 +1638,19 @@ def start_recording():
            logger.error(f"Error removing old recording file: {e}")
            return jsonify({'status': 'error', 'message': f'Failed to remove old recording file: {e}'}), 500

-    d = display.Display()
-    screen_width = d.screen().width_in_pixels
-    screen_height = d.screen().height_in_pixels
-
-    start_command = f"ffmpeg -y -f x11grab -draw_mouse 1 -s {screen_width}x{screen_height} -i :0.0 -c:v libx264 -r 30 {recording_path}"
+    if platform_name == "Linux":
+        d = display.Display()
+        screen_width = d.screen().width_in_pixels
+        screen_height = d.screen().height_in_pixels
+        start_command = f"ffmpeg -y -f x11grab -draw_mouse 1 -s {screen_width}x{screen_height} -i :0.0 -c:v libx264 -r 30 {recording_path}"
+    elif platform_name == "Windows":
+        user32 = ctypes.windll.user32
+        screen_width = user32.GetSystemMetrics(0)
+        screen_height = user32.GetSystemMetrics(1)
+        # Use gdigrab for Windows screen capture
+        start_command = f"ffmpeg -y -f gdigrab -draw_mouse 1 -framerate 30 -video_size {screen_width}x{screen_height} -i desktop -c:v libx264 -r 30 {recording_path}"
+    else:
+        return jsonify({'status': 'error', 'message': f'Recording not supported on {platform_name}'}), 501

    # Use stderr=PIPE to capture potential errors from ffmpeg
    recording_process = subprocess.Popen(shlex.split(start_command),
@@ -1544,11 +1685,22 @@ def end_recording():
    error_output = ""
    try:
        # Send SIGINT for a graceful shutdown, allowing ffmpeg to finalize the file.
-        recording_process.send_signal(signal.SIGINT)
+        # On Windows, ask ffmpeg to quit by writing 'q' to its stdin; on Unix, use SIGINT
+        if platform_name == "Windows":
+            # On Windows, we need to terminate the process gracefully
+            # ffmpeg responds to standard input 'q' to quit gracefully
+            try:
+                recording_process.stdin.write(b'q')
+                recording_process.stdin.flush()
+            except:
+                # If stdin is not available, use terminate
+                recording_process.terminate()
+        else:
+            recording_process.send_signal(signal.SIGINT)
        # Wait for ffmpeg to terminate. communicate() gets output and waits.
        _, error_output = recording_process.communicate(timeout=15)
    except subprocess.TimeoutExpired:
-        logger.error("ffmpeg did not respond to SIGINT, killing the process.")
+        logger.error("ffmpeg did not respond to stop signal, killing the process.")
        recording_process.kill()
        # After killing, communicate to get any remaining output.
        _, error_output = recording_process.communicate()
@@ -1589,8 +1741,9 @@ def run_python():
            f.write(code)
        
        # Execute the file using subprocess to capture all output
+        # Use sys.executable to use the same Python interpreter as the Flask server
        result = subprocess.run(
-            ['/usr/bin/python3', temp_filename],
+            [sys.executable, temp_filename],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,