feat(server): add cross-platform support and improve screenshot handling

This commit is contained in:
cui0711
2026-01-30 16:27:49 +08:00
parent 788b248dbc
commit 308282e830

View File

@@ -4,25 +4,27 @@ import platform
import shlex
import json
import subprocess, signal
import sys
import time
from pathlib import Path
from typing import Any, Optional, Sequence
from typing import List, Dict, Tuple, Literal
import concurrent.futures
import Xlib
import lxml.etree
import pyautogui
import requests
import re
from PIL import Image, ImageGrab
from Xlib import display, X
from flask import Flask, request, jsonify, send_file, abort # , send_from_directory
from lxml.etree import _Element
platform_name: str = platform.system()
if platform_name == "Linux":
import Xlib
from Xlib import display, X
from pyxcursor import Xcursor
import pyatspi
from pyatspi import Accessible, StateType, STATE_SHOWING
from pyatspi import Action as ATAction
@@ -39,9 +41,14 @@ elif platform_name == "Windows":
import win32ui, win32gui
Accessible = Any
Xlib = None
display = None
X = None
Xcursor = None
elif platform_name == "Darwin":
import plistlib
from pyxcursor import Xcursor
import AppKit
import ApplicationServices
@@ -51,13 +58,16 @@ elif platform_name == "Darwin":
Accessible = Any
BaseWrapper = Any
Xlib = None
else:
# Platform not supported
Accessible = None
BaseWrapper = Any
from pyxcursor import Xcursor
Xlib = None
display = None
X = None
Xcursor = None
# todo: need to reformat and organize this whole file
@@ -89,6 +99,10 @@ def execute_command():
if arg.startswith("~/"):
command[i] = os.path.expanduser(arg)
# Replace 'python' with sys.executable to use the same Python interpreter as the server
if len(command) > 0 and command[0] in ['python', 'python3', 'python.exe', 'python3.exe']:
command[0] = sys.executable
# Execute the command without any safety checks.
try:
if platform_name == "Windows":
@@ -262,15 +276,12 @@ def launch_app():
@app.route('/screenshot', methods=['GET'])
def capture_screen_with_cursor():
# fixme: when running on virtual machines, the cursor is not captured, don't know why
file_path = os.path.join(os.path.dirname(__file__), "screenshots", "screenshot.png")
user_platform = platform.system()
# Ensure the screenshots directory exists
os.makedirs(os.path.dirname(file_path), exist_ok=True)
# fixme: This is a temporary fix for the cursor not being captured on Windows and Linux
if user_platform == "Windows":
def get_cursor():
hcursor = win32gui.GetCursorInfo()[1]
@@ -303,19 +314,53 @@ def capture_screen_with_cursor():
ratio = ctypes.windll.shcore.GetScaleFactorForDevice(0) / 100
# get logical screen size
user32 = ctypes.windll.user32
logical_width = user32.GetSystemMetrics(0)
logical_height = user32.GetSystemMetrics(1)
# ===== Key fix: get cursor position before taking screenshot =====
# win32gui.GetCursorPos() returns logical coordinates (consistent with pyautogui)
pos_win = win32gui.GetCursorPos()
logger.info(f"Cursor position (logical coordinates): {pos_win}")
# Take screenshot immediately to reduce time difference
img = ImageGrab.grab(bbox=None, include_layered_windows=True)
# =============================================
# ===== DPI scaling fix =====
if ratio != 1.0:
physical_width, physical_height = img.size
logger.info(f"Detected DPI scaling: {ratio}x ({ratio*100}%)")
logger.info(f"Physical screenshot size: {physical_width}x{physical_height}")
logger.info(f"Logical resolution: {logical_width}x{logical_height}")
logger.info(f"Resizing screenshot to match logical resolution...")
img = img.resize((logical_width, logical_height), Image.Resampling.LANCZOS)
logger.info(f"Screenshot resized to: {img.size}")
# ==========================
try:
cursor, (hotspotx, hotspoty) = get_cursor()
pos_win = win32gui.GetCursorPos()
pos = (round(pos_win[0]*ratio - hotspotx), round(pos_win[1]*ratio - hotspoty))
# ===== Cursor position handling =====
# win32gui.GetCursorPos() and pyautogui both use logical coordinates
# The screenshot has been resized to logical resolution, so use directly
logical_cursor_x = pos_win[0]
logical_cursor_y = pos_win[1]
pos = (logical_cursor_x - hotspotx, logical_cursor_y - hotspoty)
logger.info(f"Cursor position (logical coordinates): ({logical_cursor_x}, {logical_cursor_y})")
logger.info(f"Hotspot offset: ({hotspotx}, {hotspoty})")
logger.info(f"Final paste position: {pos}")
# ===================================
img.paste(cursor, pos, cursor)
except Exception as e:
logger.warning(f"Failed to capture cursor on Windows, screenshot will not have a cursor. Error: {e}")
logger.warning(f"Failed to capture cursor on Windows, screenshot will not include cursor. Error: {e}")
img.save(file_path)
elif user_platform == "Linux":
cursor_obj = Xcursor()
imgarray = cursor_obj.getCursorImageArrayFast()
@@ -324,17 +369,19 @@ def capture_screen_with_cursor():
cursor_x, cursor_y = pyautogui.position()
screenshot.paste(cursor_img, (cursor_x, cursor_y), cursor_img)
screenshot.save(file_path)
elif user_platform == "Darwin": # (Mac OS)
# Use the screencapture utility to capture the screen with the cursor
subprocess.run(["screencapture", "-C", file_path])
else:
logger.warning(f"The platform you're using ({user_platform}) is not currently supported")
return send_file(file_path, mimetype='image/png')
def _has_active_terminal(desktop: Accessible) -> bool:
""" A quick check whether the terminal window is open and active.
""" A quick check whether the terminal window is open and active (Linux only).
"""
for app in desktop:
if app.getRoleName() == "application" and app.name == "gnome-terminal-server":
@@ -344,6 +391,87 @@ def _has_active_terminal(desktop: Accessible) -> bool:
return False
def _get_windows_terminal_output() -> Optional[str]:
    """Get terminal output on Windows platform.

    Enumerates the top-level windows exposed by pywinauto's UIA backend,
    skips invisible or minimized ones, and keeps those whose name contains
    one of the known terminal markers (Windows Terminal, PowerShell,
    Command Prompt, ConHost). For the first matching window that yields
    text, that text is returned.

    Text lookup order for a matching window:
      1. the first non-empty ``window_text()`` among its ``Edit`` descendants,
         or ``Document`` descendants if there are no ``Edit`` ones, or
         ``Text`` descendants if there are neither;
      2. the window's own ``window_text()``, unless it is just one of the
         well-known bare window titles.

    Returns:
        The text with trailing whitespace stripped, or ``None`` when no
        terminal window produced any text or when pywinauto could not be
        imported / used (the error is logged, not raised).
    """
    # Lower-case substrings that mark a window name as belonging to a terminal.
    # The executable names are kept for parity with the original list even
    # though the shorter generic markers already cover most of them.
    terminal_markers = (
        "windowsterminal.exe",  # Windows Terminal
        "powershell.exe",       # PowerShell
        "pwsh.exe",             # PowerShell Core
        "cmd.exe",              # Command Prompt
        "conhost.exe",          # Console Host
        "terminal",
        "powershell",
        "command prompt",
        "cmd",
    )
    # Window texts that are only a title and carry no terminal content.
    bare_titles = (
        'Windows PowerShell',
        'Command Prompt',
        'PowerShell',
        'Administrator: Windows PowerShell',
    )

    try:
        # Imported lazily: pywinauto is only installed on Windows hosts.
        from pywinauto import Desktop

        desktop = Desktop(backend="uia")

        for window in desktop.windows():
            try:
                # Only consider windows the user can actually see.
                if not window.is_visible() or window.is_minimized():
                    continue

                # NOTE(review): element_info.name appears to be the window's
                # UIA name (its title), not the process name - confirm.
                # Guard against a missing name so .lower() cannot fail.
                window_name = (window.element_info.name or "").lower()

                if not any(marker in window_name for marker in terminal_markers):
                    continue

                try:
                    # Prefer Edit controls, then Document, then Text.
                    text_controls = (
                        window.descendants(control_type="Edit")
                        or window.descendants(control_type="Document")
                        or window.descendants(control_type="Text")
                    )

                    for control in text_controls:
                        try:
                            text = control.window_text()
                            if text and text.strip():
                                return text.rstrip()
                        except Exception as e:
                            # Best effort: one unreadable control must not
                            # stop the scan, but leave a trace for debugging.
                            logger.debug(f"Error reading text from control in window {window_name}: {e}")

                    # No control produced text; fall back to the window itself.
                    window_text = window.window_text()
                    if window_text and window_text.strip():
                        # Filter out just the window title
                        if window_text not in bare_titles:
                            return window_text.rstrip()
                except Exception as e:
                    logger.debug(f"Error getting text from window {window_name}: {e}")
                    continue

            except Exception as e:
                logger.debug(f"Error processing window: {e}")
                continue

        return None
    except Exception as e:
        logger.error(f"Error in _get_windows_terminal_output: {e}")
        return None
@app.route('/terminal', methods=['GET'])
def get_terminal_output():
user_platform = platform.system()
@@ -358,8 +486,10 @@ def get_terminal_output():
xpath = '//application[@name="gnome-terminal-server"]/frame[@st:active="true"]//terminal[@st:focused="true"]'
terminals: List[_Element] = desktop_xml.xpath(xpath, namespaces=_accessibility_ns_map_ubuntu)
output = terminals[0].text.rstrip() if len(terminals) == 1 else None
else: # windows and macos platform is not implemented currently
# raise NotImplementedError
elif user_platform == "Windows":
output = _get_windows_terminal_output()
logger.debug(f"Terminal output retrieved: {output}")
else: # macOS platform is not implemented currently
return "Currently not implemented for platform {:}.".format(platform.platform()), 500
return jsonify({"output": output, "status": "success"})
except Exception as e:
@@ -989,6 +1119,9 @@ def get_window_size():
else:
return jsonify({"error": "app_class_name is required"}), 400
if platform_name != "Linux":
return jsonify({"error": "window_size is only supported on Linux"}), 501
d = display.Display()
root = d.screen().root
window_ids = root.get_full_property(d.intern_atom('_NET_CLIENT_LIST'), X.AnyPropertyType).value
@@ -1505,11 +1638,19 @@ def start_recording():
logger.error(f"Error removing old recording file: {e}")
return jsonify({'status': 'error', 'message': f'Failed to remove old recording file: {e}'}), 500
d = display.Display()
screen_width = d.screen().width_in_pixels
screen_height = d.screen().height_in_pixels
start_command = f"ffmpeg -y -f x11grab -draw_mouse 1 -s {screen_width}x{screen_height} -i :0.0 -c:v libx264 -r 30 {recording_path}"
if platform_name == "Linux":
d = display.Display()
screen_width = d.screen().width_in_pixels
screen_height = d.screen().height_in_pixels
start_command = f"ffmpeg -y -f x11grab -draw_mouse 1 -s {screen_width}x{screen_height} -i :0.0 -c:v libx264 -r 30 {recording_path}"
elif platform_name == "Windows":
user32 = ctypes.windll.user32
screen_width = user32.GetSystemMetrics(0)
screen_height = user32.GetSystemMetrics(1)
# Use gdigrab for Windows screen capture
start_command = f"ffmpeg -y -f gdigrab -draw_mouse 1 -framerate 30 -video_size {screen_width}x{screen_height} -i desktop -c:v libx264 -r 30 {recording_path}"
else:
return jsonify({'status': 'error', 'message': f'Recording not supported on {platform_name}'}), 501
# Use stderr=PIPE to capture potential errors from ffmpeg
recording_process = subprocess.Popen(shlex.split(start_command),
@@ -1544,11 +1685,22 @@ def end_recording():
error_output = ""
try:
# Send SIGINT for a graceful shutdown, allowing ffmpeg to finalize the file.
recording_process.send_signal(signal.SIGINT)
# On Windows, ask ffmpeg to quit by writing 'q' to its stdin (falling back to terminate()); on Unix, send SIGINT
if platform_name == "Windows":
# On Windows, we need to terminate the process gracefully
# ffmpeg responds to standard input 'q' to quit gracefully
try:
recording_process.stdin.write(b'q')
recording_process.stdin.flush()
except:
# If stdin is not available, use terminate
recording_process.terminate()
else:
recording_process.send_signal(signal.SIGINT)
# Wait for ffmpeg to terminate. communicate() gets output and waits.
_, error_output = recording_process.communicate(timeout=15)
except subprocess.TimeoutExpired:
logger.error("ffmpeg did not respond to SIGINT, killing the process.")
logger.error("ffmpeg did not respond to stop signal, killing the process.")
recording_process.kill()
# After killing, communicate to get any remaining output.
_, error_output = recording_process.communicate()
@@ -1589,8 +1741,9 @@ def run_python():
f.write(code)
# Execute the file using subprocess to capture all output
# Use sys.executable to use the same Python interpreter as the Flask server
result = subprocess.run(
['/usr/bin/python3', temp_filename],
[sys.executable, temp_filename],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,