feat(server): add cross-platform support and improve screenshot handling

This commit is contained in:
cui0711
2026-01-30 16:27:49 +08:00
parent 788b248dbc
commit 308282e830

View File

@@ -4,25 +4,27 @@ import platform
import shlex import shlex
import json import json
import subprocess, signal import subprocess, signal
import sys
import time import time
from pathlib import Path from pathlib import Path
from typing import Any, Optional, Sequence from typing import Any, Optional, Sequence
from typing import List, Dict, Tuple, Literal from typing import List, Dict, Tuple, Literal
import concurrent.futures import concurrent.futures
import Xlib
import lxml.etree import lxml.etree
import pyautogui import pyautogui
import requests import requests
import re import re
from PIL import Image, ImageGrab from PIL import Image, ImageGrab
from Xlib import display, X
from flask import Flask, request, jsonify, send_file, abort # , send_from_directory from flask import Flask, request, jsonify, send_file, abort # , send_from_directory
from lxml.etree import _Element from lxml.etree import _Element
platform_name: str = platform.system() platform_name: str = platform.system()
if platform_name == "Linux": if platform_name == "Linux":
import Xlib
from Xlib import display, X
from pyxcursor import Xcursor
import pyatspi import pyatspi
from pyatspi import Accessible, StateType, STATE_SHOWING from pyatspi import Accessible, StateType, STATE_SHOWING
from pyatspi import Action as ATAction from pyatspi import Action as ATAction
@@ -39,9 +41,14 @@ elif platform_name == "Windows":
import win32ui, win32gui import win32ui, win32gui
Accessible = Any Accessible = Any
Xlib = None
display = None
X = None
Xcursor = None
elif platform_name == "Darwin": elif platform_name == "Darwin":
import plistlib import plistlib
from pyxcursor import Xcursor
import AppKit import AppKit
import ApplicationServices import ApplicationServices
@@ -51,13 +58,16 @@ elif platform_name == "Darwin":
Accessible = Any Accessible = Any
BaseWrapper = Any BaseWrapper = Any
Xlib = None
else: else:
# Platform not supported # Platform not supported
Accessible = None Accessible = None
BaseWrapper = Any BaseWrapper = Any
Xlib = None
from pyxcursor import Xcursor display = None
X = None
Xcursor = None
# todo: need to reformat and organize this whole file # todo: need to reformat and organize this whole file
@@ -89,6 +99,10 @@ def execute_command():
if arg.startswith("~/"): if arg.startswith("~/"):
command[i] = os.path.expanduser(arg) command[i] = os.path.expanduser(arg)
# Replace 'python' with sys.executable to use the same Python interpreter as the server
if len(command) > 0 and command[0] in ['python', 'python3', 'python.exe', 'python3.exe']:
command[0] = sys.executable
# Execute the command without any safety checks. # Execute the command without any safety checks.
try: try:
if platform_name == "Windows": if platform_name == "Windows":
@@ -262,15 +276,12 @@ def launch_app():
@app.route('/screenshot', methods=['GET']) @app.route('/screenshot', methods=['GET'])
def capture_screen_with_cursor(): def capture_screen_with_cursor():
# fixme: when running on virtual machines, the cursor is not captured, don't know why
file_path = os.path.join(os.path.dirname(__file__), "screenshots", "screenshot.png") file_path = os.path.join(os.path.dirname(__file__), "screenshots", "screenshot.png")
user_platform = platform.system() user_platform = platform.system()
# Ensure the screenshots directory exists # Ensure the screenshots directory exists
os.makedirs(os.path.dirname(file_path), exist_ok=True) os.makedirs(os.path.dirname(file_path), exist_ok=True)
# fixme: This is a temporary fix for the cursor not being captured on Windows and Linux
if user_platform == "Windows": if user_platform == "Windows":
def get_cursor(): def get_cursor():
hcursor = win32gui.GetCursorInfo()[1] hcursor = win32gui.GetCursorInfo()[1]
@@ -303,19 +314,53 @@ def capture_screen_with_cursor():
ratio = ctypes.windll.shcore.GetScaleFactorForDevice(0) / 100 ratio = ctypes.windll.shcore.GetScaleFactorForDevice(0) / 100
# get logical screen size
user32 = ctypes.windll.user32
logical_width = user32.GetSystemMetrics(0)
logical_height = user32.GetSystemMetrics(1)
# ===== Key fix: get cursor position before taking screenshot =====
# win32gui.GetCursorPos() returns logical coordinates (consistent with pyautogui)
pos_win = win32gui.GetCursorPos()
logger.info(f"Cursor position (logical coordinates): {pos_win}")
# Take screenshot immediately to reduce time difference
img = ImageGrab.grab(bbox=None, include_layered_windows=True) img = ImageGrab.grab(bbox=None, include_layered_windows=True)
# =============================================
# ===== DPI scaling fix =====
if ratio != 1.0:
physical_width, physical_height = img.size
logger.info(f"Detected DPI scaling: {ratio}x ({ratio*100}%)")
logger.info(f"Physical screenshot size: {physical_width}x{physical_height}")
logger.info(f"Logical resolution: {logical_width}x{logical_height}")
logger.info(f"Resizing screenshot to match logical resolution...")
img = img.resize((logical_width, logical_height), Image.Resampling.LANCZOS)
logger.info(f"Screenshot resized to: {img.size}")
# ==========================
try: try:
cursor, (hotspotx, hotspoty) = get_cursor() cursor, (hotspotx, hotspoty) = get_cursor()
pos_win = win32gui.GetCursorPos() # ===== Cursor position handling =====
pos = (round(pos_win[0]*ratio - hotspotx), round(pos_win[1]*ratio - hotspoty)) # win32gui.GetCursorPos() and pyautogui both use logical coordinates
# The screenshot has been resized to logical resolution, so use directly
logical_cursor_x = pos_win[0]
logical_cursor_y = pos_win[1]
pos = (logical_cursor_x - hotspotx, logical_cursor_y - hotspoty)
logger.info(f"Cursor position (logical coordinates): ({logical_cursor_x}, {logical_cursor_y})")
logger.info(f"Hotspot offset: ({hotspotx}, {hotspoty})")
logger.info(f"Final paste position: {pos}")
# ===================================
img.paste(cursor, pos, cursor) img.paste(cursor, pos, cursor)
except Exception as e: except Exception as e:
logger.warning(f"Failed to capture cursor on Windows, screenshot will not have a cursor. Error: {e}") logger.warning(f"Failed to capture cursor on Windows, screenshot will not include cursor. Error: {e}")
img.save(file_path) img.save(file_path)
elif user_platform == "Linux": elif user_platform == "Linux":
cursor_obj = Xcursor() cursor_obj = Xcursor()
imgarray = cursor_obj.getCursorImageArrayFast() imgarray = cursor_obj.getCursorImageArrayFast()
@@ -324,17 +369,19 @@ def capture_screen_with_cursor():
cursor_x, cursor_y = pyautogui.position() cursor_x, cursor_y = pyautogui.position()
screenshot.paste(cursor_img, (cursor_x, cursor_y), cursor_img) screenshot.paste(cursor_img, (cursor_x, cursor_y), cursor_img)
screenshot.save(file_path) screenshot.save(file_path)
elif user_platform == "Darwin": # (Mac OS) elif user_platform == "Darwin": # (Mac OS)
# Use the screencapture utility to capture the screen with the cursor
subprocess.run(["screencapture", "-C", file_path]) subprocess.run(["screencapture", "-C", file_path])
else: else:
logger.warning(f"The platform you're using ({user_platform}) is not currently supported") logger.warning(f"The platform you're using ({user_platform}) is not currently supported")
return send_file(file_path, mimetype='image/png') return send_file(file_path, mimetype='image/png')
def _has_active_terminal(desktop: Accessible) -> bool: def _has_active_terminal(desktop: Accessible) -> bool:
""" A quick check whether the terminal window is open and active. """ A quick check whether the terminal window is open and active (Linux only).
""" """
for app in desktop: for app in desktop:
if app.getRoleName() == "application" and app.name == "gnome-terminal-server": if app.getRoleName() == "application" and app.name == "gnome-terminal-server":
@@ -344,6 +391,87 @@ def _has_active_terminal(desktop: Accessible) -> bool:
return False return False
def _get_windows_terminal_output() -> Optional[str]:
    """Return the text shown in the first matching terminal window on Windows.

    Top-level UI Automation windows are scanned in the order pywinauto
    reports them. A window is treated as a terminal when its accessible
    name (normally the window title -- NOTE(review): confirm for every
    terminal host in use) contains one of the known executable names or
    generic keywords listed below, compared case-insensitively.

    For the first matching window that yields non-blank text, the text is
    taken from its ``Edit`` descendants, falling back to ``Document`` and
    then ``Text`` descendants when the previous kind is absent. If none of
    the controls yields text, the window's own text is used unless it is
    just one of the well-known default titles.

    Returns:
        The captured text with trailing whitespace stripped, or ``None``
        when no terminal window produced any text or when pywinauto could
        not be imported / used (the failure is logged, never raised).
    """
    try:
        # Imported inside the function so that an import failure is handled
        # by the error path below instead of surfacing to the caller.
        from pywinauto import Desktop

        desktop = Desktop(backend="uia")

        # Lower-case substrings that mark a window as a terminal: the common
        # terminal executables followed by generic title keywords. Built once
        # here because the set does not depend on the window being examined.
        title_markers = (
            "windowsterminal.exe",  # Windows Terminal
            "powershell.exe",       # PowerShell
            "pwsh.exe",             # PowerShell Core
            "cmd.exe",              # Command Prompt
            "conhost.exe",          # Console Host
            "terminal",
            "powershell",
            "command prompt",
            "cmd",
        )
        # Window texts that are only a default title, not terminal content.
        title_only_texts = (
            "Windows PowerShell",
            "Command Prompt",
            "PowerShell",
            "Administrator: Windows PowerShell",
        )

        for window in desktop.windows():
            try:
                # Skip windows the user cannot currently see.
                if not window.is_visible() or window.is_minimized():
                    continue

                # The accessible name may be None for unnamed windows;
                # treat that as an empty title so it simply does not match.
                window_title = (window.element_info.name or "").lower()
                if not any(marker in window_title for marker in title_markers):
                    continue

                try:
                    # Prefer Edit controls, then Document, then Text; each
                    # fallback is only tried when the previous kind is absent.
                    text_controls = window.descendants(control_type="Edit")
                    if not text_controls:
                        text_controls = window.descendants(control_type="Document")
                    if not text_controls:
                        text_controls = window.descendants(control_type="Text")

                    for control in text_controls:
                        try:
                            text = control.window_text()
                        except Exception as e:
                            # Best effort: one unreadable control must not
                            # stop the scan. `Exception` (rather than a bare
                            # except) lets KeyboardInterrupt/SystemExit
                            # propagate.
                            logger.debug(f"Error reading text from control: {e}")
                            continue
                        if text and text.strip():
                            return text.rstrip()

                    # No control produced text: fall back to the window's
                    # own text, ignoring values that are merely the title.
                    window_text = window.window_text()
                    if (
                        window_text
                        and window_text.strip()
                        and window_text not in title_only_texts
                    ):
                        return window_text.rstrip()
                except Exception as e:
                    logger.debug(f"Error getting text from window {window_title}: {e}")
                    continue
            except Exception as e:
                logger.debug(f"Error processing window: {e}")
                continue

        return None
    except Exception as e:
        logger.error(f"Error in _get_windows_terminal_output: {e}")
        return None
@app.route('/terminal', methods=['GET']) @app.route('/terminal', methods=['GET'])
def get_terminal_output(): def get_terminal_output():
user_platform = platform.system() user_platform = platform.system()
@@ -358,8 +486,10 @@ def get_terminal_output():
xpath = '//application[@name="gnome-terminal-server"]/frame[@st:active="true"]//terminal[@st:focused="true"]' xpath = '//application[@name="gnome-terminal-server"]/frame[@st:active="true"]//terminal[@st:focused="true"]'
terminals: List[_Element] = desktop_xml.xpath(xpath, namespaces=_accessibility_ns_map_ubuntu) terminals: List[_Element] = desktop_xml.xpath(xpath, namespaces=_accessibility_ns_map_ubuntu)
output = terminals[0].text.rstrip() if len(terminals) == 1 else None output = terminals[0].text.rstrip() if len(terminals) == 1 else None
else: # windows and macos platform is not implemented currently elif user_platform == "Windows":
# raise NotImplementedError output = _get_windows_terminal_output()
logger.debug(f"Terminal output retrieved: {output}")
else: # macOS platform is not implemented currently
return "Currently not implemented for platform {:}.".format(platform.platform()), 500 return "Currently not implemented for platform {:}.".format(platform.platform()), 500
return jsonify({"output": output, "status": "success"}) return jsonify({"output": output, "status": "success"})
except Exception as e: except Exception as e:
@@ -989,6 +1119,9 @@ def get_window_size():
else: else:
return jsonify({"error": "app_class_name is required"}), 400 return jsonify({"error": "app_class_name is required"}), 400
if platform_name != "Linux":
return jsonify({"error": "window_size is only supported on Linux"}), 501
d = display.Display() d = display.Display()
root = d.screen().root root = d.screen().root
window_ids = root.get_full_property(d.intern_atom('_NET_CLIENT_LIST'), X.AnyPropertyType).value window_ids = root.get_full_property(d.intern_atom('_NET_CLIENT_LIST'), X.AnyPropertyType).value
@@ -1505,11 +1638,19 @@ def start_recording():
logger.error(f"Error removing old recording file: {e}") logger.error(f"Error removing old recording file: {e}")
return jsonify({'status': 'error', 'message': f'Failed to remove old recording file: {e}'}), 500 return jsonify({'status': 'error', 'message': f'Failed to remove old recording file: {e}'}), 500
d = display.Display() if platform_name == "Linux":
screen_width = d.screen().width_in_pixels d = display.Display()
screen_height = d.screen().height_in_pixels screen_width = d.screen().width_in_pixels
screen_height = d.screen().height_in_pixels
start_command = f"ffmpeg -y -f x11grab -draw_mouse 1 -s {screen_width}x{screen_height} -i :0.0 -c:v libx264 -r 30 {recording_path}" start_command = f"ffmpeg -y -f x11grab -draw_mouse 1 -s {screen_width}x{screen_height} -i :0.0 -c:v libx264 -r 30 {recording_path}"
elif platform_name == "Windows":
user32 = ctypes.windll.user32
screen_width = user32.GetSystemMetrics(0)
screen_height = user32.GetSystemMetrics(1)
# Use gdigrab for Windows screen capture
start_command = f"ffmpeg -y -f gdigrab -draw_mouse 1 -framerate 30 -video_size {screen_width}x{screen_height} -i desktop -c:v libx264 -r 30 {recording_path}"
else:
return jsonify({'status': 'error', 'message': f'Recording not supported on {platform_name}'}), 501
# Use stderr=PIPE to capture potential errors from ffmpeg # Use stderr=PIPE to capture potential errors from ffmpeg
recording_process = subprocess.Popen(shlex.split(start_command), recording_process = subprocess.Popen(shlex.split(start_command),
@@ -1544,11 +1685,22 @@ def end_recording():
error_output = "" error_output = ""
try: try:
# Send SIGINT for a graceful shutdown, allowing ffmpeg to finalize the file. # Send SIGINT for a graceful shutdown, allowing ffmpeg to finalize the file.
recording_process.send_signal(signal.SIGINT) # On Windows, use CTRL_C_EVENT; on Unix, use SIGINT
if platform_name == "Windows":
# On Windows, we need to terminate the process gracefully
# ffmpeg responds to standard input 'q' to quit gracefully
try:
recording_process.stdin.write(b'q')
recording_process.stdin.flush()
except:
# If stdin is not available, use terminate
recording_process.terminate()
else:
recording_process.send_signal(signal.SIGINT)
# Wait for ffmpeg to terminate. communicate() gets output and waits. # Wait for ffmpeg to terminate. communicate() gets output and waits.
_, error_output = recording_process.communicate(timeout=15) _, error_output = recording_process.communicate(timeout=15)
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
logger.error("ffmpeg did not respond to SIGINT, killing the process.") logger.error("ffmpeg did not respond to stop signal, killing the process.")
recording_process.kill() recording_process.kill()
# After killing, communicate to get any remaining output. # After killing, communicate to get any remaining output.
_, error_output = recording_process.communicate() _, error_output = recording_process.communicate()
@@ -1589,8 +1741,9 @@ def run_python():
f.write(code) f.write(code)
# Execute the file using subprocess to capture all output # Execute the file using subprocess to capture all output
# Use sys.executable to use the same Python interpreter as the Flask server
result = subprocess.run( result = subprocess.run(
['/usr/bin/python3', temp_filename], [sys.executable, temp_filename],
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
text=True, text=True,