From d2ae0f697db913c6751cc34b6bb5b3fb3c7de7bf Mon Sep 17 00:00:00 2001 From: Timothyxxx Date: Tue, 12 Aug 2025 05:34:18 +0000 Subject: [PATCH] feat: enhance AnthropicAgent with start_coordinate handling and modifier key support - Added support for an optional start_coordinate parameter to facilitate drag actions from a specified starting point. - Implemented validation for start_coordinate to ensure it is a tuple of two integers. - Enhanced click actions to handle modifier keys, allowing for more complex interactions. - Ensured existing code logic remains unchanged while improving functionality and usability. --- mm_agents/anthropic/main.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/mm_agents/anthropic/main.py b/mm_agents/anthropic/main.py index 2f7fdb5..4ff838c 100644 --- a/mm_agents/anthropic/main.py +++ b/mm_agents/anthropic/main.py @@ -101,6 +101,7 @@ class AnthropicAgent: text = function_args.get("text") coordinate = function_args.get("coordinate") + start_coordinate = function_args.get("start_coordinate") scroll_direction = function_args.get("scroll_direction") scroll_amount = function_args.get("scroll_amount") duration = function_args.get("duration") @@ -111,6 +112,11 @@ class AnthropicAgent: int(coordinate[0] * self.resize_factor[0]), int(coordinate[1] * self.resize_factor[1]) ) + if start_coordinate and self.resize_factor: + start_coordinate = ( + int(start_coordinate[0] * self.resize_factor[0]), + int(start_coordinate[1] * self.resize_factor[1]) + ) if action == "left_mouse_down": result += "pyautogui.mouseDown()\n" @@ -145,6 +151,16 @@ class AnthropicAgent: ) expected_outcome = f"Mouse moved to ({x},{y})." elif action == "left_click_drag": + # If start_coordinate is provided, validate and move to start before dragging + if start_coordinate: + if not isinstance(start_coordinate, (list, tuple)) or len(start_coordinate) != 2: + raise ValueError(f"{start_coordinate} must be a tuple of length 2") + if not all(isinstance(i, int) for i in start_coordinate): + raise ValueError(f"{start_coordinate} must be a tuple of ints") + start_x, start_y = start_coordinate[0], start_coordinate[1] + result += ( + f"pyautogui.moveTo({start_x}, {start_y}, duration={duration or 0.5})\n" + ) result += ( f"pyautogui.dragTo({x}, {y}, duration={duration or 0.5})\n" ) @@ -209,6 +225,12 @@ class AnthropicAgent: # Handle click actions elif action in ("left_click", "right_click", "double_click", "middle_click", "left_press", "triple_click"): + # Handle modifier keys during click if specified + if text: + keys = text.split('+') + for key in keys: + key = key.strip().lower() + result += f"pyautogui.keyDown('{key}')\n" if coordinate is not None: x, y = coordinate if action == "left_click": @@ -241,6 +263,12 @@ class AnthropicAgent: result += ("pyautogui.mouseUp()\n") elif action == "triple_click": result += ("pyautogui.tripleClick()\n") + # Release modifier keys after click + if text: + keys = text.split('+') + for key in reversed(keys): + key = key.strip().lower() + result += f"pyautogui.keyUp('{key}')\n" expected_outcome = "Click action finished" elif action == "wait":