Add Claude Sonnet 4.5 support and improve action handling (#362)
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -17,7 +17,7 @@ from anthropic.types.beta import (
|
||||
BetaMessageParam,
|
||||
BetaTextBlockParam,
|
||||
)
|
||||
from .utils import COMPUTER_USE_BETA_FLAG, PROMPT_CACHING_BETA_FLAG,SYSTEM_PROMPT, SYSTEM_PROMPT_WINDOWS, APIProvider, PROVIDER_TO_DEFAULT_MODEL_NAME
|
||||
from .utils import COMPUTER_USE_BETA_FLAG, PROMPT_CACHING_BETA_FLAG,SYSTEM_PROMPT, SYSTEM_PROMPT_WINDOWS, APIProvider, PROVIDER_TO_DEFAULT_MODEL_NAME, get_model_name
|
||||
from .utils import _response_to_params, _inject_prompt_caching, _maybe_filter_to_n_most_recent_images
|
||||
|
||||
import logging
|
||||
@@ -30,14 +30,18 @@ API_RETRY_INTERVAL = 5
|
||||
class AnthropicAgent:
|
||||
def __init__(self,
|
||||
platform: str = "Ubuntu",
|
||||
model: str = "claude-3-5-sonnet-20241022",
|
||||
provider: APIProvider = APIProvider.BEDROCK,
|
||||
model: str = "claude-sonnet-4-5-20250929",
|
||||
provider: APIProvider = APIProvider.ANTHROPIC,
|
||||
max_tokens: int = 4096,
|
||||
api_key: str = os.environ.get("ANTHROPIC_API_KEY", None),
|
||||
system_prompt_suffix: str = "",
|
||||
only_n_most_recent_images: Optional[int] = 10,
|
||||
action_space: str = "claude_computer_use",
|
||||
screen_size: tuple[int, int] = (1920, 1080),
|
||||
no_thinking: bool = False,
|
||||
use_isp: bool = False,
|
||||
temperature: Optional[float] = None,
|
||||
top_p: Optional[float] = None,
|
||||
*args, **kwargs
|
||||
):
|
||||
self.platform = platform
|
||||
@@ -52,10 +56,24 @@ class AnthropicAgent:
|
||||
self.only_n_most_recent_images = only_n_most_recent_images
|
||||
self.messages: list[BetaMessageParam] = []
|
||||
self.screen_size = screen_size
|
||||
self.no_thinking = no_thinking
|
||||
self.use_isp = use_isp
|
||||
self.temperature = temperature
|
||||
self.top_p = top_p
|
||||
|
||||
self.resize_factor = (
|
||||
screen_size[0] / 1280, # Assuming 1280 is the base width
|
||||
screen_size[1] / 720 # Assuming 720 is the base height
|
||||
)
|
||||
|
||||
def _get_sampling_params(self):
|
||||
"""Get sampling parameters (temperature and/or top_p) - let API validate exclusivity"""
|
||||
params = {}
|
||||
if self.temperature is not None:
|
||||
params['temperature'] = self.temperature
|
||||
if self.top_p is not None:
|
||||
params['top_p'] = self.top_p
|
||||
return params
|
||||
|
||||
def add_tool_result(self, tool_call_id: str, result: str, screenshot: bytes = None):
|
||||
"""Add tool result to message history"""
|
||||
@@ -84,6 +102,21 @@ class AnthropicAgent:
|
||||
"content": tool_result_content
|
||||
})
|
||||
|
||||
def _extract_raw_response_string(self, response) -> str:
|
||||
"""Extract and concatenate raw response content into a single string."""
|
||||
raw_response_str = ""
|
||||
if response.content:
|
||||
for block in response.content:
|
||||
if hasattr(block, 'text') and block.text:
|
||||
raw_response_str += f"[TEXT] {block.text}\n"
|
||||
elif hasattr(block, 'thinking') and block.thinking:
|
||||
raw_response_str += f"[THINKING] {block.thinking}\n"
|
||||
elif hasattr(block, 'name') and hasattr(block, 'input'):
|
||||
raw_response_str += f"[TOOL_USE] {block.name}: {block.input}\n"
|
||||
else:
|
||||
raw_response_str += f"[OTHER] {str(block)}\n"
|
||||
return raw_response_str.strip()
|
||||
|
||||
def parse_actions_from_tool_call(self, tool_call: Dict) -> str:
|
||||
result = ""
|
||||
function_args = (
|
||||
@@ -194,13 +227,23 @@ class AnthropicAgent:
|
||||
result += (f"pyautogui.keyUp('{key}')\n")
|
||||
expected_outcome = f"Key {key} pressed."
|
||||
elif action == "type":
|
||||
result += (
|
||||
f"pyautogui.typewrite(\"\"\"{text}\"\"\", interval=0.01)\n"
|
||||
)
|
||||
for char in text:
|
||||
if char == '\n':
|
||||
result += "pyautogui.press('enter')\n"
|
||||
elif char == "'":
|
||||
result += 'pyautogui.press("\'")\n'
|
||||
elif char == '\\':
|
||||
result += "pyautogui.press('\\\\')\n"
|
||||
elif char == '"':
|
||||
result += "pyautogui.press('\"')\n"
|
||||
else:
|
||||
result += f"pyautogui.press('{char}')\n"
|
||||
expected_outcome = f"Text {text} written."
|
||||
|
||||
# Handle scroll actions
|
||||
elif action == "scroll":
|
||||
if text is not None:
|
||||
result += (f"pyautogui.keyDown('{text.lower()}')\n")
|
||||
if coordinate is None:
|
||||
if scroll_direction in ("up", "down"):
|
||||
result += (
|
||||
@@ -221,6 +264,8 @@ class AnthropicAgent:
|
||||
result += (
|
||||
f"pyautogui.hscroll({scroll_amount if scroll_direction == 'right' else -scroll_amount}, {x}, {y})\n"
|
||||
)
|
||||
if text is not None:
|
||||
result += (f"pyautogui.keyUp('{text.lower()}')\n")
|
||||
expected_outcome = "Scroll action finished"
|
||||
|
||||
# Handle click actions
|
||||
@@ -285,7 +330,7 @@ class AnthropicAgent:
|
||||
expected_outcome = "Call user"
|
||||
elif action == "screenshot":
|
||||
result += "pyautogui.sleep(0.1)\n"
|
||||
expected_outcome = "Screenshot taken"
|
||||
expected_outcome = "Screenshot taken"
|
||||
else:
|
||||
raise ValueError(f"Invalid action: {action}")
|
||||
|
||||
@@ -303,6 +348,9 @@ class AnthropicAgent:
|
||||
screenshot_bytes = obs["screenshot"]
|
||||
screenshot_image = Image.open(io.BytesIO(screenshot_bytes))
|
||||
|
||||
# Store original unresized screenshot for zoom processing
|
||||
obs["screenshot_original"] = screenshot_bytes
|
||||
|
||||
# Calculate new size based on resize factor
|
||||
new_width, new_height = 1280, 720
|
||||
|
||||
@@ -334,23 +382,45 @@ class AnthropicAgent:
|
||||
]
|
||||
})
|
||||
|
||||
if self.messages and "tool_use" in [content_block["type"] for content_block in self.messages[-1]["content"]]:
|
||||
self.add_tool_result(
|
||||
self.messages[-1]["content"][-1]["id"],
|
||||
f"Success",
|
||||
screenshot=obs.get("screenshot") if obs else None
|
||||
)
|
||||
# Add tool_result for ALL tool_use blocks in the last message
|
||||
if self.messages:
|
||||
last_message_content = self.messages[-1]["content"]
|
||||
tool_use_blocks = [block for block in last_message_content if block.get("type") == "tool_use"]
|
||||
|
||||
for i, tool_block in enumerate(tool_use_blocks):
|
||||
tool_input = tool_block.get("input", {})
|
||||
action = tool_input.get("action")
|
||||
is_last_tool = i == len(tool_use_blocks) - 1
|
||||
|
||||
include_screenshot = None
|
||||
|
||||
if obs:
|
||||
if action == "screenshot":
|
||||
# Screenshot action always gets regular screenshot
|
||||
include_screenshot = obs.get("screenshot")
|
||||
elif is_last_tool:
|
||||
# Auto-screenshot: last tool gets regular screenshot (unless it's zoom, handled above)
|
||||
include_screenshot = obs.get("screenshot")
|
||||
|
||||
self.add_tool_result(
|
||||
tool_block["id"],
|
||||
f"Success",
|
||||
screenshot=include_screenshot
|
||||
)
|
||||
|
||||
enable_prompt_caching = False
|
||||
betas = ["computer-use-2025-01-24"]
|
||||
if self.model_name == "claude-3-7-sonnet-20250219" or self.model_name == "claude-4-opus-20250514" or self.model_name == "claude-4-sonnet-20250514":
|
||||
betas = ["computer-use-2025-01-24"]
|
||||
elif self.model_name == "claude-3-5-sonnet-20241022":
|
||||
betas = [COMPUTER_USE_BETA_FLAG]
|
||||
betas = [COMPUTER_USE_BETA_FLAG]
|
||||
|
||||
# Add interleaved thinking beta if ISP is requested
|
||||
if self.use_isp:
|
||||
betas.append("interleaved-thinking-2025-05-14")
|
||||
logger.info(f"Added interleaved thinking beta. Betas: {betas}")
|
||||
|
||||
image_truncation_threshold = 10
|
||||
if self.provider == APIProvider.ANTHROPIC:
|
||||
client = Anthropic(api_key=self.api_key, max_retries=4)
|
||||
client = Anthropic(api_key=self.api_key, max_retries=4).with_options(
|
||||
default_headers={"anthropic-beta": COMPUTER_USE_BETA_FLAG}
|
||||
)
|
||||
enable_prompt_caching = True
|
||||
elif self.provider == APIProvider.VERTEX:
|
||||
client = AnthropicVertex()
|
||||
@@ -368,7 +438,7 @@ class AnthropicAgent:
|
||||
if enable_prompt_caching:
|
||||
betas.append(PROMPT_CACHING_BETA_FLAG)
|
||||
_inject_prompt_caching(self.messages)
|
||||
image_truncation_threshold = 50
|
||||
image_truncation_threshold = 20
|
||||
system["cache_control"] = {"type": "ephemeral"}
|
||||
|
||||
if self.only_n_most_recent_images:
|
||||
@@ -378,49 +448,65 @@ class AnthropicAgent:
|
||||
min_removal_threshold=image_truncation_threshold,
|
||||
)
|
||||
|
||||
try:
|
||||
if self.model_name == "claude-3-5-sonnet-20241022":
|
||||
tools = [
|
||||
{'name': 'computer', 'type': 'computer_20241022', 'display_width_px': 1280, 'display_height_px': 720, 'display_number': 1},
|
||||
# {'type': 'bash_20241022', 'name': 'bash'},
|
||||
# {'name': 'str_replace_editor', 'type': 'text_editor_20241022'}
|
||||
] if self.platform == 'Ubuntu' else [
|
||||
{'name': 'computer', 'type': 'computer_20241022', 'display_width_px': 1280, 'display_height_px': 720, 'display_number': 1},
|
||||
]
|
||||
elif self.model_name in ["claude-3-7-sonnet-20250219", "claude-4-opus-20250514", "claude-4-sonnet-20250514"]:
|
||||
tools = [
|
||||
{'name': 'computer', 'type': 'computer_20250124', 'display_width_px': 1280, 'display_height_px': 720, 'display_number': 1},
|
||||
# {'type': 'bash_20250124', 'name': 'bash'},
|
||||
# {'name': 'str_replace_editor', 'type': 'text_editor_20250124'}
|
||||
] if self.platform == 'Ubuntu' else [
|
||||
{'name': 'computer', 'type': 'computer_20250124', 'display_width_px': 1280, 'display_height_px': 720, 'display_number': 1},
|
||||
]
|
||||
# Configure tool settings - use modern computer tool for all models
|
||||
tool_config = {
|
||||
'name': 'computer',
|
||||
'type': 'computer_20250124',
|
||||
'display_width_px': 1280,
|
||||
'display_height_px': 720,
|
||||
'display_number': 1
|
||||
}
|
||||
|
||||
tools = [
|
||||
tool_config,
|
||||
] if self.platform == 'Ubuntu' else [
|
||||
tool_config,
|
||||
]
|
||||
|
||||
# Configure thinking mode based on user preferences
|
||||
if self.no_thinking:
|
||||
# Disable thinking mode - omit the thinking parameter
|
||||
extra_body = {}
|
||||
actual_max_tokens = self.max_tokens # Use default when no thinking
|
||||
logger.info("Thinking mode: DISABLED")
|
||||
else:
|
||||
# Enable thinking mode (regular or interleaved)
|
||||
# Use consistent 2048 budget for both regular and ISP thinking
|
||||
budget_tokens = 2048
|
||||
|
||||
# For regular thinking: max_tokens > budget_tokens (API requirement)
|
||||
# For ISP: budget_tokens can exceed max_tokens (represents total across all thinking blocks)
|
||||
if self.max_tokens <= budget_tokens:
|
||||
required_max_tokens = budget_tokens + 500 # Give some headroom
|
||||
logger.warning(f"Regular thinking requires max_tokens > budget_tokens. Increasing max_tokens from {self.max_tokens} to {required_max_tokens}")
|
||||
actual_max_tokens = required_max_tokens
|
||||
else:
|
||||
actual_max_tokens = self.max_tokens
|
||||
|
||||
extra_body = {
|
||||
"thinking": {"type": "enabled", "budget_tokens": 1024}
|
||||
"thinking": {"type": "enabled", "budget_tokens": budget_tokens}
|
||||
}
|
||||
if self.use_isp:
|
||||
logger.info("Thinking mode: INTERLEAVED SCRATCHPAD (ISP)")
|
||||
else:
|
||||
logger.info("Thinking mode: REGULAR SCRATCHPAD")
|
||||
|
||||
try:
|
||||
response = None
|
||||
|
||||
for attempt in range(API_RETRY_TIMES):
|
||||
try:
|
||||
if self.model_name in ["claude-3-7-sonnet-20250219", "claude-4-opus-20250514", "claude-4-sonnet-20250514"]:
|
||||
response = client.beta.messages.create(
|
||||
max_tokens=self.max_tokens,
|
||||
messages=self.messages,
|
||||
model=PROVIDER_TO_DEFAULT_MODEL_NAME[self.provider, self.model_name],
|
||||
system=[system],
|
||||
tools=tools,
|
||||
betas=betas,
|
||||
extra_body=extra_body
|
||||
)
|
||||
elif self.model_name == "claude-3-5-sonnet-20241022":
|
||||
response = client.beta.messages.create(
|
||||
max_tokens=self.max_tokens,
|
||||
messages=self.messages,
|
||||
model=PROVIDER_TO_DEFAULT_MODEL_NAME[self.provider, self.model_name],
|
||||
system=[system],
|
||||
tools=tools,
|
||||
betas=betas,
|
||||
)
|
||||
response = client.beta.messages.create(
|
||||
max_tokens=actual_max_tokens,
|
||||
messages=self.messages,
|
||||
model=get_model_name(self.provider, self.model_name),
|
||||
system=[system],
|
||||
tools=tools,
|
||||
betas=betas,
|
||||
extra_body=extra_body,
|
||||
**self._get_sampling_params()
|
||||
)
|
||||
|
||||
logger.info(f"Response: {response}")
|
||||
break
|
||||
except (APIError, APIStatusError, APIResponseValidationError) as e:
|
||||
@@ -450,26 +536,20 @@ class AnthropicAgent:
|
||||
try:
|
||||
logger.warning("Retrying with backup API key...")
|
||||
|
||||
backup_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY_BACKUP"), max_retries=4)
|
||||
if self.model_name in ["claude-3-7-sonnet-20250219", "claude-4-opus-20250514", "claude-4-sonnet-20250514"]:
|
||||
response = backup_client.beta.messages.create(
|
||||
max_tokens=self.max_tokens,
|
||||
messages=self.messages,
|
||||
model=PROVIDER_TO_DEFAULT_MODEL_NAME[APIProvider.ANTHROPIC, self.model_name],
|
||||
system=[system],
|
||||
tools=tools,
|
||||
betas=betas,
|
||||
extra_body=extra_body
|
||||
)
|
||||
elif self.model_name == "claude-3-5-sonnet-20241022":
|
||||
response = backup_client.beta.messages.create(
|
||||
max_tokens=self.max_tokens,
|
||||
messages=self.messages,
|
||||
model=PROVIDER_TO_DEFAULT_MODEL_NAME[APIProvider.ANTHROPIC, self.model_name],
|
||||
system=[system],
|
||||
tools=tools,
|
||||
betas=betas,
|
||||
)
|
||||
backup_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY_BACKUP"), max_retries=4).with_options(
|
||||
default_headers={"anthropic-beta": COMPUTER_USE_BETA_FLAG}
|
||||
)
|
||||
response = backup_client.beta.messages.create(
|
||||
max_tokens=actual_max_tokens,
|
||||
messages=self.messages,
|
||||
model=get_model_name(self.provider, self.model_name),
|
||||
system=[system],
|
||||
tools=tools,
|
||||
betas=betas,
|
||||
extra_body=extra_body,
|
||||
**self._get_sampling_params()
|
||||
)
|
||||
|
||||
logger.info("Successfully used backup API key")
|
||||
except Exception as backup_e:
|
||||
backup_error_msg = str(backup_e)
|
||||
@@ -497,9 +577,16 @@ class AnthropicAgent:
|
||||
logger.exception(f"Error in Anthropic API: {str(e)}")
|
||||
return None, None
|
||||
|
||||
if response is None:
|
||||
logger.error("Response is None after API call - this should not happen")
|
||||
return None, None
|
||||
|
||||
response_params = _response_to_params(response)
|
||||
logger.info(f"Received response params: {response_params}")
|
||||
|
||||
# Convert raw response to concatenated string for trajectory logging
|
||||
raw_response_str = self._extract_raw_response_string(response)
|
||||
|
||||
# Store response in message history
|
||||
self.messages.append({
|
||||
"role": "assistant",
|
||||
@@ -518,7 +605,8 @@ class AnthropicAgent:
|
||||
"input": cast(dict[str, Any], content_block["input"]),
|
||||
"id": content_block["id"],
|
||||
"action_type": content_block.get("type"),
|
||||
"command": self.parse_actions_from_tool_call(content_block)
|
||||
"command": self.parse_actions_from_tool_call(content_block),
|
||||
"raw_response": raw_response_str # Add raw response to each action
|
||||
})
|
||||
elif content_block["type"] == "text":
|
||||
reasonings.append(content_block["text"])
|
||||
@@ -526,10 +614,23 @@ class AnthropicAgent:
|
||||
reasonings = reasonings[0]
|
||||
else:
|
||||
reasonings = ""
|
||||
|
||||
# Check if the model indicated the task is infeasible
|
||||
if raw_response_str and "[INFEASIBLE]" in raw_response_str:
|
||||
logger.info("Detected [INFEASIBLE] pattern in response, triggering FAIL action")
|
||||
# Override actions with FAIL
|
||||
actions = [{
|
||||
"action_type": "FAIL",
|
||||
"raw_response": raw_response_str
|
||||
}]
|
||||
|
||||
logger.info(f"Received actions: {actions}")
|
||||
logger.info(f"Received reasonings: {reasonings}")
|
||||
if len(actions) == 0:
|
||||
actions = ["DONE"]
|
||||
actions = [{
|
||||
"action_type": "DONE",
|
||||
"raw_response": raw_response_str
|
||||
}]
|
||||
return reasonings, actions
|
||||
except Exception as e:
|
||||
logger.warning(f"parse_actions_from_tool_call parsing failed (attempt {parse_retry+1}/3), will retry API request: {e}")
|
||||
@@ -539,25 +640,17 @@ class AnthropicAgent:
|
||||
response = None
|
||||
for attempt in range(API_RETRY_TIMES):
|
||||
try:
|
||||
if self.model_name in ["claude-3-7-sonnet-20250219", "claude-4-opus-20250514", "claude-4-sonnet-20250514"]:
|
||||
response = client.beta.messages.create(
|
||||
max_tokens=self.max_tokens,
|
||||
messages=self.messages,
|
||||
model=PROVIDER_TO_DEFAULT_MODEL_NAME[self.provider, self.model_name],
|
||||
system=[system],
|
||||
tools=tools,
|
||||
betas=betas,
|
||||
extra_body=extra_body
|
||||
)
|
||||
elif self.model_name == "claude-3-5-sonnet-20241022":
|
||||
response = client.beta.messages.create(
|
||||
max_tokens=self.max_tokens,
|
||||
messages=self.messages,
|
||||
model=PROVIDER_TO_DEFAULT_MODEL_NAME[self.provider, self.model_name],
|
||||
system=[system],
|
||||
tools=tools,
|
||||
betas=betas,
|
||||
)
|
||||
response = client.beta.messages.create(
|
||||
max_tokens=actual_max_tokens,
|
||||
messages=self.messages,
|
||||
model=get_model_name(self.provider, self.model_name),
|
||||
system=[system],
|
||||
tools=tools,
|
||||
betas=betas,
|
||||
extra_body=extra_body,
|
||||
**self._get_sampling_params()
|
||||
)
|
||||
|
||||
logger.info(f"Response: {response}")
|
||||
break # Success, exit retry loop
|
||||
except (APIError, APIStatusError, APIResponseValidationError) as e2:
|
||||
@@ -569,13 +662,20 @@ class AnthropicAgent:
|
||||
raise
|
||||
response_params = _response_to_params(response)
|
||||
logger.info(f"Received response params: {response_params}")
|
||||
|
||||
# Update raw response string for retry case (will be used in next loop iteration)
|
||||
raw_response_str = self._extract_raw_response_string(response)
|
||||
|
||||
self.messages.append({
|
||||
"role": "assistant",
|
||||
"content": response_params
|
||||
})
|
||||
if parse_retry == max_parse_retry - 1:
|
||||
logger.error(f"parse_actions_from_tool_call parsing failed 3 times consecutively, terminating: {e}")
|
||||
actions = ["FAIL"]
|
||||
actions = [{
|
||||
"action_type": "FAIL",
|
||||
"raw_response": f"Failed to parse actions from tool call after {max_parse_retry} attempts: {e}"
|
||||
}]
|
||||
return reasonings, actions
|
||||
def reset(self, _logger = None, *args, **kwargs):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user