# File: sci-gui-agent-benchmark/mm_agents/os_symphony/agents/coder_agent.py
# 2025-12-23 14:30:44 +08:00
# 351 lines, 14 KiB, Python, executable file

import logging
from typing import Dict, List, Tuple, Optional
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
from mm_agents.os_symphony.utils.common_utils import call_llm_safe, parse_code_from_string
from mm_agents.os_symphony.core.mllm import LMMAgent
# Module-level logger shared by every function and class in this file; the
# dotted name makes it a child of the "desktopenv" logger.
logger = logging.getLogger("desktopenv.coder_agent")
def extract_code_block(action: str) -> Tuple[Optional[str], Optional[str]]:
    """Pull the first fenced code snippet out of an LLM reply.

    Fences tagged ``python`` are looked for first, then fences tagged
    ``bash``; whichever tag is found first *in that order* wins, regardless
    of where it sits in the text. If neither tag is present but a bare
    triple-backtick fence is, its content is returned with no type.

    Args:
        action: Raw text that may contain a fenced code block.

    Returns:
        ``(code_type, code)`` where ``code_type`` is ``"python"``, ``"bash"``
        or ``None`` (untagged fence or no fence), and ``code`` is the
        whitespace-stripped snippet or ``None`` when no fence was found.
    """
    fence = "```"
    code_type = None
    code = None

    for language in ("python", "bash"):
        opener = fence + language
        if opener in action:
            code_type = language
            # Text after the first opener, cut at the next fence.
            code = action.partition(opener)[2].partition(fence)[0].strip()
            break
    else:
        # Only reached when no tagged fence matched: fall back to a bare
        # fence, whose language stays unknown.
        if fence in action:
            code = action.partition(fence)[2].partition(fence)[0].strip()

    logger.debug(
        f"Extracted code block: type={code_type}, length={len(code) if code else 0}"
    )
    return code_type, code
def execute_code(code_type: str, code: str, env_controller) -> Dict:
    """Run a snippet through the environment controller matching its type.

    Args:
        code_type: ``"bash"`` or ``"python"``; anything else yields an error
            result without touching the controller.
        code: Source text to run.
        env_controller: Object providing ``run_bash_script(code, timeout=...)``
            and ``run_python_script(code)``.

    Returns:
        Whatever the controller method returns, or a dict of the form
        ``{"status": "error", "error": <message>}`` when the type is unknown
        or the controller call raises.
    """
    # The complete snippet is logged on purpose (no truncation).
    logger.info(f"CODING_AGENT_CODE_EXECUTION - Type: {code_type}\nCode:\n{code}")

    if code_type not in ("bash", "python"):
        return {"status": "error", "error": f"Unknown code type: {code_type}"}

    try:
        if code_type == "bash":
            return env_controller.run_bash_script(code, timeout=30)
        return env_controller.run_python_script(code)
    except Exception as exc:
        # Any controller failure is reported back to the caller as an error
        # result instead of propagating.
        logger.error(f"Error executing {code_type} code: {exc}")
        return {"status": "error", "error": str(exc)}
def format_result(result: Dict, step_count: int) -> str:
    """Render an execution result as text to feed back to the model.

    Args:
        result: Result dict from code execution; may be ``None`` or empty.
        step_count: Zero-based step index; the text shows ``step_count + 1``.

    Returns:
        A "Step N Result" block with status, return code and, when non-empty,
        the output and error sections; or a "Step N Error" block when
        ``result`` is falsy.
    """
    step_number = step_count + 1

    if not result:
        logger.warning(f"Step {step_number}: No result returned from execution")
        return (
            f"\nStep {step_number} Error:\n"
            "Error: No result returned from execution\n"
        )

    status = result.get("status", "unknown")
    # Two spellings of the return-code key are accepted; -1 means "absent".
    if "returncode" in result:
        return_code = result["returncode"]
    else:
        return_code = result.get("return_code", -1)

    # Both result shapes are read through the same keys. Per the original
    # author's notes: bash results (those carrying "returncode") merge stdout
    # and stderr into "output" and leave "error" empty, while python results
    # keep stdout in "output" and stderr in "error".
    output = result.get("output", "")
    error = result.get("error", "")

    logger.debug(f"Step {step_number}: Status={status}, Return Code={return_code}")

    # One section per line group so multi-line outputs stay readable.
    sections = [
        f"Step {step_number} Result:",
        f"Status: {status}",
        f"Return Code: {return_code}",
    ]
    if output:
        sections.append(f"Output:\n{output}")
    if error:
        sections.append(f"Error:\n{error}")
    return "\n".join(sections) + "\n"
class CoderAgent:
    """A dedicated agent for executing code with a budget of steps.

    The agent runs a loop: ask the LLM for a reply, run the code block found
    in it through an environment controller, feed the formatted result back
    as the next user message, and repeat until the model answers DONE, FAIL
    or INFEASIBLE, or until the step budget is used up.
    """

    def __init__(self, engine_params: Dict, client_password: str, platform: str = "linux"):
        """Initialize the CoderAgent.

        Args:
            engine_params: LLM engine configuration. The optional keys
                ``"budget"`` (default 20) and ``"temperature"`` (default 0.1)
                are read here; the whole dict is forwarded to ``LMMAgent``.
            client_password: Forwarded to the coder system-prompt builder.
            platform: Platform name forwarded to the system-prompt builder.

        Raises:
            ValueError: If ``engine_params`` is ``None`` or empty.
        """
        if not engine_params:
            raise ValueError("engine_params cannot be None or empty")

        self.engine_params = engine_params
        self.budget = engine_params.get("budget", 20)
        self.temperature = engine_params.get("temperature", 0.1)
        self.agent = None
        self.platform = platform
        self.client_password = client_password

        logger.info(f"CodeAgent initialized with budget={self.budget} and platform={self.platform}")
        self.reset()

    def reset(self):
        """Reset the agent state by building a fresh LLM agent and system prompt."""
        logger.debug("Resetting CodeAgent state")
        self.agent = LMMAgent(
            engine_params=self.engine_params,
            system_prompt=PROCEDURAL_MEMORY.construct_coder_procedural_memory(
                platform=self.platform, client_password=self.client_password
            ),
        )

    @staticmethod
    def _report_execution_result(result: Optional[Dict], step_number: int) -> None:
        """Print and log the outcome of one executed code block."""
        # A missing result must not break reporting: fall back to an empty
        # dict so every field reads as "".
        info = result or {}
        output = info.get("output", "")
        error = info.get("error", "")
        message = info.get("message", "")
        status = info.get("status", "")
        # "message" is only shown when there is nothing else to show.
        show_message = bool(message and not output and not error)

        # Terminal output for immediate visibility.
        print(f"\n⚡ CODE EXECUTION RESULT - Step {step_number}")
        print("-" * 50)
        print(f"Status: {status}")
        if output:
            print(f"Output:\n{output}")
        if error:
            print(f"Error:\n{error}")
        if show_message:
            print(f"Message:\n{message}")
        print("-" * 50)

        # Same information, untruncated, for the log.
        log_lines = [f"CODING_AGENT_EXECUTION_RESULT - Step {step_number}:"]
        if status:
            log_lines.append(f"Status: {status}")
        if output:
            log_lines.append(
                "Output:\n" + ("-" * 40) + f"\n{output}\n" + ("-" * 40)
            )
        if error:
            log_lines.append(
                "Error:\n" + ("!" * 40) + f"\n{error}\n" + ("!" * 40)
            )
        if show_message:
            log_lines.append(
                "Message:\n" + ("-" * 40) + f"\n{message}\n" + ("-" * 40)
            )
        logger.info("\n".join(log_lines))

    def execute(self, task_instruction: str, screenshot: str, env_controller) -> Dict:
        """Execute code for the given task with a budget of steps.

        Args:
            task_instruction: Natural-language task description.
            screenshot: Passed to the LLM agent as ``image_content`` with the
                first user message (exact encoding is up to ``LMMAgent``).
            env_controller: Object used by ``execute_code`` to run scripts.

        Returns:
            A dict with the keys ``task_instruction``, ``completion_reason``
            (``"DONE"``, ``"FAIL"``, ``"INFEASIBLE"`` or
            ``"BUDGET_EXHAUSTED_AFTER_<n>_STEPS"``), ``summary``,
            ``execution_history``, ``execution_result_history``,
            ``steps_executed`` and ``budget``.

        Raises:
            ValueError: If ``env_controller`` is ``None``.
            RuntimeError: If the LLM returns an empty response.
        """
        if env_controller is None:
            raise ValueError("env_controller is required for code execution")

        print("\n🚀 STARTING CODE EXECUTION")
        print("=" * 60)
        print(f"Task: {task_instruction}")
        print(f"Budget: {self.budget} steps")
        print("=" * 60)

        logger.info(f"Starting code execution for task: {task_instruction}")
        logger.info(f"Budget: {self.budget} steps")

        # Every run starts from a clean conversation.
        self.reset()

        # Add initial task instruction and screenshot context as user message
        context = (
            f"Task: {task_instruction}\n\nCurrent screenshot is provided for context."
        )
        self.agent.add_message(context, image_content=screenshot, role="user")

        step_count = 0
        execution_history = []
        execution_result_history = []
        # Stays None unless the model explicitly ends the session; checked
        # after the loop to detect budget exhaustion.
        completion_reason = None

        while step_count < self.budget:
            step_number = step_count + 1
            logger.info(f"Step {step_number}/{self.budget}")

            # Get assistant response (thoughts and code)
            response = call_llm_safe(self.agent, temperature=self.temperature)

            # Log the latest message from the coding agent (untruncated)
            logger.info(
                f"CODING_AGENT_LATEST_MESSAGE - Step {step_number}:\n{response}"
            )

            if not response or not response.strip():
                error_msg = f"Step {step_number}: LLM returned empty response"
                logger.error(error_msg)
                raise RuntimeError(error_msg)

            # Parse the response to extract action
            action = parse_code_from_string(response)
            thoughts = response
            execution_history.append(
                {"step": step_number, "action": action, "thoughts": thoughts}
            )

            # Check for completion signals. The parser is treated as possibly
            # returning None, so a missing action is "no signal", not a crash.
            action_upper = (action or "").upper().strip()
            if action_upper == "DONE":
                print(f"\n✅ TASK COMPLETED - Step {step_number}")
                print("=" * 60)
                print("Agent signaled task completion")
                print("=" * 60)
                logger.info(f"Step {step_number}: Task completed successfully")
                completion_reason = "DONE"
                break
            elif action_upper == "FAIL":
                print(f"\n❌ TASK FAILED - Step {step_number}")
                print("=" * 60)
                print("Agent signaled task failure")
                print("=" * 60)
                logger.info(f"Step {step_number}: Task failed by agent request")
                completion_reason = "FAIL"
                break
            elif action_upper == "INFEASIBLE":
                print(f"\n❌ TASK INFEASIBLE - Step {step_number}")
                print("=" * 60)
                print("Agent signaled task infeasible")
                print("=" * 60)
                logger.info(f"Step {step_number}: Task infeasible by agent request")
                completion_reason = "INFEASIBLE"
                break

            # Extract and execute code; only the text after the last
            # "(Answer)" marker is searched for a code block.
            code_type, code = extract_code_block(response.split("(Answer)")[-1])
            if code:
                result = execute_code(code_type, code, env_controller)
                execution_result_history.append(
                    {"step": step_number, "result": result}
                )
                self._report_execution_result(result, step_number)
            else:
                print(f"\n⚠️ NO CODE BLOCK FOUND - Step {step_number}")
                print("-" * 50)
                print("Action did not contain executable code")
                print("-" * 50)

                logger.warning(f"Step {step_number}: No code block found in action")
                result = {"status": "skipped", "message": "No code block found"}
                logger.info(
                    f"CODING_AGENT_EXECUTION_RESULT - Step {step_number}:\n"
                    f"Status: skipped\n"
                    f"Message:\n{'-' * 40}\n{result['message']}\n{'-' * 40}"
                )

            # Add assistant's thoughts and code to message history
            self.agent.add_message(response, role="assistant")

            # Process result and add formatted environment results as user message
            result_context = format_result(result, step_count)
            self.agent.add_message(result_context, role="user")

            step_count += 1

        # Handle budget exhaustion
        if completion_reason is None:
            print(f"\n⏰ BUDGET EXHAUSTED - {step_count} steps completed")
            print("=" * 60)
            print(f"Maximum budget of {self.budget} steps reached")
            print("=" * 60)
            logger.info(f"Budget exhausted after {step_count} steps")
            completion_reason = f"BUDGET_EXHAUSTED_AFTER_{step_count}_STEPS"

        # Generate final summary
        logger.info("Generating execution summary")
        summary = self._generate_summary(execution_history, task_instruction)

        session_report = {
            "task_instruction": task_instruction,
            "completion_reason": completion_reason,
            "summary": summary,
            "execution_history": execution_history,
            "execution_result_history": execution_result_history,
            "steps_executed": step_count,
            "budget": self.budget,
        }

        logger.info(f"Code execution completed: steps={step_count}")
        return session_report

    def _generate_summary(
        self, execution_history: List[Dict], task_instruction: str
    ) -> str:
        """Generate summary of code execution session.

        Args:
            execution_history: One dict per step with the keys ``"step"``,
                ``"thoughts"`` and ``"action"``.
            task_instruction: The task text, placed at the top of the prompt.

        Returns:
            The LLM-written summary, a fixed fallback sentence when there is
            no history or the LLM returns nothing, or a "Summary generation
            failed: ..." string when the LLM call raises.
        """
        if not execution_history:
            logger.info("No execution history to summarize")
            return "No actions were executed."

        # Note: this is logged before the summary LLM call is made.
        logger.info(f"Generated summary for {len(execution_history)} steps")

        # Build detailed execution context for summary agent
        context_parts = [f"Task: {task_instruction}\n\nExecution Steps:\n"]
        for step in execution_history:
            step_num = step["step"]
            thoughts = step.get("thoughts", "")
            action = step.get("action", "")

            context_parts.append(f"\nStep {step_num}:\n")
            if thoughts:
                context_parts.append(f"Thoughts: {thoughts}\n")
            context_parts.append(f"Code: {action}\n")
        execution_context = "".join(context_parts)

        # Create summary prompt with same context as coding agent
        summary_prompt = (
            f"\n{execution_context}\n"
            "Please provide a concise summary of the code execution session. Focus on:\n"
            "1. The code logic implemented at each step\n"
            "2. The outputs and results produced by each code execution\n"
            "3. The progression of the solution approach\n"
            "Do not make judgments about success or failure. Simply describe what was attempted and what resulted.\n"
            "Keep the summary under 150 words and use clear, factual language.\n"
        )

        # Generate summary using LLM with dedicated summary system prompt.
        # Summarising is best-effort: any failure becomes a message string so
        # the caller still gets its session report.
        try:
            summary_agent = LMMAgent(
                engine_params=self.engine_params,
                system_prompt=PROCEDURAL_MEMORY.CODE_SUMMARY_AGENT_PROMPT,
            )
            summary_agent.add_message(summary_prompt, role="user")
            summary = call_llm_safe(summary_agent, temperature=self.temperature)

            if not summary or not summary.strip():
                summary = "Summary generation failed - no response from LLM"
                logger.warning("Summary generation failed - empty response from LLM")
        except Exception as e:
            summary = f"Summary generation failed: {str(e)}"
            logger.error(f"Error generating summary: {e}")

        return summary