Files
sci-gui-agent-benchmark/mm_agents/aworldguiagent/agent.py
2025-09-23 16:50:29 +08:00

100 lines
3.1 KiB
Python

"""
This code is adapted from AgentS2 (https://github.com/simular-ai/Agent-S)
with modifications to suit specific requirements.
"""
import logging
import platform
from typing import Dict, List, Tuple
from mm_agents.aworldguiagent.grounding import ACI
from mm_agents.aworldguiagent.workflow import Worker
# Module-level logger, registered under the fixed name "desktopenv.agent"
# rather than this module's ``__name__``.
logger = logging.getLogger("desktopenv.agent")
class UIAgent:
    """Base class for UI automation agents.

    Stores the configuration shared by all agents and declares the
    ``reset`` / ``predict`` interface. Both methods are no-op placeholders
    here and are meant to be overridden by subclasses.
    """

    def __init__(
        self,
        engine_params: Dict,
        grounding_agent: ACI,
        platform: str = platform.system().lower(),
    ):
        """Initialize UIAgent.

        Args:
            engine_params: Configuration parameters for the LLM engine
            grounding_agent: Instance of ACI class for UI interaction
            platform: Operating system platform (darwin, linux, windows).
                Defaults to ``platform.system().lower()`` of the host; the
                default is evaluated once, when the class is defined.
        """
        # NOTE: inside this method ``platform`` is the string argument, not
        # the ``platform`` module (the parameter shadows the module name).
        self.engine_params = engine_params
        self.grounding_agent = grounding_agent
        self.platform = platform

    def reset(self) -> None:
        """Reset agent state.

        The base implementation does nothing; subclasses override it.
        """

    def predict(self, instruction: str, observation: Dict) -> Tuple[Dict, List[str]]:
        """Generate next action prediction.

        Args:
            instruction: Natural language instruction
            observation: Current UI state observation

        Returns:
            Tuple containing agent info dictionary and list of actions.
            The base implementation is a placeholder that returns ``None``;
            subclasses must override it to honour the annotated return type.
        """
        # Intentionally left without a body beyond the docstring: behaviour
        # (implicit ``None``) is kept so existing callers are unaffected.
class AworldGUIAgent(UIAgent):
    """Agent that uses no hierarchy for less inference time.

    All prediction work is delegated to a single ``Worker`` instance held in
    ``self.executor``.
    """

    def __init__(
        self,
        engine_params: Dict,
        grounding_agent: ACI,
        platform: str = platform.system().lower(),
        max_trajectory_length: int = 8,
        enable_reflection: bool = True,
    ):
        """Initialize a minimalist AgentS2 without hierarchy.

        Args:
            engine_params: Configuration parameters for the LLM engine
            grounding_agent: Instance of ACI class for UI interaction
            platform: Operating system platform (darwin, linux, windows)
            max_trajectory_length: Maximum number of image turns to keep
            enable_reflection: Creates a reflection agent to assist the worker agent
        """
        super().__init__(engine_params, grounding_agent, platform)
        self.max_trajectory_length = max_trajectory_length
        self.enable_reflection = enable_reflection
        # Build the initial Worker; reset() relies on the attributes set above.
        self.reset()

    def reset(self) -> None:
        """Reset agent state by replacing ``self.executor`` with a new Worker.

        The new Worker is constructed from the configuration stored on this
        instance at construction time.
        """
        self.executor = Worker(
            engine_params=self.engine_params,
            grounding_agent=self.grounding_agent,
            platform=self.platform,
            max_trajectory_length=self.max_trajectory_length,
            enable_reflection=self.enable_reflection,
        )

    def predict(self, instruction: str, observation: Dict) -> Tuple[Dict, List[str]]:
        """Generate the next action(s) by delegating to the Worker.

        Args:
            instruction: Natural language instruction
            observation: Current UI state observation; forwarded to the
                Worker as its ``obs`` argument

        Returns:
            Tuple of (info, actions): ``info`` is a shallow copy of the
            Worker's info mapping (an empty dict when the Worker returns a
            falsy value such as ``None``), and ``actions`` is passed through
            unchanged.
        """
        executor_info, actions = self.executor.generate_next_action(
            instruction=instruction, obs=observation
        )
        # Only one info mapping exists in this non-hierarchical agent; copy it
        # so the returned dict is independent of the Worker's own object.
        info = dict(executor_info) if executor_info else {}
        return info, actions