# sci-gui-agent-benchmark/mm_agents/os_symphony/agents/os_symphony.py

import logging
import platform
from typing import Dict, List, Tuple
from mm_agents.os_symphony.agents.os_aci import OSACI
from mm_agents.os_symphony.agents.searcher_agent import VLMSearcherAgent
from mm_agents.os_symphony.agents.worker import Worker

# Module logger registered under the fixed name "desktopenv.agent" rather than
# __name__; presumably shared with the other agent modules -- confirm.
logger = logging.getLogger("desktopenv.agent")

class OSSymphony:
    """Facade that owns a ``Worker`` and exposes a reset/predict interface.

    Construction only records configuration. The ``Worker`` is created in
    ``reset``, so ``reset`` must be called at least once before ``predict``;
    otherwise ``self.executor`` does not exist yet.
    """

    def __init__(
        self,
        engine_params_for_orchestrator: Dict,
        engine_params_for_memoryer: Dict,
        os_aci: OSACI,
        platform: str = platform.system().lower(),
        client_password: str = "",
        max_trajectory_length: int = 8,
        enable_reflection: bool = True,
    ):
        """Store the configuration later handed to the ``Worker``.

        Every argument is kept on the instance unchanged and forwarded to
        ``Worker`` on each ``reset``.

        Args:
            engine_params_for_orchestrator: Engine configuration passed to the
                worker under the same keyword.
            engine_params_for_memoryer: Engine configuration passed to the
                worker under the same keyword.
            os_aci: ``OSACI`` instance shared with the worker; ``reset`` also
                sets its ``result_dir`` attribute.
            platform: Operating system platform (darwin, linux, windows).
                Defaults to the lower-cased ``platform.system()`` of the host,
                evaluated once when the class is defined.
            client_password: Forwarded to the worker as-is (presumably the
                password of the controlled machine -- confirm against Worker).
            max_trajectory_length: Forwarded to the worker; per the original
                documentation, the maximum number of image turns to keep.
            enable_reflection: Forwarded to the worker; per the original
                documentation, enables a reflection agent assisting the worker.
        """
        # NOTE: inside this method the ``platform`` parameter shadows the
        # ``platform`` module; only the string value is available here.
        self.engine_params_for_orchestrator = engine_params_for_orchestrator
        self.engine_params_for_memoryer = engine_params_for_memoryer
        self.os_aci: OSACI = os_aci
        self.platform = platform
        self.client_password = client_password
        self.max_trajectory_length = max_trajectory_length
        self.enable_reflection = enable_reflection

    def reset(self, result_dir) -> None:
        """Start a new task: retarget the ACI and build a fresh ``Worker``.

        Args:
            result_dir: Value assigned to ``self.os_aci.result_dir`` for the
                upcoming task.
        """
        self.os_aci.result_dir = result_dir
        # A new Worker is constructed on every reset, replacing the previous
        # ``self.executor`` instead of reusing it.
        self.executor = Worker(
            engine_params_for_orchestrator=self.engine_params_for_orchestrator,
            engine_params_for_memoryer=self.engine_params_for_memoryer,
            os_aci=self.os_aci,
            platform=self.platform,
            client_password=self.client_password,
            max_trajectory_length=self.max_trajectory_length,
            enable_reflection=self.enable_reflection,
        )

    def predict(self, instruction: str, observation: Dict, is_last_step: bool) -> Tuple[Dict, List[str]]:
        """Ask the worker for the next action(s).

        Args:
            instruction: Task instruction, forwarded to the worker.
            observation: Current observation, forwarded to the worker as
                ``obs``.
            is_last_step: Forwarded to the worker unchanged.

        Returns:
            A ``(info, actions)`` tuple. ``info`` is a new dict holding the
            worker's info entries (empty when the worker returns a falsy
            info value); ``actions`` is returned exactly as the worker
            produced it.

        Raises:
            AttributeError: If called before ``reset`` has created
                ``self.executor``.
        """
        executor_info, actions = self.executor.generate_next_action(
            instruction=instruction, obs=observation, is_last_step=is_last_step
        )

        # Shallow-copy so the caller gets its own dict, and normalise a
        # missing/empty info value to {}.
        info = dict(executor_info or {})

        return info, actions