Add multiple new modules and tools to enhance the functionality and extensibility of the Maestro project (#333)
* Added a **pyproject.toml** file to define project metadata and dependencies. * Added **run_maestro.py** and **osworld_run_maestro.py** to provide the main execution logic. * Introduced multiple new modules, including **Evaluator**, **Controller**, **Manager**, and **Sub-Worker**, supporting task planning, state management, and data analysis. * Added a **tools module** containing utility functions and tool configurations to improve code reusability. * Updated the **README** and documentation with usage examples and module descriptions. These changes lay the foundation for expanding the Maestro project's functionality and improving the user experience. Co-authored-by: Hiroid <guoliangxuan@deepmatrix.com>
This commit is contained in:
481
mm_agents/maestro/core/new_knowledge.py
Normal file
481
mm_agents/maestro/core/new_knowledge.py
Normal file
@@ -0,0 +1,481 @@
|
||||
import json
|
||||
import os
|
||||
from typing import Dict, Tuple, List
|
||||
import numpy as np
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from ..utils.common_utils import (
|
||||
load_embeddings,
|
||||
load_knowledge_base,
|
||||
save_embeddings,
|
||||
)
|
||||
from ..tools.new_tools import NewTools
|
||||
from .mllm import CostManager
|
||||
|
||||
# Output dimensionality of each supported embedding model. Unknown models
# deliberately map to None so the caller can detect an unconfigured model.
_EMBEDDING_DIMS = {
    "doubao-embedding-large-text-250515": 2048,
    "doubao-embedding-text-240715": 2560,
    "text-embedding-ada-002": 1536,
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
    "gemini-embedding-001": 3072,
    "jina-embeddings-v4": 2048,
    "jina-embeddings-v3": 1024,
    "text-embedding-v4": 1024,
    "text-embedding-v3": 1024,
    "embedding-2": 2048,
    "embedding-3": 2048,
}


def get_embedding_dim(model_name):
    """Return the embedding vector dimensionality for *model_name*.

    Args:
        model_name: Name of the embedding model.

    Returns:
        The model's output dimension as an int, or None for unknown models.
    """
    return _EMBEDDING_DIMS.get(model_name)
|
||||
|
||||
class NewKnowledgeBase:
    """Persistent knowledge base backing Maestro's lifelong-learning loop.

    Stores and retrieves two kinds of experience under
    ``local_kb_path/<platform>/``:

    * narrative memory — task-level trajectories (``narrative_memory.json``)
    * episodic memory  — subtask-level trajectories (``episodic_memory.json``)

    Retrieval is embedding-similarity based; embeddings are cached on disk in
    a per-model pickle file so vectors from different models never mix.
    """

    def __init__(
        self,
        embedding_engine: NewTools,
        local_kb_path: str,
        platform: str,
        Tools_dict: Dict,
        save_knowledge: bool = True,
    ):
        """Initialize the KnowledgeBase module.

        Args:
            embedding_engine: Embedding engine instance.
            local_kb_path: Path to the local knowledge base root.
            platform: Target platform (Windows/Darwin/Ubuntu).
            Tools_dict: Tool configurations; must contain a
                "provider"/"model" entry for each agent registered below.
            save_knowledge: Whether to persist knowledge to disk.
        """
        self.platform = platform
        self.local_kb_path = local_kb_path

        # Embedding engine used for similarity-based retrieval.
        self.embedding_engine = embedding_engine

        # Paths for the two persisted memory types.
        self.episodic_memory_path = os.path.join(
            self.local_kb_path, self.platform, "episodic_memory.json"
        )
        self.narrative_memory_path = os.path.join(
            self.local_kb_path, self.platform, "narrative_memory.json"
        )

        # The embedding cache file is keyed by model name and dimension so
        # switching embedding models never mixes incompatible vectors.
        if hasattr(self.embedding_engine, "tools") and "embedding" in self.embedding_engine.tools:
            embedding_model_name = self.embedding_engine.tools["embedding"].model_name
        else:
            embedding_model_name = "default"
        embedding_dim = get_embedding_dim(embedding_model_name)
        self.embeddings_path = os.path.join(
            self.local_kb_path, self.platform, f"embeddings_{embedding_model_name}_{embedding_dim}.pkl"
        )

        # Trajectory-tracking state for the task currently being executed.
        self.task_trajectory = ""
        self.current_subtask_trajectory = ""
        self.current_search_query = ""

        # One LLM-backed agent per knowledge-base role, all configured the
        # same way from Tools_dict.
        self.query_formulator_name = "query_formulator"
        self.query_formulator = self._build_agent(self.query_formulator_name, Tools_dict)

        self.knowledge_fusion_agent_name = "context_fusion"
        self.knowledge_fusion_agent = self._build_agent(self.knowledge_fusion_agent_name, Tools_dict)

        self.narrative_summarization_agent_name = "narrative_summarization"
        self.narrative_summarization_agent = self._build_agent(self.narrative_summarization_agent_name, Tools_dict)

        self.episode_summarization_agent_name = "episode_summarization"
        self.episode_summarization_agent = self._build_agent(self.episode_summarization_agent_name, Tools_dict)

        self.save_knowledge = save_knowledge

    @staticmethod
    def _build_agent(tool_name: str, Tools_dict: Dict) -> NewTools:
        """Create a NewTools instance with a single registered tool.

        The provider/model pair is looked up in Tools_dict under *tool_name*.
        """
        agent = NewTools()
        agent.register_tool(
            tool_name,
            Tools_dict[tool_name]["provider"],
            Tools_dict[tool_name]["model"],
        )
        return agent

    def retrieve_knowledge(
        self, instruction: str, search_query: str, search_engine: NewTools
    ) -> Tuple[str, List[int], str]:
        """Retrieve knowledge from the web using a search engine tool.

        Args:
            instruction (str): Task instruction.
            search_query (str): Search query to use.
            search_engine (NewTools): Search engine tool to use.

        Returns:
            Tuple[str, List[int], str]: Search results, token usage, and
            cost string.
        """
        search_results, total_tokens, cost_string = search_engine.execute_tool(
            "websearch", {"str_input": instruction + " " + search_query}
        )
        return search_results, total_tokens, cost_string

    def formulate_query(self, instruction: str, observation: Dict) -> Tuple[str, List[int], str]:
        """Formulate a web-search query for the task, with an on-disk cache.

        Args:
            instruction (str): The task instruction.
            observation (Dict): Current observation; may include a
                "screenshot" entry forwarded to the model.

        Returns:
            Tuple[str, List[int], str]: The formulated query, token usage,
            and cost string (zeros/empty on a cache hit).
        """
        query_path = os.path.join(
            self.local_kb_path, self.platform, "formulate_query.json"
        )
        # Load the per-platform query cache; a missing or corrupt file just
        # means an empty cache.
        try:
            with open(query_path, "r") as f:
                query_cache = json.load(f)
        except (OSError, json.JSONDecodeError):
            query_cache = {}

        if instruction in query_cache:
            return query_cache[instruction], [0, 0, 0], ""

        # Start from a clean conversation so earlier tasks cannot leak into
        # the formulated query.
        self.query_formulator.tools["query_formulator"].llm_agent.reset()

        content, total_tokens, cost_string = self.query_formulator.execute_tool("query_formulator", {
            "str_input": f"The task is: {instruction}\n" +
            "To use google search to get some useful information, first carefully analyze " +
            "the screenshot of the current desktop UI state, then given the task " +
            "instruction, formulate a question that can be used to search on the Internet " +
            "for information in helping with the task execution.\n" +
            "The question should not be too general or too specific. Please ONLY provide " +
            "the question.\nQuestion:",
            "img_input": observation["screenshot"] if "screenshot" in observation else None
        })

        search_query = content.strip().replace('"', "")

        # Persist the new query so repeated runs of the same task skip the
        # model call.
        query_cache[instruction] = search_query
        with open(query_path, "w") as f:
            json.dump(query_cache, f, indent=2)

        return search_query, total_tokens, cost_string

    def _retrieve_similar_experience(
        self, memory_path: str, instruction: str
    ) -> Tuple[str, str, List[int], str]:
        """Embedding-similarity lookup shared by both memory types.

        Loads the knowledge base at *memory_path*, embeds any entries (and
        the instruction) that are not yet cached, persists the updated
        embedding cache, and returns the most similar entry — skipping an
        exact self-match.

        Returns:
            Tuple[str, str, List[int], str]: Best-matching key, its stored
            experience, token usage, and cost string; ("None", "None",
            [0, 0, 0], "") when the knowledge base is empty.
        """
        knowledge_base = load_knowledge_base(memory_path)
        if not knowledge_base:
            return "None", "None", [0, 0, 0], ""

        embeddings = load_embeddings(self.embeddings_path)

        total_tokens, cost_string = [0, 0, 0], ""

        # Get or create the instruction embedding.
        instruction_embedding = embeddings.get(instruction)
        if instruction_embedding is None:
            instruction_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool(
                "embedding", {"str_input": instruction}
            )
            embeddings[instruction] = instruction_embedding
            for i in range(len(total_tokens)):
                total_tokens[i] += tokens[i]
            cost_string = cost_string_now

        # Get or create embeddings for every knowledge-base entry,
        # accumulating token usage and cost for each model call.
        candidate_embeddings = []
        for key in knowledge_base:
            candidate_embedding = embeddings.get(key)
            if candidate_embedding is None:
                candidate_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool(
                    "embedding", {"str_input": key}
                )
                for i in range(len(total_tokens)):
                    total_tokens[i] += tokens[i]
                cost_string = CostManager.add_costs(cost_string, cost_string_now)
                embeddings[key] = candidate_embedding
            candidate_embeddings.append(candidate_embedding)

        # Persist newly computed embeddings so future lookups are cache hits.
        save_embeddings(self.embeddings_path, embeddings)

        similarities = cosine_similarity(
            instruction_embedding, np.vstack(candidate_embeddings)
        )[0]
        sorted_indices = np.argsort(similarities)[::-1]

        keys = list(knowledge_base.keys())
        # If the best match is the instruction itself, fall back to the
        # second-best entry.
        idx = 1 if keys[sorted_indices[0]] == instruction else 0
        best_key = keys[sorted_indices[idx]]
        return best_key, knowledge_base[best_key], total_tokens, cost_string

    def retrieve_narrative_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
        """Retrieve the most similar task-level (narrative) experience.

        Args:
            instruction (str): The task instruction.

        Returns:
            Tuple[str, str, List[int], str]: Similar task key, its narrative
            experience, token usage, and cost string.
        """
        return self._retrieve_similar_experience(self.narrative_memory_path, instruction)

    def retrieve_episodic_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
        """Retrieve the most similar subtask-level (episodic) experience.

        Args:
            instruction (str): The task instruction.

        Returns:
            Tuple[str, str, List[int], str]: Similar task key, its episodic
            experience, token usage, and cost string.
        """
        return self._retrieve_similar_experience(self.episodic_memory_path, instruction)

    def knowledge_fusion(
        self,
        observation: Dict,
        instruction: str,
        web_knowledge: str,
        similar_task: str,
        experience: str,
    ) -> Tuple[str, list, str]:
        """Combine web knowledge with similar task experience.

        Asks the fusion agent to merge the web-search result with the
        retrieved experience when the latter is judged useful.

        Returns:
            Tuple[str, list, str]: Fused knowledge text, token usage, and
            cost string.
        """
        content, total_tokens, cost = self.knowledge_fusion_agent.execute_tool("context_fusion", {
            "str_input": f"Task: {instruction}\n" +
            f"**Web search result**:\n{web_knowledge}\n\n" +
            f"**Retrieved similar task experience**:\n" +
            f"Similar task:{similar_task}\n{experience}\n\n" +
            f"Based on the web search result and the retrieved similar task experience, " +
            f"if you think the similar task experience is indeed useful to the main task, " +
            f"integrate it with the web search result. Provide the final knowledge in a numbered list.",
            "img_input": observation["screenshot"] if "screenshot" in observation else None
        })

        return content, total_tokens, cost

    def _save_memory(self, memory_path: str, key: str, trajectory: str, summarize):
        """Summarize and persist a trajectory under *key* in a JSON memory file.

        Shared implementation for episodic and narrative memory. Existing
        entries are never overwritten; *summarize* is only invoked for new
        keys.

        Returns:
            The stored summarization for *key*, or None when saving is
            disabled.
        """
        if not self.save_knowledge:
            return None

        # Treat a missing/unreadable memory file as an empty knowledge base.
        try:
            kb = load_knowledge_base(memory_path)
        except Exception:
            kb = {}

        if key not in kb:
            kb[key] = summarize(trajectory)

        os.makedirs(os.path.dirname(memory_path), exist_ok=True)
        with open(memory_path, "w") as fout:
            json.dump(kb, fout, indent=2)

        return kb.get(key)

    def save_episodic_memory(self, subtask_key: str, subtask_traj: str):
        """Save episodic memory (subtask-level knowledge).

        Args:
            subtask_key (str): Key identifying the subtask.
            subtask_traj (str): Trajectory/experience of the subtask.

        Returns:
            The stored summarization, or None when saving is disabled.
        """
        return self._save_memory(
            self.episodic_memory_path, subtask_key, subtask_traj, self.summarize_episode
        )

    def save_narrative_memory(self, task_key: str, task_traj: str):
        """Save narrative memory (task-level knowledge).

        Args:
            task_key (str): Key identifying the task.
            task_traj (str): Full trajectory/experience of the task.

        Returns:
            The stored summarization, or None when saving is disabled.
        """
        return self._save_memory(
            self.narrative_memory_path, task_key, task_traj, self.summarize_narrative
        )

    def initialize_task_trajectory(self, instruction: str) -> None:
        """Initialize a new task trajectory.

        Args:
            instruction (str): The task instruction.
        """
        self.task_trajectory = f"Task:\n{instruction}"
        self.current_search_query = ""
        self.current_subtask_trajectory = ""

    def update_task_trajectory(self, meta_data: Dict) -> None:
        """Append the latest reflection and plan to the task trajectory.

        Args:
            meta_data (Dict): Metadata from the agent's prediction; must
                contain "reflection" and "executor_plan", and may contain
                "search_query".
        """
        # Capture the search query once, the first time it appears.
        if not self.current_search_query and "search_query" in meta_data:
            self.current_search_query = meta_data["search_query"]

        self.task_trajectory += (
            "\n\nReflection:\n"
            + str(meta_data["reflection"])
            + "\n\n----------------------\n\nPlan:\n"
            + meta_data["executor_plan"]
        )

    def handle_subtask_trajectory(self, meta_data: Dict):
        """Handle subtask trajectory updates based on subtask status.

        Args:
            meta_data (Dict): Metadata containing "subtask_status",
                "subtask", "subtask_info", and "executor_plan".

        Returns:
            bool: True when an in-flight subtask was finalized, else False.
        """
        subtask_status = meta_data["subtask_status"]
        subtask = meta_data["subtask"]
        subtask_info = meta_data["subtask_info"]

        if subtask_status in ["Start", "Done"]:
            # If there's an existing subtask trajectory, finalize it.
            if self.current_subtask_trajectory:
                self.current_subtask_trajectory += "\nSubtask Completed.\n"
                # The text before the first plan separator identifies the
                # subtask and becomes its memory key.
                subtask_key = self.current_subtask_trajectory.split(
                    "\n----------------------\n\nPlan:\n"
                )[0]
                self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)
                self.current_subtask_trajectory = ""
                return True

            # Start a new subtask trajectory.
            self.current_subtask_trajectory = (
                f"Task:\n{self.current_search_query}\n\n"
                f"Subtask: {subtask}\n"
                f"Subtask Instruction: {subtask_info}\n"
                f"----------------------\n\n"
                f'Plan:\n{meta_data["executor_plan"]}\n'
            )
            return False

        elif subtask_status == "In":
            # Continue the current subtask trajectory.
            self.current_subtask_trajectory += (
                f'\n----------------------\n\nPlan:\n{meta_data["executor_plan"]}\n'
            )
            return False

        # Unknown status: nothing was completed.
        return False

    def finalize_task(self) -> None:
        """Finalize the task by saving any remaining trajectories."""
        # Save any remaining subtask trajectory.
        if self.current_subtask_trajectory:
            self.current_subtask_trajectory += "\nSubtask Completed.\n"
            subtask_key = self.current_subtask_trajectory.split(
                "\n----------------------\n\nPlan:\n"
            )[0]
            self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)

        # Save the complete task trajectory, keyed by the search query.
        if self.task_trajectory and self.current_search_query:
            self.save_narrative_memory(self.current_search_query, self.task_trajectory)

        # Reset trajectory state for the next task.
        self.task_trajectory = ""
        self.current_subtask_trajectory = ""
        self.current_search_query = ""

    def summarize_episode(self, trajectory: str) -> Tuple[str, List[int], str]:
        """Summarize an episode experience for lifelong-learning reflection.

        Args:
            trajectory (str): The episode experience to be summarized.

        Returns:
            Tuple[str, List[int], str]: Summary text, token usage, and cost
            string.
        """
        # Create a reflection on the whole trajectory for the next round.
        content, total_tokens, cost = self.episode_summarization_agent.execute_tool(
            "episode_summarization", {"str_input": trajectory}
        )
        return content, total_tokens, cost

    def summarize_narrative(self, trajectory: str) -> Tuple[str, List[int], str]:
        """Summarize a narrative experience for lifelong-learning reflection.

        Args:
            trajectory (str): The narrative experience to be summarized.

        Returns:
            Tuple[str, List[int], str]: Summary text, token usage, and cost
            string.
        """
        # Create a reflection on the whole trajectory for the next round.
        content, total_tokens, cost = self.narrative_summarization_agent.execute_tool(
            "narrative_summarization", {"str_input": trajectory}
        )
        return content, total_tokens, cost
|
||||
Reference in New Issue
Block a user