Add multiple new modules and tools to enhance the functionality and extensibility of the Maestro project (#333)

* Added a **pyproject.toml** file to define project metadata and dependencies.
* Added **run\_maestro.py** and **osworld\_run\_maestro.py** to provide the main execution logic.
* Introduced multiple new modules, including **Evaluator**, **Controller**, **Manager**, and **Sub-Worker**, supporting task planning, state management, and data analysis.
* Added a **tools module** containing utility functions and tool configurations to improve code reusability.
* Updated the **README** and documentation with usage examples and module descriptions.

These changes lay the foundation for expanding the Maestro project’s functionality and improving the user experience.

Co-authored-by: Hiroid <guoliangxuan@deepmatrix.com>
This commit is contained in:
Hiroid
2025-09-08 15:07:21 +08:00
committed by GitHub
parent 029885e78c
commit 3a4b67304f
96 changed files with 31982 additions and 2 deletions

View File

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,566 @@
import base64
import numpy as np
from .engine import (
LMMEngineAnthropic,
LMMEngineAzureOpenAI,
LMMEngineHuggingFace,
LMMEngineOpenAI,
LMMEngineLybic,
LMMEngineOpenRouter,
LMMEnginevLLM,
LMMEngineGemini,
LMMEngineQwen,
LMMEngineDoubao,
LMMEngineDeepSeek,
LMMEngineZhipu,
LMMEngineGroq,
LMMEngineSiliconflow,
LMMEngineMonica,
LMMEngineAWSBedrock,
OpenAIEmbeddingEngine,
GeminiEmbeddingEngine,
AzureOpenAIEmbeddingEngine,
DashScopeEmbeddingEngine,
DoubaoEmbeddingEngine,
JinaEmbeddingEngine,
BochaAISearchEngine,
ExaResearchEngine,
)
class CostManager:
    """Cost manager, responsible for formatting and accumulating API costs.

    Engines are mapped to a currency based on their concrete type; Chinese
    providers bill in CNY, everything else in USD.
    """
    # Chinese engines use CNY.
    # NOTE(review): get_currency_symbol returns an empty string for these
    # engines in the original source — the CNY symbol appears to have been
    # lost in transit (presumably "¥"). Kept as-is because add_costs parses
    # whatever format_cost emits; confirm before changing.
    CNY_ENGINES = {
        LMMEngineQwen, LMMEngineDoubao, LMMEngineDeepSeek, LMMEngineZhipu,
        LMMEngineSiliconflow, DashScopeEmbeddingEngine, DoubaoEmbeddingEngine
    }
    # Other engines use USD
    USD_ENGINES = {
        LMMEngineOpenAI, LMMEngineLybic, LMMEngineAnthropic, LMMEngineAzureOpenAI, LMMEngineGemini,
        LMMEngineOpenRouter, LMMEnginevLLM, LMMEngineHuggingFace, LMMEngineGroq,
        LMMEngineMonica, LMMEngineAWSBedrock, OpenAIEmbeddingEngine,
        GeminiEmbeddingEngine, AzureOpenAIEmbeddingEngine, JinaEmbeddingEngine
    }
    # Symbols recognised when parsing a formatted cost string.
    # Bug fix: the original list contained empty strings (garbled currency
    # characters); "" is a substring of every string, so it shadowed later
    # symbols and made float() raise uncaught ValueError on inputs such as
    # "3.5¥". Empty entries are therefore excluded here.
    _CURRENCY_SYMBOLS = ["$", "¥", "￥", "£"]
    @classmethod
    def get_currency_symbol(cls, engine) -> str:
        """Return the currency symbol for the given engine instance."""
        engine_type = type(engine)
        if engine_type in cls.CNY_ENGINES:
            return ""
        elif engine_type in cls.USD_ENGINES:
            return "$"
        else:
            # Unknown engine types default to USD.
            return "$"
    @classmethod
    def format_cost(cls, cost: float, engine) -> str:
        """Format a numeric cost with the engine's currency symbol appended."""
        currency = cls.get_currency_symbol(engine)
        # NOTE(review): 7 decimal places here vs 6 in add_costs — kept
        # unchanged to avoid altering emitted strings, but the mismatch
        # looks unintentional.
        return f"{cost:.7f}{currency}"
    @classmethod
    def _parse_cost(cls, cost):
        """Parse a cost into ``(value, currency_symbol)``.

        Accepts plain numbers (treated as USD) or strings as produced by
        format_cost. Strings without a recognised symbol keep an empty
        currency (this matches what CNY engines currently emit); strings
        that cannot be parsed at all count as zero.
        """
        if isinstance(cost, (int, float)):
            return float(cost), "$"
        cost_str = str(cost)
        for symbol in cls._CURRENCY_SYMBOLS:
            if symbol in cost_str:
                try:
                    return float(cost_str.replace(symbol, "").strip()), symbol
                except ValueError:
                    # Symbol present but the remainder is not numeric.
                    return 0.0, symbol
        try:
            # No symbol found: bare numeric string. Currency stays empty to
            # preserve the historical behaviour for CNY-formatted costs.
            return float(cost_str), ""
        except ValueError:
            return 0.0, "$"
    @classmethod
    def add_costs(cls, cost1: str, cost2: str) -> str:
        """Add two formatted costs, keeping the first operand's currency.

        A warning is printed when the two operands carry different
        currency symbols; no conversion is attempted.
        """
        value1, currency1 = cls._parse_cost(cost1)
        value2, currency2 = cls._parse_cost(cost2)
        if currency1 != currency2:
            print(f"Warning: Different currencies in cost accumulation: {currency1} and {currency2}")
        currency = currency1
        total_value = value1 + value2
        return f"{total_value:.6f}{currency}"
class LLMAgent:
    """Multimodal chat agent wrapping a concrete LMM engine.

    Keeps a running message history in the OpenAI chat-completions format
    (a list of ``{"role": ..., "content": [typed parts]}`` dicts) and
    delegates text generation to the configured engine.
    """
    def __init__(self, engine_params=None, system_prompt=None, engine=None):
        """Create the agent from a ready engine or from engine_params.

        Args:
            engine_params: Dict whose "engine_type" key selects the backend;
                all keys (including "engine_type") are forwarded to the
                engine constructor.
            system_prompt: Optional system prompt; a generic default is used
                when omitted.
            engine: Pre-constructed engine instance; takes precedence over
                engine_params.

        Raises:
            ValueError: If neither engine nor engine_params is provided, or
                the engine_type is not supported.
        """
        if engine is None:
            if engine_params is not None:
                engine_type = engine_params.get("engine_type")
                if engine_type == "openai":
                    self.engine = LMMEngineOpenAI(**engine_params)
                elif engine_type == "lybic":
                    self.engine = LMMEngineLybic(**engine_params)
                elif engine_type == "anthropic":
                    self.engine = LMMEngineAnthropic(**engine_params)
                elif engine_type == "azure":
                    self.engine = LMMEngineAzureOpenAI(**engine_params)
                elif engine_type == "vllm":
                    self.engine = LMMEnginevLLM(**engine_params)
                elif engine_type == "huggingface":
                    self.engine = LMMEngineHuggingFace(**engine_params)
                elif engine_type == "gemini":
                    self.engine = LMMEngineGemini(**engine_params)
                elif engine_type == "openrouter":
                    self.engine = LMMEngineOpenRouter(**engine_params)
                elif engine_type == "dashscope":
                    # Note: the "dashscope" engine_type maps to the Qwen engine.
                    self.engine = LMMEngineQwen(**engine_params)
                elif engine_type == "doubao":
                    self.engine = LMMEngineDoubao(**engine_params)
                elif engine_type == "deepseek":
                    self.engine = LMMEngineDeepSeek(**engine_params)
                elif engine_type == "zhipu":
                    self.engine = LMMEngineZhipu(**engine_params)
                elif engine_type == "groq":
                    self.engine = LMMEngineGroq(**engine_params)
                elif engine_type == "siliconflow":
                    self.engine = LMMEngineSiliconflow(**engine_params)
                elif engine_type == "monica":
                    self.engine = LMMEngineMonica(**engine_params)
                elif engine_type == "aws_bedrock":
                    self.engine = LMMEngineAWSBedrock(**engine_params)
                else:
                    raise ValueError("engine_type is not supported")
            else:
                raise ValueError("engine_params must be provided")
        else:
            self.engine = engine
        self.messages = []  # Empty messages; the system prompt is added below.
        if system_prompt:
            self.add_system_prompt(system_prompt)
        else:
            self.add_system_prompt("You are a helpful assistant.")
    def encode_image(self, image_content):
        """Base64-encode an image given as a file path (str) or raw bytes."""
        # if image_content is a path to an image file, check type of the image_content to verify
        if isinstance(image_content, str):
            with open(image_content, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode("utf-8")
        else:
            # Assumed to be bytes-like (e.g. raw PNG bytes or an ndarray buffer).
            return base64.b64encode(image_content).decode("utf-8")
    def reset(
        self,
    ):
        """Clear the history, keeping only the current system prompt."""
        self.messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": self.system_prompt}],
            }
        ]
    def add_system_prompt(self, system_prompt):
        """Set (or replace) the system prompt as message index 0."""
        self.system_prompt = system_prompt
        if len(self.messages) > 0:
            # Overwrite the existing system message in place.
            self.messages[0] = {
                "role": "system",
                "content": [{"type": "text", "text": self.system_prompt}],
            }
        else:
            self.messages.append(
                {
                    "role": "system",
                    "content": [{"type": "text", "text": self.system_prompt}],
                }
            )
    def remove_message_at(self, index):
        """Remove a message at a given index"""
        # Out-of-range indices are silently ignored; negative indices pass
        # this check and use Python's negative-index semantics.
        if index < len(self.messages):
            self.messages.pop(index)
    def replace_message_at(
        self, index, text_content, image_content=None, image_detail="high"
    ):
        """Replace a message at a given index"""
        # The role is preserved; the content is rebuilt from scratch with the
        # new text plus an optional image (OpenAI-style image_url part).
        if index < len(self.messages):
            self.messages[index] = {
                "role": self.messages[index]["role"],
                "content": [{"type": "text", "text": text_content}],
            }
            if image_content:
                base64_image = self.encode_image(image_content)
                self.messages[index]["content"].append(
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{base64_image}",
                            "detail": image_detail,
                        },
                    }
                )
    def add_message(
        self,
        text_content,
        image_content=None,
        role=None,
        image_detail="high",
        put_text_last=False,
    ):
        """Add a new message to the list of messages

        Args:
            text_content: The message text.
            image_content: Optional image(s) — a path, bytes, ndarray, or a
                list of those. Encoded as base64 data URLs.
            role: Message role; anything other than "user" is overridden by
                inference from the previous message's role.
            image_detail: OpenAI "detail" hint for image parts.
            put_text_last: If True, move the text part after the image parts.
        """
        # API-style inference from OpenAI and similar services
        # NOTE(review): this tuple includes LMMEngineAnthropic,
        # LMMEngineAWSBedrock and LMMEnginevLLM, so the dedicated Anthropic
        # and vLLM branches further below are unreachable — those providers
        # always receive the OpenAI-style "image_url" payload. Confirm
        # whether that is intended before removing the dead branches.
        if isinstance(
            self.engine,
            (
                LMMEngineAnthropic,
                LMMEngineAzureOpenAI,
                LMMEngineHuggingFace,
                LMMEngineOpenAI,
                LMMEngineLybic,
                LMMEngineOpenRouter,
                LMMEnginevLLM,
                LMMEngineGemini,
                LMMEngineQwen,
                LMMEngineDoubao,
                LMMEngineDeepSeek,
                LMMEngineZhipu,
                LMMEngineGroq,
                LMMEngineSiliconflow,
                LMMEngineMonica,
                LMMEngineAWSBedrock,
            ),
        ):
            # infer role from previous message (alternating user/assistant)
            if role != "user":
                if self.messages[-1]["role"] == "system":
                    role = "user"
                elif self.messages[-1]["role"] == "user":
                    role = "assistant"
                elif self.messages[-1]["role"] == "assistant":
                    role = "user"
            message = {
                "role": role,
                "content": [{"type": "text", "text": text_content}],
            }
            # The explicit ndarray check avoids the ambiguous truth value a
            # bare `if image_content:` would raise for numpy arrays.
            if isinstance(image_content, np.ndarray) or image_content:
                # Check if image_content is a list or a single image
                if isinstance(image_content, list):
                    # If image_content is a list of images, loop through each image
                    for image in image_content:
                        base64_image = self.encode_image(image)
                        message["content"].append(
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{base64_image}",
                                    "detail": image_detail,
                                },
                            }
                        )
                else:
                    # If image_content is a single image, handle it directly
                    base64_image = self.encode_image(image_content)
                    message["content"].append(
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{base64_image}",
                                "detail": image_detail,
                            },
                        }
                    )
            # Rotate text to be the last message if desired
            if put_text_last:
                text_content = message["content"].pop(0)
                message["content"].append(text_content)
            self.messages.append(message)
        # For API-style inference from Anthropic
        # NOTE(review): unreachable — see the note above. Kept for reference;
        # this is the Anthropic-native "image"/"source" payload shape.
        elif isinstance(self.engine, (LMMEngineAnthropic, LMMEngineAWSBedrock)):
            # infer role from previous message
            if role != "user":
                if self.messages[-1]["role"] == "system":
                    role = "user"
                elif self.messages[-1]["role"] == "user":
                    role = "assistant"
                elif self.messages[-1]["role"] == "assistant":
                    role = "user"
            message = {
                "role": role,
                "content": [{"type": "text", "text": text_content}],
            }
            if image_content:
                # Check if image_content is a list or a single image
                if isinstance(image_content, list):
                    # If image_content is a list of images, loop through each image
                    for image in image_content:
                        base64_image = self.encode_image(image)
                        message["content"].append(
                            {
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    "media_type": "image/png",
                                    "data": base64_image,
                                },
                            }
                        )
                else:
                    # If image_content is a single image, handle it directly
                    base64_image = self.encode_image(image_content)
                    message["content"].append(
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": base64_image,
                            },
                        }
                    )
            self.messages.append(message)
        # Locally hosted vLLM model inference
        # NOTE(review): also unreachable — LMMEnginevLLM is matched by the
        # first branch above.
        elif isinstance(self.engine, LMMEnginevLLM):
            # infer role from previous message
            if role != "user":
                if self.messages[-1]["role"] == "system":
                    role = "user"
                elif self.messages[-1]["role"] == "user":
                    role = "assistant"
                elif self.messages[-1]["role"] == "assistant":
                    role = "user"
            message = {
                "role": role,
                "content": [{"type": "text", "text": text_content}],
            }
            if image_content:
                # Check if image_content is a list or a single image
                if isinstance(image_content, list):
                    # If image_content is a list of images, loop through each image
                    for image in image_content:
                        base64_image = self.encode_image(image)
                        message["content"].append(
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image;base64,{base64_image}"
                                },
                            }
                        )
                else:
                    # If image_content is a single image, handle it directly
                    base64_image = self.encode_image(image_content)
                    message["content"].append(
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image;base64,{base64_image}"},
                        }
                    )
            self.messages.append(message)
        else:
            raise ValueError("engine_type is not supported")
    def get_response(
        self,
        user_message=None,
        messages=None,
        temperature=0.0,
        max_new_tokens=None,
        **kwargs,
    ):
        """Generate the next response based on previous messages

        Args:
            user_message: Optional extra user text appended before generating.
            messages: Explicit message list; defaults to this agent's history.
            temperature: Sampling temperature (not forwarded to Lybic).
            max_new_tokens: Generation length cap, forwarded to the engine.

        Returns:
            Tuple of (content, total_tokens, cost_string).
        """
        if messages is None:
            messages = self.messages
        # NOTE(review): when messages defaults to self.messages, appending
        # user_message here permanently mutates the agent's history.
        if user_message:
            messages.append(
                {"role": "user", "content": [{"type": "text", "text": user_message}]}
            )
        # Lybic's generate() does not take a temperature argument.
        if isinstance(self.engine, LMMEngineLybic):
            content, total_tokens, cost = self.engine.generate(
                messages,
                max_new_tokens=max_new_tokens,  # type: ignore
                **kwargs,
            )
        else:
            content, total_tokens, cost = self.engine.generate(
                messages,
                temperature=temperature,
                max_new_tokens=max_new_tokens,  # type: ignore
                **kwargs,
            )
        # Attach the engine's currency symbol to the numeric cost.
        cost_string = CostManager.format_cost(cost, self.engine)
        return content, total_tokens, cost_string
class EmbeddingAgent:
    """Thin wrapper around a text-embedding engine with cost tracking."""
    def __init__(self, engine_params=None, engine=None):
        """Create the agent from a ready engine or from engine_params.

        Raises:
            ValueError: If neither argument is supplied, or engine_type is
                not one of the supported embedding providers.
        """
        if engine is None:
            if engine_params is not None:
                engine_type = engine_params.get("engine_type")
                if engine_type == "openai":
                    self.engine = OpenAIEmbeddingEngine(**engine_params)
                elif engine_type == "gemini":
                    self.engine = GeminiEmbeddingEngine(**engine_params)
                elif engine_type == "azure":
                    self.engine = AzureOpenAIEmbeddingEngine(**engine_params)
                elif engine_type == "dashscope":
                    self.engine = DashScopeEmbeddingEngine(**engine_params)
                elif engine_type == "doubao":
                    self.engine = DoubaoEmbeddingEngine(**engine_params)
                elif engine_type == "jina":
                    self.engine = JinaEmbeddingEngine(**engine_params)
                else:
                    raise ValueError(f"Embedding engine type '{engine_type}' is not supported")
            else:
                raise ValueError("engine_params must be provided")
        else:
            self.engine = engine
    def get_embeddings(self, text):
        """Get embeddings for the given text
        Args:
            text (str): The text to get embeddings for
        Returns:
            tuple: (embeddings, total_tokens, cost_string) — the embedding
            vector plus the engine-reported token usage and the formatted
            cost (currency symbol appended by CostManager).
        """
        embeddings, total_tokens, cost = self.engine.get_embeddings(text)
        cost_string = CostManager.format_cost(cost, self.engine)
        return embeddings, total_tokens, cost_string
    def get_similarity(self, text1, text2):
        """Calculate the cosine similarity between two texts
        Args:
            text1 (str): First text
            text2 (str): Second text
        Returns:
            tuple: (similarity, total_tokens, total_cost) — cosine similarity
            score plus combined token usage and accumulated cost.
        """
        embeddings1, tokens1, cost1 = self.get_embeddings(text1)
        embeddings2, tokens2, cost2 = self.get_embeddings(text2)
        # Calculate cosine similarity
        dot_product = np.dot(embeddings1, embeddings2)
        norm1 = np.linalg.norm(embeddings1)
        norm2 = np.linalg.norm(embeddings2)
        similarity = dot_product / (norm1 * norm2)
        # NOTE(review): tokens are added as scalars here, while
        # batch_get_embeddings below indexes them as a 3-element list —
        # the two methods assume different engine return shapes; confirm
        # what engine.get_embeddings actually returns.
        total_tokens = tokens1 + tokens2
        total_cost = CostManager.add_costs(cost1, cost2)
        return similarity, total_tokens, total_cost
    def batch_get_embeddings(self, texts):
        """Get embeddings for multiple texts
        Args:
            texts (List[str]): List of texts to get embeddings for
        Returns:
            tuple: (embeddings, total_tokens, total_cost) — one embedding
            per input text, accumulated token counts, and accumulated cost.
        """
        embeddings = []
        # Accumulator assumes tokens come back as a 3-element sequence
        # (presumably prompt/completion/total) — see NOTE in get_similarity.
        total_tokens = [0, 0, 0]
        if texts:
            # First text initialises the running cost string.
            first_embedding, first_tokens, first_cost = self.get_embeddings(texts[0])
            embeddings.append(first_embedding)
            total_tokens[0] += first_tokens[0]
            total_tokens[1] += first_tokens[1]
            total_tokens[2] += first_tokens[2]
            total_cost = first_cost
            for text in texts[1:]:
                embedding, tokens, cost = self.get_embeddings(text)
                embeddings.append(embedding)
                total_tokens[0] += tokens[0]
                total_tokens[1] += tokens[1]
                total_tokens[2] += tokens[2]
                total_cost = CostManager.add_costs(total_cost, cost)
        else:
            # Empty input: zero cost in the engine's currency.
            currency = CostManager.get_currency_symbol(self.engine)
            total_cost = f"0.0{currency}"
        return embeddings, total_tokens, total_cost
class WebSearchAgent:
    """Facade over web-search backends (Bocha AI search, Exa research)."""
    def __init__(self, engine_params=None, engine=None):
        """Create the agent from a ready engine or from engine_params.

        Args:
            engine_params: Dict whose "engine_type" selects "bocha" or "exa";
                all keys are forwarded to the engine constructor.
            engine: Pre-constructed engine instance; takes precedence.

        Raises:
            ValueError: If neither argument is supplied or the engine_type
                is not supported.
        """
        if engine is None:
            if engine_params is not None:
                self.engine_type = engine_params.get("engine_type")
                if self.engine_type == "bocha":
                    self.engine = BochaAISearchEngine(**engine_params)
                elif self.engine_type == "exa":
                    self.engine = ExaResearchEngine(**engine_params)
                else:
                    raise ValueError(f"Web search engine type '{self.engine_type}' is not supported")
            else:
                raise ValueError("engine_params must be provided")
        else:
            self.engine = engine
            # Bug fix: previously engine_type was never set on this path, so
            # the fallback branch of get_answer raised AttributeError instead
            # of the intended ValueError.
            self.engine_type = None
    def get_answer(self, query, **kwargs):
        """Get a direct answer for the query
        Args:
            query (str): The search query
            **kwargs: Additional arguments to pass to the search engine
        Returns:
            tuple: (answer_text, tokens, cost_string)
        """
        if isinstance(self.engine, BochaAISearchEngine):
            answer, tokens, cost = self.engine.get_answer(query, **kwargs)
            return answer, tokens, str(cost)
        elif isinstance(self.engine, ExaResearchEngine):
            # For Exa, we'll use the chat_research method which returns a complete answer
            # results, tokens, cost = self.engine.search(query, **kwargs)
            results, tokens, cost = self.engine.chat_research(query, **kwargs)
            # Prefer the "answer"-typed message when Exa returns a message list.
            if isinstance(results, dict) and "messages" in results:
                for message in results.get("messages", []):
                    if message.get("type") == "answer":
                        return message.get("content", ""), tokens, str(cost)
            return str(results), tokens, str(cost)
        else:
            raise ValueError(f"Web search engine type '{self.engine_type}' is not supported")

View File

@@ -0,0 +1,385 @@
# Supported Model Providers and Model Lists
## LLM Model Providers
### 1. OpenAI
**Provider**
- `openai`
**Supported Models:**
- `gpt-5` Window: 400,000 Max Output Tokens: 128,000
- `gpt-5-mini` Window: 400,000 Max Output Tokens: 128,000
- `gpt-5-nano` Window: 400,000 Max Output Tokens: 128,000
- `gpt-4.1` Window: 1,047,576 Max Output Tokens: 32,768
- `gpt-4.1-mini` Window: 1,047,576 Max Output Tokens: 32,768
- `gpt-4.1-nano` Window: 1,047,576 Max Output Tokens: 32,768
- `gpt-4o` Window: 128,000 Max Output Tokens: 16,384
- `gpt-4o-mini` Window: 128,000 Max Output Tokens: 16,384
- `o1` Window: 200,000 Max Output Tokens: 100,000
- `o1-pro` Window: 200,000 Max Output Tokens: 100,000
- `o1-mini` Window: 200,000 Max Output Tokens: 100,000
- `o3` Window: 200,000 Max Output Tokens: 100,000
- `o3-pro` Window: 200,000 Max Output Tokens: 100,000
- `o3-mini` Window: 200,000 Max Output Tokens: 100,000
- `o4-mini` Window: 200,000 Max Output Tokens: 100,000
**Embedding Models:**
- `text-embedding-3-small`
- `text-embedding-3-large`
- `text-embedding-ada-002`
📚 **Reference Link:** <https://platform.openai.com/docs/pricing>
---
### 2. Anthropic Claude
**Provider**
- `anthropic`
**Supported Models:**
- `claude-opus-4-1-20250805` Context window: 200K Max output: 32000
- `claude-opus-4-20250514` Context window: 200K Max output: 32000
- `claude-sonnet-4-20250514` Context window: 200K Max output: 64000
- `claude-3-7-sonnet-20250219` Context window: 200K Max output: 64000
- `claude-3-5-sonnet-20240620` Context window: 200K Max output: 64000
- `claude-3-5-haiku-20241022` Context window: 200K Max output: 8192
📚 **Reference Link:** <https://www.anthropic.com/api>
---
### 3. AWS Bedrock
**Provider**
- `bedrock`
**Supported Claude Models:**
- `Claude-Opus-4`
- `Claude-Sonnet-4`
- `Claude-Sonnet-3.7`
- `Claude-Sonnet-3.5`
📚 **Reference Link:** <https://aws.amazon.com/bedrock/>
---
### 4. Google Gemini
**Provider**
- `gemini`
**Supported Models:**
- `gemini-2.5-pro` in: 1,048,576 out: 65536
- `gemini-2.5-flash` in: 1,048,576 out: 65536
- `gemini-2.0-flash` in: 1,048,576 out: 8192
- `gemini-1.5-pro` in: 2,097,152 out: 8192
- `gemini-1.5-flash` in: 1,048,576 out: 8192
**Embedding Models:**
- `gemini-embedding-001`
📚 **Reference Link:** <https://ai.google.dev/gemini-api/docs/pricing>
---
### 5. Groq
**Provider**
- `groq`
**Supported Models:**
- `Kimi-K2-Instruct`
- `Llama-4-Scout-17B-16E-Instruct`
- `Llama-4-Maverick-17B-128E-Instruct`
- `Llama-Guard-4-12B`
- `DeepSeek-R1-Distill-Llama-70B`
- `Qwen3-32B`
- `Llama-3.3-70B-Instruct`
📚 **Reference Link:** <https://groq.com/pricing>
---
### 6. Monica (Proxy Platform)
**Provider**
- `monica`
**OpenAI Models:**
- `gpt-4.1`
- `gpt-4.1-mini`
- `gpt-4.1-nano`
- `gpt-4o-2024-11-20`
- `gpt-4o-mini-2024-07-18`
- `o4-mini`
- `o3`
**Anthropic Claude Models:**
- `claude-opus-4-20250514`
- `claude-sonnet-4-20250514`
- `claude-3-7-sonnet-latest`
- `claude-3-5-sonnet-20241022`
- `claude-3-5-sonnet-20240620`
- `claude-3-5-haiku-20241022`
**Google Gemini Models:**
- `gemini-2.5-pro-preview-03-25`
- `gemini-2.5-flash-lite`
- `gemini-2.5-flash-preview-05-20`
- `gemini-2.0-flash-001`
- `gemini-1.5-pro-002`
- `gemini-1.5-flash-002`
**DeepSeek Models:**
- `deepseek-reasoner`
- `deepseek-chat`
**Meta Llama Models:**
- `Llama-4-Scout-17B-16E-Instruct` Context length: 10M tokens
- `Llama-4-Maverick-17B-128E-Instruct` Context length: 1M tokens
- `llama-3.3-70b-instruct`
- `llama-3-70b-instruct`
- `llama-3.1-405b-instruct`
**xAI Grok Models:**
- `grok-3-beta`
- `grok-beta`
📚 **Reference Link:** <https://platform.monica.im/docs/en/models-and-pricing>
---
### 7. OpenRouter (Proxy Platform)
**Provider**
- `openrouter`
**OpenAI Models:**
- `gpt-4.1`
- `gpt-4.1-mini`
- `o1`
- `o1-pro`
- `o1-mini`
- `o3`
- `o3-pro`
- `o3-mini`
- `o4-mini`
**xAI Grok Models:**
- `grok-4` Total Context: 256K Max Output: 256K
- `grok-3`
- `grok-3-mini`
**Anthropic Claude Models:**
- `claude-opus-4`
- `claude-sonnet-4`
**Google Gemini Models:**
- `gemini-2.5-flash`
- `gemini-2.5-pro`
📚 **Reference Link:** <https://openrouter.ai/models>
---
### 8. Azure OpenAI
**Provider**
- `azure`
**Supported Models:**
- `gpt-4.1`
- `gpt-4.1-mini`
- `gpt-4.1-nano`
- `o1`
- `o3`
- `o4-mini`
📚 **Reference Link:** <https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/>
---
### 9. Lybic AI
**Provider:**
- `lybic`
**Supported Models:**
- `gpt-5`
- `gpt-4.1`
- `gpt-4.1-mini`
- `gpt-4.1-nano`
- `gpt-4.5-preview`
- `gpt-4o`
- `gpt-4o-realtime-preview`
- `gpt-4o-mini`
- `o1`
- `o1-pro`
- `o1-mini`
- `o3`
- `o3-pro`
- `o3-mini`
- `o4-mini`
**Note:** Lybic AI provides OpenAI-compatible API endpoints with the same model names and pricing structure.
📚 **Reference Link:** <https://aigw.lybicai.com/>
---
### 10. DeepSeek
**Provider**
- `deepseek`
**Supported Models:**
- `deepseek-chat` Context length: 128K, Output length: Default 4K, Max 8K
- `deepseek-reasoner` Context length: 128K, Output length: Default 32K, Max 64K
📚 **Reference Link:** <https://platform.deepseek.com/>
---
### 11. Alibaba Cloud Qwen
**Supported Models:**
- `qwen-max-latest` Context window: 32,768 Max input token length: 30,720 Max generation token length: 8,192
- `qwen-plus-latest` Context window: 131,072 Max input token length: 98,304 (thinking) Max generation token length: 129,024 Max output: 16,384
- `qwen-turbo-latest` Context window: 1,000,000 Max input token length: 1,000,000 Max generation token length: 16,384
- `qwen-vl-max-latest` (Grounding) Context window: 131,072 Max input token length: 129,024 Max generation token length: 8,192
- `qwen-vl-plus-latest` (Grounding) Context window: 131,072 Max input token length: 129,024 Max generation token length: 8,192
**Embedding Models:**
- `text-embedding-v4`
- `text-embedding-v3`
📚 **Reference Link:** <https://bailian.console.aliyun.com/?tab=doc#/doc/?type=model&url=https%3A%2F%2Fhelp.aliyun.com%2Fdocument_detail%2F2840914.html&renderType=iframe>
---
### 12. ByteDance Doubao
**Supported Models:**
- `doubao-seed-1-6-flash-250615` Context window: 256k Max input token length: 224k Max generation token length: 32k Max thinking content token length: 32k
- `doubao-seed-1-6-thinking-250715` Context window: 256k Max input token length: 224k Max generation token length: 32k Max thinking content token length: 32k
- `doubao-seed-1-6-250615` Context window: 256k Max input token length: 224k Max generation token length: 32k Max thinking content token length: 32k
- `doubao-1.5-vision-pro-250328` (Grounding) Context window: 128k Max input token length: 96k Max generation token length: 16k Max thinking content token length: 32k
- `doubao-1-5-thinking-vision-pro-250428` (Grounding) Context window: 128k Max input token length: 96k Max generation token length: 16k Max thinking content token length: 32k
- `doubao-1-5-ui-tars-250428` (Grounding) Context window: 128k Max input token length: 96k Max generation token length: 16k Max thinking content token length: 32k
**Embedding Models:**
- `doubao-embedding-large-text-250515`
- `doubao-embedding-text-240715`
📚 **Reference Link:** <https://console.volcengine.com/ark/region:ark+cn-beijing/model?vendor=Bytedance&view=LIST_VIEW>
---
### 13. Zhipu GLM
**Supported Models:**
- `GLM-4.5` Max in: 128k Max output: 0.2K
- `GLM-4.5-X` Max in: 128k Max output: 0.2K
- `GLM-4.5-Air` Max in: 128k Max output: 0.2K
- `GLM-4-Plus`
- `GLM-4-Air-250414`
- `GLM-4-AirX` (Grounding)
- `GLM-4V-Plus-0111` (Grounding)
**Embedding Models:**
- `Embedding-3`
- `Embedding-2`
📚 **Reference Link:** <https://open.bigmodel.cn/pricing>
---
### 14. SiliconFlow
**Supported Models:**
- `Kimi-K2-Instruct` Context Length: 128K
- `DeepSeek-V3`
- `DeepSeek-R1`
- `Qwen3-32B`
📚 **Reference Link:** <https://cloud.siliconflow.cn/sft-d1pi8rbk20jc73c62gm0/models>
---
## 🔤 Dedicated Embedding Providers
### 15. Jina AI
**Embedding Models:**
- `jina-embeddings-v4`
- `jina-embeddings-v3`
📚 **Reference Link:** <https://jina.ai/embeddings>
---
## 🔍 AI Search Engines
### 16. Bocha AI
**Service Type:** AI Research & Search
📚 **Reference Link:** <https://open.bochaai.com/overview>
---
### 17. Exa
**Service Type:** AI Research & Search
**Pricing Model:**
- $5.00 / 1k agent searches
- $5.00 / 1k exa-research agent page reads
- $10.00 / 1k exa-research-pro agent page reads
- $5.00 / 1M reasoning tokens
📚 **Reference Link:** <https://dashboard.exa.ai/home>

View File

@@ -0,0 +1,194 @@
{
"llm_models": {
"openai": {
"gpt-4.1": {"input": "2.00$", "output": "8.00$"},
"gpt-4.1-mini": {"input": "0.40$", "output": "1.60$"},
"gpt-4.1-nano": {"input": "0.10$", "output": "0.40$"},
"gpt-4.5-preview": {"input": "75$", "output": "150$"},
"gpt-4o": {"input": "2.5$", "output": "10$"},
"gpt-4o-realtime-preview": {"input": "5$", "output": "20$"},
"gpt-4o-mini": {"input": "0.15$", "output": "0.6$"},
"o1": {"input": "15$", "output": "60$"},
"o1-pro": {"input": "150$", "output": "600$"},
"o1-mini": {"input": "1.10$", "output": "4.40$"},
"o3": {"input": "2.0$", "output": "8$"},
"o3-pro": {"input": "20$", "output": "80$"},
"o3-mini": {"input": "1.10$", "output": "4.40$"},
"o4-mini": {"input": "1.1$", "output": "4.40$"}
},
"anthropic": {
"claude-opus-4-20250514": {"input": "15$", "output": "75$"},
"claude-sonnet-4-20250514": {"input": "3$", "output": "15$"},
"claude-3-7-sonnet-20250219": {"input": "3$", "output": "15$"},
"claude-3-5-sonnet-20241022": {"input": "3$", "output": "15$"},
"claude-3-5-haiku-20241022": {"input": "0.8$", "output": "4$"}
},
"qwen": {
"qwen-max-latest": {"input": "2.4¥", "output": "9.6¥"},
"qwen-plus-latest": {"input": "0.8¥", "output": "2¥"},
"qwen-turbo-latest": {"input": "0.3¥", "output": "0.6¥"},
"qwen-vl-max-latest": {"input": "3¥", "output": "9¥"},
"qwen-vl-plus-latest": {"input": "1.5¥", "output": "4.5¥"}
},
"doubao": {
"doubao-seed-1-6-flash-250615": {"input": "0.15¥", "output": "1.50¥"},
"doubao-seed-1-6-thinking-250715": {"input": "0.8¥", "output": "8¥"},
"doubao-seed-1-6-250615": {"input": "0.8¥", "output": "2¥"},
"doubao-1.5-vision-pro-250328": {"input": "3¥", "output": "9¥"},
"doubao-1-5-thinking-vision-pro-250428": {"input": "3¥", "output": "9¥"},
"doubao-1-5-ui-tars-250428": {"input": "3.5¥", "output": "12¥"}
},
"deepseek": {
"deepseek-chat": {"input": "2¥", "output": "8¥"},
"deepseek-reasoner": {"input": "4¥", "output": "16¥"}
},
"zhipu": {
"GLM-4.5": {"input": "4¥", "output": "16¥"},
"GLM-4.5V": {"input": "4¥", "output": "12¥"},
"GLM-4-Plus": {"input": "5¥", "output": "5¥"},
"GLM-4-Air-250414": {"input": "0.5¥", "output": "0.5¥"},
"GLM-4-AirX": {"input": "10¥", "output": "10¥"},
"GLM-4V-Plus-0111": {"input": "4¥", "output": "4¥"}
},
"groq": {
"Kimi-K2-Instruct": {"input": "1.00$", "output": "3.00$"},
"Llama-4-Scout-17B-16E-Instruct": {"input": "0.11$", "output": "0.34$"},
"Llama-4-Maverick-17B-128E-Instruct": {"input": "0.20$", "output": "0.60$"},
"Llama-Guard-4-12B": {"input": "0.20$", "output": "0.20$"},
"DeepSeek-R1-Distill-Llama-70B": {"input": "0.75$", "output": "0.99$"},
"Qwen3-32B": {"input": "0.29$", "output": "0.59$"},
"Llama-3.3-70B-Instruct": {"input": "0.59$", "output": "0.79$"}
},
"siliconflow": {
"Kimi-K2-Instruct": {"input": "4¥", "output": "16¥"},
"DeepSeek-V3": {"input": "2¥", "output": "8¥"},
"DeepSeek-R1": {"input": "4¥", "output": "16¥"},
"Qwen3-32B": {"input": "1¥", "output": "4¥"}
},
"monica": {
"gpt-4.1": {"input": "2.00$", "output": "8.00$"},
"gpt-4.1-mini": {"input": "0.40$", "output": "1.60$"},
"gpt-4.1-nano": {"input": "0.10$", "output": "0.40$"},
"gpt-4o-2024-11-20": {"input": "2.50$", "output": "10.00$"},
"gpt-4o-mini-2024-07-18": {"input": "0.15$", "output": "0.60$"},
"o4-mini": {"input": "0.55$", "output": "2.20$"},
"o3": {"input": "2.00$", "output": "8.00$"},
"claude-opus-4-20250514": {"input": "15.00$", "output": "75.00$"},
"claude-sonnet-4-20250514": {"input": "3.00$", "output": "15.00$"},
"claude-3-7-sonnet-latest": {"input": "3.00$", "output": "15.00$"},
"claude-3-5-sonnet-20241022": {"input": "3.00$", "output": "15.00$"},
"claude-3-5-sonnet-20240620": {"input": "3.00$", "output": "15.00$"},
"claude-3-5-haiku-20241022": {"input": "0.80$", "output": "4.00$"},
"claude-3-opus-20240229": {"input": "15.00$", "output": "75.00$"},
"claude-3-haiku-20240307": {"input": "0.25$", "output": "1.25$"},
"gemini-2.5-pro-preview-03-25": {"input": "1.25$", "output": "10.00$"},
"gemini-2.5-flash-lite": {"input": "0.10$", "output": "0.40$"},
"gemini-2.5-flash-preview-05-20": {"input": "0.30$", "output": "2.50$"},
"gemini-2.0-flash-001": {"input": "0.10$", "output": "0.40$"},
"gemini-1.5-pro-002": {"input": "1.25$", "output": "5.00$"},
"gemini-1.5-flash-002": {"input": "0.075$", "output": "0.30$"},
"deepseek-reasoner": {"input": "0.55$", "output": "2.21$"},
"deepseek-chat": {"input": "0.28$", "output": "1.10$"},
"llama-3-8b-instruct": {"input": "0.28$", "output": "0.83$"},
"llama-3.1-8b-instruct": {"input": "0.025$", "output": "0.06$"},
"llama-3.3-70b-instruct": {"input": "0.13$", "output": "0.40$"},
"llama-3-70b-instruct": {"input": "0.88$", "output": "0.88$"},
"llama-3.1-405b-instruct": {"input": "4.00$", "output": "4.00$"},
"grok-3-beta": {"input": "3.00$", "output": "15.00$"},
"grok-beta": {"input": "5.00$", "output": "15.00$"}
},
"gemini": {
"gemini-2.5-pro": {"input": "1.25$", "output": "10$"},
"gemini-2.5-flash": {"input": "0.30$", "output": "2.50$"},
"gemini-2.0-flash": {"input": "0.10$", "output": "0.40$"},
"gemini-1.5-pro": {"input": "1.25$", "output": "5$"},
"gemini-1.5-flash": {"input": "0.075$", "output": "0.30$"}
},
"bedrock": {
"Claude-Opus-4": {"input": "15.00$", "output": "75.00$"},
"Claude-Sonnet-4": {"input": "3.00$", "output": "15.00$"},
"Claude-Sonnet-3.7": {"input": "3.00$", "output": "15.00$"},
"Claude-Sonnet-3.5": {"input": "3.00$", "output": "15.00$"}
},
"openrouter": {
"openai/gpt-5": {"input": "1.25$", "output": "10$"},
"openai/gpt-5-chat": {"input": "1.25$", "output": "10$"},
"openai/gpt-4.1": {"input": "2$", "output": "8$"},
"openai/gpt-4.1-mini": {"input": "0.4$", "output": "1.6$"},
"openai/o1": {"input": "15$", "output": "60$"},
"openai/o1-pro": {"input": "150$", "output": "600$"},
"openai/o1-mini": {"input": "1.1$", "output": "4.4$"},
"openai/o3": {"input": "2$", "output": "8$"},
"openai/o3-pro": {"input": "20$", "output": "80$"},
"openai/o3-mini": {"input": "1.1$", "output": "4.4$"},
"openai/o4-mini": {"input": "1.1$", "output": "4.4$"},
"x-ai/grok-4": {"input": "3$", "output": "15$"},
"x-ai/grok-3": {"input": "3$", "output": "15$"},
"x-ai/grok-3-mini": {"input": "0.3$", "output": "0.5$"},
"anthropic/claude-opus-4": {"input": "15$", "output": "75$"},
"anthropic/claude-sonnet-4": {"input": "3$", "output": "15$"},
"google/gemini-2.5-flash": {"input": "0.3$", "output": "2.5$"},
"google/gemini-2.5-pro": {"input": "1.25$", "output": "10$"}
},
"azure": {
"gpt-4.1": {"input": "2.00$", "output": "8.00$"},
"gpt-4.1-mini": {"input": "0.40$", "output": "1.60$"},
"gpt-4.1-nano": {"input": "0.10$", "output": "0.40$"},
"o1": {"input": "15$", "output": "60$"},
"o3": {"input": "2.0$", "output": "8$"},
"o4-mini": {"input": "1.1$", "output": "4.40$"}
},
"lybic": {
"gpt-5": {"input": "1.25$", "output": "10$"},
"gpt-4.1": {"input": "2.00$", "output": "8.00$"},
"gpt-4.1-mini": {"input": "0.40$", "output": "1.60$"},
"gpt-4.1-nano": {"input": "0.10$", "output": "0.40$"},
"gpt-4.5-preview": {"input": "75$", "output": "150$"},
"gpt-4o": {"input": "2.5$", "output": "10$"},
"gpt-4o-realtime-preview": {"input": "5$", "output": "20$"},
"gpt-4o-mini": {"input": "0.15$", "output": "0.6$"},
"o1": {"input": "15$", "output": "60$"},
"o1-pro": {"input": "150$", "output": "600$"},
"o1-mini": {"input": "1.10$", "output": "4.40$"},
"o3": {"input": "2.0$", "output": "8$"},
"o3-pro": {"input": "20$", "output": "80$"},
"o3-mini": {"input": "1.10$", "output": "4.40$"},
"o4-mini": {"input": "1.1$", "output": "4.40$"}
}
},
"embedding_models": {
"openai": {
"text-embedding-3-small": {"input": "0.02$", "output": ""},
"text-embedding-3-large": {"input": "0.13$", "output": ""},
"text-embedding-ada-002": {"input": "0.10$", "output": ""}
},
"qwen": {
"text-embedding-v4": {"input": "0.0005¥", "output": ""},
"text-embedding-v3": {"input": "0.0005¥", "output": ""}
},
"doubao": {
"doubao-embedding-large-text-250515": {"input": "0.7¥", "output": ""},
"doubao-embedding-text-240715": {"input": "0.5¥", "output": ""}
},
"zhipu": {
"Embedding-3": {"input": "0.5¥", "output": ""},
"Embedding-2": {"input": "0.5¥", "output": ""}
},
"jina": {
"jina-embeddings-v4": {"input": "0.05$", "output": ""},
"jina-embeddings-v3": {"input": "0.05$", "output": ""}
},
"gemini": {
"gemini-embedding-001": {"input": "0.15$", "output": ""}
}
},
"search_models": {
"bocha": {
"ai-search": {"cost_type": "balance", "unit": "per_query"}
},
"exa": {
"search": {"cost_type": "direct", "unit": "per_query"},
"research": {"cost_type": "direct", "unit": "per_task"}
}
}
}

View File

@@ -0,0 +1,481 @@
import json
import os
from typing import Dict, Tuple, List
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from ..utils.common_utils import (
load_embeddings,
load_knowledge_base,
save_embeddings,
)
from ..tools.new_tools import NewTools
from .mllm import CostManager
# Known embedding output dimensions, keyed by model name.  Used to build a
# per-model embeddings cache filename so vectors from different models (which
# have different dimensions) are never mixed in one file.
# NOTE(review): "doubao-embedding-large-text-250515" maps to a *smaller* dim
# (2048) than "doubao-embedding-text-240715" (2560) — preserved from the
# original table; confirm against the provider's docs.
_EMBEDDING_DIMS = {
    "doubao-embedding-large-text-250515": 2048,
    "doubao-embedding-text-240715": 2560,
    "text-embedding-ada-002": 1536,
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
    "gemini-embedding-001": 3072,
    "jina-embeddings-v4": 2048,
    "jina-embeddings-v3": 1024,
    "text-embedding-v4": 1024,
    "text-embedding-v3": 1024,
    "embedding-2": 2048,
    "embedding-3": 2048,
}


def get_embedding_dim(model_name):
    """Return the embedding vector dimension for ``model_name``.

    Args:
        model_name: Name of the embedding model.

    Returns:
        int | None: The model's output dimension, or None for unknown models
        (callers embed the value in a cache filename, so None is tolerated).
    """
    return _EMBEDDING_DIMS.get(model_name)
class NewKnowledgeBase:
def __init__(
self,
embedding_engine: NewTools,
local_kb_path: str,
platform: str,
Tools_dict: Dict,
save_knowledge: bool = True,
):
"""
Initialize the KnowledgeBase module
Args:
embedding_engine: Embedding engine instance
local_kb_path: Path to local knowledge base
platform: Target platform (Windows/Darwin/Ubuntu)
Tools_dict: Dictionary containing tool configurations
save_knowledge: Whether to save knowledge embeddings
"""
self.platform = platform
self.local_kb_path = local_kb_path
# initialize embedding engine
self.embedding_engine = embedding_engine
# Initialize paths for different memory types
self.episodic_memory_path = os.path.join(
self.local_kb_path, self.platform, "episodic_memory.json"
)
self.narrative_memory_path = os.path.join(
self.local_kb_path, self.platform, "narrative_memory.json"
)
embedding_model_name = ""
if hasattr(self.embedding_engine, "tools") and "embedding" in self.embedding_engine.tools:
embedding_model_name = self.embedding_engine.tools["embedding"].model_name
else:
embedding_model_name = "default"
embedding_dim = get_embedding_dim(embedding_model_name)
self.embeddings_path = os.path.join(
self.local_kb_path, self.platform, f"embeddings_{embedding_model_name}_{embedding_dim}.pkl"
)
# Initialize trajectory tracking
self.task_trajectory = ""
self.current_subtask_trajectory = ""
self.current_search_query = ""
# query_formulator
self.query_formulator_name = "query_formulator"
self.query_formulator = NewTools()
self.query_formulator.register_tool(
self.query_formulator_name,
Tools_dict[self.query_formulator_name]["provider"],
Tools_dict[self.query_formulator_name]["model"],
)
# knowledge_fusion_agent
self.knowledge_fusion_agent_name = "context_fusion"
self.knowledge_fusion_agent = NewTools()
self.knowledge_fusion_agent.register_tool(
self.knowledge_fusion_agent_name,
Tools_dict[self.knowledge_fusion_agent_name]["provider"],
Tools_dict[self.knowledge_fusion_agent_name]["model"],
)
# narrative_summarization_agent
self.narrative_summarization_agent_name = "narrative_summarization"
self.narrative_summarization_agent = NewTools()
self.narrative_summarization_agent.register_tool(
self.narrative_summarization_agent_name,
Tools_dict[self.narrative_summarization_agent_name]["provider"],
Tools_dict[self.narrative_summarization_agent_name]["model"],
)
# episode_summarization_agent
self.episode_summarization_agent_name = "episode_summarization"
self.episode_summarization_agent = NewTools()
self.episode_summarization_agent.register_tool(
self.episode_summarization_agent_name,
Tools_dict[self.episode_summarization_agent_name]["provider"],
Tools_dict[self.episode_summarization_agent_name]["model"],
)
self.save_knowledge = save_knowledge
def retrieve_knowledge(
self, instruction: str, search_query: str, search_engine: NewTools
) -> Tuple[str, List[int], str]:
"""Retrieve knowledge using search engine
Args:
instruction (str): task instruction
search_query (str): search query to use
search_engine (NewTools): search engine tool to use
Returns:
Tuple[str, List[int], float]: The search results, token usage, and cost
"""
search_results, total_tokens, cost_string = search_engine.execute_tool("websearch", {"str_input": instruction + " " + search_query})
return search_results, total_tokens, cost_string
def formulate_query(self, instruction: str, observation: Dict) -> Tuple[str, List[int], str]:
"""Formulate search query based on instruction and current state
Args:
instruction (str): The task instruction
observation (Dict): Current observation including screenshot
Returns:
Tuple[str, List[int], float]: The formulated query, token usage, and cost
"""
query_path = os.path.join(
self.local_kb_path, self.platform, "formulate_query.json"
)
try:
with open(query_path, "r") as f:
formulate_query = json.load(f)
except:
formulate_query = {}
if instruction in formulate_query:
return formulate_query[instruction], [0, 0, 0], ""
self.query_formulator.tools["query_formulator"].llm_agent.reset()
content, total_tokens, cost_string = self.query_formulator.execute_tool("query_formulator", {
"str_input": f"The task is: {instruction}\n" +
"To use google search to get some useful information, first carefully analyze " +
"the screenshot of the current desktop UI state, then given the task " +
"instruction, formulate a question that can be used to search on the Internet " +
"for information in helping with the task execution.\n" +
"The question should not be too general or too specific. Please ONLY provide " +
"the question.\nQuestion:",
"img_input": observation["screenshot"] if "screenshot" in observation else None
})
search_query = content.strip().replace('"', "")
# print("search query: ", search_query)
formulate_query[instruction] = search_query
with open(query_path, "w") as f:
json.dump(formulate_query, f, indent=2)
return search_query, total_tokens, cost_string
def retrieve_narrative_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
"""Retrieve narrative experience using embeddings
Args:
instruction (str): The task instruction
Returns:
Tuple[str, str]: The similar task key and its narrative experience
"""
knowledge_base = load_knowledge_base(self.narrative_memory_path)
if not knowledge_base:
return "None", "None", [0, 0, 0], ""
embeddings = load_embeddings(self.embeddings_path)
# Get or create instruction embedding
instruction_embedding = embeddings.get(instruction)
total_tokens, cost_string = [0, 0, 0], ""
if instruction_embedding is None:
instruction_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": instruction})
embeddings[instruction] = instruction_embedding
# total_tokens += tokens
for i in range(len(total_tokens)):
total_tokens[i] += tokens[i]
cost_string = cost_string_now
# Get or create embeddings for knowledge base entries
candidate_embeddings = []
for key in knowledge_base:
candidate_embedding = embeddings.get(key)
if candidate_embedding is None:
candidate_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": key})
for i in range(len(tokens)):
total_tokens[i] += tokens[i]
# total_tokens += tokens
cost_string = CostManager.add_costs(cost_string, cost_string_now)
embeddings[key] = candidate_embedding
candidate_embeddings.append(candidate_embedding)
save_embeddings(self.embeddings_path, embeddings)
similarities = cosine_similarity(
instruction_embedding, np.vstack(candidate_embeddings)
)[0]
sorted_indices = np.argsort(similarities)[::-1]
keys = list(knowledge_base.keys())
idx = 1 if keys[sorted_indices[0]] == instruction else 0
return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]], total_tokens, cost_string
def retrieve_episodic_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
"""Retrieve similar task experience using embeddings
Args:
instruction (str): The task instruction
Returns:
Tuple[str, str]: The similar task key and its episodic experience
"""
knowledge_base = load_knowledge_base(self.episodic_memory_path)
if not knowledge_base:
return "None", "None", [0, 0, 0], ""
embeddings = load_embeddings(self.embeddings_path)
# Get or create instruction embedding
instruction_embedding = embeddings.get(instruction)
total_tokens, cost_string = [0, 0, 0], ""
if instruction_embedding is None:
instruction_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": instruction})
embeddings[instruction] = instruction_embedding
# total_tokens += tokens
for i in range(len(total_tokens)):
total_tokens[i] += tokens[i]
cost_string = cost_string_now
# Get or create embeddings for knowledge base entries
candidate_embeddings = []
for key in knowledge_base:
candidate_embedding = embeddings.get(key)
if candidate_embedding is None:
candidate_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": key})
# total_tokens += tokens
for i in range(len(total_tokens)):
total_tokens[i] += tokens[i]
cost_string = CostManager.add_costs(cost_string, cost_string_now)
embeddings[key] = candidate_embedding
candidate_embeddings.append(candidate_embedding)
save_embeddings(self.embeddings_path, embeddings)
similarities = cosine_similarity(
instruction_embedding, np.vstack(candidate_embeddings)
)[0]
sorted_indices = np.argsort(similarities)[::-1]
keys = list(knowledge_base.keys())
idx = 1 if keys[sorted_indices[0]] == instruction else 0
return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]], total_tokens, cost_string
def knowledge_fusion(
self,
observation: Dict,
instruction: str,
web_knowledge: str,
similar_task: str,
experience: str,
) -> Tuple[str, list, str]:
"""Combine web knowledge with similar task experience"""
content, total_tokens, cost = self.knowledge_fusion_agent.execute_tool("context_fusion", {
"str_input": f"Task: {instruction}\n" +
f"**Web search result**:\n{web_knowledge}\n\n" +
f"**Retrieved similar task experience**:\n" +
f"Similar task:{similar_task}\n{experience}\n\n" +
f"Based on the web search result and the retrieved similar task experience, " +
f"if you think the similar task experience is indeed useful to the main task, " +
f"integrate it with the web search result. Provide the final knowledge in a numbered list.",
"img_input": observation["screenshot"] if "screenshot" in observation else None
})
return content, total_tokens, cost
def save_episodic_memory(self, subtask_key: str, subtask_traj: str) -> None:
"""Save episodic memory (subtask level knowledge).
Args:
subtask_key (str): Key identifying the subtask
subtask_traj (str): Trajectory/experience of the subtask
"""
if not self.save_knowledge:
return
try:
kb = load_knowledge_base(self.episodic_memory_path)
except:
kb = {}
if subtask_key not in kb:
subtask_summarization = self.summarize_episode(subtask_traj)
kb[subtask_key] = subtask_summarization
if self.save_knowledge:
os.makedirs(os.path.dirname(self.episodic_memory_path), exist_ok=True)
with open(self.episodic_memory_path, "w") as fout:
json.dump(kb, fout, indent=2)
return kb.get(subtask_key)
def save_narrative_memory(self, task_key: str, task_traj: str) -> None:
"""Save narrative memory (task level knowledge).
Args:
task_key (str): Key identifying the task
task_traj (str): Full trajectory/experience of the task
"""
if not self.save_knowledge:
return
try:
kb = load_knowledge_base(self.narrative_memory_path)
except:
kb = {}
if task_key not in kb:
task_summarization = self.summarize_narrative(task_traj)
kb[task_key] = task_summarization
if self.save_knowledge:
os.makedirs(os.path.dirname(self.narrative_memory_path), exist_ok=True)
with open(self.narrative_memory_path, "w") as fout:
json.dump(kb, fout, indent=2)
return kb.get(task_key)
def initialize_task_trajectory(self, instruction: str) -> None:
"""Initialize a new task trajectory.
Args:
instruction (str): The task instruction
"""
self.task_trajectory = f"Task:\n{instruction}"
self.current_search_query = ""
self.current_subtask_trajectory = ""
def update_task_trajectory(self, meta_data: Dict) -> None:
"""Update the task trajectory with new metadata.
Args:
meta_data (Dict): Metadata from the agent's prediction
"""
if not self.current_search_query and "search_query" in meta_data:
self.current_search_query = meta_data["search_query"]
self.task_trajectory += (
"\n\nReflection:\n"
+ str(meta_data["reflection"])
+ "\n\n----------------------\n\nPlan:\n"
+ meta_data["executor_plan"]
)
    def handle_subtask_trajectory(self, meta_data: Dict):
        """Handle subtask trajectory updates based on subtask status.

        On "Start"/"Done": if a subtask trajectory is already in progress it
        is finalized and saved to episodic memory, and True is returned
        WITHOUT starting the next subtask's trajectory; otherwise a new
        subtask trajectory is begun and False is returned.  On "In" the
        current trajectory is extended and False is returned.

        Args:
            meta_data (Dict): Metadata containing "subtask_status",
                "subtask", "subtask_info" and "executor_plan"

        Returns:
            bool: Whether the previous subtask was completed and saved.
                NOTE(review): for any status other than "Start"/"Done"/"In"
                the method implicitly returns None — confirm callers only
                pass these three statuses.
        """
        subtask_status = meta_data["subtask_status"]
        subtask = meta_data["subtask"]
        subtask_info = meta_data["subtask_info"]
        if subtask_status in ["Start", "Done"]:
            # If there's an existing subtask trajectory, finalize it.
            # The episodic-memory key is everything before the first Plan
            # separator (i.e. the Task/Subtask header block).
            if self.current_subtask_trajectory:
                self.current_subtask_trajectory += "\nSubtask Completed.\n"
                subtask_key = self.current_subtask_trajectory.split(
                    "\n----------------------\n\nPlan:\n"
                )[0]
                self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)
                self.current_subtask_trajectory = ""
                return True
            # Start new subtask trajectory, headed by the current search
            # query and the subtask's name/instruction.
            self.current_subtask_trajectory = (
                f"Task:\n{self.current_search_query}\n\n"
                f"Subtask: {subtask}\n"
                f"Subtask Instruction: {subtask_info}\n"
                f"----------------------\n\n"
                f'Plan:\n{meta_data["executor_plan"]}\n'
            )
            return False
        elif subtask_status == "In":
            # Continue current subtask trajectory with this step's plan.
            self.current_subtask_trajectory += (
                f'\n----------------------\n\nPlan:\n{meta_data["executor_plan"]}\n'
            )
            return False
def finalize_task(self) -> None:
"""Finalize the task by saving any remaining trajectories."""
# Save any remaining subtask trajectory
if self.current_subtask_trajectory:
self.current_subtask_trajectory += "\nSubtask Completed.\n"
subtask_key = self.current_subtask_trajectory.split(
"\n----------------------\n\nPlan:\n"
)[0]
self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)
# Save the complete task trajectory
if self.task_trajectory and self.current_search_query:
self.save_narrative_memory(self.current_search_query, self.task_trajectory)
# Reset trajectories
self.task_trajectory = ""
self.current_subtask_trajectory = ""
self.current_search_query = ""
def summarize_episode(self, trajectory: str) -> Tuple[str, List[int], str]:
"""Summarize the episode experience for lifelong learning reflection
Args:
trajectory (str): The episode experience to be summarized
Returns:
str: The summarized episode experience
"""
# Create Reflection on whole trajectories for next round trial, keep earlier messages as exemplars
content, total_tokens, cost = self.episode_summarization_agent.execute_tool("episode_summarization", {"str_input": trajectory})
return content, total_tokens, cost
def summarize_narrative(self, trajectory: str) -> Tuple[str, List[int], str]:
"""Summarize the narrative experience for lifelong learning reflection
Args:
trajectory (str): The narrative experience to be summarized
Returns:
str: The summarized narrative experience
"""
# Create Reflection on whole trajectories for next round trial
content, total_tokens, cost = self.narrative_summarization_agent.execute_tool("narrative_summarization", {"str_input": trajectory})
return content, total_tokens, cost