Add multiple new modules and tools to enhance the functionality and extensibility of the Maestro project (#333)

* Added a **pyproject.toml** file to define project metadata and dependencies.
* Added **run\_maestro.py** and **osworld\_run\_maestro.py** to provide the main execution logic.
* Introduced multiple new modules, including **Evaluator**, **Controller**, **Manager**, and **Sub-Worker**, supporting task planning, state management, and data analysis.
* Added a **tools module** containing utility functions and tool configurations to improve code reusability.
* Updated the **README** and documentation with usage examples and module descriptions.

These changes lay the foundation for expanding the Maestro project’s functionality and improving the user experience.

Co-authored-by: Hiroid <guoliangxuan@deepmatrix.com>
This commit is contained in:
Hiroid
2025-09-08 15:07:21 +08:00
committed by GitHub
parent 029885e78c
commit 3a4b67304f
96 changed files with 31982 additions and 2 deletions

View File

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,566 @@
import base64
import numpy as np
from .engine import (
LMMEngineAnthropic,
LMMEngineAzureOpenAI,
LMMEngineHuggingFace,
LMMEngineOpenAI,
LMMEngineLybic,
LMMEngineOpenRouter,
LMMEnginevLLM,
LMMEngineGemini,
LMMEngineQwen,
LMMEngineDoubao,
LMMEngineDeepSeek,
LMMEngineZhipu,
LMMEngineGroq,
LMMEngineSiliconflow,
LMMEngineMonica,
LMMEngineAWSBedrock,
OpenAIEmbeddingEngine,
GeminiEmbeddingEngine,
AzureOpenAIEmbeddingEngine,
DashScopeEmbeddingEngine,
DoubaoEmbeddingEngine,
JinaEmbeddingEngine,
BochaAISearchEngine,
ExaResearchEngine,
)
class CostManager:
    """Cost manager, responsible for formatting and accumulating API costs.

    Engines are mapped to a currency based on their concrete type; Chinese
    providers bill in CNY, everything else in USD.
    """
    # Chinese engines use CNY.
    # NOTE(review): get_currency_symbol returns an empty string for these
    # engines in the original source — the CNY symbol appears to have been
    # lost in transit (presumably "¥"). Kept as-is because add_costs parses
    # whatever format_cost emits; confirm before changing.
    CNY_ENGINES = {
        LMMEngineQwen, LMMEngineDoubao, LMMEngineDeepSeek, LMMEngineZhipu,
        LMMEngineSiliconflow, DashScopeEmbeddingEngine, DoubaoEmbeddingEngine
    }
    # Other engines use USD
    USD_ENGINES = {
        LMMEngineOpenAI, LMMEngineLybic, LMMEngineAnthropic, LMMEngineAzureOpenAI, LMMEngineGemini,
        LMMEngineOpenRouter, LMMEnginevLLM, LMMEngineHuggingFace, LMMEngineGroq,
        LMMEngineMonica, LMMEngineAWSBedrock, OpenAIEmbeddingEngine,
        GeminiEmbeddingEngine, AzureOpenAIEmbeddingEngine, JinaEmbeddingEngine
    }
    # Symbols recognised when parsing a formatted cost string.
    # Bug fix: the original list contained empty strings (garbled currency
    # characters); "" is a substring of every string, so it shadowed later
    # symbols and made float() raise uncaught ValueError on inputs such as
    # "3.5¥". Empty entries are therefore excluded here.
    _CURRENCY_SYMBOLS = ["$", "¥", "￥", "£"]
    @classmethod
    def get_currency_symbol(cls, engine) -> str:
        """Return the currency symbol for the given engine instance."""
        engine_type = type(engine)
        if engine_type in cls.CNY_ENGINES:
            return ""
        elif engine_type in cls.USD_ENGINES:
            return "$"
        else:
            # Unknown engine types default to USD.
            return "$"
    @classmethod
    def format_cost(cls, cost: float, engine) -> str:
        """Format a numeric cost with the engine's currency symbol appended."""
        currency = cls.get_currency_symbol(engine)
        # NOTE(review): 7 decimal places here vs 6 in add_costs — kept
        # unchanged to avoid altering emitted strings, but the mismatch
        # looks unintentional.
        return f"{cost:.7f}{currency}"
    @classmethod
    def _parse_cost(cls, cost):
        """Parse a cost into ``(value, currency_symbol)``.

        Accepts plain numbers (treated as USD) or strings as produced by
        format_cost. Strings without a recognised symbol keep an empty
        currency (this matches what CNY engines currently emit); strings
        that cannot be parsed at all count as zero.
        """
        if isinstance(cost, (int, float)):
            return float(cost), "$"
        cost_str = str(cost)
        for symbol in cls._CURRENCY_SYMBOLS:
            if symbol in cost_str:
                try:
                    return float(cost_str.replace(symbol, "").strip()), symbol
                except ValueError:
                    # Symbol present but the remainder is not numeric.
                    return 0.0, symbol
        try:
            # No symbol found: bare numeric string. Currency stays empty to
            # preserve the historical behaviour for CNY-formatted costs.
            return float(cost_str), ""
        except ValueError:
            return 0.0, "$"
    @classmethod
    def add_costs(cls, cost1: str, cost2: str) -> str:
        """Add two formatted costs, keeping the first operand's currency.

        A warning is printed when the two operands carry different
        currency symbols; no conversion is attempted.
        """
        value1, currency1 = cls._parse_cost(cost1)
        value2, currency2 = cls._parse_cost(cost2)
        if currency1 != currency2:
            print(f"Warning: Different currencies in cost accumulation: {currency1} and {currency2}")
        currency = currency1
        total_value = value1 + value2
        return f"{total_value:.6f}{currency}"
class LLMAgent:
    """Multimodal chat agent wrapping a concrete LMM engine.

    Keeps a running message history in the OpenAI chat-completions format
    (a list of ``{"role": ..., "content": [typed parts]}`` dicts) and
    delegates text generation to the configured engine.
    """
    def __init__(self, engine_params=None, system_prompt=None, engine=None):
        """Create the agent from a ready engine or from engine_params.

        Args:
            engine_params: Dict whose "engine_type" key selects the backend;
                all keys (including "engine_type") are forwarded to the
                engine constructor.
            system_prompt: Optional system prompt; a generic default is used
                when omitted.
            engine: Pre-constructed engine instance; takes precedence over
                engine_params.

        Raises:
            ValueError: If neither engine nor engine_params is provided, or
                the engine_type is not supported.
        """
        if engine is None:
            if engine_params is not None:
                engine_type = engine_params.get("engine_type")
                if engine_type == "openai":
                    self.engine = LMMEngineOpenAI(**engine_params)
                elif engine_type == "lybic":
                    self.engine = LMMEngineLybic(**engine_params)
                elif engine_type == "anthropic":
                    self.engine = LMMEngineAnthropic(**engine_params)
                elif engine_type == "azure":
                    self.engine = LMMEngineAzureOpenAI(**engine_params)
                elif engine_type == "vllm":
                    self.engine = LMMEnginevLLM(**engine_params)
                elif engine_type == "huggingface":
                    self.engine = LMMEngineHuggingFace(**engine_params)
                elif engine_type == "gemini":
                    self.engine = LMMEngineGemini(**engine_params)
                elif engine_type == "openrouter":
                    self.engine = LMMEngineOpenRouter(**engine_params)
                elif engine_type == "dashscope":
                    # Note: the "dashscope" engine_type maps to the Qwen engine.
                    self.engine = LMMEngineQwen(**engine_params)
                elif engine_type == "doubao":
                    self.engine = LMMEngineDoubao(**engine_params)
                elif engine_type == "deepseek":
                    self.engine = LMMEngineDeepSeek(**engine_params)
                elif engine_type == "zhipu":
                    self.engine = LMMEngineZhipu(**engine_params)
                elif engine_type == "groq":
                    self.engine = LMMEngineGroq(**engine_params)
                elif engine_type == "siliconflow":
                    self.engine = LMMEngineSiliconflow(**engine_params)
                elif engine_type == "monica":
                    self.engine = LMMEngineMonica(**engine_params)
                elif engine_type == "aws_bedrock":
                    self.engine = LMMEngineAWSBedrock(**engine_params)
                else:
                    raise ValueError("engine_type is not supported")
            else:
                raise ValueError("engine_params must be provided")
        else:
            self.engine = engine
        self.messages = []  # Empty messages; the system prompt is added below.
        if system_prompt:
            self.add_system_prompt(system_prompt)
        else:
            self.add_system_prompt("You are a helpful assistant.")
    def encode_image(self, image_content):
        """Base64-encode an image given as a file path (str) or raw bytes."""
        # if image_content is a path to an image file, check type of the image_content to verify
        if isinstance(image_content, str):
            with open(image_content, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode("utf-8")
        else:
            # Assumed to be bytes-like (e.g. raw PNG bytes or an ndarray buffer).
            return base64.b64encode(image_content).decode("utf-8")
    def reset(
        self,
    ):
        """Clear the history, keeping only the current system prompt."""
        self.messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": self.system_prompt}],
            }
        ]
    def add_system_prompt(self, system_prompt):
        """Set (or replace) the system prompt as message index 0."""
        self.system_prompt = system_prompt
        if len(self.messages) > 0:
            # Overwrite the existing system message in place.
            self.messages[0] = {
                "role": "system",
                "content": [{"type": "text", "text": self.system_prompt}],
            }
        else:
            self.messages.append(
                {
                    "role": "system",
                    "content": [{"type": "text", "text": self.system_prompt}],
                }
            )
    def remove_message_at(self, index):
        """Remove a message at a given index"""
        # Out-of-range indices are silently ignored; negative indices pass
        # this check and use Python's negative-index semantics.
        if index < len(self.messages):
            self.messages.pop(index)
    def replace_message_at(
        self, index, text_content, image_content=None, image_detail="high"
    ):
        """Replace a message at a given index"""
        # The role is preserved; the content is rebuilt from scratch with the
        # new text plus an optional image (OpenAI-style image_url part).
        if index < len(self.messages):
            self.messages[index] = {
                "role": self.messages[index]["role"],
                "content": [{"type": "text", "text": text_content}],
            }
            if image_content:
                base64_image = self.encode_image(image_content)
                self.messages[index]["content"].append(
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{base64_image}",
                            "detail": image_detail,
                        },
                    }
                )
    def add_message(
        self,
        text_content,
        image_content=None,
        role=None,
        image_detail="high",
        put_text_last=False,
    ):
        """Add a new message to the list of messages

        Args:
            text_content: The message text.
            image_content: Optional image(s) — a path, bytes, ndarray, or a
                list of those. Encoded as base64 data URLs.
            role: Message role; anything other than "user" is overridden by
                inference from the previous message's role.
            image_detail: OpenAI "detail" hint for image parts.
            put_text_last: If True, move the text part after the image parts.
        """
        # API-style inference from OpenAI and similar services
        # NOTE(review): this tuple includes LMMEngineAnthropic,
        # LMMEngineAWSBedrock and LMMEnginevLLM, so the dedicated Anthropic
        # and vLLM branches further below are unreachable — those providers
        # always receive the OpenAI-style "image_url" payload. Confirm
        # whether that is intended before removing the dead branches.
        if isinstance(
            self.engine,
            (
                LMMEngineAnthropic,
                LMMEngineAzureOpenAI,
                LMMEngineHuggingFace,
                LMMEngineOpenAI,
                LMMEngineLybic,
                LMMEngineOpenRouter,
                LMMEnginevLLM,
                LMMEngineGemini,
                LMMEngineQwen,
                LMMEngineDoubao,
                LMMEngineDeepSeek,
                LMMEngineZhipu,
                LMMEngineGroq,
                LMMEngineSiliconflow,
                LMMEngineMonica,
                LMMEngineAWSBedrock,
            ),
        ):
            # infer role from previous message (alternating user/assistant)
            if role != "user":
                if self.messages[-1]["role"] == "system":
                    role = "user"
                elif self.messages[-1]["role"] == "user":
                    role = "assistant"
                elif self.messages[-1]["role"] == "assistant":
                    role = "user"
            message = {
                "role": role,
                "content": [{"type": "text", "text": text_content}],
            }
            # The explicit ndarray check avoids the ambiguous truth value a
            # bare `if image_content:` would raise for numpy arrays.
            if isinstance(image_content, np.ndarray) or image_content:
                # Check if image_content is a list or a single image
                if isinstance(image_content, list):
                    # If image_content is a list of images, loop through each image
                    for image in image_content:
                        base64_image = self.encode_image(image)
                        message["content"].append(
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{base64_image}",
                                    "detail": image_detail,
                                },
                            }
                        )
                else:
                    # If image_content is a single image, handle it directly
                    base64_image = self.encode_image(image_content)
                    message["content"].append(
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{base64_image}",
                                "detail": image_detail,
                            },
                        }
                    )
            # Rotate text to be the last message if desired
            if put_text_last:
                text_content = message["content"].pop(0)
                message["content"].append(text_content)
            self.messages.append(message)
        # For API-style inference from Anthropic
        # NOTE(review): unreachable — see the note above. Kept for reference;
        # this is the Anthropic-native "image"/"source" payload shape.
        elif isinstance(self.engine, (LMMEngineAnthropic, LMMEngineAWSBedrock)):
            # infer role from previous message
            if role != "user":
                if self.messages[-1]["role"] == "system":
                    role = "user"
                elif self.messages[-1]["role"] == "user":
                    role = "assistant"
                elif self.messages[-1]["role"] == "assistant":
                    role = "user"
            message = {
                "role": role,
                "content": [{"type": "text", "text": text_content}],
            }
            if image_content:
                # Check if image_content is a list or a single image
                if isinstance(image_content, list):
                    # If image_content is a list of images, loop through each image
                    for image in image_content:
                        base64_image = self.encode_image(image)
                        message["content"].append(
                            {
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    "media_type": "image/png",
                                    "data": base64_image,
                                },
                            }
                        )
                else:
                    # If image_content is a single image, handle it directly
                    base64_image = self.encode_image(image_content)
                    message["content"].append(
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": base64_image,
                            },
                        }
                    )
            self.messages.append(message)
        # Locally hosted vLLM model inference
        # NOTE(review): also unreachable — LMMEnginevLLM is matched by the
        # first branch above.
        elif isinstance(self.engine, LMMEnginevLLM):
            # infer role from previous message
            if role != "user":
                if self.messages[-1]["role"] == "system":
                    role = "user"
                elif self.messages[-1]["role"] == "user":
                    role = "assistant"
                elif self.messages[-1]["role"] == "assistant":
                    role = "user"
            message = {
                "role": role,
                "content": [{"type": "text", "text": text_content}],
            }
            if image_content:
                # Check if image_content is a list or a single image
                if isinstance(image_content, list):
                    # If image_content is a list of images, loop through each image
                    for image in image_content:
                        base64_image = self.encode_image(image)
                        message["content"].append(
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image;base64,{base64_image}"
                                },
                            }
                        )
                else:
                    # If image_content is a single image, handle it directly
                    base64_image = self.encode_image(image_content)
                    message["content"].append(
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image;base64,{base64_image}"},
                        }
                    )
            self.messages.append(message)
        else:
            raise ValueError("engine_type is not supported")
    def get_response(
        self,
        user_message=None,
        messages=None,
        temperature=0.0,
        max_new_tokens=None,
        **kwargs,
    ):
        """Generate the next response based on previous messages

        Args:
            user_message: Optional extra user text appended before generating.
            messages: Explicit message list; defaults to this agent's history.
            temperature: Sampling temperature (not forwarded to Lybic).
            max_new_tokens: Generation length cap, forwarded to the engine.

        Returns:
            Tuple of (content, total_tokens, cost_string).
        """
        if messages is None:
            messages = self.messages
        # NOTE(review): when messages defaults to self.messages, appending
        # user_message here permanently mutates the agent's history.
        if user_message:
            messages.append(
                {"role": "user", "content": [{"type": "text", "text": user_message}]}
            )
        # Lybic's generate() does not take a temperature argument.
        if isinstance(self.engine, LMMEngineLybic):
            content, total_tokens, cost = self.engine.generate(
                messages,
                max_new_tokens=max_new_tokens,  # type: ignore
                **kwargs,
            )
        else:
            content, total_tokens, cost = self.engine.generate(
                messages,
                temperature=temperature,
                max_new_tokens=max_new_tokens,  # type: ignore
                **kwargs,
            )
        # Attach the engine's currency symbol to the numeric cost.
        cost_string = CostManager.format_cost(cost, self.engine)
        return content, total_tokens, cost_string
class EmbeddingAgent:
    """Thin wrapper around a text-embedding engine with cost tracking."""
    def __init__(self, engine_params=None, engine=None):
        """Create the agent from a ready engine or from engine_params.

        Raises:
            ValueError: If neither argument is supplied, or engine_type is
                not one of the supported embedding providers.
        """
        if engine is None:
            if engine_params is not None:
                engine_type = engine_params.get("engine_type")
                if engine_type == "openai":
                    self.engine = OpenAIEmbeddingEngine(**engine_params)
                elif engine_type == "gemini":
                    self.engine = GeminiEmbeddingEngine(**engine_params)
                elif engine_type == "azure":
                    self.engine = AzureOpenAIEmbeddingEngine(**engine_params)
                elif engine_type == "dashscope":
                    self.engine = DashScopeEmbeddingEngine(**engine_params)
                elif engine_type == "doubao":
                    self.engine = DoubaoEmbeddingEngine(**engine_params)
                elif engine_type == "jina":
                    self.engine = JinaEmbeddingEngine(**engine_params)
                else:
                    raise ValueError(f"Embedding engine type '{engine_type}' is not supported")
            else:
                raise ValueError("engine_params must be provided")
        else:
            self.engine = engine
    def get_embeddings(self, text):
        """Get embeddings for the given text
        Args:
            text (str): The text to get embeddings for
        Returns:
            tuple: (embeddings, total_tokens, cost_string) — the embedding
            vector plus the engine-reported token usage and the formatted
            cost (currency symbol appended by CostManager).
        """
        embeddings, total_tokens, cost = self.engine.get_embeddings(text)
        cost_string = CostManager.format_cost(cost, self.engine)
        return embeddings, total_tokens, cost_string
    def get_similarity(self, text1, text2):
        """Calculate the cosine similarity between two texts
        Args:
            text1 (str): First text
            text2 (str): Second text
        Returns:
            tuple: (similarity, total_tokens, total_cost) — cosine similarity
            score plus combined token usage and accumulated cost.
        """
        embeddings1, tokens1, cost1 = self.get_embeddings(text1)
        embeddings2, tokens2, cost2 = self.get_embeddings(text2)
        # Calculate cosine similarity
        dot_product = np.dot(embeddings1, embeddings2)
        norm1 = np.linalg.norm(embeddings1)
        norm2 = np.linalg.norm(embeddings2)
        similarity = dot_product / (norm1 * norm2)
        # NOTE(review): tokens are added as scalars here, while
        # batch_get_embeddings below indexes them as a 3-element list —
        # the two methods assume different engine return shapes; confirm
        # what engine.get_embeddings actually returns.
        total_tokens = tokens1 + tokens2
        total_cost = CostManager.add_costs(cost1, cost2)
        return similarity, total_tokens, total_cost
    def batch_get_embeddings(self, texts):
        """Get embeddings for multiple texts
        Args:
            texts (List[str]): List of texts to get embeddings for
        Returns:
            tuple: (embeddings, total_tokens, total_cost) — one embedding
            per input text, accumulated token counts, and accumulated cost.
        """
        embeddings = []
        # Accumulator assumes tokens come back as a 3-element sequence
        # (presumably prompt/completion/total) — see NOTE in get_similarity.
        total_tokens = [0, 0, 0]
        if texts:
            # First text initialises the running cost string.
            first_embedding, first_tokens, first_cost = self.get_embeddings(texts[0])
            embeddings.append(first_embedding)
            total_tokens[0] += first_tokens[0]
            total_tokens[1] += first_tokens[1]
            total_tokens[2] += first_tokens[2]
            total_cost = first_cost
            for text in texts[1:]:
                embedding, tokens, cost = self.get_embeddings(text)
                embeddings.append(embedding)
                total_tokens[0] += tokens[0]
                total_tokens[1] += tokens[1]
                total_tokens[2] += tokens[2]
                total_cost = CostManager.add_costs(total_cost, cost)
        else:
            # Empty input: zero cost in the engine's currency.
            currency = CostManager.get_currency_symbol(self.engine)
            total_cost = f"0.0{currency}"
        return embeddings, total_tokens, total_cost
class WebSearchAgent:
    """Facade over web-search backends (Bocha AI search, Exa research)."""
    def __init__(self, engine_params=None, engine=None):
        """Create the agent from a ready engine or from engine_params.

        Args:
            engine_params: Dict whose "engine_type" selects "bocha" or "exa";
                all keys are forwarded to the engine constructor.
            engine: Pre-constructed engine instance; takes precedence.

        Raises:
            ValueError: If neither argument is supplied or the engine_type
                is not supported.
        """
        if engine is None:
            if engine_params is not None:
                self.engine_type = engine_params.get("engine_type")
                if self.engine_type == "bocha":
                    self.engine = BochaAISearchEngine(**engine_params)
                elif self.engine_type == "exa":
                    self.engine = ExaResearchEngine(**engine_params)
                else:
                    raise ValueError(f"Web search engine type '{self.engine_type}' is not supported")
            else:
                raise ValueError("engine_params must be provided")
        else:
            self.engine = engine
            # Bug fix: previously engine_type was never set on this path, so
            # the fallback branch of get_answer raised AttributeError instead
            # of the intended ValueError.
            self.engine_type = None
    def get_answer(self, query, **kwargs):
        """Get a direct answer for the query
        Args:
            query (str): The search query
            **kwargs: Additional arguments to pass to the search engine
        Returns:
            tuple: (answer_text, tokens, cost_string)
        """
        if isinstance(self.engine, BochaAISearchEngine):
            answer, tokens, cost = self.engine.get_answer(query, **kwargs)
            return answer, tokens, str(cost)
        elif isinstance(self.engine, ExaResearchEngine):
            # For Exa, we'll use the chat_research method which returns a complete answer
            # results, tokens, cost = self.engine.search(query, **kwargs)
            results, tokens, cost = self.engine.chat_research(query, **kwargs)
            # Prefer the "answer"-typed message when Exa returns a message list.
            if isinstance(results, dict) and "messages" in results:
                for message in results.get("messages", []):
                    if message.get("type") == "answer":
                        return message.get("content", ""), tokens, str(cost)
            return str(results), tokens, str(cost)
        else:
            raise ValueError(f"Web search engine type '{self.engine_type}' is not supported")

View File

@@ -0,0 +1,385 @@
# Supported Model Providers and Model Lists
## LLM Model Providers
### 1. OpenAI
**Provider**
- `openai`
**Supported Models:**
- `gpt-5` Window: 400,000 Max Output Tokens: 128,000
- `gpt-5-mini` Window: 400,000 Max Output Tokens: 128,000
- `gpt-5-nano` Window: 400,000 Max Output Tokens: 128,000
- `gpt-4.1` Window: 1,047,576 Max Output Tokens: 32,768
- `gpt-4.1-mini` Window: 1,047,576 Max Output Tokens: 32,768
- `gpt-4.1-nano` Window: 1,047,576 Max Output Tokens: 32,768
- `gpt-4o` Window: 128,000 Max Output Tokens: 16,384
- `gpt-4o-mini` Window: 128,000 Max Output Tokens: 16,384
- `o1` Window: 200,000 Max Output Tokens: 100,000
- `o1-pro` Window: 200,000 Max Output Tokens: 100,000
- `o1-mini` Window: 200,000 Max Output Tokens: 100,000
- `o3` Window: 200,000 Max Output Tokens: 100,000
- `o3-pro` Window: 200,000 Max Output Tokens: 100,000
- `o3-mini` Window: 200,000 Max Output Tokens: 100,000
- `o4-mini` Window: 200,000 Max Output Tokens: 100,000
**Embedding Models:**
- `text-embedding-3-small`
- `text-embedding-3-large`
- `text-embedding-ada-002`
📚 **Reference Link:** <https://platform.openai.com/docs/pricing>
---
### 2. Anthropic Claude
**Provider**
- `anthropic`
**Supported Models:**
- `claude-opus-4-1-20250805` Context window: 200K Max output: 32000
- `claude-opus-4-20250514` Context window: 200K Max output: 32000
- `claude-sonnet-4-20250514` Context window: 200K Max output: 64000
- `claude-3-7-sonnet-20250219` Context window: 200K Max output: 64000
- `claude-3-5-sonnet-20240620` Context window: 200K Max output: 64000
- `claude-3-5-haiku-20241022` Context window: 200K Max output: 8192
📚 **Reference Link:** <https://www.anthropic.com/api>
---
### 3. AWS Bedrock
**Provider**
- `bedrock`
**Supported Claude Models:**
- `Claude-Opus-4`
- `Claude-Sonnet-4`
- `Claude-Sonnet-3.7`
- `Claude-Sonnet-3.5`
📚 **Reference Link:** <https://aws.amazon.com/bedrock/>
---
### 4. Google Gemini
**Provider**
- `gemini`
**Supported Models:**
- `gemini-2.5-pro` in: 1,048,576 out: 65536
- `gemini-2.5-flash` in: 1,048,576 out: 65536
- `gemini-2.0-flash` in: 1,048,576 out: 8192
- `gemini-1.5-pro` in: 2,097,152 out: 8192
- `gemini-1.5-flash` in: 1,048,576 out: 8192
**Embedding Models:**
- `gemini-embedding-001`
📚 **Reference Link:** <https://ai.google.dev/gemini-api/docs/pricing>
---
### 5. Groq
**Provider**
- `groq`
**Supported Models:**
- `Kimi-K2-Instruct`
- `Llama-4-Scout-17B-16E-Instruct`
- `Llama-4-Maverick-17B-128E-Instruct`
- `Llama-Guard-4-12B`
- `DeepSeek-R1-Distill-Llama-70B`
- `Qwen3-32B`
- `Llama-3.3-70B-Instruct`
📚 **Reference Link:** <https://groq.com/pricing>
---
### 6. Monica (Proxy Platform)
**Provider**
- `monica`
**OpenAI Models:**
- `gpt-4.1`
- `gpt-4.1-mini`
- `gpt-4.1-nano`
- `gpt-4o-2024-11-20`
- `gpt-4o-mini-2024-07-18`
- `o4-mini`
- `o3`
**Anthropic Claude Models:**
- `claude-opus-4-20250514`
- `claude-sonnet-4-20250514`
- `claude-3-7-sonnet-latest`
- `claude-3-5-sonnet-20241022`
- `claude-3-5-sonnet-20240620`
- `claude-3-5-haiku-20241022`
**Google Gemini Models:**
- `gemini-2.5-pro-preview-03-25`
- `gemini-2.5-flash-lite`
- `gemini-2.5-flash-preview-05-20`
- `gemini-2.0-flash-001`
- `gemini-1.5-pro-002`
- `gemini-1.5-flash-002`
**DeepSeek Models:**
- `deepseek-reasoner`
- `deepseek-chat`
**Meta Llama Models:**
- `Llama-4-Scout-17B-16E-Instruct` Context length: 10M tokens
- `Llama-4-Maverick-17B-128E-Instruct` Context length: 1M tokens
- `llama-3.3-70b-instruct`
- `llama-3-70b-instruct`
- `llama-3.1-405b-instruct`
**xAI Grok Models:**
- `grok-3-beta`
- `grok-beta`
📚 **Reference Link:** <https://platform.monica.im/docs/en/models-and-pricing>
---
### 7. OpenRouter (Proxy Platform)
**Provider**
- `openrouter`
**OpenAI Models:**
- `gpt-4.1`
- `gpt-4.1-mini`
- `o1`
- `o1-pro`
- `o1-mini`
- `o3`
- `o3-pro`
- `o3-mini`
- `o4-mini`
**xAI Grok Models:**
- `grok-4` Total Context: 256K Max Output: 256K
- `grok-3`
- `grok-3-mini`
**Anthropic Claude Models:**
- `claude-opus-4`
- `claude-sonnet-4`
**Google Gemini Models:**
- `gemini-2.5-flash`
- `gemini-2.5-pro`
📚 **Reference Link:** <https://openrouter.ai/models>
---
### 8. Azure OpenAI
**Provider**
- `azure`
**Supported Models:**
- `gpt-4.1`
- `gpt-4.1-mini`
- `gpt-4.1-nano`
- `o1`
- `o3`
- `o4-mini`
📚 **Reference Link:** <https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/>
---
### 9. Lybic AI
**Provider:**
- `lybic`
**Supported Models:**
- `gpt-5`
- `gpt-4.1`
- `gpt-4.1-mini`
- `gpt-4.1-nano`
- `gpt-4.5-preview`
- `gpt-4o`
- `gpt-4o-realtime-preview`
- `gpt-4o-mini`
- `o1`
- `o1-pro`
- `o1-mini`
- `o3`
- `o3-pro`
- `o3-mini`
- `o4-mini`
**Note:** Lybic AI provides OpenAI-compatible API endpoints with the same model names and pricing structure.
📚 **Reference Link:** <https://aigw.lybicai.com/>
---
### 10. DeepSeek
**Provider**
- `deepseek`
**Supported Models:**
- `deepseek-chat` Context length: 128K, Output length: Default 4K, Max 8K
- `deepseek-reasoner` Context length: 128K, Output length: Default 32K, Max 64K
📚 **Reference Link:** <https://platform.deepseek.com/>
---
### 11. Alibaba Cloud Qwen
**Supported Models:**
- `qwen-max-latest` Context window: 32,768 Max input token length: 30,720 Max generation token length: 8,192
- `qwen-plus-latest` Context window: 131,072 Max input token length: 98,304 (thinking) Max generation token length: 129,024 Max output: 16,384
- `qwen-turbo-latest` Context window: 1,000,000 Max input token length: 1,000,000 Max generation token length: 16,384
- `qwen-vl-max-latest` (Grounding) Context window: 131,072 Max input token length: 129,024 Max generation token length: 8,192
- `qwen-vl-plus-latest` (Grounding) Context window: 131,072 Max input token length: 129,024 Max generation token length: 8,192
**Embedding Models:**
- `text-embedding-v4`
- `text-embedding-v3`
📚 **Reference Link:** <https://bailian.console.aliyun.com/?tab=doc#/doc/?type=model&url=https%3A%2F%2Fhelp.aliyun.com%2Fdocument_detail%2F2840914.html&renderType=iframe>
---
### 12. ByteDance Doubao
**Supported Models:**
- `doubao-seed-1-6-flash-250615` Context window: 256k Max input token length: 224k Max generation token length: 32k Max thinking content token length: 32k
- `doubao-seed-1-6-thinking-250715` Context window: 256k Max input token length: 224k Max generation token length: 32k Max thinking content token length: 32k
- `doubao-seed-1-6-250615` Context window: 256k Max input token length: 224k Max generation token length: 32k Max thinking content token length: 32k
- `doubao-1.5-vision-pro-250328` (Grounding) Context window: 128k Max input token length: 96k Max generation token length: 16k Max thinking content token length: 32k
- `doubao-1-5-thinking-vision-pro-250428` (Grounding) Context window: 128k Max input token length: 96k Max generation token length: 16k Max thinking content token length: 32k
- `doubao-1-5-ui-tars-250428` (Grounding) Context window: 128k Max input token length: 96k Max generation token length: 16k Max thinking content token length: 32k
**Embedding Models:**
- `doubao-embedding-large-text-250515`
- `doubao-embedding-text-240715`
📚 **Reference Link:** <https://console.volcengine.com/ark/region:ark+cn-beijing/model?vendor=Bytedance&view=LIST_VIEW>
---
### 13. Zhipu GLM
**Supported Models:**
- `GLM-4.5` Max in: 128k Max output: 0.2K
- `GLM-4.5-X` Max in: 128k Max output: 0.2K
- `GLM-4.5-Air` Max in: 128k Max output: 0.2K
- `GLM-4-Plus`
- `GLM-4-Air-250414`
- `GLM-4-AirX` (Grounding)
- `GLM-4V-Plus-0111` (Grounding)
**Embedding Models:**
- `Embedding-3`
- `Embedding-2`
📚 **Reference Link:** <https://open.bigmodel.cn/pricing>
---
### 14. SiliconFlow
**Supported Models:**
- `Kimi-K2-Instruct` Context Length: 128K
- `DeepSeek-V3`
- `DeepSeek-R1`
- `Qwen3-32B`
📚 **Reference Link:** <https://cloud.siliconflow.cn/sft-d1pi8rbk20jc73c62gm0/models>
---
## 🔤 Dedicated Embedding Providers
### 15. Jina AI
**Embedding Models:**
- `jina-embeddings-v4`
- `jina-embeddings-v3`
📚 **Reference Link:** <https://jina.ai/embeddings>
---
## 🔍 AI Search Engines
### 16. Bocha AI
**Service Type:** AI Research & Search
📚 **Reference Link:** <https://open.bochaai.com/overview>
---
### 17. Exa
**Service Type:** AI Research & Search
**Pricing Model:**
- $5.00 / 1k agent searches
- $5.00 / 1k exa-research agent page reads
- $10.00 / 1k exa-research-pro agent page reads
- $5.00 / 1M reasoning tokens
📚 **Reference Link:** <https://dashboard.exa.ai/home>

View File

@@ -0,0 +1,194 @@
{
"llm_models": {
"openai": {
"gpt-4.1": {"input": "2.00$", "output": "8.00$"},
"gpt-4.1-mini": {"input": "0.40$", "output": "1.60$"},
"gpt-4.1-nano": {"input": "0.10$", "output": "0.40$"},
"gpt-4.5-preview": {"input": "75$", "output": "150$"},
"gpt-4o": {"input": "2.5$", "output": "10$"},
"gpt-4o-realtime-preview": {"input": "5$", "output": "20$"},
"gpt-4o-mini": {"input": "0.15$", "output": "0.6$"},
"o1": {"input": "15$", "output": "60$"},
"o1-pro": {"input": "150$", "output": "600$"},
"o1-mini": {"input": "1.10$", "output": "4.40$"},
"o3": {"input": "2.0$", "output": "8$"},
"o3-pro": {"input": "20$", "output": "80$"},
"o3-mini": {"input": "1.10$", "output": "4.40$"},
"o4-mini": {"input": "1.1$", "output": "4.40$"}
},
"anthropic": {
"claude-opus-4-20250514": {"input": "15$", "output": "75$"},
"claude-sonnet-4-20250514": {"input": "3$", "output": "15$"},
"claude-3-7-sonnet-20250219": {"input": "3$", "output": "15$"},
"claude-3-5-sonnet-20241022": {"input": "3$", "output": "15$"},
"claude-3-5-haiku-20241022": {"input": "0.8$", "output": "4$"}
},
"qwen": {
"qwen-max-latest": {"input": "2.4¥", "output": "9.6¥"},
"qwen-plus-latest": {"input": "0.8¥", "output": "2¥"},
"qwen-turbo-latest": {"input": "0.3¥", "output": "0.6¥"},
"qwen-vl-max-latest": {"input": "3¥", "output": "9¥"},
"qwen-vl-plus-latest": {"input": "1.5¥", "output": "4.5¥"}
},
"doubao": {
"doubao-seed-1-6-flash-250615": {"input": "0.15¥", "output": "1.50¥"},
"doubao-seed-1-6-thinking-250715": {"input": "0.8¥", "output": "8¥"},
"doubao-seed-1-6-250615": {"input": "0.8¥", "output": "2¥"},
"doubao-1.5-vision-pro-250328": {"input": "3¥", "output": "9¥"},
"doubao-1-5-thinking-vision-pro-250428": {"input": "3¥", "output": "9¥"},
"doubao-1-5-ui-tars-250428": {"input": "3.5¥", "output": "12¥"}
},
"deepseek": {
"deepseek-chat": {"input": "2¥", "output": "8¥"},
"deepseek-reasoner": {"input": "4¥", "output": "16¥"}
},
"zhipu": {
"GLM-4.5": {"input": "4¥", "output": "16¥"},
"GLM-4.5V": {"input": "4¥", "output": "12¥"},
"GLM-4-Plus": {"input": "5¥", "output": "5¥"},
"GLM-4-Air-250414": {"input": "0.5¥", "output": "0.5¥"},
"GLM-4-AirX": {"input": "10¥", "output": "10¥"},
"GLM-4V-Plus-0111": {"input": "4¥", "output": "4¥"}
},
"groq": {
"Kimi-K2-Instruct": {"input": "1.00$", "output": "3.00$"},
"Llama-4-Scout-17B-16E-Instruct": {"input": "0.11$", "output": "0.34$"},
"Llama-4-Maverick-17B-128E-Instruct": {"input": "0.20$", "output": "0.60$"},
"Llama-Guard-4-12B": {"input": "0.20$", "output": "0.20$"},
"DeepSeek-R1-Distill-Llama-70B": {"input": "0.75$", "output": "0.99$"},
"Qwen3-32B": {"input": "0.29$", "output": "0.59$"},
"Llama-3.3-70B-Instruct": {"input": "0.59$", "output": "0.79$"}
},
"siliconflow": {
"Kimi-K2-Instruct": {"input": "4¥", "output": "16¥"},
"DeepSeek-V3": {"input": "2¥", "output": "8¥"},
"DeepSeek-R1": {"input": "4¥", "output": "16¥"},
"Qwen3-32B": {"input": "1¥", "output": "4¥"}
},
"monica": {
"gpt-4.1": {"input": "2.00$", "output": "8.00$"},
"gpt-4.1-mini": {"input": "0.40$", "output": "1.60$"},
"gpt-4.1-nano": {"input": "0.10$", "output": "0.40$"},
"gpt-4o-2024-11-20": {"input": "2.50$", "output": "10.00$"},
"gpt-4o-mini-2024-07-18": {"input": "0.15$", "output": "0.60$"},
"o4-mini": {"input": "0.55$", "output": "2.20$"},
"o3": {"input": "2.00$", "output": "8.00$"},
"claude-opus-4-20250514": {"input": "15.00$", "output": "75.00$"},
"claude-sonnet-4-20250514": {"input": "3.00$", "output": "15.00$"},
"claude-3-7-sonnet-latest": {"input": "3.00$", "output": "15.00$"},
"claude-3-5-sonnet-20241022": {"input": "3.00$", "output": "15.00$"},
"claude-3-5-sonnet-20240620": {"input": "3.00$", "output": "15.00$"},
"claude-3-5-haiku-20241022": {"input": "0.80$", "output": "4.00$"},
"claude-3-opus-20240229": {"input": "15.00$", "output": "75.00$"},
"claude-3-haiku-20240307": {"input": "0.25$", "output": "1.25$"},
"gemini-2.5-pro-preview-03-25": {"input": "1.25$", "output": "10.00$"},
"gemini-2.5-flash-lite": {"input": "0.10$", "output": "0.40$"},
"gemini-2.5-flash-preview-05-20": {"input": "0.30$", "output": "2.50$"},
"gemini-2.0-flash-001": {"input": "0.10$", "output": "0.40$"},
"gemini-1.5-pro-002": {"input": "1.25$", "output": "5.00$"},
"gemini-1.5-flash-002": {"input": "0.075$", "output": "0.30$"},
"deepseek-reasoner": {"input": "0.55$", "output": "2.21$"},
"deepseek-chat": {"input": "0.28$", "output": "1.10$"},
"llama-3-8b-instruct": {"input": "0.28$", "output": "0.83$"},
"llama-3.1-8b-instruct": {"input": "0.025$", "output": "0.06$"},
"llama-3.3-70b-instruct": {"input": "0.13$", "output": "0.40$"},
"llama-3-70b-instruct": {"input": "0.88$", "output": "0.88$"},
"llama-3.1-405b-instruct": {"input": "4.00$", "output": "4.00$"},
"grok-3-beta": {"input": "3.00$", "output": "15.00$"},
"grok-beta": {"input": "5.00$", "output": "15.00$"}
},
"gemini": {
"gemini-2.5-pro": {"input": "1.25$", "output": "10$"},
"gemini-2.5-flash": {"input": "0.30$", "output": "2.50$"},
"gemini-2.0-flash": {"input": "0.10$", "output": "0.40$"},
"gemini-1.5-pro": {"input": "1.25$", "output": "5$"},
"gemini-1.5-flash": {"input": "0.075$", "output": "0.30$"}
},
"bedrock": {
"Claude-Opus-4": {"input": "15.00$", "output": "75.00$"},
"Claude-Sonnet-4": {"input": "3.00$", "output": "15.00$"},
"Claude-Sonnet-3.7": {"input": "3.00$", "output": "15.00$"},
"Claude-Sonnet-3.5": {"input": "3.00$", "output": "15.00$"}
},
"openrouter": {
"openai/gpt-5": {"input": "1.25$", "output": "10$"},
"openai/gpt-5-chat": {"input": "1.25$", "output": "10$"},
"openai/gpt-4.1": {"input": "2$", "output": "8$"},
"openai/gpt-4.1-mini": {"input": "0.4$", "output": "1.6$"},
"openai/o1": {"input": "15$", "output": "60$"},
"openai/o1-pro": {"input": "150$", "output": "600$"},
"openai/o1-mini": {"input": "1.1$", "output": "4.4$"},
"openai/o3": {"input": "2$", "output": "8$"},
"openai/o3-pro": {"input": "20$", "output": "80$"},
"openai/o3-mini": {"input": "1.1$", "output": "4.4$"},
"openai/o4-mini": {"input": "1.1$", "output": "4.4$"},
"x-ai/grok-4": {"input": "3$", "output": "15$"},
"x-ai/grok-3": {"input": "3$", "output": "15$"},
"x-ai/grok-3-mini": {"input": "0.3$", "output": "0.5$"},
"anthropic/claude-opus-4": {"input": "15$", "output": "75$"},
"anthropic/claude-sonnet-4": {"input": "3$", "output": "15$"},
"google/gemini-2.5-flash": {"input": "0.3$", "output": "2.5$"},
"google/gemini-2.5-pro": {"input": "1.25$", "output": "10$"}
},
"azure": {
"gpt-4.1": {"input": "2.00$", "output": "8.00$"},
"gpt-4.1-mini": {"input": "0.40$", "output": "1.60$"},
"gpt-4.1-nano": {"input": "0.10$", "output": "0.40$"},
"o1": {"input": "15$", "output": "60$"},
"o3": {"input": "2.0$", "output": "8$"},
"o4-mini": {"input": "1.1$", "output": "4.40$"}
},
"lybic": {
"gpt-5": {"input": "1.25$", "output": "10$"},
"gpt-4.1": {"input": "2.00$", "output": "8.00$"},
"gpt-4.1-mini": {"input": "0.40$", "output": "1.60$"},
"gpt-4.1-nano": {"input": "0.10$", "output": "0.40$"},
"gpt-4.5-preview": {"input": "75$", "output": "150$"},
"gpt-4o": {"input": "2.5$", "output": "10$"},
"gpt-4o-realtime-preview": {"input": "5$", "output": "20$"},
"gpt-4o-mini": {"input": "0.15$", "output": "0.6$"},
"o1": {"input": "15$", "output": "60$"},
"o1-pro": {"input": "150$", "output": "600$"},
"o1-mini": {"input": "1.10$", "output": "4.40$"},
"o3": {"input": "2.0$", "output": "8$"},
"o3-pro": {"input": "20$", "output": "80$"},
"o3-mini": {"input": "1.10$", "output": "4.40$"},
"o4-mini": {"input": "1.1$", "output": "4.40$"}
}
},
"embedding_models": {
"openai": {
"text-embedding-3-small": {"input": "0.02$", "output": ""},
"text-embedding-3-large": {"input": "0.13$", "output": ""},
"text-embedding-ada-002": {"input": "0.10$", "output": ""}
},
"qwen": {
"text-embedding-v4": {"input": "0.0005¥", "output": ""},
"text-embedding-v3": {"input": "0.0005¥", "output": ""}
},
"doubao": {
"doubao-embedding-large-text-250515": {"input": "0.7¥", "output": ""},
"doubao-embedding-text-240715": {"input": "0.5¥", "output": ""}
},
"zhipu": {
"Embedding-3": {"input": "0.5¥", "output": ""},
"Embedding-2": {"input": "0.5¥", "output": ""}
},
"jina": {
"jina-embeddings-v4": {"input": "0.05$", "output": ""},
"jina-embeddings-v3": {"input": "0.05$", "output": ""}
},
"gemini": {
"gemini-embedding-001": {"input": "0.15$", "output": ""}
}
},
"search_models": {
"bocha": {
"ai-search": {"cost_type": "balance", "unit": "per_query"}
},
"exa": {
"search": {"cost_type": "direct", "unit": "per_query"},
"research": {"cost_type": "direct", "unit": "per_task"}
}
}
}

View File

@@ -0,0 +1,481 @@
import json
import os
from typing import Dict, Tuple, List
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from ..utils.common_utils import (
load_embeddings,
load_knowledge_base,
save_embeddings,
)
from ..tools.new_tools import NewTools
from .mllm import CostManager
# Known embedding output dimensions, keyed by model name.  Used to build a
# per-model embeddings cache filename so vectors from different models (which
# have different dimensions) are never mixed in one file.
# NOTE(review): "doubao-embedding-large-text-250515" maps to a *smaller* dim
# (2048) than "doubao-embedding-text-240715" (2560) — preserved from the
# original table; confirm against the provider's docs.
_EMBEDDING_DIMS = {
    "doubao-embedding-large-text-250515": 2048,
    "doubao-embedding-text-240715": 2560,
    "text-embedding-ada-002": 1536,
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
    "gemini-embedding-001": 3072,
    "jina-embeddings-v4": 2048,
    "jina-embeddings-v3": 1024,
    "text-embedding-v4": 1024,
    "text-embedding-v3": 1024,
    "embedding-2": 2048,
    "embedding-3": 2048,
}


def get_embedding_dim(model_name):
    """Return the embedding vector dimension for ``model_name``.

    Args:
        model_name: Name of the embedding model.

    Returns:
        int | None: The model's output dimension, or None for unknown models
        (callers embed the value in a cache filename, so None is tolerated).
    """
    return _EMBEDDING_DIMS.get(model_name)
class NewKnowledgeBase:
def __init__(
self,
embedding_engine: NewTools,
local_kb_path: str,
platform: str,
Tools_dict: Dict,
save_knowledge: bool = True,
):
"""
Initialize the KnowledgeBase module
Args:
embedding_engine: Embedding engine instance
local_kb_path: Path to local knowledge base
platform: Target platform (Windows/Darwin/Ubuntu)
Tools_dict: Dictionary containing tool configurations
save_knowledge: Whether to save knowledge embeddings
"""
self.platform = platform
self.local_kb_path = local_kb_path
# initialize embedding engine
self.embedding_engine = embedding_engine
# Initialize paths for different memory types
self.episodic_memory_path = os.path.join(
self.local_kb_path, self.platform, "episodic_memory.json"
)
self.narrative_memory_path = os.path.join(
self.local_kb_path, self.platform, "narrative_memory.json"
)
embedding_model_name = ""
if hasattr(self.embedding_engine, "tools") and "embedding" in self.embedding_engine.tools:
embedding_model_name = self.embedding_engine.tools["embedding"].model_name
else:
embedding_model_name = "default"
embedding_dim = get_embedding_dim(embedding_model_name)
self.embeddings_path = os.path.join(
self.local_kb_path, self.platform, f"embeddings_{embedding_model_name}_{embedding_dim}.pkl"
)
# Initialize trajectory tracking
self.task_trajectory = ""
self.current_subtask_trajectory = ""
self.current_search_query = ""
# query_formulator
self.query_formulator_name = "query_formulator"
self.query_formulator = NewTools()
self.query_formulator.register_tool(
self.query_formulator_name,
Tools_dict[self.query_formulator_name]["provider"],
Tools_dict[self.query_formulator_name]["model"],
)
# knowledge_fusion_agent
self.knowledge_fusion_agent_name = "context_fusion"
self.knowledge_fusion_agent = NewTools()
self.knowledge_fusion_agent.register_tool(
self.knowledge_fusion_agent_name,
Tools_dict[self.knowledge_fusion_agent_name]["provider"],
Tools_dict[self.knowledge_fusion_agent_name]["model"],
)
# narrative_summarization_agent
self.narrative_summarization_agent_name = "narrative_summarization"
self.narrative_summarization_agent = NewTools()
self.narrative_summarization_agent.register_tool(
self.narrative_summarization_agent_name,
Tools_dict[self.narrative_summarization_agent_name]["provider"],
Tools_dict[self.narrative_summarization_agent_name]["model"],
)
# episode_summarization_agent
self.episode_summarization_agent_name = "episode_summarization"
self.episode_summarization_agent = NewTools()
self.episode_summarization_agent.register_tool(
self.episode_summarization_agent_name,
Tools_dict[self.episode_summarization_agent_name]["provider"],
Tools_dict[self.episode_summarization_agent_name]["model"],
)
self.save_knowledge = save_knowledge
def retrieve_knowledge(
self, instruction: str, search_query: str, search_engine: NewTools
) -> Tuple[str, List[int], str]:
"""Retrieve knowledge using search engine
Args:
instruction (str): task instruction
search_query (str): search query to use
search_engine (NewTools): search engine tool to use
Returns:
Tuple[str, List[int], float]: The search results, token usage, and cost
"""
search_results, total_tokens, cost_string = search_engine.execute_tool("websearch", {"str_input": instruction + " " + search_query})
return search_results, total_tokens, cost_string
def formulate_query(self, instruction: str, observation: Dict) -> Tuple[str, List[int], str]:
"""Formulate search query based on instruction and current state
Args:
instruction (str): The task instruction
observation (Dict): Current observation including screenshot
Returns:
Tuple[str, List[int], float]: The formulated query, token usage, and cost
"""
query_path = os.path.join(
self.local_kb_path, self.platform, "formulate_query.json"
)
try:
with open(query_path, "r") as f:
formulate_query = json.load(f)
except:
formulate_query = {}
if instruction in formulate_query:
return formulate_query[instruction], [0, 0, 0], ""
self.query_formulator.tools["query_formulator"].llm_agent.reset()
content, total_tokens, cost_string = self.query_formulator.execute_tool("query_formulator", {
"str_input": f"The task is: {instruction}\n" +
"To use google search to get some useful information, first carefully analyze " +
"the screenshot of the current desktop UI state, then given the task " +
"instruction, formulate a question that can be used to search on the Internet " +
"for information in helping with the task execution.\n" +
"The question should not be too general or too specific. Please ONLY provide " +
"the question.\nQuestion:",
"img_input": observation["screenshot"] if "screenshot" in observation else None
})
search_query = content.strip().replace('"', "")
# print("search query: ", search_query)
formulate_query[instruction] = search_query
with open(query_path, "w") as f:
json.dump(formulate_query, f, indent=2)
return search_query, total_tokens, cost_string
def retrieve_narrative_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
"""Retrieve narrative experience using embeddings
Args:
instruction (str): The task instruction
Returns:
Tuple[str, str]: The similar task key and its narrative experience
"""
knowledge_base = load_knowledge_base(self.narrative_memory_path)
if not knowledge_base:
return "None", "None", [0, 0, 0], ""
embeddings = load_embeddings(self.embeddings_path)
# Get or create instruction embedding
instruction_embedding = embeddings.get(instruction)
total_tokens, cost_string = [0, 0, 0], ""
if instruction_embedding is None:
instruction_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": instruction})
embeddings[instruction] = instruction_embedding
# total_tokens += tokens
for i in range(len(total_tokens)):
total_tokens[i] += tokens[i]
cost_string = cost_string_now
# Get or create embeddings for knowledge base entries
candidate_embeddings = []
for key in knowledge_base:
candidate_embedding = embeddings.get(key)
if candidate_embedding is None:
candidate_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": key})
for i in range(len(tokens)):
total_tokens[i] += tokens[i]
# total_tokens += tokens
cost_string = CostManager.add_costs(cost_string, cost_string_now)
embeddings[key] = candidate_embedding
candidate_embeddings.append(candidate_embedding)
save_embeddings(self.embeddings_path, embeddings)
similarities = cosine_similarity(
instruction_embedding, np.vstack(candidate_embeddings)
)[0]
sorted_indices = np.argsort(similarities)[::-1]
keys = list(knowledge_base.keys())
idx = 1 if keys[sorted_indices[0]] == instruction else 0
return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]], total_tokens, cost_string
def retrieve_episodic_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
"""Retrieve similar task experience using embeddings
Args:
instruction (str): The task instruction
Returns:
Tuple[str, str]: The similar task key and its episodic experience
"""
knowledge_base = load_knowledge_base(self.episodic_memory_path)
if not knowledge_base:
return "None", "None", [0, 0, 0], ""
embeddings = load_embeddings(self.embeddings_path)
# Get or create instruction embedding
instruction_embedding = embeddings.get(instruction)
total_tokens, cost_string = [0, 0, 0], ""
if instruction_embedding is None:
instruction_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": instruction})
embeddings[instruction] = instruction_embedding
# total_tokens += tokens
for i in range(len(total_tokens)):
total_tokens[i] += tokens[i]
cost_string = cost_string_now
# Get or create embeddings for knowledge base entries
candidate_embeddings = []
for key in knowledge_base:
candidate_embedding = embeddings.get(key)
if candidate_embedding is None:
candidate_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": key})
# total_tokens += tokens
for i in range(len(total_tokens)):
total_tokens[i] += tokens[i]
cost_string = CostManager.add_costs(cost_string, cost_string_now)
embeddings[key] = candidate_embedding
candidate_embeddings.append(candidate_embedding)
save_embeddings(self.embeddings_path, embeddings)
similarities = cosine_similarity(
instruction_embedding, np.vstack(candidate_embeddings)
)[0]
sorted_indices = np.argsort(similarities)[::-1]
keys = list(knowledge_base.keys())
idx = 1 if keys[sorted_indices[0]] == instruction else 0
return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]], total_tokens, cost_string
def knowledge_fusion(
self,
observation: Dict,
instruction: str,
web_knowledge: str,
similar_task: str,
experience: str,
) -> Tuple[str, list, str]:
"""Combine web knowledge with similar task experience"""
content, total_tokens, cost = self.knowledge_fusion_agent.execute_tool("context_fusion", {
"str_input": f"Task: {instruction}\n" +
f"**Web search result**:\n{web_knowledge}\n\n" +
f"**Retrieved similar task experience**:\n" +
f"Similar task:{similar_task}\n{experience}\n\n" +
f"Based on the web search result and the retrieved similar task experience, " +
f"if you think the similar task experience is indeed useful to the main task, " +
f"integrate it with the web search result. Provide the final knowledge in a numbered list.",
"img_input": observation["screenshot"] if "screenshot" in observation else None
})
return content, total_tokens, cost
def save_episodic_memory(self, subtask_key: str, subtask_traj: str) -> None:
"""Save episodic memory (subtask level knowledge).
Args:
subtask_key (str): Key identifying the subtask
subtask_traj (str): Trajectory/experience of the subtask
"""
if not self.save_knowledge:
return
try:
kb = load_knowledge_base(self.episodic_memory_path)
except:
kb = {}
if subtask_key not in kb:
subtask_summarization = self.summarize_episode(subtask_traj)
kb[subtask_key] = subtask_summarization
if self.save_knowledge:
os.makedirs(os.path.dirname(self.episodic_memory_path), exist_ok=True)
with open(self.episodic_memory_path, "w") as fout:
json.dump(kb, fout, indent=2)
return kb.get(subtask_key)
def save_narrative_memory(self, task_key: str, task_traj: str) -> None:
"""Save narrative memory (task level knowledge).
Args:
task_key (str): Key identifying the task
task_traj (str): Full trajectory/experience of the task
"""
if not self.save_knowledge:
return
try:
kb = load_knowledge_base(self.narrative_memory_path)
except:
kb = {}
if task_key not in kb:
task_summarization = self.summarize_narrative(task_traj)
kb[task_key] = task_summarization
if self.save_knowledge:
os.makedirs(os.path.dirname(self.narrative_memory_path), exist_ok=True)
with open(self.narrative_memory_path, "w") as fout:
json.dump(kb, fout, indent=2)
return kb.get(task_key)
def initialize_task_trajectory(self, instruction: str) -> None:
"""Initialize a new task trajectory.
Args:
instruction (str): The task instruction
"""
self.task_trajectory = f"Task:\n{instruction}"
self.current_search_query = ""
self.current_subtask_trajectory = ""
def update_task_trajectory(self, meta_data: Dict) -> None:
"""Update the task trajectory with new metadata.
Args:
meta_data (Dict): Metadata from the agent's prediction
"""
if not self.current_search_query and "search_query" in meta_data:
self.current_search_query = meta_data["search_query"]
self.task_trajectory += (
"\n\nReflection:\n"
+ str(meta_data["reflection"])
+ "\n\n----------------------\n\nPlan:\n"
+ meta_data["executor_plan"]
)
    def handle_subtask_trajectory(self, meta_data: Dict):
        """Handle subtask trajectory updates based on subtask status.

        On "Start"/"Done": if a subtask trajectory is already in progress it
        is finalized and saved to episodic memory, and True is returned
        WITHOUT starting the next subtask's trajectory; otherwise a new
        subtask trajectory is begun and False is returned.  On "In" the
        current trajectory is extended and False is returned.

        Args:
            meta_data (Dict): Metadata containing "subtask_status",
                "subtask", "subtask_info" and "executor_plan"

        Returns:
            bool: Whether the previous subtask was completed and saved.
                NOTE(review): for any status other than "Start"/"Done"/"In"
                the method implicitly returns None — confirm callers only
                pass these three statuses.
        """
        subtask_status = meta_data["subtask_status"]
        subtask = meta_data["subtask"]
        subtask_info = meta_data["subtask_info"]
        if subtask_status in ["Start", "Done"]:
            # If there's an existing subtask trajectory, finalize it.
            # The episodic-memory key is everything before the first Plan
            # separator (i.e. the Task/Subtask header block).
            if self.current_subtask_trajectory:
                self.current_subtask_trajectory += "\nSubtask Completed.\n"
                subtask_key = self.current_subtask_trajectory.split(
                    "\n----------------------\n\nPlan:\n"
                )[0]
                self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)
                self.current_subtask_trajectory = ""
                return True
            # Start new subtask trajectory, headed by the current search
            # query and the subtask's name/instruction.
            self.current_subtask_trajectory = (
                f"Task:\n{self.current_search_query}\n\n"
                f"Subtask: {subtask}\n"
                f"Subtask Instruction: {subtask_info}\n"
                f"----------------------\n\n"
                f'Plan:\n{meta_data["executor_plan"]}\n'
            )
            return False
        elif subtask_status == "In":
            # Continue current subtask trajectory with this step's plan.
            self.current_subtask_trajectory += (
                f'\n----------------------\n\nPlan:\n{meta_data["executor_plan"]}\n'
            )
            return False
def finalize_task(self) -> None:
"""Finalize the task by saving any remaining trajectories."""
# Save any remaining subtask trajectory
if self.current_subtask_trajectory:
self.current_subtask_trajectory += "\nSubtask Completed.\n"
subtask_key = self.current_subtask_trajectory.split(
"\n----------------------\n\nPlan:\n"
)[0]
self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)
# Save the complete task trajectory
if self.task_trajectory and self.current_search_query:
self.save_narrative_memory(self.current_search_query, self.task_trajectory)
# Reset trajectories
self.task_trajectory = ""
self.current_subtask_trajectory = ""
self.current_search_query = ""
def summarize_episode(self, trajectory: str) -> Tuple[str, List[int], str]:
"""Summarize the episode experience for lifelong learning reflection
Args:
trajectory (str): The episode experience to be summarized
Returns:
str: The summarized episode experience
"""
# Create Reflection on whole trajectories for next round trial, keep earlier messages as exemplars
content, total_tokens, cost = self.episode_summarization_agent.execute_tool("episode_summarization", {"str_input": trajectory})
return content, total_tokens, cost
def summarize_narrative(self, trajectory: str) -> Tuple[str, List[int], str]:
"""Summarize the narrative experience for lifelong learning reflection
Args:
trajectory (str): The narrative experience to be summarized
Returns:
str: The summarized narrative experience
"""
# Create Reflection on whole trajectories for next round trial
content, total_tokens, cost = self.narrative_summarization_agent.execute_tool("narrative_summarization", {"str_input": trajectory})
return content, total_tokens, cost