Add multiple new modules and tools to enhance the functionality and extensibility of the Maestro project (#333)
* Added a **pyproject.toml** file to define project metadata and dependencies. * Added **run_maestro.py** and **osworld_run_maestro.py** to provide the main execution logic. * Introduced multiple new modules, including **Evaluator**, **Controller**, **Manager**, and **Sub-Worker**, supporting task planning, state management, and data analysis. * Added a **tools module** containing utility functions and tool configurations to improve code reusability. * Updated the **README** and documentation with usage examples and module descriptions. These changes lay the foundation for expanding the Maestro project's functionality and improving the user experience. Co-authored-by: Hiroid <guoliangxuan@deepmatrix.com>
This commit is contained in:
0
mm_agents/maestro/core/__init__.py
Normal file
0
mm_agents/maestro/core/__init__.py
Normal file
1556
mm_agents/maestro/core/engine.py
Normal file
1556
mm_agents/maestro/core/engine.py
Normal file
File diff suppressed because it is too large
Load Diff
566
mm_agents/maestro/core/mllm.py
Normal file
566
mm_agents/maestro/core/mllm.py
Normal file
@@ -0,0 +1,566 @@
|
||||
import base64
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .engine import (
|
||||
LMMEngineAnthropic,
|
||||
LMMEngineAzureOpenAI,
|
||||
LMMEngineHuggingFace,
|
||||
LMMEngineOpenAI,
|
||||
LMMEngineLybic,
|
||||
LMMEngineOpenRouter,
|
||||
LMMEnginevLLM,
|
||||
LMMEngineGemini,
|
||||
LMMEngineQwen,
|
||||
LMMEngineDoubao,
|
||||
LMMEngineDeepSeek,
|
||||
LMMEngineZhipu,
|
||||
LMMEngineGroq,
|
||||
LMMEngineSiliconflow,
|
||||
LMMEngineMonica,
|
||||
LMMEngineAWSBedrock,
|
||||
OpenAIEmbeddingEngine,
|
||||
GeminiEmbeddingEngine,
|
||||
AzureOpenAIEmbeddingEngine,
|
||||
DashScopeEmbeddingEngine,
|
||||
DoubaoEmbeddingEngine,
|
||||
JinaEmbeddingEngine,
|
||||
BochaAISearchEngine,
|
||||
ExaResearchEngine,
|
||||
)
|
||||
|
||||
class CostManager:
    """Formats and accumulates engine costs, tagging each amount with the
    currency of the engine's provider (CNY for Chinese providers, USD for
    everything else, including unknown engines).
    """

    # Engines billed in Chinese yuan.
    CNY_ENGINES = {
        LMMEngineQwen, LMMEngineDoubao, LMMEngineDeepSeek, LMMEngineZhipu,
        LMMEngineSiliconflow, DashScopeEmbeddingEngine, DoubaoEmbeddingEngine
    }
    # Engines billed in US dollars.
    USD_ENGINES = {
        LMMEngineOpenAI, LMMEngineLybic, LMMEngineAnthropic, LMMEngineAzureOpenAI, LMMEngineGemini,
        LMMEngineOpenRouter, LMMEnginevLLM, LMMEngineHuggingFace, LMMEngineGroq,
        LMMEngineMonica, LMMEngineAWSBedrock, OpenAIEmbeddingEngine,
        GeminiEmbeddingEngine, AzureOpenAIEmbeddingEngine, JinaEmbeddingEngine
    }

    # Symbols recognised when parsing a formatted cost string back into a
    # number. NOTE(review): "¥" appears twice — one entry was presumably
    # meant to be the fullwidth "￥"; confirm against the engines' output.
    _CURRENCY_SYMBOLS = ["$", "¥", "¥", "€", "£"]

    @classmethod
    def get_currency_symbol(cls, engine) -> str:
        """Return the currency symbol for *engine*.

        Unknown engine types default to "$" (same as USD engines).
        """
        return "¥" if type(engine) in cls.CNY_ENGINES else "$"

    @classmethod
    def format_cost(cls, cost: float, engine) -> str:
        """Render *cost* with 7 decimal places and the engine's currency
        symbol appended, e.g. ``"0.0012345$"``.
        """
        return f"{cost:.7f}{cls.get_currency_symbol(engine)}"

    @classmethod
    def _parse_cost(cls, cost) -> "tuple[float, str]":
        """Split a cost into ``(value, currency_symbol)``.

        Accepts a bare number (assumed USD) or a string such as ``"1.5¥"``.
        A string with no recognised symbol is parsed as USD; an unparseable
        string yields ``(0.0, "$")``.
        """
        if isinstance(cost, (int, float)):
            return float(cost), "$"
        cost_str = str(cost)
        for symbol in cls._CURRENCY_SYMBOLS:
            if symbol in cost_str:
                return float(cost_str.replace(symbol, "").strip()), symbol
        try:
            return float(cost_str), "$"
        except ValueError:  # narrowed from a bare except: float() only raises this here
            return 0.0, "$"

    @classmethod
    def add_costs(cls, cost1, cost2) -> str:
        """Add two formatted costs, keeping the first operand's currency.

        Mixed currencies are still summed numerically, but a warning is
        printed. The result uses six decimal places.
        """
        value1, currency1 = cls._parse_cost(cost1)
        value2, currency2 = cls._parse_cost(cost2)

        if currency1 != currency2:
            print(f"Warning: Different currencies in cost accumulation: {currency1} and {currency2}")
        # The first operand's currency wins either way.
        return f"{value1 + value2:.6f}{currency1}"
|
||||
|
||||
class LLMAgent:
    """Conversation wrapper around a multimodal chat engine.

    Keeps a running message history (system / user / assistant turns) in the
    OpenAI chat-message format and delegates generation to the configured
    engine.
    """

    def __init__(self, engine_params=None, system_prompt=None, engine=None):
        """Create an agent from a ready engine or from ``engine_params``.

        Args:
            engine_params (dict | None): Must contain an ``engine_type`` key;
                all keys are forwarded to the engine constructor.
            system_prompt (str | None): Initial system prompt; defaults to a
                generic assistant prompt.
            engine: A pre-built engine instance; takes precedence over
                ``engine_params``.

        Raises:
            ValueError: If neither ``engine`` nor ``engine_params`` is given,
                or ``engine_type`` is not recognised.
        """
        if engine is not None:
            self.engine = engine
        else:
            if engine_params is None:
                raise ValueError("engine_params must be provided")
            # Built at call time (not at class creation) so the class can be
            # defined even when some engine backends are unavailable.
            registry = {
                "openai": LMMEngineOpenAI,
                "lybic": LMMEngineLybic,
                "anthropic": LMMEngineAnthropic,
                "azure": LMMEngineAzureOpenAI,
                "vllm": LMMEnginevLLM,
                "huggingface": LMMEngineHuggingFace,
                "gemini": LMMEngineGemini,
                "openrouter": LMMEngineOpenRouter,
                "dashscope": LMMEngineQwen,
                "doubao": LMMEngineDoubao,
                "deepseek": LMMEngineDeepSeek,
                "zhipu": LMMEngineZhipu,
                "groq": LMMEngineGroq,
                "siliconflow": LMMEngineSiliconflow,
                "monica": LMMEngineMonica,
                "aws_bedrock": LMMEngineAWSBedrock,
            }
            engine_cls = registry.get(engine_params.get("engine_type"))
            if engine_cls is None:
                raise ValueError("engine_type is not supported")
            self.engine = engine_cls(**engine_params)

        self.messages = []  # conversation history in OpenAI message format

        self.add_system_prompt(system_prompt or "You are a helpful assistant.")

    def encode_image(self, image_content):
        """Base64-encode an image given a file path (str) or raw bytes."""
        if isinstance(image_content, str):
            # A string is treated as a path to an image file on disk.
            with open(image_content, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode("utf-8")
        return base64.b64encode(image_content).decode("utf-8")

    def reset(self):
        """Drop the conversation history, keeping only the system prompt."""
        self.messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": self.system_prompt}],
            }
        ]

    def add_system_prompt(self, system_prompt):
        """Set (or replace) the system prompt at position 0 of the history."""
        self.system_prompt = system_prompt
        system_message = {
            "role": "system",
            "content": [{"type": "text", "text": self.system_prompt}],
        }
        if self.messages:
            self.messages[0] = system_message
        else:
            self.messages.append(system_message)

    def remove_message_at(self, index):
        """Remove a message at a given index (silently ignores out-of-range)."""
        if index < len(self.messages):
            self.messages.pop(index)

    def replace_message_at(
        self, index, text_content, image_content=None, image_detail="high"
    ):
        """Replace a message at a given index, keeping its original role."""
        if index < len(self.messages):
            self.messages[index] = {
                "role": self.messages[index]["role"],
                "content": [{"type": "text", "text": text_content}],
            }
            if image_content:
                self.messages[index]["content"].append(
                    self._openai_image_part(
                        self.encode_image(image_content), image_detail
                    )
                )

    def _infer_role(self, role):
        """Infer the next role from the last message unless "user" was forced.

        system -> user, user -> assistant, assistant -> user; any other
        previous role leaves *role* unchanged.
        """
        if role != "user":
            last_role = self.messages[-1]["role"]
            if last_role in ("system", "assistant"):
                role = "user"
            elif last_role == "user":
                role = "assistant"
        return role

    @staticmethod
    def _openai_image_part(base64_image, image_detail):
        """Build an OpenAI-style ``image_url`` content part."""
        return {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{base64_image}",
                "detail": image_detail,
            },
        }

    def add_message(
        self,
        text_content,
        image_content=None,
        role=None,
        image_detail="high",
        put_text_last=False,
    ):
        """Append a message (text plus optional image(s)) to the history.

        Args:
            text_content (str): The textual part of the message.
            image_content: Path, raw bytes, numpy array, or a list of these.
            role (str | None): Explicit role; inferred from the previous
                message unless forced to "user".
            image_detail (str): OpenAI image detail level ("high"/"low"/...).
            put_text_last (bool): Move the text part after the images.

        Raises:
            ValueError: If the engine type is not supported.
        """
        # API-style inference from OpenAI and compatible services.
        if isinstance(
            self.engine,
            (
                LMMEngineAnthropic,
                LMMEngineAzureOpenAI,
                LMMEngineHuggingFace,
                LMMEngineOpenAI,
                LMMEngineLybic,
                LMMEngineOpenRouter,
                LMMEnginevLLM,
                LMMEngineGemini,
                LMMEngineQwen,
                LMMEngineDoubao,
                LMMEngineDeepSeek,
                LMMEngineZhipu,
                LMMEngineGroq,
                LMMEngineSiliconflow,
                LMMEngineMonica,
                LMMEngineAWSBedrock,
            ),
        ):
            role = self._infer_role(role)
            message = {
                "role": role,
                "content": [{"type": "text", "text": text_content}],
            }

            # np.ndarray does not support plain truth-testing, so check it first.
            if isinstance(image_content, np.ndarray) or image_content:
                images = (
                    image_content
                    if isinstance(image_content, list)
                    else [image_content]
                )
                for image in images:
                    message["content"].append(
                        self._openai_image_part(
                            self.encode_image(image), image_detail
                        )
                    )

            # Rotate text to be the last content part if desired.
            if put_text_last:
                message["content"].append(message["content"].pop(0))

            self.messages.append(message)

        # NOTE(review): the two branches below are unreachable — Anthropic,
        # AWSBedrock and vLLM are already matched by the tuple above, so those
        # engines take the OpenAI-style path. The branches are preserved
        # unchanged to keep the intended provider-specific formats available
        # if the dispatch order is ever fixed; confirm which format each
        # provider actually requires before reordering.
        elif isinstance(self.engine, (LMMEngineAnthropic, LMMEngineAWSBedrock)):
            role = self._infer_role(role)
            message = {
                "role": role,
                "content": [{"type": "text", "text": text_content}],
            }
            if image_content:
                images = (
                    image_content
                    if isinstance(image_content, list)
                    else [image_content]
                )
                for image in images:
                    message["content"].append(
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": self.encode_image(image),
                            },
                        }
                    )
            self.messages.append(message)

        # Locally hosted vLLM model inference (unreachable — see NOTE above).
        elif isinstance(self.engine, LMMEnginevLLM):
            role = self._infer_role(role)
            message = {
                "role": role,
                "content": [{"type": "text", "text": text_content}],
            }
            if image_content:
                images = (
                    image_content
                    if isinstance(image_content, list)
                    else [image_content]
                )
                for image in images:
                    message["content"].append(
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image;base64,{self.encode_image(image)}"
                            },
                        }
                    )
            self.messages.append(message)
        else:
            raise ValueError("engine_type is not supported")

    def get_response(
        self,
        user_message=None,
        messages=None,
        temperature=0.0,
        max_new_tokens=None,
        **kwargs,
    ):
        """Generate the next response based on previous messages.

        Args:
            user_message (str | None): Optional extra user turn appended to
                the message list before generation. NOTE: when ``messages``
                is None this appends to (mutates) the agent's own history.
            messages (list | None): Explicit message list; defaults to the
                agent's running history.
            temperature (float): Sampling temperature (not passed to Lybic).
            max_new_tokens (int | None): Generation length cap.

        Returns:
            tuple: ``(content, total_tokens, cost_string)`` where the cost
            string carries the engine's currency symbol.
        """
        if messages is None:
            messages = self.messages
        if user_message:
            messages.append(
                {"role": "user", "content": [{"type": "text", "text": user_message}]}
            )

        if isinstance(self.engine, LMMEngineLybic):
            # Lybic's generate() does not accept a temperature argument.
            content, total_tokens, cost = self.engine.generate(
                messages,
                max_new_tokens=max_new_tokens,  # type: ignore
                **kwargs,
            )
        else:
            content, total_tokens, cost = self.engine.generate(
                messages,
                temperature=temperature,
                max_new_tokens=max_new_tokens,  # type: ignore
                **kwargs,
            )

        cost_string = CostManager.format_cost(cost, self.engine)

        return content, total_tokens, cost_string
|
||||
|
||||
class EmbeddingAgent:
    """Wrapper around a text-embedding engine with token and cost tracking."""

    def __init__(self, engine_params=None, engine=None):
        """Build the agent from a ready engine or from ``engine_params``.

        Args:
            engine_params (dict | None): Must contain an ``engine_type`` key;
                all keys are forwarded to the engine constructor.
            engine: Pre-built engine instance; takes precedence.

        Raises:
            ValueError: If neither argument is supplied or the engine type
                is unknown.
        """
        if engine is not None:
            self.engine = engine
            return
        if engine_params is None:
            raise ValueError("engine_params must be provided")

        engine_type = engine_params.get("engine_type")
        if engine_type == "openai":
            self.engine = OpenAIEmbeddingEngine(**engine_params)
        elif engine_type == "gemini":
            self.engine = GeminiEmbeddingEngine(**engine_params)
        elif engine_type == "azure":
            self.engine = AzureOpenAIEmbeddingEngine(**engine_params)
        elif engine_type == "dashscope":
            self.engine = DashScopeEmbeddingEngine(**engine_params)
        elif engine_type == "doubao":
            self.engine = DoubaoEmbeddingEngine(**engine_params)
        elif engine_type == "jina":
            self.engine = JinaEmbeddingEngine(**engine_params)
        else:
            raise ValueError(f"Embedding engine type '{engine_type}' is not supported")

    def get_embeddings(self, text):
        """Get embeddings for the given text.

        Args:
            text (str): The text to get embeddings for.

        Returns:
            tuple: (embeddings, total_tokens, cost_string) — the vector, the
            engine's token usage, and the cost tagged with a currency symbol.
        """
        vector, used_tokens, raw_cost = self.engine.get_embeddings(text)
        return vector, used_tokens, CostManager.format_cost(raw_cost, self.engine)

    def get_similarity(self, text1, text2):
        """Calculate the cosine similarity between two texts.

        Args:
            text1 (str): First text.
            text2 (str): Second text.

        Returns:
            tuple: (similarity, total_tokens, total_cost) — cosine score,
            combined token usage, and accumulated cost string.
        """
        vec_a, tokens_a, cost_a = self.get_embeddings(text1)
        vec_b, tokens_b, cost_b = self.get_embeddings(text2)

        # cos(theta) = (a . b) / (|a| * |b|)
        numerator = np.dot(vec_a, vec_b)
        denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
        similarity = numerator / denominator

        return similarity, tokens_a + tokens_b, CostManager.add_costs(cost_a, cost_b)

    def batch_get_embeddings(self, texts):
        """Get embeddings for multiple texts, one engine call per text.

        Args:
            texts (List[str]): Texts to embed.

        Returns:
            tuple: (embeddings, token_totals, total_cost) — list of vectors,
            a 3-slot accumulated token list, and the summed cost string.
        """
        vectors = []
        token_totals = [0, 0, 0]

        if not texts:
            # No work done: report a zero cost in the engine's currency.
            symbol = CostManager.get_currency_symbol(self.engine)
            return vectors, token_totals, f"0.0{symbol}"

        running_cost = None
        for position, text in enumerate(texts):
            vector, used_tokens, cost = self.get_embeddings(text)
            vectors.append(vector)
            for slot in range(3):
                token_totals[slot] += used_tokens[slot]
            # First cost is taken verbatim; the rest are accumulated.
            if position == 0:
                running_cost = cost
            else:
                running_cost = CostManager.add_costs(running_cost, cost)

        return vectors, token_totals, running_cost
|
||||
|
||||
|
||||
class WebSearchAgent:
    """Wrapper around a web-search / research engine (Bocha or Exa)."""

    def __init__(self, engine_params=None, engine=None):
        """Build the agent from a ready engine or from ``engine_params``.

        Args:
            engine_params (dict | None): Must contain ``engine_type``
                ("bocha" or "exa"); all keys go to the engine constructor.
            engine: Pre-built engine instance; takes precedence.

        Raises:
            ValueError: If neither argument is supplied or the engine type
                is not supported.
        """
        if engine is None:
            if engine_params is not None:
                self.engine_type = engine_params.get("engine_type")
                if self.engine_type == "bocha":
                    self.engine = BochaAISearchEngine(**engine_params)
                elif self.engine_type == "exa":
                    self.engine = ExaResearchEngine(**engine_params)
                else:
                    raise ValueError(f"Web search engine type '{self.engine_type}' is not supported")
            else:
                raise ValueError("engine_params must be provided")
        else:
            self.engine = engine
            # BUGFIX: previously left unset on this path, which made the
            # fallback branch of get_answer() raise AttributeError instead
            # of the intended ValueError.
            self.engine_type = None

    def get_answer(self, query, **kwargs):
        """Get a direct answer for the query.

        Args:
            query (str): The search query.
            **kwargs: Additional arguments to pass to the search engine.

        Returns:
            tuple: (answer_text, tokens, cost_string).

        Raises:
            ValueError: If the underlying engine is not a supported type.
        """
        if isinstance(self.engine, BochaAISearchEngine):
            answer, tokens, cost = self.engine.get_answer(query, **kwargs)
            return answer, tokens, str(cost)

        if isinstance(self.engine, ExaResearchEngine):
            # Exa's chat_research() returns a complete research transcript;
            # pick out the message tagged as the final answer.
            results, tokens, cost = self.engine.chat_research(query, **kwargs)
            if isinstance(results, dict) and "messages" in results:
                for message in results.get("messages", []):
                    if message.get("type") == "answer":
                        return message.get("content", ""), tokens, str(cost)
            return str(results), tokens, str(cost)

        # Fall back to the class name when the agent was built directly from
        # an engine instance and engine_type is therefore None.
        engine_name = self.engine_type or type(self.engine).__name__
        raise ValueError(f"Web search engine type '{engine_name}' is not supported")
|
||||
385
mm_agents/maestro/core/model.md
Normal file
385
mm_agents/maestro/core/model.md
Normal file
@@ -0,0 +1,385 @@
|
||||
# Supported Model Providers and Model Lists
|
||||
|
||||
## LLM Model Providers
|
||||
|
||||
### 1. OpenAI
|
||||
|
||||
**Provider**
|
||||
|
||||
- `openai`
|
||||
|
||||
**Supported Models:**
|
||||
|
||||
- `gpt-5` Window: 400,000 Max Output Tokens: 128,000
|
||||
- `gpt-5-mini` Window: 400,000 Max Output Tokens: 128,000
|
||||
- `gpt-5-nano` Window: 400,000 Max Output Tokens: 128,000
|
||||
- `gpt-4.1` Window: 1,047,576 Max Output Tokens: 32,768
|
||||
- `gpt-4.1-mini` Window: 1,047,576 Max Output Tokens: 32,768
|
||||
- `gpt-4.1-nano` Window: 1,047,576 Max Output Tokens: 32,768
|
||||
- `gpt-4o` Window: 128,000 Max Output Tokens: 16,384
|
||||
- `gpt-4o-mini` Window: 128,000 Max Output Tokens: 16,384
|
||||
- `o1` Window: 200,000 Max Output Tokens: 100,000
|
||||
- `o1-pro` Window: 200,000 Max Output Tokens: 100,000
|
||||
- `o1-mini` Window: 200,000 Max Output Tokens: 100,000
|
||||
- `o3` Window: 200,000 Max Output Tokens: 100,000
|
||||
- `o3-pro` Window: 200,000 Max Output Tokens: 100,000
|
||||
- `o3-mini` Window: 200,000 Max Output Tokens: 100,000
|
||||
- `o4-mini` Window: 200,000 Max Output Tokens: 100,000
|
||||
|
||||
**Embedding Models:**
|
||||
|
||||
- `text-embedding-3-small`
|
||||
- `text-embedding-3-large`
|
||||
- `text-embedding-ada-002`
|
||||
|
||||
📚 **Reference Link:** <https://platform.openai.com/docs/pricing>
|
||||
|
||||
---
|
||||
|
||||
### 2. Anthropic Claude
|
||||
|
||||
**Provider**
|
||||
|
||||
- `anthropic`
|
||||
|
||||
**Supported Models:**
|
||||
|
||||
- `claude-opus-4-1-20250805` Context window: 200K Max output: 32000
|
||||
- `claude-opus-4-20250514` Context window: 200K Max output: 32000
|
||||
- `claude-sonnet-4-20250514` Context window: 200K Max output: 64000
|
||||
- `claude-3-7-sonnet-20250219` Context window: 200K Max output: 64000
|
||||
- `claude-3-5-sonnet-20240620` Context window: 200K Max output: 64000
|
||||
- `claude-3-5-haiku-20241022` Context window: 200K Max output: 8192
|
||||
|
||||
📚 **Reference Link:** <https://www.anthropic.com/api>
|
||||
|
||||
---
|
||||
|
||||
### 3. AWS Bedrock
|
||||
|
||||
**Provider**
|
||||
|
||||
- `bedrock`
|
||||
|
||||
|
||||
**Supported Claude Models:**
|
||||
|
||||
- `Claude-Opus-4`
|
||||
- `Claude-Sonnet-4`
|
||||
- `Claude-Sonnet-3.7`
|
||||
- `Claude-Sonnet-3.5`
|
||||
|
||||
📚 **Reference Link:** <https://aws.amazon.com/bedrock/>
|
||||
|
||||
---
|
||||
|
||||
### 4. Google Gemini
|
||||
|
||||
**Provider**
|
||||
|
||||
- `gemini`
|
||||
|
||||
**Supported Models:**
|
||||
|
||||
- `gemini-2.5-pro` in: 1,048,576 out: 65536
|
||||
- `gemini-2.5-flash` in: 1,048,576 out: 65536
|
||||
- `gemini-2.0-flash` in: 1,048,576 out: 8192
|
||||
- `gemini-1.5-pro` in: 2,097,152 out: 8192
|
||||
- `gemini-1.5-flash` in: 1,048,576 out: 8192
|
||||
|
||||
**Embedding Models:**
|
||||
|
||||
- `gemini-embedding-001`
|
||||
|
||||
📚 **Reference Link:** <https://ai.google.dev/gemini-api/docs/pricing>
|
||||
|
||||
---
|
||||
|
||||
### 5. Groq
|
||||
|
||||
**Provider**
|
||||
|
||||
- `groq`
|
||||
|
||||
**Supported Models:**
|
||||
|
||||
- `Kimi-K2-Instruct`
|
||||
- `Llama-4-Scout-17B-16E-Instruct`
|
||||
- `Llama-4-Maverick-17B-128E-Instruct`
|
||||
- `Llama-Guard-4-12B`
|
||||
- `DeepSeek-R1-Distill-Llama-70B`
|
||||
- `Qwen3-32B`
|
||||
- `Llama-3.3-70B-Instruct`
|
||||
|
||||
📚 **Reference Link:** <https://groq.com/pricing>
|
||||
|
||||
---
|
||||
|
||||
### 6. Monica (Proxy Platform)
|
||||
|
||||
**Provider**
|
||||
|
||||
- `monica`
|
||||
|
||||
**OpenAI Models:**
|
||||
|
||||
- `gpt-4.1`
|
||||
- `gpt-4.1-mini`
|
||||
- `gpt-4.1-nano`
|
||||
- `gpt-4o-2024-11-20`
|
||||
- `gpt-4o-mini-2024-07-18`
|
||||
- `o4-mini`
|
||||
- `o3`
|
||||
|
||||
**Anthropic Claude Models:**
|
||||
|
||||
- `claude-opus-4-20250514`
|
||||
- `claude-sonnet-4-20250514`
|
||||
- `claude-3-7-sonnet-latest`
|
||||
- `claude-3-5-sonnet-20241022`
|
||||
- `claude-3-5-sonnet-20240620`
|
||||
- `claude-3-5-haiku-20241022`
|
||||
|
||||
|
||||
**Google Gemini Models:**
|
||||
|
||||
- `gemini-2.5-pro-preview-03-25`
|
||||
- `gemini-2.5-flash-lite`
|
||||
- `gemini-2.5-flash-preview-05-20`
|
||||
- `gemini-2.0-flash-001`
|
||||
- `gemini-1.5-pro-002`
|
||||
- `gemini-1.5-flash-002`
|
||||
|
||||
**DeepSeek Models:**
|
||||
|
||||
- `deepseek-reasoner`
|
||||
- `deepseek-chat`
|
||||
|
||||
**Meta Llama Models:**
|
||||
|
||||
- `Llama-4-Scout-17B-16E-Instruct` Context length: 10M tokens
|
||||
- `Llama-4-Maverick-17B-128E-Instruct` Context length: 1M tokens
|
||||
- `llama-3.3-70b-instruct`
|
||||
- `llama-3-70b-instruct`
|
||||
- `llama-3.1-405b-instruct`
|
||||
|
||||
**xAI Grok Models:**
|
||||
|
||||
- `grok-3-beta`
|
||||
- `grok-beta`
|
||||
|
||||
📚 **Reference Link:** <https://platform.monica.im/docs/en/models-and-pricing>
|
||||
|
||||
---
|
||||
|
||||
### 7. OpenRouter (Proxy Platform)
|
||||
|
||||
**Provider**
|
||||
|
||||
- `openrouter`
|
||||
|
||||
**OpenAI Models:**
|
||||
|
||||
- `gpt-4.1`
|
||||
- `gpt-4.1-mini`
|
||||
- `o1`
|
||||
- `o1-pro`
|
||||
- `o1-mini`
|
||||
- `o3`
|
||||
- `o3-pro`
|
||||
- `o3-mini`
|
||||
- `o4-mini`
|
||||
|
||||
**xAI Grok Models:**
|
||||
|
||||
- `grok-4` Total Context: 256K Max Output: 256K
|
||||
- `grok-3`
|
||||
- `grok-3-mini`
|
||||
|
||||
**Anthropic Claude Models:**
|
||||
|
||||
- `claude-opus-4`
|
||||
- `claude-sonnet-4`
|
||||
|
||||
**Google Gemini Models:**
|
||||
|
||||
- `gemini-2.5-flash`
|
||||
- `gemini-2.5-pro`
|
||||
|
||||
📚 **Reference Link:** <https://openrouter.ai/models>
|
||||
|
||||
---
|
||||
|
||||
### 8. Azure OpenAI
|
||||
|
||||
**Provider**
|
||||
|
||||
- `azure`
|
||||
|
||||
|
||||
**Supported Models:**
|
||||
|
||||
- `gpt-4.1`
|
||||
- `gpt-4.1-mini`
|
||||
- `gpt-4.1-nano`
|
||||
- `o1`
|
||||
- `o3`
|
||||
- `o4-mini`
|
||||
|
||||
📚 **Reference Link:** <https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/>
|
||||
|
||||
---
|
||||
|
||||
### 9. Lybic AI
|
||||
|
||||
**Provider:**
|
||||
|
||||
- `lybic`
|
||||
|
||||
**Supported Models:**
|
||||
|
||||
- `gpt-5`
|
||||
- `gpt-4.1`
|
||||
- `gpt-4.1-mini`
|
||||
- `gpt-4.1-nano`
|
||||
- `gpt-4.5-preview`
|
||||
- `gpt-4o`
|
||||
- `gpt-4o-realtime-preview`
|
||||
- `gpt-4o-mini`
|
||||
- `o1`
|
||||
- `o1-pro`
|
||||
- `o1-mini`
|
||||
- `o3`
|
||||
- `o3-pro`
|
||||
- `o3-mini`
|
||||
- `o4-mini`
|
||||
|
||||
**Note:** Lybic AI provides OpenAI-compatible API endpoints with the same model names and pricing structure.
|
||||
|
||||
📚 **Reference Link:** <https://aigw.lybicai.com/>
|
||||
|
||||
---
|
||||
|
||||
### 10. DeepSeek
|
||||
|
||||
**Provider**
|
||||
|
||||
- `deepseek`
|
||||
|
||||
**Supported Models:**
|
||||
|
||||
- `deepseek-chat` Context length: 128K, Output length: Default 4K, Max 8K
|
||||
- `deepseek-reasoner` Context length: 128K, Output length: Default 32K, Max 64K
|
||||
|
||||
📚 **Reference Link:** <https://platform.deepseek.com/>
|
||||
|
||||
---
|
||||
|
||||
### 11. Alibaba Cloud Qwen
|
||||
|
||||
**Supported Models:**
|
||||
|
||||
- `qwen-max-latest` Context window: 32,768 Max input token length: 30,720 Max generation token length: 8,192
|
||||
- `qwen-plus-latest` Context window: 131,072 Max input token length: 98,304 (thinking) Max generation token length: 129,024 Max output: 16,384
|
||||
- `qwen-turbo-latest` Context window: 1,000,000 Max input token length: 1,000,000 Max generation token length: 16,384
|
||||
- `qwen-vl-max-latest` (Grounding) Context window: 131,072 Max input token length: 129,024 Max generation token length: 8,192
|
||||
- `qwen-vl-plus-latest` (Grounding) Context window: 131,072 Max input token length: 129,024 Max generation token length: 8,192
|
||||
|
||||
**Embedding Models:**
|
||||
|
||||
- `text-embedding-v4`
|
||||
- `text-embedding-v3`
|
||||
|
||||
📚 **Reference Link:** <https://bailian.console.aliyun.com/?tab=doc#/doc/?type=model&url=https%3A%2F%2Fhelp.aliyun.com%2Fdocument_detail%2F2840914.html&renderType=iframe>
|
||||
|
||||
---
|
||||
|
||||
### 12. ByteDance Doubao
|
||||
|
||||
**Supported Models:**
|
||||
|
||||
- `doubao-seed-1-6-flash-250615` Context window: 256k Max input token length: 224k Max generation token length: 32k Max thinking content token length: 32k
|
||||
- `doubao-seed-1-6-thinking-250715` Context window: 256k Max input token length: 224k Max generation token length: 32k Max thinking content token length: 32k
|
||||
- `doubao-seed-1-6-250615` Context window: 256k Max input token length: 224k Max generation token length: 32k Max thinking content token length: 32k
|
||||
- `doubao-1.5-vision-pro-250328` (Grounding) Context window: 128k Max input token length: 96k Max generation token length: 16k Max thinking content token length: 32k
|
||||
- `doubao-1-5-thinking-vision-pro-250428` (Grounding) Context window: 128k Max input token length: 96k Max generation token length: 16k Max thinking content token length: 32k
|
||||
- `doubao-1-5-ui-tars-250428` (Grounding) Context window: 128k Max input token length: 96k Max generation token length: 16k Max thinking content token length: 32k
|
||||
|
||||
**Embedding Models:**
|
||||
|
||||
- `doubao-embedding-large-text-250515`
|
||||
- `doubao-embedding-text-240715`
|
||||
|
||||
📚 **Reference Link:** <https://console.volcengine.com/ark/region:ark+cn-beijing/model?vendor=Bytedance&view=LIST_VIEW>
|
||||
|
||||
---
|
||||
|
||||
### 13. Zhipu GLM
|
||||
|
||||
**Supported Models:**
|
||||
|
||||
- `GLM-4.5` Max in: 128k Max output: 0.2K
|
||||
- `GLM-4.5-X` Max in: 128k Max output: 0.2K
|
||||
- `GLM-4.5-Air` Max in: 128k Max output: 0.2K
|
||||
- `GLM-4-Plus`
|
||||
- `GLM-4-Air-250414`
|
||||
- `GLM-4-AirX` (Grounding)
|
||||
- `GLM-4V-Plus-0111` (Grounding)
|
||||
|
||||
**Embedding Models:**
|
||||
|
||||
- `Embedding-3`
|
||||
- `Embedding-2`
|
||||
|
||||
📚 **Reference Link:** <https://open.bigmodel.cn/pricing>
|
||||
|
||||
---
|
||||
|
||||
### 14. SiliconFlow
|
||||
|
||||
**Supported Models:**
|
||||
|
||||
- `Kimi-K2-Instruct` Context Length: 128K
|
||||
- `DeepSeek-V3`
|
||||
- `DeepSeek-R1`
|
||||
- `Qwen3-32B`
|
||||
|
||||
📚 **Reference Link:** <https://cloud.siliconflow.cn/sft-d1pi8rbk20jc73c62gm0/models>
|
||||
|
||||
---
|
||||
|
||||
## 🔤 Dedicated Embedding Providers
|
||||
|
||||
### 15. Jina AI
|
||||
|
||||
**Embedding Models:**
|
||||
|
||||
- `jina-embeddings-v4`
|
||||
- `jina-embeddings-v3`
|
||||
|
||||
📚 **Reference Link:** <https://jina.ai/embeddings>
|
||||
|
||||
---
|
||||
|
||||
## 🔍 AI Search Engines
|
||||
|
||||
### 16. Bocha AI
|
||||
|
||||
**Service Type:** AI Research & Search
|
||||
|
||||
📚 **Reference Link:** <https://open.bochaai.com/overview>
|
||||
|
||||
---
|
||||
|
||||
### 17. Exa
|
||||
|
||||
**Service Type:** AI Research & Search
|
||||
|
||||
**Pricing Model:**
|
||||
|
||||
- $5.00 / 1k agent searches
|
||||
- $5.00 / 1k exa-research agent page reads
|
||||
- $10.00 / 1k exa-research-pro agent page reads
|
||||
- $5.00 / 1M reasoning tokens
|
||||
|
||||
📚 **Reference Link:** <https://dashboard.exa.ai/home>
|
||||
194
mm_agents/maestro/core/model_pricing.json
Normal file
194
mm_agents/maestro/core/model_pricing.json
Normal file
@@ -0,0 +1,194 @@
|
||||
{
|
||||
"llm_models": {
|
||||
"openai": {
|
||||
"gpt-4.1": {"input": "2.00$", "output": "8.00$"},
|
||||
"gpt-4.1-mini": {"input": "0.40$", "output": "1.60$"},
|
||||
"gpt-4.1-nano": {"input": "0.10$", "output": "0.40$"},
|
||||
"gpt-4.5-preview": {"input": "75$", "output": "150$"},
|
||||
"gpt-4o": {"input": "2.5$", "output": "10$"},
|
||||
"gpt-4o-realtime-preview": {"input": "5$", "output": "20$"},
|
||||
"gpt-4o-mini": {"input": "0.15$", "output": "0.6$"},
|
||||
"o1": {"input": "15$", "output": "60$"},
|
||||
"o1-pro": {"input": "150$", "output": "600$"},
|
||||
"o1-mini": {"input": "1.10$", "output": "4.40$"},
|
||||
"o3": {"input": "2.0$", "output": "8$"},
|
||||
"o3-pro": {"input": "20$", "output": "80$"},
|
||||
"o3-mini": {"input": "1.10$", "output": "4.40$"},
|
||||
"o4-mini": {"input": "1.1$", "output": "4.40$"}
|
||||
},
|
||||
"anthropic": {
|
||||
"claude-opus-4-20250514": {"input": "15$", "output": "75$"},
|
||||
"claude-sonnet-4-20250514": {"input": "3$", "output": "15$"},
|
||||
"claude-3-7-sonnet-20250219": {"input": "3$", "output": "15$"},
|
||||
"claude-3-5-sonnet-20241022": {"input": "3$", "output": "15$"},
|
||||
"claude-3-5-haiku-20241022": {"input": "0.8$", "output": "4$"}
|
||||
},
|
||||
"qwen": {
|
||||
"qwen-max-latest": {"input": "2.4¥", "output": "9.6¥"},
|
||||
"qwen-plus-latest": {"input": "0.8¥", "output": "2¥"},
|
||||
"qwen-turbo-latest": {"input": "0.3¥", "output": "0.6¥"},
|
||||
"qwen-vl-max-latest": {"input": "3¥", "output": "9¥"},
|
||||
"qwen-vl-plus-latest": {"input": "1.5¥", "output": "4.5¥"}
|
||||
},
|
||||
"doubao": {
|
||||
"doubao-seed-1-6-flash-250615": {"input": "0.15¥", "output": "1.50¥"},
|
||||
"doubao-seed-1-6-thinking-250715": {"input": "0.8¥", "output": "8¥"},
|
||||
"doubao-seed-1-6-250615": {"input": "0.8¥", "output": "2¥"},
|
||||
"doubao-1.5-vision-pro-250328": {"input": "3¥", "output": "9¥"},
|
||||
"doubao-1-5-thinking-vision-pro-250428": {"input": "3¥", "output": "9¥"},
|
||||
"doubao-1-5-ui-tars-250428": {"input": "3.5¥", "output": "12¥"}
|
||||
},
|
||||
"deepseek": {
|
||||
"deepseek-chat": {"input": "2¥", "output": "8¥"},
|
||||
"deepseek-reasoner": {"input": "4¥", "output": "16¥"}
|
||||
},
|
||||
"zhipu": {
|
||||
"GLM-4.5": {"input": "4¥", "output": "16¥"},
|
||||
"GLM-4.5V": {"input": "4¥", "output": "12¥"},
|
||||
"GLM-4-Plus": {"input": "5¥", "output": "5¥"},
|
||||
"GLM-4-Air-250414": {"input": "0.5¥", "output": "0.5¥"},
|
||||
"GLM-4-AirX": {"input": "10¥", "output": "10¥"},
|
||||
"GLM-4V-Plus-0111": {"input": "4¥", "output": "4¥"}
|
||||
},
|
||||
"groq": {
|
||||
"Kimi-K2-Instruct": {"input": "1.00$", "output": "3.00$"},
|
||||
"Llama-4-Scout-17B-16E-Instruct": {"input": "0.11$", "output": "0.34$"},
|
||||
"Llama-4-Maverick-17B-128E-Instruct": {"input": "0.20$", "output": "0.60$"},
|
||||
"Llama-Guard-4-12B": {"input": "0.20$", "output": "0.20$"},
|
||||
"DeepSeek-R1-Distill-Llama-70B": {"input": "0.75$", "output": "0.99$"},
|
||||
"Qwen3-32B": {"input": "0.29$", "output": "0.59$"},
|
||||
"Llama-3.3-70B-Instruct": {"input": "0.59$", "output": "0.79$"}
|
||||
},
|
||||
"siliconflow": {
|
||||
"Kimi-K2-Instruct": {"input": "4¥", "output": "16¥"},
|
||||
"DeepSeek-V3": {"input": "2¥", "output": "8¥"},
|
||||
"DeepSeek-R1": {"input": "4¥", "output": "16¥"},
|
||||
"Qwen3-32B": {"input": "1¥", "output": "4¥"}
|
||||
},
|
||||
"monica": {
|
||||
"gpt-4.1": {"input": "2.00$", "output": "8.00$"},
|
||||
"gpt-4.1-mini": {"input": "0.40$", "output": "1.60$"},
|
||||
"gpt-4.1-nano": {"input": "0.10$", "output": "0.40$"},
|
||||
"gpt-4o-2024-11-20": {"input": "2.50$", "output": "10.00$"},
|
||||
"gpt-4o-mini-2024-07-18": {"input": "0.15$", "output": "0.60$"},
|
||||
"o4-mini": {"input": "0.55$", "output": "2.20$"},
|
||||
"o3": {"input": "2.00$", "output": "8.00$"},
|
||||
"claude-opus-4-20250514": {"input": "15.00$", "output": "75.00$"},
|
||||
"claude-sonnet-4-20250514": {"input": "3.00$", "output": "15.00$"},
|
||||
"claude-3-7-sonnet-latest": {"input": "3.00$", "output": "15.00$"},
|
||||
"claude-3-5-sonnet-20241022": {"input": "3.00$", "output": "15.00$"},
|
||||
"claude-3-5-sonnet-20240620": {"input": "3.00$", "output": "15.00$"},
|
||||
"claude-3-5-haiku-20241022": {"input": "0.80$", "output": "4.00$"},
|
||||
"claude-3-opus-20240229": {"input": "15.00$", "output": "75.00$"},
|
||||
"claude-3-haiku-20240307": {"input": "0.25$", "output": "1.25$"},
|
||||
"gemini-2.5-pro-preview-03-25": {"input": "1.25$", "output": "10.00$"},
|
||||
"gemini-2.5-flash-lite": {"input": "0.10$", "output": "0.40$"},
|
||||
"gemini-2.5-flash-preview-05-20": {"input": "0.30$", "output": "2.50$"},
|
||||
"gemini-2.0-flash-001": {"input": "0.10$", "output": "0.40$"},
|
||||
"gemini-1.5-pro-002": {"input": "1.25$", "output": "5.00$"},
|
||||
"gemini-1.5-flash-002": {"input": "0.075$", "output": "0.30$"},
|
||||
"deepseek-reasoner": {"input": "0.55$", "output": "2.21$"},
|
||||
"deepseek-chat": {"input": "0.28$", "output": "1.10$"},
|
||||
"llama-3-8b-instruct": {"input": "0.28$", "output": "0.83$"},
|
||||
"llama-3.1-8b-instruct": {"input": "0.025$", "output": "0.06$"},
|
||||
"llama-3.3-70b-instruct": {"input": "0.13$", "output": "0.40$"},
|
||||
"llama-3-70b-instruct": {"input": "0.88$", "output": "0.88$"},
|
||||
"llama-3.1-405b-instruct": {"input": "4.00$", "output": "4.00$"},
|
||||
"grok-3-beta": {"input": "3.00$", "output": "15.00$"},
|
||||
"grok-beta": {"input": "5.00$", "output": "15.00$"}
|
||||
},
|
||||
"gemini": {
|
||||
"gemini-2.5-pro": {"input": "1.25$", "output": "10$"},
|
||||
"gemini-2.5-flash": {"input": "0.30$", "output": "2.50$"},
|
||||
"gemini-2.0-flash": {"input": "0.10$", "output": "0.40$"},
|
||||
"gemini-1.5-pro": {"input": "1.25$", "output": "5$"},
|
||||
"gemini-1.5-flash": {"input": "0.075$", "output": "0.30$"}
|
||||
},
|
||||
"bedrock": {
|
||||
"Claude-Opus-4": {"input": "15.00$", "output": "75.00$"},
|
||||
"Claude-Sonnet-4": {"input": "3.00$", "output": "15.00$"},
|
||||
"Claude-Sonnet-3.7": {"input": "3.00$", "output": "15.00$"},
|
||||
"Claude-Sonnet-3.5": {"input": "3.00$", "output": "15.00$"}
|
||||
},
|
||||
"openrouter": {
|
||||
"openai/gpt-5": {"input": "1.25$", "output": "10$"},
|
||||
"openai/gpt-5-chat": {"input": "1.25$", "output": "10$"},
|
||||
"openai/gpt-4.1": {"input": "2$", "output": "8$"},
|
||||
"openai/gpt-4.1-mini": {"input": "0.4$", "output": "1.6$"},
|
||||
"openai/o1": {"input": "15$", "output": "60$"},
|
||||
"openai/o1-pro": {"input": "150$", "output": "600$"},
|
||||
"openai/o1-mini": {"input": "1.1$", "output": "4.4$"},
|
||||
"openai/o3": {"input": "2$", "output": "8$"},
|
||||
"openai/o3-pro": {"input": "20$", "output": "80$"},
|
||||
"openai/o3-mini": {"input": "1.1$", "output": "4.4$"},
|
||||
"openai/o4-mini": {"input": "1.1$", "output": "4.4$"},
|
||||
"x-ai/grok-4": {"input": "3$", "output": "15$"},
|
||||
"x-ai/grok-3": {"input": "3$", "output": "15$"},
|
||||
"x-ai/grok-3-mini": {"input": "0.3$", "output": "0.5$"},
|
||||
"anthropic/claude-opus-4": {"input": "15$", "output": "75$"},
|
||||
"anthropic/claude-sonnet-4": {"input": "3$", "output": "15$"},
|
||||
"google/gemini-2.5-flash": {"input": "0.3$", "output": "2.5$"},
|
||||
"google/gemini-2.5-pro": {"input": "1.25$", "output": "10$"}
|
||||
},
|
||||
"azure": {
|
||||
"gpt-4.1": {"input": "2.00$", "output": "8.00$"},
|
||||
"gpt-4.1-mini": {"input": "0.40$", "output": "1.60$"},
|
||||
"gpt-4.1-nano": {"input": "0.10$", "output": "0.40$"},
|
||||
"o1": {"input": "15$", "output": "60$"},
|
||||
"o3": {"input": "2.0$", "output": "8$"},
|
||||
"o4-mini": {"input": "1.1$", "output": "4.40$"}
|
||||
},
|
||||
"lybic": {
|
||||
"gpt-5": {"input": "1.25$", "output": "10$"},
|
||||
"gpt-4.1": {"input": "2.00$", "output": "8.00$"},
|
||||
"gpt-4.1-mini": {"input": "0.40$", "output": "1.60$"},
|
||||
"gpt-4.1-nano": {"input": "0.10$", "output": "0.40$"},
|
||||
"gpt-4.5-preview": {"input": "75$", "output": "150$"},
|
||||
"gpt-4o": {"input": "2.5$", "output": "10$"},
|
||||
"gpt-4o-realtime-preview": {"input": "5$", "output": "20$"},
|
||||
"gpt-4o-mini": {"input": "0.15$", "output": "0.6$"},
|
||||
"o1": {"input": "15$", "output": "60$"},
|
||||
"o1-pro": {"input": "150$", "output": "600$"},
|
||||
"o1-mini": {"input": "1.10$", "output": "4.40$"},
|
||||
"o3": {"input": "2.0$", "output": "8$"},
|
||||
"o3-pro": {"input": "20$", "output": "80$"},
|
||||
"o3-mini": {"input": "1.10$", "output": "4.40$"},
|
||||
"o4-mini": {"input": "1.1$", "output": "4.40$"}
|
||||
}
|
||||
},
|
||||
"embedding_models": {
|
||||
"openai": {
|
||||
"text-embedding-3-small": {"input": "0.02$", "output": ""},
|
||||
"text-embedding-3-large": {"input": "0.13$", "output": ""},
|
||||
"text-embedding-ada-002": {"input": "0.10$", "output": ""}
|
||||
},
|
||||
"qwen": {
|
||||
"text-embedding-v4": {"input": "0.0005¥", "output": ""},
|
||||
"text-embedding-v3": {"input": "0.0005¥", "output": ""}
|
||||
},
|
||||
"doubao": {
|
||||
"doubao-embedding-large-text-250515": {"input": "0.7¥", "output": ""},
|
||||
"doubao-embedding-text-240715": {"input": "0.5¥", "output": ""}
|
||||
},
|
||||
"zhipu": {
|
||||
"Embedding-3": {"input": "0.5¥", "output": ""},
|
||||
"Embedding-2": {"input": "0.5¥", "output": ""}
|
||||
},
|
||||
"jina": {
|
||||
"jina-embeddings-v4": {"input": "0.05$", "output": ""},
|
||||
"jina-embeddings-v3": {"input": "0.05$", "output": ""}
|
||||
},
|
||||
"gemini": {
|
||||
"gemini-embedding-001": {"input": "0.15$", "output": ""}
|
||||
}
|
||||
},
|
||||
"search_models": {
|
||||
"bocha": {
|
||||
"ai-search": {"cost_type": "balance", "unit": "per_query"}
|
||||
},
|
||||
"exa": {
|
||||
"search": {"cost_type": "direct", "unit": "per_query"},
|
||||
"research": {"cost_type": "direct", "unit": "per_task"}
|
||||
}
|
||||
}
|
||||
}
|
||||
481
mm_agents/maestro/core/new_knowledge.py
Normal file
481
mm_agents/maestro/core/new_knowledge.py
Normal file
@@ -0,0 +1,481 @@
|
||||
import json
|
||||
import os
|
||||
from typing import Dict, Tuple, List
|
||||
import numpy as np
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from ..utils.common_utils import (
|
||||
load_embeddings,
|
||||
load_knowledge_base,
|
||||
save_embeddings,
|
||||
)
|
||||
from ..tools.new_tools import NewTools
|
||||
from .mllm import CostManager
|
||||
|
||||
# Known embedding models mapped to their output vector dimension.
# Keys are lowercase; lookup is case-insensitive so provider spellings such
# as Zhipu's "Embedding-3" / "Embedding-2" (registered with capital E
# elsewhere in this package) resolve to the same entry.
_EMBEDDING_DIMS = {
    "doubao-embedding-large-text-250515": 2048,
    "doubao-embedding-text-240715": 2560,
    "text-embedding-ada-002": 1536,
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
    "gemini-embedding-001": 3072,
    "jina-embeddings-v4": 2048,
    "jina-embeddings-v3": 1024,
    "text-embedding-v4": 1024,
    "text-embedding-v3": 1024,
    "embedding-2": 2048,
    "embedding-3": 2048,
}


def get_embedding_dim(model_name):
    """Return the embedding vector dimension for a known embedding model.

    Args:
        model_name: Embedding model identifier. Matched case-insensitively
            against the known models above.

    Returns:
        int | None: The embedding dimension, or ``None`` when the model is
        not recognized (e.g. the ``"default"`` placeholder used by
        ``NewKnowledgeBase`` when no embedding tool is registered).
    """
    if not isinstance(model_name, str):
        # Defensive: callers may pass None/unexpected values; the original
        # exact-match chain also yielded None for such inputs.
        return None
    return _EMBEDDING_DIMS.get(model_name.lower())
|
||||
|
||||
class NewKnowledgeBase:
    """Lifelong-learning knowledge base for the Maestro agent.

    Persists two kinds of experience under ``local_kb_path/<platform>/``:

    - narrative memory (task-level summaries) in ``narrative_memory.json``
    - episodic memory (subtask-level summaries) in ``episodic_memory.json``

    Retrieval is similarity-based: instructions and stored keys are embedded
    via ``embedding_engine`` and compared with cosine similarity; embeddings
    are cached in a per-model pickle file. Four LLM tools (registered from
    ``Tools_dict``) formulate web-search queries, fuse retrieved knowledge,
    and summarize narrative/episodic trajectories.
    """

    def __init__(
        self,
        embedding_engine: NewTools,
        local_kb_path: str,
        platform: str,
        Tools_dict: Dict,
        save_knowledge: bool = True,
    ):
        """
        Initialize the KnowledgeBase module

        Args:
            embedding_engine: Embedding engine instance (a ``NewTools`` with an
                ``"embedding"`` tool registered; if absent, a ``"default"``
                placeholder name is used for the cache file).
            local_kb_path: Path to local knowledge base
            platform: Target platform (Windows/Darwin/Ubuntu)
            Tools_dict: Dictionary containing tool configurations; must contain
                ``"query_formulator"``, ``"context_fusion"``,
                ``"narrative_summarization"`` and ``"episode_summarization"``
                entries, each with ``"provider"`` and ``"model"`` keys
                (KeyError otherwise).
            save_knowledge: Whether to save knowledge embeddings
        """
        self.platform = platform

        self.local_kb_path = local_kb_path

        # initialize embedding engine
        self.embedding_engine = embedding_engine

        # Initialize paths for different memory types
        self.episodic_memory_path = os.path.join(
            self.local_kb_path, self.platform, "episodic_memory.json"
        )
        self.narrative_memory_path = os.path.join(
            self.local_kb_path, self.platform, "narrative_memory.json"
        )
        embedding_model_name = ""
        if hasattr(self.embedding_engine, "tools") and "embedding" in self.embedding_engine.tools:
            embedding_model_name = self.embedding_engine.tools["embedding"].model_name
        else:
            embedding_model_name = "default"
        embedding_dim = get_embedding_dim(embedding_model_name)
        # Cache file is keyed by model name AND dimension so switching
        # embedding models never mixes incompatible vectors.
        # NOTE(review): for unknown models embedding_dim is None, producing a
        # "..._None.pkl" filename — confirm this is intended.
        self.embeddings_path = os.path.join(
            self.local_kb_path, self.platform, f"embeddings_{embedding_model_name}_{embedding_dim}.pkl"
        )

        # Initialize trajectory tracking
        self.task_trajectory = ""            # accumulated full-task log
        self.current_subtask_trajectory = "" # log of the in-flight subtask
        self.current_search_query = ""       # first search query of the task

        # query_formulator
        self.query_formulator_name = "query_formulator"
        self.query_formulator = NewTools()
        self.query_formulator.register_tool(
            self.query_formulator_name,
            Tools_dict[self.query_formulator_name]["provider"],
            Tools_dict[self.query_formulator_name]["model"],
        )

        # knowledge_fusion_agent
        self.knowledge_fusion_agent_name = "context_fusion"
        self.knowledge_fusion_agent = NewTools()
        self.knowledge_fusion_agent.register_tool(
            self.knowledge_fusion_agent_name,
            Tools_dict[self.knowledge_fusion_agent_name]["provider"],
            Tools_dict[self.knowledge_fusion_agent_name]["model"],
        )

        # narrative_summarization_agent
        self.narrative_summarization_agent_name = "narrative_summarization"
        self.narrative_summarization_agent = NewTools()
        self.narrative_summarization_agent.register_tool(
            self.narrative_summarization_agent_name,
            Tools_dict[self.narrative_summarization_agent_name]["provider"],
            Tools_dict[self.narrative_summarization_agent_name]["model"],
        )

        # episode_summarization_agent
        self.episode_summarization_agent_name = "episode_summarization"
        self.episode_summarization_agent = NewTools()
        self.episode_summarization_agent.register_tool(
            self.episode_summarization_agent_name,
            Tools_dict[self.episode_summarization_agent_name]["provider"],
            Tools_dict[self.episode_summarization_agent_name]["model"],
        )

        self.save_knowledge = save_knowledge

    def retrieve_knowledge(
        self, instruction: str, search_query: str, search_engine: NewTools
    ) -> Tuple[str, List[int], str]:
        """Retrieve knowledge using search engine

        Args:
            instruction (str): task instruction
            search_query (str): search query to use
            search_engine (NewTools): search engine tool to use; must have a
                ``"websearch"`` tool registered

        Returns:
            Tuple[str, List[int], str]: The search results, token usage, and
            cost string
        """
        # Instruction and query are concatenated into a single web search.
        search_results, total_tokens, cost_string = search_engine.execute_tool("websearch", {"str_input": instruction + " " + search_query})

        return search_results, total_tokens, cost_string

    def formulate_query(self, instruction: str, observation: Dict) -> Tuple[str, List[int], str]:
        """Formulate search query based on instruction and current state

        Previously formulated queries are cached per-instruction in
        ``formulate_query.json``; a cache hit costs nothing.

        Args:
            instruction (str): The task instruction
            observation (Dict): Current observation; ``observation["screenshot"]``
                is forwarded to the LLM when present

        Returns:
            Tuple[str, List[int], str]: The formulated query, token usage, and
            cost string (zeros/empty on cache hit)
        """
        query_path = os.path.join(
            self.local_kb_path, self.platform, "formulate_query.json"
        )
        try:
            with open(query_path, "r") as f:
                formulate_query = json.load(f)
        except:
            # NOTE(review): bare except — missing file is expected on first
            # run, but this also hides corrupt-JSON/permission errors.
            formulate_query = {}

        if instruction in formulate_query:
            # Cache hit: no tokens spent, empty cost string.
            return formulate_query[instruction], [0, 0, 0], ""

        # Fresh conversation per formulation so prior turns don't leak in.
        self.query_formulator.tools["query_formulator"].llm_agent.reset()

        content, total_tokens, cost_string = self.query_formulator.execute_tool("query_formulator", {
            "str_input": f"The task is: {instruction}\n" +
                "To use google search to get some useful information, first carefully analyze " +
                "the screenshot of the current desktop UI state, then given the task " +
                "instruction, formulate a question that can be used to search on the Internet " +
                "for information in helping with the task execution.\n" +
                "The question should not be too general or too specific. Please ONLY provide " +
                "the question.\nQuestion:",
            "img_input": observation["screenshot"] if "screenshot" in observation else None
        })

        # Strip whitespace and quoting the model tends to add around the query.
        search_query = content.strip().replace('"', "")

        formulate_query[instruction] = search_query
        with open(query_path, "w") as f:
            json.dump(formulate_query, f, indent=2)

        return search_query, total_tokens, cost_string

    def retrieve_narrative_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
        """Retrieve narrative experience using embeddings

        Missing embeddings (for the instruction or any stored key) are computed
        on demand and persisted back to the embeddings cache.

        Args:
            instruction (str): The task instruction

        Returns:
            Tuple[str, str, List[int], str]: The most similar task key, its
            narrative experience, token usage, and cost string; ("None",
            "None", zeros, "") when the knowledge base is empty
        """

        knowledge_base = load_knowledge_base(self.narrative_memory_path)
        if not knowledge_base:
            return "None", "None", [0, 0, 0], ""

        embeddings = load_embeddings(self.embeddings_path)

        # Get or create instruction embedding
        instruction_embedding = embeddings.get(instruction)
        total_tokens, cost_string = [0, 0, 0], ""

        if instruction_embedding is None:
            instruction_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": instruction})
            embeddings[instruction] = instruction_embedding
            # Element-wise accumulation of the 3-slot token counter.
            for i in range(len(total_tokens)):
                total_tokens[i] += tokens[i]
            cost_string = cost_string_now
        # Get or create embeddings for knowledge base entries
        candidate_embeddings = []
        for key in knowledge_base:
            candidate_embedding = embeddings.get(key)
            if candidate_embedding is None:
                candidate_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": key})
                for i in range(len(tokens)):
                    total_tokens[i] += tokens[i]
                cost_string = CostManager.add_costs(cost_string, cost_string_now)
                embeddings[key] = candidate_embedding

            candidate_embeddings.append(candidate_embedding)

        save_embeddings(self.embeddings_path, embeddings)

        # NOTE(review): cosine_similarity needs 2D inputs — assumes the
        # embedding tool returns a (1, dim) array; confirm against the engine.
        similarities = cosine_similarity(
            instruction_embedding, np.vstack(candidate_embeddings)
        )[0]
        sorted_indices = np.argsort(similarities)[::-1]

        keys = list(knowledge_base.keys())
        # Skip the top hit when it is the instruction itself (exact self-match).
        idx = 1 if keys[sorted_indices[0]] == instruction else 0
        return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]], total_tokens, cost_string

    def retrieve_episodic_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
        """Retrieve similar task experience using embeddings

        Same algorithm as :meth:`retrieve_narrative_experience`, but over the
        episodic (subtask-level) memory store.

        Args:
            instruction (str): The task instruction

        Returns:
            Tuple[str, str, List[int], str]: The most similar subtask key, its
            episodic experience, token usage, and cost string; ("None",
            "None", zeros, "") when the knowledge base is empty
        """

        knowledge_base = load_knowledge_base(self.episodic_memory_path)
        if not knowledge_base:
            return "None", "None", [0, 0, 0], ""

        embeddings = load_embeddings(self.embeddings_path)

        # Get or create instruction embedding
        instruction_embedding = embeddings.get(instruction)
        total_tokens, cost_string = [0, 0, 0], ""

        if instruction_embedding is None:
            instruction_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": instruction})
            embeddings[instruction] = instruction_embedding

            # Element-wise accumulation of the 3-slot token counter.
            for i in range(len(total_tokens)):
                total_tokens[i] += tokens[i]
            cost_string = cost_string_now

        # Get or create embeddings for knowledge base entries
        candidate_embeddings = []
        for key in knowledge_base:
            candidate_embedding = embeddings.get(key)
            if candidate_embedding is None:
                candidate_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": key})
                for i in range(len(total_tokens)):
                    total_tokens[i] += tokens[i]
                cost_string = CostManager.add_costs(cost_string, cost_string_now)
                embeddings[key] = candidate_embedding

            candidate_embeddings.append(candidate_embedding)

        save_embeddings(self.embeddings_path, embeddings)

        # NOTE(review): cosine_similarity needs 2D inputs — assumes the
        # embedding tool returns a (1, dim) array; confirm against the engine.
        similarities = cosine_similarity(
            instruction_embedding, np.vstack(candidate_embeddings)
        )[0]
        sorted_indices = np.argsort(similarities)[::-1]

        keys = list(knowledge_base.keys())
        # Skip the top hit when it is the instruction itself (exact self-match).
        idx = 1 if keys[sorted_indices[0]] == instruction else 0
        return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]], total_tokens, cost_string

    def knowledge_fusion(
        self,
        observation: Dict,
        instruction: str,
        web_knowledge: str,
        similar_task: str,
        experience: str,
    ) -> Tuple[str, list, str]:
        """Combine web knowledge with similar task experience.

        Args:
            observation: Current observation; ``observation["screenshot"]`` is
                forwarded to the LLM when present.
            instruction: The main task instruction.
            web_knowledge: Raw web-search result text.
            similar_task: Key of the retrieved similar task.
            experience: Stored experience for that similar task.

        Returns:
            Tuple[str, list, str]: Fused knowledge text, token usage, and
            cost string.
        """

        content, total_tokens, cost = self.knowledge_fusion_agent.execute_tool("context_fusion", {
            "str_input": f"Task: {instruction}\n" +
                f"**Web search result**:\n{web_knowledge}\n\n" +
                f"**Retrieved similar task experience**:\n" +
                f"Similar task:{similar_task}\n{experience}\n\n" +
                f"Based on the web search result and the retrieved similar task experience, " +
                f"if you think the similar task experience is indeed useful to the main task, " +
                f"integrate it with the web search result. Provide the final knowledge in a numbered list.",
            "img_input": observation["screenshot"] if "screenshot" in observation else None
        })

        return content, total_tokens, cost


    def save_episodic_memory(self, subtask_key: str, subtask_traj: str) -> None:
        """Save episodic memory (subtask level knowledge).

        Args:
            subtask_key (str): Key identifying the subtask
            subtask_traj (str): Trajectory/experience of the subtask

        Returns:
            The stored value for ``subtask_key`` (despite the ``-> None``
            annotation), or ``None`` when ``save_knowledge`` is disabled.
        """
        if not self.save_knowledge:
            return

        try:
            kb = load_knowledge_base(self.episodic_memory_path)
        except:
            # NOTE(review): bare except — also hides non-I/O errors.
            kb = {}

        if subtask_key not in kb:
            # NOTE(review): summarize_episode returns a (content, tokens,
            # cost) tuple; the whole tuple is stored in the JSON KB, not just
            # the summary text — confirm this is intended.
            subtask_summarization = self.summarize_episode(subtask_traj)
            kb[subtask_key] = subtask_summarization

            if self.save_knowledge:
                os.makedirs(os.path.dirname(self.episodic_memory_path), exist_ok=True)
                with open(self.episodic_memory_path, "w") as fout:
                    json.dump(kb, fout, indent=2)

        return kb.get(subtask_key)

    def save_narrative_memory(self, task_key: str, task_traj: str) -> None:
        """Save narrative memory (task level knowledge).

        Args:
            task_key (str): Key identifying the task
            task_traj (str): Full trajectory/experience of the task

        Returns:
            The stored value for ``task_key`` (despite the ``-> None``
            annotation), or ``None`` when ``save_knowledge`` is disabled.
        """
        if not self.save_knowledge:
            return

        try:
            kb = load_knowledge_base(self.narrative_memory_path)
        except:
            # NOTE(review): bare except — also hides non-I/O errors.
            kb = {}

        if task_key not in kb:
            # NOTE(review): summarize_narrative returns a (content, tokens,
            # cost) tuple; the whole tuple is stored in the JSON KB — confirm.
            task_summarization = self.summarize_narrative(task_traj)
            kb[task_key] = task_summarization

            if self.save_knowledge:
                os.makedirs(os.path.dirname(self.narrative_memory_path), exist_ok=True)
                with open(self.narrative_memory_path, "w") as fout:
                    json.dump(kb, fout, indent=2)

        return kb.get(task_key)

    def initialize_task_trajectory(self, instruction: str) -> None:
        """Initialize a new task trajectory.

        Resets all per-task trajectory state.

        Args:
            instruction (str): The task instruction
        """
        self.task_trajectory = f"Task:\n{instruction}"
        self.current_search_query = ""
        self.current_subtask_trajectory = ""

    def update_task_trajectory(self, meta_data: Dict) -> None:
        """Update the task trajectory with new metadata.

        Args:
            meta_data (Dict): Metadata from the agent's prediction; must
                contain ``"reflection"`` and ``"executor_plan"``, and may
                contain ``"search_query"`` (captured once per task).
        """
        # Keep only the first search query seen for this task.
        if not self.current_search_query and "search_query" in meta_data:
            self.current_search_query = meta_data["search_query"]

        self.task_trajectory += (
            "\n\nReflection:\n"
            + str(meta_data["reflection"])
            + "\n\n----------------------\n\nPlan:\n"
            + meta_data["executor_plan"]
        )

    def handle_subtask_trajectory(self, meta_data: Dict):
        """Handle subtask trajectory updates based on subtask status.

        On "Start"/"Done": finalizes any in-flight subtask trajectory (saving
        it to episodic memory) or begins a new one. On "In": appends the plan
        to the current trajectory.

        Args:
            meta_data (Dict): Metadata containing ``"subtask_status"``
                ("Start"/"Done"/"In"), ``"subtask"``, ``"subtask_info"`` and
                ``"executor_plan"``

        Returns:
            bool: True when a previous subtask was finalized, False otherwise.
            NOTE(review): implicitly returns None for any other status value.
        """
        subtask_status = meta_data["subtask_status"]
        subtask = meta_data["subtask"]
        subtask_info = meta_data["subtask_info"]

        if subtask_status in ["Start", "Done"]:
            # If there's an existing subtask trajectory, finalize it
            if self.current_subtask_trajectory:
                self.current_subtask_trajectory += "\nSubtask Completed.\n"
                # Episodic key is everything before the first plan separator.
                subtask_key = self.current_subtask_trajectory.split(
                    "\n----------------------\n\nPlan:\n"
                )[0]
                self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)
                self.current_subtask_trajectory = ""
                return True

            # Start new subtask trajectory
            self.current_subtask_trajectory = (
                f"Task:\n{self.current_search_query}\n\n"
                f"Subtask: {subtask}\n"
                f"Subtask Instruction: {subtask_info}\n"
                f"----------------------\n\n"
                f'Plan:\n{meta_data["executor_plan"]}\n'
            )
            return False

        elif subtask_status == "In":
            # Continue current subtask trajectory
            self.current_subtask_trajectory += (
                f'\n----------------------\n\nPlan:\n{meta_data["executor_plan"]}\n'
            )
            return False

    def finalize_task(self) -> None:
        """Finalize the task by saving any remaining trajectories."""
        # Save any remaining subtask trajectory
        if self.current_subtask_trajectory:
            self.current_subtask_trajectory += "\nSubtask Completed.\n"
            subtask_key = self.current_subtask_trajectory.split(
                "\n----------------------\n\nPlan:\n"
            )[0]
            self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)

        # Save the complete task trajectory
        # (narrative memory is keyed by the task's first search query)
        if self.task_trajectory and self.current_search_query:
            self.save_narrative_memory(self.current_search_query, self.task_trajectory)

        # Reset trajectories
        self.task_trajectory = ""
        self.current_subtask_trajectory = ""
        self.current_search_query = ""

    def summarize_episode(self, trajectory: str) -> Tuple[str, List[int], str]:
        """Summarize the episode experience for lifelong learning reflection

        Args:
            trajectory (str): The episode experience to be summarized

        Returns:
            Tuple[str, List[int], str]: The summarized episode experience,
            token usage, and cost string
        """

        # Create Reflection on whole trajectories for next round trial, keep earlier messages as exemplars
        content, total_tokens, cost = self.episode_summarization_agent.execute_tool("episode_summarization", {"str_input": trajectory})

        return content, total_tokens, cost

    def summarize_narrative(self, trajectory: str) -> Tuple[str, List[int], str]:
        """Summarize the narrative experience for lifelong learning reflection

        Args:
            trajectory (str): The narrative experience to be summarized

        Returns:
            Tuple[str, List[int], str]: The summarized narrative experience,
            token usage, and cost string
        """
        # Create Reflection on whole trajectories for next round trial
        content, total_tokens, cost = self.narrative_summarization_agent.execute_tool("narrative_summarization", {"str_input": trajectory})

        return content, total_tokens, cost
|
||||
Reference in New Issue
Block a user