Add multiple new modules and tools to enhance the functionality and extensibility of the Maestro project (#333)
* Added a **pyproject.toml** file to define project metadata and dependencies. * Added **run_maestro.py** and **osworld_run_maestro.py** to provide the main execution logic. * Introduced multiple new modules, including **Evaluator**, **Controller**, **Manager**, and **Sub-Worker**, supporting task planning, state management, and data analysis. * Added a **tools module** containing utility functions and tool configurations to improve code reusability. * Updated the **README** and documentation with usage examples and module descriptions. These changes lay the foundation for expanding the Maestro project's functionality and improving the user experience. Co-authored-by: Hiroid <guoliangxuan@deepmatrix.com>
This commit is contained in:
481
mm_agents/maestro/core/new_knowledge.py
Normal file
481
mm_agents/maestro/core/new_knowledge.py
Normal file
@@ -0,0 +1,481 @@
|
||||
import json
|
||||
import os
|
||||
from typing import Dict, Tuple, List
|
||||
import numpy as np
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from ..utils.common_utils import (
|
||||
load_embeddings,
|
||||
load_knowledge_base,
|
||||
save_embeddings,
|
||||
)
|
||||
from ..tools.new_tools import NewTools
|
||||
from .mllm import CostManager
|
||||
|
||||
# Output dimensionality of each supported embedding model. Unknown models
# deliberately map to None so the caller can detect an unconfigured model.
_EMBEDDING_DIMS = {
    "doubao-embedding-large-text-250515": 2048,
    "doubao-embedding-text-240715": 2560,
    "text-embedding-ada-002": 1536,
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
    "gemini-embedding-001": 3072,
    "jina-embeddings-v4": 2048,
    "jina-embeddings-v3": 1024,
    "text-embedding-v4": 1024,
    "text-embedding-v3": 1024,
    "embedding-2": 2048,
    "embedding-3": 2048,
}


def get_embedding_dim(model_name):
    """Return the embedding vector dimensionality for *model_name*.

    Args:
        model_name: Name of the embedding model.

    Returns:
        The model's output dimension as an int, or None for unknown models.
    """
    return _EMBEDDING_DIMS.get(model_name)
|
||||
|
||||
class NewKnowledgeBase:
    """Persistent knowledge base backing Maestro's lifelong-learning loop.

    Stores and retrieves two kinds of experience under
    ``local_kb_path/<platform>/``:

    * narrative memory — task-level trajectories (``narrative_memory.json``)
    * episodic memory  — subtask-level trajectories (``episodic_memory.json``)

    Retrieval is embedding-similarity based; embeddings are cached on disk in
    a per-model pickle file so vectors from different models never mix.
    """

    def __init__(
        self,
        embedding_engine: NewTools,
        local_kb_path: str,
        platform: str,
        Tools_dict: Dict,
        save_knowledge: bool = True,
    ):
        """Initialize the KnowledgeBase module.

        Args:
            embedding_engine: Embedding engine instance.
            local_kb_path: Path to the local knowledge base root.
            platform: Target platform (Windows/Darwin/Ubuntu).
            Tools_dict: Tool configurations; must contain a
                "provider"/"model" entry for each agent registered below.
            save_knowledge: Whether to persist knowledge to disk.
        """
        self.platform = platform
        self.local_kb_path = local_kb_path

        # Embedding engine used for similarity-based retrieval.
        self.embedding_engine = embedding_engine

        # Paths for the two persisted memory types.
        self.episodic_memory_path = os.path.join(
            self.local_kb_path, self.platform, "episodic_memory.json"
        )
        self.narrative_memory_path = os.path.join(
            self.local_kb_path, self.platform, "narrative_memory.json"
        )

        # The embedding cache file is keyed by model name and dimension so
        # switching embedding models never mixes incompatible vectors.
        if hasattr(self.embedding_engine, "tools") and "embedding" in self.embedding_engine.tools:
            embedding_model_name = self.embedding_engine.tools["embedding"].model_name
        else:
            embedding_model_name = "default"
        embedding_dim = get_embedding_dim(embedding_model_name)
        self.embeddings_path = os.path.join(
            self.local_kb_path, self.platform, f"embeddings_{embedding_model_name}_{embedding_dim}.pkl"
        )

        # Trajectory-tracking state for the task currently being executed.
        self.task_trajectory = ""
        self.current_subtask_trajectory = ""
        self.current_search_query = ""

        # One LLM-backed agent per knowledge-base role, all configured the
        # same way from Tools_dict.
        self.query_formulator_name = "query_formulator"
        self.query_formulator = self._build_agent(self.query_formulator_name, Tools_dict)

        self.knowledge_fusion_agent_name = "context_fusion"
        self.knowledge_fusion_agent = self._build_agent(self.knowledge_fusion_agent_name, Tools_dict)

        self.narrative_summarization_agent_name = "narrative_summarization"
        self.narrative_summarization_agent = self._build_agent(self.narrative_summarization_agent_name, Tools_dict)

        self.episode_summarization_agent_name = "episode_summarization"
        self.episode_summarization_agent = self._build_agent(self.episode_summarization_agent_name, Tools_dict)

        self.save_knowledge = save_knowledge

    @staticmethod
    def _build_agent(tool_name: str, Tools_dict: Dict) -> NewTools:
        """Create a NewTools instance with a single registered tool.

        The provider/model pair is looked up in Tools_dict under *tool_name*.
        """
        agent = NewTools()
        agent.register_tool(
            tool_name,
            Tools_dict[tool_name]["provider"],
            Tools_dict[tool_name]["model"],
        )
        return agent

    def retrieve_knowledge(
        self, instruction: str, search_query: str, search_engine: NewTools
    ) -> Tuple[str, List[int], str]:
        """Retrieve knowledge from the web using a search engine tool.

        Args:
            instruction (str): Task instruction.
            search_query (str): Search query to use.
            search_engine (NewTools): Search engine tool to use.

        Returns:
            Tuple[str, List[int], str]: Search results, token usage, and
            cost string.
        """
        search_results, total_tokens, cost_string = search_engine.execute_tool(
            "websearch", {"str_input": instruction + " " + search_query}
        )
        return search_results, total_tokens, cost_string

    def formulate_query(self, instruction: str, observation: Dict) -> Tuple[str, List[int], str]:
        """Formulate a web-search query for the task, with an on-disk cache.

        Args:
            instruction (str): The task instruction.
            observation (Dict): Current observation; may include a
                "screenshot" entry forwarded to the model.

        Returns:
            Tuple[str, List[int], str]: The formulated query, token usage,
            and cost string (zeros/empty on a cache hit).
        """
        query_path = os.path.join(
            self.local_kb_path, self.platform, "formulate_query.json"
        )
        # Load the per-platform query cache; a missing or corrupt file just
        # means an empty cache.
        try:
            with open(query_path, "r") as f:
                query_cache = json.load(f)
        except (OSError, json.JSONDecodeError):
            query_cache = {}

        if instruction in query_cache:
            return query_cache[instruction], [0, 0, 0], ""

        # Start from a clean conversation so earlier tasks cannot leak into
        # the formulated query.
        self.query_formulator.tools["query_formulator"].llm_agent.reset()

        content, total_tokens, cost_string = self.query_formulator.execute_tool("query_formulator", {
            "str_input": f"The task is: {instruction}\n" +
            "To use google search to get some useful information, first carefully analyze " +
            "the screenshot of the current desktop UI state, then given the task " +
            "instruction, formulate a question that can be used to search on the Internet " +
            "for information in helping with the task execution.\n" +
            "The question should not be too general or too specific. Please ONLY provide " +
            "the question.\nQuestion:",
            "img_input": observation["screenshot"] if "screenshot" in observation else None
        })

        search_query = content.strip().replace('"', "")

        # Persist the new query so repeated runs of the same task skip the
        # model call.
        query_cache[instruction] = search_query
        with open(query_path, "w") as f:
            json.dump(query_cache, f, indent=2)

        return search_query, total_tokens, cost_string

    def _retrieve_similar_experience(
        self, memory_path: str, instruction: str
    ) -> Tuple[str, str, List[int], str]:
        """Embedding-similarity lookup shared by both memory types.

        Loads the knowledge base at *memory_path*, embeds any entries (and
        the instruction) that are not yet cached, persists the updated
        embedding cache, and returns the most similar entry — skipping an
        exact self-match.

        Returns:
            Tuple[str, str, List[int], str]: Best-matching key, its stored
            experience, token usage, and cost string; ("None", "None",
            [0, 0, 0], "") when the knowledge base is empty.
        """
        knowledge_base = load_knowledge_base(memory_path)
        if not knowledge_base:
            return "None", "None", [0, 0, 0], ""

        embeddings = load_embeddings(self.embeddings_path)

        total_tokens, cost_string = [0, 0, 0], ""

        # Get or create the instruction embedding.
        instruction_embedding = embeddings.get(instruction)
        if instruction_embedding is None:
            instruction_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool(
                "embedding", {"str_input": instruction}
            )
            embeddings[instruction] = instruction_embedding
            for i in range(len(total_tokens)):
                total_tokens[i] += tokens[i]
            cost_string = cost_string_now

        # Get or create embeddings for every knowledge-base entry,
        # accumulating token usage and cost for each model call.
        candidate_embeddings = []
        for key in knowledge_base:
            candidate_embedding = embeddings.get(key)
            if candidate_embedding is None:
                candidate_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool(
                    "embedding", {"str_input": key}
                )
                for i in range(len(total_tokens)):
                    total_tokens[i] += tokens[i]
                cost_string = CostManager.add_costs(cost_string, cost_string_now)
                embeddings[key] = candidate_embedding
            candidate_embeddings.append(candidate_embedding)

        # Persist newly computed embeddings so future lookups are cache hits.
        save_embeddings(self.embeddings_path, embeddings)

        similarities = cosine_similarity(
            instruction_embedding, np.vstack(candidate_embeddings)
        )[0]
        sorted_indices = np.argsort(similarities)[::-1]

        keys = list(knowledge_base.keys())
        # If the best match is the instruction itself, fall back to the
        # second-best entry.
        idx = 1 if keys[sorted_indices[0]] == instruction else 0
        best_key = keys[sorted_indices[idx]]
        return best_key, knowledge_base[best_key], total_tokens, cost_string

    def retrieve_narrative_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
        """Retrieve the most similar task-level (narrative) experience.

        Args:
            instruction (str): The task instruction.

        Returns:
            Tuple[str, str, List[int], str]: Similar task key, its narrative
            experience, token usage, and cost string.
        """
        return self._retrieve_similar_experience(self.narrative_memory_path, instruction)

    def retrieve_episodic_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
        """Retrieve the most similar subtask-level (episodic) experience.

        Args:
            instruction (str): The task instruction.

        Returns:
            Tuple[str, str, List[int], str]: Similar task key, its episodic
            experience, token usage, and cost string.
        """
        return self._retrieve_similar_experience(self.episodic_memory_path, instruction)

    def knowledge_fusion(
        self,
        observation: Dict,
        instruction: str,
        web_knowledge: str,
        similar_task: str,
        experience: str,
    ) -> Tuple[str, list, str]:
        """Combine web knowledge with similar task experience.

        Asks the fusion agent to merge the web-search result with the
        retrieved experience when the latter is judged useful.

        Returns:
            Tuple[str, list, str]: Fused knowledge text, token usage, and
            cost string.
        """
        content, total_tokens, cost = self.knowledge_fusion_agent.execute_tool("context_fusion", {
            "str_input": f"Task: {instruction}\n" +
            f"**Web search result**:\n{web_knowledge}\n\n" +
            f"**Retrieved similar task experience**:\n" +
            f"Similar task:{similar_task}\n{experience}\n\n" +
            f"Based on the web search result and the retrieved similar task experience, " +
            f"if you think the similar task experience is indeed useful to the main task, " +
            f"integrate it with the web search result. Provide the final knowledge in a numbered list.",
            "img_input": observation["screenshot"] if "screenshot" in observation else None
        })

        return content, total_tokens, cost

    def _save_memory(self, memory_path: str, key: str, trajectory: str, summarize):
        """Summarize and persist a trajectory under *key* in a JSON memory file.

        Shared implementation for episodic and narrative memory. Existing
        entries are never overwritten; *summarize* is only invoked for new
        keys.

        Returns:
            The stored summarization for *key*, or None when saving is
            disabled.
        """
        if not self.save_knowledge:
            return None

        # Treat a missing/unreadable memory file as an empty knowledge base.
        try:
            kb = load_knowledge_base(memory_path)
        except Exception:
            kb = {}

        if key not in kb:
            kb[key] = summarize(trajectory)

        os.makedirs(os.path.dirname(memory_path), exist_ok=True)
        with open(memory_path, "w") as fout:
            json.dump(kb, fout, indent=2)

        return kb.get(key)

    def save_episodic_memory(self, subtask_key: str, subtask_traj: str):
        """Save episodic memory (subtask-level knowledge).

        Args:
            subtask_key (str): Key identifying the subtask.
            subtask_traj (str): Trajectory/experience of the subtask.

        Returns:
            The stored summarization, or None when saving is disabled.
        """
        return self._save_memory(
            self.episodic_memory_path, subtask_key, subtask_traj, self.summarize_episode
        )

    def save_narrative_memory(self, task_key: str, task_traj: str):
        """Save narrative memory (task-level knowledge).

        Args:
            task_key (str): Key identifying the task.
            task_traj (str): Full trajectory/experience of the task.

        Returns:
            The stored summarization, or None when saving is disabled.
        """
        return self._save_memory(
            self.narrative_memory_path, task_key, task_traj, self.summarize_narrative
        )

    def initialize_task_trajectory(self, instruction: str) -> None:
        """Initialize a new task trajectory.

        Args:
            instruction (str): The task instruction.
        """
        self.task_trajectory = f"Task:\n{instruction}"
        self.current_search_query = ""
        self.current_subtask_trajectory = ""

    def update_task_trajectory(self, meta_data: Dict) -> None:
        """Append the latest reflection and plan to the task trajectory.

        Args:
            meta_data (Dict): Metadata from the agent's prediction; must
                contain "reflection" and "executor_plan", and may contain
                "search_query".
        """
        # Capture the search query once, the first time it appears.
        if not self.current_search_query and "search_query" in meta_data:
            self.current_search_query = meta_data["search_query"]

        self.task_trajectory += (
            "\n\nReflection:\n"
            + str(meta_data["reflection"])
            + "\n\n----------------------\n\nPlan:\n"
            + meta_data["executor_plan"]
        )

    def handle_subtask_trajectory(self, meta_data: Dict):
        """Handle subtask trajectory updates based on subtask status.

        Args:
            meta_data (Dict): Metadata containing "subtask_status",
                "subtask", "subtask_info", and "executor_plan".

        Returns:
            bool: True when an in-flight subtask was finalized, else False.
        """
        subtask_status = meta_data["subtask_status"]
        subtask = meta_data["subtask"]
        subtask_info = meta_data["subtask_info"]

        if subtask_status in ["Start", "Done"]:
            # If there's an existing subtask trajectory, finalize it.
            if self.current_subtask_trajectory:
                self.current_subtask_trajectory += "\nSubtask Completed.\n"
                # The text before the first plan separator identifies the
                # subtask and becomes its memory key.
                subtask_key = self.current_subtask_trajectory.split(
                    "\n----------------------\n\nPlan:\n"
                )[0]
                self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)
                self.current_subtask_trajectory = ""
                return True

            # Start a new subtask trajectory.
            self.current_subtask_trajectory = (
                f"Task:\n{self.current_search_query}\n\n"
                f"Subtask: {subtask}\n"
                f"Subtask Instruction: {subtask_info}\n"
                f"----------------------\n\n"
                f'Plan:\n{meta_data["executor_plan"]}\n'
            )
            return False

        elif subtask_status == "In":
            # Continue the current subtask trajectory.
            self.current_subtask_trajectory += (
                f'\n----------------------\n\nPlan:\n{meta_data["executor_plan"]}\n'
            )
            return False

        # Unknown status: nothing was completed.
        return False

    def finalize_task(self) -> None:
        """Finalize the task by saving any remaining trajectories."""
        # Save any remaining subtask trajectory.
        if self.current_subtask_trajectory:
            self.current_subtask_trajectory += "\nSubtask Completed.\n"
            subtask_key = self.current_subtask_trajectory.split(
                "\n----------------------\n\nPlan:\n"
            )[0]
            self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)

        # Save the complete task trajectory, keyed by the search query.
        if self.task_trajectory and self.current_search_query:
            self.save_narrative_memory(self.current_search_query, self.task_trajectory)

        # Reset trajectory state for the next task.
        self.task_trajectory = ""
        self.current_subtask_trajectory = ""
        self.current_search_query = ""

    def summarize_episode(self, trajectory: str) -> Tuple[str, List[int], str]:
        """Summarize an episode experience for lifelong-learning reflection.

        Args:
            trajectory (str): The episode experience to be summarized.

        Returns:
            Tuple[str, List[int], str]: Summary text, token usage, and cost
            string.
        """
        # Create a reflection on the whole trajectory for the next round.
        content, total_tokens, cost = self.episode_summarization_agent.execute_tool(
            "episode_summarization", {"str_input": trajectory}
        )
        return content, total_tokens, cost

    def summarize_narrative(self, trajectory: str) -> Tuple[str, List[int], str]:
        """Summarize a narrative experience for lifelong-learning reflection.

        Args:
            trajectory (str): The narrative experience to be summarized.

        Returns:
            Tuple[str, List[int], str]: Summary text, token usage, and cost
            string.
        """
        # Create a reflection on the whole trajectory for the next round.
        content, total_tokens, cost = self.narrative_summarization_agent.execute_tool(
            "narrative_summarization", {"str_input": trajectory}
        )
        return content, total_tokens, cost
|
||||
Reference in New Issue
Block a user