* Added a **pyproject.toml** file to define project metadata and dependencies.
* Added **run_maestro.py** and **osworld_run_maestro.py** to provide the main execution logic.
* Introduced multiple new modules, including **Evaluator**, **Controller**, **Manager**, and **Sub-Worker**, supporting task planning, state management, and data analysis.
* Added a **tools module** containing utility functions and tool configurations to improve code reusability.
* Updated the **README** and documentation with usage examples and module descriptions.

These changes lay the foundation for expanding the Maestro project's functionality and improving the user experience.

Co-authored-by: Hiroid <guoliangxuan@deepmatrix.com>
import json
import os
from typing import Dict, List, Optional, Tuple

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from ..utils.common_utils import (
    load_embeddings,
    load_knowledge_base,
    save_embeddings,
)
from ..tools.new_tools import NewTools
from .mllm import CostManager

# Output dimensionality of each supported embedding model.
EMBEDDING_DIMS = {
    "doubao-embedding-large-text-250515": 2048,
    "doubao-embedding-text-240715": 2560,
    "text-embedding-ada-002": 1536,
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
    "gemini-embedding-001": 3072,
    "jina-embeddings-v4": 2048,
    "jina-embeddings-v3": 1024,
    "text-embedding-v4": 1024,
    "text-embedding-v3": 1024,
    "embedding-2": 2048,
    "embedding-3": 2048,
}


def get_embedding_dim(model_name: str) -> Optional[int]:
    """Return the embedding dimension for a known model name, or None."""
    return EMBEDDING_DIMS.get(model_name)
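
# For example (dimensions taken from the table above):
#   >>> get_embedding_dim("text-embedding-3-large")
#   3072
#   >>> get_embedding_dim("unknown-model") is None
#   True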

class NewKnowledgeBase:
    def __init__(
        self,
        embedding_engine: NewTools,
        local_kb_path: str,
        platform: str,
        Tools_dict: Dict,
        save_knowledge: bool = True,
    ):
        """Initialize the KnowledgeBase module.

        Args:
            embedding_engine: Embedding engine instance
            local_kb_path: Path to the local knowledge base
            platform: Target platform (Windows/Darwin/Ubuntu)
            Tools_dict: Dictionary containing tool configurations
            save_knowledge: Whether to save knowledge embeddings
        """
        self.platform = platform
        self.local_kb_path = local_kb_path

        # Initialize the embedding engine
        self.embedding_engine = embedding_engine

        # Initialize paths for the different memory types
        self.episodic_memory_path = os.path.join(
            self.local_kb_path, self.platform, "episodic_memory.json"
        )
        self.narrative_memory_path = os.path.join(
            self.local_kb_path, self.platform, "narrative_memory.json"
        )

        # Derive the embedding model name so the embeddings cache is keyed by
        # model and dimension; embeddings from different models never mix.
        if hasattr(self.embedding_engine, "tools") and "embedding" in self.embedding_engine.tools:
            embedding_model_name = self.embedding_engine.tools["embedding"].model_name
        else:
            embedding_model_name = "default"
        embedding_dim = get_embedding_dim(embedding_model_name)
        self.embeddings_path = os.path.join(
            self.local_kb_path,
            self.platform,
            f"embeddings_{embedding_model_name}_{embedding_dim}.pkl",
        )

        # Initialize trajectory tracking
        self.task_trajectory = ""
        self.current_subtask_trajectory = ""
        self.current_search_query = ""

        def _make_agent(name: str) -> NewTools:
            # Each agent wraps one registered LLM tool configured from
            # Tools_dict[name]["provider"] and Tools_dict[name]["model"].
            agent = NewTools()
            agent.register_tool(
                name, Tools_dict[name]["provider"], Tools_dict[name]["model"]
            )
            return agent

        # query_formulator: turns (instruction, screenshot) into a search query
        self.query_formulator_name = "query_formulator"
        self.query_formulator = _make_agent(self.query_formulator_name)

        # context_fusion: merges web knowledge with retrieved experience
        self.knowledge_fusion_agent_name = "context_fusion"
        self.knowledge_fusion_agent = _make_agent(self.knowledge_fusion_agent_name)

        # narrative_summarization: summarizes full task trajectories
        self.narrative_summarization_agent_name = "narrative_summarization"
        self.narrative_summarization_agent = _make_agent(self.narrative_summarization_agent_name)

        # episode_summarization: summarizes subtask trajectories
        self.episode_summarization_agent_name = "episode_summarization"
        self.episode_summarization_agent = _make_agent(self.episode_summarization_agent_name)

        self.save_knowledge = save_knowledge
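
    # Expected Tools_dict shape (a sketch; the agent-name keys above are the
    # ones this class reads, while the provider/model values are illustrative):
    #
    #   {
    #       "query_formulator": {"provider": "openai", "model": "gpt-4o"},
    #       "context_fusion": {"provider": "openai", "model": "gpt-4o"},
    #       "narrative_summarization": {"provider": "openai", "model": "gpt-4o"},
    #       "episode_summarization": {"provider": "openai", "model": "gpt-4o"},
    #   }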

    def retrieve_knowledge(
        self, instruction: str, search_query: str, search_engine: NewTools
    ) -> Tuple[str, List[int], str]:
        """Retrieve knowledge using a web search engine.

        Args:
            instruction (str): task instruction
            search_query (str): search query to use
            search_engine (NewTools): search engine tool to use

        Returns:
            Tuple[str, List[int], str]: The search results, token usage, and cost
        """
        search_results, total_tokens, cost_string = search_engine.execute_tool(
            "websearch", {"str_input": instruction + " " + search_query}
        )
        return search_results, total_tokens, cost_string

    def formulate_query(self, instruction: str, observation: Dict) -> Tuple[str, List[int], str]:
        """Formulate a web search query based on the instruction and current state.

        Args:
            instruction (str): The task instruction
            observation (Dict): Current observation including screenshot

        Returns:
            Tuple[str, List[int], str]: The formulated query, token usage, and cost
        """
        query_path = os.path.join(
            self.local_kb_path, self.platform, "formulate_query.json"
        )
        try:
            with open(query_path, "r") as f:
                formulate_query = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            formulate_query = {}

        # Cache hit: reuse the previously formulated query at zero cost.
        if instruction in formulate_query:
            return formulate_query[instruction], [0, 0, 0], ""

        self.query_formulator.tools["query_formulator"].llm_agent.reset()

        content, total_tokens, cost_string = self.query_formulator.execute_tool(
            "query_formulator",
            {
                "str_input": f"The task is: {instruction}\n"
                + "To use google search to get some useful information, first carefully analyze "
                + "the screenshot of the current desktop UI state, then given the task "
                + "instruction, formulate a question that can be used to search on the Internet "
                + "for information in helping with the task execution.\n"
                + "The question should not be too general or too specific. Please ONLY provide "
                + "the question.\nQuestion:",
                "img_input": observation.get("screenshot"),
            },
        )

        search_query = content.strip().replace('"', "")

        # Persist the new query so identical instructions hit the cache next time.
        formulate_query[instruction] = search_query
        with open(query_path, "w") as f:
            json.dump(formulate_query, f, indent=2)

        return search_query, total_tokens, cost_string
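
    # formulate_query.json is a plain instruction -> query cache, e.g.
    # (illustrative entry): {"Install VS Code": "how to install VS Code on Ubuntu"}.
    # A cache hit above returns with zero token usage and an empty cost string.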

    def retrieve_narrative_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
        """Retrieve the most similar task's narrative experience using embeddings.

        Args:
            instruction (str): The task instruction

        Returns:
            Tuple[str, str, List[int], str]: The similar task key, its narrative
                experience, token usage, and cost
        """
        knowledge_base = load_knowledge_base(self.narrative_memory_path)
        if not knowledge_base:
            return "None", "None", [0, 0, 0], ""

        embeddings = load_embeddings(self.embeddings_path)

        # Get or create the instruction embedding
        instruction_embedding = embeddings.get(instruction)
        total_tokens, cost_string = [0, 0, 0], ""

        if instruction_embedding is None:
            instruction_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool(
                "embedding", {"str_input": instruction}
            )
            embeddings[instruction] = instruction_embedding
            for i in range(len(total_tokens)):
                total_tokens[i] += tokens[i]
            cost_string = cost_string_now

        # Get or create embeddings for knowledge base entries
        candidate_embeddings = []
        for key in knowledge_base:
            candidate_embedding = embeddings.get(key)
            if candidate_embedding is None:
                candidate_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool(
                    "embedding", {"str_input": key}
                )
                for i in range(len(tokens)):
                    total_tokens[i] += tokens[i]
                cost_string = CostManager.add_costs(cost_string, cost_string_now)
                embeddings[key] = candidate_embedding
            candidate_embeddings.append(candidate_embedding)

        save_embeddings(self.embeddings_path, embeddings)

        similarities = cosine_similarity(
            instruction_embedding, np.vstack(candidate_embeddings)
        )[0]
        sorted_indices = np.argsort(similarities)[::-1]

        # Skip the top hit when it is the instruction itself, so we return a
        # genuinely different prior task; the length guard avoids an index
        # error when the knowledge base holds a single entry.
        keys = list(knowledge_base.keys())
        idx = 1 if len(keys) > 1 and keys[sorted_indices[0]] == instruction else 0
        return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]], total_tokens, cost_string

    def retrieve_episodic_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
        """Retrieve similar task experience using embeddings.

        Args:
            instruction (str): The task instruction

        Returns:
            Tuple[str, str, List[int], str]: The similar task key, its episodic
                experience, token usage, and cost
        """
        knowledge_base = load_knowledge_base(self.episodic_memory_path)
        if not knowledge_base:
            return "None", "None", [0, 0, 0], ""

        embeddings = load_embeddings(self.embeddings_path)

        # Get or create the instruction embedding
        instruction_embedding = embeddings.get(instruction)
        total_tokens, cost_string = [0, 0, 0], ""

        if instruction_embedding is None:
            instruction_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool(
                "embedding", {"str_input": instruction}
            )
            embeddings[instruction] = instruction_embedding
            for i in range(len(total_tokens)):
                total_tokens[i] += tokens[i]
            cost_string = cost_string_now

        # Get or create embeddings for knowledge base entries
        candidate_embeddings = []
        for key in knowledge_base:
            candidate_embedding = embeddings.get(key)
            if candidate_embedding is None:
                candidate_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool(
                    "embedding", {"str_input": key}
                )
                for i in range(len(tokens)):
                    total_tokens[i] += tokens[i]
                cost_string = CostManager.add_costs(cost_string, cost_string_now)
                embeddings[key] = candidate_embedding
            candidate_embeddings.append(candidate_embedding)

        save_embeddings(self.embeddings_path, embeddings)

        similarities = cosine_similarity(
            instruction_embedding, np.vstack(candidate_embeddings)
        )[0]
        sorted_indices = np.argsort(similarities)[::-1]

        # As above: skip an exact self-match, guarding single-entry bases.
        keys = list(knowledge_base.keys())
        idx = 1 if len(keys) > 1 and keys[sorted_indices[0]] == instruction else 0
        return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]], total_tokens, cost_string

    def knowledge_fusion(
        self,
        observation: Dict,
        instruction: str,
        web_knowledge: str,
        similar_task: str,
        experience: str,
    ) -> Tuple[str, List[int], str]:
        """Combine web knowledge with similar task experience."""
        content, total_tokens, cost = self.knowledge_fusion_agent.execute_tool(
            "context_fusion",
            {
                "str_input": f"Task: {instruction}\n"
                + f"**Web search result**:\n{web_knowledge}\n\n"
                + "**Retrieved similar task experience**:\n"
                + f"Similar task:{similar_task}\n{experience}\n\n"
                + "Based on the web search result and the retrieved similar task experience, "
                + "if you think the similar task experience is indeed useful to the main task, "
                + "integrate it with the web search result. Provide the final knowledge in a numbered list.",
                "img_input": observation.get("screenshot"),
            },
        )
        return content, total_tokens, cost

    def save_episodic_memory(self, subtask_key: str, subtask_traj: str) -> Optional[str]:
        """Save episodic memory (subtask-level knowledge).

        Args:
            subtask_key (str): Key identifying the subtask
            subtask_traj (str): Trajectory/experience of the subtask

        Returns:
            Optional[str]: The stored summary for subtask_key, if any
        """
        if not self.save_knowledge:
            return None

        try:
            kb = load_knowledge_base(self.episodic_memory_path)
        except Exception:
            kb = {}

        if subtask_key not in kb:
            # summarize_episode returns (summary, tokens, cost); store only the text.
            subtask_summarization, _, _ = self.summarize_episode(subtask_traj)
            kb[subtask_key] = subtask_summarization

        os.makedirs(os.path.dirname(self.episodic_memory_path), exist_ok=True)
        with open(self.episodic_memory_path, "w") as fout:
            json.dump(kb, fout, indent=2)

        return kb.get(subtask_key)

    def save_narrative_memory(self, task_key: str, task_traj: str) -> Optional[str]:
        """Save narrative memory (task-level knowledge).

        Args:
            task_key (str): Key identifying the task
            task_traj (str): Full trajectory/experience of the task

        Returns:
            Optional[str]: The stored summary for task_key, if any
        """
        if not self.save_knowledge:
            return None

        try:
            kb = load_knowledge_base(self.narrative_memory_path)
        except Exception:
            kb = {}

        if task_key not in kb:
            # summarize_narrative returns (summary, tokens, cost); store only the text.
            task_summarization, _, _ = self.summarize_narrative(task_traj)
            kb[task_key] = task_summarization

        os.makedirs(os.path.dirname(self.narrative_memory_path), exist_ok=True)
        with open(self.narrative_memory_path, "w") as fout:
            json.dump(kb, fout, indent=2)

        return kb.get(task_key)

    def initialize_task_trajectory(self, instruction: str) -> None:
        """Initialize a new task trajectory.

        Args:
            instruction (str): The task instruction
        """
        self.task_trajectory = f"Task:\n{instruction}"
        self.current_search_query = ""
        self.current_subtask_trajectory = ""

    def update_task_trajectory(self, meta_data: Dict) -> None:
        """Update the task trajectory with new metadata.

        Args:
            meta_data (Dict): Metadata from the agent's prediction
        """
        if not self.current_search_query and "search_query" in meta_data:
            self.current_search_query = meta_data["search_query"]

        self.task_trajectory += (
            "\n\nReflection:\n"
            + str(meta_data["reflection"])
            + "\n\n----------------------\n\nPlan:\n"
            + meta_data["executor_plan"]
        )

    def handle_subtask_trajectory(self, meta_data: Dict) -> bool:
        """Handle subtask trajectory updates based on subtask status.

        Args:
            meta_data (Dict): Metadata containing subtask information

        Returns:
            bool: Whether the previous subtask was completed
        """
        subtask_status = meta_data["subtask_status"]
        subtask = meta_data["subtask"]
        subtask_info = meta_data["subtask_info"]

        if subtask_status in ["Start", "Done"]:
            # If there's an existing subtask trajectory, finalize it
            if self.current_subtask_trajectory:
                self.current_subtask_trajectory += "\nSubtask Completed.\n"
                subtask_key = self.current_subtask_trajectory.split(
                    "\n----------------------\n\nPlan:\n"
                )[0]
                self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)
                self.current_subtask_trajectory = ""
                return True

            # Start a new subtask trajectory
            self.current_subtask_trajectory = (
                f"Task:\n{self.current_search_query}\n\n"
                f"Subtask: {subtask}\n"
                f"Subtask Instruction: {subtask_info}\n"
                f"----------------------\n\n"
                f'Plan:\n{meta_data["executor_plan"]}\n'
            )
            return False

        elif subtask_status == "In":
            # Continue the current subtask trajectory
            self.current_subtask_trajectory += (
                f'\n----------------------\n\nPlan:\n{meta_data["executor_plan"]}\n'
            )
            return False

        # Unknown status: nothing was completed.
        return False

    def finalize_task(self) -> None:
        """Finalize the task by saving any remaining trajectories."""
        # Save any remaining subtask trajectory
        if self.current_subtask_trajectory:
            self.current_subtask_trajectory += "\nSubtask Completed.\n"
            subtask_key = self.current_subtask_trajectory.split(
                "\n----------------------\n\nPlan:\n"
            )[0]
            self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)

        # Save the complete task trajectory
        if self.task_trajectory and self.current_search_query:
            self.save_narrative_memory(self.current_search_query, self.task_trajectory)

        # Reset trajectories
        self.task_trajectory = ""
        self.current_subtask_trajectory = ""
        self.current_search_query = ""

    def summarize_episode(self, trajectory: str) -> Tuple[str, List[int], str]:
        """Summarize the episode experience for lifelong learning reflection.

        Args:
            trajectory (str): The episode experience to be summarized

        Returns:
            Tuple[str, List[int], str]: The summarized episode experience,
                token usage, and cost
        """
        # Create a reflection over the whole trajectory for the next trial,
        # keeping earlier messages as exemplars.
        content, total_tokens, cost = self.episode_summarization_agent.execute_tool(
            "episode_summarization", {"str_input": trajectory}
        )
        return content, total_tokens, cost

    def summarize_narrative(self, trajectory: str) -> Tuple[str, List[int], str]:
        """Summarize the narrative experience for lifelong learning reflection.

        Args:
            trajectory (str): The narrative experience to be summarized

        Returns:
            Tuple[str, List[int], str]: The summarized narrative experience,
                token usage, and cost
        """
        # Create a reflection over the whole trajectory for the next trial.
        content, total_tokens, cost = self.narrative_summarization_agent.execute_tool(
            "narrative_summarization", {"str_input": trajectory}
        )
        return content, total_tokens, cost
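

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module): the provider and
    # model strings below are placeholders, and a real run needs valid
    # credentials plus package-relative imports (invoke via `python -m ...`).
    _tools = {
        name: {"provider": "openai", "model": "gpt-4o"}
        for name in (
            "query_formulator",
            "context_fusion",
            "narrative_summarization",
            "episode_summarization",
        )
    }
    _embedder = NewTools()
    _embedder.register_tool("embedding", "openai", "text-embedding-3-small")
    _kb = NewKnowledgeBase(
        embedding_engine=_embedder,
        local_kb_path="./kb",
        platform="Ubuntu",
        Tools_dict=_tools,
    )
    # Formulate (or fetch from cache) a search query for a toy instruction.
    _query, _tokens, _cost = _kb.formulate_query("Open a terminal", {})
    print(_query, _tokens, _cost)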