Files
sci-gui-agent-benchmark/mm_agents/maestro/core/new_knowledge.py
Hiroid 3a4b67304f Add multiple new modules and tools to enhance the functionality and extensibility of the Maestro project (#333)
* Added a **pyproject.toml** file to define project metadata and dependencies.
* Added **run\_maestro.py** and **osworld\_run\_maestro.py** to provide the main execution logic.
* Introduced multiple new modules, including **Evaluator**, **Controller**, **Manager**, and **Sub-Worker**, supporting task planning, state management, and data analysis.
* Added a **tools module** containing utility functions and tool configurations to improve code reusability.
* Updated the **README** and documentation with usage examples and module descriptions.

These changes lay the foundation for expanding the Maestro project’s functionality and improving the user experience.

Co-authored-by: Hiroid <guoliangxuan@deepmatrix.com>
2025-09-08 16:07:21 +09:00

482 lines
19 KiB
Python

import json
import os
from typing import Callable, Dict, List, Optional, Tuple

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from ..tools.new_tools import NewTools
from ..utils.common_utils import (
    load_embeddings,
    load_knowledge_base,
    save_embeddings,
)
from .mllm import CostManager
# Output vector size for each embedding model name this module recognises.
_EMBEDDING_DIMS = {
    "doubao-embedding-large-text-250515": 2048,
    "doubao-embedding-text-240715": 2560,
    "text-embedding-ada-002": 1536,
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
    "gemini-embedding-001": 3072,
    "jina-embeddings-v4": 2048,
    "jina-embeddings-v3": 1024,
    "text-embedding-v4": 1024,
    "text-embedding-v3": 1024,
    "embedding-2": 2048,
    "embedding-3": 2048,
}


def get_embedding_dim(model_name):
    """Return the embedding dimension registered for ``model_name``.

    Args:
        model_name: Name of the embedding model.

    Returns:
        The vector dimension as an ``int``, or ``None`` when the model name
        is not one of the known entries.
    """
    return _EMBEDDING_DIMS.get(model_name)
class NewKnowledgeBase:
def __init__(
self,
embedding_engine: NewTools,
local_kb_path: str,
platform: str,
Tools_dict: Dict,
save_knowledge: bool = True,
):
"""
Initialize the KnowledgeBase module
Args:
embedding_engine: Embedding engine instance
local_kb_path: Path to local knowledge base
platform: Target platform (Windows/Darwin/Ubuntu)
Tools_dict: Dictionary containing tool configurations
save_knowledge: Whether to save knowledge embeddings
"""
self.platform = platform
self.local_kb_path = local_kb_path
# initialize embedding engine
self.embedding_engine = embedding_engine
# Initialize paths for different memory types
self.episodic_memory_path = os.path.join(
self.local_kb_path, self.platform, "episodic_memory.json"
)
self.narrative_memory_path = os.path.join(
self.local_kb_path, self.platform, "narrative_memory.json"
)
embedding_model_name = ""
if hasattr(self.embedding_engine, "tools") and "embedding" in self.embedding_engine.tools:
embedding_model_name = self.embedding_engine.tools["embedding"].model_name
else:
embedding_model_name = "default"
embedding_dim = get_embedding_dim(embedding_model_name)
self.embeddings_path = os.path.join(
self.local_kb_path, self.platform, f"embeddings_{embedding_model_name}_{embedding_dim}.pkl"
)
# Initialize trajectory tracking
self.task_trajectory = ""
self.current_subtask_trajectory = ""
self.current_search_query = ""
# query_formulator
self.query_formulator_name = "query_formulator"
self.query_formulator = NewTools()
self.query_formulator.register_tool(
self.query_formulator_name,
Tools_dict[self.query_formulator_name]["provider"],
Tools_dict[self.query_formulator_name]["model"],
)
# knowledge_fusion_agent
self.knowledge_fusion_agent_name = "context_fusion"
self.knowledge_fusion_agent = NewTools()
self.knowledge_fusion_agent.register_tool(
self.knowledge_fusion_agent_name,
Tools_dict[self.knowledge_fusion_agent_name]["provider"],
Tools_dict[self.knowledge_fusion_agent_name]["model"],
)
# narrative_summarization_agent
self.narrative_summarization_agent_name = "narrative_summarization"
self.narrative_summarization_agent = NewTools()
self.narrative_summarization_agent.register_tool(
self.narrative_summarization_agent_name,
Tools_dict[self.narrative_summarization_agent_name]["provider"],
Tools_dict[self.narrative_summarization_agent_name]["model"],
)
# episode_summarization_agent
self.episode_summarization_agent_name = "episode_summarization"
self.episode_summarization_agent = NewTools()
self.episode_summarization_agent.register_tool(
self.episode_summarization_agent_name,
Tools_dict[self.episode_summarization_agent_name]["provider"],
Tools_dict[self.episode_summarization_agent_name]["model"],
)
self.save_knowledge = save_knowledge
def retrieve_knowledge(
self, instruction: str, search_query: str, search_engine: NewTools
) -> Tuple[str, List[int], str]:
"""Retrieve knowledge using search engine
Args:
instruction (str): task instruction
search_query (str): search query to use
search_engine (NewTools): search engine tool to use
Returns:
Tuple[str, List[int], float]: The search results, token usage, and cost
"""
search_results, total_tokens, cost_string = search_engine.execute_tool("websearch", {"str_input": instruction + " " + search_query})
return search_results, total_tokens, cost_string
def formulate_query(self, instruction: str, observation: Dict) -> Tuple[str, List[int], str]:
"""Formulate search query based on instruction and current state
Args:
instruction (str): The task instruction
observation (Dict): Current observation including screenshot
Returns:
Tuple[str, List[int], float]: The formulated query, token usage, and cost
"""
query_path = os.path.join(
self.local_kb_path, self.platform, "formulate_query.json"
)
try:
with open(query_path, "r") as f:
formulate_query = json.load(f)
except:
formulate_query = {}
if instruction in formulate_query:
return formulate_query[instruction], [0, 0, 0], ""
self.query_formulator.tools["query_formulator"].llm_agent.reset()
content, total_tokens, cost_string = self.query_formulator.execute_tool("query_formulator", {
"str_input": f"The task is: {instruction}\n" +
"To use google search to get some useful information, first carefully analyze " +
"the screenshot of the current desktop UI state, then given the task " +
"instruction, formulate a question that can be used to search on the Internet " +
"for information in helping with the task execution.\n" +
"The question should not be too general or too specific. Please ONLY provide " +
"the question.\nQuestion:",
"img_input": observation["screenshot"] if "screenshot" in observation else None
})
search_query = content.strip().replace('"', "")
# print("search query: ", search_query)
formulate_query[instruction] = search_query
with open(query_path, "w") as f:
json.dump(formulate_query, f, indent=2)
return search_query, total_tokens, cost_string
def retrieve_narrative_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
"""Retrieve narrative experience using embeddings
Args:
instruction (str): The task instruction
Returns:
Tuple[str, str]: The similar task key and its narrative experience
"""
knowledge_base = load_knowledge_base(self.narrative_memory_path)
if not knowledge_base:
return "None", "None", [0, 0, 0], ""
embeddings = load_embeddings(self.embeddings_path)
# Get or create instruction embedding
instruction_embedding = embeddings.get(instruction)
total_tokens, cost_string = [0, 0, 0], ""
if instruction_embedding is None:
instruction_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": instruction})
embeddings[instruction] = instruction_embedding
# total_tokens += tokens
for i in range(len(total_tokens)):
total_tokens[i] += tokens[i]
cost_string = cost_string_now
# Get or create embeddings for knowledge base entries
candidate_embeddings = []
for key in knowledge_base:
candidate_embedding = embeddings.get(key)
if candidate_embedding is None:
candidate_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": key})
for i in range(len(tokens)):
total_tokens[i] += tokens[i]
# total_tokens += tokens
cost_string = CostManager.add_costs(cost_string, cost_string_now)
embeddings[key] = candidate_embedding
candidate_embeddings.append(candidate_embedding)
save_embeddings(self.embeddings_path, embeddings)
similarities = cosine_similarity(
instruction_embedding, np.vstack(candidate_embeddings)
)[0]
sorted_indices = np.argsort(similarities)[::-1]
keys = list(knowledge_base.keys())
idx = 1 if keys[sorted_indices[0]] == instruction else 0
return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]], total_tokens, cost_string
def retrieve_episodic_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
"""Retrieve similar task experience using embeddings
Args:
instruction (str): The task instruction
Returns:
Tuple[str, str]: The similar task key and its episodic experience
"""
knowledge_base = load_knowledge_base(self.episodic_memory_path)
if not knowledge_base:
return "None", "None", [0, 0, 0], ""
embeddings = load_embeddings(self.embeddings_path)
# Get or create instruction embedding
instruction_embedding = embeddings.get(instruction)
total_tokens, cost_string = [0, 0, 0], ""
if instruction_embedding is None:
instruction_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": instruction})
embeddings[instruction] = instruction_embedding
# total_tokens += tokens
for i in range(len(total_tokens)):
total_tokens[i] += tokens[i]
cost_string = cost_string_now
# Get or create embeddings for knowledge base entries
candidate_embeddings = []
for key in knowledge_base:
candidate_embedding = embeddings.get(key)
if candidate_embedding is None:
candidate_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": key})
# total_tokens += tokens
for i in range(len(total_tokens)):
total_tokens[i] += tokens[i]
cost_string = CostManager.add_costs(cost_string, cost_string_now)
embeddings[key] = candidate_embedding
candidate_embeddings.append(candidate_embedding)
save_embeddings(self.embeddings_path, embeddings)
similarities = cosine_similarity(
instruction_embedding, np.vstack(candidate_embeddings)
)[0]
sorted_indices = np.argsort(similarities)[::-1]
keys = list(knowledge_base.keys())
idx = 1 if keys[sorted_indices[0]] == instruction else 0
return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]], total_tokens, cost_string
def knowledge_fusion(
self,
observation: Dict,
instruction: str,
web_knowledge: str,
similar_task: str,
experience: str,
) -> Tuple[str, list, str]:
"""Combine web knowledge with similar task experience"""
content, total_tokens, cost = self.knowledge_fusion_agent.execute_tool("context_fusion", {
"str_input": f"Task: {instruction}\n" +
f"**Web search result**:\n{web_knowledge}\n\n" +
f"**Retrieved similar task experience**:\n" +
f"Similar task:{similar_task}\n{experience}\n\n" +
f"Based on the web search result and the retrieved similar task experience, " +
f"if you think the similar task experience is indeed useful to the main task, " +
f"integrate it with the web search result. Provide the final knowledge in a numbered list.",
"img_input": observation["screenshot"] if "screenshot" in observation else None
})
return content, total_tokens, cost
def save_episodic_memory(self, subtask_key: str, subtask_traj: str) -> None:
"""Save episodic memory (subtask level knowledge).
Args:
subtask_key (str): Key identifying the subtask
subtask_traj (str): Trajectory/experience of the subtask
"""
if not self.save_knowledge:
return
try:
kb = load_knowledge_base(self.episodic_memory_path)
except:
kb = {}
if subtask_key not in kb:
subtask_summarization = self.summarize_episode(subtask_traj)
kb[subtask_key] = subtask_summarization
if self.save_knowledge:
os.makedirs(os.path.dirname(self.episodic_memory_path), exist_ok=True)
with open(self.episodic_memory_path, "w") as fout:
json.dump(kb, fout, indent=2)
return kb.get(subtask_key)
def save_narrative_memory(self, task_key: str, task_traj: str) -> None:
"""Save narrative memory (task level knowledge).
Args:
task_key (str): Key identifying the task
task_traj (str): Full trajectory/experience of the task
"""
if not self.save_knowledge:
return
try:
kb = load_knowledge_base(self.narrative_memory_path)
except:
kb = {}
if task_key not in kb:
task_summarization = self.summarize_narrative(task_traj)
kb[task_key] = task_summarization
if self.save_knowledge:
os.makedirs(os.path.dirname(self.narrative_memory_path), exist_ok=True)
with open(self.narrative_memory_path, "w") as fout:
json.dump(kb, fout, indent=2)
return kb.get(task_key)
def initialize_task_trajectory(self, instruction: str) -> None:
"""Initialize a new task trajectory.
Args:
instruction (str): The task instruction
"""
self.task_trajectory = f"Task:\n{instruction}"
self.current_search_query = ""
self.current_subtask_trajectory = ""
def update_task_trajectory(self, meta_data: Dict) -> None:
"""Update the task trajectory with new metadata.
Args:
meta_data (Dict): Metadata from the agent's prediction
"""
if not self.current_search_query and "search_query" in meta_data:
self.current_search_query = meta_data["search_query"]
self.task_trajectory += (
"\n\nReflection:\n"
+ str(meta_data["reflection"])
+ "\n\n----------------------\n\nPlan:\n"
+ meta_data["executor_plan"]
)
def handle_subtask_trajectory(self, meta_data: Dict):
"""Handle subtask trajectory updates based on subtask status.
Args:
meta_data (Dict): Metadata containing subtask information
Returns:
bool: Whether the subtask was completed
"""
subtask_status = meta_data["subtask_status"]
subtask = meta_data["subtask"]
subtask_info = meta_data["subtask_info"]
if subtask_status in ["Start", "Done"]:
# If there's an existing subtask trajectory, finalize it
if self.current_subtask_trajectory:
self.current_subtask_trajectory += "\nSubtask Completed.\n"
subtask_key = self.current_subtask_trajectory.split(
"\n----------------------\n\nPlan:\n"
)[0]
self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)
self.current_subtask_trajectory = ""
return True
# Start new subtask trajectory
self.current_subtask_trajectory = (
f"Task:\n{self.current_search_query}\n\n"
f"Subtask: {subtask}\n"
f"Subtask Instruction: {subtask_info}\n"
f"----------------------\n\n"
f'Plan:\n{meta_data["executor_plan"]}\n'
)
return False
elif subtask_status == "In":
# Continue current subtask trajectory
self.current_subtask_trajectory += (
f'\n----------------------\n\nPlan:\n{meta_data["executor_plan"]}\n'
)
return False
def finalize_task(self) -> None:
"""Finalize the task by saving any remaining trajectories."""
# Save any remaining subtask trajectory
if self.current_subtask_trajectory:
self.current_subtask_trajectory += "\nSubtask Completed.\n"
subtask_key = self.current_subtask_trajectory.split(
"\n----------------------\n\nPlan:\n"
)[0]
self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)
# Save the complete task trajectory
if self.task_trajectory and self.current_search_query:
self.save_narrative_memory(self.current_search_query, self.task_trajectory)
# Reset trajectories
self.task_trajectory = ""
self.current_subtask_trajectory = ""
self.current_search_query = ""
def summarize_episode(self, trajectory: str) -> Tuple[str, List[int], str]:
"""Summarize the episode experience for lifelong learning reflection
Args:
trajectory (str): The episode experience to be summarized
Returns:
str: The summarized episode experience
"""
# Create Reflection on whole trajectories for next round trial, keep earlier messages as exemplars
content, total_tokens, cost = self.episode_summarization_agent.execute_tool("episode_summarization", {"str_input": trajectory})
return content, total_tokens, cost
def summarize_narrative(self, trajectory: str) -> Tuple[str, List[int], str]:
"""Summarize the narrative experience for lifelong learning reflection
Args:
trajectory (str): The narrative experience to be summarized
Returns:
str: The summarized narrative experience
"""
# Create Reflection on whole trajectories for next round trial
content, total_tokens, cost = self.narrative_summarization_agent.execute_tool("narrative_summarization", {"str_input": trajectory})
return content, total_tokens, cost