sci-gui-agent-benchmark/mm_agents/maestro/core/new_knowledge.py

import json
import os
from typing import Dict, Tuple, List
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from ..utils.common_utils import (
    load_embeddings,
    load_knowledge_base,
    save_embeddings,
)
from ..tools.new_tools import NewTools
from .mllm import CostManager

def get_embedding_dim(model_name):
    if model_name == "doubao-embedding-large-text-250515":
        return 2048
    elif model_name == "doubao-embedding-text-240715":
        return 2560
    elif model_name == "text-embedding-ada-002":
        return 1536
    elif model_name == "text-embedding-3-small":
        return 1536
    elif model_name == "text-embedding-3-large":
        return 3072
    elif model_name == "gemini-embedding-001":
        return 3072
    elif model_name == "jina-embeddings-v4":
        return 2048
    elif model_name == "jina-embeddings-v3":
        return 1024
    elif model_name == "text-embedding-v4":
        return 1024
    elif model_name == "text-embedding-v3":
        return 1024
    elif model_name == "embedding-2" or model_name == "embedding-3":
        return 2048
    else:
        return None

class NewKnowledgeBase:
    def __init__(
        self,
        embedding_engine: NewTools,
        local_kb_path: str,
        platform: str,
        Tools_dict: Dict,
        save_knowledge: bool = True,
    ):
        """
        Initialize the KnowledgeBase module

        Args:
            embedding_engine: Embedding engine instance
            local_kb_path: Path to local knowledge base
            platform: Target platform (Windows/Darwin/Ubuntu)
            Tools_dict: Dictionary containing tool configurations
            save_knowledge: Whether to save knowledge embeddings
        """
        self.platform = platform

        self.local_kb_path = local_kb_path

        # initialize embedding engine
        self.embedding_engine = embedding_engine

        # Initialize paths for different memory types
        self.episodic_memory_path = os.path.join(
            self.local_kb_path, self.platform, "episodic_memory.json"
        )
        self.narrative_memory_path = os.path.join(
            self.local_kb_path, self.platform, "narrative_memory.json"
        )
        embedding_model_name = ""
        if hasattr(self.embedding_engine, "tools") and "embedding" in self.embedding_engine.tools:
            embedding_model_name = self.embedding_engine.tools["embedding"].model_name
        else:
            embedding_model_name = "default"
        embedding_dim = get_embedding_dim(embedding_model_name)
        self.embeddings_path = os.path.join(
            self.local_kb_path, self.platform, f"embeddings_{embedding_model_name}_{embedding_dim}.pkl"
        )

        # Initialize trajectory tracking
        self.task_trajectory = ""
        self.current_subtask_trajectory = ""
        self.current_search_query = ""

        # query_formulator
        self.query_formulator_name = "query_formulator"
        self.query_formulator = NewTools()
        self.query_formulator.register_tool(
            self.query_formulator_name,
            Tools_dict[self.query_formulator_name]["provider"],
            Tools_dict[self.query_formulator_name]["model"],
        )

        # knowledge_fusion_agent
        self.knowledge_fusion_agent_name = "context_fusion"
        self.knowledge_fusion_agent = NewTools()
        self.knowledge_fusion_agent.register_tool(
            self.knowledge_fusion_agent_name,
            Tools_dict[self.knowledge_fusion_agent_name]["provider"],
            Tools_dict[self.knowledge_fusion_agent_name]["model"],
        )

        # narrative_summarization_agent
        self.narrative_summarization_agent_name = "narrative_summarization"
        self.narrative_summarization_agent = NewTools()
        self.narrative_summarization_agent.register_tool(
            self.narrative_summarization_agent_name,
            Tools_dict[self.narrative_summarization_agent_name]["provider"],
            Tools_dict[self.narrative_summarization_agent_name]["model"],
        )

        # episode_summarization_agent
        self.episode_summarization_agent_name = "episode_summarization"
        self.episode_summarization_agent = NewTools()
        self.episode_summarization_agent.register_tool(
            self.episode_summarization_agent_name,
            Tools_dict[self.episode_summarization_agent_name]["provider"],
            Tools_dict[self.episode_summarization_agent_name]["model"],
        )

        self.save_knowledge = save_knowledge

    def retrieve_knowledge(
        self, instruction: str, search_query: str, search_engine: NewTools
    ) -> Tuple[str, List[int], str]:
        """Retrieve knowledge using search engine
        Args:
            instruction (str): task instruction
            search_query (str): search query to use
            search_engine (NewTools): search engine tool to use

        Returns:
            Tuple[str, List[int], float]: The search results, token usage, and cost
        """
        search_results, total_tokens, cost_string = search_engine.execute_tool("websearch", {"str_input": instruction + " " + search_query})

        return search_results, total_tokens, cost_string

    def formulate_query(self, instruction: str, observation: Dict) -> Tuple[str, List[int], str]:
        """Formulate search query based on instruction and current state

        Args:
            instruction (str): The task instruction
            observation (Dict): Current observation including screenshot

        Returns:
            Tuple[str, List[int], float]: The formulated query, token usage, and cost
        """
        query_path = os.path.join(
            self.local_kb_path, self.platform, "formulate_query.json"
        )
        try:
            with open(query_path, "r") as f:
                formulate_query = json.load(f)
        except:
            formulate_query = {}

        if instruction in formulate_query:
            return formulate_query[instruction], [0, 0, 0], ""

        self.query_formulator.tools["query_formulator"].llm_agent.reset()

        content, total_tokens, cost_string = self.query_formulator.execute_tool("query_formulator", {
            "str_input": f"The task is: {instruction}\n" +
                "To use google search to get some useful information, first carefully analyze " +
                "the screenshot of the current desktop UI state, then given the task " +
                "instruction, formulate a question that can be used to search on the Internet " +
                "for information in helping with the task execution.\n" +
                "The question should not be too general or too specific. Please ONLY provide " +
                "the question.\nQuestion:",
            "img_input": observation["screenshot"] if "screenshot" in observation else None
        })

        search_query = content.strip().replace('"', "")

        # print("search query: ", search_query)
        formulate_query[instruction] = search_query
        with open(query_path, "w") as f:
            json.dump(formulate_query, f, indent=2)

        return search_query, total_tokens, cost_string

    def retrieve_narrative_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
        """Retrieve narrative experience using embeddings

        Args:
            instruction (str): The task instruction

        Returns:
            Tuple[str, str]: The similar task key and its narrative experience
        """

        knowledge_base = load_knowledge_base(self.narrative_memory_path)
        if not knowledge_base:
            return "None", "None", [0, 0, 0], ""

        embeddings = load_embeddings(self.embeddings_path)

        # Get or create instruction embedding
        instruction_embedding = embeddings.get(instruction)
        total_tokens, cost_string = [0, 0, 0], ""

        if instruction_embedding is None:
            instruction_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": instruction})
            embeddings[instruction] = instruction_embedding
            # total_tokens += tokens
            for i in range(len(total_tokens)):
                total_tokens[i] += tokens[i]
            cost_string = cost_string_now
        # Get or create embeddings for knowledge base entries
        candidate_embeddings = []
        for key in knowledge_base:
            candidate_embedding = embeddings.get(key)
            if candidate_embedding is None:
                candidate_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": key})
                for i in range(len(tokens)):
                    total_tokens[i] += tokens[i]
                # total_tokens += tokens
                cost_string = CostManager.add_costs(cost_string, cost_string_now)
            embeddings[key] = candidate_embedding

            candidate_embeddings.append(candidate_embedding)

        save_embeddings(self.embeddings_path, embeddings)

        similarities = cosine_similarity(
            instruction_embedding, np.vstack(candidate_embeddings)
        )[0]
        sorted_indices = np.argsort(similarities)[::-1]

        keys = list(knowledge_base.keys())
        idx = 1 if keys[sorted_indices[0]] == instruction else 0
        return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]], total_tokens, cost_string

    def retrieve_episodic_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
        """Retrieve similar task experience using embeddings

        Args:
            instruction (str): The task instruction

        Returns:
            Tuple[str, str]: The similar task key and its episodic experience
        """

        knowledge_base = load_knowledge_base(self.episodic_memory_path)
        if not knowledge_base:
            return "None", "None", [0, 0, 0], ""

        embeddings = load_embeddings(self.embeddings_path)

        # Get or create instruction embedding
        instruction_embedding = embeddings.get(instruction)
        total_tokens, cost_string = [0, 0, 0], ""

        if instruction_embedding is None:
            instruction_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": instruction})
            embeddings[instruction] = instruction_embedding

            # total_tokens += tokens
            for i in range(len(total_tokens)):
                total_tokens[i] += tokens[i]
            cost_string = cost_string_now

        # Get or create embeddings for knowledge base entries
        candidate_embeddings = []
        for key in knowledge_base:
            candidate_embedding = embeddings.get(key)
            if candidate_embedding is None:
                candidate_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool("embedding", {"str_input": key})
                # total_tokens += tokens
                for i in range(len(total_tokens)):
                    total_tokens[i] += tokens[i]
                cost_string = CostManager.add_costs(cost_string, cost_string_now)
            embeddings[key] = candidate_embedding

            candidate_embeddings.append(candidate_embedding)

        save_embeddings(self.embeddings_path, embeddings)

        similarities = cosine_similarity(
            instruction_embedding, np.vstack(candidate_embeddings)
        )[0]
        sorted_indices = np.argsort(similarities)[::-1]

        keys = list(knowledge_base.keys())
        idx = 1 if keys[sorted_indices[0]] == instruction else 0
        return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]], total_tokens, cost_string

    def knowledge_fusion(
        self,
        observation: Dict,
        instruction: str,
        web_knowledge: str,
        similar_task: str,
        experience: str,
    ) -> Tuple[str, list, str]:
        """Combine web knowledge with similar task experience"""

        content, total_tokens, cost = self.knowledge_fusion_agent.execute_tool("context_fusion", {
            "str_input": f"Task: {instruction}\n" +
                f"**Web search result**:\n{web_knowledge}\n\n" +
                f"**Retrieved similar task experience**:\n" +
                f"Similar task:{similar_task}\n{experience}\n\n" +
                f"Based on the web search result and the retrieved similar task experience, " +
                f"if you think the similar task experience is indeed useful to the main task, " +
                f"integrate it with the web search result. Provide the final knowledge in a numbered list.",
            "img_input": observation["screenshot"] if "screenshot" in observation else None
        })

        return content, total_tokens, cost


    def save_episodic_memory(self, subtask_key: str, subtask_traj: str) -> None:
        """Save episodic memory (subtask level knowledge).

        Args:
            subtask_key (str): Key identifying the subtask
            subtask_traj (str): Trajectory/experience of the subtask
        """
        if not self.save_knowledge:
            return

        try:
            kb = load_knowledge_base(self.episodic_memory_path)
        except:
            kb = {}

        if subtask_key not in kb:
            subtask_summarization = self.summarize_episode(subtask_traj)
            kb[subtask_key] = subtask_summarization

            if self.save_knowledge:
                os.makedirs(os.path.dirname(self.episodic_memory_path), exist_ok=True)
                with open(self.episodic_memory_path, "w") as fout:
                    json.dump(kb, fout, indent=2)

        return kb.get(subtask_key)

    def save_narrative_memory(self, task_key: str, task_traj: str) -> None:
        """Save narrative memory (task level knowledge).

        Args:
            task_key (str): Key identifying the task
            task_traj (str): Full trajectory/experience of the task
        """
        if not self.save_knowledge:
            return

        try:
            kb = load_knowledge_base(self.narrative_memory_path)
        except:
            kb = {}

        if task_key not in kb:
            task_summarization = self.summarize_narrative(task_traj)
            kb[task_key] = task_summarization

            if self.save_knowledge:
                os.makedirs(os.path.dirname(self.narrative_memory_path), exist_ok=True)
                with open(self.narrative_memory_path, "w") as fout:
                    json.dump(kb, fout, indent=2)

        return kb.get(task_key)

    def initialize_task_trajectory(self, instruction: str) -> None:
        """Initialize a new task trajectory.

        Args:
            instruction (str): The task instruction
        """
        self.task_trajectory = f"Task:\n{instruction}"
        self.current_search_query = ""
        self.current_subtask_trajectory = ""

    def update_task_trajectory(self, meta_data: Dict) -> None:
        """Update the task trajectory with new metadata.

        Args:
            meta_data (Dict): Metadata from the agent's prediction
        """
        if not self.current_search_query and "search_query" in meta_data:
            self.current_search_query = meta_data["search_query"]

        self.task_trajectory += (
            "\n\nReflection:\n"
            + str(meta_data["reflection"])
            + "\n\n----------------------\n\nPlan:\n"
            + meta_data["executor_plan"]
        )

    def handle_subtask_trajectory(self, meta_data: Dict):
        """Handle subtask trajectory updates based on subtask status.

        Args:
            meta_data (Dict): Metadata containing subtask information

        Returns:
            bool: Whether the subtask was completed
        """
        subtask_status = meta_data["subtask_status"]
        subtask = meta_data["subtask"]
        subtask_info = meta_data["subtask_info"]

        if subtask_status in ["Start", "Done"]:
            # If there's an existing subtask trajectory, finalize it
            if self.current_subtask_trajectory:
                self.current_subtask_trajectory += "\nSubtask Completed.\n"
                subtask_key = self.current_subtask_trajectory.split(
                    "\n----------------------\n\nPlan:\n"
                )[0]
                self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)
                self.current_subtask_trajectory = ""
                return True

            # Start new subtask trajectory
            self.current_subtask_trajectory = (
                f"Task:\n{self.current_search_query}\n\n"
                f"Subtask: {subtask}\n"
                f"Subtask Instruction: {subtask_info}\n"
                f"----------------------\n\n"
                f'Plan:\n{meta_data["executor_plan"]}\n'
            )
            return False

        elif subtask_status == "In":
            # Continue current subtask trajectory
            self.current_subtask_trajectory += (
                f'\n----------------------\n\nPlan:\n{meta_data["executor_plan"]}\n'
            )
            return False

    def finalize_task(self) -> None:
        """Finalize the task by saving any remaining trajectories."""
        # Save any remaining subtask trajectory
        if self.current_subtask_trajectory:
            self.current_subtask_trajectory += "\nSubtask Completed.\n"
            subtask_key = self.current_subtask_trajectory.split(
                "\n----------------------\n\nPlan:\n"
            )[0]
            self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)

        # Save the complete task trajectory
        if self.task_trajectory and self.current_search_query:
            self.save_narrative_memory(self.current_search_query, self.task_trajectory)

        # Reset trajectories
        self.task_trajectory = ""
        self.current_subtask_trajectory = ""
        self.current_search_query = ""

    def summarize_episode(self, trajectory: str) -> Tuple[str, List[int], str]:
        """Summarize the episode experience for lifelong learning reflection

        Args:
            trajectory (str): The episode experience to be summarized

        Returns:
            str: The summarized episode experience
        """

        # Create Reflection on whole trajectories for next round trial, keep earlier messages as exemplars
        content, total_tokens, cost = self.episode_summarization_agent.execute_tool("episode_summarization", {"str_input": trajectory})

        return content, total_tokens, cost

    def summarize_narrative(self, trajectory: str) -> Tuple[str, List[int], str]:
        """Summarize the narrative experience for lifelong learning reflection

        Args:
            trajectory (str): The narrative experience to be summarized

        Returns:
            str: The summarized narrative experience
        """
        # Create Reflection on whole trajectories for next round trial
        content, total_tokens, cost = self.narrative_summarization_agent.execute_tool("narrative_summarization", {"str_input": trajectory})

        return content, total_tokens, cost