Add RAG eval
3 .gitignore vendored
@@ -1,4 +1,7 @@
 # ---> Python
+
+_backend/evaluate/eval_rag_dataset/*
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
127 _backend/evaluate/construct_rag_eval_dataset.py Normal file
@@ -0,0 +1,127 @@
import os

import requests
import pandas as pd
from openai import OpenAI
from tqdm import tqdm
from eval_prompt import QA_generation_prompt
from datasets import Dataset, DatasetDict

# Constants
API_KEY = "dataset-OFxH5fwjOmYnfBsQkSWm8gHs"
DATASETS_NAME = ["2d-mat-new", "eval-paper-new", "gold-nanorod-new", "PSK-new", "phospholipid"]
OPENAI_API_KEY = "sk-urFGAQRThR6pysea0aC93bD27fA34bA69811A9254aAaD8B2"
OPENAI_BASE_URL = "http://8.218.238.241:17935/v1"
MODEL_NAME = "gpt-4o-mini"
DATASETS_URL = 'http://100.85.52.31:7080/v1/datasets?page=1&limit=100'
DOCUMENTS_URL = 'http://100.85.52.31:7080/v1/datasets/{}/documents'
CHUNKS_URL = 'http://100.85.52.31:7080/v1/datasets/{}/documents/{}/segments'
N_GENERATIONS = -1  # -1 means generate one QA pair per chunk


def get_all_chunks(datasets_name):
    """
    Fetch every chunk of every document in the selected knowledge bases.

    Returns:
        A list containing all chunks.
    """
    headers = {'Authorization': f'Bearer {API_KEY}'}
    all_chunks = []

    # Fetch the datasets (knowledge bases)
    datasets_response = requests.get(DATASETS_URL, headers=headers)
    datasets = datasets_response.json()['data']

    for dataset in datasets:
        dataset_id = dataset['id']
        if dataset['name'] not in datasets_name:
            continue

        # Fetch the documents of this dataset
        documents_response = requests.get(DOCUMENTS_URL.format(dataset_id), headers=headers)
        documents = documents_response.json()['data']

        for document in documents:
            document_id = document['id']

            # Fetch the chunks (segments) of this document
            chunks_response = requests.get(CHUNKS_URL.format(dataset_id, document_id), headers=headers)
            chunks = chunks_response.json()['data']

            for chunk in chunks:
                all_chunks.append({
                    'dataset_name': dataset['name'],
                    'dataset_id': dataset_id,
                    'document_id': document_id,
                    'chunk_id': chunk['id'],
                    'chunk_text': chunk['content']
                })

    return all_chunks


def get_response_from_llm(messages: list[dict], tools: list = None):
    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
    if tools is None:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
        )
    else:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            tools=tools
        )
    content = response.choices[0].message.content
    return content


def qa_generator(docs_chunks: list):
    n_samples = len(docs_chunks) if N_GENERATIONS == -1 else N_GENERATIONS
    assert N_GENERATIONS <= len(docs_chunks), f"N_GENERATIONS must not exceed the number of chunks ({len(docs_chunks)})"
    print(f"Generating {n_samples} QA couples...")

    outputs = []
    for sampled_context in tqdm(docs_chunks[:n_samples]):
        # Generate a QA couple
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": QA_generation_prompt.format(context=sampled_context['chunk_text'])}
        ]
        output_QA_couple = get_response_from_llm(messages)
        try:
            question = output_QA_couple.split("Factoid question: ")[-1].split("Answer: ")[0]
            answer = output_QA_couple.split("Answer: ")[-1]
            assert len(answer) < 300, "Answer is too long"
            outputs.append(
                {
                    "context": sampled_context['chunk_text'],
                    "question": question,
                    "answer": answer,
                    "source_doc": {"dataset_id": sampled_context["dataset_id"], "document_id": sampled_context["document_id"]}
                }
            )
        except Exception:
            # Skip chunks whose completion does not match the expected format
            continue
    return outputs


if __name__ == "__main__":
    chunks = get_all_chunks(DATASETS_NAME)
    qas = qa_generator(chunks)

    # Build the Hugging Face dataset
    dataset = Dataset.from_pandas(pd.DataFrame(qas))
    dataset_dict = DatasetDict({"train": dataset})

    # Save the dataset
    dir_name = os.path.dirname(__file__)
    dataset_dict.save_to_disk(os.path.join(dir_name, "eval_rag_dataset"))
    print(f"Dataset saved locally to {dir_name}/eval_rag_dataset")

    # To publish to the Hugging Face Hub, uncomment the lines below with your username and dataset name
    # dataset_dict.push_to_hub("your-username/your-dataset-name", private=True)
    # print("Dataset saved to the Hugging Face Hub. To publish it, change the settings manually.")
43 _backend/evaluate/eval_prompt.py Normal file
@@ -0,0 +1,43 @@
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""
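
Note: this template fixes the output contract that qa_generator in construct_rag_eval_dataset.py relies on, namely that the completion contains the literal labels "Factoid question: " and "Answer: ". A minimal standalone sketch of parsing that contract; parse_qa_output is an illustrative helper, not part of this commit:

def parse_qa_output(completion: str) -> tuple[str, str]:
    """Split an LLM completion into (question, answer) on the template's two labels."""
    question = completion.split("Factoid question: ")[-1].split("Answer: ")[0].strip()
    answer = completion.split("Answer: ")[-1].strip()
    return question, answer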

EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing the evaluation criteria are given.
1. Write detailed feedback that assesses the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing the feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, or explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""
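
Note: the commit defines EVALUATION_PROMPT but never parses the judge's reply. A minimal sketch of extracting the score, assuming the judge follows the "Feedback: ... [RESULT] N" format required above; parse_judge_output is a hypothetical helper, not part of this commit:

import re

def parse_judge_output(judge_output: str) -> tuple[str, int]:
    """Split a judge reply into (feedback, score) at the [RESULT] marker."""
    feedback, _, score_part = judge_output.partition("[RESULT]")
    match = re.search(r"[1-5]", score_part)
    score = int(match.group()) if match else -1  # -1 signals a malformed reply
    return feedback.strip(), score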
44 _backend/evaluate/rag_eval.py Normal file
@@ -0,0 +1,44 @@
|
||||
from datasets import load_from_disk, Dataset, DatasetDict
|
||||
from eval_prompt import EVALUATION_PROMPT
|
||||
from openai import OpenAI
|
||||
|
||||
|
||||
OPENAI_API_KEY = "sk-urFGAQRThR6pysea0aC93bD27fA34bA69811A9254aAaD8B2"
|
||||
OPENAI_BASE_URL = "http://8.218.238.241:17935/v1"
|
||||
MODEL_NAME = "gpt-4o-mini"
|
||||
|
||||
|
||||
def load_eval_rag_dataset(dataset_path: str) -> DatasetDict:
|
||||
"""Loads the eval_rag_dataset from disk.
|
||||
|
||||
Args:
|
||||
dataset_path (str): The path to the dataset.
|
||||
|
||||
Returns:
|
||||
DatasetDict: The loaded dataset.
|
||||
"""
|
||||
return load_from_disk(dataset_path)
|
||||
|
||||
|
||||
def get_response_from_llm(messages: list[dict], tools: list = None):
|
||||
client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
|
||||
if tools is None:
|
||||
response = client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
)
|
||||
else:
|
||||
response = client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
tools=tools
|
||||
)
|
||||
content = response.choices[0].message.content
|
||||
return content
|
||||
|
||||
|
||||
|
||||
DATASET_PATH = "_backend/evaluate/eval_rag_dataset"
|
||||
eval_dataset = load_eval_rag_dataset(DATASET_PATH)['train']
|
||||
for i in eval_dataset:
|
||||
print()
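
Note: rag_eval.py stops at the placeholder loop above. A sketch of how the evaluation could continue, reusing get_response_from_llm and EVALUATION_PROMPT from this file; answer_with_rag stands in for the project's RAG pipeline and is an assumption, not something this commit provides:

def evaluate_rag(dataset, answer_with_rag) -> list[dict]:
    """Judge each RAG answer against the dataset's reference answer with an LLM judge."""
    results = []
    for example in dataset:
        candidate = answer_with_rag(example["question"])  # hypothetical RAG call, not in this commit
        prompt = EVALUATION_PROMPT.format(
            instruction=example["question"],
            response=candidate,
            reference_answer=example["answer"],
        )
        judge_output = get_response_from_llm([{"role": "user", "content": prompt}])
        # The judge replies "Feedback: ... [RESULT] N"; keep raw text alongside the score part.
        score_text = judge_output.split("[RESULT]")[-1].strip()
        results.append({"question": example["question"], "score": score_text, "judge_output": judge_output})
    return results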
@@ -1,5 +1,6 @@
 import asyncio
 from typing import Sequence
+from autogen_core import CancellationToken
 from autogen_agentchat.agents import AssistantAgent, SocietyOfMindAgent, UserProxyAgent
 from autogen_agentchat.conditions import MaxMessageTermination, TextMentionTermination, HandoffTermination
 from autogen_agentchat.messages import AgentEvent, ChatMessage, TextMessage, ToolCallExecutionEvent
@@ -23,36 +24,22 @@ model_client = OpenAIChatCompletionClient(
     },
 )

-async def main(task: str = ""):
-    user = UserProxyAgent("user_agent", input_func=input)
-    rag_agent = AssistantAgent(
-        "RAGAgent",
-        description="An expert agent in the field of materials science",
+async def main():
+    assistant = AssistantAgent(
+        name="assistant",
+        system_message="""You are a helpful assistant. You can call tools to help user.""",
         model_client=model_client,
-        system_message="""
-        You are a professional scientist in materials science.
-        You solve material science problems together by talking to users, and you can invoke tools to retrieve information from the knowledge base to implement RAG.
-
-        Always handoff back to user_agent when response is complete.
-        """,
-        handoffs=["user_agent"],
-        reflect_on_tool_use=True,
-        tools=[vector_retrieval_from_knowledge_base]
+        tools=[vector_retrieval_from_knowledge_base],
+        reflect_on_tool_use=True,  # Set to True to have the model reflect on the tool use, set to False to return the tool call result directly.
     )

-    # handoff_termination = HandoffTermination("DataAnalyst_PlanningAgent")
-    text_mention_termination = TextMentionTermination("APPROVE")
-    max_messages_termination = MaxMessageTermination(max_messages=50)
-    termination = text_mention_termination | max_messages_termination  # | handoff_termination
-    # termination = max_messages_termination
-
-    team = Swarm(
-        participants=[rag_agent, user],
-        termination_condition=termination
-    )
-
-    await Console(team.run_stream(task=task))
+    while True:
+        user_input = input("User: ")
+        if user_input == "exit":
+            break
+        response = await assistant.on_messages([TextMessage(content=user_input, source="user")], CancellationToken())
+        print("Assistant:", response.chat_message.content)


 if __name__ == "__main__":
-    asyncio.run(main("Let the robot synthesize CsPbBr3 nanocubes at room temperature"))
+    asyncio.run(main())