Add RAG eval
3 .gitignore vendored
@@ -1,4 +1,7 @@
 # ---> Python
+
+_backend/evaluate/eval_rag_dataset/*
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
127 _backend/evaluate/construct_rag_eval_dataset.py Normal file
@@ -0,0 +1,127 @@
import os

import requests
import pandas as pd
from openai import OpenAI
from tqdm import tqdm
from eval_prompt import QA_generation_prompt
from datasets import Dataset, DatasetDict

# Constants
API_KEY = "dataset-OFxH5fwjOmYnfBsQkSWm8gHs"
DATASETS_NAME = ["2d-mat-new", "eval-paper-new", "gold-nanorod-new", "PSK-new", "phospholipid"]
OPENAI_API_KEY = "sk-urFGAQRThR6pysea0aC93bD27fA34bA69811A9254aAaD8B2"
OPENAI_BASE_URL = "http://8.218.238.241:17935/v1"
MODEL_NAME = "gpt-4o-mini"
DATASETS_URL = 'http://100.85.52.31:7080/v1/datasets?page=1&limit=100'
DOCUMENTS_URL = 'http://100.85.52.31:7080/v1/datasets/{}/documents'
CHUNKS_URL = 'http://100.85.52.31:7080/v1/datasets/{}/documents/{}/segments'
N_GENERATIONS = -1  # -1 means generate one QA pair per chunk


def get_all_chunks(datasets_name):
    """
    Fetch every chunk of every document in the selected knowledge bases.

    Returns:
        A list containing all chunks.
    """
    headers = {'Authorization': f'Bearer {API_KEY}'}
    all_chunks = []

    # Fetch the datasets (knowledge bases)
    datasets_response = requests.get(DATASETS_URL, headers=headers)
    datasets = datasets_response.json()['data']

    for dataset in datasets:
        dataset_id = dataset['id']
        if dataset['name'] not in datasets_name:
            continue

        # Fetch the documents of this dataset
        documents_response = requests.get(DOCUMENTS_URL.format(dataset_id), headers=headers)
        documents = documents_response.json()['data']

        for document in documents:
            document_id = document['id']

            # Fetch the chunks (segments) of this document
            chunks_response = requests.get(CHUNKS_URL.format(dataset_id, document_id), headers=headers)
            chunks = chunks_response.json()['data']

            for chunk in chunks:
                all_chunks.append({
                    'dataset_name': dataset['name'],
                    'dataset_id': dataset_id,
                    'document_id': document_id,
                    'chunk_id': chunk['id'],
                    'chunk_text': chunk['content']
                })

    return all_chunks


def get_response_from_llm(messages: list[dict], tools: list = None):
    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
    if tools is None:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
        )
    else:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            tools=tools
        )
    content = response.choices[0].message.content
    return content


def qa_generator(docs_chunks: list):
    n_samples = len(docs_chunks) if N_GENERATIONS == -1 else N_GENERATIONS
    assert N_GENERATIONS <= len(docs_chunks), f"N_GENERATIONS must not exceed the number of chunks ({len(docs_chunks)})"
    print(f"Generating {n_samples} QA couples...")

    outputs = []
    for sampled_context in tqdm(docs_chunks[:n_samples]):
        # Generate a QA couple
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": QA_generation_prompt.format(context=sampled_context['chunk_text'])}
        ]
        output_QA_couple = get_response_from_llm(messages)
        try:
            question = output_QA_couple.split("Factoid question: ")[-1].split("Answer: ")[0]
            answer = output_QA_couple.split("Answer: ")[-1]
            assert len(answer) < 300, "Answer is too long"
            outputs.append(
                {
                    "context": sampled_context['chunk_text'],
                    "question": question,
                    "answer": answer,
                    "source_doc": {"dataset_id": sampled_context["dataset_id"], "document_id": sampled_context["document_id"]}
                }
            )
        except Exception:
            # Skip chunks whose completion does not match the expected format
            continue
    return outputs


if __name__ == "__main__":
    chunks = get_all_chunks(DATASETS_NAME)
    qas = qa_generator(chunks)

    # Build the Hugging Face dataset
    dataset = Dataset.from_pandas(pd.DataFrame(qas))
    dataset_dict = DatasetDict({"train": dataset})

    # Save the dataset
    dir_name = os.path.dirname(__file__)
    dataset_dict.save_to_disk(os.path.join(dir_name, "eval_rag_dataset"))
    print(f"Dataset saved locally to {dir_name}/eval_rag_dataset")

    # To publish to the Hugging Face Hub, uncomment the lines below with your username and dataset name
    # dataset_dict.push_to_hub("your-username/your-dataset-name", private=True)
    # print("Dataset saved to the Hugging Face Hub. To publish it, change the settings manually.")
43 _backend/evaluate/eval_prompt.py Normal file
@@ -0,0 +1,43 @@
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""
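
Note: this template fixes the output contract that qa_generator in construct_rag_eval_dataset.py relies on, namely that the completion contains the literal labels "Factoid question: " and "Answer: ". A minimal standalone sketch of parsing that contract; parse_qa_output is an illustrative helper, not part of this commit:

def parse_qa_output(completion: str) -> tuple[str, str]:
    """Split an LLM completion into (question, answer) on the template's two labels."""
    question = completion.split("Factoid question: ")[-1].split("Answer: ")[0].strip()
    answer = completion.split("Answer: ")[-1].strip()
    return question, answer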

EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing the evaluation criteria are given.
1. Write detailed feedback that assesses the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing the feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, or explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""
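
Note: the commit defines EVALUATION_PROMPT but never parses the judge's reply. A minimal sketch of extracting the score, assuming the judge follows the "Feedback: ... [RESULT] N" format required above; parse_judge_output is a hypothetical helper, not part of this commit:

import re

def parse_judge_output(judge_output: str) -> tuple[str, int]:
    """Split a judge reply into (feedback, score) at the [RESULT] marker."""
    feedback, _, score_part = judge_output.partition("[RESULT]")
    match = re.search(r"[1-5]", score_part)
    score = int(match.group()) if match else -1  # -1 signals a malformed reply
    return feedback.strip(), score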
44 _backend/evaluate/rag_eval.py Normal file
@@ -0,0 +1,44 @@
|
||||
from datasets import load_from_disk, Dataset, DatasetDict
|
||||
from eval_prompt import EVALUATION_PROMPT
|
||||
from openai import OpenAI
|
||||
|
||||
|
||||
OPENAI_API_KEY = "sk-urFGAQRThR6pysea0aC93bD27fA34bA69811A9254aAaD8B2"
|
||||
OPENAI_BASE_URL = "http://8.218.238.241:17935/v1"
|
||||
MODEL_NAME = "gpt-4o-mini"
|
||||
|
||||
|
||||
def load_eval_rag_dataset(dataset_path: str) -> DatasetDict:
|
||||
"""Loads the eval_rag_dataset from disk.
|
||||
|
||||
Args:
|
||||
dataset_path (str): The path to the dataset.
|
||||
|
||||
Returns:
|
||||
DatasetDict: The loaded dataset.
|
||||
"""
|
||||
return load_from_disk(dataset_path)
|
||||
|
||||
|
||||
def get_response_from_llm(messages: list[dict], tools: list = None):
|
||||
client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
|
||||
if tools is None:
|
||||
response = client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
)
|
||||
else:
|
||||
response = client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
tools=tools
|
||||
)
|
||||
content = response.choices[0].message.content
|
||||
return content
|
||||
|
||||
|
||||
|
||||
DATASET_PATH = "_backend/evaluate/eval_rag_dataset"
|
||||
eval_dataset = load_eval_rag_dataset(DATASET_PATH)['train']
|
||||
for i in eval_dataset:
|
||||
print()
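
Note: rag_eval.py stops at the placeholder loop above. A sketch of how the evaluation could continue, reusing get_response_from_llm and EVALUATION_PROMPT from this file; answer_with_rag stands in for the project's RAG pipeline and is an assumption, not something this commit provides:

def evaluate_rag(dataset, answer_with_rag) -> list[dict]:
    """Judge each RAG answer against the dataset's reference answer with an LLM judge."""
    results = []
    for example in dataset:
        candidate = answer_with_rag(example["question"])  # hypothetical RAG call, not in this commit
        prompt = EVALUATION_PROMPT.format(
            instruction=example["question"],
            response=candidate,
            reference_answer=example["answer"],
        )
        judge_output = get_response_from_llm([{"role": "user", "content": prompt}])
        # The judge replies "Feedback: ... [RESULT] N"; keep raw text alongside the score part.
        score_text = judge_output.split("[RESULT]")[-1].strip()
        results.append({"question": example["question"], "score": score_text, "judge_output": judge_output})
    return results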
@@ -1,5 +1,6 @@
 import asyncio
 from typing import Sequence
+from autogen_core import CancellationToken
 from autogen_agentchat.agents import AssistantAgent, SocietyOfMindAgent, UserProxyAgent
 from autogen_agentchat.conditions import MaxMessageTermination, TextMentionTermination, HandoffTermination
 from autogen_agentchat.messages import AgentEvent, ChatMessage, TextMessage, ToolCallExecutionEvent
@@ -23,36 +24,22 @@ model_client = OpenAIChatCompletionClient(
     },
 )

-async def main(task: str = ""):
-    user = UserProxyAgent("user_agent", input_func=input)
-    rag_agent = AssistantAgent(
-        "RAGAgent",
-        description="An expert agent in the field of materials science",
+async def main():
+    assistant = AssistantAgent(
+        name="assistant",
+        system_message="""You are a helpful assistant. You can call tools to help user.""",
         model_client=model_client,
-        system_message="""
-        You are a professional scientist in materials science.
-        You solve material science problems together by talking to users, and you can invoke tools to retrieve information from the knowledge base to implement RAG.
-
-        Always handoff back to user_agent when response is complete.
-        """,
-        handoffs=["user_agent"],
-        reflect_on_tool_use=True,
-        tools=[vector_retrieval_from_knowledge_base]
+        tools=[vector_retrieval_from_knowledge_base],
+        reflect_on_tool_use=True,  # Set to True to have the model reflect on the tool use, set to False to return the tool call result directly.
     )

-    # handoff_termination = HandoffTermination("DataAnalyst_PlanningAgent")
-    text_mention_termination = TextMentionTermination("APPROVE")
-    max_messages_termination = MaxMessageTermination(max_messages=50)
-    termination = text_mention_termination | max_messages_termination  # | handoff_termination
-    # termination = max_messages_termination
-
-    team = Swarm(
-        participants=[rag_agent, user],
-        termination_condition=termination
-    )
-
-    await Console(team.run_stream(task=task))
+    while True:
+        user_input = input("User: ")
+        if user_input == "exit":
+            break
+        response = await assistant.on_messages([TextMessage(content=user_input, source="user")], CancellationToken())
+        print("Assistant:", response.chat_message.content)


 if __name__ == "__main__":
-    asyncio.run(main("Let the robot synthesize CsPbBr3 nanocubes at room temperature"))
+    asyncio.run(main())