新增rag eval

This commit is contained in:
2025-02-04 12:48:54 +08:00
parent cc3b28a59a
commit 2c9b1bba3a
5 changed files with 231 additions and 27 deletions

View File

@@ -0,0 +1,127 @@
import requests
import pandas as pd
import json
from openai import OpenAI
from tqdm import tqdm
from eval_prompt import QA_generation_prompt
from datasets import Dataset, DatasetDict
# Constants
# SECURITY NOTE(review): the API keys/tokens below are hard-coded; they should
# be moved to environment variables or a secrets manager before this code is
# shared or deployed.
API_KEY = "dataset-OFxH5fwjOmYnfBsQkSWm8gHs"  # Dify dataset API token (Bearer auth)
DATASETS_NAME = ["2d-mat-new", "eval-paper-new", "gold-nanorod-new", "PSK-new", "phospholipid"]  # knowledge bases to sample chunks from
OPENAI_API_KEY = "sk-urFGAQRThR6pysea0aC93bD27fA34bA69811A9254aAaD8B2"
OPENAI_BASE_URL = "http://8.218.238.241:17935/v1"  # OpenAI-compatible proxy endpoint
MODEL_NAME = "gpt-4o-mini"
DATASETS_URL = 'http://100.85.52.31:7080/v1/datasets?page=1&limit=100'  # NOTE(review): only page 1 is ever fetched
DOCUMENTS_URL = 'http://100.85.52.31:7080/v1/datasets/{}/documents'  # format args: dataset_id
CHUNKS_URL = 'http://100.85.52.31:7080/v1/datasets/{}/documents/{}/segments'  # format args: dataset_id, document_id
N_GENERATIONS = -1  # -1 means "generate one QA pair per available chunk"
def get_all_chunks(datasets_name):
    """Fetch every chunk (segment) of every document in the named knowledge bases.

    Args:
        datasets_name: iterable of dataset names to include; datasets whose
            name is not in this collection are skipped.

    Returns:
        list[dict]: one dict per chunk with keys ``dataset_name``,
        ``dataset_id``, ``document_id``, ``chunk_id`` and ``chunk_text``.

    Raises:
        requests.HTTPError: if any API call returns an error status
            (previously HTTP failures surfaced as confusing KeyErrors
            when indexing the JSON body).
    """
    all_chunks = []
    # Reuse one connection (and the auth header) across the many sequential calls.
    session = requests.Session()
    session.headers.update({'Authorization': f'Bearer {API_KEY}'})
    # List the datasets (knowledge bases).
    datasets_response = session.get(DATASETS_URL)
    datasets_response.raise_for_status()
    datasets = datasets_response.json()['data']
    for dataset in datasets:
        if dataset['name'] not in datasets_name:
            continue
        dataset_id = dataset['id']
        # List the documents of this dataset.
        documents_response = session.get(DOCUMENTS_URL.format(dataset_id))
        documents_response.raise_for_status()
        documents = documents_response.json()['data']
        for document in documents:
            document_id = document['id']
            # List the chunks (segments) of this document.
            chunks_response = session.get(CHUNKS_URL.format(dataset_id, document_id))
            chunks_response.raise_for_status()
            chunks = chunks_response.json()['data']
            for chunk in chunks:
                all_chunks.append({
                    'dataset_name': dataset['name'],
                    'dataset_id': dataset_id,
                    'document_id': document_id,
                    'chunk_id': chunk['id'],
                    'chunk_text': chunk['content']
                })
    # NOTE(review): the documents/segments endpoints appear paginated (the
    # datasets URL uses page/limit) but only the first page is read here —
    # confirm whether pagination handling is needed for large knowledge bases.
    return all_chunks
def get_response_from_llm(messages: list[dict], tools: list = None):
    """Send a chat-completion request and return the assistant's text content.

    Args:
        messages: OpenAI-style chat messages ({"role": ..., "content": ...}).
        tools: optional tool/function definitions; omitted from the request
            entirely when None.

    Returns:
        The ``content`` string of the first choice's message (may be None if
        the model returned no text content).
    """
    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
    # Build the kwargs once instead of duplicating the create() call in
    # two nearly identical branches.
    kwargs = {"model": MODEL_NAME, "messages": messages}
    if tools is not None:
        kwargs["tools"] = tools
    response = client.chat.completions.create(**kwargs)
    return response.choices[0].message.content
def qa_generator(docs_chunks: list):
    """Generate factoid question/answer pairs from document chunks via the LLM.

    Args:
        docs_chunks: chunk dicts as produced by ``get_all_chunks`` (must have
            ``chunk_text``, ``dataset_id`` and ``document_id`` keys).

    Returns:
        list[dict]: entries with ``context``, ``question``, ``answer`` and
        ``source_doc``. Chunks whose LLM output cannot be parsed, or whose
        answer is 300+ characters, are skipped (same as before).
    """
    # N_GENERATIONS == -1 means "use every chunk".
    n_samples = len(docs_chunks) if N_GENERATIONS == -1 else N_GENERATIONS
    assert N_GENERATIONS <= len(docs_chunks), f"N_GENERATIONS MUST LOWER THAN THE LENGTH OF chunks {len(docs_chunks)}"
    print(f"Generating {n_samples} QA couples...")
    outputs = []
    for sampled_context in tqdm(docs_chunks[:n_samples]):
        # Ask the LLM for one QA couple grounded in this chunk.
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": QA_generation_prompt.format(context=sampled_context['chunk_text'])}
        ]
        output_QA_couple = get_response_from_llm(messages)
        try:
            # Expected reply shape: "... Factoid question: <q> Answer: <a>".
            question = output_QA_couple.split("Factoid question: ")[-1].split("Answer: ")[0]
            answer = output_QA_couple.split("Answer: ")[-1]
            if len(answer) >= 300:
                # Overlong answers mean the model ignored the "concise factoid"
                # instruction; drop the sample. (Was an assert caught by a bare
                # except — that check would vanish under `python -O`.)
                continue
            outputs.append(
                {
                    "context": sampled_context['chunk_text'],
                    "question": question,
                    "answer": answer,
                    "source_doc": {"dataset_id": sampled_context["dataset_id"], "document_id": sampled_context["document_id"]}
                }
            )
        except (AttributeError, KeyError, TypeError) as e:
            # AttributeError/TypeError: LLM returned None content;
            # KeyError: malformed chunk dict. The original bare `except:`
            # also swallowed KeyboardInterrupt/SystemExit.
            print(f"Skipping chunk due to parse failure: {e!r}")
            continue
    return outputs
if __name__ == "__main__":
    # Collect every chunk from the configured knowledge bases, then turn
    # them into QA pairs with the LLM.
    chunks = get_all_chunks(DATASETS_NAME)
    qas = qa_generator(chunks)
    # Build a Hugging Face dataset with a single "train" split.
    dataset = Dataset.from_pandas(pd.DataFrame(qas))
    dataset_dict = DatasetDict({"train": dataset})
    # Persist the dataset next to this script.
    import os
    dir_name = os.path.dirname(__file__)
    dataset_dict.save_to_disk(os.path.join(dir_name, "eval_rag_dataset"))
    print(f"数据集已保存至本地 {dir_name}/eval_rag_dataset")
    # To publish to the Hugging Face Hub, uncomment the lines below and
    # supply your username and dataset name.
    # dataset_dict.push_to_hub("your-username/your-dataset-name", private=True)
    # print("数据集已保存至 Hugging Face Hub。要发布数据集请手动更改设置。")

View File

@@ -0,0 +1,43 @@
# Prompt used to synthesize one factoid question/answer pair from a single
# retrieved context chunk. The caller fills in {context} via str.format and
# parses the reply on the literal markers "Factoid question: " / "Answer: ",
# so those marker strings must not be changed independently of the parser.
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".
Provide your answer as follows:
Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)
Now here is the context.
Context: {context}\n
Output:::"""
# LLM-as-judge prompt: rates a response against a reference answer on a 1-5
# rubric. Placeholders {instruction}, {response} and {reference_answer} are
# filled via str.format; the judge is instructed to reply
# "Feedback: ... [RESULT] <integer 1-5>", which callers parse on "[RESULT]".
EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.
###The instruction to evaluate:
{instruction}
###Response to evaluate:
{response}
###Reference Answer (Score 5):
{reference_answer}
###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.
###Feedback:"""

View File

@@ -0,0 +1,44 @@
from datasets import load_from_disk, Dataset, DatasetDict
from eval_prompt import EVALUATION_PROMPT
from openai import OpenAI
# SECURITY NOTE(review): hard-coded credentials duplicated from the dataset
# generation script; move to environment variables / a shared config module.
OPENAI_API_KEY = "sk-urFGAQRThR6pysea0aC93bD27fA34bA69811A9254aAaD8B2"
OPENAI_BASE_URL = "http://8.218.238.241:17935/v1"  # OpenAI-compatible proxy endpoint
MODEL_NAME = "gpt-4o-mini"
def load_eval_rag_dataset(dataset_path: str) -> DatasetDict:
    """Read back the RAG-evaluation dataset previously written with save_to_disk.

    Args:
        dataset_path (str): Directory holding the serialized dataset.

    Returns:
        DatasetDict: The dataset splits loaded from disk.
    """
    loaded = load_from_disk(dataset_path)
    return loaded
def get_response_from_llm(messages: list[dict], tools: list = None):
    """Send a chat-completion request and return the assistant's text content.

    NOTE(review): this function is copy-pasted from the dataset generation
    script; consider extracting it into a shared module.

    Args:
        messages: OpenAI-style chat messages ({"role": ..., "content": ...}).
        tools: optional tool/function definitions; omitted from the request
            entirely when None.

    Returns:
        The ``content`` string of the first choice's message (may be None if
        the model returned no text content).
    """
    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
    # Build the kwargs once instead of duplicating the create() call in
    # two nearly identical branches.
    kwargs = {"model": MODEL_NAME, "messages": messages}
    if tools is not None:
        kwargs["tools"] = tools
    response = client.chat.completions.create(**kwargs)
    return response.choices[0].message.content
# Path to the dataset produced by the QA-generation script (save_to_disk output).
DATASET_PATH = "_backend/evaluate/eval_rag_dataset"
eval_dataset = load_eval_rag_dataset(DATASET_PATH)['train']  # only the "train" split is evaluated
for i in eval_dataset:
print()