Adjust code;
@@ -11,11 +11,13 @@ from datasets import Dataset, DatasetDict
 # Constants
 API_KEY = "dataset-OFxH5fwjOmYnfBsQkSWm8gHs"
 DATASETS_NAME = ["2d-mat-new", "eval-paper-new", "gold-nanorod-new", "PSK-new", "phospholipid"]
-N_THREADS = 32#multiprocessing.cpu_count()  # use all available CPU cores
+N_THREADS = multiprocessing.cpu_count()  # use all available CPU cores
 
 OPENAI_API_KEY = "sk-urFGAQRThR6pysea0aC93bD27fA34bA69811A9254aAaD8B2"
 OPENAI_BASE_URL = "https://vip.apiyi.com/v1"
-MODEL_NAME = "chatgpt-4o-latest"
+# MODEL_NAME = "chatgpt-4o-latest"
+# MODEL_NAME = "o3-mini"
+MODEL_NAME = "deepseek-reasoner"
 DATASETS_URL = 'http://100.85.52.31:7080/v1/datasets?page=1&limit=100'
 DOCUMENTS_URL = 'http://100.85.52.31:7080/v1/datasets/{}/documents'
 CHUNKS_URL = 'http://100.85.52.31:7080/v1/datasets/{}/documents/{}/segments'
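For orientation, a minimal sketch of how the constants above are presumably consumed by get_all_chunks; the Bearer-token header and the shape of the JSON payload are assumptions, not shown in this diff.

import requests

def list_datasets():
    # Assumption: the knowledge-base API authenticates with a Bearer token built from API_KEY.
    headers = {"Authorization": f"Bearer {API_KEY}"}
    resp = requests.get(DATASETS_URL, headers=headers, timeout=30)
    resp.raise_for_status()
    # Assumption: the dataset list sits under a "data" key, matching the
    # chunks_response.json()['data'] access used further down.
    return resp.json().get("data", [])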
@@ -54,6 +56,9 @@ def get_all_chunks(datasets_name):
         chunks = chunks_response.json()['data']
 
         for chunk in chunks:
+            if chunk['tokens'] < 150:
+                continue
+
             all_chunks.append({
                 'dataset_name': dataset['name'],
                 'dataset_id': dataset_id,
@@ -65,17 +70,17 @@ def get_all_chunks(datasets_name):
     return all_chunks
 
 
-def get_response_from_llm(messages: list[dict], tools: list = None):
+def get_response_from_llm(messages: list[dict], model:str, tools: list = None):
     client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
     try:
         if tools is None:
             response = client.chat.completions.create(
-                model=MODEL_NAME,
+                model=model,
                 messages=messages,
             )
         else:
             response = client.chat.completions.create(
-                model=MODEL_NAME,
+                model=model,
                 messages=messages,
                 tools=tools
            )
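A minimal usage sketch of the refactored signature: callers now pass the model explicitly instead of relying on the module-level MODEL_NAME; the example messages are illustrative only.

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize the given context in one sentence."},
]
# The same helper can now target different models without editing the constant.
reply = get_response_from_llm(messages=messages, model=MODEL_NAME)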
@@ -107,7 +112,7 @@ def _qa_generator_single(sampled_context):
             {"role": "system", "content": "You are a helpful assistant."},
             {"role": "user", "content": QA_generation_prompt.format(context=sampled_context['chunk_text'])}
         ]
-        output_QA_couple = get_response_from_llm(messages=messages)
+        output_QA_couple = get_response_from_llm(messages=messages, model=MODEL_NAME)
         try:
             question = output_QA_couple.split("Factoid question: ")[-1].split("Answer: ")[0]
             answer = output_QA_couple.split("Answer: ")[-1]
@@ -132,14 +137,20 @@ def qa_critic(qas, num_threads: int = N_THREADS):
     def _qa_critic_single(output):
         evaluations = {
             "groundedness": get_response_from_llm(messages=[
-                {"role": "system", "content": "You are a helpful assistant."},
-                {"role": "user", "content": question_groundedness_critique_prompt.format(context=output['context'], question=output['question'])}]),
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": question_groundedness_critique_prompt.format(context=output['context'], question=output['question'])}],
+                model=MODEL_NAME
+            ),
             "relevance": get_response_from_llm(messages=[
-                {"role": "system", "content": "You are a helpful assistant."},
-                {"role": "user", "content": question_relevance_critique_prompt.format(question=output['question'])}]),
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": question_relevance_critique_prompt.format(question=output['question'])}],
+                model=MODEL_NAME
+            ),
             "standalone": get_response_from_llm(messages=[
-                {"role": "system", "content": "You are a helpful assistant."},
-                {"role": "user", "content": question_standalone_critique_prompt.format(question=output['question'])}]),
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": question_standalone_critique_prompt.format(question=output['question'])}],
+                model=MODEL_NAME
+            ),
         }
         try:
             for criterion, evaluation in evaluations.items():
@@ -171,7 +182,7 @@ if __name__ == "__main__":
     generated_questions = generated_questions.loc[
         (generated_questions["groundedness_score"] >= 4)
         & (generated_questions["relevance_score"] >= 4)
-        & (generated_questions["standalone_score"] >= 1)
+        & (generated_questions["standalone_score"] >= 3)
     ]
 
     # Create the Hugging Face dataset
@@ -1,6 +1,6 @@
 QA_generation_prompt = """
-Your task is to write a factoid question and an answer given a context.
-Your factoid question should be answerable with a specific, concise piece of factual information from the context.
+Your task is to write a factoid question around the topic of material science and a detailed answer given a context.
+Your factoid question should be answerable with a specific, complete piece of factual information from the context.
 Your factoid question should be formulated in the same style as questions users could ask in a search engine.
 This means that your factoid question MUST NOT mention something like "according to the passage" or "context".
 
@@ -77,7 +77,7 @@ Answer::: """
 
 
 EVALUATION_PROMPT = """###Task Description:
-An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
+An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 3, and a score rubric representing a evaluation criteria are given.
 1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
 2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
 3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
@@ -89,7 +89,7 @@ An instruction (might include an Input inside it), a response to evaluate, a ref
 ###Response to evaluate:
 {response}
 
-###Reference Answer (Score 5):
+###Reference Answer (Score 3):
 {reference_answer}
 
 ###Score Rubrics:
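The prompt asks the evaluator for "Feedback: ... [RESULT] n"; a small parsing sketch of that format, mirroring the guarded split this commit introduces in _eval_rag_dataset below.

def parse_evaluation(text: str) -> tuple[str, str]:
    # Split on the first "[RESULT]" marker; fall back to empty values when the
    # model did not follow the requested output format.
    if "[RESULT]" in text:
        feedback, score = [part.strip() for part in text.split("[RESULT]", 1)]
        return feedback, score
    return "", ""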
(Two additional file diffs were suppressed because they are too large; one more was suppressed because one or more of its lines are too long.)
@@ -11,22 +11,22 @@ from autogen_ext.models.openai import OpenAIChatCompletionClient
 from _backend.constant import MODEL, OPENAI_API_KEY, OPENAI_BASE_URL
 from _backend.scientist_team import create_scientist_team
 
-model_client = OpenAIChatCompletionClient(
-    model=MODEL,
-    base_url=OPENAI_BASE_URL,
-    api_key=OPENAI_API_KEY,
-    model_info={
-        "vision": True,
-        "function_calling": True,
-        "json_output": True,
-        "family": "unknown",
-    },
-)
+# model_client = OpenAIChatCompletionClient(
+#     model=MODEL,
+#     base_url=OPENAI_BASE_URL,
+#     api_key=OPENAI_API_KEY,
+#     model_info={
+#         "vision": True,
+#         "function_calling": True,
+#         "json_output": True,
+#         "family": "unknown",
+#     },
+# )
 
-async def _multiagent_with_rag_cot(task: str = "") -> dict:
+async def _multiagent_with_rag_cot(task: str, model_client: OpenAIChatCompletionClient) -> dict:
     user = UserProxyAgent("user_agent", input_func=input)
 
-    scientist_team = create_scientist_team()
+    scientist_team = create_scientist_team(model_client)
 
     result = {}
     planning_agent = AssistantAgent(
@@ -7,8 +7,11 @@ import os
 from functools import partial
 import multiprocessing
+import asyncio
+from autogen_ext.models.openai import OpenAIChatCompletionClient
 from single_agent_with_rag import _single_agent_answer_with_rag, _single_agent_answer_with_rag_cot
 from multiagent import _multiagent_with_rag_cot
+from autogen_core.models import ModelFamily
 
 
 OPENAI_API_KEY = "sk-urFGAQRThR6pysea0aC93bD27fA34bA69811A9254aAaD8B2"
 OPENAI_BASE_URL = "http://154.44.26.195:17935/v1"
@@ -151,13 +154,25 @@ def single_model_answer_with_rag_cot(model: str):
 
 
 def multiagent_with_rag_cot(model: str):
+    model_client = OpenAIChatCompletionClient(
+        model=model,
+        base_url=OPENAI_BASE_URL,
+        api_key=OPENAI_API_KEY,
+        model_info={
+            "vision": False,
+            "function_calling": True,
+            "json_output": True,
+            "family": ModelFamily.O3
+        },
+    )
+
     eval_dataset = load_eval_rag_dataset(DATASET_PATH)
     num_threads = 16 #multiprocessing.cpu_count()
     with multiprocessing.Pool(processes=num_threads) as pool:
         results = list(
             tqdm(
                 pool.imap(
-                    partial(run_async_in_process, _multiagent_with_rag_cot),
+                    partial(run_async_in_process, _multiagent_with_rag_cot, model_client=model_client),
                     eval_dataset['question'],
                 ),
                 total=len(eval_dataset),
|
||||
json.dump(final_result, f, indent=2)
|
||||
|
||||
|
||||
def _eval_rag_dataset(instruction: str, response: str, context: str, model: str):
|
||||
"""Evaluates a response with a single model.
|
||||
def eval_rag_dataset(qa_json_path: str):
|
||||
with open(qa_json_path, "r") as f:
|
||||
qa_data = json.load(f)
|
||||
|
||||
eval_dataset = load_eval_rag_dataset(DATASET_PATH)
|
||||
args = []
|
||||
for idx, item in enumerate(eval_dataset):
|
||||
if qa_data[idx]['question'] == item['question']:
|
||||
arg = {
|
||||
"instruction": item['question'],
|
||||
"response": qa_data[idx]["answer"],
|
||||
"reference_answer": item["answer"],
|
||||
"model": MODEL_NAME
|
||||
}
|
||||
args.append(arg)
|
||||
|
||||
Args:
|
||||
instruction (str): The instruction to evaluate the response with.
|
||||
response (str): The response to evaluate.
|
||||
context (str): The context to evaluate the response in.
|
||||
model (str): The model to use.
|
||||
num_threads = multiprocessing.cpu_count()
|
||||
with multiprocessing.Pool(processes=num_threads) as pool:
|
||||
results = list(
|
||||
tqdm(
|
||||
pool.imap(
|
||||
_eval_rag_dataset,
|
||||
args
|
||||
),
|
||||
desc="Evaluating",
|
||||
total=len(args)
|
||||
)
|
||||
)
|
||||
for idx, (feedback, score) in enumerate(results):
|
||||
qa_data[idx]["feedback"] = feedback
|
||||
qa_data[idx]["score"] = score
|
||||
|
||||
Returns:
|
||||
str: The evaluation.
|
||||
"""
|
||||
with open(qa_json_path, "w") as f:
|
||||
json.dump(qa_data, f, indent=2)
|
||||
|
||||
|
||||
def _eval_rag_dataset(args: dict):
|
||||
instruction = args["instruction"]
|
||||
response = args["response"]
|
||||
reference_answer = args["reference_answer"]
|
||||
model = args["model"]
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a fair evaluator language model."},
|
||||
{"role": "user", "content": EVALUATION_PROMPT.format(instruction=instruction, response=response, context=context)},
|
||||
{"role": "user", "content": EVALUATION_PROMPT.format(instruction=instruction, response=response, reference_answer=reference_answer)},
|
||||
]
|
||||
eval_result = get_response_from_llm(messages)
|
||||
feedback, score = [item.strip() for item in eval_result.split("[RESULT]")]
|
||||
eval_result = get_response_from_llm(messages, model=model)
|
||||
if "[RESULT]" in eval_result:
|
||||
feedback, score = [item.strip() for item in eval_result.split("[RESULT]")]
|
||||
else:
|
||||
feedback = ""
|
||||
score = ""
|
||||
return feedback, score
|
||||
|
||||
|
||||
-def eval_rag_dataset():
-    eval_dataset = load_eval_rag_dataset(DATASET_PATH)
-    for i in eval_dataset:
-        print()
+def calculate_average_score(qa_json_path: str):
+    with open(qa_json_path, "r") as f:
+        qa_data = json.load(f)
+    scores = []
+    count = 0
+    for item in qa_data:
+        if "score" in item and item["score"] != "":
+            scores.append(int(item["score"]))
+            count += 1
+    average_score = sum(scores) / count
+    print(f"{qa_json_path} Average score: {average_score}")
 
 
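calculate_average_score divides by count, so a result file with no scored items would raise ZeroDivisionError; a defensive variant is sketched here (the _safe suffix is hypothetical, not part of the commit).

def calculate_average_score_safe(qa_json_path: str):
    with open(qa_json_path, "r") as f:
        qa_data = json.load(f)
    # Keep only items that actually carry a non-empty score.
    scores = [int(item["score"]) for item in qa_data if item.get("score") not in ("", None)]
    if not scores:
        print(f"{qa_json_path} has no scored items yet")
        return None
    average_score = sum(scores) / len(scores)
    print(f"{qa_json_path} Average score: {average_score}")
    return average_score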
 if __name__ == "__main__":
     # single_model_answer(model="chatgpt-4o-latest")
     # single_model_answer(model="o1-2024-12-17")
-    single_model_answer(model="o3-mini")
+    # single_model_answer(model="o3-mini")
     # single_model_answer_with_rag(model="gpt-4o-2024-08-06")
     # single_model_answer_with_rag_cot(model="gpt-4o-2024-08-06")
     # multiagent_with_rag_cot(model="gpt-4o-2024-08-06")
 
+    # multiagent_with_rag_cot(model="o3-mini")
+
+
+    # eval_rag_dataset(qa_json_path=f"{EVAL_RESULT_PATH}/gpt-4o-2024-08-06/multiagent_with_rag_cot.json")
+    calculate_average_score(qa_json_path=f"{EVAL_RESULT_PATH}/gpt-4o-2024-08-06/multiagent_with_rag_cot.json")
+    # eval_rag_dataset(qa_json_path=f"{EVAL_RESULT_PATH}/gpt-4o-2024-08-06/single_model_answer_with_rag_cot.json")
+    calculate_average_score(qa_json_path=f"{EVAL_RESULT_PATH}/gpt-4o-2024-08-06/single_model_answer_with_rag_cot.json")
     pass