Adjust code;

This commit is contained in:
2025-02-07 19:56:56 +08:00
parent 0ef44b002e
commit ab5ff14aba
8 changed files with 8692 additions and 1285 deletions

View File

@@ -11,11 +11,13 @@ from datasets import Dataset, DatasetDict
# Constants
API_KEY = "dataset-OFxH5fwjOmYnfBsQkSWm8gHs"
DATASETS_NAME = ["2d-mat-new", "eval-paper-new", "gold-nanorod-new", "PSK-new", "phospholipid"]
N_THREADS = 32#multiprocessing.cpu_count() # use all available CPU cores
N_THREADS = multiprocessing.cpu_count() # use all available CPU cores
OPENAI_API_KEY = "sk-urFGAQRThR6pysea0aC93bD27fA34bA69811A9254aAaD8B2"
OPENAI_BASE_URL = "https://vip.apiyi.com/v1"
MODEL_NAME = "chatgpt-4o-latest"
# MODEL_NAME = "chatgpt-4o-latest"
# MODEL_NAME = "o3-mini"
MODEL_NAME = "deepseek-reasoner"
DATASETS_URL = 'http://100.85.52.31:7080/v1/datasets?page=1&limit=100'
DOCUMENTS_URL = 'http://100.85.52.31:7080/v1/datasets/{}/documents'
CHUNKS_URL = 'http://100.85.52.31:7080/v1/datasets/{}/documents/{}/segments'
@@ -54,6 +56,9 @@ def get_all_chunks(datasets_name):
chunks = chunks_response.json()['data']
for chunk in chunks:
if chunk['tokens'] < 150:
continue
all_chunks.append({
'dataset_name': dataset['name'],
'dataset_id': dataset_id,
@@ -65,17 +70,17 @@ def get_all_chunks(datasets_name):
return all_chunks
def get_response_from_llm(messages: list[dict], tools: list = None):
def get_response_from_llm(messages: list[dict], model:str, tools: list = None):
client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
try:
if tools is None:
response = client.chat.completions.create(
model=MODEL_NAME,
model=model,
messages=messages,
)
else:
response = client.chat.completions.create(
model=MODEL_NAME,
model=model,
messages=messages,
tools=tools
)
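For readability, the updated helper presumably looks like the sketch below once this hunk is applied. This is a hedged reconstruction from the diff, not the exact file contents: the return value and the except branch sit outside the shown context and are assumptions (callers such as _qa_generator_single treat the result as a plain string).

# Hedged reconstruction of the helper after this commit; OPENAI_API_KEY and
# OPENAI_BASE_URL are the module-level constants defined earlier in this file.
from openai import OpenAI

def get_response_from_llm(messages: list[dict], model: str, tools: list = None):
    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
    try:
        if tools is None:
            response = client.chat.completions.create(model=model, messages=messages)
        else:
            response = client.chat.completions.create(model=model, messages=messages, tools=tools)
        # Assumed: the helper returns the text content, since callers split it as a string.
        return response.choices[0].message.content
    except Exception as exc:
        # Assumed fallback; the except branch is not visible in this diff.
        print(f"LLM request failed: {exc}")
        return ""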
@@ -107,7 +112,7 @@ def _qa_generator_single(sampled_context):
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": QA_generation_prompt.format(context=sampled_context['chunk_text'])}
]
output_QA_couple = get_response_from_llm(messages=messages)
output_QA_couple = get_response_from_llm(messages=messages, model=MODEL_NAME)
try:
question = output_QA_couple.split("Factoid question: ")[-1].split("Answer: ")[0]
answer = output_QA_couple.split("Answer: ")[-1]
@@ -132,14 +137,20 @@ def qa_critic(qas, num_threads: int = N_THREADS):
def _qa_critic_single(output):
evaluations = {
"groundedness": get_response_from_llm(messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": question_groundedness_critique_prompt.format(context=output['context'], question=output['question'])}]),
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": question_groundedness_critique_prompt.format(context=output['context'], question=output['question'])}],
model=MODEL_NAME
),
"relevance": get_response_from_llm(messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": question_relevance_critique_prompt.format(question=output['question'])}]),
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": question_relevance_critique_prompt.format(question=output['question'])}],
model=MODEL_NAME
),
"standalone": get_response_from_llm(messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": question_standalone_critique_prompt.format(question=output['question'])}]),
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": question_standalone_critique_prompt.format(question=output['question'])}],
model=MODEL_NAME
),
}
try:
for criterion, evaluation in evaluations.items():
@@ -171,7 +182,7 @@ if __name__ == "__main__":
generated_questions = generated_questions.loc[
(generated_questions["groundedness_score"] >= 4)
& (generated_questions["relevance_score"] >= 4)
& (generated_questions["standalone_score"] >= 1)
& (generated_questions["standalone_score"] >= 3)
]
# Create the Hugging Face dataset
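As a quick illustration of the tightened filter above (the standalone_score cutoff moves from 1 to 3 while groundedness and relevance stay at 4), here is a small self-contained check; the sample rows are made up and only mirror the three score columns used by generated_questions.

import pandas as pd

# Two hypothetical critic results: the first passed the old cutoff (standalone >= 1)
# but fails the new one (standalone >= 3); the second passes both.
rows = pd.DataFrame([
    {"groundedness_score": 5, "relevance_score": 4, "standalone_score": 2},
    {"groundedness_score": 4, "relevance_score": 5, "standalone_score": 4},
])
kept = rows.loc[
    (rows["groundedness_score"] >= 4)
    & (rows["relevance_score"] >= 4)
    & (rows["standalone_score"] >= 3)
]
print(len(kept))  # 1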

View File

@@ -1,6 +1,6 @@
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your task is to write a factoid question around the topic of material science and a detailed answer given a context.
Your factoid question should be answerable with a specific, complete piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".
@@ -77,7 +77,7 @@ Answer::: """
EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 3, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
@@ -89,7 +89,7 @@ An instruction (might include an Input inside it), a response to evaluate, a ref
###Response to evaluate:
{response}
###Reference Answer (Score 5):
###Reference Answer (Score 3):
{reference_answer}
###Score Rubrics:

File diff suppressed because one or more lines are too long

View File

@@ -11,22 +11,22 @@ from autogen_ext.models.openai import OpenAIChatCompletionClient
from _backend.constant import MODEL, OPENAI_API_KEY, OPENAI_BASE_URL
from _backend.scientist_team import create_scientist_team
model_client = OpenAIChatCompletionClient(
model=MODEL,
base_url=OPENAI_BASE_URL,
api_key=OPENAI_API_KEY,
model_info={
"vision": True,
"function_calling": True,
"json_output": True,
"family": "unknown",
},
)
# model_client = OpenAIChatCompletionClient(
# model=MODEL,
# base_url=OPENAI_BASE_URL,
# api_key=OPENAI_API_KEY,
# model_info={
# "vision": True,
# "function_calling": True,
# "json_output": True,
# "family": "unknown",
# },
# )
async def _multiagent_with_rag_cot(task: str = "") -> dict:
async def _multiagent_with_rag_cot(task: str, model_client: OpenAIChatCompletionClient) -> dict:
user = UserProxyAgent("user_agent", input_func=input)
scientist_team = create_scientist_team()
scientist_team = create_scientist_team(model_client)
result = {}
planning_agent = AssistantAgent(

View File

@@ -7,8 +7,11 @@ import os
from functools import partial
import multiprocessing
import asyncio
from autogen_ext.models.openai import OpenAIChatCompletionClient
from single_agent_with_rag import _single_agent_answer_with_rag, _single_agent_answer_with_rag_cot
from multiagent import _multiagent_with_rag_cot
from autogen_core.models import ModelFamily
OPENAI_API_KEY = "sk-urFGAQRThR6pysea0aC93bD27fA34bA69811A9254aAaD8B2"
OPENAI_BASE_URL = "http://154.44.26.195:17935/v1"
@@ -151,13 +154,25 @@ def single_model_answer_with_rag_cot(model: str):
def multiagent_with_rag_cot(model: str):
model_client = OpenAIChatCompletionClient(
model=model,
base_url=OPENAI_BASE_URL,
api_key=OPENAI_API_KEY,
model_info={
"vision": False,
"function_calling": True,
"json_output": True,
"family": ModelFamily.O3
},
)
eval_dataset = load_eval_rag_dataset(DATASET_PATH)
num_threads = 16 #multiprocessing.cpu_count()
with multiprocessing.Pool(processes=num_threads) as pool:
results = list(
tqdm(
pool.imap(
partial(run_async_in_process, _multiagent_with_rag_cot),
partial(run_async_in_process, _multiagent_with_rag_cot, model_client=model_client),
eval_dataset['question'],
),
total=len(eval_dataset),
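run_async_in_process is called above but not defined in this hunk; presumably it only drives the coroutine to completion inside each worker process, roughly as in the hedged sketch below.

import asyncio

# Assumed shape of the helper; the real implementation is not part of this diff.
# pool.imap supplies each question as the positional `task` argument, while partial()
# pins async_fn=_multiagent_with_rag_cot and model_client=model_client.
def run_async_in_process(async_fn, task, **kwargs):
    return asyncio.run(async_fn(task, **kwargs))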
@@ -179,37 +194,84 @@ def multiagent_with_rag_cot(model: str):
json.dump(final_result, f, indent=2)
def _eval_rag_dataset(instruction: str, response: str, context: str, model: str):
"""Evaluates a response with a single model.
def eval_rag_dataset(qa_json_path: str):
with open(qa_json_path, "r") as f:
qa_data = json.load(f)
eval_dataset = load_eval_rag_dataset(DATASET_PATH)
args = []
for idx, item in enumerate(eval_dataset):
if qa_data[idx]['question'] == item['question']:
arg = {
"instruction": item['question'],
"response": qa_data[idx]["answer"],
"reference_answer": item["answer"],
"model": MODEL_NAME
}
args.append(arg)
Args:
instruction (str): The instruction to evaluate the response with.
response (str): The response to evaluate.
context (str): The context to evaluate the response in.
model (str): The model to use.
num_threads = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=num_threads) as pool:
results = list(
tqdm(
pool.imap(
_eval_rag_dataset,
args
),
desc="Evaluating",
total=len(args)
)
)
for idx, (feedback, score) in enumerate(results):
qa_data[idx]["feedback"] = feedback
qa_data[idx]["score"] = score
Returns:
str: The evaluation.
"""
with open(qa_json_path, "w") as f:
json.dump(qa_data, f, indent=2)
def _eval_rag_dataset(args: dict):
instruction = args["instruction"]
response = args["response"]
reference_answer = args["reference_answer"]
model = args["model"]
messages = [
{"role": "system", "content": "You are a fair evaluator language model."},
{"role": "user", "content": EVALUATION_PROMPT.format(instruction=instruction, response=response, context=context)},
{"role": "user", "content": EVALUATION_PROMPT.format(instruction=instruction, response=response, reference_answer=reference_answer)},
]
eval_result = get_response_from_llm(messages)
feedback, score = [item.strip() for item in eval_result.split("[RESULT]")]
eval_result = get_response_from_llm(messages, model=model)
if "[RESULT]" in eval_result:
feedback, score = [item.strip() for item in eval_result.split("[RESULT]")]
else:
feedback = ""
score = ""
return feedback, score
def eval_rag_dataset():
eval_dataset = load_eval_rag_dataset(DATASET_PATH)
for i in eval_dataset:
print()
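Because the removed docstring of the old _eval_rag_dataset and the old eval_rag_dataset stub are interleaved with the added lines in this hunk, the new evaluation path is easier to follow in one piece. The sketch below is a hedged reading of the diff, not a verbatim copy, and it assumes the module-level imports (json, multiprocessing, tqdm) and names (load_eval_rag_dataset, DATASET_PATH, MODEL_NAME, EVALUATION_PROMPT, get_response_from_llm) already present in this file.

def eval_rag_dataset(qa_json_path: str):
    with open(qa_json_path, "r") as f:
        qa_data = json.load(f)
    eval_dataset = load_eval_rag_dataset(DATASET_PATH)

    # Pair each stored answer with its reference answer when the questions match.
    args = []
    for idx, item in enumerate(eval_dataset):
        if qa_data[idx]["question"] == item["question"]:
            args.append({
                "instruction": item["question"],
                "response": qa_data[idx]["answer"],
                "reference_answer": item["answer"],
                "model": MODEL_NAME,
            })

    num_threads = multiprocessing.cpu_count()
    with multiprocessing.Pool(processes=num_threads) as pool:
        results = list(
            tqdm(pool.imap(_eval_rag_dataset, args), desc="Evaluating", total=len(args))
        )

    # Results are written back by position, which assumes every question matched above.
    for idx, (feedback, score) in enumerate(results):
        qa_data[idx]["feedback"] = feedback
        qa_data[idx]["score"] = score

    with open(qa_json_path, "w") as f:
        json.dump(qa_data, f, indent=2)


def _eval_rag_dataset(args: dict):
    messages = [
        {"role": "system", "content": "You are a fair evaluator language model."},
        {"role": "user", "content": EVALUATION_PROMPT.format(
            instruction=args["instruction"],
            response=args["response"],
            reference_answer=args["reference_answer"],
        )},
    ]
    eval_result = get_response_from_llm(messages, model=args["model"])
    if "[RESULT]" in eval_result:
        feedback, score = [item.strip() for item in eval_result.split("[RESULT]")]
    else:
        feedback, score = "", ""
    return feedback, score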
def calculate_average_score(qa_json_path: str):
with open(qa_json_path, "r") as f:
qa_data = json.load(f)
scores = []
count = 0
for item in qa_data:
if "score" in item and item["score"] != "":
scores.append(int(item["score"]))
count += 1
average_score = sum(scores) / count
print(f"{qa_json_path} Average score: {average_score}")
if __name__ == "__main__":
# single_model_answer(model="chatgpt-4o-latest")
# single_model_answer(model="o1-2024-12-17")
single_model_answer(model="o3-mini")
# single_model_answer(model="o3-mini")
# single_model_answer_with_rag(model="gpt-4o-2024-08-06")
# single_model_answer_with_rag_cot(model="gpt-4o-2024-08-06")
# multiagent_with_rag_cot(model="gpt-4o-2024-08-06")
# multiagent_with_rag_cot(model="o3-mini")
# eval_rag_dataset(qa_json_path=f"{EVAL_RESULT_PATH}/gpt-4o-2024-08-06/multiagent_with_rag_cot.json")
calculate_average_score(qa_json_path=f"{EVAL_RESULT_PATH}/gpt-4o-2024-08-06/multiagent_with_rag_cot.json")
# eval_rag_dataset(qa_json_path=f"{EVAL_RESULT_PATH}/gpt-4o-2024-08-06/single_model_answer_with_rag_cot.json")
calculate_average_score(qa_json_path=f"{EVAL_RESULT_PATH}/gpt-4o-2024-08-06/single_model_answer_with_rag_cot.json")
pass