Adjust code;
@@ -11,11 +11,13 @@ from datasets import Dataset, DatasetDict
 # Constants
 API_KEY = "dataset-OFxH5fwjOmYnfBsQkSWm8gHs"
 DATASETS_NAME = ["2d-mat-new", "eval-paper-new", "gold-nanorod-new", "PSK-new", "phospholipid"]
-N_THREADS = 32#multiprocessing.cpu_count()  # use all available CPU cores
+N_THREADS = multiprocessing.cpu_count()  # use all available CPU cores
 
 OPENAI_API_KEY = "sk-urFGAQRThR6pysea0aC93bD27fA34bA69811A9254aAaD8B2"
 OPENAI_BASE_URL = "https://vip.apiyi.com/v1"
-MODEL_NAME = "chatgpt-4o-latest"
+# MODEL_NAME = "chatgpt-4o-latest"
+# MODEL_NAME = "o3-mini"
+MODEL_NAME = "deepseek-reasoner"
 DATASETS_URL = 'http://100.85.52.31:7080/v1/datasets?page=1&limit=100'
 DOCUMENTS_URL = 'http://100.85.52.31:7080/v1/datasets/{}/documents'
 CHUNKS_URL = 'http://100.85.52.31:7080/v1/datasets/{}/documents/{}/segments'
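For orientation, a minimal sketch of how the constants above are presumably consumed by get_all_chunks; the Bearer-token header and the shape of the JSON payload are assumptions, not shown in this diff.

import requests

def list_datasets():
    # Assumption: the knowledge-base API authenticates with a Bearer token built from API_KEY.
    headers = {"Authorization": f"Bearer {API_KEY}"}
    resp = requests.get(DATASETS_URL, headers=headers, timeout=30)
    resp.raise_for_status()
    # Assumption: the dataset list sits under a "data" key, matching the
    # chunks_response.json()['data'] access used further down.
    return resp.json().get("data", [])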
@@ -54,6 +56,9 @@ def get_all_chunks(datasets_name):
         chunks = chunks_response.json()['data']
 
         for chunk in chunks:
+            if chunk['tokens'] < 150:
+                continue
+
             all_chunks.append({
                 'dataset_name': dataset['name'],
                 'dataset_id': dataset_id,
@@ -65,17 +70,17 @@ def get_all_chunks(datasets_name):
     return all_chunks
 
 
-def get_response_from_llm(messages: list[dict], tools: list = None):
+def get_response_from_llm(messages: list[dict], model:str, tools: list = None):
     client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
     try:
         if tools is None:
             response = client.chat.completions.create(
-                model=MODEL_NAME,
+                model=model,
                 messages=messages,
             )
         else:
             response = client.chat.completions.create(
-                model=MODEL_NAME,
+                model=model,
                 messages=messages,
                 tools=tools
            )
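A minimal usage sketch of the refactored signature: callers now pass the model explicitly instead of relying on the module-level MODEL_NAME; the example messages are illustrative only.

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize the given context in one sentence."},
]
# The same helper can now target different models without editing the constant.
reply = get_response_from_llm(messages=messages, model=MODEL_NAME)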
@@ -107,7 +112,7 @@ def _qa_generator_single(sampled_context):
             {"role": "system", "content": "You are a helpful assistant."},
             {"role": "user", "content": QA_generation_prompt.format(context=sampled_context['chunk_text'])}
         ]
-        output_QA_couple = get_response_from_llm(messages=messages)
+        output_QA_couple = get_response_from_llm(messages=messages, model=MODEL_NAME)
         try:
             question = output_QA_couple.split("Factoid question: ")[-1].split("Answer: ")[0]
             answer = output_QA_couple.split("Answer: ")[-1]
@@ -132,14 +137,20 @@ def qa_critic(qas, num_threads: int = N_THREADS):
     def _qa_critic_single(output):
         evaluations = {
             "groundedness": get_response_from_llm(messages=[
-                {"role": "system", "content": "You are a helpful assistant."},
-                {"role": "user", "content": question_groundedness_critique_prompt.format(context=output['context'], question=output['question'])}]),
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": question_groundedness_critique_prompt.format(context=output['context'], question=output['question'])}],
+                model=MODEL_NAME
+            ),
             "relevance": get_response_from_llm(messages=[
-                {"role": "system", "content": "You are a helpful assistant."},
-                {"role": "user", "content": question_relevance_critique_prompt.format(question=output['question'])}]),
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": question_relevance_critique_prompt.format(question=output['question'])}],
+                model=MODEL_NAME
+            ),
             "standalone": get_response_from_llm(messages=[
-                {"role": "system", "content": "You are a helpful assistant."},
-                {"role": "user", "content": question_standalone_critique_prompt.format(question=output['question'])}]),
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": question_standalone_critique_prompt.format(question=output['question'])}],
+                model=MODEL_NAME
+            ),
         }
         try:
             for criterion, evaluation in evaluations.items():
@@ -171,7 +182,7 @@ if __name__ == "__main__":
     generated_questions = generated_questions.loc[
         (generated_questions["groundedness_score"] >= 4)
         & (generated_questions["relevance_score"] >= 4)
-        & (generated_questions["standalone_score"] >= 1)
+        & (generated_questions["standalone_score"] >= 3)
     ]
 
     # Create the Hugging Face dataset
@@ -1,6 +1,6 @@
 QA_generation_prompt = """
-Your task is to write a factoid question and an answer given a context.
-Your factoid question should be answerable with a specific, concise piece of factual information from the context.
+Your task is to write a factoid question around the topic of material science and a detailed answer given a context.
+Your factoid question should be answerable with a specific, complete piece of factual information from the context.
 Your factoid question should be formulated in the same style as questions users could ask in a search engine.
 This means that your factoid question MUST NOT mention something like "according to the passage" or "context".
 
@@ -77,7 +77,7 @@ Answer::: """
 
 
 EVALUATION_PROMPT = """###Task Description:
-An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
+An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 3, and a score rubric representing a evaluation criteria are given.
 1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
 2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
 3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
@@ -89,7 +89,7 @@ An instruction (might include an Input inside it), a response to evaluate, a ref
 ###Response to evaluate:
 {response}
 
-###Reference Answer (Score 5):
+###Reference Answer (Score 3):
 {reference_answer}
 
 ###Score Rubrics:
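The prompt asks the evaluator for "Feedback: ... [RESULT] n"; a small parsing sketch of that format, mirroring the guarded split this commit introduces in _eval_rag_dataset below.

def parse_evaluation(text: str) -> tuple[str, str]:
    # Split on the first "[RESULT]" marker; fall back to empty values when the
    # model did not follow the requested output format.
    if "[RESULT]" in text:
        feedback, score = [part.strip() for part in text.split("[RESULT]", 1)]
        return feedback, score
    return "", ""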
(Two additional file diffs were suppressed because they are too large; one more was suppressed because one or more of its lines are too long.)
@@ -11,22 +11,22 @@ from autogen_ext.models.openai import OpenAIChatCompletionClient
 from _backend.constant import MODEL, OPENAI_API_KEY, OPENAI_BASE_URL
 from _backend.scientist_team import create_scientist_team
 
-model_client = OpenAIChatCompletionClient(
-    model=MODEL,
-    base_url=OPENAI_BASE_URL,
-    api_key=OPENAI_API_KEY,
-    model_info={
-        "vision": True,
-        "function_calling": True,
-        "json_output": True,
-        "family": "unknown",
-    },
-)
+# model_client = OpenAIChatCompletionClient(
+#     model=MODEL,
+#     base_url=OPENAI_BASE_URL,
+#     api_key=OPENAI_API_KEY,
+#     model_info={
+#         "vision": True,
+#         "function_calling": True,
+#         "json_output": True,
+#         "family": "unknown",
+#     },
+# )
 
-async def _multiagent_with_rag_cot(task: str = "") -> dict:
+async def _multiagent_with_rag_cot(task: str, model_client: OpenAIChatCompletionClient) -> dict:
     user = UserProxyAgent("user_agent", input_func=input)
 
-    scientist_team = create_scientist_team()
+    scientist_team = create_scientist_team(model_client)
 
     result = {}
     planning_agent = AssistantAgent(
@@ -7,8 +7,11 @@ import os
 from functools import partial
 import multiprocessing
+import asyncio
+from autogen_ext.models.openai import OpenAIChatCompletionClient
 from single_agent_with_rag import _single_agent_answer_with_rag, _single_agent_answer_with_rag_cot
 from multiagent import _multiagent_with_rag_cot
+from autogen_core.models import ModelFamily
 
 
 OPENAI_API_KEY = "sk-urFGAQRThR6pysea0aC93bD27fA34bA69811A9254aAaD8B2"
 OPENAI_BASE_URL = "http://154.44.26.195:17935/v1"
@@ -151,13 +154,25 @@ def single_model_answer_with_rag_cot(model: str):
 
 
 def multiagent_with_rag_cot(model: str):
+    model_client = OpenAIChatCompletionClient(
+        model=model,
+        base_url=OPENAI_BASE_URL,
+        api_key=OPENAI_API_KEY,
+        model_info={
+            "vision": False,
+            "function_calling": True,
+            "json_output": True,
+            "family": ModelFamily.O3
+        },
+    )
+
     eval_dataset = load_eval_rag_dataset(DATASET_PATH)
     num_threads = 16 #multiprocessing.cpu_count()
     with multiprocessing.Pool(processes=num_threads) as pool:
         results = list(
             tqdm(
                 pool.imap(
-                    partial(run_async_in_process, _multiagent_with_rag_cot),
+                    partial(run_async_in_process, _multiagent_with_rag_cot, model_client=model_client),
                     eval_dataset['question'],
                 ),
                 total=len(eval_dataset),
|
||||
json.dump(final_result, f, indent=2)
|
||||
|
||||
|
||||
def _eval_rag_dataset(instruction: str, response: str, context: str, model: str):
|
||||
"""Evaluates a response with a single model.
|
||||
def eval_rag_dataset(qa_json_path: str):
|
||||
with open(qa_json_path, "r") as f:
|
||||
qa_data = json.load(f)
|
||||
|
||||
eval_dataset = load_eval_rag_dataset(DATASET_PATH)
|
||||
args = []
|
||||
for idx, item in enumerate(eval_dataset):
|
||||
if qa_data[idx]['question'] == item['question']:
|
||||
arg = {
|
||||
"instruction": item['question'],
|
||||
"response": qa_data[idx]["answer"],
|
||||
"reference_answer": item["answer"],
|
||||
"model": MODEL_NAME
|
||||
}
|
||||
args.append(arg)
|
||||
|
||||
Args:
|
||||
instruction (str): The instruction to evaluate the response with.
|
||||
response (str): The response to evaluate.
|
||||
context (str): The context to evaluate the response in.
|
||||
model (str): The model to use.
|
||||
num_threads = multiprocessing.cpu_count()
|
||||
with multiprocessing.Pool(processes=num_threads) as pool:
|
||||
results = list(
|
||||
tqdm(
|
||||
pool.imap(
|
||||
_eval_rag_dataset,
|
||||
args
|
||||
),
|
||||
desc="Evaluating",
|
||||
total=len(args)
|
||||
)
|
||||
)
|
||||
for idx, (feedback, score) in enumerate(results):
|
||||
qa_data[idx]["feedback"] = feedback
|
||||
qa_data[idx]["score"] = score
|
||||
|
||||
Returns:
|
||||
str: The evaluation.
|
||||
"""
|
||||
with open(qa_json_path, "w") as f:
|
||||
json.dump(qa_data, f, indent=2)
|
||||
|
||||
|
||||
def _eval_rag_dataset(args: dict):
|
||||
instruction = args["instruction"]
|
||||
response = args["response"]
|
||||
reference_answer = args["reference_answer"]
|
||||
model = args["model"]
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a fair evaluator language model."},
|
||||
{"role": "user", "content": EVALUATION_PROMPT.format(instruction=instruction, response=response, context=context)},
|
||||
{"role": "user", "content": EVALUATION_PROMPT.format(instruction=instruction, response=response, reference_answer=reference_answer)},
|
||||
]
|
||||
eval_result = get_response_from_llm(messages)
|
||||
feedback, score = [item.strip() for item in eval_result.split("[RESULT]")]
|
||||
eval_result = get_response_from_llm(messages, model=model)
|
||||
if "[RESULT]" in eval_result:
|
||||
feedback, score = [item.strip() for item in eval_result.split("[RESULT]")]
|
||||
else:
|
||||
feedback = ""
|
||||
score = ""
|
||||
return feedback, score
|
||||
|
||||
|
||||
-def eval_rag_dataset():
-    eval_dataset = load_eval_rag_dataset(DATASET_PATH)
-    for i in eval_dataset:
-        print()
+def calculate_average_score(qa_json_path: str):
+    with open(qa_json_path, "r") as f:
+        qa_data = json.load(f)
+    scores = []
+    count = 0
+    for item in qa_data:
+        if "score" in item and item["score"] != "":
+            scores.append(int(item["score"]))
+            count += 1
+    average_score = sum(scores) / count
+    print(f"{qa_json_path} Average score: {average_score}")
 
 
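calculate_average_score divides by count, so a result file with no scored items would raise ZeroDivisionError; a defensive variant is sketched here (the _safe suffix is hypothetical, not part of the commit).

def calculate_average_score_safe(qa_json_path: str):
    with open(qa_json_path, "r") as f:
        qa_data = json.load(f)
    # Keep only items that actually carry a non-empty score.
    scores = [int(item["score"]) for item in qa_data if item.get("score") not in ("", None)]
    if not scores:
        print(f"{qa_json_path} has no scored items yet")
        return None
    average_score = sum(scores) / len(scores)
    print(f"{qa_json_path} Average score: {average_score}")
    return average_score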
 if __name__ == "__main__":
     # single_model_answer(model="chatgpt-4o-latest")
     # single_model_answer(model="o1-2024-12-17")
-    single_model_answer(model="o3-mini")
+    # single_model_answer(model="o3-mini")
     # single_model_answer_with_rag(model="gpt-4o-2024-08-06")
     # single_model_answer_with_rag_cot(model="gpt-4o-2024-08-06")
     # multiagent_with_rag_cot(model="gpt-4o-2024-08-06")
 
+    # multiagent_with_rag_cot(model="o3-mini")
+
+
+    # eval_rag_dataset(qa_json_path=f"{EVAL_RESULT_PATH}/gpt-4o-2024-08-06/multiagent_with_rag_cot.json")
+    calculate_average_score(qa_json_path=f"{EVAL_RESULT_PATH}/gpt-4o-2024-08-06/multiagent_with_rag_cot.json")
+    # eval_rag_dataset(qa_json_path=f"{EVAL_RESULT_PATH}/gpt-4o-2024-08-06/single_model_answer_with_rag_cot.json")
+    calculate_average_score(qa_json_path=f"{EVAL_RESULT_PATH}/gpt-4o-2024-08-06/single_model_answer_with_rag_cot.json")
     pass