layer2 commit

lzy
2025-05-28 11:00:24 +08:00
parent 6a6b09ae20
commit 9f5318c23d
66 changed files with 286574 additions and 0 deletions

layer2/PGEE/code/EN_ckj.json Normal file (6843 additions)

File diff suppressed because it is too large

Binary file not shown.

Binary file not shown.


@@ -0,0 +1,194 @@
import json
import re
import time
import os
import asyncio
from concurrent.futures import ThreadPoolExecutor
from openai import OpenAI
import numpy as np
from tqdm import tqdm
from prompts import CLEAN_PROMPTS, SELECT_QUESTION_PROMPT
API_KEY=""
BASE_URL="https://vip.apiyi.com/v1"
MODEL_GPT="text-embedding-ada-002"
MODELS = ["deepseek-reasoner", "claude-3-7-sonnet-20250219", "qwen-max", "deepseek-chat", "gemini-pro"]
def load_data(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
return data
# Decide whether a question/answer pair is complete
def check_question_completeness(question, answer):
    try:
        client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
        response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": ""},
                {"role": "user", "content": CLEAN_PROMPTS.replace("{QUESTION}", question).replace("{ANSWER}", answer)}
            ],
            temperature=0.0
        )
        result = response.choices[0].message.content.strip()
        # Crude verdict extraction: treat any response containing "1" as complete
        if "1" in result:
            return 1
        else:
            return 0
    except Exception as e:
        print(f"Error checking question completeness: {e}")
        return 0
# Score the difficulty of a question with one model
def score_question_difficulty(model_name, question, answer):
    try:
        client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": ""},
                {"role": "user", "content": SELECT_QUESTION_PROMPT.replace("{QUESTION}", question).replace("{ANSWER}", answer)}
            ],
            temperature=0.2
        )
        result = response.choices[0].message.content.strip()
        # Try to extract a JSON object from the response
        try:
            # Locate the first '{' and the last '}'
            start_idx = result.find('{')
            end_idx = result.rfind('}') + 1
            if start_idx >= 0 and end_idx > start_idx:
                json_str = result[start_idx:end_idx]
                json_result = json.loads(json_str)
                return json_result.get("score", 0)
            else:
                # No JSON object found; fall back to a regex on the score field
                score_match = re.search(r'"score":\s*(\d+)', result)
                if score_match:
                    return int(score_match.group(1))
                else:
                    print(f"Could not parse a score from the model response: {result}")
                    return 0
        except Exception as e:
            print(f"Error while parsing JSON: {e}")
            print(f"Raw response: {result}")
            return 0
    except Exception as e:
        print(f"Error scoring with model {model_name}: {e}")
        time.sleep(5)  # Back off after an error
        return 0
# Process a single question asynchronously
async def process_question(data_item, executor):
    idx = data_item["idx"]
    question = data_item["question"]
    answer = data_item["answer"]
    # First check whether the question is complete
    is_complete = check_question_completeness(question, answer)
    if is_complete != 1:
        return None
    # Score with all models in parallel via the thread pool
    scores = {}
    loop = asyncio.get_event_loop()
    score_tasks = []
    for model in MODELS:
        # score_question_difficulty takes (model_name, question, answer)
        score_tasks.append(
            loop.run_in_executor(
                executor,
                score_question_difficulty,
                model,
                question,
                answer
            )
        )
    # Collect all scoring results
    model_scores = await asyncio.gather(*score_tasks)
    # Merge the per-model scores
    total_score = 0
    for i, model in enumerate(MODELS):
        scores[model] = model_scores[i]
        total_score += model_scores[i]
    # Build the result record
    result = {
        "id": idx,
        "question": question,
        "answer": answer,
        "total_score": total_score
    }
    # Attach each model's score
    for model in MODELS:
        result[model] = scores[model]
    return result
# Save results to a file
def save_results(results, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
# Main processing routine
async def main(input_file, output_file, score_file, top_n=2000):
    # Load the data
    data = load_data(input_file)
    results = []
    with ThreadPoolExecutor(max_workers=5) as executor:
        tasks = []
        for item in data:
            tasks.append(process_question(item, executor))
        # Show a progress bar
        for f in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Processing questions"):
            result = await f
            if result is not None:
                results.append(result)
    # Sort by total score, hardest first
    results.sort(key=lambda x: x["total_score"], reverse=True)
    # Keep the top_n questions
    top_results = results[:top_n]
    # Save the selection
    save_results(top_results, output_file)
    # Save the per-model scores
    score_results = []
    for item in results:
        score_item = {
            "id": item["id"],
            "question": item["question"],
            "answer": item["answer"]
        }
        # Attach each model's score
        for model in MODELS:
            score_item[model] = item[model]
        score_results.append(score_item)
    save_results(score_results, score_file)
    print(f"Done. {len(results)} complete questions in total; the top {len(top_results)} hardest were selected.")
if __name__ == "__main__":
    input_file = "input.json"  # Input JSON file
    output_file = "top_difficult_questions.json"  # Output: the 2000 hardest questions
    score_file = "scores.json"  # Output: all model scores
    # Run the main routine
    asyncio.run(main(input_file, output_file, score_file, 2000))
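For reference, main() reads input.json as a flat list of records whose keys match those consumed by process_question. A hypothetical two-item input file (field names taken from the code above; values are placeholders):

[
  {"idx": 1, "question": "First question text", "answer": "First answer text"},
  {"idx": 2, "question": "Second question text", "answer": "Second answer text"}
]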

layer2/PGEE/code/dataset.json Normal file (24687 additions)

File diff suppressed because one or more lines are too long

layer2/PGEE/code/merge.py Normal file (19 additions)

@@ -0,0 +1,19 @@
import json
def merge_and_renumber_json(file1, file2, output_file):
with open(file1, 'r', encoding='utf-8') as f1:
data1 = json.load(f1)
with open(file2, 'r', encoding='utf-8') as f2:
data2 = json.load(f2)
merged_data = data1 + data2
for new_idx, item in enumerate(merged_data, start=1):
item['idx'] = new_idx
with open(output_file, 'w', encoding='utf-8') as f_out:
json.dump(merged_data, f_out, indent=2, ensure_ascii=False)
print(f"合并完成,输出文件为: {output_file}")
merge_and_renumber_json('/home/ubuntu/50T/fsy/layer2/QA/code/EN-single_select_includes_process.json', '/home/ubuntu/50T/fsy/layer2/QA/code/821_single_select.json', '/home/ubuntu/50T/fsy/layer2/QA/code/merged.json')


@@ -0,0 +1,44 @@
CLEAN_PROMPTS="""
"""
SELECT_QUESTION_PROMPT = """
Given the unique answer, evaluate the following **questions** and decide which one best matches it. The better a question matches the answer, the higher its score. Rate each question-answer pairing on a scale from **1 to 5**, with 1 being the worst match and 5 the best. Then give a brief reason why the best-scoring question matches the answer.
### **Rating Criteria**:
- **5**: Perfect match - The question aligns exactly with the answer and covers all of its key information.
- **4**: High match - The question and answer are mostly consistent and cover the core content of the answer.
- **3**: Medium match - The question partially agrees with the answer, but does not match exactly, or the answer does not fully cover the question's requirements.
- **2**: Low match - There is a gap between the question and the answer; more details would be needed for a match.
- **1**: Very low match - The question has little to do with the answer, or the answer does not match the question at all.
### Note: also factor into your evaluation whether the question asks about the recommended functional group. If it does, score it higher; if not, score it lower.
### **Inputs:**
1. **unique answer**:
{ANSWER}
2. **questions**:
{QUESTIONS}
### **Output format:**
- Score how well each question matches the answer, in the following JSON format:
```json
{
  "questions": [
    {"id": 1, "score": xxxx},
    {"id": 2, "score": xxxx},
    {"id": 3, "score": xxxx},
    ...
  ]
}
```
"""

Binary file not shown.

layer2/PGEE/code/renum.py Normal file (23 additions)

@@ -0,0 +1,23 @@
import json
def renumber_json_indices(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    for idx, item in enumerate(data, start=1):
        item['idx'] = idx
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"Indices renumbered and saved to {output_file}")
    print(f"Processed {len(data)} records")
if __name__ == "__main__":
    input_file = "/home/ubuntu/50T/fsy/layer2/QA/single_select.json"  # Replace with your input file
    output_file = "/home/ubuntu/50T/fsy/layer2/QA/single_select_renum.json"  # Replace with your output file
    renumber_json_indices(input_file, output_file)

File diff suppressed because one or more lines are too long


@@ -0,0 +1,46 @@
"""
0. 将问题从xls提取为json
1. 将问题进行拆分
2. 翻译成英文
3. 去重
4. 使用大模型进行难度评估和筛选
"""
import pandas as pd
import json
import os
def process_excel_files(directory):
all_data = []
# 获取目录下所有xlsx文件
excel_files = [f for f in os.listdir(directory) if f.endswith('.xlsx')]
for excel_file in excel_files:
file_path = os.path.join(directory, excel_file)
df = pd.read_excel(file_path)
if 'Question' in df.columns and 'Answer' in df.columns:
# 将每行转换为字典并添加到列表中
for _, row in df.iterrows():
data_item = {
'question': str(row['Question']).strip(),
'answer': str(row['Answer']).strip()
}
all_data.append(data_item)
else:
print(f"警告: {excel_file} 缺少必要的列 (question/answer)")
# 将数据保存为JSON文件
output_file = os.path.join(directory, 'qa_data.json')
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(all_data, f, ensure_ascii=False, indent=2)
print(f"处理完成!共处理了 {len(all_data)} 条数据")
print(f"数据已保存到: {output_file}")
if __name__ == '__main__':
# 指定Excel文件所在的目录
directory = os.path.dirname(os.path.abspath(__file__))
process_excel_files(directory)
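For reference, the qa_data.json written here is a flat list of question/answer pairs; note that it does not yet carry the idx field that renum.py later assigns and that process_question reads. A hypothetical two-row sheet (placeholder values) would yield:

[
  {"question": "First question text", "answer": "First answer text"},
  {"question": "Second question text", "answer": "Second answer text"}
]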

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -0,0 +1,137 @@
"""
0. 将问题从xls提取为json
1. 将问题进行拆分
2. 翻译成英文
3. 去重
4. 使用大模型进行难度评估和筛选
"""
import json
import time
import threading
import queue
from concurrent.futures import ThreadPoolExecutor
from openai import OpenAI
import re
result_lock = threading.Lock()
api_semaphore = threading.Semaphore(5)
processed_data =[]
error_items = []
API_KEY="sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d"
BASE_URL="https://vip.apiyi.com/v1"
MODEL_GPT ="deepseek-chat"
client = OpenAI(api_key=API_KEY,base_url=BASE_URL)
def load_qa_data(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
return data
def translate_qa_type(question, answer):
    prompt = f"""
Please strictly translate the following Chinese questions and answers into English, and return the results according to the specified JSON format:
Question: {question}
Answer: {answer}
Translation requirements:
- Only translate the Chinese expressions; any additions or modifications to the content are prohibited
- Preserve all information points, expressions, and numerical values exactly as in the original text
- Keep professional terminology accurate
- Return plain text, do not use markdown format
Return the translation results in the following JSON format:
[
    {{
        "question": "Translated English question",
        "answer": "Translated English answer"
    }}
]
"""
    with api_semaphore:
        try:
            response = client.chat.completions.create(
                model=MODEL_GPT,
                messages=[
                    {"role": "system", "content": "You are an expert translator with extensive knowledge of materials science, tasked with translating Chinese texts into highly accurate English, ensuring the correct usage of scientific terminology."},
                    {"role": "user", "content": prompt}
                ],
                stream=False
            )
            result = response.choices[0].message.content.strip()
            print(result)
            process_result = confirm_json_string(result)
            return json.loads(process_result)
        except Exception as e:
            print(f"API call error: {e}")
            # Non-list sentinel: process_item treats any non-list return value as a failure
            return "2"
def confirm_json_string(json_string):
    # Normalize curly quotes, escape backslashes, and drop raw newlines
    json_string = re.sub(r'[“”]', '"', json_string)
    json_string = re.sub(r'\\', r'\\\\', json_string)
    json_string = re.sub(r'\\"', r'\"', json_string)
    json_string = json_string.replace("\n", "").replace("\r", "")
    # Strip Markdown code-fence wrapping
    if json_string.startswith("```json"):
        json_string = json_string[len("```json"):]
    json_string = json_string.strip('`\n')
    return json_string
def process_item(item, index, total):
print(f"处理第 {index+1}/{total} 条数据...")
question = item["question"]
answer = item["answer"]
data = translate_qa_type(question,answer)
with result_lock:
if isinstance(data, list):
processed_data.append({
"idx": item['idx'],
"question": data[0]["question"],
"answer": data[0]["answer"]
})
else:
error_items.append({
"idx": item['idx'],
"question": question,
"answer": answer
})
def save_processed_data(data, output_file):
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=2)
def main():
input_file = "/home/ubuntu/50T/fsy/layer2/QA/single_select.json"
output_file = "/home/ubuntu/50T/fsy/layer2/QA/EN-single_select.json"
error_file = "/home/ubuntu/50T/fsy/error.json"
data = load_qa_data(input_file)
total = len(data)
with ThreadPoolExecutor(max_workers=10) as executor:
futures = []
for i, item in enumerate(data):
future = executor.submit(process_item, item, i, total)
futures.append(future)
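            # Throttle task submission: brief pause after every 10 items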
if (i+1) % 10 == 0:
time.sleep(1)
for future in futures:
future.result()
    save_processed_data(processed_data, output_file)
    print(f"Done; results saved to {output_file}")
    if error_items:
        save_processed_data(error_items, error_file)
        print(f"Failed items saved to {error_file}")
if __name__ == "__main__":
main()

File diff suppressed because one or more lines are too long


@@ -0,0 +1,130 @@
"""
0. 将问题从xls提取为json
1. 将问题进行拆分
2. 翻译成英文
3. 去重
4. 使用大模型进行难度评估和筛选
"""
from openai import OpenAI
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from prompts import CLEAN_PROMPTS, SELECT_QUESTION_PROMPT
API_KEY="sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d"
BASE_URL="https://vip.apiyi.com/v1"
MODEL_GPT="text-embedding-ada-002"
MODELS = ["deepseek-reasoner", "claude-3-7-sonnet-20250219", "qwen-max", "deepseek-chat", "gemini-pro"]
def get_embedding(text):
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
    response = client.embeddings.create(
        model=MODEL_GPT,
        input=text
    )
    return response.data[0].embedding
def compute_embeddings(texts):
    embeddings = []
    for i, text in enumerate(texts):
        print("Processing {}/{}".format(i + 1, len(texts)))
        embeddings.append(get_embedding(text))
    return np.array(embeddings)
def load_json(file_path):
with open(file_path, 'r', encoding='utf-8') as file:
return json.load(file)
def save_json(data, file_path):
with open(file_path, 'w', encoding='utf-8') as file:
json.dump(data, file, ensure_ascii=False, indent=2)
def save_embeddings(embeddings, file_path):
with open(file_path, 'wb') as file:
pickle.dump(embeddings, file)
def load_embeddings(file_path):
with open(file_path, 'rb') as file:
return pickle.load(file)
def deduplicate_qa(data, save_vectors=True):
    questions = [item['question'] for item in data]
    # Generate the embeddings
    question_embeddings = compute_embeddings(questions)
    if save_vectors:
        print("Saving the question embeddings...")
        save_embeddings(question_embeddings, '/home/ubuntu/50T/fsy/layer2/QA/question_embeddings.pkl')
    # Deduplication logic
    filtered_data, duplicate_entries = de_emphasize(question_embeddings, data)
    return filtered_data, duplicate_entries
def deduplicate_qa_pkl(data, pkl_path):
    question_embeddings = load_embeddings(pkl_path)
    filtered_data, duplicate_entries = de_emphasize(question_embeddings, data)
    return filtered_data, duplicate_entries
def de_emphasize(question_embeddings, data, similarity_threshold=0.99):
    unique_indices = []
    duplicate_entries = []  # Collects the duplicate QA pairs
    for i in range(len(data)):
        print("Processing {}/{}".format(i + 1, len(data)))
        duplicate_found = False
        for j in unique_indices:
            # Semantic similarity between the two questions
            question_sim = cosine_similarity([question_embeddings[i]], [question_embeddings[j]])[0][0]
            # Treat the pair as a duplicate once the similarity exceeds the threshold
            if question_sim > similarity_threshold:
                duplicate_found = True
                # Record both the current QA pair and the matched one in `duplicate_entries`
                duplicate_entries.append({
                    "duplicate_question": data[i]['question'],
                    "duplicate_answer": data[i]['answer'],
                    "matched_question": data[j]['question'],
                    "matched_answer": data[j]['answer']
                })
                break
        if not duplicate_found:
            unique_indices.append(i)
    # Build the deduplicated data
    filtered_data = [data[i] for i in unique_indices]
    return filtered_data, duplicate_entries
# Main program
if __name__ == '__main__':
    input_file = '/home/ubuntu/50T/fsy/layer2/PGEE/code/dataset.json'  # Input JSON path
    output_file = '/home/ubuntu/50T/fsy/layer2/PGEE/code/onrepeat_99.json'  # Output path for the deduplicated data
    duplicates_file = '/home/ubuntu/50T/fsy/layer2/PGEE/codeduplicates_99.json'  # Output path for the filtered-out QA pairs
    pkl_path = "/home/ubuntu/50T/fsy/layer2/PGEE/question_embeddings.pkl"
    qa_data = load_json(input_file)
    # Deduplicate and save the computed embeddings as a pkl file
    # filtered_data, duplicate_entries = deduplicate_qa(qa_data, similarity_threshold=0.9)
    # Load the saved pkl file and deduplicate against it
    filtered_data, duplicate_entries = deduplicate_qa_pkl(qa_data, pkl_path)
    # Filter the questions by difficulty
    # For non-multiple-choice questions, pick the most similar answers as the wrong options
    # Save the deduplicated QA pairs and the duplicate ones
    save_json(filtered_data, output_file)
    save_json(duplicate_entries, duplicates_file)
    print(f"Deduplication complete! {len(qa_data)} QA pairs before, {len(filtered_data)} remaining after.")
    print(f"Duplicate QA pairs saved to {duplicates_file} ({len(duplicate_entries)} entries).")