layer2 commit

2025-05-28 11:00:24 +08:00
parent 6a6b09ae20
commit 9f5318c23d
66 changed files with 286574 additions and 0 deletions
--- a/layer2/PGEE/code/step2_translate.py
+++ b/layer2/PGEE/code/step2_translate.py
@@ -0,0 +1,137 @@
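+"""
+Pipeline overview (this script, step2_translate.py, covers the translation step):
+0. Extract the questions from xls into json
+1. Split the questions
+2. Translate them into English
+3. Deduplicate
+4. Use a large language model for difficulty assessment and filtering
+"""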
+import json
+import os
+import queue
+import re
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+from openai import OpenAI
+
+result_lock = threading.Lock()
+api_semaphore = threading.Semaphore(5)  
+processed_data =[]
+error_items = []
+API_KEY="sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d"
+BASE_URL="https://vip.apiyi.com/v1"
+MODEL_GPT ="deepseek-chat"
+
+client = OpenAI(api_key=API_KEY,base_url=BASE_URL)
+
+def load_qa_data(file_path):
+    with open(file_path, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+    return data
+
+def translate_qa_type(question,answer):
+    prompt = f"""
+            
+            Please strictly translate the following Chinese questions and answers into English, and return the results according to the specified JSON format:
+
+            Question: {question}
+            Answer: {answer}
+
+            Translation requirements:
+            - Only translate the Chinese expressions, any additions or modifications to the content are prohibited
+            - Maintain all information points, expressions, and numerical values exactly as in the original text
+            - Keep professional terminology accurate
+            - Return plain text, do not use markdown format
+
+            Return the translation results according to the following JSON format:
+            [
+            {{
+            "question": "Translated English question",
+            "answer": "Translated English answer"
+            }}
+            ]
+            """
+    
+    with api_semaphore:
+        try:
+            response = client.chat.completions.create(
+                model = MODEL_GPT,
+                messages=[
+                    {"role": "system", "content": "You are an expert translator with extensive knowledge of materials science, tasked with translating Chinese texts into highly accurate English, ensuring the correct usage of scientific terminology."},
+                    {"role": "user", "content": prompt}
+                ],
+                stream=False
+            )
+            result = response.choices[0].message.content.strip()
+            print(result)
+            process_result = comfirm_json_string(result)
+            return json.loads(process_result)
+        except Exception as e:
+            print(f"API调用错误: {e}")
+            return "2" 
+
+def comfirm_json_string(json_string):
+    json_string = re.sub(r'[“”]', '"', json_string)
+    json_string = re.sub(r'\\', r'\\\\', json_string)
+    json_string = re.sub(r'\\"', r'\"', json_string)
+    json_string = json_string.replace("\n", "").replace("\r", "")
+    # 去掉 Markdown 的语法包裹
+    if json_string.startswith("```json"):
+        json_string = json_string.strip("`json\n")
+    json_string = json_string.strip('`\n')
+
+    return json_string
+
+def process_item(item, index, total):
+    print(f"处理第 {index+1}/{total} 条数据...")
+    question = item["question"]
+    answer = item["answer"]
+    data = translate_qa_type(question,answer)
+    
+    with result_lock:
+        if isinstance(data, list):
+            processed_data.append({
+                    "idx": item['idx'],
+                    "question": data[0]["question"],
+                    "answer": data[0]["answer"]
+                    })
+        else:
+                error_items.append({
+                "idx": item['idx'],
+                "question": question,
+                "answer": answer
+            })
+
+        
+def save_processed_data(data, output_file):
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
+
+def main():
+    input_file = "/home/ubuntu/50T/fsy/layer2/QA/single_select.json"
+    output_file = "/home/ubuntu/50T/fsy/layer2/QA/EN-single_select.json"
+    error_file = "/home/ubuntu/50T/fsy/error.json"
+    
+    data = load_qa_data(input_file)
+    total = len(data)
+    
+    with ThreadPoolExecutor(max_workers=10) as executor:
+        futures = []
+        for i, item in enumerate(data):
+            future = executor.submit(process_item, item, i, total)
+            futures.append(future)
+            
+            if (i+1) % 10 == 0:
+                time.sleep(1)
+    
+        for future in futures:
+            future.result()
+    
+    save_processed_data(processed_data, output_file)
+    print(f"处理完成，已保存到 {output_file}")
+    
+    if error_items:
+        save_processed_data(error_items, error_file)
+        print(f"处理出错的条目已保存到 {error_file}")
+
+if __name__ == "__main__":
+    main()