second commit

2025-05-28 10:55:34 +08:00
parent 0f80316f8b
commit ef9355f2f5
73 changed files with 485583 additions and 0 deletions
--- a/layer1/ALL/ScienceQA-process.py
+++ b/layer1/ALL/ScienceQA-process.py
@@ -0,0 +1,45 @@
+import json
+
+def generate_labels(choice_count):
+    """Return the first ``choice_count`` option labels, starting at 'A'.
+
+    Labels are consecutive code points after 'A' ('A', 'B', 'C', ...); a
+    count of zero or less yields an empty list.
+    """
+    first = ord('A')
+    return list(map(chr, range(first, first + choice_count)))
+
+def convert_answer_to_letter(answer):
+    """Convert a numeric answer index into its letter label.
+
+    Index 0 maps to 'A', 1 to 'B', and so on, by offsetting from the code
+    point of 'A'.
+    """
+    code_point = ord('A') + answer
+    letter = chr(code_point)
+    return letter
+
+def transform_json(file_path):
+    """Load a JSON file of questions and reshape each record.
+
+    The file must contain a JSON object; only its values are used. Each value
+    must provide the keys "question", "choices" and "answer", where "answer"
+    is a numeric index that is converted to a letter (0 -> 'A').
+
+    Returns a list with one dict per record, each holding:
+      - "question": the original question,
+      - "choices": {"text": original choices, "label": generated letters},
+      - "answer": the answer letter wrapped in [ANSWER]...[/ANSWER],
+      - "prompt": the fixed answering instruction.
+    """
+    with open(file_path, 'r', encoding='utf-8') as source:
+        records = json.load(source)
+
+    # The same instruction text is attached to every output record.
+    instruction = "You MUST include the letter(s) of the correct answer (separated by comma if there are many) within the following tags: [ANSWER] and [/ANSWER]. No explanations and other information. Only return the '[ANSWER]<answer>[/ANSWER]'. We require this because we use automatic parsing."
+
+    def reshape(record):
+        # Build one output record from one input record.
+        question_text = record["question"]
+        options = record["choices"]
+        answer_position = record["answer"]
+
+        labelled_options = {
+            "text": options,
+            "label": generate_labels(len(options)),
+        }
+        answer_letter = convert_answer_to_letter(answer_position)
+
+        return {
+            "question": question_text,
+            "choices": labelled_options,
+            "answer": f"[ANSWER]{answer_letter}[/ANSWER]",
+            "prompt": instruction,
+        }
+
+    # The keys of the outer object are not needed, only the records.
+    return [reshape(record) for record in records.values()]
+
+# Source dataset and destination of the reformatted file.
+input_path = '/home/ubuntu/50T/fsy/benchmark-dataset-third/ALL/ScienceQA-mat-noimage.json'
+output_path = '/home/ubuntu/50T/fsy/benchmark-dataset-third/ALL/11-ScienceQA.json'
+
+transformed_data = transform_json(input_path)
+
+# ensure_ascii=False writes non-ASCII characters as-is rather than as \u escapes.
+with open(output_path, 'w', encoding='utf-8') as f:
+    f.write(json.dumps(transformed_data, ensure_ascii=False, indent=2))