second commit

2025-05-28 10:55:34 +08:00
parent 0f80316f8b
commit ef9355f2f5
73 changed files with 485583 additions and 0 deletions
--- a/layer1/ALL/sciq-process.py
+++ b/layer1/ALL/sciq-process.py
@@ -0,0 +1,53 @@
+import json
+import random
+
+def process_json_file(input_file, output_file, seed=None):
+    """Convert SciQ-style records into shuffled A-D multiple-choice items.
+
+    Each record of the JSON array in ``input_file`` must provide ``question``,
+    ``correct_answer`` and ``distractor1``..``distractor3``. The transformed
+    list is written to ``output_file`` as UTF-8 JSON and also returned.
+
+    Args:
+        input_file: Path of the source JSON file.
+        output_file: Path the transformed JSON list is written to.
+        seed: Optional RNG seed for a reproducible option order. ``None``
+            (default) shuffles with the global ``random`` state as before.
+
+    Returns:
+        The list of transformed question dicts.
+
+    Raises:
+        KeyError: If a record lacks one of the required fields.
+    """
+    prompt = "You MUST include the letter(s) of the correct answer (separated by comma if there are many) within the following tags: [ANSWER] and [/ANSWER]. No explanations and other information. Only return the '[ANSWER]<answer>[/ANSWER]'. We require this because we use automatic parsing."
+    # A private generator keeps a seeded run from reseeding the global RNG.
+    rng = random if seed is None else random.Random(seed)
+    with open(input_file, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+    new_json = []
+    for item in data:
+        options = [item['correct_answer'], item['distractor1'],
+                   item['distractor2'], item['distractor3']]
+        rng.shuffle(options)
+        labels = [chr(ord('A') + i) for i in range(len(options))]
+        # Letter of the slot where the correct answer landed after shuffling.
+        correct_letter = labels[options.index(item['correct_answer'])]
+        new_json.append({
+            "question": item["question"],
+            "choices": {"text": options, "label": labels},
+            "answer": f"[ANSWER]{correct_letter}[/ANSWER]",
+            "prompt": prompt,
+        })
+
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(new_json, f, indent=2, ensure_ascii=False)
+    return new_json
+
+# Example usage: optional argv[1]/argv[2] override the default dataset paths.
+if __name__ == "__main__":
+    import sys
+
+    input_file = sys.argv[1] if len(sys.argv) > 1 else "/home/ubuntu/50T/fsy/benchmark-dataset-third/ALL/sciq-val-mat.json"
+    output_file = sys.argv[2] if len(sys.argv) > 2 else "/home/ubuntu/50T/fsy/benchmark-dataset-third/ALL/15-sciq-val.json"
+    processed_data = process_json_file(input_file, output_file)