# Files: MatBench/layer2/rubbish/4_1.py
# 2025-05-28 11:00:24 +08:00
#
# 37 lines
# 1.4 KiB
# Python

import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def process_json(file_path, output_path='5_type4.json', num_wrong=3):
    """Attach similar-but-wrong answers to every ``type == 4`` item of a JSON file.

    The input JSON is read as a list of dict items (assumed from the
    ``item.get("type")`` / ``item["answer"]`` accesses). The answers of all
    items whose ``"type"`` equals 4 are embedded with TF-IDF, and for each such
    item the answers of the *other* type-4 items are ranked by cosine
    similarity. The best-ranked ones are stored on the item under
    ``"wrong_answers_1"`` .. ``"wrong_answers_<num_wrong>"``.

    A candidate is skipped when it is the item itself, when its text equals
    the item's own (correct) answer, or when the same text has already been
    selected for this item, so distractors are distinct and never equal to
    the correct answer. If fewer than ``num_wrong`` candidates remain, the
    leftover keys are set to ``None`` (JSON ``null``).

    Args:
        file_path: Path of the input JSON file (UTF-8).
        output_path: Path the full, updated item list is written to.
        num_wrong: Number of ``wrong_answers_<k>`` keys to set per item.

    Raises:
        KeyError: If a type-4 item has no ``"answer"`` key.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    type_4_items = [item for item in data if item.get("type") == 4]
    answers = [item["answer"] for item in type_4_items]

    # With zero or one answer there is nothing to compare against, and the
    # vectorizer cannot be fitted on an empty corpus, so skip it entirely.
    cosine_sim = None
    if len(answers) > 1:
        # Represent the answers with TF-IDF and compare every pair of answers.
        tfidf_matrix = TfidfVectorizer().fit_transform(answers)
        cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    for i, item in enumerate(type_4_items):
        wrong_answers = []
        if cosine_sim is not None:
            # Most similar first; the stable sort keeps ties in input order so
            # the output is deterministic.
            ranked = (-cosine_sim[i]).argsort(kind="stable")
            for j in ranked:
                if len(wrong_answers) >= num_wrong:
                    break
                candidate = answers[j]
                # Skip the item itself, duplicates of the correct answer, and
                # distractor texts that were already picked.
                if j == i or candidate == answers[i] or candidate in wrong_answers:
                    continue
                wrong_answers.append(candidate)

        # Keep the output schema stable even when candidates ran out.
        wrong_answers += [None] * (num_wrong - len(wrong_answers))
        for k, wrong in enumerate(wrong_answers, start=1):
            item[f"wrong_answers_{k}"] = wrong

    # The whole list is written back, including items that are not type 4.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
# Script entry: run the distractor generation on the benchmark file. This is a
# plain module-level statement, so it executes whenever the file is run or imported.
process_json(
    file_path='/home/ubuntu/50T/fsy/benchmark/4is_type_with_wrong_answers.json',
)