37 lines
1.4 KiB
Python
37 lines
1.4 KiB
Python
import json
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
def process_json(file_path, output_path='5_type4.json', num_wrong=3):
    """Attach similar answers from other type-4 items as wrong-answer options.

    Loads a JSON array from ``file_path``, selects the entries whose
    ``"type"`` equals 4, and ranks every other type-4 answer by TF-IDF cosine
    similarity to each entry's own ``"answer"``. The closest candidates are
    stored on the entry as ``"wrong_answers_1"`` .. ``"wrong_answers_<num_wrong>"``
    (most similar first). The whole, updated array is then written to
    ``output_path`` as UTF-8 JSON.

    Args:
        file_path: Path of the input JSON file (a list of dicts).
        output_path: Path of the JSON file to write. Defaults to
            ``'5_type4.json'``.
        num_wrong: Maximum number of wrong answers stored per entry.
            Defaults to 3.

    A candidate is skipped when it is the entry itself, when its text equals
    the entry's own answer (it would not be *wrong*), or when the same text
    was already chosen for that entry. If fewer than ``num_wrong`` candidates
    remain, only the available ``wrong_answers_<k>`` keys are set.

    Raises:
        KeyError: If a type-4 entry has no ``"answer"`` key.
        ValueError: Propagated from ``TfidfVectorizer`` when it cannot build
            a vocabulary from the answers (presumably when every answer is
            empty or has no usable token -- confirm against scikit-learn).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    type_4_items = [item for item in data if item.get("type") == 4]
    answers = [item["answer"] for item in type_4_items]

    # A wrong answer must come from a different entry, so with fewer than two
    # entries there is nothing to rank. Skipping here also avoids fitting
    # TF-IDF on an empty corpus.
    if len(type_4_items) > 1 and num_wrong > 0:
        # Represent every answer as a TF-IDF vector.
        # NOTE(review): the default tokenizer is word based; if the answers
        # are unsegmented Chinese text the similarity may be coarse -- confirm.
        tfidf_matrix = TfidfVectorizer().fit_transform(answers)

        # Pairwise cosine similarity between all answers.
        cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

        for i, item in enumerate(type_4_items):
            own_answer = answers[i]
            chosen = []

            # Walk the candidates from most to least similar.
            for j in cosine_sim[i].argsort()[::-1]:
                candidate = answers[j]
                if j == i or candidate == own_answer or candidate in chosen:
                    continue
                chosen.append(candidate)
                if len(chosen) == num_wrong:
                    break

            # Store the wrong answers, most similar first.
            for rank, wrong_answer in enumerate(chosen, start=1):
                item[f"wrong_answers_{rank}"] = wrong_answer

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
|
|
|
|
# Top-level statement: runs the wrong-answer generation on the benchmark file
# whenever this module is executed.
process_json(file_path='/home/ubuntu/50T/fsy/benchmark/4is_type_with_wrong_answers.json')
|