import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Number of distractors stored per item (keys wrong_answers_1 .. wrong_answers_N).
WRONG_ANSWER_COUNT = 3


def _pick_distractors(similarities, answers, index, count):
    """Return up to *count* distinct wrong answers for ``answers[index]``.

    Candidates are visited from most to least similar. A candidate is
    skipped when it is the item itself, when its text equals the item's own
    answer (it would not be a *wrong* answer), or when the same text has
    already been chosen.

    Args:
        similarities: One row of the pairwise cosine-similarity matrix.
        answers: Answer texts, aligned with ``similarities``.
        index: Position of the item the distractors are selected for.
        count: Maximum number of distractors to return.

    Returns:
        A list of at most ``count`` answer texts, most similar first.
    """
    own_answer = answers[index]

    # Work on a copy so the caller's matrix is left intact; pushing the
    # item's own score to -1 sorts it behind every real candidate.
    scores = similarities.copy()
    scores[index] = -1

    chosen = []
    # argsort is ascending, so the reversed order is most-similar first.
    for candidate in scores.argsort()[::-1]:
        if len(chosen) >= count:
            break
        text = answers[candidate]
        if candidate == index or text == own_answer or text in chosen:
            continue
        chosen.append(text)
    return chosen


def process_json(file_path, output_path='5_type4.json'):
    """Attach similar-but-wrong answers to every type-4 item of a JSON file.

    The file at ``file_path`` is parsed with ``json.load`` and iterated as a
    sequence of dict-like records. For every record whose ``"type"`` equals
    4, the answers of the other type-4 records are ranked by TF-IDF cosine
    similarity, and the closest ones that differ from the record's own
    answer are stored under ``"wrong_answers_1"``, ``"wrong_answers_2"`` and
    ``"wrong_answers_3"``. When fewer than three suitable candidates exist,
    only the available keys are written and the remaining ones are left
    untouched. The complete data set, including records of other types, is
    then written to ``output_path``.

    Args:
        file_path: Path of the input JSON file (read as UTF-8).
        output_path: Path of the JSON file to write. Defaults to
            ``'5_type4.json'`` in the current working directory.

    Raises:
        KeyError: If a type-4 record has no ``"answer"`` key.
        ValueError: Propagated from ``TfidfVectorizer`` when it cannot
            build a vocabulary from the answers.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    type_4_items = [item for item in data if item.get("type") == 4]
    answers = [item["answer"] for item in type_4_items]

    # With fewer than two type-4 answers there is nothing to compare, and
    # fitting the vectorizer on an empty list would raise.
    if len(answers) >= 2:
        # Represent the answers as TF-IDF vectors.
        # NOTE(review): the default tokenizer splits on word boundaries; if
        # the answers are unsegmented CJK text a character analyzer may
        # rank candidates better - confirm against the data.
        tfidf_matrix = TfidfVectorizer().fit_transform(answers)

        # Pairwise cosine similarity between all answers.
        cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

        for i, item in enumerate(type_4_items):
            distractors = _pick_distractors(
                cosine_sim[i], answers, i, WRONG_ANSWER_COUNT
            )
            # Store the wrong answers, most similar first.
            for rank, text in enumerate(distractors, start=1):
                item[f"wrong_answers_{rank}"] = text

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    process_json('/home/ubuntu/50T/fsy/benchmark/4is_type_with_wrong_answers.json')