"""Filter review papers out of a paper list using an LLM classifier.

Each record's title and abstract are sent to a chat-completion model that is
asked to answer 1 (review paper or report) or 0 (anything else). Records
answered 0 are written to the output file, records answered 1 are discarded,
and records whose classification failed or could not be interpreted are
written to a separate error file so they can be retried.
"""

import json
import os
import queue
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor

from openai import OpenAI

# Labels used to route a record after classification.
LABEL_KEEP = "0"    # not a review paper/report: keep the record
LABEL_REVIEW = "1"  # review paper/report: drop the record
LABEL_ERROR = "2"   # API failure or unreadable reply: send to the error file

DEFAULT_INPUT_FILE = "/home/ubuntu/50T/fsy/top_cited_papers_2015_2024.json"
DEFAULT_OUTPUT_FILE = "/home/ubuntu/50T/fsy/top_cited_paper_mat.json"
DEFAULT_ERROR_FILE = "/home/ubuntu/50T/fsy/paper-error.json"

# Submission throttle: pause after every SUBMIT_BATCH_SIZE submitted tasks.
SUBMIT_BATCH_SIZE = 10
SUBMIT_PAUSE_SECONDS = 1

# Guards the two result lists below, which worker threads append to.
result_lock = threading.Lock()
# Caps the number of API requests in flight at any one time.
api_semaphore = threading.Semaphore(5)

material_items = []
error_items = []

# SECURITY: the API key is read from the OPENAI_API_KEY environment variable
# instead of being committed to the source tree. The key that used to be
# hard-coded here must be treated as leaked and rotated.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="https://vip.apiyi.com/v1",
)


def load_qa_data(file_path):
    """Load and return the JSON document stored at *file_path* (UTF-8)."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def classify_qa_type(abstract, title):
    """Ask the model whether the article is a review paper or report.

    Args:
        abstract: Abstract text of the article.
        title: Title of the article.

    Returns:
        The model's reply, stripped and lower-cased. The prompt asks for
        "1" (review paper/report) or "0" (otherwise), but the reply is
        returned as-is. If the request or reply handling raises, the error
        is printed and the sentinel "2" is returned.
    """
    prompt = f"""
    This is a categorization task. Please analyze the title and abstract of the article entered to determine if it is a review paper or report.
    Strictly return the number 1 if the title and abstract meet the requirements for a review paper or report, or 0 if they do not meet the requirements for a review paper or report. do not provide any other explanation or output, just return the number 1 or 0.
    Article Title: {title}
    Abstract: {abstract}
    """

    with api_semaphore:
        try:
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant"},
                    {"role": "user", "content": prompt},
                ],
                stream=False,
            )
            result = response.choices[0].message.content.strip().lower()
            print(result)
            return result
        except Exception as e:
            # Best-effort boundary: one failed request must not stop the
            # batch, so it is reported and flagged through the sentinel.
            print(f"API调用错误: {e}")
            return LABEL_ERROR


def _normalize_label(raw):
    """Map a raw model reply to LABEL_KEEP, LABEL_REVIEW or LABEL_ERROR.

    A reply counts as "0" or "1" only when that is the sole number it
    contains, so decorated replies such as "0." or "Answer: 1" are accepted
    while ambiguous ones ("10", "0 or 1", "20 papers") are treated as errors
    rather than being matched by substring.
    """
    numbers = set(re.findall(r"\d+", raw or ""))
    if numbers == {LABEL_KEEP}:
        return LABEL_KEEP
    if numbers == {LABEL_REVIEW}:
        return LABEL_REVIEW
    return LABEL_ERROR


def _record_error(item):
    """Flag *item* as failed and append it to the shared error list."""
    with result_lock:
        if isinstance(item, dict):
            item["error"] = "yes"
        error_items.append(item)


def process_item(item, index, total):
    """Classify one record and file it in the matching result list.

    Args:
        item: Record providing the "Abstract" and "Article Title" keys.
        index: Zero-based position of the record, used for progress output.
        total: Total number of records, used for progress output.

    Records labelled 0 are appended to ``material_items``; records labelled
    1 are dropped; records with missing fields, a failed API call or an
    unreadable reply get ``"error": "yes"`` and go to ``error_items``.
    """
    print(f"处理第 {index+1}/{total} 条数据...")

    try:
        abstract = item["Abstract"]
        title = item["Article Title"]
    except KeyError as e:
        # A malformed record is reported and kept for inspection instead of
        # raising inside the worker thread.
        print(f"第 {index+1} 条数据缺少字段: {e}")
        _record_error(item)
        return

    label = _normalize_label(classify_qa_type(abstract, title))

    if label == LABEL_KEEP:
        with result_lock:
            material_items.append(item)
    elif label == LABEL_ERROR:
        _record_error(item)


def save_processed_data(data, output_file):
    """Write *data* to *output_file* as indented, non-ASCII-escaped JSON."""
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def main(input_file=DEFAULT_INPUT_FILE,
         output_file=DEFAULT_OUTPUT_FILE,
         error_file=DEFAULT_ERROR_FILE):
    """Classify every record of *input_file* and save the results.

    Args:
        input_file: JSON file holding the records to classify.
        output_file: Destination for records that are not review papers.
        error_file: Destination for records that could not be classified;
            only written when there is at least one such record.
    """
    data = load_qa_data(input_file)
    total = len(data)

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for i, item in enumerate(data):
            futures.append((executor.submit(process_item, item, i, total), item))

            # Throttle submission so requests are spread out over time.
            if (i + 1) % SUBMIT_BATCH_SIZE == 0:
                time.sleep(SUBMIT_PAUSE_SECONDS)

        for future, item in futures:
            try:
                future.result()
            except Exception as e:
                # result() re-raises whatever the worker raised; letting it
                # propagate would skip the save step and lose every record
                # classified so far.
                print(f"处理条目时出现未预期的错误: {e}")
                _record_error(item)

    save_processed_data(material_items, output_file)
    print(f"处理完成,材料科学相关条目已保存到 {output_file}")

    if error_items:
        save_processed_data(error_items, error_file)
        print(f"处理出错的条目已保存到 {error_file}")


if __name__ == "__main__":
    main()