99 lines
3.3 KiB
Python
99 lines
3.3 KiB
Python
# Filter out review-type papers, keeping only the non-review articles.
|
|
import json
import os
import queue
import threading
import time
from concurrent.futures import ThreadPoolExecutor

from openai import OpenAI
|
|
|
|
# Result accumulators shared by all worker threads. Worker code appends to
# them while holding ``result_lock``.
material_items: list = []  # items the classifier labelled as non-review
error_items: list = []  # items whose classification failed

# Serialises updates to the two result lists above.
result_lock = threading.Lock()

# Caps the number of classification requests in flight at any one time.
api_semaphore = threading.Semaphore(5)
|
|
|
|
# Shared OpenAI-compatible client used by every worker thread.
#
# The API key is read from the OPENAI_API_KEY environment variable rather than
# being committed to the source file. Without a key the OpenAI client refuses
# to construct, so the script fails immediately instead of part-way through.
# NOTE(review): the key that used to be hard-coded here has been exposed in
# the source history and should be revoked/rotated.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="https://vip.apiyi.com/v1",
)
|
|
|
|
def load_qa_data(file_path):
    """Parse the JSON document stored at *file_path* and return its value.

    The file is read as UTF-8. Errors raised by ``open`` or ``json.load``
    propagate to the caller unchanged.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)
|
|
|
|
# Label returned when the API call fails or the reply cannot be interpreted.
_ERROR_LABEL = "2"

# Decoration models commonly wrap around a bare digit ("1.", "**0**", '"1"').
_LABEL_DECORATION = " \t\r\n.`*'\"()[]"


def _normalize_label(raw):
    """Map a raw model reply onto "0", "1", or the error label "2"."""
    if raw is None:
        return _ERROR_LABEL
    cleaned = raw.strip().lower().strip(_LABEL_DECORATION)
    return cleaned if cleaned in ("0", "1") else _ERROR_LABEL


def classify_qa_type(abstract, title):
    """Ask the chat model whether a paper is a review paper or report.

    Args:
        abstract: Abstract text of the article.
        title: Title of the article.

    Returns:
        "1" if the model says the article is a review paper or report,
        "0" if it says it is not, and "2" if the request failed or the reply
        was not a bare 0/1. The reply is normalised so that free-form model
        output can never be mistaken for one of the three labels.
    """
    prompt = (
        "\n"
        "This is a categorization task. Please analyze the title and abstract of the "
        "article entered to determine if it is a review paper or report. Strictly "
        "return the number 1 if the title and abstract meet the requirements for a "
        "review paper or report, or 0 if they do not meet the requirements for a "
        "review paper or report. do not provide any other explanation or output, "
        "just return the number 1 or 0.\n"
        "\n"
        "Article Title:\n"
        f"{title}\n"
        "\n"
        "Abstract:\n"
        f"{abstract}\n"
    )

    # The semaphore bounds how many requests are in flight at once.
    with api_semaphore:
        try:
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant"},
                    {"role": "user", "content": prompt},
                ],
                stream=False,
            )
            reply = response.choices[0].message.content
        except Exception as e:
            # Deliberately broad: any failure of the remote call is reported
            # through the error label so one bad request cannot kill a worker.
            print(f"API调用错误: {e}")
            return _ERROR_LABEL

    label = _normalize_label(reply)
    if label == _ERROR_LABEL:
        # Show the raw reply so unparseable answers can be diagnosed.
        print(f"无法解析的模型回复: {reply!r}")
    else:
        print(label)
    return label
|
|
|
|
def process_item(item, index, total):
    """Classify one paper and file it into the shared result lists.

    Args:
        item: Mapping with at least the keys "Abstract" and "Article Title".
        index: Zero-based position of the item, used only for progress output.
        total: Total number of items, used only for progress output.

    Side effects:
        Appends *item* to ``material_items`` when the label is exactly "0"
        (not a review). A label of exactly "1" (review) drops the item. Any
        other label -- the "2" error sentinel or an unexpected reply -- marks
        the item with ``item["error"] = "yes"`` and appends it to
        ``error_items``, so nothing is discarded silently.

    Raises:
        KeyError: If *item* lacks "Abstract" or "Article Title".
    """
    print(f"处理第 {index+1}/{total} 条数据...")
    abstract = item["Abstract"]
    title = item["Article Title"]
    label = classify_qa_type(abstract, title).strip()

    with result_lock:
        # Exact comparison: a substring test would treat replies such as
        # "10" or "0 or 1" as a valid "0".
        if label == "0":
            material_items.append(item)
        elif label != "1":
            item["error"] = "yes"
            error_items.append(item)
|
|
|
|
def save_processed_data(data, output_file):
    """Write *data* to *output_file* as JSON, replacing any existing file.

    The output is UTF-8 encoded, indented by two spaces, and keeps non-ASCII
    characters as-is instead of escaping them.
    """
    with open(output_file, mode="w", encoding="utf-8") as sink:
        json.dump(data, sink, indent=2, ensure_ascii=False)
|
|
|
|
def main(
    input_file="/home/ubuntu/50T/fsy/top_cited_papers_2015_2024.json",
    output_file="/home/ubuntu/50T/fsy/top_cited_paper_mat.json",
    error_file="/home/ubuntu/50T/fsy/paper-error.json",
):
    """Classify every paper in *input_file* and save the filtered results.

    Args:
        input_file: JSON file holding the papers to classify.
        output_file: Destination for the items collected in ``material_items``.
        error_file: Destination for the items collected in ``error_items``;
            written only when at least one item failed.

    The defaults reproduce the paths the script originally used, so calling
    ``main()`` without arguments behaves as before.
    """
    data = load_qa_data(input_file)
    total = len(data)

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for i, item in enumerate(data):
            futures.append(executor.submit(process_item, item, i, total))

            # Pace submissions: pause for a second after every ten tasks.
            if (i + 1) % 10 == 0:
                time.sleep(1)

        for i, (item, future) in enumerate(zip(data, futures)):
            try:
                future.result()
            except Exception as exc:
                # A single malformed item must not abort the run and discard
                # every result gathered so far; record it and carry on.
                print(f"第 {i + 1} 条数据处理失败: {exc}")
                if isinstance(item, dict):
                    item["error"] = "yes"
                with result_lock:
                    error_items.append(item)

    save_processed_data(material_items, output_file)
    print(f"处理完成,材料科学相关条目已保存到 {output_file}")

    if error_items:
        save_processed_data(error_items, error_file)
        print(f"处理出错的条目已保存到 {error_file}")


if __name__ == "__main__":
    main()