# MatBench/layer3/IsMaterialSci.py

# Filter out review-type papers (reviews and reports).
import json
import os
import queue
import threading
import time
from concurrent.futures import ThreadPoolExecutor

from openai import OpenAI

# Serializes appends to the shared result lists from worker threads.
result_lock = threading.Lock()
# Caps the number of API requests that may be in flight at the same time.
api_semaphore = threading.Semaphore(5)
# Records kept as non-review papers.
material_items = []
# Records whose classification failed; they are flagged with item["error"].
error_items = []

# SECURITY: credentials must not be committed to source control. The key is
# read from the OPENAI_API_KEY environment variable; any key that was
# previously hard-coded in this file should be considered leaked and revoked.
# The endpoint can be overridden with OPENAI_BASE_URL and otherwise keeps the
# proxy URL this script has always used.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url=os.environ.get("OPENAI_BASE_URL", "https://vip.apiyi.com/v1"),
)

def load_qa_data(file_path):
    """Read the UTF-8 JSON file at *file_path* and return its parsed content."""
    with open(file_path, encoding='utf-8') as source:
        return json.load(source)

def classify_qa_type(abstract,title):
    """Ask the chat model whether a paper is a review/report.

    The title and abstract are embedded in a prompt that instructs the model
    to answer with "1" (review paper or report) or "0" (anything else).

    Returns:
        The model's reply, stripped and lower-cased, or the sentinel "2"
        when the request or the handling of its response raised an exception.
    """
    user_message = f"""
            This is a categorization task. Please analyze the title and abstract of the article entered to determine if it is a review paper or report. Strictly return the number 1 if the title and abstract meet the requirements for a review paper or report, or 0 if they do not meet the requirements for a review paper or report. do not provide any other explanation or output, just return the number 1 or 0.

            Article Title:
            {title}

            Abstract:
            {abstract}
            """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": user_message},
    ]

    # The semaphore is held for the whole request so that the number of
    # concurrent API calls stays bounded.
    with api_semaphore:
        try:
            completion = client.chat.completions.create(
                model="deepseek-chat",
                messages=conversation,
                stream=False,
            )
            verdict = completion.choices[0].message.content.strip().lower()
            print(verdict)
        except Exception as exc:
            print(f"API调用错误: {exc}")
            verdict = "2"
    return verdict

def process_item(item, index, total):
    """Classify one paper record and file it into the shared result lists.

    Args:
        item: Record providing the "Abstract" and "Article Title" fields.
        index: Zero-based position of the record, used for progress output.
        total: Total number of records, used for progress output.

    Side effects:
        Appends ``item`` to ``material_items`` when the classifier answers
        "0" (not a review). Records answered "1" (review) are dropped.
        Every other outcome -- the "2" failure sentinel, an unrecognised
        reply, or a record missing a required field -- is flagged with
        ``item["error"] = "yes"`` and appended to ``error_items`` so it is
        reported instead of being silently lost or misfiled.
    """
    print(f"处理第 {index+1}/{total} 条数据...")
    abstract = item.get("Abstract")
    title = item.get("Article Title")

    if abstract is None or title is None:
        # Nothing meaningful to classify; treat it like a failed request
        # instead of raising and aborting the whole run.
        label = "2"
    else:
        label = classify_qa_type(abstract, title)

    # Compare the whole reply rather than searching for a digit inside it:
    # a substring test would read replies such as "10" or "1.0" as "0".
    # Surrounding quotes and a trailing period are tolerated.
    verdict = str(label).strip().strip("\"'`.")

    with result_lock:
        if verdict == "0":
            material_items.append(item)
        elif verdict != "1":
            item["error"] = "yes"
            error_items.append(item)

def save_processed_data(data, output_file):
    """Write *data* to *output_file* as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped, and the file
    is opened in write mode, so existing content is replaced.
    """
    dump_options = {"ensure_ascii": False, "indent": 2}
    with open(output_file, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, **dump_options)

def main(
    input_file="/home/ubuntu/50T/fsy/top_cited_papers_2015_2024.json",
    output_file="/home/ubuntu/50T/fsy/top_cited_paper_mat.json",
    error_file="/home/ubuntu/50T/fsy/paper-error.json",
):
    """Classify every record of the input file and save the results.

    Args:
        input_file: JSON file holding the records to classify.
        output_file: Destination for the records kept as non-review papers.
        error_file: Destination for records that could not be classified;
            only written when there is at least one such record.

    Records are processed concurrently by a thread pool. A worker that
    raises no longer aborts the run: the failure is printed, the record is
    added to ``error_items``, and the remaining results are still saved.
    """
    data = load_qa_data(input_file)
    total = len(data)

    with ThreadPoolExecutor(max_workers=10) as executor:
        submitted = []
        for i, item in enumerate(data):
            submitted.append((executor.submit(process_item, item, i, total), item))

            # Pause briefly after every 10 submissions to pace the requests.
            if (i + 1) % 10 == 0:
                time.sleep(1)

        for future, item in submitted:
            try:
                future.result()
            except Exception as e:
                print(f"处理条目时出错: {e}")
                # Wrap non-dict records so the error flag can be attached.
                record = item if isinstance(item, dict) else {"item": item}
                with result_lock:
                    record["error"] = "yes"
                    error_items.append(record)

    save_processed_data(material_items, output_file)
    print(f"处理完成，材料科学相关条目已保存到 {output_file}")

    if error_items:
        save_processed_data(error_items, error_file)
        print(f"处理出错的条目已保存到 {error_file}")

# Run the classification pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()