""" Author: Yutang LI Institution: SIAT-MIC Contact: yt.li2@siat.ac.cn """ import datetime import logging import os import httpx import pandas as pd from bs4 import BeautifulSoup from PIL import Image from playwright.async_api import async_playwright from io import StringIO from utils import settings, handle_minio_upload logger = logging.getLogger(__name__) async def fetch_oqmd_data(composition: str) -> str: """从OQMD获取数据""" url = f"https://www.oqmd.org/materials/composition/{composition}" try: async with httpx.AsyncClient(timeout=30.0) as client: response = await client.get(url) response.raise_for_status() # 验证响应内容 if not response.text or len(response.text) < 100: raise ValueError("Invalid response content from OQMD API") return response.text except httpx.HTTPStatusError as e: logger.error(f"OQMD API request failed: {str(e)}") raise except httpx.TimeoutException: logger.error("OQMD API request timed out") raise except httpx.NetworkError as e: logger.error(f"Network error occurred: {str(e)}") raise except ValueError as e: logger.error(f"Invalid response content: {str(e)}") raise def parse_oqmd_html(html: str) -> tuple[list, str, list]: """解析OQMD HTML数据""" soup = BeautifulSoup(html, 'html.parser') # 解析基本数据 basic_data = [] basic_data.append(soup.find('h1').text.strip()) for script in soup.find_all('p'): if script: combined_text = "" for element in script.contents: if element.name == 'a': url = "https://www.oqmd.org" + element['href'] combined_text += f"[{element.text.strip()}]({url}) " else: combined_text += element.text.strip() + " " basic_data.append(combined_text.strip()) # 解析表格数据 table = soup.find('table') if table: df = pd.read_html(StringIO(str(table)))[0] df = df.fillna('') df = df.replace([float('inf'), float('-inf')], '') table_data = df.to_markdown(index=False) # 提取JavaScript数据 phase_data = [] for script in soup.find_all('script'): if script.string and '$(function()' in script.string: phase_data.append({ 'type': script.get('type', 'text/javascript'), 'content': script.string.strip() }) return basic_data, table_data, phase_data async def render_and_save_charts(script_data: list) -> str: """渲染并保存图表到MinIO""" browser = None temp_files = [] try: # 初始化Playwright async with async_playwright() as p: browser = await p.chromium.launch(headless=True) page = await browser.new_page() # 构建包含 JavaScript 的 HTML 代码 html_content = """