sci-gui-agent-benchmark/resouce_collection/Source2Doc/get_Source_Doc.py
import csv
import os
import re
from io import BytesIO

import requests
import pytesseract
import yt_dlp as youtube_dl
from bs4 import BeautifulSoup
from docx import Document
from PIL import Image
# convert a .vtt subtitle file to a .docx file
def vtt_to_docx(vtt_filepath, docx_filepath):
    doc = Document()
    # read the .vtt file
    with open(vtt_filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    # regex that matches a WEBVTT timecode line, e.g. "00:00:01.000 --> 00:00:04.000"
    vtt_text_pattern = re.compile(r'^\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}')
    # process each line
    for line in lines:
        # skip timecode lines and the WEBVTT header
        if vtt_text_pattern.match(line) or 'WEBVTT' in line:
            continue
        # otherwise, add the caption text to the document
        if line.strip():
            doc.add_paragraph(line.strip())
    doc.save(docx_filepath)
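
# For reference, a typical .vtt file the function above handles looks like this
# (illustrative snippet, not taken from a real download):
#
#   WEBVTT
#
#   00:00:01.000 --> 00:00:04.000
#   welcome to this tutorial
#
# Only the caption text is kept; the WEBVTT header and timecode lines are skipped.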
# download YouTube auto-generated subtitles and convert them to a .docx file
def download_youtube_subtitles(video_url, doc_filename):
    ydl_opts = {
        'skip_download': True,       # fetch subtitles only, not the video
        'writeautomaticsub': True,   # include auto-generated captions
        'subtitleslangs': ['en'],
        'outtmpl': f'{doc_filename}.%(ext)s',
        'quiet': True,
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])
    # convert the downloaded .vtt file to .docx (paths assume the Colab /content working directory)
    vtt_to_docx(f'/content/{doc_filename}.en.vtt', f'/content/{doc_filename}.docx')
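
# The '/content/' paths above assume a Colab working directory. A minimal sketch of a
# location-independent variant (hypothetical helper, not part of the original script):
def download_youtube_subtitles_to(video_url, doc_filename, out_dir='.'):
    ydl_opts = {
        'skip_download': True,
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'outtmpl': os.path.join(out_dir, f'{doc_filename}.%(ext)s'),
        'quiet': True,
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])
    vtt_to_docx(os.path.join(out_dir, f'{doc_filename}.en.vtt'),
                os.path.join(out_dir, f'{doc_filename}.docx'))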
# scrape a forum page and OCR its images into the document
def scrape_and_ocr_forum(url, doc):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # add the page's headings, paragraphs, and list items as text
    text_elements = soup.find_all(['h1', 'h2', 'h3', 'p', 'li'])
    for element in text_elements:
        doc.add_paragraph(element.get_text())
    # OCR every image that has an absolute URL
    image_elements = soup.find_all('img')
    for image in image_elements:
        image_url = image['src']
        if image_url.startswith('http'):
            img_response = requests.get(image_url, stream=True)
            img = Image.open(BytesIO(img_response.content))
            ocr_text = pytesseract.image_to_string(img)
            # only add a paragraph when OCR actually found text
            if ocr_text.strip():
                doc.add_paragraph(ocr_text)
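
# The scraper above only OCRs images whose 'src' is already an absolute URL. A minimal
# sketch of resolving relative URLs as well, using the standard-library urljoin
# (hypothetical helper, not part of the original script):
def resolve_image_url(page_url, src):
    from urllib.parse import urljoin
    # turn a relative 'src' (e.g. "/uploads/fig1.png") into an absolute URL
    return urljoin(page_url, src)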
# process a single URL: YouTube links go through subtitle download, everything else is scraped
def process_url(url, doc_id):
    doc_filepath = f"{doc_id}.docx"
    if 'youtube.com' in url or 'youtu.be' in url:
        # download_youtube_subtitles saves its own .docx, so don't overwrite it here
        download_youtube_subtitles(url, doc_id)
    else:
        doc = Document()
        scrape_and_ocr_forum(url, doc)
        doc.save(doc_filepath)
# read the CSV file and process each row
csv_filepath = './Get_Source_Doc - Sheet1.csv'
with open(csv_filepath, 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        process_url(row['Source'], row['id'])
        print(row)
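
# The CSV is expected to have at least the two columns read above; for example
# (illustrative values, not the real sheet):
#
#   id,Source
#   1,https://www.youtube.com/watch?v=XXXXXXXXXXX
#   2,https://forum.example.com/some-thread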