import csv
import re
from io import BytesIO

import requests
import yt_dlp as youtube_dl
from bs4 import BeautifulSoup
from docx import Document
from PIL import Image
import pytesseract


# convert a .vtt subtitle file to a .docx file
def vtt_to_docx(vtt_filepath, docx_filepath):
    doc = Document()
    # open the .vtt file
    with open(vtt_filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    # regex matching a VTT timecode line, e.g. "00:00:01.000 --> 00:00:04.000"
    vtt_timecode_pattern = re.compile(
        r'^\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}'
    )
    for line in lines:
        # skip timecodes and the WEBVTT header
        if vtt_timecode_pattern.match(line) or 'WEBVTT' in line:
            continue
        # keep non-empty caption text
        if line.strip():
            doc.add_paragraph(line.strip())
    doc.save(docx_filepath)


# download YouTube subtitles and convert them to a .docx file
def download_youtube_subtitles(video_url, doc_filename):
    ydl_opts = {
        'skip_download': True,       # subtitles only, no video
        'writeautomaticsub': True,   # accept auto-generated captions
        'subtitleslangs': ['en'],
        'outtmpl': f'{doc_filename}.%(ext)s',
        'quiet': True,
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])
    # convert the downloaded .vtt file to a .docx file
    # (paths assume a Colab /content working directory)
    vtt_to_docx(f'/content/{doc_filename}.en.vtt', f'/content/{doc_filename}.docx')


# scrape a forum page's text and OCR its images into the given document
def scrape_and_ocr_forum(url, doc):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # collect headings, paragraphs, and list items
    text_elements = soup.find_all(['h1', 'h2', 'h3', 'p', 'li'])
    for element in text_elements:
        doc.add_paragraph(element.get_text())
    # OCR every absolutely-addressed image on the page
    image_elements = soup.find_all('img')
    for image in image_elements:
        image_url = image.get('src', '')
        if image_url.startswith('http'):
            img_response = requests.get(image_url, stream=True)
            img = Image.open(BytesIO(img_response.content))
            ocr_text = pytesseract.image_to_string(img)
            # only add the OCR result if it actually contains text
            if ocr_text.strip():
                doc.add_paragraph(ocr_text)


# process a URL: YouTube links get subtitle extraction, everything else gets scraped
def process_url(url, doc_id):
    if 'youtube.com' in url or 'youtu.be' in url:
        # download_youtube_subtitles saves its own .docx via vtt_to_docx,
        # so we must not overwrite it with an empty document here
        download_youtube_subtitles(url, doc_id)
    else:
        doc = Document()
        scrape_and_ocr_forum(url, doc)
        doc.save(f'{doc_id}.docx')


# read the CSV file and process each row
csv_filepath = './Get_Source_Doc - Sheet1.csv'
with open(csv_filepath, 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        process_url(row['Source'], row['id'])
        print(row)
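
# A minimal sketch of the CSV layout the loop above expects. This is an
# assumption inferred from the 'id' and 'Source' column lookups in the code,
# not a confirmed spec; the actual sheet may have additional columns.
#
#   id,Source
#   1,https://www.youtube.com/watch?v=VIDEO_ID
#   2,https://forum.example.com/thread/123
#
# Each row produces one .docx named "<id>.docx": YouTube rows via subtitle
# download and conversion, other rows via scraping and OCR.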