add get Source 2 Doc Func
resouce_collection/Source2Doc/get_Source_Doc.py (new file, 89 lines added)
@@ -0,0 +1,89 @@
import csv
import os
import re
from io import BytesIO

import requests
import yt_dlp as youtube_dl
from bs4 import BeautifulSoup
from docx import Document
from PIL import Image
import pytesseract
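
# Third-party dependencies, by their PyPI names: yt-dlp, python-docx,
# requests, beautifulsoup4, Pillow, pytesseract. Note that pytesseract
# also requires the Tesseract OCR binary to be installed on the system.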

# convert a .vtt subtitle file into a .docx document
def vtt_to_docx(vtt_filepath, docx_filepath):
    doc = Document()

    # open the .vtt file
    with open(vtt_filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # timecode lines look like "00:00:01.000 --> 00:00:04.000"
    # (dots escaped so the pattern matches only literal periods)
    vtt_text_pattern = re.compile(r'^\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}')

    # deal with each line
    for line in lines:
        # skip timecodes and the WEBVTT header
        if vtt_text_pattern.match(line) or 'WEBVTT' in line:
            continue
        # otherwise, add any non-empty line to the document
        if line.strip():
            doc.add_paragraph(line.strip())

    doc.save(docx_filepath)
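
# Usage sketch (illustrative only; 'sample.vtt' and 'sample.docx' are
# hypothetical filenames, not part of this commit):
#
#   vtt_to_docx('sample.vtt', 'sample.docx')
#
# A file containing "WEBVTT", one timecode line, and the caption
# "Hello world" yields a .docx with a single paragraph, "Hello world".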

# download auto-generated YouTube subtitles and convert them to a .docx file
def download_youtube_subtitles(video_url, doc_filename):
    ydl_opts = {
        'skip_download': True,        # fetch subtitles only, not the video
        'writeautomaticsub': True,    # accept auto-generated captions
        'subtitleslangs': ['en'],
        'outtmpl': f'{doc_filename}.%(ext)s',
        'quiet': True,
    }

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])

    # convert the downloaded .vtt file to a .docx file; the hard-coded
    # /content/ paths assume a Colab-style working directory
    vtt_to_docx(f'/content/{doc_filename}.en.vtt', f'/content/{doc_filename}.docx')
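
# Example call (hypothetical URL and document id, for illustration):
#
#   download_youtube_subtitles('https://www.youtube.com/watch?v=XXXXXXXXXXX', 'doc_001')
#
# With these options yt-dlp is expected to write 'doc_001.en.vtt' into the
# working directory, which vtt_to_docx then turns into 'doc_001.docx'.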

# scrape a forum page and OCR its images into the given document
def scrape_and_ocr_forum(url, doc):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # collect headings, paragraphs, and list items
    text_elements = soup.find_all(['h1', 'h2', 'h3', 'p', 'li'])
    for element in text_elements:
        doc.add_paragraph(element.get_text())

    # OCR every image that has an absolute URL
    image_elements = soup.find_all('img')
    for image in image_elements:
        image_url = image.get('src', '')  # some <img> tags have no src
        if image_url.startswith('http'):
            img_response = requests.get(image_url, stream=True)
            img = Image.open(BytesIO(img_response.content))
            ocr_text = pytesseract.image_to_string(img)
            # only keep OCR output that actually contains text
            if ocr_text.strip():
                doc.add_paragraph(ocr_text)
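
# Usage sketch (the URL is a hypothetical placeholder):
#
#   doc = Document()
#   scrape_and_ocr_forum('https://forum.example.com/thread/123', doc)
#   doc.save('thread.docx')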

# process a single URL: YouTube links go through the subtitle downloader,
# everything else is scraped and OCR'd
def process_url(url, doc_id):
    doc_filepath = f"{doc_id}.docx"
    doc = Document()

    if 'youtube.com' in url or 'youtu.be' in url:
        # download_youtube_subtitles saves its own .docx, so nothing more
        # is written here; saving `doc` as well would clobber that file
        # with an empty document
        download_youtube_subtitles(url, doc_id)
    else:
        scrape_and_ocr_forum(url, doc)
        doc.save(doc_filepath)

# read the csv file and process each row
csv_filepath = './Get_Source_Doc - Sheet1.csv'
with open(csv_filepath, 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        process_url(row['Source'], row['id'])
        print(row)
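
# Expected CSV layout, inferred from the column names used above
# (the data rows are hypothetical examples):
#
#   id,Source
#   doc_001,https://www.youtube.com/watch?v=XXXXXXXXXXX
#   doc_002,https://forum.example.com/thread/123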