add get Source 2 Doc Func
resouce_collection/Source2Doc/get_Source_Doc.py (new file, 89 lines added)
@@ -0,0 +1,89 @@
import csv
import os
import re
from io import BytesIO

import requests
import yt_dlp as youtube_dl
from bs4 import BeautifulSoup
from docx import Document
from PIL import Image
import pytesseract
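
# Third-party dependencies, by their PyPI names: yt-dlp, python-docx,
# requests, beautifulsoup4, Pillow, pytesseract. Note that pytesseract
# also requires the Tesseract OCR binary to be installed on the system.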

# convert a .vtt subtitle file into a .docx document
def vtt_to_docx(vtt_filepath, docx_filepath):
    doc = Document()

    # open the .vtt file
    with open(vtt_filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # timecode lines look like "00:00:01.000 --> 00:00:04.000"
    # (dots escaped so the pattern matches only literal periods)
    vtt_text_pattern = re.compile(r'^\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}')

    # deal with each line
    for line in lines:
        # skip timecodes and the WEBVTT header
        if vtt_text_pattern.match(line) or 'WEBVTT' in line:
            continue
        # otherwise, add any non-empty line to the document
        if line.strip():
            doc.add_paragraph(line.strip())

    doc.save(docx_filepath)
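
# Usage sketch (illustrative only; 'sample.vtt' and 'sample.docx' are
# hypothetical filenames, not part of this commit):
#
#   vtt_to_docx('sample.vtt', 'sample.docx')
#
# A file containing "WEBVTT", one timecode line, and the caption
# "Hello world" yields a .docx with a single paragraph, "Hello world".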

# download auto-generated YouTube subtitles and convert them to a .docx file
def download_youtube_subtitles(video_url, doc_filename):
    ydl_opts = {
        'skip_download': True,        # fetch subtitles only, not the video
        'writeautomaticsub': True,    # accept auto-generated captions
        'subtitleslangs': ['en'],
        'outtmpl': f'{doc_filename}.%(ext)s',
        'quiet': True,
    }

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])

    # convert the downloaded .vtt file to a .docx file; the hard-coded
    # /content/ paths assume a Colab-style working directory
    vtt_to_docx(f'/content/{doc_filename}.en.vtt', f'/content/{doc_filename}.docx')
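
# Example call (hypothetical URL and document id, for illustration):
#
#   download_youtube_subtitles('https://www.youtube.com/watch?v=XXXXXXXXXXX', 'doc_001')
#
# With these options yt-dlp is expected to write 'doc_001.en.vtt' into the
# working directory, which vtt_to_docx then turns into 'doc_001.docx'.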

# scrape a forum page and OCR its images into the given document
def scrape_and_ocr_forum(url, doc):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # collect headings, paragraphs, and list items
    text_elements = soup.find_all(['h1', 'h2', 'h3', 'p', 'li'])
    for element in text_elements:
        doc.add_paragraph(element.get_text())

    # OCR every image that has an absolute URL
    image_elements = soup.find_all('img')
    for image in image_elements:
        image_url = image.get('src', '')  # some <img> tags have no src
        if image_url.startswith('http'):
            img_response = requests.get(image_url, stream=True)
            img = Image.open(BytesIO(img_response.content))
            ocr_text = pytesseract.image_to_string(img)
            # only keep OCR output that actually contains text
            if ocr_text.strip():
                doc.add_paragraph(ocr_text)
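
# Usage sketch (the URL is a hypothetical placeholder):
#
#   doc = Document()
#   scrape_and_ocr_forum('https://forum.example.com/thread/123', doc)
#   doc.save('thread.docx')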

# process a single URL: YouTube links go through the subtitle downloader,
# everything else is scraped and OCR'd
def process_url(url, doc_id):
    doc_filepath = f"{doc_id}.docx"
    doc = Document()

    if 'youtube.com' in url or 'youtu.be' in url:
        # download_youtube_subtitles saves its own .docx, so nothing more
        # is written here; saving `doc` as well would clobber that file
        # with an empty document
        download_youtube_subtitles(url, doc_id)
    else:
        scrape_and_ocr_forum(url, doc)
        doc.save(doc_filepath)

# read the csv file and process each row
csv_filepath = './Get_Source_Doc - Sheet1.csv'
with open(csv_filepath, 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        process_url(row['Source'], row['id'])
        print(row)
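
# Expected CSV layout, inferred from the column names used above
# (the data rows are hypothetical examples):
#
#   id,Source
#   doc_001,https://www.youtube.com/watch?v=XXXXXXXXXXX
#   doc_002,https://forum.example.com/thread/123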