Modified LibreOffice Writer eval examples
This commit is contained in:
@@ -179,31 +179,35 @@ def scrape_webpage_to_markdown(url, doc_filepath):
|
||||
|
||||
# scrape the webpage and perform OCR on images
|
||||
for article in articles:
|
||||
for child in article.recursiveChildGenerator():
|
||||
# if this is an image, perform OCR
|
||||
if child.name == 'img':
|
||||
img_url = child.get('src')
|
||||
if not img_url.startswith(('http:', 'https:')):
|
||||
img_url = '{}{}'.format(url, img_url)
|
||||
if not img_url.endswith('.svg') and not img_url.endswith('.png'):
|
||||
continue
|
||||
if 'neveragain.allstatics.com/2019/assets/icon/logo' in img_url:
|
||||
continue
|
||||
img_response = requests.get(img_url, stream=True)
|
||||
img = Image.open(BytesIO(img_response.content))
|
||||
ocr_text = pytesseract.image_to_string(img)
|
||||
if ocr_text.strip():
|
||||
markdown_content += '\n```plaintext\n{}\n```\n'.format(ocr_text.strip())
|
||||
continue
|
||||
# Not an image, so continue recursively calling function
|
||||
if child.name is None:
|
||||
continue
|
||||
|
||||
html_str = str(child)
|
||||
markdown_content += md(html_str) + '\n\n'
|
||||
for child in article.recursiveChildGenerator():
|
||||
# if this is an image, perform OCR
|
||||
if child.name == 'img':
|
||||
img_url = child.get('src')
|
||||
if not img_url.startswith(('http:', 'https:')):
|
||||
img_url = '{}{}'.format(url, img_url)
|
||||
if not img_url.endswith('.svg') and not img_url.endswith('.png'):
|
||||
continue
|
||||
if 'neveragain.allstatics.com/2019/assets/icon/logo' in img_url:
|
||||
continue
|
||||
try:
|
||||
img_response = requests.get(img_url, stream=True)
|
||||
img = Image.open(BytesIO(img_response.content))
|
||||
ocr_text = pytesseract.image_to_string(img)
|
||||
if ocr_text.strip():
|
||||
markdown_content += '\n```plaintext\n{}\n```\n'.format(ocr_text.strip())
|
||||
continue
|
||||
except PIL.UnidentifiedImageError:
|
||||
print("unidentified image")
|
||||
|
||||
# Not an image, so continue recursively calling function
|
||||
if child.name is None:
|
||||
continue
|
||||
|
||||
html_str = str(child)
|
||||
markdown_content += md(html_str) + '\n\n'
|
||||
|
||||
with open(doc_filepath, 'w', encoding='utf-8') as f:
|
||||
f.write(markdown_content)
|
||||
with open(doc_filepath, 'w', encoding='utf-8') as f:
|
||||
f.write(markdown_content)
|
||||
|
||||
|
||||
# process a URL and save the file
|
||||
|
||||
Reference in New Issue
Block a user