modified libreoffice writer eval examples

2024-01-23 22:02:09 +08:00
parent 42725a00a5
commit 35c4ce99ff
14 changed files with 749 additions and 322 deletions
--- a/resouce_collection/Source2Doc/get_Source_Doc.py
+++ b/resouce_collection/Source2Doc/get_Source_Doc.py
@@ -179,31 +179,35 @@ def scrape_webpage_to_markdown(url, doc_filepath):

    # scrape the webpage and perform OCR on images
    for article in articles:
-      for child in article.recursiveChildGenerator():
-          # if this is an image, perform OCR
-          if child.name == 'img':
-              img_url = child.get('src')
-              if not img_url.startswith(('http:', 'https:')):
-                  img_url = '{}{}'.format(url, img_url)
-              if not img_url.endswith('.svg') and not img_url.endswith('.png'):
-                  continue
-              if 'neveragain.allstatics.com/2019/assets/icon/logo' in img_url:
-                  continue
-              img_response = requests.get(img_url, stream=True)
-              img = Image.open(BytesIO(img_response.content))
-              ocr_text = pytesseract.image_to_string(img)
-              if ocr_text.strip():
-                  markdown_content += '\n```plaintext\n{}\n```\n'.format(ocr_text.strip())
-              continue
-          # Not an image, so continue recursively calling function
-          if child.name is None:
-              continue
-          
-          html_str = str(child)
-          markdown_content += md(html_str) + '\n\n'
+        for child in article.recursiveChildGenerator():
+            # if this is an image, perform OCR
+            if child.name == 'img':
+                img_url = child.get('src')
+                if not img_url.startswith(('http:', 'https:')):
+                    img_url = '{}{}'.format(url, img_url)
+                if not img_url.endswith('.svg') and not img_url.endswith('.png'):
+                    continue
+                if 'neveragain.allstatics.com/2019/assets/icon/logo' in img_url:
+                    continue
+                try:
+                    img_response = requests.get(img_url, stream=True)
+                    img = Image.open(BytesIO(img_response.content))
+                    ocr_text = pytesseract.image_to_string(img)
+                    if ocr_text.strip():
+                        markdown_content += '\n```plaintext\n{}\n```\n'.format(ocr_text.strip())
+                    continue
+                except PIL.UnidentifiedImageError:
+                    print("unidentified image")
+                
+            # Not an image, so continue recursively calling function
+            if child.name is None:
+                continue
+            
+            html_str = str(child)
+            markdown_content += md(html_str) + '\n\n'

-    with open(doc_filepath, 'w', encoding='utf-8') as f:
-        f.write(markdown_content)
+        with open(doc_filepath, 'w', encoding='utf-8') as f:
+            f.write(markdown_content)


 # process a URL and save the file