modified libreoffice writer eval examples

This commit is contained in:
tsuky_chen
2024-01-23 22:02:09 +08:00
parent 42725a00a5
commit 35c4ce99ff
14 changed files with 749 additions and 322 deletions

View File

@@ -1,7 +1,7 @@
from .chrome import is_expected_tabs, is_expected_bookmarks, compare_pdfs, is_cookie_deleted, is_shortcut_on_desktop
from .docs import compare_font_names, compare_subscript_contains, has_page_numbers_in_footers
from .docs import compare_font_names, compare_subscript_contains, has_page_numbers_in_footers, compare_docx_lines
from .docs import find_default_font, contains_page_break, compare_docx_files, compare_docx_tables, compare_line_spacing, \
compare_insert_equation
compare_insert_equation, compare_highlighted_text
from .docs import is_first_line_centered, check_file_exists, compare_contains_image
from .docs import evaluate_colored_words_in_tables, check_highlighted_words, evaluate_strike_through_last_paragraph, \
evaluate_conversion, evaluate_spacing, check_italic_font_size_14, evaluate_alignment, get_unique_train_ids, \

View File

@@ -357,3 +357,31 @@ def check_no_duplicates(initial_file, processed_file):
# No duplicates found and at least one valid line was processed
return 1
def compare_docx_lines(file1, file2):
    """Check whether two .docx documents contain the same non-empty lines.

    Every paragraph's text is stripped of surrounding whitespace and blank
    paragraphs are ignored. The comparison ignores line order but respects
    how many times each line occurs, so a document that still contains
    repeated lines does not match one from which they were removed.

    Args:
        file1: First document, handed to ``Document``.
        file2: Second document, handed to ``Document``.

    Returns:
        bool: True when both documents hold the same lines with the same
        multiplicities, False otherwise.
    """
    def read_lines(source):
        stripped = (paragraph.text.strip() for paragraph in Document(source).paragraphs)
        return [text for text in stripped if text]

    # Sorting (rather than building sets) keeps the check order-insensitive
    # without silently collapsing duplicated lines.
    return sorted(read_lines(file1)) == sorted(read_lines(file2))
def compare_highlighted_text(file1, file2):
    """Check whether two .docx documents highlight the same pieces of text.

    A run counts as highlighted when ``run.font.highlight_color`` is truthy.
    The stripped text of every such run is collected per document and the
    two collections are compared as sets, so order and repetition of the
    highlighted fragments do not matter.

    Args:
        file1: First document, handed to ``Document``.
        file2: Second document, handed to ``Document``.

    Returns:
        bool: True when both documents yield the same set of highlighted
        run texts.
    """
    def highlighted_fragments(source):
        # Walk every run of every paragraph and keep the highlighted ones.
        return {
            run.text.strip()
            for paragraph in Document(source).paragraphs
            for run in paragraph.runs
            if run.font.highlight_color
        }

    first = highlighted_fragments(file1)
    second = highlighted_fragments(file2)
    return first == second

View File

@@ -1,42 +1,42 @@
{
"id": "0a0faba3-5580-44df-965d-f562a99b291c",
"snapshot": "libreoffice_writer",
"instruction": "I would like to make the first three words of the sentence left-aligned and the rest right-aligned. I basically want to have some empty space in the middle to add some photos. Assume that every sentence will have at least three words. Could you help me on alignment for me?",
"source": "https://stackoverflow.com/questions/64528055/how-to-make-part-of-my-sentence-left-aligned-and-rest-as-right-aligned",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.google.com/uc?id=1Wrjxsf184Go70TcRGM4Tohczh29Q9B_U&export=download",
"path": "Desktop/04 CHIN9505 EBook Purchasing info 2021 Jan.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/04 CHIN9505 EBook Purchasing info 2021 Jan.docx"
}
"id": "0a0faba3-5580-44df-965d-f562a99b291c",
"snapshot": "libreoffice_writer",
"instruction": "I would like to make the first three words of the sentence left-aligned and the rest right-aligned. I basically want to have some empty space in the middle to add some photos. Assume that every sentence will have at least three words. Could you help me on alignment for me?",
"source": "https://stackoverflow.com/questions/64528055/how-to-make-part-of-my-sentence-left-aligned-and-rest-as-right-aligned",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.google.com/uc?id=1Wrjxsf184Go70TcRGM4Tohczh29Q9B_U&export=download",
"path": "Desktop/04 CHIN9505 EBook Purchasing info 2021 Jan.docx"
}
]
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": {
"func": "evaluate_alignment",
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1yyHGj8KUHDMsZmc1QeJ1KkvSEGy83jMR&export=download",
"dest": "04 CHIN9505 EBook Purchasing info 2021 Jan_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/04 CHIN9505 EBook Purchasing info 2021 Jan.docx",
"dest": "04 CHIN9505 EBook Purchasing info 2021 Jan.docx"
},
{
"type": "open",
"parameters": {
"path": "Desktop/04 CHIN9505 EBook Purchasing info 2021 Jan.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": {
"func": "compare_docx_files",
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1yyHGj8KUHDMsZmc1QeJ1KkvSEGy83jMR&export=download",
"dest": "04 CHIN9505 EBook Purchasing info 2021 Jan_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/04 CHIN9505 EBook Purchasing info 2021 Jan.docx",
"dest": "04 CHIN9505 EBook Purchasing info 2021 Jan.docx"
}
}
}

View File

@@ -0,0 +1,45 @@
{
"id": "41c621f7-3544-49e1-af8d-dafd0f834f75",
"snapshot": "libreoffice_writer",
"instruction": "I am adding comments which begin with \"#\" right beside sentences my students have written. I want to make my comments to stand out more, so I am highlighting my comments to yellow. Could you help me on this? It is hard for me to color comments one by one.",
"source": "https://superuser.com/questions/1668018/how-to-auto-format-lines-in-libre-office-writer",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1WsEBs6RS-j6lvx_DFk5xTK-HdEkIDcPy&export=download&authuser=0&confirm=t&uuid=aa012ca0-f651-474f-bd84-6b86d5260817&at=APZUnTU3RImLCGSgOkUpUGKyYCrB:1706018377717",
"path": "Desktop/How_to_read_a_scientific_article.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "How_to_read_a_scientific_article.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": {
"func": [
"compare_highlighted_text",
"compare_docx_files"
],
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1x2gplRDN1AOnRWiqfqmJqN9JTQ_x4sPu&export=download&authuser=0&confirm=t&uuid=ea6b0a61-fd06-4823-b253-05473c77c192&at=APZUnTUxWWFB_TOKLR9chabErm7b:1706018376493",
"dest": "How_to_read_a_scientific_article_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/How_to_read_a_scientific_article.docx",
"dest": "How_to_read_a_scientific_article.docx"
}
}
}

View File

@@ -1,42 +1,45 @@
{
"id": "6a33f9b9-0a56-4844-9c3f-96ec3ffb3ba2",
"snapshot": "libreoffice_writer",
"instruction": "I have been editing my document and some words that needed to be rewritten are highlighted in yellow. As I fixed those words, I removed highlight. Now I want to make sure that there is no highlight word. Could you help me on finding if there is no highlighted words in the file?",
"source": "https://superuser.com/questions/762500/how-do-i-find-all-highlighted-text-in-libreoffice-writer",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.google.com/uc?id=1-ygC5pClvU1vxPQ5SGxl3teQAbxCVm8s&export=download",
"path": "Desktop/DG75-DrawGuide.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/DG75-DrawGuide.docx"
}
"id": "6a33f9b9-0a56-4844-9c3f-96ec3ffb3ba2",
"snapshot": "libreoffice_writer",
"instruction": "I have been editing my document and some words that needed to be rewritten are highlighted in yellow. As I fixed those words, please help me remove all highlight. I want to make sure that there is no highlight word.",
"source": "https://help.libreoffice.org/7.2/en-US/text/shared/02/02160000.html?&DbPAR=WRITER&System=WIN#:~:text=Select%20the%20highlighted%20text.%20On%20the%20Formatting%20bar%2C,by%20clicking%20the%20icon%20again%20or%20pressing%20Esc.",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=10hgB73d_DoQXQVgUjvgXFUCP1Hd9YxDb&export=download&authuser=0&confirm=t&uuid=df2bb3c3-e75e-4cbc-a6bc-24120512a4e1&at=APZUnTUak54ZfgQDNxbt_PqBvNQu:1706017722797",
"path": "Desktop/sample-recruitment-phone-script.docx"
}
]
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": {
"func": "check_highlighted_words",
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1Z5WkW0YH5tWh-D2YuU5zR7QLNef-37ya&export=download",
"dest": "DG75-DrawGuide_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/DG75-DrawGuide.docx",
"dest": "DG75-DrawGuide.docx"
},
{
"type": "open",
"parameters": {
"path": "Desktop/sample-recruitment-phone-script.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": {
"func": [
"compare_docx_files",
"check_highlighted_words"
],
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1s9Dsy66-zxbCAgeTyCh0P7AT7P4jF6o3&export=download&authuser=0&confirm=t&uuid=1239f2a1-8c86-45a4-8e7d-36388ac22a69&at=APZUnTVZQzXQAMNsKKQzOw5ppT8A:1706017721589",
"dest": "sample-recruitment-phone-script_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/sample-recruitment-phone-script.docx",
"dest": "sample-recruitment-phone-script.docx"
}
}
}

View File

@@ -1,42 +1,42 @@
{
"id": "6f81754e-285d-4ce0-b59e-af7edb02d108",
"snapshot": "libreoffice_writer",
"instruction": "A certain railway company in Hong Kong uses a signaling system to keep track of trains in its railway system. Each line in the docx file represents a train calling at a station from 0600 to 1200 on 2022-09-22, and has the following format: time_HH:MM:SS, train_id, station_id, platform_no.. I want to remove duplicated train ids in order to know how many different trains are running from 0600 to 1200. Could you help me on this? I am doing it manually and it is very inefficient.",
"source": "https://superuser.com/questions/789473/remove-duplicate-lines-in-libreoffice-openoffice-writer",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.google.com/uc?id=1cK1AMt_qKVAPp6EndSFG8y8r7KOPsqC1&export=download",
"path": "Desktop/HK train record.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/HK train record.docx"
}
"id": "6f81754e-285d-4ce0-b59e-af7edb02d108",
"snapshot": "libreoffice_writer",
"instruction": "A certain railway company in Hong Kong uses a signaling system to keep track of trains in its railway system. Each line in the docx file represents a train calling at a station from 0600 to 1200 on 2022-09-22, and has the following format: time_HH:MM:SS, train_id, station_id, platform_no.. I want to remove duplicated train ids in order to know how many different trains are running from 0600 to 1200. Could you help me on this? I am doing it manually and it is very inefficient.",
"source": "https://superuser.com/questions/789473/remove-duplicate-lines-in-libreoffice-openoffice-writer",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.google.com/uc?id=1cK1AMt_qKVAPp6EndSFG8y8r7KOPsqC1&export=download",
"path": "Desktop/HK_train_record.docx"
}
]
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": {
"func": ["get_unique_train_ids", "check_no_duplicates"],
"result": {
"type": "vm_file",
"path": "Desktop/HK train record.docx",
"dest": "HK train record.docx"
},
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1wZ5CKxCD3biB4mFFlrBInZO-bzo36vVG&export=download",
"dest": "HK train record_Gold.docx"
},
{
"type": "open",
"parameters": {
"path": "Desktop/HK_train_record.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": {
"func": "compare_docx_lines",
"result": {
"type": "vm_file",
"path": "Desktop/HK_train_record.docx",
"dest": "HK_train_record.docx"
},
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1wZ5CKxCD3biB4mFFlrBInZO-bzo36vVG&export=download",
"dest": "HK_train_record_Gold.docx"
}
}
}

View File

@@ -1,42 +1,45 @@
{
"id": "72b810ef-4156-4d09-8f08-a0cf57e7cefe",
"snapshot": "libreoffice_writer",
"instruction": "I am peer-reviewing my friend's course outline. I think the last paragraph is redundant so I want to add strike-through on words in the last paragraph. Can you do this for me?",
"source": "https://superuser.com/questions/657792/libreoffice-writer-how-to-apply-strikethrough-text-formatting?rq=1",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.google.com/uc?id=1Uqgr9Y_kjoMoDoUwt80hv1EtFaisyztU&export=download",
"path": "Desktop/GEOG2169_Course_Outline_2022-23.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "GEOG2169_Course_Outline_2022-23.docx"
}
"id": "72b810ef-4156-4d09-8f08-a0cf57e7cefe",
"snapshot": "libreoffice_writer",
"instruction": "I am peer-reviewing my friend's course outline. I think the last paragraph is redundant so I want to add strike-through on words in the last paragraph. Can you do this for me?",
"source": "https://superuser.com/questions/657792/libreoffice-writer-how-to-apply-strikethrough-text-formatting?rq=1",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.google.com/uc?id=1Uqgr9Y_kjoMoDoUwt80hv1EtFaisyztU&export=download",
"path": "Desktop/GEOG2169_Course_Outline_2022-23.docx"
}
]
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": {
"func": "evaluate_strike_through_last_paragraph",
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1IpAnQRYo1whrnzIGyo8UldZf4Tli-yVT&export=download",
"dest": "GEOG2169_Course_Outline_2022-23_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/GEOG2169_Course_Outline_2022-23.docx",
"dest": "GEOG2169_Course_Outline_2022-23.docx"
},
{
"type": "open",
"parameters": {
"path": "GEOG2169_Course_Outline_2022-23.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": {
"func": [
"evaluate_strike_through_last_paragraph",
"compare_docx_files"
],
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1IpAnQRYo1whrnzIGyo8UldZf4Tli-yVT&export=download",
"dest": "GEOG2169_Course_Outline_2022-23_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/GEOG2169_Course_Outline_2022-23.docx",
"dest": "GEOG2169_Course_Outline_2022-23.docx"
}
}
}

View File

@@ -1,42 +1,45 @@
{
"id": "8472fece-c7dd-4241-8d65-9b3cd1a0b568",
"snapshot": "libreoffice_writer",
"instruction": "I am writing a word list for a dyslexic kid. To ease things for him, I want to use red for words start with vowels and blue for those start with non-vowels. Can you do this for me? I'm doing it manually, and it is a pain.",
"source": "https://stackoverflow.com/questions/37259827/libreoffice-writer-how-to-set-different-colors-to-each-letter",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.google.com/uc?id=1QHk3fVFSlvYu2k013_7ahEkVQl_o1GTU&export=download",
"path": "Desktop/Dolch Sight Words Primer.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Dolch Sight Words Primer.docx"
}
"id": "8472fece-c7dd-4241-8d65-9b3cd1a0b568",
"snapshot": "libreoffice_writer",
"instruction": "I am writing a word list for a dyslexic kid. To ease things for him, I want to use red for words start with vowels and blue for those start with non-vowels. Can you do this for me? I'm doing it manually, and it is a pain.",
"source": "https://stackoverflow.com/questions/37259827/libreoffice-writer-how-to-set-different-colors-to-each-letter",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.google.com/uc?id=1QHk3fVFSlvYu2k013_7ahEkVQl_o1GTU&export=download",
"path": "Desktop/Dolch_Sight_Words_Primer.docx"
}
]
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": {
"func": "evaluate_colored_words_in_tables",
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1ksn444K17lFOdm5pELrQYvuZHkOsKq69&export=download",
"dest": "Dolch Sight Words Primer_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/Dolch Sight Words Primer.docx",
"dest": "Dolch Sight Words Primer.docx"
},
{
"type": "open",
"parameters": {
"path": "Desktop/Dolch_Sight_Words_Primer.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": {
"func": [
"evaluate_colored_words_in_tables",
"compare_docx_files"
],
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1ksn444K17lFOdm5pELrQYvuZHkOsKq69&export=download",
"dest": "Dolch_Sight_Words_Primer_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/Dolch_Sight_Words_Primer.docx",
"dest": "Dolch_Sight_Words_Primer.docx"
}
}
}

View File

@@ -0,0 +1,42 @@
{
"id": "88fe4b2d-3040-4c70-9a70-546a47764b48",
"snapshot": "libreoffice_writer",
"instruction": "I am making a guideline for students of my course and would like to separate each sentence in the first paragraph to improve readability. Please separate each sentence by creating one empty line space after each of them, as I am having hard time separating them one by one.",
"source": "https://stackoverflow.com/questions/56554555/libreoffice-writer-how-to-create-empty-line-space-after-every-period-in-a-par",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1DV0M0eSOEM64Lc59TR-tsBgFqehSQbFf&export=download&authuser=0&confirm=t&uuid=b8ca221a-3e09-4765-9b6d-0f7c439e492a&at=APZUnTWYXixuTuAHaX9Iz-5g_xMx:1706012515458",
"path": "Desktop/CCCH9003_Tutorial_guidelines.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "CCCH9003_Tutorial_guidelines.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": {
"func": "compare_docx_files",
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1B9mOd0lTiX0QFly41ZhgmCSB9AnjFWLP&export=download&authuser=0&confirm=t&uuid=28af3f12-0639-4b30-8428-08d9e1834690&at=APZUnTW9oJ1ULogYQMAjxSWJkUv1:1706012516839",
"dest": "CCCH9003_Tutorial_guidelines_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/CCCH9003_Tutorial_guidelines.docx",
"dest": "CCCH9003_Tutorial_guidelines.docx"
}
}
}

View File

@@ -1,42 +1,45 @@
{
"id": "b21acd93-60fd-4127-8a43-2f5178f4a830",
"snapshot": "libreoffice_writer",
"instruction": "I have been praciticing professional writing lately. Now I am writing essay which requires one paragraph each for introduction, body and conclusion with single-space for introduction, double-space for body then one-and-a-half-space for conclusion. The font size of this essay is 12. Could you help me on this?",
"source": "https://superuser.com/questions/1097199/how-can-i-double-space-a-document-in-libreoffice?rq=1",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.google.com/uc?id=1akFeAURJiqnK9wGNlRgPoPuQ6vRmnUPe&export=download",
"path": "Desktop/CCHU9045 Course Outline 2019-20.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "CCHU9045 Course Outline 2019-20.docx"
}
"id": "b21acd93-60fd-4127-8a43-2f5178f4a830",
"snapshot": "libreoffice_writer",
"instruction": "I have been practicing professional writing lately. Now I am writing an essay which requires one paragraph each for introduction, body and conclusion with single-space for introduction, double-space for body then one-and-a-half-space for conclusion. The font size of this essay is 12. Could you help me on this?",
"source": "https://superuser.com/questions/1097199/how-can-i-double-space-a-document-in-libreoffice?rq=1",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.google.com/uc?id=1akFeAURJiqnK9wGNlRgPoPuQ6vRmnUPe&export=download",
"path": "Desktop/CCHU9045_Course_Outline_2019-20.docx"
}
]
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": {
"func": "evaluate_spacing",
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=16LN7uYSSXk_xwgc4IZXnN2Z1nCmPJfLm&export=download",
"dest": "CCHU9045 Course Outline 2019-20_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/CCHU9045 Course Outline 2019-20.docx",
"dest": "CCHU9045 Course Outline 2019-20.docx"
},
{
"type": "open",
"parameters": {
"path": "CCHU9045_Course_Outline_2019-20.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": {
"func": [
"compare_line_spacing",
"compare_docx_files"
],
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=16LN7uYSSXk_xwgc4IZXnN2Z1nCmPJfLm&export=download",
"dest": "CCHU9045_Course_Outline_2019-20_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/CCHU9045_Course_Outline_2019-20.docx",
"dest": "CCHU9045_Course_Outline_2019-20.docx"
}
}
}

View File

@@ -1,42 +1,42 @@
{
"id": "d53ff5ee-3b1a-431e-b2be-30ed2673079b",
"snapshot": "libreoffice_writer",
"instruction": "I am currently engaged in text processing and require assistance in converting all uppercase text to lowercase within my document. This precision is critical for maintaining a uniform and polished presentation. Could you help me on this?",
"source": "https://ask.libreoffice.org/t/how-to-convert-all-uppercase-to-lowercase/53341",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.google.com/uc?id=1j6Gx6KCxA9Cp-TE1uZ5lKcTSKVRPW-CB&export=download",
"path": "Desktop/presentation instruction 2023 Feb.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "presentation instruction 2023 Feb.docx"
}
"id": "d53ff5ee-3b1a-431e-b2be-30ed2673079b",
"snapshot": "libreoffice_writer",
"instruction": "I am currently engaged in text processing and require assistance in converting all uppercase text to lowercase within my document. This precision is critical for maintaining a uniform and polished presentation. Could you help me on this?",
"source": "https://ask.libreoffice.org/t/how-to-convert-all-uppercase-to-lowercase/53341",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.google.com/uc?id=1j6Gx6KCxA9Cp-TE1uZ5lKcTSKVRPW-CB&export=download",
"path": "Desktop/presentation_instruction_2023_Feb.docx"
}
]
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": {
"func": "evaluate_conversion",
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1bB1N2TWN0puZ6DwUFS_TDjvRWchaGP9T&export=download",
"dest": "presentation instruction 2023 Feb_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/presentation instruction 2023 Feb.docx",
"dest": "presentation instruction 2023 Feb.docx"
},
{
"type": "open",
"parameters": {
"path": "presentation_instruction_2023_Feb.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": {
"func": "compare_docx_files",
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1bB1N2TWN0puZ6DwUFS_TDjvRWchaGP9T&export=download",
"dest": "presentation_instruction_2023_Feb_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/presentation_instruction_2023_Feb.docx",
"dest": "presentation_instruction_2023_Feb.docx"
}
}
}

View File

@@ -1,42 +1,45 @@
{
"id": "e246f6d8-78d7-44ac-b668-fcf47946cb50",
"snapshot": "libreoffice_writer",
"instruction": "I found Italic font very hard to discern from the normal text for me, as it is also dark black with the same size. Current font size is 12 and I want to change the font size of italicized words to 14 to make it more discernible. Can you help me on this?",
"source": "https://ask.libreoffice.org/t/how-to-change-text-size-color-of-italic-font/77712",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.google.com/uc?id=1b8mPpEDlBrTLcOpf0ZcjdUV4vLAwxH1r&export=download",
"path": "Desktop/Y22-2119-assign4.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Y22-2119-assign4.docx"
}
"id": "e246f6d8-78d7-44ac-b668-fcf47946cb50",
"snapshot": "libreoffice_writer",
"instruction": "I found Italic font very hard to discern from the normal text for me, as it is also dark black with the same size. Current font size is 12 and I want to change the font size of italicized words to 14 to make it more discernible. Can you help me on this?",
"source": "https://ask.libreoffice.org/t/how-to-change-text-size-color-of-italic-font/77712",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.google.com/uc?id=1b8mPpEDlBrTLcOpf0ZcjdUV4vLAwxH1r&export=download",
"path": "Desktop/Y22-2119-assign4.docx"
}
]
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": {
"func": "check_italic_font_size_14",
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1GTZ-DkMxpdYx38z_s0ab85Ejgxv3qfEp&export=download",
"dest": "Y22-2119-assign4.docx_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/Y22-2119-assign4.docx",
"dest": "Y22-2119-assign4.docx"
},
{
"type": "open",
"parameters": {
"path": "Desktop/Y22-2119-assign4.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": {
"func": [
"check_italic_font_size_14",
"compare_docx_files"
],
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1GTZ-DkMxpdYx38z_s0ab85Ejgxv3qfEp&export=download",
"dest": "Y22-2119-assign4.docx_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/Y22-2119-assign4.docx",
"dest": "Y22-2119-assign4.docx"
}
}
}

View File

@@ -179,31 +179,35 @@ def scrape_webpage_to_markdown(url, doc_filepath):
# scrape the webpage and perform OCR on images
for article in articles:
for child in article.recursiveChildGenerator():
# if this is an image, perform OCR
if child.name == 'img':
img_url = child.get('src')
if not img_url.startswith(('http:', 'https:')):
img_url = '{}{}'.format(url, img_url)
if not img_url.endswith('.svg') and not img_url.endswith('.png'):
continue
if 'neveragain.allstatics.com/2019/assets/icon/logo' in img_url:
continue
img_response = requests.get(img_url, stream=True)
img = Image.open(BytesIO(img_response.content))
ocr_text = pytesseract.image_to_string(img)
if ocr_text.strip():
markdown_content += '\n```plaintext\n{}\n```\n'.format(ocr_text.strip())
continue
# Not an image, so continue recursively calling function
if child.name is None:
continue
html_str = str(child)
markdown_content += md(html_str) + '\n\n'
for child in article.recursiveChildGenerator():
# if this is an image, perform OCR
if child.name == 'img':
img_url = child.get('src')
if not img_url.startswith(('http:', 'https:')):
img_url = '{}{}'.format(url, img_url)
if not img_url.endswith('.svg') and not img_url.endswith('.png'):
continue
if 'neveragain.allstatics.com/2019/assets/icon/logo' in img_url:
continue
try:
img_response = requests.get(img_url, stream=True)
img = Image.open(BytesIO(img_response.content))
ocr_text = pytesseract.image_to_string(img)
if ocr_text.strip():
markdown_content += '\n```plaintext\n{}\n```\n'.format(ocr_text.strip())
continue
except PIL.UnidentifiedImageError:
print("unidentified image")
# Not an image, so continue recursively calling function
if child.name is None:
continue
html_str = str(child)
markdown_content += md(html_str) + '\n\n'
with open(doc_filepath, 'w', encoding='utf-8') as f:
f.write(markdown_content)
with open(doc_filepath, 'w', encoding='utf-8') as f:
f.write(markdown_content)
# process a URL and save the file

View File

@@ -0,0 +1,293 @@
import csv
import os
import io
import fitz
import yt_dlp
from docx import Document
import requests
from bs4 import BeautifulSoup
from PIL import Image
import pytesseract
from io import BytesIO
from docx import Document
import re
import markdownify
from markdownify import markdownify as md
def download_pdf(url, timeout=30):
    """Download a PDF and return its content as an in-memory byte stream.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        io.BytesIO: Stream positioned at the start of the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # make sure the request succeeded
    return io.BytesIO(response.content)
def pdf_to_markdown(pdf_stream, markdown_path):
    """Convert a PDF into a Markdown file, extracting embedded images.

    The text of each page is appended to the Markdown output. Images found
    on a page are written next to the Markdown file and referenced from it
    under a ``### Page N Images`` heading.

    Args:
        pdf_stream: PDF data passed to ``fitz.open`` as ``stream``.
        markdown_path: Destination path of the Markdown file; extracted
            images are stored in the same directory.
    """
    image_dir = os.path.dirname(markdown_path)
    parts = []

    document = fitz.open(stream=pdf_stream, filetype="pdf")
    try:
        for page_index in range(len(document)):
            page_label = page_index + 1
            page = document[page_index]
            parts.append(page.get_text() + "\n\n")

            # Extract the page's images and reference them from the Markdown.
            image_list = page.get_images(full=True)
            if image_list:
                parts.append(f"### Page {page_label} Images\n")
            for img_index, image in enumerate(image_list, start=1):
                xref = image[0]
                extracted = document.extract_image(xref)
                # Use the format reported by the extractor so that e.g. JPEG
                # data is not stored under a misleading ".png" name.
                extension = extracted.get("ext") or "png"
                image_filename = f"output_image_page_{page_label}_{img_index}.{extension}"
                image_abs_path = os.path.join(image_dir, image_filename)
                with open(image_abs_path, "wb") as image_file:
                    image_file.write(extracted["image"])
                parts.append(f"![Page {page_label} Image {img_index}]({image_filename})\n\n")
    finally:
        # Release the document even when extraction fails part-way.
        document.close()

    with open(markdown_path, "w", encoding="utf-8") as md_file:
        md_file.write("".join(parts))
def valid_xml_char_ordinal(c):
    """Return True when the character's code point is in an accepted range.

    Accepted code points are tab, line feed and carriage return, plus the
    ranges 0x20-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF. Everything else
    (other control characters, surrogates, 0xFFFE/0xFFFF) is rejected.

    Args:
        c: A single-character string.

    Returns:
        bool: Whether ``ord(c)`` lies in one of the accepted ranges.
    """
    cp = ord(c)
    if cp in (0x9, 0xA, 0xD):
        return True
    accepted_ranges = (
        (0x20, 0xD7FF),
        (0xE000, 0xFFFD),
        (0x10000, 0x10FFFF),
    )
    return any(low <= cp <= high for low, high in accepted_ranges)
def download_and_clean_youtube_subtitles(video_url, txt_filepath):
    """Download a video's English subtitles with yt-dlp and save them as text.

    The subtitle file is written next to ``txt_filepath`` (same base name,
    ``.en.vtt`` suffix). Timestamp lines and inline tags are removed,
    repeated lines are dropped while keeping first-seen order, and the
    remaining lines are written to ``txt_filepath``.

    Args:
        video_url: URL of the video whose subtitles should be fetched.
        txt_filepath: Destination path of the cleaned subtitle text.
    """
    # Derive the base name with splitext so that targets whose extension is
    # not exactly four characters long (".md", ".text", none) still work.
    subtitles_path = os.path.splitext(txt_filepath)[0]
    ydl_opts = {
        'skip_download': True,
        'writesubtitles': True,
        'writeautomaticsub': True,  # fall back to auto-generated subtitles
        'subtitleslangs': ['en'],  # English subtitles only
        'outtmpl': f'{subtitles_path}.%(ext)s',  # keep output in a writable location
        'quiet': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # Only the subtitle file is fetched (see 'skip_download').
        ydl.download([video_url])

    subtitle_file = f'{subtitles_path}.en.vtt'
    # Matches timestamp ranges and inline markup tags.
    pattern = re.compile(r'(\d+:\d\d:\d\d.\d+ --> \d+:\d\d:\d\d.\d+)|(\s*<[^>]+>)')
    try:
        with open(subtitle_file, 'r', encoding='utf-8') as file:
            raw_lines = file.readlines()

        # Skip blank and timestamp/tag-led lines, then strip remaining markup.
        cleaned = [
            pattern.sub('', line).strip()
            for line in raw_lines
            if line.strip() and not pattern.match(line)
        ]
        # Stripping markup can leave empty lines behind; drop those as well.
        kept = [line for line in cleaned if line and not pattern.match(line)]
        # dict.fromkeys removes repeated lines while preserving order.
        subtitles = list(dict.fromkeys(kept))

        with open(txt_filepath, 'w', encoding='utf-8') as f:
            f.writelines(line + '\n' for line in subtitles)
    except IOError:
        print(f"Could not read file: {subtitle_file}")
# Scrape a forum page, OCR its images and append everything to a .docx document
def scrape_and_ocr_forum(url, doc):
    """Append a page's text and the OCR text of its images to ``doc``.

    Headings, paragraphs and list items are added as paragraphs first.
    Afterwards every absolute ``.svg``/``.png`` image URL (except the known
    site logo) is downloaded and run through OCR; non-blank results are
    added as further paragraphs after removing characters rejected by
    ``valid_xml_char_ordinal``.

    Args:
        url: Address of the page to scrape.
        doc: Document object exposing ``add_paragraph``.
    """
    from PIL import UnidentifiedImageError

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    for element in soup.find_all(['h1', 'h2', 'h3', 'p', 'li']):
        doc.add_paragraph(element.get_text())

    for image in soup.find_all('img'):
        image_url = image.get('src')
        if not image_url or not image_url.startswith('http'):
            continue
        if not image_url.endswith(('.svg', '.png')):
            continue
        if 'neveragain.allstatics.com/2019/assets/icon/logo' in image_url:
            continue

        img_response = requests.get(image_url, stream=True)
        try:
            img = Image.open(BytesIO(img_response.content))
        except UnidentifiedImageError:
            # Formats PIL cannot decode must not abort the whole scrape.
            print("unidentified image")
            continue

        ocr_text = pytesseract.image_to_string(img)
        cleaned_string = ''.join(c for c in ocr_text if valid_xml_char_ordinal(c))
        # Skip results that contain nothing but whitespace.
        if cleaned_string.strip():
            doc.add_paragraph(cleaned_string)
def superuser_to_markdown(url, doc_filepath):
    """Save a Super User question page as a Markdown file.

    The output contains the question title and body, every answer body and,
    at the end, the OCR text of the images found in the question and the
    answers (each wrapped in a fenced block).

    Args:
        url: Address of the question page.
        doc_filepath: Destination path of the Markdown file.
    """
    from PIL import UnidentifiedImageError

    body_selector = {'class': 's-prose js-post-body'}

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    parts = []

    # Question title and body; tolerate pages missing either element.
    title_tag = soup.find('h1')
    question_title = title_tag.get_text(strip=True) if title_tag is not None else ''
    question = soup.find('div', {'id': 'question'})
    if question is not None:
        parts.append(f"# {question_title}\n\n")
        question_div = question.find('div', body_selector)
        if question_div is not None:
            parts.append(markdownify.markdownify(question_div.prettify(), heading_style="ATX") + "\n\n")

    # All answers.
    answers = soup.find_all('div', {'class': 'answer'})
    for answer in answers:
        answer_div = answer.find('div', body_selector)
        if answer_div is not None:
            parts.append(markdownify.markdownify(answer_div.prettify(), heading_style="ATX") + "\n\n")

    # Collect images; the question container may be absent.
    image_tags = list(question.find_all('img')) if question is not None else []
    for answer in answers:
        image_tags.extend(answer.find_all('img'))

    for img_tag in image_tags:
        # 'data-src' covers lazily loaded images.
        image_src = img_tag.get('src') or img_tag.get('data-src')
        if not image_src or not image_src.startswith('http'):
            continue
        img_response = requests.get(image_src, stream=True)
        try:
            img = Image.open(BytesIO(img_response.content))
        except UnidentifiedImageError:
            # Skip data PIL cannot decode instead of aborting the export.
            print("unidentified image")
            continue
        ocr_text = pytesseract.image_to_string(img)
        if ocr_text.strip():  # only keep non-empty OCR results
            parts.append("```\n" + ocr_text.strip() + "\n```\n\n")

    # Write the collected Markdown to disk.
    with open(doc_filepath, 'w', encoding='utf-8') as f:
        f.write("".join(parts))
def stack_overflow_to_markdown(url, doc_filepath):
    """Save a Stack Overflow question page as a Markdown file.

    Downloads ``url``, converts the question body and every answer body to
    Markdown, appends the OCR text of the page's ``.svg``/``.png`` images as
    fenced code blocks, and writes the result to ``doc_filepath`` encoded as
    UTF-8.

    Missing page parts (no ``<h1>``, no question container, a post without a
    body ``div``) and ``<img>`` tags without a ``src`` attribute are skipped
    instead of raising. An image that cannot be downloaded or decoded is
    reported and skipped so a single bad image does not abort the export.

    Args:
        url: Address of the question page to download.
        doc_filepath: Path of the Markdown file to create or overwrite.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    post_body_attrs = {'class': 's-prose js-post-body'}

    # Fragments of the Markdown document; joined once before writing.
    sections = []

    # Question title and body.
    question = soup.find('div', {'id': 'question'})
    heading = soup.find('h1')
    question_post = question.find('div', post_body_attrs) if question is not None else None
    if question_post is not None:
        if heading is not None:
            sections.append(f"# {heading.get_text(strip=True)}\n\n")
        sections.append(
            markdownify.markdownify(question_post.prettify(), heading_style="ATX") + "\n\n"
        )

    # Every answer body.
    for answer in soup.find_all('div', {'class': 'answer'}):
        answer_post = answer.find('div', post_body_attrs)
        if answer_post is None:
            continue
        sections.append(
            markdownify.markdownify(answer_post.prettify(), heading_style="ATX") + "\n\n"
        )

    # OCR every image on the page whose URL is absolute and ends in .svg/.png.
    for img_tag in soup.find_all('img'):
        image_url = img_tag.get('src')  # .get(): some <img> tags carry no src
        if not image_url or not image_url.startswith('http'):
            continue
        if not image_url.endswith(('.svg', '.png')):
            continue
        try:
            img_response = requests.get(image_url, stream=True)
            img = Image.open(BytesIO(img_response.content))
        except OSError as exc:
            # requests' RequestException and Pillow's image-decoding errors
            # are both OSError subclasses.
            print(f"skipping image {image_url}: {exc}")
            continue
        ocr_text = pytesseract.image_to_string(img).strip()
        if ocr_text:  # only keep non-empty OCR results
            sections.append("```\n" + ocr_text + "\n```\n\n")

    # Write the Markdown document.
    with open(doc_filepath, 'w', encoding='utf-8') as f:
        f.write("".join(sections))
def scrape_webpage_to_markdown(url, doc_filepath):
    """Scrape the main content of a generic web page into a Markdown file.

    The content containers are the first non-empty match among
    ``<article>``, ``<main>``, ``<div id="steps">`` and
    ``<div class="lia-message-body-content">``. Every descendant tag of each
    container is converted to Markdown. ``.svg``/``.png`` images are OCR'd
    and, when text is recognised, emitted as a ``plaintext`` fenced block in
    place of the tag. The result is written to ``doc_filepath`` as UTF-8.

    Args:
        url: Address of the page to download; also the base for resolving
            relative image URLs.
        doc_filepath: Path of the Markdown file to create or overwrite.
    """
    from urllib.parse import urljoin

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Assume the main content lives in one of these containers; adjust to the
    # structure of the actual page.
    # NOTE: only the id is matched for the "steps" div. A class filter would
    # have to go into the same attrs dict, because the third positional
    # argument of find_all() is `recursive`, not a second attribute filter.
    articles = (
        soup.find_all('article')
        or soup.find_all('main')
        or soup.find_all('div', {'id': 'steps'})
    )
    if not articles:
        articles = soup.find_all('div', {'class': 'lia-message-body-content'})

    # Fragments of the Markdown document; joined once before writing.
    sections = []

    # NOTE(review): every descendant tag is converted on its own, so text
    # inside nested tags is emitted once per nesting level. Confirm whether
    # that duplication is intended before changing the traversal.
    for article in articles:
        for child in article.descendants:
            # Images are OCR'd instead of being converted as tags.
            if child.name == 'img':
                img_url = child.get('src')
                if not img_url:
                    continue
                # Resolve relative, root-relative and protocol-relative
                # sources against the page URL; absolute URLs are unchanged.
                img_url = urljoin(url, img_url)
                if not img_url.startswith(('http:', 'https:')):
                    continue  # e.g. data: URIs cannot be fetched
                if not img_url.endswith(('.svg', '.png')):
                    continue
                if 'neveragain.allstatics.com/2019/assets/icon/logo' in img_url:
                    continue
                print(img_url)
                try:
                    img_response = requests.get(img_url, stream=True)
                    img = Image.open(BytesIO(img_response.content))
                    ocr_text = pytesseract.image_to_string(img)
                    if ocr_text.strip():
                        sections.append(
                            '\n```plaintext\n{}\n```\n'.format(ocr_text.strip())
                        )
                        continue
                except PIL.UnidentifiedImageError:
                    print("unidentified image")
            # Not a tag (e.g. a NavigableString): nothing to convert.
            if child.name is None:
                continue
            # Convert the tag's HTML to Markdown.
            sections.append(md(str(child)) + '\n\n')

    # Write the Markdown document.
    with open(doc_filepath, 'w', encoding='utf-8') as f:
        f.write(''.join(sections))
# Process a single URL.
def process_url(url, doc_id, app, output_dir="/content/drive/MyDrive/SourceDoc"):
    """Dispatch ``url`` to the converter that matches its kind.

    The output file is named ``{doc_id}_{app}`` inside ``output_dir``:
    YouTube links are handed to ``download_and_clean_youtube_subtitles`` with
    a ``.txt`` path, every other kind to its converter with a ``.md`` path.

    Args:
        url: Source address to process.
        doc_id: Identifier used as the first part of the output file name.
        app: Application name used as the second part of the output file name.
        output_dir: Directory that receives the output file. The default keeps
            the location this script has always written to.
    """
    from urllib.parse import urlparse

    doc_filepath = f"{output_dir}/{doc_id}_{app}.md"
    txt_filepath = f"{output_dir}/{doc_id}_{app}.txt"

    # Also look at the URL path so that links such as ".../file.pdf?dl=1" or
    # ".../FILE.PDF" are recognised; the plain suffix test is kept so every
    # URL that matched before still matches.
    is_pdf = url.endswith('.pdf') or urlparse(url).path.lower().endswith('.pdf')

    if 'youtube.com' in url or 'youtu.be' in url:
        download_and_clean_youtube_subtitles(url, txt_filepath)
    elif is_pdf:
        pdf_stream = download_pdf(url)
        pdf_to_markdown(pdf_stream, doc_filepath)
    elif 'superuser.com' in url or 'askubuntu.com' in url:
        superuser_to_markdown(url, doc_filepath)
    elif 'stackoverflow.com' in url:
        stack_overflow_to_markdown(url, doc_filepath)
    else:
        scrape_webpage_to_markdown(url, doc_filepath)
# Read the rows of the CSV file and run the matching conversion for each one.
csv_filepath = '/content/Get_Source_Doc - Sheet1.csv'  # update to the actual path of your CSV file
ROWS_TO_SKIP = 176  # number of leading data rows passed over without processing
with open(csv_filepath, 'r', newline='', encoding='utf-8') as csvfile:
    for row_index, row in enumerate(csv.DictReader(csvfile)):
        if row_index < ROWS_TO_SKIP:
            continue
        process_url(row['Source'], row['id'], row['InvolvedApp'])
        # Echo the row once it has been processed.
        print(row)