diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py index 786ed72..2f4287e 100644 --- a/desktop_env/envs/desktop_env.py +++ b/desktop_env/envs/desktop_env.py @@ -5,10 +5,10 @@ import os import subprocess import tempfile import time -from typing import Callable, Any, Optional +from typing import Callable, Any, Optional, Tuple # import uuid # import platform -from typing import List, Dict +from typing import List, Dict, Union import gymnasium as gym @@ -48,7 +48,8 @@ class DesktopEnv(gym.Env): action_space: str = "computer_13", task_config: Dict[str, Any] = None, tmp_dir: str = "tmp", - cache_dir: str = "cache" + cache_dir: str = "cache", + screen_size: Tuple[int] = (1920, 1080) ): """ Args: @@ -73,6 +74,7 @@ class DesktopEnv(gym.Env): self.path_to_vm = os.path.abspath(os.path.expandvars(os.path.expanduser(path_to_vm))) self.tmp_dir_base: str = tmp_dir self.cache_dir_base: str = cache_dir + self.vm_screen_size = screen_size # task-aware stuffs # todo: handling the logic of snapshot directory @@ -80,6 +82,7 @@ class DesktopEnv(gym.Env): # Initialize emulator and controller logger.info("Initializing...") + self._config_screen_size() self._start_emulator() self.vm_ip = self._get_vm_ip() self.controller = PythonController(vm_ip=self.vm_ip) @@ -87,7 +90,6 @@ class DesktopEnv(gym.Env): # Meta info of the VM, move to the reset() function self.vm_platform: str = "" # self.controller.get_vm_platform() - self.vm_screen_size = None # self.controller.get_vm_screen_size() # mode: human or machine assert action_space in ["computer_13", "pyautogui"] @@ -101,6 +103,57 @@ class DesktopEnv(gym.Env): self._step_no: int = 0 self.action_history: List[Dict[str, any]] = [] + def _config_screen_size(self): + def calculate_vram_size(width, height, bits_per_pixel=32): + """ + Calculate VRAM size for given width, height, and color depth. + Color depth defaults to 32 bits per pixel. + """ + bytes_per_pixel = bits_per_pixel // 8 + vram_size = width * height * bytes_per_pixel + return vram_size + + if not os.path.isfile(self.path_to_vm): + logger.warning(f"The specified vmx file does not exist: {self.path_to_vm}") + return False + + width, height = self.vm_screen_size + vramSize = calculate_vram_size(width, height) + + try: + with open(self.path_to_vm, 'r') as file: + lines = file.readlines() + + new_lines = [] + for line in lines: + if "svga.autodetect" in line: + continue + elif "svga.vramSize" in line: + continue + elif "displayWidth" in line: + continue + elif "displayHeight" in line: + continue + else: + new_lines.append(line) + + # Append new settings for screen size and VRAM. 
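For reference, with the default screen_size of 1920x1080 at 32 bits per pixel the helper above computes 1920 * 1080 * 4 = 8,294,400 bytes, so the entries appended below come out as svga.vramSize = "8294400" together with displayWidth = "1920" and displayHeight = "1080". A quick sanity-check of that arithmetic (illustrative only, not part of the diff):

```python
# Sanity check of the VRAM calculation used by _config_screen_size.
width, height, bits_per_pixel = 1920, 1080, 32
vram_size = width * height * (bits_per_pixel // 8)
assert vram_size == 8_294_400  # the value written to svga.vramSize for the default screen size
```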
+ new_lines.append(f'svga.autodetect = "TRUE"\n') + new_lines.append(f'svga.vramSize = "{vramSize}"\n') + new_lines.append(f'displayWidth = "{width}"\n') + new_lines.append(f'displayHeight = "{height}"\n') + + with open(self.path_to_vm, 'w') as file: + file.writelines(new_lines) + logger.info(f"Screen size for {self.path_to_vm} set to {width}x{height} with VRAM size {vramSize} bytes") + return True + except IOError as e: + logger.error(f"An IOError occurred: {e}") + return False + except Exception as e: + logger.error(f"An error occurred: {e}") + return False + def _start_emulator(self): while True: try: @@ -119,7 +172,7 @@ class DesktopEnv(gym.Env): logger.error(f"Error executing command: {e.output.decode().strip()}") def _get_vm_ip(self): - max_retries = 10 + max_retries = 20 logger.info("Getting IP Address...") for _ in range(max_retries): try: @@ -159,12 +212,44 @@ class DesktopEnv(gym.Env): self.instruction = task_config["instruction"] self.config = task_config["config"] + # evaluator dict + # func -> metric function string, or list of metric function strings + # conj -> conjunction of multiple metrics if func is a list with length > 1, "and"/"or" + # result -> result getter config, or list of result getter configs + # expected (optional) -> expected getter config, or list of expected getter configs + # options (optional) -> metric options, or list of metric options + # if func is a str list, then result, expected (if exists), options (if exists) should also be lists of the same length + # even if one of the metrics does not need expected or options field, it should be included in the list with None self.evaluator = task_config["evaluator"] - self.metric: Metric = getattr(metrics, self.evaluator["func"]) - self.result_getter: Getter = getattr(getters, "get_{:}".format(self.evaluator["result"]["type"])) - self.expected_getter: Getter = getattr(getters, "get_{:}".format( - self.evaluator["expected"]["type"])) if "expected" in self.evaluator else None - self.metric_options: Dict[str, Any] = self.evaluator.get("options", {}) + self.metric: Metric = [getattr(metrics, func) for func in self.evaluator["func"]] \ + if isinstance(self.evaluator["func"], list) \ + else getattr(metrics, self.evaluator["func"]) + self.metric_conj: str = self.evaluator.get("conj", "and") # take conjunction of multiple metrics + self.result_getter: Getter = [getattr(getters, "get_{:}".format(res["type"])) for res in + self.evaluator["result"]] \ + if isinstance(self.evaluator["result"], list) \ + else getattr(getters, "get_{:}".format(self.evaluator["result"]["type"])) + if "expected" in self.evaluator: + self.expected_getter: Getter = [getattr(getters, "get_{:}".format(exp["type"])) if exp else None for exp in + self.evaluator["expected"]] \ + if isinstance(self.evaluator["expected"], list) \ + else getattr(getters, "get_{:}".format(self.evaluator["expected"]["type"])) + else: + self.expected_getter = [None] * len(self.metric) \ + if isinstance(self.metric, list) \ + else None + self.metric_options: Union[List[Dict[str, Any]], Dict[str, Any]] = [opt if opt else {} for opt in + self.evaluator["options"]] \ + if isinstance(self.evaluator.get("options", {}), list) \ + else self.evaluator["options"] \ + if "options" in self.evaluator \ + else [{}] * len(self.metric) \ + if isinstance(self.metric, list) \ + else {} + + assert (not isinstance(self.evaluator["func"], list) + or (len(self.metric) == len(self.result_getter) == len(self.expected_getter) == len( + self.metric_options))) def reset(self, task_config: 
Optional[Dict[str, Any]] = None, seed=None, options=None) -> Dict[str, Any]: logger.info("Resetting environment...") @@ -190,6 +275,8 @@ class DesktopEnv(gym.Env): _execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path]) time.sleep(5) + self._config_screen_size() + print(self.vm_screen_size) logger.info("Starting emulator...") self._start_emulator() logger.info("Emulator started.") @@ -197,6 +284,7 @@ class DesktopEnv(gym.Env): logger.info("Get meta info of the VM...") self.vm_platform = self.controller.get_vm_platform() self.vm_screen_size = self.controller.get_vm_screen_size() + print(self.vm_screen_size) logger.info("Setting up environment...") self.setup_controller.setup(self.config) @@ -256,17 +344,41 @@ class DesktopEnv(gym.Env): self.setup_controller.setup(self.evaluator.get("postconfig", [])) - try: - result_state = self.result_getter(self, self.evaluator["result"]) - except FileNotFoundError: - logger.error("File not found!") - return 0 + if type(self.metric) == list: + for idx, metric in enumerate(self.metric): + try: + config = self.evaluator["result"][idx] + result_state = self.result_getter[idx](self, config) + except FileNotFoundError: + logger.error("File not found!") + if self.metric_conj == 'and': + return 0 - expected_state = self.expected_getter(self, self.evaluator["expected"]) if "expected" in self.evaluator \ - else None + expected = self.evaluator["expected"][idx] + expected_state = self.expected_getter[idx](self, expected) if expected else None - metric: float = self.metric(result_state, expected_state, **self.metric_options) if expected_state is not None \ - else self.metric(result_state, **self.metric_options) + metric: int = metric(result_state, expected_state, + **self.metric_options[idx]) if expected_state is not None \ + else metric(result_state, **self.metric_options[idx]) + + if self.metric_conj == 'and' and not bool(metric): + return 0 + elif self.metric_conj == 'or' and bool(metric): + return 1 + return 1 if self.metric_conj == 'and' else 0 + else: + try: + result_state = self.result_getter(self, self.evaluator["result"]) + except FileNotFoundError: + logger.error("File not found!") + return 0 + + expected_state = self.expected_getter(self, self.evaluator["expected"]) if "expected" in self.evaluator \ + else None + + metric: float = self.metric(result_state, expected_state, + **self.metric_options) if expected_state is not None \ + else self.metric(result_state, **self.metric_options) return metric diff --git a/desktop_env/evaluators/metrics/__init__.py b/desktop_env/evaluators/metrics/__init__.py index a658192..7a1fa94 100644 --- a/desktop_env/evaluators/metrics/__init__.py +++ b/desktop_env/evaluators/metrics/__init__.py @@ -14,4 +14,6 @@ from .gimp import increase_saturation, decrease_brightness, check_file_exists, c from .general import check_csv, check_accessibility_tree, check_list, run_sqlite3, check_json from .thunderbird import check_thunderbird_prefs, check_thunderbird_filter from .vscode import compare_text_file, compare_config, compare_answer, is_extension_installed -from .impress import check_slide_numbers_color, compare_pptx_files, check_for_two_lines, check_for_audio, check_formula_shape, check_file_exists +from .impress import check_image_stretch_and_center, check_slide_numbers_color, compare_pptx_files, check_strikethrough, \ + check_for_audio, check_formula_shape +from .impress import check_slide_orientation_Portrait, contains_mp4_video diff --git a/desktop_env/evaluators/metrics/general.py 
b/desktop_env/evaluators/metrics/general.py index 6246861..98c9596 100644 --- a/desktop_env/evaluators/metrics/general.py +++ b/desktop_env/evaluators/metrics/general.py @@ -180,8 +180,8 @@ def check_json(result: str, rules: Dict[str, List[Dict[str, Union[List[str], str with open(result) as f: result: Dict[str, Any] = json.load(f) - expect_rules = rule.get("expect", {}) - unexpect_rules = rule.get("unexpect", {}) + expect_rules = rules.get("expect", {}) + unexpect_rules = rules.get("unexpect", {}) metric = True for r in expect_rules: diff --git a/desktop_env/evaluators/metrics/impress.py b/desktop_env/evaluators/metrics/impress.py index 719ae56..b68a6fe 100644 --- a/desktop_env/evaluators/metrics/impress.py +++ b/desktop_env/evaluators/metrics/impress.py @@ -1,24 +1,56 @@ from pptx import Presentation -import os +from pptx.util import Inches + + +def check_image_stretch_and_center(modified_ppt, original_ppt): + # fixme: this func is overfit to this example libreoffice_impress + # Load the presentations + original_pres = Presentation(original_ppt) + modified_pres = Presentation(modified_ppt) + + # Get the first slide of each presentation + original_slide = original_pres.slides[0] + modified_slide = modified_pres.slides[0] + + # Get the image on the first slide of each presentation + original_slide_images = [shape for shape in original_slide.shapes if shape.shape_type == 13] + modified_slide_images = [shape for shape in modified_slide.shapes if shape.shape_type == 13] + + the_image = original_slide_images[0] + + # Get the images that modified in width and height + for modified_image in modified_slide_images: + if the_image.image.blob == modified_image.image.blob: + the_modified_image = modified_image + + if (abs(the_modified_image.width - original_pres.slide_width) > Inches(0.1) or + abs(the_modified_image.height - original_pres.slide_height) > Inches(0.1) or + abs(the_modified_image.left - (original_pres.slide_width - the_modified_image.width) / 2) > Inches(0.1) or + abs(the_modified_image.top - (original_pres.slide_height - the_modified_image.height) / 2) > Inches(0.1)): + return False + + return True + def is_red_color(color): - #judge if the color is red + # judge if the color is red print(color.rgb) return color and color.rgb == (255, 0, 0) + def get_master_placeholder_color(prs): # get the color of the placeholder masters = prs.slide_masters for idx, master in enumerate(masters): - for placeholder in master.placeholders: - if placeholder.has_text_frame and placeholder.text == "": + for placeholder in master.placeholders: + if placeholder.has_text_frame and placeholder.text == "": text_frame = placeholder.text_frame if text_frame.paragraphs: first_paragraph = text_frame.paragraphs[0] - return first_paragraph.font.color - return None - + return first_paragraph.font.color + return None + def check_slide_numbers_color(pptx_file_path): presentation = Presentation(pptx_file_path) @@ -34,42 +66,65 @@ def check_slide_numbers_color(pptx_file_path): print(font_color) return 1 if font_color is not None and is_red_color(font_color) else 0 -def compare_pptx_files(file1_path, file2_path): + +def compare_pptx_files(file1_path, file2_path, **options): + # todo: not strictly match since not all information is compared because we cannot get the info through pptx prs1 = Presentation(file1_path) prs2 = Presentation(file2_path) # compare the number of slides if len(prs1.slides) != len(prs2.slides): - return 0 + return False # compare the content of each slide for slide1, slide2 in zip(prs1.slides, 
prs2.slides): # check if the shapes are the same for shape1, shape2 in zip(slide1.shapes, slide2.shapes): + if shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height: + return False + if hasattr(shape1, "text") and hasattr(shape2, "text"): if shape1.text != shape2.text: - return 0 - return 1 + return False -def has_two_lines_on_page(slide): - line_count = 0 - for shape in slide.shapes: - if shape.shape_type == 1: # 1 表示 Line 形状 - line_count += 1 - if line_count >= 2: - return True - return False + # check if the paragraphs are the same + for para1, para2 in zip(shape1.text_frame.paragraphs, shape2.text_frame.paragraphs): + # check if the runs are the same + for run1, run2 in zip(para1.runs, para2.runs): + if run1.text != run2.text: + return False -def check_for_two_lines(prs): - prs = Presentation(prs) - for i, slide in enumerate(prs.slides): - if has_two_lines_on_page(slide): - return 1 - return 0 + # check if the font properties are the same + if run1.font.name != run2.font.name or run1.font.size != run2.font.size or run1.font.bold != run2.font.bold or run1.font.italic != run2.font.italic or run1.font.color.rgb != run2.font.color.rgb: + return False + + return True + + +def check_strikethrough(pptx_path, rules): + # Load the presentation + presentation = Presentation(pptx_path) + + slide_index_s = rules["slide_index_s"] + shape_index_s = rules["shape_index_s"] + paragraph_index_s = rules["paragraph_index_s"] + + for slide_index in slide_index_s: + # Get the slide + slide = presentation.slides[slide_index] + + for shape_index in shape_index_s: + # Get the text box + paragraphs = slide.shapes[shape_index].text_frame.paragraphs + + for paragraph_index in paragraph_index_s: + paragraph = paragraphs[paragraph_index] + run = paragraph.runs[0] + if 'strike' not in run.font._element.attrib: + return False + + return True -def check_file_exists(directory, filename): - file_path = os.path.join(directory, filename) - return 1 if os.path.isfile(file_path) else 0 def has_audio_on_page(slide): for shape in slide.shapes: @@ -77,6 +132,7 @@ def has_audio_on_page(slide): return True return False + def check_for_audio(prs): prs = Presentation(prs) for i, slide in enumerate(prs.slides): @@ -84,16 +140,39 @@ def check_for_audio(prs): return 1 return 0 + def check_formula_shape(prs): prs = Presentation(prs) slide = prs.slides[13] - + for shape in slide.shapes: if shape.has_text_frame and shape.shape_type == 1: return 1 return 0 + +def check_slide_orientation_Portrait(pptx_path): + presentation = Presentation(pptx_path) + + slide_height = presentation.slide_height + slide_width = presentation.slide_width + + if slide_width < slide_height: + return 1 + return 0 + + +def contains_mp4_video(pptx_path): + prs = Presentation(pptx_path) + for slide in prs.slides: + for shape in slide.shapes: + if shape.shape_type == 16: + if shape.media_type == 3: + return 1 + return 0 + + if __name__ == "__main__": path1 = "../../任务数据/LibreOffice Impress/Change_Color_Slide_Number_gold_textbox.pptx" presentation = Presentation(path1) diff --git a/desktop_env/evaluators/metrics/vscode.py b/desktop_env/evaluators/metrics/vscode.py index ac98d72..55f6407 100644 --- a/desktop_env/evaluators/metrics/vscode.py +++ b/desktop_env/evaluators/metrics/vscode.py @@ -1,5 +1,44 @@ from typing import Dict +import json +def check_json_keybindings(actual: str, expected: str, **options) -> float: + """ + Args: + actual (str): path to result text file + expected (str): expected dict{} 
+ + Return: + float: the score + """ + + with open(actual) as f: + data = json.load(f) + + if expected in data: + return 1.0 + else: + return 0.0 + +def check_json_settings(actual: str, expected: str, **options) -> float: + """ + Args: + actual (str): path to result text file + expected (str): expected dict{} + + Return: + float: the score + """ + + with open(actual) as f: + data = json.load(f) + + expect = set(expected.items()) + json = set(data.items()) + + if expect.issubset(json): + return 1.0 + else: + return 0.0 def compare_text_file(actual: str, expected: str, **options) -> float: """ diff --git a/evaluation_examples/examples/libreoffice_impress/39478d4a-1049-456f-aa77-407811393add.json b/evaluation_examples/examples/libreoffice_impress/39478d4a-1049-456f-aa77-407811393add.json index e5ee5d6..6ba438f 100644 --- a/evaluation_examples/examples/libreoffice_impress/39478d4a-1049-456f-aa77-407811393add.json +++ b/evaluation_examples/examples/libreoffice_impress/39478d4a-1049-456f-aa77-407811393add.json @@ -9,7 +9,7 @@ "parameters": { "files": [ { - "url": "https://drive.usercontent.google.com/download?id=1WT1-L0iiIlF2kuIK77IDxTfBaQ0X0BbX&export=download&authuser=0&confirm=t&uuid=0b69767e-1f3e-49ce-88a7-1036ef25bcaf&at=APZUnTXZ_sqEZUrHNx1edWep017b:1705337750065", + "url": "https://drive.usercontent.google.com/download?id=1WT1-L0iiIlF2kuIK77IDxTfBaQ0X0BbX&export=download&authuser=0&confirm=t&uuid=3daac1dc-0f6e-449b-b6bc-09fd246697aa&at=APZUnTVgf_yEeeaARnUISIE4wr4E:1705768410739", "path": "Desktop/Ch5.pptx" } ] diff --git a/evaluation_examples/examples/libreoffice_impress/3b27600c-3668-4abd-8f84-7bcdebbccbdb.json b/evaluation_examples/examples/libreoffice_impress/3b27600c-3668-4abd-8f84-7bcdebbccbdb.json index 784913c..615cabc 100644 --- a/evaluation_examples/examples/libreoffice_impress/3b27600c-3668-4abd-8f84-7bcdebbccbdb.json +++ b/evaluation_examples/examples/libreoffice_impress/3b27600c-3668-4abd-8f84-7bcdebbccbdb.json @@ -1,12 +1,42 @@ { "id": "3b27600c-3668-4abd-8f84-7bcdebbccbdb", "snapshot": "libreoffice_impress", - "instruction": "Could you help me change the background color to blue 2 and apply it to all my slides.", + "instruction": "Please make the background blue on all my slides.", "source": "https://www.libreofficehelp.com/change-slide-background-impress/#All_Slides", - "config": [], + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1aHMJzk2G8B_EqDlTAZLEiJ4h-ZsgA9UE&export=download&authuser=0&confirm=t&uuid=196a082d-5f08-4b3e-a64f-c021351f9cd8&at=APZUnTUXH4gvLvElvm9TtFhUJlIn:1705481007789", + "path": "Desktop/lec17-gui-events.pptx" + } + ] + } + }, + { + "type": "open", + "parameters": { + "path": "Desktop/lec17-gui-events.pptx" + } + } + ], "trajectory": "trajectories/", "related_apps": [ - "" + "libreoffice_impress" ], - "evaluator": "evaluation_dir" -} + "evaluator": { + "func": "compare_pptx_files", + "expected": { + "type": "cloud_file", + "path": "https://drive.usercontent.google.com/download?id=1LU-wnmIqMQgwkdAUFBLE1wNkH4gSl3IR&export=download&authuser=0&confirm=t&uuid=74520405-4028-4fbe-bab8-d56dc82ffb6c&at=APZUnTU0dz5ZE5CcQry8IeY5_s1J:1705481009686", + "dest": "lec17-gui-events_Gold.docx" + }, + "result": { + "type": "vm_file", + "path": "Desktop/lec17-gui-events.pptx", + "dest": "lec17-gui-events.pptx" + } + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/libreoffice_impress/455d3c66-7dc6-4537-a39a-36d3e9119df7.json 
b/evaluation_examples/examples/libreoffice_impress/455d3c66-7dc6-4537-a39a-36d3e9119df7.json index 0a5ebb9..db194b0 100644 --- a/evaluation_examples/examples/libreoffice_impress/455d3c66-7dc6-4537-a39a-36d3e9119df7.json +++ b/evaluation_examples/examples/libreoffice_impress/455d3c66-7dc6-4537-a39a-36d3e9119df7.json @@ -27,8 +27,16 @@ "libreoffice_impress" ], "evaluator": { - "func": "check_file_exists", - "file_name": "res.png", - "directory": "/home/user/Desktop/" + "func": "compare_images", + "expected": { + "type": "cloud_file", + "path": "https://drive.usercontent.google.com/download?id=1XTDaQ2NlovrusKkuY6udi_BQfLwSP9th&export=download&authuser=0&confirm=t&uuid=d3c7883e-3cea-4bf3-8f83-d878622ee76d&at=APZUnTXQEnT0Gi4rB0oegvVGheyn:1705859805154", + "dest": "res_gold.png" + }, + "result": { + "type": "vm_file", + "path": "/home/user/Desktop/res.png", + "dest": "res.png" + } } } \ No newline at end of file diff --git a/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json b/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json index 9897ead..6c0e229 100644 --- a/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json +++ b/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json @@ -2,7 +2,7 @@ "id": "550ce7e7-747b-495f-b122-acdc4d0b8e54", "snapshot": "libreoffice_impress", "instruction": "I am checking our soccer club's to-do list for the last semester and adding strike-through sign on the line we have already accomplished. Could you help me add a strike-through on the first and second line?", - "source": "https://superuser.com/questions/1211035/libreoffice-impress-animations-how-to-strikethrough-on-click?rq=1", + "source": "https://technical-tips.com/blog/software/text-in-libreoffice-strikethrough--6948#:~:text=To%20strikethrough%20Text%20in%20LibreOffice%201%20In%20your,effect%22%20can%20your%20additionally%2C%20for%20example%2C%20double%20underline.", "config": [ { "type": "download", @@ -27,7 +27,15 @@ "libreoffice_impress" ], "evaluator": { - "func": "check_for_two_lines", + "func": "check_strikethrough", + "expected": { + "type": "rule", + "rules": { + "slide_index_s": [4], + "shape_index_s": [1], + "paragraph_index_s": [1, 2] + } + }, "result": { "type": "vm_file", "path": "Desktop/New_Club_Spring_2018_Training.pptx", diff --git a/evaluation_examples/examples/libreoffice_impress/5d901039-a89c-4bfb-967b-bf66f4df075e.json b/evaluation_examples/examples/libreoffice_impress/5d901039-a89c-4bfb-967b-bf66f4df075e.json index 6c1f0f0..773ef8c 100644 --- a/evaluation_examples/examples/libreoffice_impress/5d901039-a89c-4bfb-967b-bf66f4df075e.json +++ b/evaluation_examples/examples/libreoffice_impress/5d901039-a89c-4bfb-967b-bf66f4df075e.json @@ -9,7 +9,7 @@ "parameters": { "files": [ { - "url": "https://drive.usercontent.google.com/download?id=16K6TpGIRZpqOJUu-mtJQ_78kIwLcn-4D&export=download&authuser=0&confirm=t&uuid=945b6f33-53d2-4e87-ada9-efa8b938a499&at=APZUnTVw4fKyJPW0vAAJURruAJIP:1705250184439", + "url": "https://drive.usercontent.google.com/download?id=16K6TpGIRZpqOJUu-mtJQ_78kIwLcn-4D&export=download&authuser=0&confirm=t&uuid=41509e5c-eb95-453a-baad-4e12a839a120&at=APZUnTVygE_LL27vx1l6OEg_FRj0:1705849959413", "path": "Desktop/CPD_Background_Investigation_Process.pptx" } ] @@ -27,11 +27,11 @@ "libreoffice_impress" ], "evaluator": { - "func": "compare_pptx_files", + "func": "check_image_stretch_and_center", "expected": { "type": "cloud_file", - 
"path": "https://drive.usercontent.google.com/download?id=1rsvFPyHYiIPh1c8Nj8say0NJCG2VIDr7&export=download&authuser=0&confirm=t&uuid=aac08a92-6595-47d8-84dc-8f1ab1df987f&at=APZUnTXIWCn5B0CpLttvG2bsr_a7:1705250423565", - "dest": "CPD_Background_Investigation_Process_Gold.docx" + "path": "https://drive.usercontent.google.com/download?id=16K6TpGIRZpqOJUu-mtJQ_78kIwLcn-4D&export=download&authuser=0&confirm=t&uuid=41509e5c-eb95-453a-baad-4e12a839a120&at=APZUnTVygE_LL27vx1l6OEg_FRj0:1705849959413", + "dest": "CPD_Background_Investigation_Process_Original.pptx" }, "result": { "type": "vm_file", diff --git a/evaluation_examples/examples/libreoffice_impress/a097acff-6266-4291-9fbd-137af7ecd439.json b/evaluation_examples/examples/libreoffice_impress/a097acff-6266-4291-9fbd-137af7ecd439.json index 45330bf..3553fdd 100644 --- a/evaluation_examples/examples/libreoffice_impress/a097acff-6266-4291-9fbd-137af7ecd439.json +++ b/evaluation_examples/examples/libreoffice_impress/a097acff-6266-4291-9fbd-137af7ecd439.json @@ -1,12 +1,34 @@ { "id": "a097acff-6266-4291-9fbd-137af7ecd439", "snapshot": "libreoffice_impress", - "instruction": "Could you help me save my slides to SAVE_PATH?", + "instruction": "Could you help me save my slides as pre.pptx on the Desktop?", "source": "https://www.youtube.com/watch?v=DDmEvjs4iBw", - "config": [], + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1i_-m6mVrdesGJ392bulH5lveHarMwKk_&export=download&authuser=0&confirm=t&uuid=057973d3-52b7-45ac-8151-b2c6a1820f49&at=APZUnTU5SYajgO-YrxdDWSiJRfD4:1705768888387", + "path": "Desktop/Secrets-of-Monetizing-Video.pptx" + } + ] + } + }, + { + "type": "open", + "parameters": { + "path": "Desktop/Secrets-of-Monetizing-Video.pptx" + } + } + ], "trajectory": "trajectories/", "related_apps": [ - "" + "libreoffice_impress" ], - "evaluator": "evaluation_dir" -} + "evaluator": { + "func": "check_file_exists", + "file_name": "pre.pptx", + "directory": "/home/user/Desktop/" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/libreoffice_impress/ac9bb6cb-1888-43ab-81e4-a98a547918cd.json b/evaluation_examples/examples/libreoffice_impress/ac9bb6cb-1888-43ab-81e4-a98a547918cd.json index 7a8cb20..8b5dc98 100644 --- a/evaluation_examples/examples/libreoffice_impress/ac9bb6cb-1888-43ab-81e4-a98a547918cd.json +++ b/evaluation_examples/examples/libreoffice_impress/ac9bb6cb-1888-43ab-81e4-a98a547918cd.json @@ -6,7 +6,7 @@ "config": [], "trajectory": "trajectories/", "related_apps": [ - "" + "libreoffice_impress" ], "evaluator": "evaluation_dir" } diff --git a/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json b/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json index db71d72..a5213fe 100644 --- a/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json +++ b/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json @@ -1,8 +1,8 @@ { "id": "af23762e-2bfd-4a1d-aada-20fa8de9ce07", "snapshot": "libreoffice_impress", - "instruction": "I am making PPT on LibreOffice Impress for presentation tomorrow. I need to summarize contents on one slide. Could you make a summary slide for me?", - "source": "https://superuser.com/questions/1059080/how-to-make-a-summary-slide-in-impress-listing-the-titles-of-all-slides-autom", + "instruction": "I am making PPT on LibreOffice Impress for presentation tomorrow. 
I need to summarize contents on one slide use Impress \"Summary Slide\" feature. Could you make that for me?", + "source": "https://www.libreofficehelp.com/export-libreoffice-impress-slides-images/#:~:text=Exporting%20a%20single%20slide%20as.jpg%2C.png%2C%20etc%20image%20is,on%20the%20checkbox%20Selection.%20Provide%20jpg%20quality%20options.", "config": [ { "type": "download", diff --git a/evaluation_examples/examples/libreoffice_impress/ce88f674-ab7a-43da-9201-468d38539e4a.json b/evaluation_examples/examples/libreoffice_impress/ce88f674-ab7a-43da-9201-468d38539e4a.json index 1149fbb..7f3b272 100644 --- a/evaluation_examples/examples/libreoffice_impress/ce88f674-ab7a-43da-9201-468d38539e4a.json +++ b/evaluation_examples/examples/libreoffice_impress/ce88f674-ab7a-43da-9201-468d38539e4a.json @@ -1,12 +1,37 @@ { "id": "ce88f674-ab7a-43da-9201-468d38539e4a", "snapshot": "libreoffice_impress", - "instruction": "Could you help me change my slides to portrait (from landscape)?", + "instruction": "Please set my slides upright instead of sideways.", "source": "https://justclickhere.co.uk/resources/change-slides-in-impress-to-portrait/", - "config": [], + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1LErTnC_w_YPQVo84QK5sifww9xZ-Cq0X&export=download&authuser=0&confirm=t&uuid=81ff0aaf-9c2e-4342-b7ce-36e65dd2218e&at=APZUnTUmQKCTp2HUP0dOqYqD10G3:1705479016156", + "path": "Desktop/AM_Last_Page_Template.pptx" + } + ] + } + }, + { + "type": "open", + "parameters": { + "path": "Desktop/AM_Last_Page_Template.pptx" + } + } + ], "trajectory": "trajectories/", "related_apps": [ - "" + "libreoffice_impress" ], - "evaluator": "evaluation_dir" -} + "evaluator": { + "func": "check_slide_orientation_Portrait", + "result": { + "type": "vm_file", + "path": "Desktop/AM_Last_Page_Template.pptx", + "dest": "AM_Last_Page_Template.pptx" + } + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/libreoffice_impress/f0a334af-f91b-4c03-b578-aac9bec2b543.json b/evaluation_examples/examples/libreoffice_impress/f0a334af-f91b-4c03-b578-aac9bec2b543.json index 86342e6..3411aa8 100644 --- a/evaluation_examples/examples/libreoffice_impress/f0a334af-f91b-4c03-b578-aac9bec2b543.json +++ b/evaluation_examples/examples/libreoffice_impress/f0a334af-f91b-4c03-b578-aac9bec2b543.json @@ -1,12 +1,48 @@ { "id": "f0a334af-f91b-4c03-b578-aac9bec2b543", "snapshot": "libreoffice_impress", - "instruction": "Help me insert the video at VIDEO_PATH in the current slide.", + "instruction": "Insert the video Movie_countdown_2.mov on the Desktop into my current slide, please.", "source": "https://www.libreofficehelp.com/insert-video-impress-presentation/#Inserting_a_Video_in_Impress", - "config": [], + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1vvRkrxOK_sFPX9PLFniFqrdNEZ2pQnPP&export=download&authuser=0&confirm=t&uuid=71964a12-2d0a-4c71-9375-2f9ec15de1ad&at=APZUnTX_B-T2GeZPS7ZmchMQ6E7m:1705481285721", + "path": "Desktop/Movie_activities_TURKEY.pptx" + } + ] + } + }, + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1S4lACtBb40Ff0DEjB2bG2tzr2qWwQLGd&export=download&authuser=0&confirm=t&uuid=a28c123e-5371-4e17-82c2-ed7b1f05b728&at=APZUnTW_rlUPV6mM4RjS0R6dMSv4:1705469776913", + "path": "Desktop/Movie_countdown_2.mov" + } + ] + } + }, + { + "type": "open", + "parameters": { + "path": 
"Desktop/Movie_activities_TURKEY.pptx" + } + } + ], "trajectory": "trajectories/", "related_apps": [ - "" + "libreoffice_impress" ], - "evaluator": "evaluation_dir" -} + "evaluator": { + "func": "contains_mp4_video", + "result": { + "type": "vm_file", + "path": "Desktop/Movie_activities_TURKEY.pptx", + "dest": "Movie_activities_TURKEY.pptx" + } + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json b/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json index 32d7570..14ce3cd 100644 --- a/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json +++ b/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json @@ -1,7 +1,7 @@ { "id": "59f21cfb-0120-4326-b255-a5b827b38967", "snapshot": "base_setup", - "instruction": "Could you play the music video that's saved on my desktop for me?", + "instruction": "Could you play the music video that's saved on my desktop for me via vlc?", "source": "https://docs.videolan.me/vlc-user/desktop/3.0/en/basic/media.html#playing-a-file", "config": [ { diff --git a/evaluation_examples/examples/vs_code/276cc624-87ea-4f08-ab93-f770e3790175.json b/evaluation_examples/examples/vs_code/276cc624-87ea-4f08-ab93-f770e3790175.json new file mode 100644 index 0000000..629dbe7 --- /dev/null +++ b/evaluation_examples/examples/vs_code/276cc624-87ea-4f08-ab93-f770e3790175.json @@ -0,0 +1,40 @@ +{ + "id": "276cc624-87ea-4f08-ab93-f770e3790175", + "snapshot": "vscode", + "instruction": "Could you help me set the line length to 50 characters for current user in VS Code?", + "source": "https://www.quora.com/unanswered/How-do-you-set-the-line-length-in-Visual-Studio-Code", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "code" + ] + } + }, + { + "type": "activate_window", + "parameters": { + "window_name": "Visual Studio Code" + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "vscode" + ], + "evaluator": { + "func": "check_json_settings", + "expected": { + "type": "rule", + "rules": { + "expect": {"editor.rulers": [50]} + } + }, + "result": { + "type": "vm_file", + "path": "/home/user/.config/Code/User/settings.json", + "dest": "settings.json" + } + } +} diff --git a/evaluation_examples/examples/vs_code/3486f395-ad68-459c-8c39-ea07de934dd4.json b/evaluation_examples/examples/vs_code/3486f395-ad68-459c-8c39-ea07de934dd4.json deleted file mode 100644 index d99df8c..0000000 --- a/evaluation_examples/examples/vs_code/3486f395-ad68-459c-8c39-ea07de934dd4.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "id": "3486f395-ad68-459c-8c39-ea07de934dd4", - "snapshot": "vscode", - "instruction": "Find me the keyboard shortcut of toggling integrated terminal. 
", - "source": "https://www.youtube.com/watch?v=VqCgcpAypFQ", - "config": [], - "trajectory": "trajectories/3486f395-ad68-459c-8c39-ea07de934dd4", - "related_apps": [ - "vscode" - ], - "evaluator": "evaluation_dir" -} diff --git a/evaluation_examples/examples/vs_code/4e60007a-f5be-4bfc-9723-c39affa0a6d3.json b/evaluation_examples/examples/vs_code/4e60007a-f5be-4bfc-9723-c39affa0a6d3.json new file mode 100644 index 0000000..b36a7c8 --- /dev/null +++ b/evaluation_examples/examples/vs_code/4e60007a-f5be-4bfc-9723-c39affa0a6d3.json @@ -0,0 +1,46 @@ +{ + "id": "4e60007a-f5be-4bfc-9723-c39affa0a6d3", + "snapshot": "vscode", + "instruction": "Install autoDocstring extension.", + "source": "https://campbell-muscle-lab.github.io/howtos_Python/pages/documentation/best_practices/vscode_docstring_extension/vscode_docstring_extension.html#:~:text=Type%2C%20Ctrl%20%2B%20Shift%20%2B%20P,select%20the%20NumPy%20docstring%20format.", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "code" + ] + } + }, + { + "type": "activate_window", + "parameters": { + "window_name": "Visual Studio Code" + } + } + ], + "trajectory": "trajectories/4e60007a-f5be-4bfc-9723-c39affa0a6d3", + "related_apps": [ + "vscode" + ], + "evaluator": { + "func": "is_extension_installed", + "result": { + "type": "vm_command_line", + "command": [ + "code", + "--list-extensions", + "|", + "grep", + "njpwerner.autodocstring" + ] + }, + "expected": { + "type": "rule", + "rules": { + "type": "contain", + "expected": "njpwerner.autodocstring" + } + } + } +} diff --git a/evaluation_examples/examples/vs_code/515630d2-9b30-430c-b06a-e86b0143f7fb.json b/evaluation_examples/examples/vs_code/515630d2-9b30-430c-b06a-e86b0143f7fb.json deleted file mode 100644 index 8f8bb5c..0000000 --- a/evaluation_examples/examples/vs_code/515630d2-9b30-430c-b06a-e86b0143f7fb.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "id": "515630d2-9b30-430c-b06a-e86b0143f7fb", - "snapshot": "vscode", - "instruction": "Help me enable automatically run code in VS code", - "source": "https://www.quora.com/How-do-I-automatically-run-code-in-Visual-Studio-Code", - "config": [], - "trajectory": "trajectories/", - "related_apps": [ - "vscode" - ], - "evaluator": "evaluation_dir" -} diff --git a/evaluation_examples/examples/vs_code/59ed65c7-e9a6-43db-833f-76d6730c0004.json b/evaluation_examples/examples/vs_code/59ed65c7-e9a6-43db-833f-76d6730c0004.json index 956138d..86250c7 100644 --- a/evaluation_examples/examples/vs_code/59ed65c7-e9a6-43db-833f-76d6730c0004.json +++ b/evaluation_examples/examples/vs_code/59ed65c7-e9a6-43db-833f-76d6730c0004.json @@ -37,7 +37,7 @@ "expected": { "type": "rule", "rules": { - "expect": "100" + "expect": "1" } }, "result": { diff --git a/evaluation_examples/examples/vs_code/6f7546b0-52f3-4938-9213-52f35454d314.json b/evaluation_examples/examples/vs_code/6f7546b0-52f3-4938-9213-52f35454d314.json deleted file mode 100644 index 1dcb21e..0000000 --- a/evaluation_examples/examples/vs_code/6f7546b0-52f3-4938-9213-52f35454d314.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "id": "6f7546b0-52f3-4938-9213-52f35454d314", - "snapshot": "vscode", - "instruction": "Help me ask chatGPT to generate html and css code for a scroll bar?", - "source": "https://www.tiktok.com/@akramovdev/video/7243349980897922306", - "config": [], - "trajectory": "trajectories/", - "related_apps": [ - "vscode" - ], - "evaluator": "evaluation_dir" -} diff --git a/evaluation_examples/examples/vs_code/90f6eeeb-f3c2-4c98-873c-e77d78a45578.json 
b/evaluation_examples/examples/vs_code/90f6eeeb-f3c2-4c98-873c-e77d78a45578.json deleted file mode 100644 index 955efa2..0000000 --- a/evaluation_examples/examples/vs_code/90f6eeeb-f3c2-4c98-873c-e77d78a45578.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "id": "90f6eeeb-f3c2-4c98-873c-e77d78a45578", - "snapshot": "vscode", - "instruction": "Help me sync extensions and settings across all profiles.", - "source": "https://stackoverflow.com/questions/75866801/how-do-i-sync-extensions-and-their-settings-between-vs-code-profiles", - "config": [], - "trajectory": "trajectories/", - "related_apps": [ - "vscode" - ], - "evaluator": "evaluation_dir" -} diff --git a/evaluation_examples/examples/vs_code/930fdb3b-11a8-46fe-9bac-577332e2640e.json b/evaluation_examples/examples/vs_code/930fdb3b-11a8-46fe-9bac-577332e2640e.json new file mode 100644 index 0000000..a66ecdd --- /dev/null +++ b/evaluation_examples/examples/vs_code/930fdb3b-11a8-46fe-9bac-577332e2640e.json @@ -0,0 +1,45 @@ +{ + "id": "930fdb3b-11a8-46fe-9bac-577332e2640e", + "snapshot": "vscode", + "instruction": "I want to create a shortcut to shift my focus cursor from terminal to Editor in VS Code. Please help me create this shortcut to be 'ctrl+j'.", + "source": "https://superuser.com/questions/1270103/how-to-switch-the-cursor-between-terminal-and-code-in-vscode", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "code" + ] + } + }, + { + "type": "activate_window", + "parameters": { + "window_name": "Visual Studio Code" + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "vscode" + ], + "evaluator": { + "func": "check_json_keybindings", + "expected": { + "type": "rule", + "rules": { + "expect": + { + "key": "ctrl+j", + "command": "workbench.action.focusActiveEditorGroup", + "when": "terminalFocus" + } + } + }, + "result": { + "type": "vm_file", + "path": "/home/user/.config/Code/User/keybindings.json", + "dest": "keybindings.json" + } + } +} diff --git a/evaluation_examples/examples/vs_code/9439a27b-18ae-42d8-9778-5f68f891805e.json b/evaluation_examples/examples/vs_code/9439a27b-18ae-42d8-9778-5f68f891805e.json new file mode 100644 index 0000000..2f848b2 --- /dev/null +++ b/evaluation_examples/examples/vs_code/9439a27b-18ae-42d8-9778-5f68f891805e.json @@ -0,0 +1,40 @@ +{ + "id": "9439a27b-18ae-42d8-9778-5f68f891805e", + "snapshot": "vscode", + "instruction": "I want to keep my cursor focus in debug console when debugging in VS Code, instead of focusing back to Editor. 
So please help me modify the setting of VS Code accordingly.", + "source": "https://stackoverflow.com/questions/75832474/how-to-keep-cursor-in-debug-console-when-debugging-in-visual-studio-code", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "code" + ] + } + }, + { + "type": "activate_window", + "parameters": { + "window_name": "Visual Studio Code" + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "vscode" + ], + "evaluator": { + "func": "check_json_settings", + "expected": { + "type": "rule", + "rules": { + "expect": {"debug.focusEditorOnBreak": false} + } + }, + "result": { + "type": "vm_file", + "path": "/home/user/.config/Code/User/settings.json", + "dest": "settings.json" + } + } +} diff --git a/evaluation_examples/examples/vs_code/9d425400-e9b2-4424-9a4b-d4c7abac4140.json b/evaluation_examples/examples/vs_code/9d425400-e9b2-4424-9a4b-d4c7abac4140.json new file mode 100644 index 0000000..080bdf1 --- /dev/null +++ b/evaluation_examples/examples/vs_code/9d425400-e9b2-4424-9a4b-d4c7abac4140.json @@ -0,0 +1,40 @@ +{ + "id": "9d425400-e9b2-4424-9a4b-d4c7abac4140", + "snapshot": "vscode", + "instruction": "I want to make tabs wrapped over multiple lines when exceeding available space, please help modify the setting of VS Code.", + "source": "https://superuser.com/questions/1466771/is-there-a-way-to-make-editor-tabs-stack-in-vs-code", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "code" + ] + } + }, + { + "type": "activate_window", + "parameters": { + "window_name": "Visual Studio Code" + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "vscode" + ], + "evaluator": { + "func": "check_json_settings", + "expected": { + "type": "rule", + "rules": { + "expect": {"workbench.editor.wrapTabs": true} + } + }, + "result": { + "type": "vm_file", + "path": "/home/user/.config/Code/User/settings.json", + "dest": "settings.json" + } + } +} diff --git a/evaluation_examples/examples/vs_code/ae506c68-352c-4094-9caa-ee9d42052317.json b/evaluation_examples/examples/vs_code/ae506c68-352c-4094-9caa-ee9d42052317.json new file mode 100644 index 0000000..6bac7ce --- /dev/null +++ b/evaluation_examples/examples/vs_code/ae506c68-352c-4094-9caa-ee9d42052317.json @@ -0,0 +1,39 @@ +{ + "id": "ae506c68-352c-4094-9caa-ee9d42052317", + "snapshot": "vscode", + "instruction": "Could you store the full terminal history of my VS Code terminal into '/home/user/Desktop/history.txt'?", + "source": "", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "code" + ] + } + }, + { + "type": "activate_window", + "parameters": { + "window_name": "Visual Studio Code" + } + } + ], + "trajectory": "trajectories/ae506c68-352c-4094-9caa-ee9d42052317", + "related_apps": [ + "vscode" + ], + "evaluator": { + "func": "compare_text_file", + "expected": { + "type": "cloud_file", + "path": "", + "dest": "gold_history.txt" + }, + "result": { + "type": "vm_file", + "path": "Desktop/history.txt", + "dest": "history.txt" + } + } +} diff --git a/evaluation_examples/examples/vs_code/e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2.json b/evaluation_examples/examples/vs_code/e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2.json new file mode 100644 index 0000000..306871a --- /dev/null +++ b/evaluation_examples/examples/vs_code/e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2.json @@ -0,0 +1,42 @@ +{ + "id": "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2", + "snapshot": "vscode", + "instruction": "I want to disable the missing imports reporting of python error, please modify the setting of 
VS Code for me.", + "source": "https://superuser.com/questions/1386061/how-to-suppress-some-python-errors-warnings-in-vs-code", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "code" + ] + } + }, + { + "type": "activate_window", + "parameters": { + "window_name": "Visual Studio Code" + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "vscode" + ], + "evaluator": { + "func": "check_json_settings", + "expected": { + "type": "rule", + "rules": { + "expect": { + "python.analysis.diagnosticSeverityOverrides": {"reportMissingImports": "none"} + } + } + }, + "result": { + "type": "vm_file", + "path": "/home/user/.config/Code/User/settings.json", + "dest": "settings.json" + } + } +} diff --git a/evaluation_examples/examples/vs_code/ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae.json b/evaluation_examples/examples/vs_code/ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae.json new file mode 100644 index 0000000..f2667e9 --- /dev/null +++ b/evaluation_examples/examples/vs_code/ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae.json @@ -0,0 +1,47 @@ +{ + "id": "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae", + "snapshot": "vscode", + "instruction": "I want to remove the shortcut 'cmd+f' for Tree view Find (Explorer search) in VS Code explorer view due to shortcut conflict. Can you help me remove this shortcut?", + "source": ["https://superuser.com/questions/1748097/vs-code-disable-tree-view-find-explorer-search", + "https://superuser.com/questions/1417361/how-to-disable-file-filtering-in-vs-code-sidebar-explorer?rq=1" + ], + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "code" + ] + } + }, + { + "type": "activate_window", + "parameters": { + "window_name": "Visual Studio Code" + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "vscode" + ], + "evaluator": { + "func": "check_json_keybindings", + "expected": { + "type": "rule", + "rules": { + "expect": + { + "key": "cmd+f", + "command": "-list.find", + "when": "listFocus && listSupportsFind" + } + } + }, + "result": { + "type": "vm_file", + "path": "/home/user/.config/Code/User/keybindings.json", + "dest": "keybindings.json" + } + } +} diff --git a/experiment_a11y_tree.py b/experiment_a11y_tree.py new file mode 100644 index 0000000..728d0de --- /dev/null +++ b/experiment_a11y_tree.py @@ -0,0 +1,141 @@ +import datetime +import json +import logging +import os +import sys + +from desktop_env.envs.desktop_env import DesktopEnv +from mm_agents.gpt_4v_agent import GPT4v_Agent + +# Logger Configs {{{ # +logger = logging.getLogger() +logger.setLevel(logging.DEBUG) + +datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + +file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8") +debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8") +stdout_handler = logging.StreamHandler(sys.stdout) +sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8") + +file_handler.setLevel(logging.INFO) +debug_handler.setLevel(logging.DEBUG) +stdout_handler.setLevel(logging.INFO) +sdebug_handler.setLevel(logging.DEBUG) + +formatter = logging.Formatter( + fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s") +file_handler.setFormatter(formatter) +debug_handler.setFormatter(formatter) +stdout_handler.setFormatter(formatter) +sdebug_handler.setFormatter(formatter) + 
+stdout_handler.addFilter(logging.Filter("desktopenv")) +sdebug_handler.addFilter(logging.Filter("desktopenv")) + +logger.addHandler(file_handler) +logger.addHandler(debug_handler) +logger.addHandler(stdout_handler) +logger.addHandler(sdebug_handler) +# }}} Logger Configs # + +logger = logging.getLogger("desktopenv.experiment") + +PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" + + +def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True): + trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json") + env = DesktopEnv( + path_to_vm=PATH_TO_VM, + action_space=agent.action_space, + task_config=example + ) + # reset the environment to certain snapshot + observation = env.reset() + done = False + step_num = 0 + + if recording: + # send a request to the server to start recording + env.controller.start_recording() + + while not done and step_num < max_steps: + with open("accessibility_tree.xml", "w", encoding="utf-8") as f: + f.write(observation["accessibility_tree"]) + actions = agent.predict(observation) + step_num += 1 + for action in actions: + # Capture the timestamp before executing the action + action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + logger.info("Step %d: %s", step_num, action) + + observation, reward, done, info = env.step(action) + + logger.info("Reward: %.2f", reward) + logger.info("Done: %s", done) + logger.info("Info: %s", info) + + # Save screenshot and trajectory information + with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f: + with open(observation['screenshot'], "rb") as __f: + screenshot = __f.read() + _f.write(screenshot) + + with open(trajectory_recording_path, "a") as f: + f.write(json.dumps({ + "step_num": step_num, + "action_timestamp": action_timestamp, + "action": action, + "reward": reward, + "done": done, + "info": info, + "screenshot_file": f"step_{step_num}_{action_timestamp}.png" + })) + f.write("\n") + + if done: + logger.info("The episode is done.") + break + + if recording: + # send a request to the server to stop recording + env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) + + result = env.evaluate() + logger.info("Result: %.2f", result) + + # env.close() + logger.info("Environment closed.") + + +if __name__ == "__main__": + action_space = "pyautogui" + example_class = "chrome" + example_id = "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3" + gpt4_model = "gpt-4-vision-preview" + gemini_model = "gemini-pro-vision" + + logger.info("Running example %s/%s", example_class, example_id) + logger.info("Using model %s", gpt4_model) + # logger.info("Using model %s", gemini_model) + + with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: + example = json.load(f) + example["snapshot"] = "exp_setup4" + + api_key = os.environ.get("OPENAI_API_KEY") + agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], + action_space=action_space, exp="a11y_tree") + + # api_key = os.environ.get("GENAI_API_KEY") + # agent = GeminiPro_Agent(api_key=api_key, model=gemini_model, instruction=example['instruction'], action_space=action_space, exp="a11y_tree") + + root_trajectory_dir = "exp_trajectory" + + example_trajectory_dir = os.path.join(root_trajectory_dir, "a11y_tree", example_class, gpt4_model, example_id) + # example_trajectory_dir = os.path.join(root_trajectory_dir, "a11y_tree", example_class, gemini_model, 
example_id) + + os.makedirs(example_trajectory_dir, exist_ok=True) + + run_one_example(example, agent, 15, example_trajectory_dir) diff --git a/experiment.py b/experiment_screenshot.py similarity index 87% rename from experiment.py rename to experiment_screenshot.py index 8e7f8b5..6d82730 100644 --- a/experiment.py +++ b/experiment_screenshot.py @@ -113,20 +113,28 @@ if __name__ == "__main__": action_space = "pyautogui" example_class = "thunderbird" example_id = "bb5e4c0d-f964-439c-97b6-bdb9747de3f4" + gpt4_model = "gpt-4-vision-preview" + gemini_model = "gemini-pro-vision" + + logger.info("Running example %s/%s", example_class, example_id) + logger.info("Using model %s", gpt4_model) + # logger.info("Using model %s", gemini_model) with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: example = json.load(f) example["snapshot"] = "exp_setup2" # api_key = os.environ.get("OPENAI_API_KEY") - # agent = GPT4v_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) + # agent = GPT4v_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space, exp="screenshot") api_key = os.environ.get("GENAI_API_KEY") - agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) + agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space, exp="screenshot") root_trajectory_dir = "exp_trajectory" - example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id) + example_trajectory_dir = os.path.join(root_trajectory_dir, "a11y_tree", example_class, gpt4_model, example_id) + # example_trajectory_dir = os.path.join(root_trajectory_dir, "a11y_tree", example_class, gemini_model, example_id) + os.makedirs(example_trajectory_dir, exist_ok=True) - run_one_example(example, agent, 10, example_trajectory_dir) + run_one_example(example, agent, 15, example_trajectory_dir) diff --git a/experiment_screenshot_a11y_tree.py b/experiment_screenshot_a11y_tree.py new file mode 100644 index 0000000..60c81b6 --- /dev/null +++ b/experiment_screenshot_a11y_tree.py @@ -0,0 +1,139 @@ +import datetime +import json +import logging +import os +import sys + +from desktop_env.envs.desktop_env import DesktopEnv +from mm_agents.gpt_4v_agent import GPT4v_Agent + +# Logger Configs {{{ # +logger = logging.getLogger() +logger.setLevel(logging.DEBUG) + +datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + +file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8") +debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8") +stdout_handler = logging.StreamHandler(sys.stdout) +sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8") + +file_handler.setLevel(logging.INFO) +debug_handler.setLevel(logging.DEBUG) +stdout_handler.setLevel(logging.INFO) +sdebug_handler.setLevel(logging.DEBUG) + +formatter = logging.Formatter( + fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s") +file_handler.setFormatter(formatter) +debug_handler.setFormatter(formatter) +stdout_handler.setFormatter(formatter) +sdebug_handler.setFormatter(formatter) + +stdout_handler.addFilter(logging.Filter("desktopenv")) +sdebug_handler.addFilter(logging.Filter("desktopenv")) + +logger.addHandler(file_handler) 
+logger.addHandler(debug_handler) +logger.addHandler(stdout_handler) +logger.addHandler(sdebug_handler) +# }}} Logger Configs # + +logger = logging.getLogger("desktopenv.experiment") + +PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" + + +def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True): + trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json") + env = DesktopEnv( + path_to_vm=PATH_TO_VM, + action_space=agent.action_space, + task_config=example + ) + # reset the environment to certain snapshot + observation = env.reset() + done = False + step_num = 0 + + if recording: + # send a request to the server to start recording + env.controller.start_recording() + + while not done and step_num < max_steps: + actions = agent.predict(observation) + step_num += 1 + for action in actions: + # Capture the timestamp before executing the action + action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + logger.info("Step %d: %s", step_num, action) + + observation, reward, done, info = env.step(action) + + logger.info("Reward: %.2f", reward) + logger.info("Done: %s", done) + logger.info("Info: %s", info) + + # Save screenshot and trajectory information + with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f: + with open(observation['screenshot'], "rb") as __f: + screenshot = __f.read() + _f.write(screenshot) + + with open(trajectory_recording_path, "a") as f: + f.write(json.dumps({ + "step_num": step_num, + "action_timestamp": action_timestamp, + "action": action, + "reward": reward, + "done": done, + "info": info, + "screenshot_file": f"step_{step_num}_{action_timestamp}.png" + })) + f.write("\n") + + if done: + logger.info("The episode is done.") + break + + if recording: + # send a request to the server to stop recording + env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) + + result = env.evaluate() + logger.info("Result: %.2f", result) + + # env.close() + logger.info("Environment closed.") + + +if __name__ == "__main__": + action_space = "pyautogui" + example_class = "chrome" + example_id = "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3" + gpt4_model = "gpt-4-vision-preview" + gemini_model = "gemini-pro-vision" + + logger.info("Running example %s/%s", example_class, example_id) + logger.info("Using model %s", gpt4_model) + # logger.info("Using model %s", gemini_model) + + with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: + example = json.load(f) + example["snapshot"] = "exp_setup4" + + api_key = os.environ.get("OPENAI_API_KEY") + agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], + action_space=action_space, exp="both") + + # api_key = os.environ.get("GENAI_API_KEY") + # agent = GeminiPro_Agent(api_key=api_key, model=gemini_model, instruction=example['instruction'], action_space=action_space, exp="both") + + root_trajectory_dir = "exp_trajectory" + + example_trajectory_dir = os.path.join(root_trajectory_dir, "both", example_class, gpt4_model, example_id) + # example_trajectory_dir = os.path.join(root_trajectory_dir, "both", example_class, gemini_model, example_id) + + os.makedirs(example_trajectory_dir, exist_ok=True) + + run_one_example(example, agent, 15, example_trajectory_dir) diff --git a/experiment_pure_text.py b/experiment_screenshot_seeact.py similarity index 91% rename from experiment_pure_text.py rename to 
experiment_screenshot_seeact.py index cfcbd46..b718693 100644 --- a/experiment_pure_text.py +++ b/experiment_screenshot_seeact.py @@ -5,8 +5,7 @@ import os import sys from desktop_env.envs.desktop_env import DesktopEnv -from mm_agents.gpt_4_agent import GPT4_Agent -from mm_agents.gemini_pro_agent import GeminiPro_Agent +from mm_agents.gpt_4v_agent import GPT4v_Agent # Logger Configs {{{ # logger = logging.getLogger() @@ -111,8 +110,8 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr if __name__ == "__main__": action_space = "pyautogui" example_class = "chrome" - example_id = "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263" - gpt4_model = "gpt-4-1106-preview" + example_id = "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3" + gpt4_model = "gpt-4-vision-preview" gemini_model = "gemini-pro-vision" with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: @@ -120,15 +119,16 @@ if __name__ == "__main__": example["snapshot"] = "exp_setup4" api_key = os.environ.get("OPENAI_API_KEY") - agent = GPT4_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], action_space=action_space) + agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], + action_space=action_space, exp="seeact") # api_key = os.environ.get("GENAI_API_KEY") # agent = GeminiPro_Agent(api_key=api_key, model=gemini_model, instruction=example['instruction'], action_space=action_space) root_trajectory_dir = "exp_trajectory" - example_trajectory_dir = os.path.join(root_trajectory_dir, "text", example_class, gpt4_model, example_id) - # example_trajectory_dir = os.path.join(root_trajectory_dir, "text", example_class, gemini_model, example_id) + example_trajectory_dir = os.path.join(root_trajectory_dir, "seeact", example_class, gpt4_model, example_id) + # example_trajectory_dir = os.path.join(root_trajectory_dir, "seeact", example_class, gemini_model, example_id) os.makedirs(example_trajectory_dir, exist_ok=True) diff --git a/experiment_screenshot_som.py b/experiment_screenshot_som.py new file mode 100644 index 0000000..2a64bb3 --- /dev/null +++ b/experiment_screenshot_som.py @@ -0,0 +1,135 @@ +import datetime +import json +import logging +import os +import sys + +from desktop_env.envs.desktop_env import DesktopEnv +from mm_agents.gpt_4v_agent import GPT4v_Agent + +# Logger Configs {{{ # +logger = logging.getLogger() +logger.setLevel(logging.DEBUG) + +datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + +file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8") +debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8") +stdout_handler = logging.StreamHandler(sys.stdout) +sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8") + +file_handler.setLevel(logging.INFO) +debug_handler.setLevel(logging.DEBUG) +stdout_handler.setLevel(logging.INFO) +sdebug_handler.setLevel(logging.DEBUG) + +formatter = logging.Formatter( + fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s") +file_handler.setFormatter(formatter) +debug_handler.setFormatter(formatter) +stdout_handler.setFormatter(formatter) +sdebug_handler.setFormatter(formatter) + +stdout_handler.addFilter(logging.Filter("desktopenv")) +sdebug_handler.addFilter(logging.Filter("desktopenv")) + +logger.addHandler(file_handler) 
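Note on the `run_one_example` helpers in these experiment scripts: each executed action is appended to `trajectory.json` as one JSON object per line, so the file is JSON Lines rather than a single JSON array. A minimal sketch of reading such a trajectory back; the helper name is ours and not part of this diff:

```python
import json
from typing import Any, Dict, List


def load_trajectory(path: str) -> List[Dict[str, Any]]:
    """Parse a JSON-lines trajectory.json written by run_one_example.

    Each record carries: step_num, action_timestamp, action, reward, done,
    info, and screenshot_file (a file name relative to the trajectory directory).
    """
    steps = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                steps.append(json.loads(line))
    return steps
```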
+logger.addHandler(debug_handler) +logger.addHandler(stdout_handler) +logger.addHandler(sdebug_handler) +# }}} Logger Configs # + +logger = logging.getLogger("desktopenv.experiment") + +PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" + + +def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True): + trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json") + env = DesktopEnv( + path_to_vm=PATH_TO_VM, + action_space=agent.action_space, + task_config=example + ) + # reset the environment to certain snapshot + observation = env.reset() + done = False + step_num = 0 + + if recording: + # send a request to the server to start recording + env.controller.start_recording() + + while not done and step_num < max_steps: + actions = agent.predict(observation) + step_num += 1 + for action in actions: + # Capture the timestamp before executing the action + action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + logger.info("Step %d: %s", step_num, action) + + observation, reward, done, info = env.step(action) + + logger.info("Reward: %.2f", reward) + logger.info("Done: %s", done) + logger.info("Info: %s", info) + + # Save screenshot and trajectory information + with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f: + with open(observation['screenshot'], "rb") as __f: + screenshot = __f.read() + _f.write(screenshot) + + with open(trajectory_recording_path, "a") as f: + f.write(json.dumps({ + "step_num": step_num, + "action_timestamp": action_timestamp, + "action": action, + "reward": reward, + "done": done, + "info": info, + "screenshot_file": f"step_{step_num}_{action_timestamp}.png" + })) + f.write("\n") + + if done: + logger.info("The episode is done.") + break + + if recording: + # send a request to the server to stop recording + env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) + + result = env.evaluate() + logger.info("Result: %.2f", result) + + # env.close() + logger.info("Environment closed.") + + +if __name__ == "__main__": + action_space = "pyautogui" + example_class = "chrome" + example_id = "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3" + gpt4_model = "gpt-4-vision-preview" + gemini_model = "gemini-pro-vision" + + with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: + example = json.load(f) + example["snapshot"] = "exp_setup4" + + api_key = os.environ.get("OPENAI_API_KEY") + agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], + action_space=action_space, exp="som") + + # api_key = os.environ.get("GENAI_API_KEY") + # agent = GeminiPro_Agent(api_key=api_key, model=gemini_model, instruction=example['instruction'], action_space=action_space) + + root_trajectory_dir = "exp_trajectory" + + example_trajectory_dir = os.path.join(root_trajectory_dir, "som", example_class, gpt4_model, example_id) + # example_trajectory_dir = os.path.join(root_trajectory_dir, "som", example_class, gemini_model, example_id) + + os.makedirs(example_trajectory_dir, exist_ok=True) + + run_one_example(example, agent, 15, example_trajectory_dir) diff --git a/mm_agents/SoM_agent.py b/mm_agents/SoM_agent.py deleted file mode 100644 index e3b3e59..0000000 --- a/mm_agents/SoM_agent.py +++ /dev/null @@ -1,283 +0,0 @@ -# fixme: Need to be rewrite on new action space - -import os -import re -import base64 -import PIL.Image -import json -import requests - -import torch 
-import argparse - -# seem -from seem.modeling.BaseModel import BaseModel as BaseModel_Seem -from seem.utils.distributed import init_distributed as init_distributed_seem -from seem.modeling import build_model as build_model_seem -from task_adapter.seem.tasks import inference_seem_pano - -# semantic sam -from semantic_sam.BaseModel import BaseModel -from semantic_sam import build_model -from semantic_sam.utils.dist import init_distributed_mode -from semantic_sam.utils.arguments import load_opt_from_config_file -from semantic_sam.utils.constants import COCO_PANOPTIC_CLASSES -from task_adapter.semantic_sam.tasks import inference_semsam_m2m_auto, prompt_switch - -# sam -from segment_anything import sam_model_registry -from task_adapter.sam.tasks.inference_sam_m2m_auto import inference_sam_m2m_auto - -from scipy.ndimage import label -from io import BytesIO -import numpy as np - -SYS_PROMPT = ''' -You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection. -For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image. - -Firstly you need to predict the class of your action, select from one below: -- **CLICK**: click on the screen with the specified integer label -- **TYPE**: type a string on the keyboard - -- For CLICK, you need to predict the correct integer label shown on the screenshot -for example, format as: -``` -{ - "action_type": "CLICK", - "label": 7 -} -``` -- For TYPE, you need to specify the text you want to type -for example, format as: -``` -{ - "action_type": "TYPE", - "text": "hello world" -} -``` - -For every step, you should only return the action_type and the parameters of your action as a dict, without any other things. You MUST wrap the dict with backticks (\`). -You can predict multiple actions at one step, but you should only return one action for each step. -You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty. 
-''' - -# build args -semsam_cfg = "configs/semantic_sam_only_sa-1b_swinL.yaml" -seem_cfg = "configs/seem_focall_unicl_lang_v1.yaml" - -semsam_ckpt = "./swinl_only_sam_many2many.pth" -sam_ckpt = "./sam_vit_h_4b8939.pth" -seem_ckpt = "./seem_focall_v1.pt" - -opt_semsam = load_opt_from_config_file(semsam_cfg) -opt_seem = load_opt_from_config_file(seem_cfg) -opt_seem = init_distributed_seem(opt_seem) - -# build model -model_semsam = BaseModel(opt_semsam, build_model(opt_semsam)).from_pretrained(semsam_ckpt).eval().cuda() -model_sam = sam_model_registry["vit_h"](checkpoint=sam_ckpt).eval().cuda() -model_seem = BaseModel_Seem(opt_seem, build_model_seem(opt_seem)).from_pretrained(seem_ckpt).eval().cuda() - -with torch.no_grad(): - with torch.autocast(device_type='cuda', dtype=torch.float16): - model_seem.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(COCO_PANOPTIC_CLASSES + ["background"], is_eval=True) - -@torch.no_grad() -def inference(image, slider, mode, alpha, label_mode, anno_mode, *args, **kwargs): - if slider < 1.5: - model_name = 'seem' - elif slider > 2.5: - model_name = 'sam' - else: - model_name = 'semantic-sam' - if slider < 1.5 + 0.14: - level = [1] - elif slider < 1.5 + 0.28: - level = [2] - elif slider < 1.5 + 0.42: - level = [3] - elif slider < 1.5 + 0.56: - level = [4] - elif slider < 1.5 + 0.70: - level = [5] - elif slider < 1.5 + 0.84: - level = [6] - else: - level = [6, 1, 2, 3, 4, 5] - - if label_mode == 'Alphabet': - label_mode = 'a' - else: - label_mode = '1' - - text_size, hole_scale, island_scale = 1280, 100, 100 - text, text_part, text_thresh = '', '', '0.0' - - with torch.autocast(device_type='cuda', dtype=torch.float16): - semantic = False - - if model_name == 'semantic-sam': - model = model_semsam - output, mask = inference_semsam_m2m_auto(model, image, level, text, text_part, text_thresh, text_size, hole_scale, island_scale, semantic, label_mode=label_mode, alpha=alpha, anno_mode=anno_mode, *args, **kwargs) - - elif model_name == 'sam': - model = model_sam - output, mask = inference_sam_m2m_auto(model, image, text_size, label_mode, alpha, anno_mode) - - elif model_name == 'seem': - model = model_seem - output, mask = inference_seem_pano(model, image, text_size, label_mode, alpha, anno_mode) - - return output, mask - -# Function to encode the image -def encode_image(image): - pil_img = PIL.Image.fromarray(image) - buff = BytesIO() - pil_img.save(buff, format="JPEG") - new_image_string = base64.b64encode(buff.getvalue()).decode("utf-8") - return new_image_string - -def parse_actions_from_string(input_string): - # Search for a JSON string within the input string - actions = [] - matches = re.findall(r'```json\s+(.*?)\s+```', input_string, re.DOTALL) - if matches: - # Assuming there's only one match, parse the JSON string into a dictionary - try: - for match in matches: - action_dict = json.loads(match) - actions.append(action_dict) - return actions - except json.JSONDecodeError as e: - return f"Failed to parse JSON: {e}" - else: - matches = re.findall(r'```\s+(.*?)\s+```', input_string, re.DOTALL) - if matches: - # Assuming there's only one match, parse the JSON string into a dictionary - try: - for match in matches: - action_dict = json.loads(match) - actions.append(action_dict) - return actions - except json.JSONDecodeError as e: - return f"Failed to parse JSON: {e}" - else: - try: - action_dict = json.loads(input_string) - return [action_dict] - except json.JSONDecodeError as e: - raise ValueError("Invalid response format: " + input_string) - -class 
GPT4v_Agent: - def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300): - self.instruction = instruction - self.model = model - self.max_tokens = max_tokens - - self.headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {api_key}" - } - - self.trajectory = [ - { - "role": "system", - "content": [ - { - "type": "text", - "text": SYS_PROMPT - }, - ] - } - ] - - def predict(self, obs): - obs, mask = inference(obs, slider=3.0, mode="Automatic", alpha=0.1, label_mode="Number", anno_mode=["Mark", "Box"]) - PIL.Image.fromarray(obs).save("desktop.jpeg") - base64_image = encode_image(obs) - self.trajectory.append({ - "role": "user", - "content": [ - { - "type": "text", - "text": "What's the next step for instruction '{}'?".format(self.instruction) - }, - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{base64_image}" - } - } - ] - }) - traj_to_show = [] - for i in range(len(self.trajectory)): - traj_to_show.append(self.trajectory[i]["content"][0]["text"]) - if len(self.trajectory[i]["content"]) > 1: - traj_to_show.append("screenshot_obs") - print("Trajectory:", traj_to_show) - payload = { - "model": self.model, - "messages": self.trajectory, - "max_tokens": self.max_tokens - } - response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload) - - try: - actions = self.parse_actions(response.json()['choices'][0]['message']['content'], mask) - except: - print("Failed to parse action from response:", response.json()['choices'][0]['message']['content']) - actions = None - - return actions - - def parse_actions(self, response: str, mask): - # response example - """ - ```json - { - "action_type": "CLICK", - "click_type": "RIGHT" - } - ``` - """ - - # parse from the response - actions = parse_actions_from_string(response) - print(actions) - - # add action into the trajectory - self.trajectory.append({ - "role": "assistant", - "content": [ - { - "type": "text", - "text": response - }, - ] - }) - - # parse action - parsed_actions = [] - for action in actions: - action_type = action['action_type'] - if action_type == "CLICK": - label = int(action['label']) - x, y, w, h = mask[label-1]['bbox'] - parsed_actions.append({"action_type": action_type, "x": int(x + w//2) , "y": int(y + h//2)}) - - if action_type == "TYPE": - parsed_actions.append({"action_type": action_type, "text": action["text"]}) - - return parsed_actions - - -if __name__ == '__main__': - # OpenAI API Key - api_key = os.environ.get("OPENAI_API_KEY") - - agent = GPT4v_Agent(api_key=api_key, instruction="Open Firefox") - obs = PIL.Image.open('desktop.png') - print(agent.predict(obs=obs)) \ No newline at end of file diff --git a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py index d6f83eb..47bbca0 100644 --- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py +++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py @@ -41,10 +41,12 @@ def filter_nodes(nodes): elif node.tag == 'text': continue else: - coords = tuple(map(int, node.attrib.get('{uri:deskat:component.at-spi.gnome.org}screencoord').strip('()').split(', '))) + coords = tuple( + map(int, node.attrib.get('{uri:deskat:component.at-spi.gnome.org}screencoord').strip('()').split(', '))) if coords[0] < 0 or coords[1] < 0: continue - size = tuple(map(int, node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size').strip('()').split(', '))) + size = tuple( + map(int, 
node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size').strip('()').split(', '))) if size[0] <= 0 or size[1] <= 0: continue # Node is not a 'panel', add to the list. @@ -57,17 +59,20 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path): # Load the screenshot image image = Image.open(image_file_path) draw = ImageDraw.Draw(image) + marks = [] + drew_nodes = [] - # Optional: Load a font. If you don't specify a font, a default one will be used. try: # Adjust the path to the font file you have or use a default one - font = ImageFont.truetype("arial.ttf", 20) + font = ImageFont.truetype("arial.ttf", 15) except IOError: # Fallback to a basic font if the specified font can't be loaded font = ImageFont.load_default() + index = 1 + # Loop over all the visible nodes and draw their bounding boxes - for index, _node in enumerate(nodes): + for _node in nodes: coords_str = _node.attrib.get('{uri:deskat:component.at-spi.gnome.org}screencoord') size_str = _node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') @@ -88,15 +93,45 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path): if bottom_right[0] < coords[0] or bottom_right[1] < coords[1]: raise ValueError(f"Invalid coordinates or size, coords: {coords}, size: {size}") - # Draw rectangle on image - draw.rectangle([coords, bottom_right], outline="red", width=2) + # Check if the area only contains one color + cropped_image = image.crop((*coords, *bottom_right)) + if len(set(list(cropped_image.getdata()))) == 1: + continue - # Draw index number at the bottom left of the bounding box + # Draw rectangle on image + draw.rectangle([coords, bottom_right], outline="red", width=1) + + # Draw index number at the bottom left of the bounding box with black background text_position = (coords[0], bottom_right[1]) # Adjust Y to be above the bottom right - draw.text(text_position, str(index), font=font, fill="purple") + draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black') + draw.text(text_position, str(index), font=font, fill="white") + index += 1 + + # each mark is an x, y, w, h tuple + marks.append([coords[0], coords[1], size[0], size[1]]) + drew_nodes.append(_node) except ValueError as e: pass # Save the result image.save(output_image_file_path) + return marks, drew_nodes + + +def print_nodes_with_indent(nodes, indent=0): + for node in nodes: + print(' ' * indent, node.tag, node.attrib) + print_nodes_with_indent(node, indent + 2) + + +if __name__ == '__main__': + with open('chrome_desktop_example_1.xml', 'r', encoding='utf-8') as f: + xml_file_str = f.read() + filtered_nodes = filter_nodes(find_leaf_nodes(xml_file_str)) + print(len(filtered_nodes)) + masks = draw_bounding_boxes(filtered_nodes, 'screenshot.png', + 'chrome_desktop_example_1_tagged_remove.png', ) + + # print(masks) + print(len(masks)) diff --git a/mm_agents/gemini_pro_agent.py b/mm_agents/gemini_pro_agent.py index 26f9c0e..ce84488 100644 --- a/mm_agents/gemini_pro_agent.py +++ b/mm_agents/gemini_pro_agent.py @@ -1,3 +1,5 @@ +# todo: needs to be refactored + import time from typing import Dict, List diff --git a/mm_agents/gemini_pro_vision_agent.py b/mm_agents/gemini_pro_vision_agent.py index 2d5d365..4a537db 100644 --- a/mm_agents/gemini_pro_vision_agent.py +++ b/mm_agents/gemini_pro_vision_agent.py @@ -1,3 +1,5 @@ +# todo: needs to be refactored + import time from typing import Dict, List diff --git a/mm_agents/gpt_4_agent.py b/mm_agents/gpt_4_agent.py deleted file mode 100644 index aa19185..0000000 --- 
a/mm_agents/gpt_4_agent.py +++ /dev/null @@ -1,195 +0,0 @@ -import base64 -import json -import re -import time -from typing import Dict, List - -import requests - -from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes -from mm_agents.gpt_4_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION -from mm_agents.gpt_4_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE - - -# Function to encode the image -def encode_image(image_path): - with open(image_path, "rb") as image_file: - return base64.b64encode(image_file.read()).decode('utf-8') - - -def parse_actions_from_string(input_string): - # Search for a JSON string within the input string - actions = [] - matches = re.findall(r'```json\s+(.*?)\s+```', input_string, re.DOTALL) - if matches: - # Assuming there's only one match, parse the JSON string into a dictionary - try: - for match in matches: - action_dict = json.loads(match) - actions.append(action_dict) - return actions - except json.JSONDecodeError as e: - return f"Failed to parse JSON: {e}" - else: - matches = re.findall(r'```\s+(.*?)\s+```', input_string, re.DOTALL) - if matches: - # Assuming there's only one match, parse the JSON string into a dictionary - try: - for match in matches: - action_dict = json.loads(match) - actions.append(action_dict) - return actions - except json.JSONDecodeError as e: - return f"Failed to parse JSON: {e}" - else: - try: - action_dict = json.loads(input_string) - return [action_dict] - except json.JSONDecodeError as e: - raise ValueError("Invalid response format: " + input_string) - - -def parse_code_from_string(input_string): - # This regular expression will match both ```code``` and ```python code``` - # and capture the `code` part. It uses a non-greedy match for the content inside. - pattern = r"```(?:\w+\s+)?(.*?)```" - # Find all non-overlapping matches in the string - matches = re.findall(pattern, input_string, re.DOTALL) - - # The regex above captures the content inside the triple backticks. - # The `re.DOTALL` flag allows the dot `.` to match newline characters as well, - # so the code inside backticks can span multiple lines. - - # matches now contains all the captured code snippets - - codes = [] - - for match in matches: - match = match.strip() - commands = ['WAIT', 'DONE', 'FAIL'] # fixme: updates this part when we have more commands - - if match in commands: - codes.append(match.strip()) - elif match.split('\n')[-1] in commands: - if len(match.split('\n')) > 1: - codes.append("\n".join(match.split('\n')[:-1])) - codes.append(match.split('\n')[-1]) - else: - codes.append(match) - - return codes - - -class GPT4_Agent: - def __init__(self, api_key, instruction, model="gpt-4-1106-preview", max_tokens=600, action_space="computer_13"): - self.instruction = instruction - self.model = model - self.max_tokens = max_tokens - self.action_space = action_space - - self.headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {api_key}" - } - - self.trajectory = [ - { - "role": "system", - "content": [ - { - "type": "text", - "text": { - "computer_13": SYS_PROMPT_ACTION, - "pyautogui": SYS_PROMPT_CODE - }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction) - }, - ] - } - ] - - def predict(self, obs: Dict) -> List: - """ - Predict the next action(s) based on the current observation. 
- """ - accessibility_tree = obs["accessibility_tree"] - - leaf_nodes = find_leaf_nodes(accessibility_tree) - filtered_nodes = filter_nodes(leaf_nodes) - - linearized_accessibility_tree = "tag\ttext\tposition\tsize\n" - # Linearize the accessibility tree nodes into a table format - - for node in filtered_nodes: - linearized_accessibility_tree += node.tag + "\t" - linearized_accessibility_tree += node.attrib.get('name') + "\t" - linearized_accessibility_tree += node.attrib.get( - '{uri:deskat:component.at-spi.gnome.org}screencoord') + "\t" - linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + "\n" - - self.trajectory.append({ - "role": "user", - "content": [ - { - "type": "text", - "text": "Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format( - linearized_accessibility_tree) - } - ] - }) - - # print( - # "Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format( - # linearized_accessibility_tree) - # ) - - traj_to_show = [] - for i in range(len(self.trajectory)): - traj_to_show.append(self.trajectory[i]["content"][0]["text"]) - if len(self.trajectory[i]["content"]) > 1: - traj_to_show.append("screenshot_obs") - - payload = { - "model": self.model, - "messages": self.trajectory, - "max_tokens": self.max_tokens - } - - while True: - try: - response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, - json=payload) - break - except: - print("Failed to generate response, retrying...") - time.sleep(5) - pass - - try: - actions = self.parse_actions(response.json()['choices'][0]['message']['content']) - except: - print("Failed to parse action from response:", response.json()) - actions = None - - return actions - - def parse_actions(self, response: str): - # parse from the response - if self.action_space == "computer_13": - actions = parse_actions_from_string(response) - elif self.action_space == "pyautogui": - actions = parse_code_from_string(response) - else: - raise ValueError("Invalid action space: " + self.action_space) - - # add action into the trajectory - self.trajectory.append({ - "role": "assistant", - "content": [ - { - "type": "text", - "text": response - }, - ] - }) - - return actions diff --git a/mm_agents/gpt_4_prompt_action.py b/mm_agents/gpt_4_prompt_action.py deleted file mode 100644 index 3019074..0000000 --- a/mm_agents/gpt_4_prompt_action.py +++ /dev/null @@ -1,244 +0,0 @@ -SYS_PROMPT = """ -You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection. -For each step, you will get an observation of the desktop by the XML format of accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the accessibility tree. 
- -HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters: -ACTION_SPACE = [ - { - "action_type": "MOVE_TO", - "note": "move the cursor to the specified position", - "parameters": { - "x": { - "type": float, - "range": [0, X_MAX], - "optional": False, - }, - "y": { - "type": float, - "range": [0, Y_MAX], - "optional": False, - } - } - }, - { - "action_type": "CLICK", - "note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position", - "parameters": { - "button": { - "type": str, - "range": ["left", "right", "middle"], - "optional": True, - }, - "x": { - "type": float, - "range": [0, X_MAX], - "optional": True, - }, - "y": { - "type": float, - "range": [0, Y_MAX], - "optional": True, - }, - "num_clicks": { - "type": int, - "range": [1, 2, 3], - "optional": True, - }, - } - }, - { - "action_type": "MOUSE_DOWN", - "note": "press the left button if the button not specified, otherwise press the specified button", - "parameters": { - "button": { - "type": str, - "range": ["left", "right", "middle"], - "optional": True, - } - } - }, - { - "action_type": "MOUSE_UP", - "note": "release the left button if the button not specified, otherwise release the specified button", - "parameters": { - "button": { - "type": str, - "range": ["left", "right", "middle"], - "optional": True, - } - } - }, - { - "action_type": "RIGHT_CLICK", - "note": "right click at the current position if x and y are not specified, otherwise right click at the specified position", - "parameters": { - "x": { - "type": float, - "range": [0, X_MAX], - "optional": True, - }, - "y": { - "type": float, - "range": [0, Y_MAX], - "optional": True, - } - } - }, - { - "action_type": "DOUBLE_CLICK", - "note": "double click at the current position if x and y are not specified, otherwise double click at the specified position", - "parameters": { - "x": { - "type": float, - "range": [0, X_MAX], - "optional": True, - }, - "y": { - "type": float, - "range": [0, Y_MAX], - "optional": True, - } - } - }, - { - "action_type": "DRAG_TO", - "note": "drag the cursor to the specified position with the left button pressed", - "parameters": { - "x": { - "type": float, - "range": [0, X_MAX], - "optional": False, - }, - "y": { - "type": float, - "range": [0, Y_MAX], - "optional": False, - } - } - }, - { - "action_type": "SCROLL", - "note": "scroll the mouse wheel up or down", - "parameters": { - "dx": { - "type": int, - "range": None, - "optional": False, - }, - "dy": { - "type": int, - "range": None, - "optional": False, - } - } - }, - { - "action_type": "TYPING", - "note": "type the specified text", - "parameters": { - "text": { - "type": str, - "range": None, - "optional": False, - } - } - }, - { - "action_type": "PRESS", - "note": "press the specified key and release it", - "parameters": { - "key": { - "type": str, - "range": KEYBOARD_KEYS, - "optional": False, - } - } - }, - { - "action_type": "KEY_DOWN", - "note": "press the specified key", - "parameters": { - "key": { - "type": str, - "range": KEYBOARD_KEYS, - "optional": False, - } - } - }, - { - "action_type": "KEY_UP", - "note": "release the specified key", - "parameters": { - "key": { - "type": str, - "range": KEYBOARD_KEYS, - "optional": False, - } - } - }, - { - "action_type": "HOTKEY", - "note": "press the specified key combination", - "parameters": { - "keys": { - "type": list, - 
"range": [KEYBOARD_KEYS], - "optional": False, - } - } - }, - ############################################################################################################ - { - "action_type": "WAIT", - "note": "wait until the next action", - }, - { - "action_type": "FAIL", - "note": "decide the task can not be performed", - }, - { - "action_type": "DONE", - "note": "decide the task is done", - } -] -Firstly you need to predict the class of your action, then you need to predict the parameters of your action: -- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080) -for example, format as: -``` -{ - "action_type": "MOUSE_MOVE", - "x": 1319.11, - "y": 65.06 -} -``` -- For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse: -for example, format as: -``` -{ - "action_type": "CLICK", - "click_type": "LEFT" -} -``` -- For [KEY, KEY_DOWN, KEY_UP], you need to choose a(multiple) key(s) from the keyboard -for example, format as: -``` -{ - "action_type": "KEY", - "key": "ctrl+c" -} -``` -- For TYPE, you need to specify the text you want to type -for example, format as: -``` -{ - "action_type": "TYPE", - "text": "hello world" -} -``` - -REMEMBER: -For every step, you should only RETURN ME THE action_type AND parameters I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. -You MUST wrap the dict with backticks (\`). -You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty. -You CAN predict multiple actions at one step, but you should only return one action for each step. -""" \ No newline at end of file diff --git a/mm_agents/gpt_4_prompt_code.py b/mm_agents/gpt_4_prompt_code.py deleted file mode 100644 index 25e4083..0000000 --- a/mm_agents/gpt_4_prompt_code.py +++ /dev/null @@ -1,18 +0,0 @@ -SYS_PROMPT = """ -You are an agent which follow my instruction and perform desktop computer tasks as instructed. -You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard. -For each step, you will get an observation of the desktop by the XML format of accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the accessibility tree. - -You are required to use `pyautogui` to perform the action. -Return one line or multiple lines of python code to perform the action each time, be time efficient. -You ONLY need to return the code inside a code block, like this: -```python -# your code here -``` -Specially, it is also allowed to return the following special code: -When you think you have to wait for some time, return ```WAIT```; -When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task; -When you think the task is done, return ```DONE```. - -First give the current screenshot and previous things we did a reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. 
-""" \ No newline at end of file diff --git a/mm_agents/gpt_4v_agent.py b/mm_agents/gpt_4v_agent.py index d594b76..10995b6 100644 --- a/mm_agents/gpt_4v_agent.py +++ b/mm_agents/gpt_4v_agent.py @@ -1,13 +1,26 @@ import base64 import json +import os import re -import time +import uuid from typing import Dict, List +import backoff import requests +from openai.error import ( + APIConnectionError, + APIError, + RateLimitError, + ServiceUnavailableError, + InvalidRequestError +) -from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION -from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE +from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes, draw_bounding_boxes +from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION, \ + SYS_PROMPT_IN_A11Y_OUT_CODE, SYS_PROMPT_IN_A11Y_OUT_ACTION, \ + SYS_PROMPT_IN_BOTH_OUT_CODE, SYS_PROMPT_IN_BOTH_OUT_ACTION, \ + SYS_PROMPT_IN_SOM_A11Y_OUT_TAG, \ + SYS_PROMPT_SEEACT, ACTION_DESCRIPTION_PROMPT_SEEACT, ACTION_GROUNDING_PROMPT_SEEACT # Function to encode the image @@ -16,6 +29,35 @@ def encode_image(image_path): return base64.b64encode(image_file.read()).decode('utf-8') +def linearize_accessibility_tree(accessibility_tree): + leaf_nodes = find_leaf_nodes(accessibility_tree) + filtered_nodes = filter_nodes(leaf_nodes) + + linearized_accessibility_tree = "tag\ttext\tposition\tsize\n" + # Linearize the accessibility tree nodes into a table format + + for node in filtered_nodes: + linearized_accessibility_tree += node.tag + "\t" + linearized_accessibility_tree += node.attrib.get('name') + "\t" + linearized_accessibility_tree += node.attrib.get( + '{uri:deskat:component.at-spi.gnome.org}screencoord') + "\t" + linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + "\n" + + return linearized_accessibility_tree + + +def tag_screenshot(screenshot, accessibility_tree): + # Creat a tmp file to store the screenshot in random name + uuid_str = str(uuid.uuid4()) + os.makedirs("tmp/images", exist_ok=True) + tagged_screenshot_file_path = os.path.join("tmp/images", uuid_str + ".png") + nodes = filter_nodes(find_leaf_nodes(accessibility_tree)) + # Make tag screenshot + marks, drew_nodes = draw_bounding_boxes(nodes, screenshot, tagged_screenshot_file_path) + + return marks, drew_nodes, tagged_screenshot_file_path + + def parse_actions_from_string(input_string): # Search for a JSON string within the input string actions = [] @@ -60,106 +102,424 @@ def parse_code_from_string(input_string): # so the code inside backticks can span multiple lines. 
# matches now contains all the captured code snippets - return matches + + codes = [] + + for match in matches: + match = match.strip() + commands = ['WAIT', 'DONE', 'FAIL'] # fixme: updates this part when we have more commands + + if match in commands: + codes.append(match.strip()) + elif match.split('\n')[-1] in commands: + if len(match.split('\n')) > 1: + codes.append("\n".join(match.split('\n')[:-1])) + codes.append(match.split('\n')[-1]) + else: + codes.append(match) + + return codes + + +def parse_code_from_som_string(input_string, masks): + # parse the output string by masks + mappings = [] + for i, mask in enumerate(masks): + x, y, w, h = mask + mappings.append(("tag#" + str(i + 1), "{}, {}".format(int(x + w // 2), int(y + h // 2)))) + + # reverse the mappings + for mapping in mappings[::-1]: + input_string = input_string.replace(mapping[0], mapping[1]) + + actions = parse_code_from_string(input_string) + return actions class GPT4v_Agent: - def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300, action_space="computer_13"): + def __init__( + self, + api_key, + instruction, + model="gpt-4-vision-preview", + max_tokens=500, + action_space="computer_13", + exp="screenshot_a11y_tree" + # exp can be in ["screenshot", "a11y_tree", "screenshot_a11y_tree", "som", "seeact"] + ): + self.instruction = instruction self.model = model self.max_tokens = max_tokens self.action_space = action_space + self.exp = exp + self.max_trajectory_length = 3 self.headers = { "Content-Type": "application/json", "Authorization": f"Bearer {api_key}" } - self.trajectory = [ - { - "role": "system", - "content": [ - { - "type": "text", - "text": { - "computer_13": SYS_PROMPT_ACTION, - "pyautogui": SYS_PROMPT_CODE - }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction) - }, - ] - } - ] + self.actions = [] + self.observations = [] + + if exp == "screenshot": + if action_space == "computer_13": + self.system_message = SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION + elif action_space == "pyautogui": + self.system_message = SYS_PROMPT_IN_SCREENSHOT_OUT_CODE + else: + raise ValueError("Invalid action space: " + action_space) + elif exp == "a11y_tree": + if action_space == "computer_13": + self.system_message = SYS_PROMPT_IN_A11Y_OUT_ACTION + elif action_space == "pyautogui": + self.system_message = SYS_PROMPT_IN_A11Y_OUT_CODE + else: + raise ValueError("Invalid action space: " + action_space) + elif exp == "both": + if action_space == "computer_13": + self.system_message = SYS_PROMPT_IN_BOTH_OUT_ACTION + elif action_space == "pyautogui": + self.system_message = SYS_PROMPT_IN_BOTH_OUT_CODE + else: + raise ValueError("Invalid action space: " + action_space) + elif exp == "som": + if action_space == "computer_13": + raise ValueError("Invalid action space: " + action_space) + elif action_space == "pyautogui": + self.system_message = SYS_PROMPT_IN_SOM_A11Y_OUT_TAG + else: + raise ValueError("Invalid action space: " + action_space) + elif exp == "seeact": + if action_space == "computer_13": + raise ValueError("Invalid action space: " + action_space) + elif action_space == "pyautogui": + self.system_message = SYS_PROMPT_SEEACT + else: + raise ValueError("Invalid action space: " + action_space) + else: + raise ValueError("Invalid experiment type: " + exp) + + self.system_message = self.system_message + "\nYou are asked to complete the following task: {}".format( + self.instruction) def predict(self, obs: Dict) -> List: """ Predict the next action(s) based on the current 
observation. """ - base64_image = encode_image(obs["screenshot"]) - self.trajectory.append({ - "role": "user", + + # Prepare the payload for the API call + messages = [] + masks = None + + messages.append({ + "role": "system", "content": [ { "type": "text", - "text": "What's the next step that you will do to help with the task?" + "text": self.system_message }, - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{base64_image}" - } - } ] }) - traj_to_show = [] - for i in range(len(self.trajectory)): - traj_to_show.append(self.trajectory[i]["content"][0]["text"]) - if len(self.trajectory[i]["content"]) > 1: - traj_to_show.append("screenshot_obs") + # Append trajectory + assert len(self.observations) == len(self.actions), "The number of observations and actions should be the same." - print("Trajectory:", traj_to_show) + if len(self.observations) > self.max_trajectory_length: + _observations = self.observations[-self.max_trajectory_length:] + _actions = self.actions[-self.max_trajectory_length:] + else: + _observations = self.observations + _actions = self.actions - payload = { + for previous_obs, previous_action in zip(_observations, _actions): + + if self.exp == "both": + _screenshot = previous_obs["screenshot"] + _linearized_accessibility_tree = previous_obs["accessibility_tree"] + + messages.append({ + "role": "user", + "content": [ + { + "type": "text", + "text": "Given the screenshot and info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format( + _linearized_accessibility_tree) + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{_screenshot}", + "detail": "high" + } + } + ] + }) + elif self.exp in ["som", "seeact"]: + _screenshot = previous_obs["screenshot"] + _linearized_accessibility_tree = previous_obs["accessibility_tree"] + + messages.append({ + "role": "user", + "content": [ + { + "type": "text", + "text": "Given the tagged screenshot and info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format( + _linearized_accessibility_tree) + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{_screenshot}", + "detail": "high" + } + } + ] + }) + elif self.exp == "screenshot": + _screenshot = previous_obs["screenshot"] + + messages.append({ + "role": "user", + "content": [ + { + "type": "text", + "text": "Given the screenshot as below. What's the next step that you will do to help with the task?" 
+ }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{_screenshot}", + "detail": "high" + } + } + ] + }) + elif self.exp == "a11y_tree": + _linearized_accessibility_tree = previous_obs["accessibility_tree"] + + messages.append({ + "role": "user", + "content": [ + { + "type": "text", + "text": "Given the info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format( + _linearized_accessibility_tree) + } + ] + }) + else: + raise ValueError("Invalid experiment type: " + self.exp) + + messages.append({ + "role": "assistant", + "content": [ + { + "type": "text", + "text": "\n".join(previous_action) if len(previous_action) > 0 else "No valid action" + }, + ] + }) + + if self.exp in ["screenshot", "both"]: + base64_image = encode_image(obs["screenshot"]) + linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"]) + + if self.exp == "both": + self.observations.append({ + "screenshot": base64_image, + "accessibility_tree": linearized_accessibility_tree + }) + else: + self.observations.append({ + "screenshot": base64_image, + "accessibility_tree": None + }) + + messages.append({ + "role": "user", + "content": [ + { + "type": "text", + "text": "Given the screenshot as below. What's the next step that you will do to help with the task?" + if self.exp == "screenshot" + else "Given the screenshot and info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format( + linearized_accessibility_tree) + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}", + "detail": "high" + } + } + ] + }) + elif self.exp == "a11y_tree": + linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"]) + + self.observations.append({ + "screenshot": None, + "accessibility_tree": linearized_accessibility_tree + }) + + messages.append({ + "role": "user", + "content": [ + { + "type": "text", + "text": "Given the info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format( + linearized_accessibility_tree) + } + ] + }) + elif self.exp == "som": + # Add som to the screenshot + masks, drew_nodes, tagged_screenshot = tag_screenshot(obs["screenshot"], obs["accessibility_tree"]) + base64_image = encode_image(tagged_screenshot) + linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"]) + + self.observations.append({ + "screenshot": base64_image, + "accessibility_tree": linearized_accessibility_tree + }) + + messages.append({ + "role": "user", + "content": [ + { + "type": "text", + "text": "Given the tagged screenshot and info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format( + linearized_accessibility_tree) + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}", + "detail": "high" + } + } + ] + }) + elif self.exp == "seeact": + # Add som to the screenshot + masks, drew_nodes, tagged_screenshot = tag_screenshot(obs["screenshot"], obs["accessibility_tree"]) + base64_image = encode_image(tagged_screenshot) + linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"]) + + self.observations.append({ + "screenshot": base64_image, + "accessibility_tree": linearized_accessibility_tree + }) + + messages.append({ + "role": "user", + 
"content": [ + { + "type": "text", + "text": ACTION_DESCRIPTION_PROMPT_SEEACT.format(linearized_accessibility_tree) + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}", + "detail": "high" + } + } + ] + }) + else: + raise ValueError("Invalid experiment type: " + self.exp) + + with open("messages.json", "w") as f: + f.write(json.dumps(messages, indent=4)) + + response = self.call_llm({ "model": self.model, - "messages": self.trajectory, + "messages": messages, "max_tokens": self.max_tokens - } + }) + + print(response) + + if self.exp == "seeact": + messages.append({ + "role": "assistant", + "content": [ + { + "type": "text", + "text": response + } + ] + }) + + messages.append({ + "role": "user", + "content": [ + { + "type": "text", + "text": "{}\n\nWhat's the next step that you will do to help with the task?".format( + ACTION_GROUNDING_PROMPT_SEEACT) + } + ] + }) + + response = self.call_llm({ + "model": self.model, + "messages": messages, + "max_tokens": self.max_tokens + }) + print(response) - while True: - try: - response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, - json=payload) - break - except: - print("Failed to generate response, retrying...") - time.sleep(5) - pass try: - actions = self.parse_actions(response.json()['choices'][0]['message']['content']) - except: - print("Failed to parse action from response:", response.json()) + actions = self.parse_actions(response, masks) + except Exception as e: + print("Failed to parse action from response", e) actions = None return actions - def parse_actions(self, response: str): - # parse from the response - if self.action_space == "computer_13": - actions = parse_actions_from_string(response) - elif self.action_space == "pyautogui": - actions = parse_code_from_string(response) + @backoff.on_exception( + backoff.expo, + (APIError, RateLimitError, APIConnectionError, ServiceUnavailableError, InvalidRequestError), + ) + def call_llm(self, payload): + response = requests.post( + "https://api.openai.com/v1/chat/completions", + headers=self.headers, + json=payload + ) + + if response.status_code != 200: + print("Failed to call LLM: " + response.text) + return "" else: - raise ValueError("Invalid action space: " + self.action_space) + return response.json()['choices'][0]['message']['content'] - # add action into the trajectory - self.trajectory.append({ - "role": "assistant", - "content": [ - { - "type": "text", - "text": response - }, - ] - }) + def parse_actions(self, response: str, masks=None): - return actions + if self.exp in ["screenshot", "a11y_tree", "both"]: + # parse from the response + if self.action_space == "computer_13": + actions = parse_actions_from_string(response) + elif self.action_space == "pyautogui": + actions = parse_code_from_string(response) + else: + raise ValueError("Invalid action space: " + self.action_space) + + self.actions.append(actions) + + return actions + elif self.exp in ["som", "seeact"]: + # parse from the response + if self.action_space == "computer_13": + raise ValueError("Invalid action space: " + self.action_space) + elif self.action_space == "pyautogui": + actions = parse_code_from_som_string(response, masks) + else: + raise ValueError("Invalid action space: " + self.action_space) + + self.actions.append(actions) + + return actions diff --git a/mm_agents/gpt_4v_prompt_action.py b/mm_agents/gpt_4v_prompt_action.py deleted file mode 100644 index 4323df6..0000000 --- a/mm_agents/gpt_4v_prompt_action.py +++ /dev/null @@ 
-1,244 +0,0 @@ -SYS_PROMPT = """ -You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection. -For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image. - -HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters: -ACTION_SPACE = [ - { - "action_type": "MOVE_TO", - "note": "move the cursor to the specified position", - "parameters": { - "x": { - "type": float, - "range": [0, X_MAX], - "optional": False, - }, - "y": { - "type": float, - "range": [0, Y_MAX], - "optional": False, - } - } - }, - { - "action_type": "CLICK", - "note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position", - "parameters": { - "button": { - "type": str, - "range": ["left", "right", "middle"], - "optional": True, - }, - "x": { - "type": float, - "range": [0, X_MAX], - "optional": True, - }, - "y": { - "type": float, - "range": [0, Y_MAX], - "optional": True, - }, - "num_clicks": { - "type": int, - "range": [1, 2, 3], - "optional": True, - }, - } - }, - { - "action_type": "MOUSE_DOWN", - "note": "press the left button if the button not specified, otherwise press the specified button", - "parameters": { - "button": { - "type": str, - "range": ["left", "right", "middle"], - "optional": True, - } - } - }, - { - "action_type": "MOUSE_UP", - "note": "release the left button if the button not specified, otherwise release the specified button", - "parameters": { - "button": { - "type": str, - "range": ["left", "right", "middle"], - "optional": True, - } - } - }, - { - "action_type": "RIGHT_CLICK", - "note": "right click at the current position if x and y are not specified, otherwise right click at the specified position", - "parameters": { - "x": { - "type": float, - "range": [0, X_MAX], - "optional": True, - }, - "y": { - "type": float, - "range": [0, Y_MAX], - "optional": True, - } - } - }, - { - "action_type": "DOUBLE_CLICK", - "note": "double click at the current position if x and y are not specified, otherwise double click at the specified position", - "parameters": { - "x": { - "type": float, - "range": [0, X_MAX], - "optional": True, - }, - "y": { - "type": float, - "range": [0, Y_MAX], - "optional": True, - } - } - }, - { - "action_type": "DRAG_TO", - "note": "drag the cursor to the specified position with the left button pressed", - "parameters": { - "x": { - "type": float, - "range": [0, X_MAX], - "optional": False, - }, - "y": { - "type": float, - "range": [0, Y_MAX], - "optional": False, - } - } - }, - { - "action_type": "SCROLL", - "note": "scroll the mouse wheel up or down", - "parameters": { - "dx": { - "type": int, - "range": None, - "optional": False, - }, - "dy": { - "type": int, - "range": None, - "optional": False, - } - } - }, - { - "action_type": "TYPING", - "note": "type the specified text", - "parameters": { - "text": { - "type": str, - "range": None, - "optional": False, - } - } - }, - { - "action_type": "PRESS", - "note": "press the specified key and release it", - "parameters": { - "key": { - "type": str, - "range": KEYBOARD_KEYS, - "optional": False, - } - } - }, - { - "action_type": "KEY_DOWN", - "note": "press the specified key", - "parameters": { - "key": { 
- "type": str, - "range": KEYBOARD_KEYS, - "optional": False, - } - } - }, - { - "action_type": "KEY_UP", - "note": "release the specified key", - "parameters": { - "key": { - "type": str, - "range": KEYBOARD_KEYS, - "optional": False, - } - } - }, - { - "action_type": "HOTKEY", - "note": "press the specified key combination", - "parameters": { - "keys": { - "type": list, - "range": [KEYBOARD_KEYS], - "optional": False, - } - } - }, - ############################################################################################################ - { - "action_type": "WAIT", - "note": "wait until the next action", - }, - { - "action_type": "FAIL", - "note": "decide the task can not be performed", - }, - { - "action_type": "DONE", - "note": "decide the task is done", - } -] -Firstly you need to predict the class of your action, then you need to predict the parameters of your action: -- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080) -for example, format as: -``` -{ - "action_type": "MOUSE_MOVE", - "x": 1319.11, - "y": 65.06 -} -``` -- For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse: -for example, format as: -``` -{ - "action_type": "CLICK", - "click_type": "LEFT" -} -``` -- For [KEY, KEY_DOWN, KEY_UP], you need to choose a(multiple) key(s) from the keyboard -for example, format as: -``` -{ - "action_type": "KEY", - "key": "ctrl+c" -} -``` -- For TYPE, you need to specify the text you want to type -for example, format as: -``` -{ - "action_type": "TYPE", - "text": "hello world" -} -``` - -REMEMBER: -For every step, you should only RETURN ME THE action_type AND parameters I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. -You MUST wrap the dict with backticks (\`). -You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty. -You CAN predict multiple actions at one step, but you should only return one action for each step. -""" \ No newline at end of file diff --git a/mm_agents/gpt_4v_prompt_code.py b/mm_agents/gpt_4v_prompt_code.py deleted file mode 100644 index 8f256da..0000000 --- a/mm_agents/gpt_4v_prompt_code.py +++ /dev/null @@ -1,18 +0,0 @@ -SYS_PROMPT = """ -You are an agent which follow my instruction and perform desktop computer tasks as instructed. -You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard. -For each step, you will get an observation of an image, which is the screenshot of the computer screen and you will predict the action of the computer based on the image. - -You are required to use `pyautogui` to perform the action. -Return one line or multiple lines of python code to perform the action each time, be time efficient. -You ONLY need to return the code inside a code block, like this: -```python -# your code here -``` -Specially, it is also allowed to return the following special code: -When you think you have to wait for some time, return ```WAIT```; -When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task; -When you think the task is done, return ```DONE```. 
- -First give the current screenshot and previous things we did a reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. -""" \ No newline at end of file diff --git a/mm_agents/prompts.py b/mm_agents/prompts.py new file mode 100644 index 0000000..90ce22f --- /dev/null +++ b/mm_agents/prompts.py @@ -0,0 +1,868 @@ +SYS_PROMPT_IN_SCREENSHOT_OUT_CODE = """ +You are an agent which follow my instruction and perform desktop computer tasks as instructed. +You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard. +For each step, you will get an observation of an image, which is the screenshot of the computer screen and you will predict the action of the computer based on the image. + +You are required to use `pyautogui` to perform the action, but don't use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. +Return one line or multiple lines of python code to perform the action each time, be time efficient. +You ONLY need to return the code inside a code block, like this: +```python +# your code here +``` +Specially, it is also allowed to return the following special code: +When you think you have to wait for some time, return ```WAIT```; +When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task; +When you think the task is done, return ```DONE```. + +First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. +""".strip() + +SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION = """ +You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection. +For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image. 
+ +HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters: +ACTION_SPACE = [ + { + "action_type": "MOVE_TO", + "note": "move the cursor to the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": False, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": False, + } + } + }, + { + "action_type": "CLICK", + "note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + }, + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + }, + "num_clicks": { + "type": int, + "range": [1, 2, 3], + "optional": True, + }, + } + }, + { + "action_type": "MOUSE_DOWN", + "note": "press the left button if the button not specified, otherwise press the specified button", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + } + } + }, + { + "action_type": "MOUSE_UP", + "note": "release the left button if the button not specified, otherwise release the specified button", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + } + } + }, + { + "action_type": "RIGHT_CLICK", + "note": "right click at the current position if x and y are not specified, otherwise right click at the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + } + } + }, + { + "action_type": "DOUBLE_CLICK", + "note": "double click at the current position if x and y are not specified, otherwise double click at the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + } + } + }, + { + "action_type": "DRAG_TO", + "note": "drag the cursor to the specified position with the left button pressed", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": False, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": False, + } + } + }, + { + "action_type": "SCROLL", + "note": "scroll the mouse wheel up or down", + "parameters": { + "dx": { + "type": int, + "range": None, + "optional": False, + }, + "dy": { + "type": int, + "range": None, + "optional": False, + } + } + }, + { + "action_type": "TYPING", + "note": "type the specified text", + "parameters": { + "text": { + "type": str, + "range": None, + "optional": False, + } + } + }, + { + "action_type": "PRESS", + "note": "press the specified key and release it", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "KEY_DOWN", + "note": "press the specified key", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "KEY_UP", + "note": "release the specified key", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "HOTKEY", + "note": "press the specified key combination", + "parameters": { + "keys": { + "type": list, + 
"range": [KEYBOARD_KEYS], + "optional": False, + } + } + }, + ############################################################################################################ + { + "action_type": "WAIT", + "note": "wait until the next action", + }, + { + "action_type": "FAIL", + "note": "decide the task can not be performed", + }, + { + "action_type": "DONE", + "note": "decide the task is done", + } +] +Firstly you need to predict the class of your action, then you need to predict the parameters of your action: +- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080) +for example, format as: +``` +{ + "action_type": "MOUSE_MOVE", + "x": 1319.11, + "y": 65.06 +} +``` +- For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse: +for example, format as: +``` +{ + "action_type": "CLICK", + "click_type": "LEFT" +} +``` +- For [KEY, KEY_DOWN, KEY_UP], you need to choose a(multiple) key(s) from the keyboard +for example, format as: +``` +{ + "action_type": "KEY", + "key": "ctrl+c" +} +``` +- For TYPE, you need to specify the text you want to type +for example, format as: +``` +{ + "action_type": "TYPE", + "text": "hello world" +} +``` + +REMEMBER: +For every step, you should only RETURN ME THE action_type AND parameters I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. +You MUST wrap the dict with backticks (\`). +You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty. +You CAN predict multiple actions at one step, but you should only return one action for each step. +""".strip() + +SYS_PROMPT_IN_A11Y_OUT_CODE = """ +You are an agent which follow my instruction and perform desktop computer tasks as instructed. +You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard. +For each step, you will get an observation of the desktop by accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the accessibility tree. + +You are required to use `pyautogui` to perform the action, but don't use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. +Return one line or multiple lines of python code to perform the action each time, be time efficient. +You ONLY need to return the code inside a code block, like this: +```python +# your code here +``` +Specially, it is also allowed to return the following special code: +When you think you have to wait for some time, return ```WAIT```; +When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task; +When you think the task is done, return ```DONE```. + +First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. +""".strip() + +SYS_PROMPT_IN_A11Y_OUT_ACTION = """ +You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection. 
+For each step, you will get an observation of the desktop by accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the accessibility tree. + +HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters: +ACTION_SPACE = [ + { + "action_type": "MOVE_TO", + "note": "move the cursor to the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": False, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": False, + } + } + }, + { + "action_type": "CLICK", + "note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + }, + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + }, + "num_clicks": { + "type": int, + "range": [1, 2, 3], + "optional": True, + }, + } + }, + { + "action_type": "MOUSE_DOWN", + "note": "press the left button if the button not specified, otherwise press the specified button", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + } + } + }, + { + "action_type": "MOUSE_UP", + "note": "release the left button if the button not specified, otherwise release the specified button", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + } + } + }, + { + "action_type": "RIGHT_CLICK", + "note": "right click at the current position if x and y are not specified, otherwise right click at the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + } + } + }, + { + "action_type": "DOUBLE_CLICK", + "note": "double click at the current position if x and y are not specified, otherwise double click at the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + } + } + }, + { + "action_type": "DRAG_TO", + "note": "drag the cursor to the specified position with the left button pressed", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": False, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": False, + } + } + }, + { + "action_type": "SCROLL", + "note": "scroll the mouse wheel up or down", + "parameters": { + "dx": { + "type": int, + "range": None, + "optional": False, + }, + "dy": { + "type": int, + "range": None, + "optional": False, + } + } + }, + { + "action_type": "TYPING", + "note": "type the specified text", + "parameters": { + "text": { + "type": str, + "range": None, + "optional": False, + } + } + }, + { + "action_type": "PRESS", + "note": "press the specified key and release it", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "KEY_DOWN", + "note": "press the specified key", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "KEY_UP", + "note": "release the specified key", + "parameters": { + "key": { + "type": 
str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "HOTKEY", + "note": "press the specified key combination", + "parameters": { + "keys": { + "type": list, + "range": [KEYBOARD_KEYS], + "optional": False, + } + } + }, + ############################################################################################################ + { + "action_type": "WAIT", + "note": "wait until the next action", + }, + { + "action_type": "FAIL", + "note": "decide the task can not be performed", + }, + { + "action_type": "DONE", + "note": "decide the task is done", + } +] +Firstly you need to predict the class of your action, then you need to predict the parameters of your action: +- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080) +for example, format as: +``` +{ + "action_type": "MOUSE_MOVE", + "x": 1319.11, + "y": 65.06 +} +``` +- For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse: +for example, format as: +``` +{ + "action_type": "CLICK", + "click_type": "LEFT" +} +``` +- For [KEY, KEY_DOWN, KEY_UP], you need to choose a(multiple) key(s) from the keyboard +for example, format as: +``` +{ + "action_type": "KEY", + "key": "ctrl+c" +} +``` +- For TYPE, you need to specify the text you want to type +for example, format as: +``` +{ + "action_type": "TYPE", + "text": "hello world" +} +``` + +REMEMBER: +For every step, you should only RETURN ME THE action_type AND parameters I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. +You MUST wrap the dict with backticks (\`). +You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty. +You CAN predict multiple actions at one step, but you should only return one action for each step. +""".strip() + +SYS_PROMPT_IN_BOTH_OUT_CODE = """ +You are an agent which follow my instruction and perform desktop computer tasks as instructed. +You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard. +For each step, you will get an observation of the desktop by 1) a screenshot; and 2) accessibility tree, which is based on AT-SPI library. +And you will predict the action of the computer based on the screenshot and accessibility tree. + +You are required to use `pyautogui` to perform the action, but don't use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. +Return one line or multiple lines of python code to perform the action each time, be time efficient. +You ONLY need to return the code inside a code block, like this: +```python +# your code here +``` +Specially, it is also allowed to return the following special code: +When you think you have to wait for some time, return ```WAIT```; +When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task; +When you think the task is done, return ```DONE```. + +First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. 
+""".strip() + +SYS_PROMPT_IN_BOTH_OUT_ACTION = """ +You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection. +For each step, you will get an observation of the desktop by 1) a screenshot; and 2) accessibility tree, which is based on AT-SPI library. +And you will predict the action of the computer based on the screenshot and accessibility tree. + +HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters: +ACTION_SPACE = [ + { + "action_type": "MOVE_TO", + "note": "move the cursor to the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": False, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": False, + } + } + }, + { + "action_type": "CLICK", + "note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + }, + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + }, + "num_clicks": { + "type": int, + "range": [1, 2, 3], + "optional": True, + }, + } + }, + { + "action_type": "MOUSE_DOWN", + "note": "press the left button if the button not specified, otherwise press the specified button", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + } + } + }, + { + "action_type": "MOUSE_UP", + "note": "release the left button if the button not specified, otherwise release the specified button", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + } + } + }, + { + "action_type": "RIGHT_CLICK", + "note": "right click at the current position if x and y are not specified, otherwise right click at the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + } + } + }, + { + "action_type": "DOUBLE_CLICK", + "note": "double click at the current position if x and y are not specified, otherwise double click at the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + } + } + }, + { + "action_type": "DRAG_TO", + "note": "drag the cursor to the specified position with the left button pressed", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": False, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": False, + } + } + }, + { + "action_type": "SCROLL", + "note": "scroll the mouse wheel up or down", + "parameters": { + "dx": { + "type": int, + "range": None, + "optional": False, + }, + "dy": { + "type": int, + "range": None, + "optional": False, + } + } + }, + { + "action_type": "TYPING", + "note": "type the specified text", + "parameters": { + "text": { + "type": str, + "range": None, + "optional": False, + } + } + }, + { + "action_type": "PRESS", + "note": "press the specified key and release it", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + 
"action_type": "KEY_DOWN", + "note": "press the specified key", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "KEY_UP", + "note": "release the specified key", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "HOTKEY", + "note": "press the specified key combination", + "parameters": { + "keys": { + "type": list, + "range": [KEYBOARD_KEYS], + "optional": False, + } + } + }, + ############################################################################################################ + { + "action_type": "WAIT", + "note": "wait until the next action", + }, + { + "action_type": "FAIL", + "note": "decide the task can not be performed", + }, + { + "action_type": "DONE", + "note": "decide the task is done", + } +] +Firstly you need to predict the class of your action, then you need to predict the parameters of your action: +- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080) +for example, format as: +``` +{ + "action_type": "MOUSE_MOVE", + "x": 1319.11, + "y": 65.06 +} +``` +- For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse: +for example, format as: +``` +{ + "action_type": "CLICK", + "click_type": "LEFT" +} +``` +- For [KEY, KEY_DOWN, KEY_UP], you need to choose a(multiple) key(s) from the keyboard +for example, format as: +``` +{ + "action_type": "KEY", + "key": "ctrl+c" +} +``` +- For TYPE, you need to specify the text you want to type +for example, format as: +``` +{ + "action_type": "TYPE", + "text": "hello world" +} +``` + +REMEMBER: +For every step, you should only RETURN ME THE action_type AND parameters I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. +You MUST wrap the dict with backticks (\`). +You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty. +You CAN predict multiple actions at one step, but you should only return one action for each step. +""".strip() + +SYS_PROMPT_IN_SOM_A11Y_OUT_TAG = """ +You are an agent which follow my instruction and perform desktop computer tasks as instructed. +You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard. +For each step, you will get an observation of the desktop by 1) a screenshot; and 2) accessibility tree, which is based on AT-SPI library. + +You are required to use `pyautogui` to perform the action, but don't use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. +You can replace x, y in the code with the tag of the element you want to operate with. such as: +```python +pyautogui.moveTo(tag#3) +pyautogui.click(tag#2) +pyautogui.dragTo(tag#1, button='left') +``` +When you think you can directly output precise x and y coordinates or there is no tag on which you want to interact, you can also use them directly. +But you should be careful to ensure that the coordinates are correct. +Return one line or multiple lines of python code to perform the action each time, be time efficient. 
+You ONLY need to return the code inside a code block, like this: +```python +# your code here +``` +Specially, it is also allowed to return the following special code: +When you think you have to wait for some time, return ```WAIT```; +When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task; +When you think the task is done, return ```DONE```. + +First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. +""".strip() + +SYS_PROMPT_SEEACT = """ +You are an agent which follow my instruction and perform desktop computer tasks as instructed. +You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard. +For each step, you will get an observation of an image, which is the screenshot of the computer screen and you will predict the action of the computer based on the image. +""".strip() + +ACTION_DESCRIPTION_PROMPT_SEEACT = """ +The text and image shown below is the observation of the desktop by 1) a screenshot; and 2) accessibility tree, which is based on AT-SPI library. +{} + +Follow the following guidance to think step by step before outlining the next action step at the current stage: + +(Current Screenshot Identification) +Firstly, think about what the current screenshot is. + +(Previous Action Analysis) +Secondly, combined with the screenshot, analyze each step of the previous action history and their intention one by one. Particularly, pay more attention to the last step, which may be more related to what you should do now as the next step. + +(Screenshot Details Analysis) +Closely examine the screenshot to check the status of every part of the webpage to understand what you can operate with and what has been set or completed. You should closely examine the screenshot details to see what steps have been completed by previous actions even though you are given the textual previous actions. Because the textual history may not clearly and sufficiently record some effects of previous actions, you should closely evaluate the status of every part of the webpage to understand what you have done. + +(Next Action Based on Screenshot and Analysis) +Then, based on your analysis, in conjunction with human desktop using habits and the logic of app GUI design, decide on the following action. And clearly outline which button in the screenshot users will operate with as the first next target element, its detailed location, and the corresponding operation. +""" + +ACTION_GROUNDING_PROMPT_SEEACT = """ +You are required to use `pyautogui` to perform the action, but don't use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. +You can replace x, y in the code with the tag of the element you want to operate with. such as: +```python +pyautogui.moveTo(tag#3) +pyautogui.click(tag#2) +pyautogui.dragTo(tag#1, button='left') +``` +When you think you can directly output precise x and y coordinates or there is no tag on which you want to interact, you can also use them directly. +But you should be careful to ensure that the coordinates are correct. +Return one line or multiple lines of python code to perform the action each time, be time efficient. 
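Read together, the SeeAct prompts above split each step into two model calls: the description prompt (whose `{}` slot is presumably filled with the linearized observation via `str.format`) elicits a natural-language plan for the next action, and this grounding prompt then turns that plan into executable code. A rough sketch of how a driving loop might chain them, assuming the prompt constants defined above are in scope — `query_model`, `observation_text`, and `screenshot` are placeholders, not APIs from this repository:
```python
def query_model(system: str, user: str, image: bytes) -> str:
    """Placeholder for whatever multimodal LLM call the agent actually makes."""
    raise NotImplementedError

def seeact_step(observation_text: str, screenshot: bytes) -> str:
    # Stage 1: describe the intended next action in natural language.
    plan = query_model(
        system=SYS_PROMPT_SEEACT,
        user=ACTION_DESCRIPTION_PROMPT_SEEACT.format(observation_text),
        image=screenshot,
    )
    # Stage 2: ground the plan into pyautogui code (or WAIT / FAIL / DONE).
    return query_model(
        system=SYS_PROMPT_SEEACT,
        user=plan + "\n" + ACTION_GROUNDING_PROMPT_SEEACT,
        image=screenshot,
    )
```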
+You ONLY need to return the code inside a code block, like this: +```python +# your code here +``` +Specially, it is also allowed to return the following special code: +When you think you have to wait for some time, return ```WAIT```; +When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task; +When you think the task is done, return ```DONE```. + +First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. +""" diff --git a/mm_agents/sam_test.py b/mm_agents/sam_test.py deleted file mode 100644 index 9d4ce44..0000000 --- a/mm_agents/sam_test.py +++ /dev/null @@ -1,124 +0,0 @@ -import torch -from PIL import Image -import requests -from transformers import SamModel, SamProcessor -import numpy as np -import matplotlib.pyplot as plt -import os -os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" - -def show_mask(mask, ax, random_color=False): - if random_color: - color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0) - else: - color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6]) - h, w = mask.shape[-2:] - mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1) - ax.imshow(mask_image) - - -def show_box(box, ax): - x0, y0 = box[0], box[1] - w, h = box[2] - box[0], box[3] - box[1] - ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0, 0, 0, 0), lw=2)) - - -def show_boxes_on_image(raw_image, boxes): - plt.figure(figsize=(10, 10)) - plt.imshow(raw_image) - for box in boxes: - show_box(box, plt.gca()) - plt.axis('on') - plt.show() - - -def show_points_on_image(raw_image, input_points, input_labels=None): - plt.figure(figsize=(10, 10)) - plt.imshow(raw_image) - input_points = np.array(input_points) - if input_labels is None: - labels = np.ones_like(input_points[:, 0]) - else: - labels = np.array(input_labels) - show_points(input_points, labels, plt.gca()) - plt.axis('on') - plt.show() - - -def show_points_and_boxes_on_image(raw_image, boxes, input_points, input_labels=None): - plt.figure(figsize=(10, 10)) - plt.imshow(raw_image) - input_points = np.array(input_points) - if input_labels is None: - labels = np.ones_like(input_points[:, 0]) - else: - labels = np.array(input_labels) - show_points(input_points, labels, plt.gca()) - for box in boxes: - show_box(box, plt.gca()) - plt.axis('on') - plt.show() - - -def show_points_and_boxes_on_image(raw_image, boxes, input_points, input_labels=None): - plt.figure(figsize=(10, 10)) - plt.imshow(raw_image) - input_points = np.array(input_points) - if input_labels is None: - labels = np.ones_like(input_points[:, 0]) - else: - labels = np.array(input_labels) - show_points(input_points, labels, plt.gca()) - for box in boxes: - show_box(box, plt.gca()) - plt.axis('on') - plt.show() - - -def show_points(coords, labels, ax, marker_size=375): - pos_points = coords[labels == 1] - neg_points = coords[labels == 0] - ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', - linewidth=1.25) - ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', - linewidth=1.25) - - -def show_masks_on_image(raw_image, masks, scores): - if len(masks.shape) == 4: - masks = masks.squeeze() - if scores.shape[0] == 1: - scores = scores.squeeze() - - nb_predictions = scores.shape[-1] - fig, axes = plt.subplots(1, nb_predictions, figsize=(15, 15)) - - for i, (mask, score) in 
enumerate(zip(masks, scores)): - mask = mask.cpu().detach() - axes[i].imshow(np.array(raw_image)) - show_mask(mask, axes[i]) - axes[i].title.set_text(f"Mask {i + 1}, Score: {score.item():.3f}") - axes[i].axis("off") - plt.show() - - -device = "cuda" if torch.cuda.is_available() else "cpu" -model = SamModel.from_pretrained("facebook/sam-vit-huge").to(device) -processor = SamProcessor.from_pretrained("facebook/sam-vit-huge") - -img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png" -raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") - -plt.imshow(raw_image) - -inputs = processor(raw_image, return_tensors="pt").to(device) -with torch.no_grad(): - outputs = model(**inputs) - -masks = processor.image_processor.post_process_masks( - outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu() -) - - -scores = outputs.iou_scores -show_masks_on_image(raw_image, masks[0], scores) diff --git a/requirements.txt b/requirements.txt index f2c4cc3..97019b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,3 +32,4 @@ librosa pymupdf chardet playwright +backoff diff --git a/resouce_collection/Source2Doc/Get_Source_Doc - Sheet1.csv b/resouce_collection/Source2Doc/Get_Source_Doc - Sheet1.csv new file mode 100644 index 0000000..86838a6 --- /dev/null +++ b/resouce_collection/Source2Doc/Get_Source_Doc - Sheet1.csv @@ -0,0 +1,268 @@ +id,Source,InvolvedApp +94d95f96-9699-4208-98ba-3c3119edf9c2,https://help.ubuntu.com/lts/ubuntu-help/addremove-install.html.en,OS +bedcedc4-4d72-425e-ad62-21960b11fe0d,https://www.youtube.com/watch?v=D4WyNjt_hbQ&t=2s,OS +43c2d64c-bab5-4dcb-a30c-b888321c319a,https://ubuntu.com/tutorials/command-line-for-beginners#4-creating-folders-and-files,OS +7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82,https://ubuntu.com/tutorials/command-line-for-beginners#5-moving-and-manipulating-files,OS +ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3,https://www.youtube.com/watch?v=D4WyNjt_hbQ&t=2s,OS +a462a795-fdc7-4b23-b689-e8b6df786b78,https://help.ubuntu.com/lts/ubuntu-help/shell-exit.html.en,OS +f9be0997-4b7c-45c5-b05c-4612b44a6118,https://help.ubuntu.com/lts/ubuntu-help/shell-notifications.html.en,OS +ae039631-2b12-4637-84f6-c67d51511be3,https://help.ubuntu.com/lts/ubuntu-help/net-default-browser.html.en,OS +e2eb4bf1-aa93-4192-b55d-03e2fb6dfd15,https://help.ubuntu.com/lts/ubuntu-help/contacts-add-remove.html.en,OS +28cc3b7e-b194-4bc9-8353-d04c0f4d56d2,https://help.ubuntu.com/lts/ubuntu-help/sound-volume.html.en,OS +5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57,https://help.ubuntu.com/lts/ubuntu-help/files-recover.html.en,OS +e0df059f-28a6-4169-924f-b9623e7184cc,https://help.ubuntu.com/lts/ubuntu-help/files-rename.html.en,OS +ddc75b62-7311-4af8-bfb3-859558542b36,https://help.ubuntu.com/lts/ubuntu-help/addremove-remove.html.en,OS +5c433d22-ed9a-4e31-91f5-54cf3e8acd63,https://help.ubuntu.com/lts/ubuntu-help/session-language.html.zh-CN,OS +b6781586-6346-41cd-935a-a6b1487918fc,https://help.ubuntu.com/lts/ubuntu-help/clock-timezone.html.en,OS +b3d4a89c-53f2-4d6b-8b6a-541fb5d205fa,https://help.ubuntu.com/lts/ubuntu-help/bluetooth-turn-on-off.html.en,OS +3ce045a0-877b-42aa-8d2c-b4a863336ab8,https://help.ubuntu.com/lts/ubuntu-help/a11y-font-size.html.en,OS +fe41f596-a71b-4c2f-9b2f-9dcd40b568c3,https://help.ubuntu.com/lts/ubuntu-help/power-percentage.html.en,OS +a4d98375-215b-4a4d-aee9-3d4370fccc41,https://help.ubuntu.com/lts/ubuntu-help/privacy-screen-lock.html.en,OS 
+765d2b74-88a7-4d50-bf51-34e4106fd24a,https://help.ubuntu.com/lts/ubuntu-help/files-delete.html.en,OS +cc9d4f34-1ca0-4a1b-8ff2-09302696acb9,https://superuser.com/questions/178587/how-do-i-detach-a-process-from-terminal-entirely,OS +5812b315-e7bd-4265-b51f-863c02174c28,https://superuser.com/questions/149404/create-an-ssh-user-who-only-has-permission-to-access-specific-folders,OS +c56de254-a3ec-414e-81a6-83d2ce8c41fa,https://superuser.com/questions/28426/how-to-extract-text-with-ocr-from-a-pdf-on-linux,OS +6ebbfb01-ea72-4226-a2a6-dc428e111ed2,https://superuser.com/questions/46748/how-do-i-make-bash-my-default-shell-on-ubuntu,OS +4d2b519e-e872-4100-8ea3-fe71ab0f9133,https://stackoverflow.com/questions/11530090/adding-a-new-entry-to-the-path-variable-in-zsh,OS +c288e301-e626-4b98-a1ab-159dcb162af5,https://stackoverflow.com/questions/41986507/unable-to-set-default-python-version-to-python3-in-ubuntu,OS +13584542-872b-42d8-b299-866967b5c3ef,https://superuser.com/questions/72176/linux-set-default-terminal-size-and-screen-position,OS +23393935-50c7-4a86-aeea-2b78fd089c5c,https://superuser.com/questions/91307/copying-only-jpg-from-a-directory-structure-to-another-location-linux,OS +f10b16e1-c160-4cb3-989f-7b2ec89bc073,https://www.wikihow.com/Install-Gnome-on-Ubuntu,OS +eb03d19a-b88d-4de4-8a64-ca0ac66f426b,https://www.youtube.com/shorts/t9JLUaT55UQ,MS Excel +0bf05a7d-b28b-44d2-955a-50b41e24012a,https://www.youtube.com/shorts/FPAQaDTS8VY,MS Excel +7b802dad-6e0f-4204-9815-d4e3f57627d8,https://www.youtube.com/shorts/Of-lzeP1usE,MS Excel +7a4e4bc8-922c-4c84-865c-25ba34136be1,https://www.youtube.com/shorts/bvUhr1AHs44,MS Excel +2bd59342-0664-4ccb-ba87-79379096cc08,https://www.youtube.com/shorts/L3Z-F1QTQFY,MS Excel +a9f325aa-8c05-4e4f-8341-9e4358565f4f,https://www.youtube.com/shorts/A0gmEBRKXWs,MS Excel +ecb0df7a-4e8d-4a03-b162-053391d3afaf,https://www.youtube.com/shorts/tXOovKn0H68,MS Excel +7efeb4b1-3d19-4762-b163-63328d66303b,https://www.youtube.com/shorts/4jzXfZNhfmk,MS Excel +4e6fcf72-daf3-439f-a232-c434ce416af6,https://www.youtube.com/shorts/0uxJccNCKcE,MS Excel +6054afcb-5bab-4702-90a0-b259b5d3217c,https://www.youtube.com/shorts/JTbZ8sRxkdU,MS Excel +abed40dc-063f-4598-8ba5-9fe749c0615d,https://www.youtube.com/shorts/xgf4ZpsEx5M,MS Excel +01b269ae-2111-4a07-81fd-3fcd711993b0,https://www.youtube.com/shorts/VrUzPTIwQ04,MS Excel +8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14,https://www.youtube.com/shorts/Hbcwu6IQ1ns,MS Excel +af2b02f7-acee-4be4-8b66-499fab394915,https://www.youtube.com/shorts/AwKsb5VmtBI,MS Excel +da1d63b8-fa12-417b-ba18-f748e5f770f3,https://www.youtube.com/shorts/hquscnbz2-U,MS Excel +636380ea-d5f6-4474-b6ca-b2ed578a20f1,https://www.youtube.com/shorts/_BYL6VOHLGw,"MS Excel, Edge" +5ba77536-05c5-4aae-a9ff-6e298d094c3e,https://www.youtube.com/shorts/CuBC1evUS5I,MS Excel +4bc4eaf4-ca5e-4db2-8138-8d4e65af7c0b,https://www.youtube.com/shorts/1adQWfjN-tI,MS Excel +672a1b02-c62f-4ae2-acf0-37f5fb3052b0,https://www.youtube.com/shorts/2rhdQXI4Lng,MS Excel +648fe544-16ba-44af-a587-12ccbe280ea6,https://www.youtube.com/shorts/sOPBMWaC6Uc,MS Excel +8985d1e4-5b99-4711-add4-88949ebb2308,https://www.youtube.com/shorts/J5ts2Acv9Pc,MS Excel +9e606842-2e27-43bf-b1d1-b43289c9589b,https://www.youtube.com/shorts/B-mGYDFOyUs,MS Excel +fcb6e45b-25c4-4087-9483-03d714f473a9,https://www.youtube.com/shorts/GZipp7nOZS0,MS Excel +68c0c5b7-96f3-4e87-92a7-6c1b967fd2d2,https://www.youtube.com/shorts/JEH5TsK-cCk,"MS Excel, Edge" +fff629ea-046e-4793-8eec-1a5a15c3eb35,https://www.youtube.com/shorts/8WybtCdUT6w,MS Excel 
+5c9a206c-bb00-4fb6-bb46-ee675c187df5,https://www.youtube.com/shorts/VbQtMNnq9i4,MS Excel +e975ae74-79bd-4672-8d1c-dc841a85781d,https://www.youtube.com/shorts/GjT7gGe5Sr8,MS Excel +34a6938a-58da-4897-8639-9b90d6db5391,https://www.youtube.com/shorts/gW37x2TkzOY,MS Excel +b5a22759-b4eb-4bf2-aeed-ad14e8615f19,https://www.youtube.com/shorts/3xLa-D0C7Ic,MS Excel +2f9913a1-51ed-4db6-bfe0-7e1c95b3139e,https://www.youtube.com/shorts/dGLRcmfVO6Q,MS Excel +2558031e-401d-4579-8e00-3ecf540fb492,https://www.mrexcel.com/board/threads/sales-for-the-first-6-weeks.1249213/,MS Excel +39aa4e37-dc91-482e-99af-132a612d40f3,https://www.libreofficehelp.com/add-insert-delete-copy-move-rename-a-worksheet-in-libreoffice-calc/,LibreOffice Calc +0cecd4f3-74de-457b-ba94-29ad6b5dafb6,https://www.libreofficehelp.com/add-insert-delete-copy-move-rename-a-worksheet-in-libreoffice-calc/,LibreOffice Calc +4188d3a4-077d-46b7-9c86-23e1a036f6c1,https://www.libreofficehelp.com/freeze-unfreeze-rows-columns-ranges-calc/,LibreOffice Calc +51b11269-2ca8-4b2a-9163-f21758420e78,https://www.reddit.com/r/LibreOfficeCalc/comments/186pcc6/how_to_arrange_numbers_in_a_column_from_minimum/,LibreOffice Calc +7e429b8d-a3f0-4ed0-9b58-08957d00b127,https://medium.com/@divyangichaudhari17/how-to-use-vlookup-and-hlookup-in-libre-calc-3370698bb3ff,LibreOffice Calc +f5a90742-3fa2-40fc-a564-f29b054e0337,https://superuser.com/questions/1236149/libreoffice-calc-how-to-apply-functions-to-columns,LibreOffice Calc +22df9241-f8d7-4509-b7f1-37e501a823f7,https://superuser.com/questions/1767185/how-do-you-move-cells-in-libreoffice-calc,LibreOffice Calc +1434ca3e-f9e3-4db8-9ca7-b4c653be7d17,https://www.wikihow.com/Remove-Duplicates-in-Open-Office-Calc,LibreOffice Calc +347ef137-7eeb-4c80-a3bb-0951f26a8aff,https://www.youtube.com/watch?v=bgO40-CjYNY,LibreOffice Calc +6e99a1ad-07d2-4b66-a1ce-ece6d99c20a5,https://www.youtube.com/watch?v=nl-nXjJurhQ,LibreOffice Calc +3aaa4e37-dc91-482e-99af-132a612d40f3,https://www.quora.com/How-can-you-import-export-CSV-files-with-LibreOffice-Calc-or-OpenOffice,LibreOffice Calc +0decd4f3-74de-457b-ba94-29ad6b5dafb6,https://justclickhere.co.uk/resources/checkboxes-tick-boxes-libreoffice-calc/,LibreOffice Calc +37608790-6147-45d0-9f20-1137bb35703d,https://www.youtube.com/shorts/uzPo_CPCHH8,MS Excel +f9584479-3d0d-4c79-affa-9ad7afdd8850,https://youtube.com/shorts/feldd-Pn48c?si=9xJiem2uAHm6Jshb,LibreOffice Calc +d681960f-7bc3-4286-9913-a8812ba3261a,https://www.youtube.com/shorts/d7U1S_IsTVM,LibreOffice Calc +f6a90742-3fa2-40fc-a564-f29b054e0337,https://www.excel-easy.com/examples/drop-down-list.html,LibreOffice Calc +21df9241-f8d7-4509-b7f1-37e501a823f7,https://www.youtube.com/watch?v=p5C4V_AO1UU,LibreOffice Calc +1334ca3e-f9e3-4db8-9ca7-b4c653be7d17,https://techcommunity.microsoft.com/t5/excel/excel-workbook-top-way-too-big-can-t-see-rows-and-columns/m-p/4014694,LibreOffice Calc +357ef137-7eeb-4c80-a3bb-0951f26a8aff,https://www.reddit.com/r/excel/comments/17zny8u/calculating_total_amount_earned_from_total_hours/,LibreOffice Calc +6f99a1ad-07d2-4b66-a1ce-ece6d99c20a5,https://techcommunity.microsoft.com/t5/excel/sumarize-the-sheetnames/m-p/4014716,LibreOffice Calc +aa3a8974-2e85-438b-b29e-a64df44deb4b,https://www.quora.com/Libre-Office-Calc-How-do-I-resize-all-cells-in-a-sheet-to-make-them-fit-to-1-page-for-printing-and-exporting-as-PDF,LibreOffice Calc +a01fbce3-2793-461f-ab86-43680ccbae25,https://superuser.com/questions/1250677/how-to-set-decimal-separator-in-libre-office-calc,LibreOffice Calc 
+4f07fbe9-70de-4927-a4d5-bb28bc12c52c,https://superuser.com/questions/1081048/libreoffice-calc-how-to-pad-number-to-fixed-decimals-when-used-within-formula,LibreOffice Calc +e3b1d5fa-ed00-4129-bda1-1452bd2b6772,https://www.reddit.com/r/libreoffice/comments/tel112/calc_how_to_calculate_sum_by_categories/,LibreOffice Calc +ca6a9524-f8e9-4d2f-9364-ab0cad567739,https://www.reddit.com/r/libreoffice/comments/113gmyc/how_to_remove_certain_text_from_cells_in_calc/,LibreOffice Calc +a455e8d0-930f-40d2-9575-5e8d2d222f58,https://superuser.com/questions/562944/quickly-fill-blank-cells-in-a-list-in-libreoffice-calc,LibreOffice Calc +83ee22c6-7737-49ce-9b5a-138c3e92af04,https://superuser.com/questions/661102/currency-conversion-in-libreoffice-calc,LibreOffice Calc +819f61c2-ec77-4d3f-9996-0838ae5aacc8,https://superuser.com/questions/381696/creating-a-column-of-working-days-in-libreoffice-calc,LibreOffice Calc +69d577b3-004e-4bca-89b2-0d7c2f6049e3,https://superuser.com/questions/387106/libreoffice-calc-how-to-get-total-for-hhmmss-cells,LibreOffice Calc +0a1bf4ca-d4ea-4618-baa5-6e8dc1b46d82,https://superuser.com/questions/571915/sum-up-to-n-highest-value-out-of-a-series,LibreOffice Calc +ac9bb6cb-1888-43ab-81e4-a98a547918cd,https://superuser.com/questions/1674211/how-to-change-colour-of-slide-number-in-libre-office,LibreOffice Impress +5d901039-a89c-4bfb-967b-bf66f4df075e,https://superuser.com/questions/986776/how-can-i-stretch-an-image-in-a-libreoffice-impress-presentation-to-fill-the-pag,LibreOffice Impress +071d4ace-091a-4ec3-886e-f4be55ae375d,https://superuser.com/questions/706860/hide-slide-numbers-and-slide-footer-on-first-and-second-slide-in-libreoffice-imp?rq=1,LibreOffice Impress +550ce7e7-747b-495f-b122-acdc4d0b8e54,"https://technical-tips.com/blog/software/text-in-libreoffice-strikethrough--6948#:~:text=To%20strikethrough%20Text%20in%20LibreOffice%201%20In%20your,effect%22%20can%20your%20additionally%2C%20for%20example%2C%20double%20underline.",LibreOffice Impress +455d3c66-7dc6-4537-a39a-36d3e9119df7,"https://www.libreofficehelp.com/export-libreoffice-impress-slides-images/#:~:text=Exporting%20a%20single%20slide%20as.jpg%2C.png%2C%20etc%20image%20is,on%20the%20checkbox%20Selection.%20Provide%20jpg%20quality%20options.",LibreOffice Impress +af23762e-2bfd-4a1d-aada-20fa8de9ce07,https://superuser.com/questions/1059080/how-to-make-a-summary-slide-in-impress-listing-the-titles-of-all-slides-autom,LibreOffice Impress +c59742c0-4323-4b9d-8a02-723c251deaa0,https://www.reddit.com/r/libreoffice/comments/17lcdrp/audio_not_supported_in_libreoffice_impress/,LibreOffice Impress +39478d4a-1049-456f-aa77-407811393add,https://www.reddit.com/r/libreoffice/comments/jul3o8/putting_cap_or_hat_or_carat_symbol_in_libre/,LibreOffice Impress +c3ad4442-499f-4e58-bc4e-1a1417ea9b8c,http://maharajacollege.ac.in/material/Libreofficeimpresspdf.pdf,LibreOffice Impress +ef9d12bd-bcee-4ba0-a40e-918400f43ddf,https://www.reddit.com/r/libreoffice/comments/18elh3y/i_closed_the_slide_pannel_on_the_left_and_idk_how/,LibreOffice Impress +9ec204e4-f0a3-42f8-8458-b772a6797cab,https://www.tiktok.com/@lil.d1rt_/video/7247574148887629083,LibreOffice Impress +0f84bef9-9790-432e-92b7-eece357603fb,https://stackoverflow.com/questions/29036788/how-to-disable-libreoffice-impress-to-use-multiple-display,LibreOffice Impress +ce88f674-ab7a-43da-9201-468d38539e4a,https://justclickhere.co.uk/resources/change-slides-in-impress-to-portrait/,LibreOffice Impress 
+f0a334af-f91b-4c03-b578-aac9bec2b543,https://www.libreofficehelp.com/insert-video-impress-presentation/#Inserting_a_Video_in_Impress,LibreOffice Impress +3b27600c-3668-4abd-8f84-7bcdebbccbdb,https://www.libreofficehelp.com/change-slide-background-impress/#All_Slides,LibreOffice Impress +a097acff-6266-4291-9fbd-137af7ecd439,https://www.youtube.com/watch?v=DDmEvjs4iBw,LibreOffice Impress +21760ecb-8f62-40d2-8d85-0cee5725cb72,https://www.libreofficehelp.com/add-animations-transitions-libreoffice-impress-slides/,LibreOffice Impress +3cc4f35d-fa2e-4555-afb9-741b7c062a74,https://documentation.libreoffice.org/assets/Uploads/Documentation/en/IG7.6/IG76-ImpressGuide.pdf,LibreOffice Impress +6ada715d-3aae-4a32-a6a7-429b2e43fb93,https://www.quora.com/How-do-you-insert-images-into-a-LibreOffice-Writer-document,LibreOffice Writer +ecc2413d-8a48-416e-a3a2-d30106ca36cb,https://www.quora.com/How-can-I-insert-a-blank-page-on-libreoffice,LibreOffice Writer +0e47de2a-32e0-456c-a366-8c607ef7a9d2,https://ask.libreoffice.org/t/how-to-start-page-numbering-on-a-certain-page/39931/4,LibreOffice Writer +4bcb1253-a636-4df4-8cb0-a35c04dfef31,https://www.libreofficehelp.com/save-export-writer-documents-in-pdf-epub-format/,LibreOffice Writer +0810415c-bde4-4443-9047-d5f70165a697,https://www.youtube.com/watch?v=Q_AaL6ljudU,LibreOffice Writer +e528b65e-1107-4b8c-8988-490e4fece599,https://www.youtube.com/watch?v=l25Evu4ohKg,LibreOffice Writer +66399b0d-8fda-4618-95c4-bfc6191617e9,https://www.youtube.com/watch?v=l25Evu4ohKg,LibreOffice Writer +936321ce-5236-426a-9a20-e0e3c5dc536f,https://www.youtube.com/watch?v=l25Evu4ohKg,LibreOffice Writer +663876c7-3471-43db-ba51-f410b13d9d7d,https://askubuntu.com/questions/319593/how-to-type-science-equations-in-libre-office,LibreOffice Writer +3ef2b351-8a84-4ff2-8724-d86eae9b842e,https://askubuntu.com/questions/1066351/how-do-you-center-align-in-libreoffice#:~:text=Ctrl%20%2B%20e%20will%20Center%20align%20the%20cursor%20for%20you.,LibreOffice Writer +45d61a06-6545-4422-97b7-bc76cfa964c1,https://stackoverflow.com/questions/71685737/how-to-replace-all-newlines-with-paragraph-marks-in-libreoffice-write,LibreOffice Writer +0b17a146-2934-46c7-8727-73ff6b6483e8,https://askubuntu.com/questions/245695/how-do-you-insert-subscripts-and-superscripts-into-ordinary-non-formula-text-i,LibreOffice Writer +0e763496-b6bb-4508-a427-fad0b6c3e195,https://ask.libreoffice.org/t/how-do-i-change-the-font-for-the-whole-document-in-writer/9220,LibreOffice Writer +f178a4a9-d090-4b56-bc4c-4b72a61a035d,https://ask.libreoffice.org/t/how-do-i-make-times-new-roman-the-default-font-in-lo/64604,LibreOffice Writer +0a0faba3-5580-44df-965d-f562a99b291c,https://stackoverflow.com/questions/64528055/how-to-make-part-of-my-sentence-left-aligned-and-rest-as-right-aligned,LibreOffice Writer +e246f6d8-78d7-44ac-b668-fcf47946cb50,https://ask.libreoffice.org/t/how-to-change-text-size-color-of-italic-font/77712,LibreOffice Writer +8472fece-c7dd-4241-8d65-9b3cd1a0b568,https://stackoverflow.com/questions/37259827/libreoffice-writer-how-to-set-different-colors-to-each-letter,LibreOffice Writer +88fe4b2d-3040-4c70-9a70-546a47764b48,https://stackoverflow.com/questions/56554555/libreoffice-writer-how-to-create-empty-line-space-after-every-period-in-a-par,LibreOffice Writer +6a33f9b9-0a56-4844-9c3f-96ec3ffb3ba2,https://superuser.com/questions/762500/how-do-i-find-all-highlighted-text-in-libreoffice-writer,LibreOffice Writer 
+d53ff5ee-3b1a-431e-b2be-30ed2673079b,https://ask.libreoffice.org/t/how-to-convert-all-uppercase-to-lowercase/53341,LibreOffice Writer +72b810ef-4156-4d09-8f08-a0cf57e7cefe,https://superuser.com/questions/657792/libreoffice-writer-how-to-apply-strikethrough-text-formatting?rq=1,LibreOffice Writer +6f81754e-285d-4ce0-b59e-af7edb02d108,https://superuser.com/questions/789473/remove-duplicate-lines-in-libreoffice-openoffice-writer,LibreOffice Writer +41c621f7-3544-49e1-af8d-dafd0f834f75,https://superuser.com/questions/1668018/how-to-auto-format-lines-in-libre-office-writer,LibreOffice Writer +b21acd93-60fd-4127-8a43-2f5178f4a830,https://superuser.com/questions/1097199/how-can-i-double-space-a-document-in-libreoffice?rq=1,LibreOffice Writer +59f21cfb-0120-4326-b255-a5b827b38967,https://docs.videolan.me/vlc-user/desktop/3.0/en/basic/media.html#playing-a-file,VLC player +8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89,https://docs.videolan.me/vlc-user/desktop/3.0/en/basic/recording/playing.html#choose-your-recordings-folder,VLC player +8f080098-ddb1-424c-b438-4e96e5e4786e,https://medium.com/@jetscribe_ai/how-to-extract-mp3-audio-from-videos-using-vlc-media-player-beeef644ebfb,VLC player +bba3381f-b5eb-4439-bd9e-80c22218d5a7,https://www.quora.com/How-do-I-play-online-videos-using-the-VLC-media-player,VLC player +a1c3ab35-02de-4999-a7ed-2fd12c972c6e,https://www.quora.com/How-do-I-compress-a-video-with-VLC,VLC player +fba2c100-79e8-42df-ae74-b592418d54f4,https://www.youtube.com/watch?v=XHprwDJ0-fU&t=436s,VLC player +d70666e4-7348-42c7-a06a-664094c5df3c,https://www.youtube.com/watch?v=XHprwDJ0-fU&t=436s,VLC player +efcf0d81-0835-4880-b2fd-d866e8bc2294,"https://www.youtube.com/watch?v=XHprwDJ0-fU&t=436s, https://help.ubuntu.com/stable/ubuntu-help/look-background.html.en",VLC player +8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f,https://www.youtube.com/watch?v=XHprwDJ0-fU&t=436s,VLC player +aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6,https://videoconverter.wondershare.com/vlc/how-to-rotate-a-video-using-vlc.html?gad_source=1&gclid=CjwKCAiA-vOsBhAAEiwAIWR0TaGSOLkYiBeVQGZSyfeUP3g-tIvYxffl5RFIu0-zrUL1IF41eCw1JRoCnCMQAvD_BwE,VLC player +386dbd0e-0241-4a0a-b6a2-6704fba26b1c,https://superuser.com/questions/1708415/pause-and-play-vlc-in-background?rq=1,VLC player +9195653c-f4aa-453d-aa95-787f6ccfaae9,https://superuser.com/questions/1513285/how-can-i-increase-the-maximum-volume-output-by-vlc?rq=1,VLC player +5ac2891a-eacd-4954-b339-98abba077adb,"https://superuser.com/questions/1412810/how-to-prevent-vlc-media-player-from-auto-closing-after-video-end#:%7E:text=Click%20on%20%22Media%22on%20the,VLC%20player%20after%20video%20ending",VLC player +0d95d28a-9587-433b-a805-1fbe5467d598,https://superuser.com/questions/1299036/vlc-how-to-open-the-folder-of-the-current-playing-video?noredirect=1&lq=1,VLC player +d06f0d4d-2cd5-4ede-8de9-598629438c6e,https://superuser.com/questions/1039392/changing-colour-of-vlc-volume-slider,VLC player +a5bbbcd5-b398-4c91-83d4-55e1e31bbb81,https://superuser.com/questions/776056/how-to-hide-bottom-toolbar-in-vlc,VLC player +f3977615-2b45-4ac5-8bba-80c17dbe2a37,https://www.reddit.com/r/Fedora/comments/rhljzd/how_to_run_multiple_instances_of_vlc_media_player/,VLC player +c669a35f-d45a-450e-b1f2-f473748337bb,https://www.quora.com/How-do-I-fast-forward-a-video-in-VLC-player,VLC player +d1ba14d0-fef8-4026-8418-5b581dc68ca0,https://superuser.com/questions/306154/how-to-use-a-b-repeat-feature-of-vlc,VLC player +215dfd39-f493-4bc3-a027-8a97d72c61bf,https://superuser.com/questions/1224784/how-to-change-vlcs-splash-screen,VLC 
player +bb5e4c0d-f964-439c-97b6-bdb9747de3f4,https://www.wikihow.com/Remove-an-Email-Account-from-Thunderbird,ThunderBird +7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3,https://www.wikihow.com/Access-Gmail-With-Mozilla-Thunderbird,ThunderBird +b188fe10-ae67-4db8-a154-26a0b8ff8f1e,https://www.reddit.com/r/Thunderbird/comments/17vv2os/restore_readability_in_message_list_pane/,ThunderBird +12086550-11c0-466b-b367-1d9e75b3910e,https://www.bitrecover.com/blog/manage-thunderbird-profiles/,ThunderBird +06fe7178-4491-4589-810f-2e2bc9502122,https://www.quora.com/How-do-I-backup-email-files-in-Mozilla-Thunderbird,ThunderBird +6766f2b8-8a72-417f-a9e5-56fcaa735837,"https://www.adsigner.com/user-manual/signatures/setup-email-client-thunderbird/#:~:text=is%20probably%20hidden.-,Right%20click%20on%20the%20empty%20space%20at%20the%20top%20of,signature%20from%20a%20file%20instead.",ThunderBird +e1e75309-3ddb-4d09-92ec-de869c928143,https://support.mozilla.org/en-US/kb/organize-your-messages-using-filters,ThunderBird +3d1682a7-0fb0-49ae-a4dc-a73afd2d06d5,https://support.mozilla.org/en-US/kb/organize-your-messages-using-filters,ThunderBird +35253b65-1c19-4304-8aa4-6884b8218fc0,https://support.mozilla.org/en-US/questions/1259354,ThunderBird +d088f539-cab4-4f9a-ac92-9999fc3a656e,https://support.mozilla.org/en-US/kb/how-use-attachments,ThunderBird +2ad9387a-65d8-4e33-ad5b-7580065a27ca,"https://support.mozilla.org/bm/questions/1027435, https://www.wikihow.tech/Create-Folders-in-Mozilla-Thunderbird",ThunderBird +480bcfea-d68f-4aaa-a0a9-2589ef319381,https://www.reddit.com/r/Thunderbird/comments/182dg5p/unified_inbox_howto/,ThunderBird +37b9808f-b2b4-4177-ab00-9ddfae4bad27,https://www.quora.com/How-can-I-schedule-Mozilla-Thunderbird-to-turn-off-automatically,ThunderBird +af630914-714e-4a24-a7bb-f9af687d3b91,https://stackoverflow.com/questions/11333148/adding-a-toolbar-button-to-a-thundebird-compose-message-window?rq=3,ThunderBird +3299584d-8f11-4457-bf4c-ce98f7600250,https://superuser.com/questions/1643561/would-like-to-see-the-email-address-from-sender-in-the-column,ThunderBird +030eeff7-b492-4218-b312-701ec99ee0cc,https://superuser.com/questions/1781004/how-do-i-remove-the-indentation-and-character-in-quoted-text-of-a-reply-mess,ThunderBird +94760984-3ff5-41ee-8347-cf1af709fea0,https://superuser.com/questions/1757333/how-can-i-view-thunderbird-in-full-dark-mode,ThunderBird +99146c54-4f37-4ab8-9327-5f3291665e1e,https://superuser.com/questions/1764409/how-to-send-email-with-thunderbird-without-configuring-an-incoming-email-service,ThunderBird +9656a811-9b5b-4ddf-99c7-5117bcef0626,https://superuser.com/questions/205240/is-there-a-way-to-get-a-popup-confirmation-box-when-you-send-an-email-in-thunder?rq=1,ThunderBird +c9e7eaf2-b1a1-4efc-a982-721972fa9f02,https://superuser.com/questions/544480/how-to-apply-automatic-message-filters-to-subfolders-too?noredirect=1&lq=1,ThunderBird +bb5e4c0d-f964-439c-97b6-bdb9747de3f4,https://support.google.com/chrome/answer/95426?sjid=16867045591165135686-AP,Chrome +7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3,https://support.google.com/chrome/answer/95647?hl=en&ref_topic=7438325&sjid=16867045591165135686-AP#zippy=%2Cdelete-cookies-from-a-site,Chrome +12086550-11c0-466b-b367-1d9e75b3910e,https://www.quora.com/What-are-the-cool-tricks-to-use-Google-Chrome,Chrome +06fe7178-4491-4589-810f-2e2bc9502122,https://www.wikihow.com/Switch-Tabs-in-Chrome,Chrome 
+6766f2b8-8a72-417f-a9e5-56fcaa735837,https://support.google.com/chrome/thread/205881926/it-s-possible-to-load-unpacked-extension-automatically-in-chrome?hl=en,Chrome +e1e75309-3ddb-4d09-92ec-de869c928143,https://in5stepstutorials.com/google-chrome/save-web-page-as-pdf-in-chrome.php,Chrome +3d1682a7-0fb0-49ae-a4dc-a73afd2d06d5,https://in5stepstutorials.com/google-chrome/add-change-delete-autofill-address.php,Chrome +35253b65-1c19-4304-8aa4-6884b8218fc0,"https://www.laptopmag.com/articles/how-to-create-desktop-shortcuts-for-web-pages-using-chrome, https://www.reddit.com/r/chrome/comments/13xcbap/crete_shortcut_option_missing/",Chrome +d088f539-cab4-4f9a-ac92-9999fc3a656e,https://medium.com/@inkverseuk2/useful-tips-and-tricks-for-the-google-chrome-browser-ac7d0d24b3cc,Chrome +2ad9387a-65d8-4e33-ad5b-7580065a27ca,https://www.youtube.com/watch?v=IN-Eq_UripQ,Chrome +7a5a7856-f1b6-42a4-ade9-1ca81ca0f263,https://www.youtube.com/watch?v=ZaZ8GcTxjXA,Chrome +3720f614-37fd-4d04-8a6b-76f54f8c222d,https://superuser.com/questions/984668/change-interface-language-of-chrome-to-english,Chrome +b63059a2-53bc-4163-a89f-3ac948c74081,https://superuser.com/questions/1303418/how-do-i-make-chrome-block-absolutely-all-pop-ups?rq=1,Chrome +44ee5668-ecd5-4366-a6ce-c1c9b8d4e938,https://superuser.com/questions/1787991/clear-browsing-history-from-specific-site-on-chrome,Chrome +b5ebc8c6-6329-4373-85b4-9421c97375e9,https://superuser.com/questions/364470/is-there-a-way-to-view-google-chrome-browsing-history-past-three-months-ago?rq=1,Chrome +93eabf48-6a27-4cb6-b963-7d5fe1e0d3a9,https://superuser.com/questions/1417973/how-to-disable-google-chrome-dark-mode,Chrome +2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3,https://superuser.com/questions/1393683/how-to-change-the-username-in-google-chrome-profiles?rq=1,Chrome +480bcfea-d68f-4aaa-a0a9-2589ef319381,https://bugartisan.medium.com/disable-the-new-chrome-ui-round-in-2023-db168271f71e,Chrome +37b9808f-b2b4-4177-ab00-9ddfae4bad27,https://www.reddit.com/r/chrome/comments/17niw3h/tutorial_how_to_disable_the_download_bubble_in/,Chrome +af630914-714e-4a24-a7bb-f9af687d3b91,https://www.howtogeek.com/680260/how-to-change-chromes-default-text-size/,Chrome +ae78f875-5b98-4907-bbb5-9c737fc68c03,https://support.google.com/chrome/thread/219988391/increase-search-results-per-page?hl=en,Chrome +0ed39f63-6049-43d4-ba4d-5fa2fe04a951,https://www.quora.com/How-do-you-find-and-replace-text-in-Visual-Studio-Code,VS Code +b421106e-b282-4c41-af72-37c95493f95f,https://stackoverflow.com/questions/74153883/launch-vscode-with-new-txt-file,VS Code +53ad5833-3455-407b-bbc6-45b4c79ab8fb,https://www.youtube.com/watch?v=VqCgcpAypFQ,VS Code +eabc805a-bfcf-4460-b250-ac92135819f6,https://www.youtube.com/watch?v=VqCgcpAypFQ,VS Code +3486f395-ad68-459c-8c39-ea07de934dd4,https://www.youtube.com/watch?v=VqCgcpAypFQ,VS Code +982d12a5-beab-424f-8d38-d2a48429e511,https://www.youtube.com/watch?v=ORrELERGIHs,VS Code +4e60007a-f5be-4bfc-9723-c39affa0a6d3,"https://campbell-muscle-lab.github.io/howtos_Python/pages/documentation/best_practices/vscode_docstring_extension/vscode_docstring_extension.html#:~:text=Type%2C%20Ctrl%20%2B%20Shift%20%2B%20P,select%20the%20NumPy%20docstring%20format.",VS Code +e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2,https://superuser.com/questions/1386061/how-to-suppress-some-python-errors-warnings-in-vs-code,VS Code +9439a27b-18ae-42d8-9778-5f68f891805e,https://stackoverflow.com/questions/75832474/how-to-keep-cursor-in-debug-console-when-debugging-in-visual-studio-code,VS Code 
+ae506c68-352c-4094-9caa-ee9d42052317,https://superuser.com/questions/1460404/get-visual-studio-code-terminal-history?rq=1,VS Code +ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae,https://superuser.com/questions/1748097/vs-code-disable-tree-view-find-explorer-search,VS Code +c714dcee-cad3-4e12-8f3c-12bdcfcdb048,https://superuser.com/questions/1417361/how-to-disable-file-filtering-in-vs-code-sidebar-explorer?rq=1,VS Code +930fdb3b-11a8-46fe-9bac-577332e2640e,https://superuser.com/questions/1270103/how-to-switch-the-cursor-between-terminal-and-code-in-vscode,VS Code +276cc624-87ea-4f08-ab93-f770e3790175,https://www.quora.com/unanswered/How-do-you-set-the-line-length-in-Visual-Studio-Code,VS Code +9d425400-e9b2-4424-9a4b-d4c7abac4140,https://superuser.com/questions/1466771/is-there-a-way-to-make-editor-tabs-stack-in-vs-code,VS Code +7a4deb26-d57d-4ea9-9a73-630f66a7b568,https://www.quora.com/How-do-I-edit-a-photo-in-GIMP,GIMP +554785e9-4523-4e7a-b8e1-8016f565f56a,https://www.quora.com/How-do-I-edit-a-photo-in-GIMP,GIMP +77b8ab4d-994f-43ac-8930-8ca087d7c4b4,https://superuser.com/questions/1636113/how-to-get-gimp-to-recognize-images-or-pictures-folder-as-the-default-folder-for,GIMP +f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce,https://superuser.com/questions/612338/how-do-i-select-and-move-an-object-in-gimp,GIMP +d52d6308-ec58-42b7-a2c9-de80e4837b2b,https://superuser.com/questions/1447106/how-to-get-rid-of-the-gimp-tool-options-box,GIMP +2a729ded-3296-423d-aec4-7dd55ed5fbb3,https://www.youtube.com/watch?v=lOzSiOIipSM,GIMP +b148e375-fe0b-4bec-90e7-38632b0d73c2,https://www.quora.com/How-do-I-add-layers-in-GIMP,GIMP +a746add2-cab0-4740-ac36-c3769d9bfb46,https://www.youtube.com/watch?v=_L_MMU22bAw,GIMP +7b7617bd-57cc-468e-9c91-40c4ec2bcb3d,https://www.youtube.com/watch?v=G_PjQAy0iiU,GIMP +d16c99dc-2a1e-46f2-b350-d97c86c85c15,https://stackoverflow.com/questions/75185543/use-gimp-to-resize-image-in-one-layer-only,GIMP +573f79b5-abfe-4507-b455-251d45fe6198,https://stackoverflow.com/questions/45196895/gimp-add-padding-to-multiple-images,GIMP +06ca5602-62ca-47f6-ad4f-da151cde54cc,https://stackoverflow.com/questions/74664666/how-to-export-palette-based-png-in-gimp,GIMP +fa9b1e10-4d2d-4a13-af76-7efa822b6a8b,https://stackoverflow.com/questions/24626608/how-to-combine-several-png-images-as-layers-in-a-single-xcf-image,GIMP +6b2b72ed-3a10-4849-876a-750f7cdf3886,https://stackoverflow.com/questions/21018007/resize-image-to-fit-canvas-gimp,GIMP +d0e42fd2-d290-46b3-b598-a6e2b7be9c85,https://stackoverflow.com/questions/56758689/stop-gimp-from-merging-layers-when-de-selecting,GIMP +e2dd0213-26db-4349-abe5-d5667bfd725c,https://superuser.com/questions/839650/how-to-move-an-inserted-text-box-in-gimp,GIMP +f723c744-e62c-4ae6-98d1-750d3cd7d79d,https://www.reddit.com/r/GIMP/comments/12e57w8/how_to_use_gimp_to_exaggerate_contrast/,GIMP +8d6b1c9c-1aab-47fe-9ba5-e84c838d0c57,https://www.quora.com/How-can-email-attachments-be-converted-into-a-word-document-using-Mozilla-Thunderbird,multiple +11e1e614-9696-4d94-88c9-8e556880d41a,https://ifttt.com/applets/L2A89geP-send-chrome-software-update-release-alerts-to-email,multiple +57956154-f0fe-486b-88b8-e7126da035a9,https://zapier.com/apps/email/integrations/google-sheets/547/get-email-notifications-for-new-rows-in-a-google-sheets-spreadsheet,multiple +ec14c524-b245-456d-abd6-ec12c746e9f8,https://zapier.com/apps/gmail/integrations/google-sheets/2618/save-new-gmail-emails-matching-certain-traits-to-a-google-spreadsheet,multiple 
+cbf5fbda-425e-4619-bcf2-0ea8d4c0bfa3,https://zapier.com/apps/google-sheets/integrations/google-slides/13919/refresh-charts-on-a-google-slides-presentation-when-rows-are-updated-on-google-sheets,multiple +a54284d0-7b93-4327-bfcc-3a421516dbdd,https://superuser.com/questions/655622/cannot-drag-images-from-thunderbird-to-word,multiple +58565672-7bfe-48ab-b828-db349231de6b,https://superuser.com/questions/1792660/open-link-from-other-application-does-not-open-the-url-in-firefox,multiple +6d72aad6-187a-4392-a4c4-ed87269c51cf,https://superuser.com/questions/923171/converting-openoffice-impress-presentation-to-video-without-screen-recording,multiple +937087b6-f668-4ba6-9110-60682ee33441,https://superuser.com/questions/187440/set-default-ubuntu-video-player-as-vlc,multiple +f8cfa149-d1c1-4215-8dac-4a0932bad3c2,https://superuser.com/questions/1803088/libreoffice-calc-clears-clipboard,multiple +5e974913-6905-4c3f-8b65-d7837f3931cc,https://stackoverflow.com/questions/61856141/how-can-i-start-thunderbird-and-minimize-the-window-on-startup-in-ubuntu,multiple +7c179dad-f1c7-4892-b53f-d1c4023d23c7,https://stackoverflow.com/questions/21155085/pasting-excel-tables-in-thunderbird-e-mail-client,multiple +4a68b2dd-70f2-4532-9bc1-d21878bd8cb2,https://stackoverflow.com/questions/65669955/thunderbird-how-to-send-a-mail-to-all-receivers-of-a-folder,multiple +c8457fde-b14b-4aba-b402-144842ea29e1,https://stackoverflow.com/questions/65788200/how-to-open-xlsx-files-in-ms-excel-from-vs-code,multiple +81c425f5-78f3-4771-afd6-3d2973825947,https://www.zyxware.com/articles/3770/how-to-transfer-data-in-libreoffice-calc-to-libreoffice-writer-in-table-format,multiple +bb83cab4-e5c7-42c7-a67b-e46068032b86,https://ask.libreoffice.org/t/save-impress-presentation-as-writer-document/5291/4,multiple +227d2f97-562b-4ccb-ae47-a5ec9e142fbb,https://discourse.gnome.org/t/gimp-and-libre-office-writer/15430/4,multiple +a6bbc08c-51e9-4ee4-9327-83d05075d960,https://forum.openoffice.org/en/forum/viewtopic.php?t=105055,multiple +964e6e03-ba31-466b-8c15-5a351a81f675,https://www.maketecheasier.com/mail-merge-thunderbird-calc/,multiple +2fe4b718-3bd7-46ec-bdce-b184f5653624,https://www.thewindowsclub.com/how-to-create-animated-gif-from-a-video-file-using-vlc-and-gimp,multiple +d02b9364-6bb0-4c7e-9dbd-4db62822bc26,https://stackoverflow.com/questions/38306910/simple-python-script-to-get-a-libreoffice-base-field-and-play-on-vlc,multiple +57fb469b-127a-46fa-8281-bbb3840efdf5,https://support.mozilla.org/en-US/questions/1150626,multiple +3680a5ee-6870-426a-a997-eba929a0d25c,https://unix.stackexchange.com/questions/510850/how-to-open-calc-from-terminal-and-insert-files,multiple +2d8c8a20-6f54-4c2e-ad56-61fbe7af6b78,https://www.quora.com/How-do-I-force-LibreOffice-Calc-to-recalculate-a-spreadsheet-from-the-command-line,multiple +ee9a3c83-f437-4879-8918-be5efbb9fac7,https://stackoverflow.com/questions/64589140/convert-ods-to-csv-using-command-line-when-libreoffice-instance-is-running,multiple +f7dfbef3-7697-431c-883a-db8583a4e4f9,https://www.thegeekdiary.com/libreoffice-command-examples-in-linux/,multiple +2b9493d7-49b8-493a-a71b-56cd1f4d6908,https://devicetests.com/kill-libreoffice-writer-command-line-ubuntu,multiple +51f5801c-18b3-4f25-b0c3-02f85507a078,https://github.com/danielrcollins1/ImpressExtractNotes,multiple +81de345e-5473-4cb6-a74d-b6abf3475a6a,https://stackoverflow.com/questions/45588952/how-can-i-compose-and-send-email-in-thunderbird-from-commandline,multiple 
+2c9fc0de-3ee7-45e1-a5df-c86206ad78b5,https://nikki-ricks.medium.com/how-to-use-git-add-commit-and-push-in-vs-code-and-command-line-35c0e8c47b62,multiple +510f64c8-9bcc-4be1-8d30-638705850618,https://www.geeksforgeeks.org/how-to-start-vs-code-from-the-terminal-command-line/,multiple +9ff484f7-5c09-4398-ae29-d5904e59e138,https://stackoverflow.com/questions/38606973/playing-opening-and-pausing-vlc-command-line-executed-from-python-scripts,multiple +d9b7c649-c975-4f53-88f5-940b29c47247,https://marketplace.uipath.com/listings/extract-the-first-1000-gmail-emails-from-the-current-month-in-a-new-google-sheets-report,multiple +be4ef0dc-0f70-4936-81d8-3cd2b04482f8,https://marketplace.uipath.com/listings/table-data-extraction-for-sales-opportunities-to-excel-workbook,multiple +78aed49a-a710-4321-a793-b611a7c5b56b,https://marketplace.uipath.com/listings/upload-email-attachments-from-gmail-to-google-drive,multiple +897e3b53-5d4d-444b-85cb-2cdc8a97d903,https://marketplace.uipath.com/listings/convert-word-file-to-pdf-and-store-in-onedrive,multiple +4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc,https://marketplace.uipath.com/listings/extract-data-from-a-new-invoice-file-in-google-drive-and-store-it-in-google-sheets4473,multiple +b52b40a5-ad70-4c53-b5b0-5650a8387052,https://marketplace.uipath.com/listings/merge-pdfs-from-gmail-email-attachments-and-upload-to-gogle-drive,multiple +46407397-a7d5-4c6b-92c6-dbe038b1457b,https://marketplace.uipath.com/listings/upload-to-google-drive-images-from-pdf-attachments-received-via-gmail,multiple +a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb,https://marketplace.uipath.com/listings/backup-important-emails-to-onedrive-or-sharepoint,multiple +665f4af1-617d-4009-baff-84ff66071e6a,https://www.howtogeek.com/663927/how-to-open-google-chrome-using-command-prompt-on-windows-10/#open-chrome-straight-to-a-specific-website,multiple +e6313b30-3903-4ed9-8c7d-4c47bf51fc96,https://stackoverflow.com/questions/12258086/how-do-i-run-google-chrome-as-root,multiple \ No newline at end of file diff --git a/resouce_collection/Source2Doc/get_Source_Doc.py b/resouce_collection/Source2Doc/get_Source_Doc.py new file mode 100644 index 0000000..5f5e207 --- /dev/null +++ b/resouce_collection/Source2Doc/get_Source_Doc.py @@ -0,0 +1,234 @@ +import csv +import os +import yt_dlp +from docx import Document +import requests +from bs4 import BeautifulSoup +from PIL import Image +import pytesseract +from io import BytesIO +from docx import Document +import re +import markdownify +from markdownify import markdownify as md + +def valid_xml_char_ordinal(c): + codepoint = ord(c) + # conditions ordered by presumed frequency + return ( + 0x20 <= codepoint <= 0xD7FF or + codepoint in (0x9, 0xA, 0xD) or + 0xE000 <= codepoint <= 0xFFFD or + 0x10000 <= codepoint <= 0x10FFFF + ) + +def download_and_clean_youtube_subtitles(video_url, txt_filepath): + # set up youtube-dl options to download the subtitles + subtitles_path = txt_filepath[0:-4] + ydl_opts = { + 'skip_download': True, + 'writesubtitles': True, + 'writeautomaticsub': True, # if no subtitles are available, try to generate them + 'subtitleslangs': ['en'], + 'outtmpl': f'{subtitles_path}.%(ext)s', + 'quiet': True, + } + + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + # download the subtitles + ydl.download([video_url]) + subtitle_file = f'{subtitles_path}.en.vtt' + + # read the subtitle file + subtitles = [] + try: + with open(subtitle_file, 'r', encoding='utf-8') as file: + lines = file.readlines() + + # define a pattern to match the time line + pattern = 
re.compile(r'(\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3})|(^WEBVTT)|(^Kind: captions)|(^Language: .*)') + + # clean the subtitles + for line in lines: + # skip timestamp lines, header lines, and blank lines + if pattern.match(line) or line.strip() == '': + continue + # add this subtitle line to the subtitles list, stripping trailing spaces and the newline + subtitles.append(line.strip()) + + # remove duplicated subtitles while preserving order + subtitles = list(dict.fromkeys(subtitles)) + + # save the subtitles as a txt file + with open(txt_filepath, 'w', encoding='utf-8') as f: + for line in subtitles: + if line: + f.write(line + '\n') + + except IOError: + print(f"Could not read file: {subtitle_file}") + +# scrape a webpage and perform OCR on images +def scrape_and_ocr_forum(url, doc): + response = requests.get(url) + soup = BeautifulSoup(response.content, 'html.parser') + + text_elements = soup.find_all(['h1', 'h2', 'h3', 'p', 'li']) + for element in text_elements: + doc.add_paragraph(element.get_text()) + + image_elements = soup.find_all('img') + for image in image_elements: + if 'src' not in image.attrs: + continue + image_url = image['src'] + if image_url.startswith('http'): + if not image_url.endswith('.svg') and not image_url.endswith('.png'): + continue + if 'neveragain.allstatics.com/2019/assets/icon/logo' in image_url: + continue + img_response = requests.get(image_url, stream=True) + img = Image.open(BytesIO(img_response.content)) + ocr_text = pytesseract.image_to_string(img) + + if ocr_text.strip(): + cleaned_string = ''.join(c for c in ocr_text if valid_xml_char_ordinal(c)) + doc.add_paragraph(cleaned_string) + +def superuser_to_markdown(url, doc_filepath): + response = requests.get(url) + soup = BeautifulSoup(response.content, 'html.parser') + + # set up the markdown document + markdown_content = "" + + # get the question title and body + question_title = soup.find('h1').get_text(strip=True) + question = soup.find('div', {'id': 'question'}) + if question: + question_body = question.find('div', {'class': 's-prose js-post-body'}).prettify() + markdown_content += f"# {question_title}\n\n" + markdownify.markdownify(question_body, heading_style="ATX") + "\n\n" + + # get all answers + answers = soup.find_all('div', {'class': 'answer'}) + for answer in answers: + answer_body = answer.find('div', {'class': 's-prose js-post-body'}).prettify() + markdown_content += markdownify.markdownify(answer_body, heading_style="ATX") + "\n\n" + + # deal with images and perform OCR + all_img_tags = (question.find_all('img') if question else []) + [img for answer in answers for img in answer.find_all('img')] + for img_tag in all_img_tags: + image_src = img_tag.get('src') or img_tag.get('data-src') # Superuser uses lazy loading + if image_src and image_src.startswith('http'): + img_response = requests.get(image_src, stream=True) + img = Image.open(BytesIO(img_response.content)) + ocr_text = pytesseract.image_to_string(img) + if ocr_text.strip(): # if the OCR result is not empty, add it to the markdown content + markdown_content += "```\n" + ocr_text.strip() + "\n```\n\n" + + with open(doc_filepath, 'w', encoding='utf-8') as f: + f.write(markdown_content) + + +def stack_overflow_to_markdown(url, doc_filepath): + response = requests.get(url) + soup = BeautifulSoup(response.content, 'html.parser') + + # set up the markdown document + markdown_content = "" + + # get the question title and body + question = soup.find('div', {'id': 'question'}) + question_title = soup.find('h1').get_text(strip=True) + if question: + 
question_body = question.find('div', {'class': 's-prose js-post-body'}).prettify() + markdown_content += f"# {question_title}\n\n" + markdownify.markdownify(question_body, heading_style="ATX") + "\n\n" + + # get all answers + answers = soup.find_all('div', {'class': 'answer'}) + for answer in answers: + answer_body = answer.find('div', {'class': 's-prose js-post-body'}).prettify() + markdown_content += markdownify.markdownify(answer_body, heading_style="ATX") + "\n\n" + + # deal with images and perform OCR + all_img_tags = soup.find_all('img') + for img_tag in all_img_tags: + image_url = img_tag['src'] + if image_url.startswith('http') and (image_url.endswith('.svg') or image_url.endswith('.png')): # make sure the image URL is valid + img_response = requests.get(image_url, stream=True) + img = Image.open(BytesIO(img_response.content)) + ocr_text = pytesseract.image_to_string(img) + if ocr_text.strip(): + markdown_content += "```\n" + ocr_text.strip() + "\n```\n\n" + + with open(doc_filepath, 'w', encoding='utf-8') as f: + f.write(markdown_content) + +def scrape_webpage_to_markdown(url, doc_filepath): + response = requests.get(url) + soup = BeautifulSoup(response.content, 'html.parser') + + articles = soup.find_all('article') or soup.find_all('main') or soup.find_all('div', {'class': 'lia-message-body-content'}) + + if not articles: + return + + markdown_content = '' + + # scrape the webpage and perform OCR on images + for article in articles: + for child in article.recursiveChildGenerator(): + # if this is an image, perform OCR + if child.name == 'img': + img_url = child.get('src') + if not img_url: + continue + if not img_url.startswith(('http:', 'https:')): + img_url = '{}{}'.format(url, img_url) + if not img_url.endswith('.svg') and not img_url.endswith('.png'): + continue + if 'neveragain.allstatics.com/2019/assets/icon/logo' in img_url: + continue + img_response = requests.get(img_url, stream=True) + img = Image.open(BytesIO(img_response.content)) + ocr_text = pytesseract.image_to_string(img) + if ocr_text.strip(): + markdown_content += '\n```plaintext\n{}\n```\n'.format(ocr_text.strip()) + continue + # not an image: skip bare text nodes and convert the remaining tags to markdown + if child.name is None: + continue + + html_str = str(child) + markdown_content += md(html_str) + '\n\n' + + with open(doc_filepath, 'w', encoding='utf-8') as f: + f.write(markdown_content) + + +# process a URL and save the result as a markdown or txt file +def process_url(url, doc_id, app): + doc_filepath = f"/content/drive/MyDrive/SourceDoc/{doc_id}_{app}.md" + txt_filepath = f"/content/drive/MyDrive/SourceDoc/{doc_id}_{app}.txt" + doc = Document() + + if 'youtube.com' in url or 'youtu.be' in url: + download_and_clean_youtube_subtitles(url, txt_filepath) + elif 'superuser.com' in url: + superuser_to_markdown(url, doc_filepath) + elif 'stackoverflow.com' in url: + stack_overflow_to_markdown(url, doc_filepath) + else: + scrape_webpage_to_markdown(url, doc_filepath) + +# read the CSV file and process each URL +csv_filepath = './Get_Source_Doc - Sheet1.csv' +with open(csv_filepath, 'r', newline='', encoding='utf-8') as csvfile: + reader = csv.DictReader(csvfile) + cnt = 55 + for row in reader: + # skip the first 55 rows + if cnt > 0: + cnt -= 1 + continue + process_url(row['Source'], row['id'], row['InvolvedApp']) + print(row) \ No newline at end of file
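+ +# Assumed third-party dependencies for the imports above (using the usual PyPI package names; +# pytesseract additionally needs the Tesseract OCR binary installed on the system): +#   pip install yt-dlp python-docx requests beautifulsoup4 Pillow pytesseract markdownify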