Improve agent code; add code for auto-running experiments; fix some examples

This commit is contained in:
Timothyxxx
2024-01-27 19:47:47 +08:00
parent f8ff612b85
commit 909aa868f3
8 changed files with 283 additions and 56 deletions

View File

@@ -11,7 +11,7 @@
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1XaTnC_lLbR_tGTz8tcN2Tp6cNrMlNW3R&export=download&authuser=0&confirm=t&uuid=89e69a23-43cf-4316-833a-fb9d3e281460&at=APZUnTWn5zZTH4GlClO6lV1i4WwP:1706184669922",
"path": "poster_party_night.webp"
"path": "/home/user/Desktop/poster_party_night.webp"
}
]
}
@@ -19,7 +19,7 @@
{
"type": "execute",
"parameters": {
"command": "mv ~/poster_party_night.webp ~/.local/share/Trash/files/",
"command": "gio trash /home/user/Desktop/poster_party_night.webp",
"shell": true
}
},

View File

@@ -1,7 +1,7 @@
{
"id": "f3977615-2b45-4ac5-8bba-80c17dbe2a37",
"snapshot": "chrome",
"instruction": "I want to watch two or more videos in same time on VLC. I tried to run multiple instances of VLC. It worked but can't play videos on those new instances. When I play video it plays on first instance instead of new instance.\nIs there any way to solve this problem? Take the three videos on my desktop for example, do that for me.",
"instruction": "I want to watch two or more videos in same time on VLC. I tried to run multiple instances of VLC. It worked but can't play videos on those new instances. When I play video it plays on first instance instead of new instance.\nIs there any way to solve this problem?",
"source": "https://www.reddit.com/r/Fedora/comments/rhljzd/how_to_run_multiple_instances_of_vlc_media_player/",
"config": [
{

View File

@@ -3,7 +3,8 @@ import json
import logging
import os
import sys
import threading
import time
from desktop_env.envs.desktop_env import DesktopEnv
from mm_agents.gpt_4v_agent import GPT4v_Agent
@@ -61,8 +62,6 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr
env.controller.start_recording()
while not done and step_num < max_steps:
with open("accessibility_tree.xml", "w", encoding="utf-8") as f:
f.write(observation["accessibility_tree"])
actions = agent.predict(observation)
step_num += 1
for action in actions:
@@ -98,34 +97,63 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr
logger.info("The episode is done.")
break
if recording:
# send a request to the server to stop recording
env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))
def stop_recording():
    """Ask the controller to end the recording, passing the target video path.

    Any exception raised by the request is reported and swallowed so that a
    failed recording does not abort the rest of the episode.
    """
    try:
        env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))
    except Exception as e:
        print(f"An error occurred while stopping the recording: {e}")

# Pass the function object itself as the target. Writing `stop_recording()`
# here would run the request synchronously in the calling thread and hand
# `None` to Thread, so the timeout below would never apply.
# daemon=True keeps a hung request from blocking interpreter exit.
recording_thread = threading.Thread(target=stop_recording, daemon=True)
recording_thread.start()

# Wait for the request to finish, but no longer than `timeout` seconds.
timeout = 60  # seconds
recording_thread.join(timeout)

# threading.Thread has no kill(); a thread cannot be stopped from outside.
# If the request is still pending, report it and carry on without it.
if recording_thread.is_alive():
    print("Timeout reached. Stopping recording.")
result = env.evaluate()
logger.info("Result: %.2f", result)
with open(trajectory_recording_path, "a") as f:
f.write(json.dumps({
"result": result
}))
f.write("\n")
# env.close()
logger.info("Environment closed.")
if __name__ == "__main__":
def main(example_class, example_id):
action_space = "pyautogui"
example_class = "chrome"
example_id = "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3"
gpt4_model = "gpt-4-vision-preview"
gpt4_model = "gpt-4-0125-preview"
gemini_model = "gemini-pro-vision"
logger.info("Running example %s/%s", example_class, example_id)
logger.info("Using model %s", gpt4_model)
# logger.info("Using model %s", gemini_model)
with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f:
with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f:
example = json.load(f)
example["snapshot"] = "exp_setup4"
example["snapshot"] = "exp_chrome"
api_key = os.environ.get("OPENAI_API_KEY")
agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'],
agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], max_tokens=1000,
action_space=action_space, exp="a11y_tree")
# api_key = os.environ.get("GENAI_API_KEY")
@@ -139,3 +167,45 @@ if __name__ == "__main__":
os.makedirs(example_trajectory_dir, exist_ok=True)
run_one_example(example, agent, 15, example_trajectory_dir)
if __name__ == '__main__':
    # Ids of the "vlc" examples to run; comment entries in or out to select them.
    vlc_list = [
        # "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89",
        # "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89",
        # "8f080098-ddb1-424c-b438-4e96e5e4786e",
        # "bba3381f-b5eb-4439-bd9e-80c22218d5a7",
        # "fba2c100-79e8-42df-ae74-b592418d54f4",
        # "efcf0d81-0835-4880-b2fd-d866e8bc2294",
        # "8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f",
        # "aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6",
        # "386dbd0e-0241-4a0a-b6a2-6704fba26b1c",
        # "9195653c-f4aa-453d-aa95-787f6ccfaae9",
        # "d06f0d4d-2cd5-4ede-8de9-598629438c6e",
        # "a5bbbcd5-b398-4c91-83d4-55e1e31bbb81",
        "f3977615-2b45-4ac5-8bba-80c17dbe2a37",
        "215dfd39-f493-4bc3-a027-8a97d72c61bf"
    ]

    # Maximum wall-clock time granted to a single example.
    timeout = 600  # seconds

    for example_id in vlc_list:
        # Run each example in its own thread so the main thread can stop
        # waiting after `timeout`. daemon=True prevents an example that never
        # returns from keeping the process alive once the loop is done.
        example_thread = threading.Thread(target=main, args=("vlc", example_id), daemon=True)
        example_thread.start()

        # Block until the example finishes or the timeout elapses.
        example_thread.join(timeout)

        # threading.Thread has no kill(); calling it raised AttributeError and
        # aborted the whole batch. A timed-out example is reported and left
        # behind instead.
        # NOTE(review): the abandoned thread may still be running while the
        # next example starts - confirm that this is acceptable for the env.
        if example_thread.is_alive():
            print("Timeout reached. Kill this example.")

View File

@@ -3,10 +3,12 @@ import json
import logging
import os
import sys
import threading
import time
from desktop_env.envs.desktop_env import DesktopEnv
from mm_agents.gpt_4v_agent import GPT4v_Agent
from mm_agents.gemini_pro_agent import GeminiPro_Agent
# from mm_agents.gemini_pro_agent import GeminiPro_Agent
# Logger Configs {{{ #
logger = logging.getLogger()
@@ -98,21 +100,50 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr
logger.info("The episode is done.")
break
if recording:
# send a request to the server to stop recording
env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))
def stop_recording():
    """Ask the controller to end the recording, passing the target video path.

    Any exception raised by the request is reported and swallowed so that a
    failed recording does not abort the rest of the episode.
    """
    try:
        env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))
    except Exception as e:
        print(f"An error occurred while stopping the recording: {e}")

# Pass the function object itself as the target. Writing `stop_recording()`
# here would run the request synchronously in the calling thread and hand
# `None` to Thread, so the timeout below would never apply.
# daemon=True keeps a hung request from blocking interpreter exit.
recording_thread = threading.Thread(target=stop_recording, daemon=True)
recording_thread.start()

# Wait for the request to finish, but no longer than `timeout` seconds.
timeout = 60  # seconds
recording_thread.join(timeout)

# threading.Thread has no kill(); a thread cannot be stopped from outside.
# If the request is still pending, report it and carry on without it.
if recording_thread.is_alive():
    print("Timeout reached. Stopping recording.")
result = env.evaluate()
logger.info("Result: %.2f", result)
with open(trajectory_recording_path, "a") as f:
f.write(json.dumps({
"result": result
}))
f.write("\n")
# env.close()
logger.info("Environment closed.")
if __name__ == "__main__":
def main(example_class, example_id):
action_space = "pyautogui"
example_class = "thunderbird"
example_id = "bb5e4c0d-f964-439c-97b6-bdb9747de3f4"
gpt4_model = "gpt-4-vision-preview"
gemini_model = "gemini-pro-vision"
@@ -120,21 +151,28 @@ if __name__ == "__main__":
logger.info("Using model %s", gpt4_model)
# logger.info("Using model %s", gemini_model)
with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f:
with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f:
example = json.load(f)
example["snapshot"] = "exp_setup2"
example["snapshot"] = "exp_chrome"
# api_key = os.environ.get("OPENAI_API_KEY")
# agent = GPT4v_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space, exp="screenshot")
api_key = os.environ.get("GENAI_API_KEY")
agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space, exp="screenshot")
api_key = os.environ.get("OPENAI_API_KEY")
agent = GPT4v_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space, exp="screenshot")
#
# api_key = os.environ.get("GENAI_API_KEY")
# agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space, exp="screenshot")
root_trajectory_dir = "exp_trajectory"
example_trajectory_dir = os.path.join(root_trajectory_dir, "a11y_tree", example_class, gpt4_model, example_id)
# example_trajectory_dir = os.path.join(root_trajectory_dir, "a11y_tree", example_class, gemini_model, example_id)
example_trajectory_dir = os.path.join(root_trajectory_dir, "screenshot", example_class, gpt4_model, example_id)
# example_trajectory_dir = os.path.join(root_trajectory_dir, "screenshot", example_class, gemini_model, example_id)
os.makedirs(example_trajectory_dir, exist_ok=True)
run_one_example(example, agent, 15, example_trajectory_dir)
if __name__ == '__main__':
    # Placeholder batch: list example ids here to run them one after another
    # through main() with "xx" as the example class. Empty for now, so the
    # loop below does nothing.
    xx_list = []
    for example_id in xx_list:
        main("xx", example_id)

View File

@@ -3,6 +3,8 @@ import json
import logging
import os
import sys
import threading
import time
from desktop_env.envs.desktop_env import DesktopEnv
from mm_agents.gpt_4v_agent import GPT4v_Agent
@@ -96,21 +98,50 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr
logger.info("The episode is done.")
break
if recording:
# send a request to the server to stop recording
env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))
def stop_recording():
    """Ask the controller to end the recording, passing the target video path.

    Any exception raised by the request is reported and swallowed so that a
    failed recording does not abort the rest of the episode.
    """
    try:
        env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))
    except Exception as e:
        print(f"An error occurred while stopping the recording: {e}")

# Pass the function object itself as the target. Writing `stop_recording()`
# here would run the request synchronously in the calling thread and hand
# `None` to Thread, so the timeout below would never apply.
# daemon=True keeps a hung request from blocking interpreter exit.
recording_thread = threading.Thread(target=stop_recording, daemon=True)
recording_thread.start()

# Wait for the request to finish, but no longer than `timeout` seconds.
timeout = 60  # seconds
recording_thread.join(timeout)

# threading.Thread has no kill(); a thread cannot be stopped from outside.
# If the request is still pending, report it and carry on without it.
if recording_thread.is_alive():
    print("Timeout reached. Stopping recording.")
result = env.evaluate()
logger.info("Result: %.2f", result)
with open(trajectory_recording_path, "a") as f:
f.write(json.dumps({
"result": result
}))
f.write("\n")
# env.close()
logger.info("Environment closed.")
if __name__ == "__main__":
def main(example_class, example_id):
action_space = "pyautogui"
example_class = "chrome"
example_id = "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3"
gpt4_model = "gpt-4-vision-preview"
gemini_model = "gemini-pro-vision"
@@ -118,9 +149,9 @@ if __name__ == "__main__":
logger.info("Using model %s", gpt4_model)
# logger.info("Using model %s", gemini_model)
with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f:
with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f:
example = json.load(f)
example["snapshot"] = "exp_setup4"
example["snapshot"] = "exp_chrome"
api_key = os.environ.get("OPENAI_API_KEY")
agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'],
@@ -137,3 +168,10 @@ if __name__ == "__main__":
os.makedirs(example_trajectory_dir, exist_ok=True)
run_one_example(example, agent, 15, example_trajectory_dir)
if __name__ == '__main__':
    # Placeholder batch: list example ids here to run them one after another
    # through main() with "xx" as the example class. Empty for now, so the
    # loop below does nothing.
    xx_list = []
    for example_id in xx_list:
        main("xx", example_id)

View File

@@ -3,6 +3,8 @@ import json
import logging
import os
import sys
import threading
import time
from desktop_env.envs.desktop_env import DesktopEnv
from mm_agents.gpt_4v_agent import GPT4v_Agent
@@ -96,27 +98,56 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr
logger.info("The episode is done.")
break
if recording:
# send a request to the server to stop recording
env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))
def stop_recording():
    """Ask the controller to end the recording, passing the target video path.

    Any exception raised by the request is reported and swallowed so that a
    failed recording does not abort the rest of the episode.
    """
    try:
        env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))
    except Exception as e:
        print(f"An error occurred while stopping the recording: {e}")

# Pass the function object itself as the target. Writing `stop_recording()`
# here would run the request synchronously in the calling thread and hand
# `None` to Thread, so the timeout below would never apply.
# daemon=True keeps a hung request from blocking interpreter exit.
recording_thread = threading.Thread(target=stop_recording, daemon=True)
recording_thread.start()

# Wait for the request to finish, but no longer than `timeout` seconds.
timeout = 60  # seconds
recording_thread.join(timeout)

# threading.Thread has no kill(); a thread cannot be stopped from outside.
# If the request is still pending, report it and carry on without it.
if recording_thread.is_alive():
    print("Timeout reached. Stopping recording.")
result = env.evaluate()
logger.info("Result: %.2f", result)
with open(trajectory_recording_path, "a") as f:
f.write(json.dumps({
"result": result
}))
f.write("\n")
# env.close()
logger.info("Environment closed.")
if __name__ == "__main__":
def main(example_class, example_id):
action_space = "pyautogui"
example_class = "chrome"
example_id = "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3"
gpt4_model = "gpt-4-vision-preview"
gemini_model = "gemini-pro-vision"
with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f:
with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f:
example = json.load(f)
example["snapshot"] = "exp_setup4"
example["snapshot"] = "exp_chrome"
api_key = os.environ.get("OPENAI_API_KEY")
agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'],
@@ -133,3 +164,10 @@ if __name__ == "__main__":
os.makedirs(example_trajectory_dir, exist_ok=True)
run_one_example(example, agent, 15, example_trajectory_dir)
if __name__ == '__main__':
    # Placeholder batch: list example ids here to run them one after another
    # through main() with "xx" as the example class. Empty for now, so the
    # loop below does nothing.
    xx_list = []
    for example_id in xx_list:
        main("xx", example_id)

View File

@@ -3,6 +3,8 @@ import json
import logging
import os
import sys
import threading
import time
from desktop_env.envs.desktop_env import DesktopEnv
from mm_agents.gpt_4v_agent import GPT4v_Agent
@@ -96,27 +98,56 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr
logger.info("The episode is done.")
break
if recording:
# send a request to the server to stop recording
env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))
def stop_recording():
    """Ask the controller to end the recording, passing the target video path.

    Any exception raised by the request is reported and swallowed so that a
    failed recording does not abort the rest of the episode.
    """
    try:
        env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))
    except Exception as e:
        print(f"An error occurred while stopping the recording: {e}")

# Pass the function object itself as the target. Writing `stop_recording()`
# here would run the request synchronously in the calling thread and hand
# `None` to Thread, so the timeout below would never apply.
# daemon=True keeps a hung request from blocking interpreter exit.
recording_thread = threading.Thread(target=stop_recording, daemon=True)
recording_thread.start()

# Wait for the request to finish, but no longer than `timeout` seconds.
timeout = 60  # seconds
recording_thread.join(timeout)

# threading.Thread has no kill(); a thread cannot be stopped from outside.
# If the request is still pending, report it and carry on without it.
if recording_thread.is_alive():
    print("Timeout reached. Stopping recording.")
result = env.evaluate()
logger.info("Result: %.2f", result)
with open(trajectory_recording_path, "a") as f:
f.write(json.dumps({
"result": result
}))
f.write("\n")
# env.close()
logger.info("Environment closed.")
if __name__ == "__main__":
def main(example_class, example_id):
action_space = "pyautogui"
example_class = "chrome"
example_id = "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3"
gpt4_model = "gpt-4-vision-preview"
gemini_model = "gemini-pro-vision"
with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f:
with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f:
example = json.load(f)
example["snapshot"] = "exp_setup4"
example["snapshot"] = "exp_chrome"
api_key = os.environ.get("OPENAI_API_KEY")
agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'],
@@ -133,3 +164,10 @@ if __name__ == "__main__":
os.makedirs(example_trajectory_dir, exist_ok=True)
run_one_example(example, agent, 15, example_trajectory_dir)
if __name__ == '__main__':
    # Placeholder batch: list example ids here to run them one after another
    # through main() with "xx" as the example class. Empty for now, so the
    # loop below does nothing.
    xx_list = []
    for example_id in xx_list:
        main("xx", example_id)

View File

@@ -63,6 +63,8 @@ def tag_screenshot(screenshot, accessibility_tree):
def parse_actions_from_string(input_string):
if input_string.strip() in ['WAIT', 'DONE', 'FAIL']:
return [input_string.strip()]
# Search for a JSON string within the input string
actions = []
matches = re.findall(r'```json\s+(.*?)\s+```', input_string, re.DOTALL)
@@ -95,6 +97,9 @@ def parse_actions_from_string(input_string):
def parse_code_from_string(input_string):
if input_string.strip() in ['WAIT', 'DONE', 'FAIL']:
return [input_string.strip()]
# This regular expression will match both ```code``` and ```python code```
# and capture the `code` part. It uses a non-greedy match for the content inside.
pattern = r"```(?:\w+\s+)?(.*?)```"