diff --git a/evaluation_examples/test_all.json b/evaluation_examples/test_all.json new file mode 100644 index 0000000..0514d47 --- /dev/null +++ b/evaluation_examples/test_all.json @@ -0,0 +1,400 @@ +{ + "chrome": [ + "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", + "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", + "06fe7178-4491-4589-810f-2e2bc9502122", + "e1e75309-3ddb-4d09-92ec-de869c928143", + "35253b65-1c19-4304-8aa4-6884b8218fc0", + "2ad9387a-65d8-4e33-ad5b-7580065a27ca", + "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263", + "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938", + "2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3", + "480bcfea-d68f-4aaa-a0a9-2589ef319381", + "af630914-714e-4a24-a7bb-f9af687d3b91", + "3720f614-37fd-4d04-8a6b-76f54f8c222d", + "99146c54-4f37-4ab8-9327-5f3291665e1e", + "12086550-11c0-466b-b367-1d9e75b3910e", + "6766f2b8-8a72-417f-a9e5-56fcaa735837", + "93eabf48-6a27-4cb6-b963-7d5fe1e0d3a9", + "ae78f875-5b98-4907-bbb5-9c737fc68c03", + "3299584d-8f11-4457-bf4c-ce98f7600250", + "030eeff7-b492-4218-b312-701ec99ee0cc", + "9656a811-9b5b-4ddf-99c7-5117bcef0626", + "fc6d8143-9452-4171-9459-7f515143419a", + "a96b564e-dbe9-42c3-9ccf-b4498073438a", + "1704f00f-79e6-43a7-961b-cedd3724d5fd", + "f3b19d1e-2d48-44e9-b4e1-defcae1a0197", + "82bc8d6a-36eb-4d2d-8801-ef714fb1e55a", + "47543840-672a-467d-80df-8f7c3b9788c9", + "c1fa57f3-c3db-4596-8f09-020701085416", + "da46d875-6b82-4681-9284-653b0c7ae241", + "6c4c23a1-42a4-43cc-9db1-2f86ff3738cc", + "f79439ad-3ee8-4f99-a518-0eb60e5652b0", + "b7895e80-f4d1-4648-bee0-4eb45a6f1fa8", + "9f3f70fc-5afc-4958-a7b7-3bb4fcb01805", + "7f52cab9-535c-4835-ac8c-391ee64dc930", + "82279c77-8fc6-46f6-9622-3ba96f61b477", + "2888b4e6-5b47-4b57-8bf5-c73827890774", + "b4f95342-463e-4179-8c3f-193cd7241fb2", + "f5d96daf-83a8-4c86-9686-bada31fc66ab", + "121ba48f-9e17-48ce-9bc6-a4fb17a7ebba", + "368d9ba4-203c-40c1-9fa3-da2f1430ce63", + "59155008-fe71-45ec-8a8f-dc35497b6aa8", + "a728a36e-8bf1-4bb6-9a03-ef039a5233f0", + "b070486d-e161-459b-aa2b-ef442d973b92", + "0d8b7de3-e8de-4d86-b9fd-dd2dce58a217", + "9f935cce-0a9f-435f-8007-817732bfc0a5", + "f0b971a1-6831-4b9b-a50e-22a6e47f45ba", + "cabb3bae-cccb-41bd-9f5d-0f3a9fecd825" + ], + "gimp": [ + "7a4deb26-d57d-4ea9-9a73-630f66a7b568", + "554785e9-4523-4e7a-b8e1-8016f565f56a", + "77b8ab4d-994f-43ac-8930-8ca087d7c4b4", + "f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce", + "d52d6308-ec58-42b7-a2c9-de80e4837b2b", + "2a729ded-3296-423d-aec4-7dd55ed5fbb3", + "b148e375-fe0b-4bec-90e7-38632b0d73c2", + "a746add2-cab0-4740-ac36-c3769d9bfb46", + "7b7617bd-57cc-468e-9c91-40c4ec2bcb3d", + "d16c99dc-2a1e-46f2-b350-d97c86c85c15", + "06ca5602-62ca-47f6-ad4f-da151cde54cc", + "e2dd0213-26db-4349-abe5-d5667bfd725c", + "f723c744-e62c-4ae6-98d1-750d3cd7d79d", + "72f83cdc-bf76-4531-9a1b-eb893a13f8aa", + "7767eef2-56a3-4cea-8c9f-48c070c7d65b", + "734d6579-c07d-47a8-9ae2-13339795476b", + "e19bd559-633b-4b02-940f-d946248f088e", + "38f48d40-764e-4e77-a7cf-51dfce880291", + "fbb548ca-c2a6-4601-9204-e39a2efc507b", + "5ca86c6f-f317-49d8-b6a7-b527541caae8", + "62f7fd55-0687-4a43-b6e1-3eda16fc6252", + "8ea73f6f-9689-42ad-8c60-195bbf06a7ba", + "58d3eeeb-e9d0-499f-962e-fd0db2a744d8", + "2e6f678f-472d-4c55-99cc-8e7c5c402a71", + "045bf3ff-9077-4b86-b483-a1040a949cff", + "dbbf4b99-2253-4b10-9274-45f246af2466" + ], + "libreoffice_calc": [ + "357ef137-7eeb-4c80-a3bb-0951f26a8aff", + "42e0a640-4f19-4b28-973d-729602b5a4a7", + "51719eea-10bc-4246-a428-ac7c433dd4b3", + "1954cced-e748-45c4-9c26-9855b97fbc5e", + "2bd59342-0664-4ccb-ba87-79379096cc08", + "3aaa4e37-dc91-482e-99af-132a612d40f3", + "1273e544-688f-496b-8d89-3e0f40aa0606", + "12382c62-0cd1-4bf2-bdc8-1d20bf9b2371", + "f9584479-3d0d-4c79-affa-9ad7afdd8850", + "535364ea-05bd-46ea-9937-9f55c68507e8", + "7e429b8d-a3f0-4ed0-9b58-08957d00b127", + "4f07fbe9-70de-4927-a4d5-bb28bc12c52c", + "04d9aeaf-7bed-4024-bedb-e10e6f00eb7f", + "0bf05a7d-b28b-44d2-955a-50b41e24012a", + "6054afcb-5bab-4702-90a0-b259b5d3217c", + "abed40dc-063f-4598-8ba5-9fe749c0615d", + "37608790-6147-45d0-9f20-1137bb35703d", + "26a8440e-c166-4c50-aef4-bfb77314b46b", + "d681960f-7bc3-4286-9913-a8812ba3261a", + "035f41ba-6653-43ab-aa63-c86d449d62e5", + "7efeb4b1-3d19-4762-b163-63328d66303b", + "1de60575-bb6e-4c3d-9e6a-2fa699f9f197", + "aa3a8974-2e85-438b-b29e-a64df44deb4b", + "51b11269-2ca8-4b2a-9163-f21758420e78", + "1e8df695-bd1b-45b3-b557-e7d599cf7597", + "ecb0df7a-4e8d-4a03-b162-053391d3afaf", + "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14", + "7b802dad-6e0f-4204-9815-d4e3f57627d8", + "a01fbce3-2793-461f-ab86-43680ccbae25", + "0326d92d-d218-48a8-9ca1-981cd6d064c7", + "0a2e43bf-b26c-4631-a966-af9dfa12c9e5", + "4188d3a4-077d-46b7-9c86-23e1a036f6c1", + "347ef137-7eeb-4c80-a3bb-0951f26a8aff", + "eb03d19a-b88d-4de4-8a64-ca0ac66f426b", + "0cecd4f3-74de-457b-ba94-29ad6b5dafb6", + "1d17d234-e39d-4ed7-b46f-4417922a4e7c", + "4e6fcf72-daf3-439f-a232-c434ce416af6", + "01b269ae-2111-4a07-81fd-3fcd711993b0", + "21df9241-f8d7-4509-b7f1-37e501a823f7", + "a9f325aa-8c05-4e4f-8341-9e4358565f4f", + "6e99a1ad-07d2-4b66-a1ce-ece6d99c20a5", + "7a4e4bc8-922c-4c84-865c-25ba34136be1", + "4de54231-e4b5-49e3-b2ba-61a0bec721c0", + "30e3e107-1cfb-46ee-a755-2cd080d7ba6a", + "4172ea6e-6b77-4edb-a9cc-c0014bd1603b", + "1334ca3e-f9e3-4db8-9ca7-b4c653be7d17", + "3a7c8185-25c1-4941-bd7b-96e823c9f21f", + "21ab7b40-77c2-4ae6-8321-e00d3a086c73" + ], + "libreoffice_impress": [ + "5d901039-a89c-4bfb-967b-bf66f4df075e", + "550ce7e7-747b-495f-b122-acdc4d0b8e54", + "455d3c66-7dc6-4537-a39a-36d3e9119df7", + "af23762e-2bfd-4a1d-aada-20fa8de9ce07", + "c59742c0-4323-4b9d-8a02-723c251deaa0", + "ef9d12bd-bcee-4ba0-a40e-918400f43ddf", + "9ec204e4-f0a3-42f8-8458-b772a6797cab", + "0f84bef9-9790-432e-92b7-eece357603fb", + "ce88f674-ab7a-43da-9201-468d38539e4a", + "3b27600c-3668-4abd-8f84-7bcdebbccbdb", + "a097acff-6266-4291-9fbd-137af7ecd439", + "bf4e9888-f10f-47af-8dba-76413038b73c", + "21760ecb-8f62-40d2-8d85-0cee5725cb72", + "ac9bb6cb-1888-43ab-81e4-a98a547918cd", + "2cd43775-7085-45d8-89fa-9e35c0a915cf", + "358aa0a7-6677-453f-ae35-e440f004c31e", + "a669ef01-ded5-4099-9ea9-25e99b569840", + "73c99fb9-f828-43ce-b87a-01dc07faa224", + "15aece23-a215-4579-91b4-69eec72e18da", + "986fc832-6af2-417c-8845-9272b3a1528b", + "a434992a-89df-4577-925c-0c58b747f0f4", + "7dbc52a6-11e0-4c9a-a2cb-1e36cfda80d8", + "841b50aa-df53-47bd-a73a-22d3a9f73160", + "8979838c-54a5-4454-a2b8-3d135a1a5c8f", + "b8adbc24-cef2-4b15-99d5-ecbe7ff445eb", + "2b94c692-6abb-48ae-ab0b-b3e8a19cb340", + "9cf05d24-6bd9-4dae-8967-f67d88f5d38a", + "08aced46-45a2-48d7-993b-ed3fb5b32302", + "edb61b14-a854-4bf5-a075-c8075c11293a", + "c82632a4-56b6-4db4-9dd1-3820ee3388e4", + "39be0d19-634d-4475-8768-09c130f5425d", + "ac1b39ff-ee4d-4483-abce-c117e98942f0", + "f23acfd2-c485-4b7c-a1e7-d4303ddfe864", + "70bca0cc-c117-427e-b0be-4df7299ebeb6", + "af2d657a-e6b3-4c6a-9f67-9e3ed015974c", + "57667013-ea97-417c-9dce-2713091e6e2a", + "0a211154-fda0-48d0-9274-eaac4ce5486d", + "a53f80cd-4a90-4490-8310-097b011433f6", + "7ae48c60-f143-4119-b659-15b8f485eb9a", + "5cfb9197-e72b-454b-900e-c06b0c802b40", + "05dd4c1d-c489-4c85-8389-a7836c4f0567", + "5c1a6c3d-c1b3-47cb-9b01-8d1b7544ffa1", + "4ed5abd0-8b5d-47bd-839f-cacfa15ca37a", + "e4ef0baf-4b52-4590-a47e-d4d464cca2d7", + "ed43c15f-00cb-4054-9c95-62c880865d68", + "3161d64e-3120-47b4-aaad-6a764a92493b", + "04578141-1d42-4146-b9cf-6fab4ce5fd74" + ], + "libreoffice_writer": [ + "0810415c-bde4-4443-9047-d5f70165a697", + "0a0faba3-5580-44df-965d-f562a99b291c", + "0b17a146-2934-46c7-8727-73ff6b6483e8", + "0e47de2a-32e0-456c-a366-8c607ef7a9d2", + "0e763496-b6bb-4508-a427-fad0b6c3e195", + "3ef2b351-8a84-4ff2-8724-d86eae9b842e", + "4bcb1253-a636-4df4-8cb0-a35c04dfef31", + "66399b0d-8fda-4618-95c4-bfc6191617e9", + "6a33f9b9-0a56-4844-9c3f-96ec3ffb3ba2", + "6ada715d-3aae-4a32-a6a7-429b2e43fb93", + "6f81754e-285d-4ce0-b59e-af7edb02d108", + "72b810ef-4156-4d09-8f08-a0cf57e7cefe", + "8472fece-c7dd-4241-8d65-9b3cd1a0b568", + "88fe4b2d-3040-4c70-9a70-546a47764b48", + "936321ce-5236-426a-9a20-e0e3c5dc536f", + "adf5e2c3-64c7-4644-b7b6-d2f0167927e7", + "b21acd93-60fd-4127-8a43-2f5178f4a830", + "d53ff5ee-3b1a-431e-b2be-30ed2673079b", + "e246f6d8-78d7-44ac-b668-fcf47946cb50", + "e528b65e-1107-4b8c-8988-490e4fece599", + "ecc2413d-8a48-416e-a3a2-d30106ca36cb", + "f178a4a9-d090-4b56-bc4c-4b72a61a035d", + "bb8ccc78-479f-4a2f-a71e-d565e439436b" + ], + "multi_apps": [ + "2b9493d7-49b8-493a-a71b-56cd1f4d6908", + "2c9fc0de-3ee7-45e1-a5df-c86206ad78b5", + "2fe4b718-3bd7-46ec-bdce-b184f5653624", + "3680a5ee-6870-426a-a997-eba929a0d25c", + "46407397-a7d5-4c6b-92c6-dbe038b1457b", + "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", + "510f64c8-9bcc-4be1-8d30-638705850618", + "51f5801c-18b3-4f25-b0c3-02f85507a078", + "58565672-7bfe-48ab-b828-db349231de6b", + "78aed49a-a710-4321-a793-b611a7c5b56b", + "897e3b53-5d4d-444b-85cb-2cdc8a97d903", + "937087b6-f668-4ba6-9110-60682ee33441", + "a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb", + "b52b40a5-ad70-4c53-b5b0-5650a8387052", + "c867c42d-a52d-4a24-8ae3-f75d256b5618", + "d9b7c649-c975-4f53-88f5-940b29c47247", + "e135df7c-7687-4ac0-a5f0-76b74438b53e", + "ee9a3c83-f437-4879-8918-be5efbb9fac7", + "f7dfbef3-7697-431c-883a-db8583a4e4f9", + "f8cfa149-d1c1-4215-8dac-4a0932bad3c2", + "6d72aad6-187a-4392-a4c4-ed87269c51cf", + "f918266a-b3e0-4914-865d-4faa564f1aef", + "da52d699-e8d2-4dc5-9191-a2199e0b6a9b", + "bc2b57f3-686d-4ec9-87ce-edf850b7e442", + "74d5859f-ed66-4d3e-aa0e-93d7a592ce41", + "b5062e3e-641c-4e3a-907b-ac864d2e7652", + "00fa164e-2612-4439-992e-157d019a8436", + "acb0f96b-e27c-44d8-b55f-7cb76609dfcd", + "69acbb55-d945-4927-a87b-8480e1a5bb7e", + "48d05431-6cd5-4e76-82eb-12b60d823f7d", + "68a25bd4-59c7-4f4d-975e-da0c8509c848", + "eb303e01-261e-4972-8c07-c9b4e7a4922a", + "0c825995-5b70-4526-b663-113f4c999dd2", + "c7c1e4c3-9e92-4eba-a4b8-689953975ea4", + "d1acdb87-bb67-4f30-84aa-990e56a09c92", + "deec51c9-3b1e-4b9e-993c-4776f20e8bb2", + "8e116af7-7db7-4e35-a68b-b0939c066c78", + "337d318b-aa07-4f4f-b763-89d9a2dd013f", + "82e3c869-49f6-4305-a7ce-f3e64a0618e7", + "185f29bd-5da0-40a6-b69c-ba7f4e0324ef", + "869de13e-bef9-4b91-ba51-f6708c40b096", + "2c1ebcd7-9c6d-4c9a-afad-900e381ecd5e", + "3a93cae4-ad3e-403e-8c12-65303b271818", + "1f18aa87-af6f-41ef-9853-cdb8f32ebdea", + "26150609-0da3-4a7d-8868-0faf9c5f01bb", + "9219480b-3aed-47fc-8bac-d2cffc5849f7", + "881deb30-9549-4583-a841-8270c65f2a17", + "7e287123-70ca-47b9-8521-47db09b69b14", + "e2392362-125e-4f76-a2ee-524b183a3412", + "5bc63fb9-276a-4439-a7c1-9dc76401737f", + "26660ad1-6ebb-4f59-8cba-a8432dfe8d38", + "a82b78bb-7fde-4cb3-94a4-035baf10bcf0", + "36037439-2044-4b50-b9d1-875b5a332143", + "716a6079-22da-47f1-ba73-c9d58f986a38", + "873cafdd-a581-47f6-8b33-b9696ddb7b05", + "a74b607e-6bb5-4ea8-8a7c-5d97c7bbcd2a", + "6f4073b8-d8ea-4ade-8a18-c5d1d5d5aa9a", + "da922383-bfa4-4cd3-bbad-6bebab3d7742", + "2373b66a-092d-44cb-bfd7-82e86e7a3b4d", + "81c425f5-78f3-4771-afd6-3d2973825947", + "bb83cab4-e5c7-42c7-a67b-e46068032b86", + "227d2f97-562b-4ccb-ae47-a5ec9e142fbb", + "b337d106-053f-4d37-8da0-7f9c4043a66b", + "20236825-b5df-46e7-89bf-62e1d640a897", + "8df7e444-8e06-4f93-8a1a-c5c974269d82", + "aad10cd7-9337-4b62-b704-a857848cedf2", + "02ce9a50-7af2-47ed-8596-af0c230501f8", + "4c26e3f3-3a14-4d86-b44a-d3cedebbb487", + "a503b07f-9119-456b-b75d-f5146737d24f", + "09a37c51-e625-49f4-a514-20a773797a8a", + "3e3fc409-bff3-4905-bf16-c968eee3f807", + "f5c13cdd-205c-4719-a562-348ae5cd1d91", + "5990457f-2adb-467b-a4af-5c857c92d762", + "415ef462-bed3-493a-ac36-ca8c6d23bf1b", + "7ff48d5b-2df2-49da-b500-a5150ffc7f18", + "9f3bb592-209d-43bc-bb47-d77d9df56504", + "dd60633f-2c72-42ba-8547-6f2c8cb0fdb0", + "ce2b64a2-ddc1-4f91-8c7d-a88be7121aac", + "3f05f3b9-29ba-4b6b-95aa-2204697ffc06", + "e1fc0df3-c8b9-4ee7-864c-d0b590d3aa56", + "f8369178-fafe-40c2-adc4-b9b08a125456", + "778efd0a-153f-4842-9214-f05fc176b877", + "47f7c0ce-a5fb-4100-a5e6-65cd0e7429e5", + "c2751594-0cd5-4088-be1b-b5f2f9ec97c4", + "788b3701-3ec9-4b67-b679-418bfa726c22", + "48c46dc7-fe04-4505-ade7-723cba1aa6f6", + "42d25c08-fb87-4927-8b65-93631280a26f", + "bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108", + "e8172110-ec08-421b-a6f5-842e6451911f", + "42f4d1c7-4521-4161-b646-0a8934e36081", + "3c8f201a-009d-4bbe-8b65-a6f8b35bb57f", + "d68204bf-11c1-4b13-b48b-d303c73d4bf6", + "91190194-f406-4cd6-b3f9-c43fac942b22", + "7f35355e-02a6-45b5-b140-f0be698bcf85", + "98e8e339-5f91-4ed2-b2b2-12647cb134f4", + "0e5303d4-8820-42f6-b18d-daf7e633de21", + "df67aebb-fb3a-44fd-b75b-51b6012df509", + "5df7b33a-9f77-4101-823e-02f863e1c1ae", + "aceb0368-56b8-4073-b70e-3dc9aee184e0", + "22a4636f-8179-4357-8e87-d1743ece1f81", + "236833a3-5704-47fc-888c-4f298f09f799", + "67890eb6-6ce5-4c00-9e3d-fb4972699b06" + ], + "os": [ + "94d95f96-9699-4208-98ba-3c3119edf9c2", + "bedcedc4-4d72-425e-ad62-21960b11fe0d", + "43c2d64c-bab5-4dcb-a30c-b888321c319a", + "7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82", + "ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3", + "a462a795-fdc7-4b23-b689-e8b6df786b78", + "f9be0997-4b7c-45c5-b05c-4612b44a6118", + "28cc3b7e-b194-4bc9-8353-d04c0f4d56d2", + "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57", + "e0df059f-28a6-4169-924f-b9623e7184cc", + "ddc75b62-7311-4af8-bfb3-859558542b36", + "b6781586-6346-41cd-935a-a6b1487918fc", + "b3d4a89c-53f2-4d6b-8b6a-541fb5d205fa", + "3ce045a0-877b-42aa-8d2c-b4a863336ab8", + "fe41f596-a71b-4c2f-9b2f-9dcd40b568c3", + "a4d98375-215b-4a4d-aee9-3d4370fccc41", + "13584542-872b-42d8-b299-866967b5c3ef", + "23393935-50c7-4a86-aeea-2b78fd089c5c", + "5812b315-e7bd-4265-b51f-863c02174c28", + "c288e301-e626-4b98-a1ab-159dcb162af5", + "cc9d4f34-1ca0-4a1b-8ff2-09302696acb9", + "c56de254-a3ec-414e-81a6-83d2ce8c41fa", + "4783cc41-c03c-4e1b-89b4-50658f642bd5", + "5c1075ca-bb34-46a3-a7a0-029bd7463e79", + "5ced85fc-fa1a-4217-95fd-0fb530545ce2", + "37887e8c-da15-4192-923c-08fa390a176d", + "4127319a-8b79-4410-b58a-7a151e15f3d7", + "4d117223-a354-47fb-8b45-62ab1390a95f", + "6f56bf42-85b8-4fbb-8e06-6c44960184ba" + ], + "thunderbird": [ + "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", + "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", + "12086550-11c0-466b-b367-1d9e75b3910e", + "06fe7178-4491-4589-810f-2e2bc9502122", + "6766f2b8-8a72-417f-a9e5-56fcaa735837", + "e1e75309-3ddb-4d09-92ec-de869c928143", + "3d1682a7-0fb0-49ae-a4dc-a73afd2d06d5", + "35253b65-1c19-4304-8aa4-6884b8218fc0", + "d088f539-cab4-4f9a-ac92-9999fc3a656e", + "2ad9387a-65d8-4e33-ad5b-7580065a27ca", + "480bcfea-d68f-4aaa-a0a9-2589ef319381", + "030eeff7-b492-4218-b312-701ec99ee0cc", + "94760984-3ff5-41ee-8347-cf1af709fea0", + "99146c54-4f37-4ab8-9327-5f3291665e1e", + "c9e7eaf2-b1a1-4efc-a982-721972fa9f02" + ], + "vlc": [ + "59f21cfb-0120-4326-b255-a5b827b38967", + "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89", + "8f080098-ddb1-424c-b438-4e96e5e4786e", + "bba3381f-b5eb-4439-bd9e-80c22218d5a7", + "fba2c100-79e8-42df-ae74-b592418d54f4", + "efcf0d81-0835-4880-b2fd-d866e8bc2294", + "8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f", + "aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6", + "386dbd0e-0241-4a0a-b6a2-6704fba26b1c", + "9195653c-f4aa-453d-aa95-787f6ccfaae9", + "d06f0d4d-2cd5-4ede-8de9-598629438c6e", + "a5bbbcd5-b398-4c91-83d4-55e1e31bbb81", + "5ac2891a-eacd-4954-b339-98abba077adb", + "f3977615-2b45-4ac5-8bba-80c17dbe2a37", + "215dfd39-f493-4bc3-a027-8a97d72c61bf", + "cb130f0d-d36f-4302-9838-b3baf46139b6", + "7882ed6e-bece-4bf0-bada-c32dc1ddae72" + ], + "vs_code": [ + "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", + "53ad5833-3455-407b-bbc6-45b4c79ab8fb", + "eabc805a-bfcf-4460-b250-ac92135819f6", + "982d12a5-beab-424f-8d38-d2a48429e511", + "4e60007a-f5be-4bfc-9723-c39affa0a6d3", + "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2", + "9439a27b-18ae-42d8-9778-5f68f891805e", + "ae506c68-352c-4094-9caa-ee9d42052317", + "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae", + "c714dcee-cad3-4e12-8f3c-12bdcfcdb048", + "930fdb3b-11a8-46fe-9bac-577332e2640e", + "276cc624-87ea-4f08-ab93-f770e3790175", + "9d425400-e9b2-4424-9a4b-d4c7abac4140", + "5e2d93d8-8ad0-4435-b150-1692aacaa994", + "6ed0a554-cbee-4b44-84ea-fd6c042f4fe1", + "ec71221e-ac43-46f9-89b8-ee7d80f7e1c5", + "70745df8-f2f5-42bd-8074-fbc10334fcc5", + "57242fad-77ca-454f-b71b-f187181a9f23", + "c6bf789c-ba3a-4209-971d-b63abf0ab733", + "0512bb38-d531-4acf-9e7e-0add90816068", + "847a96b6-df94-4927-97e6-8cc9ea66ced7", + "7aeae0e2-70ee-4705-821d-1bba5d5b2ddd", + "dcbe20e8-647f-4f1d-8696-f1c5bbb570e3", + "7c4cc09e-7a92-40dd-8338-b2286535c4ed", + "971cbb5b-3cbf-4ff7-9e24-b5c84fcebfa6" + ] +} \ No newline at end of file diff --git a/experiment_screenshot.py b/experiment_screenshot.py index 37952bf..b426401 100644 --- a/experiment_screenshot.py +++ b/experiment_screenshot.py @@ -1,20 +1,15 @@ # todo: unifiy all the experiments python file into one file +import argparse import datetime import json import logging import os import sys -# import eventlet import func_timeout -from func_timeout import FunctionTimedOut from desktop_env.envs.desktop_env import DesktopEnv -from mm_agents.gpt_4v_agent import GPT4v_Agent - -# eventlet.monkey_patch() - -# from mm_agents.gemini_pro_agent import GeminiPro_Agent +from mm_agents.gpt_4v_agent import GPT4v_Agent # todo: change the name into PromptAgent # Logger Configs {{{ # logger = logging.getLogger() @@ -123,7 +118,6 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr result = env.evaluate() logger.info("Result: %.2f", result) - # fixme: change to write the result into a separate file with open(trajectory_recording_path, "a") as f: f.write(json.dumps({ @@ -176,7 +170,8 @@ def main(example_class, example_id, gpt4_model="gpt-4-vision-preview"): if len(lines) > 0: last_line = json.loads(lines[-1]) if "result" in last_line: - logger.info(f"evaluation_examples/examples/{example_class}/{example_id}.json" + "has been evaluated. Skip.") + logger.info( + f"evaluation_examples/examples/{example_class}/{example_id}.json" + "has been evaluated. Skip.") return try: @@ -189,510 +184,152 @@ def main(example_class, example_id, gpt4_model="gpt-4-vision-preview"): })) +def config() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Run end-to-end evaluation on the benchmark" + ) + parser.add_argument( + "--render", action="store_true", help="Render the browser" + ) + + parser.add_argument( + "--slow_mo", + type=int, + default=0, + help="Slow down the browser by the specified amount", + ) + parser.add_argument( + "--action_set_tag", default="id_accessibility_tree", help="Action type" + ) + parser.add_argument( + "--observation_type", + choices=[ + "accessibility_tree", + "accessibility_tree_with_captioner", + "html", + "image", + "image_som", + ], + default="accessibility_tree", + help="Observation type", + ) + parser.add_argument( + "--current_viewport_only", + action="store_true", + help="Only use the current viewport for the observation", + ) + parser.add_argument("--viewport_width", type=int, default=1280) + parser.add_argument("--viewport_height", type=int, default=2048) + parser.add_argument("--save_trace_enabled", action="store_true") + parser.add_argument("--sleep_after_execution", type=float, default=0.0) + + parser.add_argument("--max_steps", type=int, default=30) + + # agent config + parser.add_argument("--agent_type", type=str, default="prompt") + parser.add_argument( + "--instruction_path", + type=str, + default="agents/prompts/state_action_agent.json", + ) + parser.add_argument( + "--parsing_failure_th", + help="When consecutive parsing failures exceed this threshold, the agent will terminate early.", + type=int, + default=3, + ) + parser.add_argument( + "--repeating_action_failure_th", + help="When consecutive repeated actions exceed this threshold, the agent will terminate early.", + type=int, + default=5, + ) + + parser.add_argument("--test_config_base_dir", type=str) + + parser.add_argument( + "--eval_captioning_model_device", + type=str, + default="cpu", + choices=["cpu", "cuda"], + help="Device to run eval captioning model on. By default, runs it on CPU.", + ) + parser.add_argument( + "--eval_captioning_model", + type=str, + default="Salesforce/blip2-flan-t5-xl", + choices=["Salesforce/blip2-flan-t5-xl"], + help="Captioning backbone for VQA-type evals.", + ) + parser.add_argument( + "--captioning_model", + type=str, + default="Salesforce/blip2-flan-t5-xl", + choices=["Salesforce/blip2-flan-t5-xl", "llava-hf/llava-1.5-7b-hf"], + help="Captioning backbone for accessibility tree alt text.", + ) + + # lm config + parser.add_argument("--provider", type=str, default="openai") + parser.add_argument("--model", type=str, default="gpt-3.5-turbo-0613") + parser.add_argument("--mode", type=str, default="chat") + parser.add_argument("--temperature", type=float, default=1.0) + parser.add_argument("--top_p", type=float, default=0.9) + parser.add_argument("--context_length", type=int, default=0) + parser.add_argument("--max_tokens", type=int, default=384) + parser.add_argument("--stop_token", type=str, default=None) + parser.add_argument( + "--max_retry", + type=int, + help="max retry times to perform generations when parsing fails", + default=1, + ) + parser.add_argument( + "--max_obs_length", + type=int, + help="when not zero, will truncate the observation to this length before feeding to the model", + default=3840, + ) + + # example config + parser.add_argument("--test_start_idx", type=int, default=0) + parser.add_argument("--test_end_idx", type=int, default=910) + + # logging related + parser.add_argument("--result_dir", type=str, default="") + args = parser.parse_args() + + # check the whether the action space is compatible with the observation space + if ( + args.action_set_tag == "id_accessibility_tree" + and args.observation_type + not in [ + "accessibility_tree", + "accessibility_tree_with_captioner", + "image_som", + ] + ): + raise ValueError( + f"Action type {args.action_set_tag} is incompatible with the observation type {args.observation_type}" + ) + + return args + + if __name__ == '__main__': ####### The complete version of the list of examples ####### + os.environ["TOKENIZERS_PARALLELISM"] = "false" + args = config() + args.sleep_after_execution = 2.5 + prepare(args) # todo: add recorder of the progress of the examples # todo: remove the useless example files - os_list = [ - '94d95f96-9699-4208-98ba-3c3119edf9c2', - 'bedcedc4-4d72-425e-ad62-21960b11fe0d', - '43c2d64c-bab5-4dcb-a30c-b888321c319a', - '7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82', - 'ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3', - 'a462a795-fdc7-4b23-b689-e8b6df786b78', - 'f9be0997-4b7c-45c5-b05c-4612b44a6118', - '28cc3b7e-b194-4bc9-8353-d04c0f4d56d2', - '5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57', - 'e0df059f-28a6-4169-924f-b9623e7184cc', - 'ddc75b62-7311-4af8-bfb3-859558542b36', - 'b6781586-6346-41cd-935a-a6b1487918fc', - 'b3d4a89c-53f2-4d6b-8b6a-541fb5d205fa', - '3ce045a0-877b-42aa-8d2c-b4a863336ab8', - 'fe41f596-a71b-4c2f-9b2f-9dcd40b568c3', - 'a4d98375-215b-4a4d-aee9-3d4370fccc41', - '13584542-872b-42d8-b299-866967b5c3ef', - '23393935-50c7-4a86-aeea-2b78fd089c5c', - '5812b315-e7bd-4265-b51f-863c02174c28', - 'c288e301-e626-4b98-a1ab-159dcb162af5', - 'cc9d4f34-1ca0-4a1b-8ff2-09302696acb9', - 'c56de254-a3ec-414e-81a6-83d2ce8c41fa', - '4783cc41-c03c-4e1b-89b4-50658f642bd5', - '5c1075ca-bb34-46a3-a7a0-029bd7463e79', - '5ced85fc-fa1a-4217-95fd-0fb530545ce2', - '37887e8c-da15-4192-923c-08fa390a176d', - '4127319a-8b79-4410-b58a-7a151e15f3d7', - '4d117223-a354-47fb-8b45-62ab1390a95f', - '6f56bf42-85b8-4fbb-8e06-6c44960184ba' - ] + with open("evaluation_examples/test_all.json", "r", encoding="utf-8") as f: + test_all_meta = json.load(f) - calc_list = [ - 'eb03d19a-b88d-4de4-8a64-ca0ac66f426b', - '0bf05a7d-b28b-44d2-955a-50b41e24012a', - '7b802dad-6e0f-4204-9815-d4e3f57627d8', - '7a4e4bc8-922c-4c84-865c-25ba34136be1', - '2bd59342-0664-4ccb-ba87-79379096cc08', - 'a9f325aa-8c05-4e4f-8341-9e4358565f4f', - 'ecb0df7a-4e8d-4a03-b162-053391d3afaf', - '7efeb4b1-3d19-4762-b163-63328d66303b', - '4e6fcf72-daf3-439f-a232-c434ce416af6', - '6054afcb-5bab-4702-90a0-b259b5d3217c', - 'abed40dc-063f-4598-8ba5-9fe749c0615d', - '01b269ae-2111-4a07-81fd-3fcd711993b0', - '8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14', - '0cecd4f3-74de-457b-ba94-29ad6b5dafb6', - '4188d3a4-077d-46b7-9c86-23e1a036f6c1', - '51b11269-2ca8-4b2a-9163-f21758420e78', - '7e429b8d-a3f0-4ed0-9b58-08957d00b127', - '347ef137-7eeb-4c80-a3bb-0951f26a8aff', - '6e99a1ad-07d2-4b66-a1ce-ece6d99c20a5', - '3aaa4e37-dc91-482e-99af-132a612d40f3', - '37608790-6147-45d0-9f20-1137bb35703d', - 'f9584479-3d0d-4c79-affa-9ad7afdd8850', - 'd681960f-7bc3-4286-9913-a8812ba3261a', - '21df9241-f8d7-4509-b7f1-37e501a823f7', - '1334ca3e-f9e3-4db8-9ca7-b4c653be7d17', - '357ef137-7eeb-4c80-a3bb-0951f26a8aff', - 'aa3a8974-2e85-438b-b29e-a64df44deb4b', - 'a01fbce3-2793-461f-ab86-43680ccbae25', - '4f07fbe9-70de-4927-a4d5-bb28bc12c52c', - '1e8df695-bd1b-45b3-b557-e7d599cf7597', - '1273e544-688f-496b-8d89-3e0f40aa0606', - '4172ea6e-6b77-4edb-a9cc-c0014bd1603b', - '0326d92d-d218-48a8-9ca1-981cd6d064c7', - '26a8440e-c166-4c50-aef4-bfb77314b46b', - '1954cced-e748-45c4-9c26-9855b97fbc5e', - '535364ea-05bd-46ea-9937-9f55c68507e8', - '4de54231-e4b5-49e3-b2ba-61a0bec721c0', - '1de60575-bb6e-4c3d-9e6a-2fa699f9f197', - '0a2e43bf-b26c-4631-a966-af9dfa12c9e5', - '3a7c8185-25c1-4941-bd7b-96e823c9f21f', - '04d9aeaf-7bed-4024-bedb-e10e6f00eb7f', - '42e0a640-4f19-4b28-973d-729602b5a4a7', - '1d17d234-e39d-4ed7-b46f-4417922a4e7c', - '21ab7b40-77c2-4ae6-8321-e00d3a086c73', - '30e3e107-1cfb-46ee-a755-2cd080d7ba6a', - '12382c62-0cd1-4bf2-bdc8-1d20bf9b2371', - '035f41ba-6653-43ab-aa63-c86d449d62e5', - '51719eea-10bc-4246-a428-ac7c433dd4b3' - ] - - sheetcopilot_list = [ - # "1e8df695-bd1b-45b3-b557-e7d599cf7597", - # "1273e544-688f-496b-8d89-3e0f40aa0606", - # "4172ea6e-6b77-4edb-a9cc-c0014bd1603b", - # "0326d92d-d218-48a8-9ca1-981cd6d064c7", - # "26a8440e-c166-4c50-aef4-bfb77314b46b", - # "1954cced-e748-45c4-9c26-9855b97fbc5e", - # "535364ea-05bd-46ea-9937-9f55c68507e8", - # "4de54231-e4b5-49e3-b2ba-61a0bec721c0", - # "1de60575-bb6e-4c3d-9e6a-2fa699f9f197", - # "0a2e43bf-b26c-4631-a966-af9dfa12c9e5", - # "3a7c8185-25c1-4941-bd7b-96e823c9f21f", - # "04d9aeaf-7bed-4024-bedb-e10e6f00eb7f", - # "42e0a640-4f19-4b28-973d-729602b5a4a7", - # "1d17d234-e39d-4ed7-b46f-4417922a4e7c", - "21ab7b40-77c2-4ae6-8321-e00d3a086c73", - "30e3e107-1cfb-46ee-a755-2cd080d7ba6a", - "12382c62-0cd1-4bf2-bdc8-1d20bf9b2371", - "51719eea-10bc-4246-a428-ac7c433dd4b3" - ] - - impress_list = [ - '5d901039-a89c-4bfb-967b-bf66f4df075e', - '550ce7e7-747b-495f-b122-acdc4d0b8e54', - '455d3c66-7dc6-4537-a39a-36d3e9119df7', - 'af23762e-2bfd-4a1d-aada-20fa8de9ce07', - 'c59742c0-4323-4b9d-8a02-723c251deaa0', - 'ef9d12bd-bcee-4ba0-a40e-918400f43ddf', - '9ec204e4-f0a3-42f8-8458-b772a6797cab', - '0f84bef9-9790-432e-92b7-eece357603fb', - 'ce88f674-ab7a-43da-9201-468d38539e4a', - '3b27600c-3668-4abd-8f84-7bcdebbccbdb', - 'a097acff-6266-4291-9fbd-137af7ecd439', - 'bf4e9888-f10f-47af-8dba-76413038b73c', - '21760ecb-8f62-40d2-8d85-0cee5725cb72', - 'ac9bb6cb-1888-43ab-81e4-a98a547918cd', - '2cd43775-7085-45d8-89fa-9e35c0a915cf', - '358aa0a7-6677-453f-ae35-e440f004c31e', - 'a669ef01-ded5-4099-9ea9-25e99b569840', - '73c99fb9-f828-43ce-b87a-01dc07faa224', - '15aece23-a215-4579-91b4-69eec72e18da', - '986fc832-6af2-417c-8845-9272b3a1528b', - 'a434992a-89df-4577-925c-0c58b747f0f4', - '7dbc52a6-11e0-4c9a-a2cb-1e36cfda80d8', - '841b50aa-df53-47bd-a73a-22d3a9f73160', - '8979838c-54a5-4454-a2b8-3d135a1a5c8f', - 'b8adbc24-cef2-4b15-99d5-ecbe7ff445eb', - '2b94c692-6abb-48ae-ab0b-b3e8a19cb340', - '9cf05d24-6bd9-4dae-8967-f67d88f5d38a', - '08aced46-45a2-48d7-993b-ed3fb5b32302', - 'edb61b14-a854-4bf5-a075-c8075c11293a', - 'c82632a4-56b6-4db4-9dd1-3820ee3388e4', - '39be0d19-634d-4475-8768-09c130f5425d', - 'ac1b39ff-ee4d-4483-abce-c117e98942f0', - 'f23acfd2-c485-4b7c-a1e7-d4303ddfe864', - '70bca0cc-c117-427e-b0be-4df7299ebeb6', - 'af2d657a-e6b3-4c6a-9f67-9e3ed015974c', - '57667013-ea97-417c-9dce-2713091e6e2a', - '0a211154-fda0-48d0-9274-eaac4ce5486d', - 'a53f80cd-4a90-4490-8310-097b011433f6', - '7ae48c60-f143-4119-b659-15b8f485eb9a', - '5cfb9197-e72b-454b-900e-c06b0c802b40', - '05dd4c1d-c489-4c85-8389-a7836c4f0567', - '5c1a6c3d-c1b3-47cb-9b01-8d1b7544ffa1', - '4ed5abd0-8b5d-47bd-839f-cacfa15ca37a', - 'e4ef0baf-4b52-4590-a47e-d4d464cca2d7', - 'ed43c15f-00cb-4054-9c95-62c880865d68', - '3161d64e-3120-47b4-aaad-6a764a92493b', - '04578141-1d42-4146-b9cf-6fab4ce5fd74' - ] - - impress_gpt4_list = [ - # "5d901039-a89c-4bfb-967b-bf66f4df075e", - # "550ce7e7-747b-495f-b122-acdc4d0b8e54", - # "455d3c66-7dc6-4537-a39a-36d3e9119df7", - # "af23762e-2bfd-4a1d-aada-20fa8de9ce07", - # "c59742c0-4323-4b9d-8a02-723c251deaa0", - # "ef9d12bd-bcee-4ba0-a40e-918400f43ddf", - # "9ec204e4-f0a3-42f8-8458-b772a6797cab", - # "0f84bef9-9790-432e-92b7-eece357603fb", - # "ce88f674-ab7a-43da-9201-468d38539e4a", - # "3b27600c-3668-4abd-8f84-7bcdebbccbdb", - # "a097acff-6266-4291-9fbd-137af7ecd439", - # "bf4e9888-f10f-47af-8dba-76413038b73c", - # "21760ecb-8f62-40d2-8d85-0cee5725cb72", - "ac9bb6cb-1888-43ab-81e4-a98a547918cd", - "2cd43775-7085-45d8-89fa-9e35c0a915cf", - "358aa0a7-6677-453f-ae35-e440f004c31e", - "a669ef01-ded5-4099-9ea9-25e99b569840", - # The following examples are from PPTC - "73c99fb9-f828-43ce-b87a-01dc07faa224", - "15aece23-a215-4579-91b4-69eec72e18da", - "986fc832-6af2-417c-8845-9272b3a1528b", - "a434992a-89df-4577-925c-0c58b747f0f4", - "7dbc52a6-11e0-4c9a-a2cb-1e36cfda80d8", - "841b50aa-df53-47bd-a73a-22d3a9f73160", - "8979838c-54a5-4454-a2b8-3d135a1a5c8f", - "b8adbc24-cef2-4b15-99d5-ecbe7ff445eb", - "2b94c692-6abb-48ae-ab0b-b3e8a19cb340", - "9cf05d24-6bd9-4dae-8967-f67d88f5d38a", - "08aced46-45a2-48d7-993b-ed3fb5b32302", - "edb61b14-a854-4bf5-a075-c8075c11293a", - "c82632a4-56b6-4db4-9dd1-3820ee3388e4", - "39be0d19-634d-4475-8768-09c130f5425d", - "ac1b39ff-ee4d-4483-abce-c117e98942f0", - "f23acfd2-c485-4b7c-a1e7-d4303ddfe864", - "70bca0cc-c117-427e-b0be-4df7299ebeb6", - "af2d657a-e6b3-4c6a-9f67-9e3ed015974c", - "57667013-ea97-417c-9dce-2713091e6e2a", - "0a211154-fda0-48d0-9274-eaac4ce5486d", - "a53f80cd-4a90-4490-8310-097b011433f6", - "7ae48c60-f143-4119-b659-15b8f485eb9a", - "5cfb9197-e72b-454b-900e-c06b0c802b40", - "05dd4c1d-c489-4c85-8389-a7836c4f0567", - "5c1a6c3d-c1b3-47cb-9b01-8d1b7544ffa1", - "4ed5abd0-8b5d-47bd-839f-cacfa15ca37a", - "e4ef0baf-4b52-4590-a47e-d4d464cca2d7", - "ed43c15f-00cb-4054-9c95-62c880865d68", - "3161d64e-3120-47b4-aaad-6a764a92493b", - "04578141-1d42-4146-b9cf-6fab4ce5fd74" - ] - - writer_list = [ - '0810415c-bde4-4443-9047-d5f70165a697', - '0a0faba3-5580-44df-965d-f562a99b291c', - '0b17a146-2934-46c7-8727-73ff6b6483e8', - '0e47de2a-32e0-456c-a366-8c607ef7a9d2', - '0e763496-b6bb-4508-a427-fad0b6c3e195', - '3ef2b351-8a84-4ff2-8724-d86eae9b842e', - '4bcb1253-a636-4df4-8cb0-a35c04dfef31', - '66399b0d-8fda-4618-95c4-bfc6191617e9', - '6a33f9b9-0a56-4844-9c3f-96ec3ffb3ba2', - '6ada715d-3aae-4a32-a6a7-429b2e43fb93', - '6f81754e-285d-4ce0-b59e-af7edb02d108', - '72b810ef-4156-4d09-8f08-a0cf57e7cefe', - '8472fece-c7dd-4241-8d65-9b3cd1a0b568', - '88fe4b2d-3040-4c70-9a70-546a47764b48', - '936321ce-5236-426a-9a20-e0e3c5dc536f', - 'adf5e2c3-64c7-4644-b7b6-d2f0167927e7', - 'b21acd93-60fd-4127-8a43-2f5178f4a830', - 'd53ff5ee-3b1a-431e-b2be-30ed2673079b', - 'e246f6d8-78d7-44ac-b668-fcf47946cb50', - 'e528b65e-1107-4b8c-8988-490e4fece599', - 'ecc2413d-8a48-416e-a3a2-d30106ca36cb', - 'f178a4a9-d090-4b56-bc4c-4b72a61a035d', - 'bb8ccc78-479f-4a2f-a71e-d565e439436b' - ] - - vlc_list = [ - '59f21cfb-0120-4326-b255-a5b827b38967', - '8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89', - '8f080098-ddb1-424c-b438-4e96e5e4786e', - 'bba3381f-b5eb-4439-bd9e-80c22218d5a7', - 'fba2c100-79e8-42df-ae74-b592418d54f4', - 'efcf0d81-0835-4880-b2fd-d866e8bc2294', - '8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f', - 'aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6', - '386dbd0e-0241-4a0a-b6a2-6704fba26b1c', - '9195653c-f4aa-453d-aa95-787f6ccfaae9', - 'd06f0d4d-2cd5-4ede-8de9-598629438c6e', - 'a5bbbcd5-b398-4c91-83d4-55e1e31bbb81', - '5ac2891a-eacd-4954-b339-98abba077adb', - 'f3977615-2b45-4ac5-8bba-80c17dbe2a37', - '215dfd39-f493-4bc3-a027-8a97d72c61bf', - 'cb130f0d-d36f-4302-9838-b3baf46139b6', - '7882ed6e-bece-4bf0-bada-c32dc1ddae72' - ] - - chrome_list = [ - 'bb5e4c0d-f964-439c-97b6-bdb9747de3f4', - '7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3', - '06fe7178-4491-4589-810f-2e2bc9502122', - 'e1e75309-3ddb-4d09-92ec-de869c928143', - '35253b65-1c19-4304-8aa4-6884b8218fc0', - '2ad9387a-65d8-4e33-ad5b-7580065a27ca', - '7a5a7856-f1b6-42a4-ade9-1ca81ca0f263', - '44ee5668-ecd5-4366-a6ce-c1c9b8d4e938', - '2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3', - '480bcfea-d68f-4aaa-a0a9-2589ef319381', - 'af630914-714e-4a24-a7bb-f9af687d3b91', - '3720f614-37fd-4d04-8a6b-76f54f8c222d', - '99146c54-4f37-4ab8-9327-5f3291665e1e', - '12086550-11c0-466b-b367-1d9e75b3910e', - '6766f2b8-8a72-417f-a9e5-56fcaa735837', - '93eabf48-6a27-4cb6-b963-7d5fe1e0d3a9', - 'ae78f875-5b98-4907-bbb5-9c737fc68c03', - '3299584d-8f11-4457-bf4c-ce98f7600250', - '030eeff7-b492-4218-b312-701ec99ee0cc', - '9656a811-9b5b-4ddf-99c7-5117bcef0626', - 'fc6d8143-9452-4171-9459-7f515143419a', - 'a96b564e-dbe9-42c3-9ccf-b4498073438a', - '1704f00f-79e6-43a7-961b-cedd3724d5fd', - 'f3b19d1e-2d48-44e9-b4e1-defcae1a0197', - '82bc8d6a-36eb-4d2d-8801-ef714fb1e55a', - '47543840-672a-467d-80df-8f7c3b9788c9', - 'c1fa57f3-c3db-4596-8f09-020701085416', - 'da46d875-6b82-4681-9284-653b0c7ae241', - '6c4c23a1-42a4-43cc-9db1-2f86ff3738cc', - 'f79439ad-3ee8-4f99-a518-0eb60e5652b0', - 'b7895e80-f4d1-4648-bee0-4eb45a6f1fa8', - '9f3f70fc-5afc-4958-a7b7-3bb4fcb01805', - '7f52cab9-535c-4835-ac8c-391ee64dc930', - '82279c77-8fc6-46f6-9622-3ba96f61b477', - '2888b4e6-5b47-4b57-8bf5-c73827890774', - 'b4f95342-463e-4179-8c3f-193cd7241fb2', - 'f5d96daf-83a8-4c86-9686-bada31fc66ab', - '121ba48f-9e17-48ce-9bc6-a4fb17a7ebba', - '368d9ba4-203c-40c1-9fa3-da2f1430ce63', - '59155008-fe71-45ec-8a8f-dc35497b6aa8', - 'a728a36e-8bf1-4bb6-9a03-ef039a5233f0', - 'b070486d-e161-459b-aa2b-ef442d973b92', - '0d8b7de3-e8de-4d86-b9fd-dd2dce58a217', - '9f935cce-0a9f-435f-8007-817732bfc0a5', - 'f0b971a1-6831-4b9b-a50e-22a6e47f45ba', - 'cabb3bae-cccb-41bd-9f5d-0f3a9fecd825' - ] - - vs_code_list = [ - '0ed39f63-6049-43d4-ba4d-5fa2fe04a951', - '53ad5833-3455-407b-bbc6-45b4c79ab8fb', - 'eabc805a-bfcf-4460-b250-ac92135819f6', - '982d12a5-beab-424f-8d38-d2a48429e511', - '4e60007a-f5be-4bfc-9723-c39affa0a6d3', - 'e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2', - '9439a27b-18ae-42d8-9778-5f68f891805e', - 'ae506c68-352c-4094-9caa-ee9d42052317', - 'ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae', - 'c714dcee-cad3-4e12-8f3c-12bdcfcdb048', - '930fdb3b-11a8-46fe-9bac-577332e2640e', - '276cc624-87ea-4f08-ab93-f770e3790175', - '9d425400-e9b2-4424-9a4b-d4c7abac4140', - '5e2d93d8-8ad0-4435-b150-1692aacaa994', - '6ed0a554-cbee-4b44-84ea-fd6c042f4fe1', - 'ec71221e-ac43-46f9-89b8-ee7d80f7e1c5', - '70745df8-f2f5-42bd-8074-fbc10334fcc5', - '57242fad-77ca-454f-b71b-f187181a9f23', - 'c6bf789c-ba3a-4209-971d-b63abf0ab733', - '0512bb38-d531-4acf-9e7e-0add90816068', - '847a96b6-df94-4927-97e6-8cc9ea66ced7', - '7aeae0e2-70ee-4705-821d-1bba5d5b2ddd', - 'dcbe20e8-647f-4f1d-8696-f1c5bbb570e3', - '7c4cc09e-7a92-40dd-8338-b2286535c4ed', - '971cbb5b-3cbf-4ff7-9e24-b5c84fcebfa6' - ] - - gimp_list = [ - '7a4deb26-d57d-4ea9-9a73-630f66a7b568', - '554785e9-4523-4e7a-b8e1-8016f565f56a', - '77b8ab4d-994f-43ac-8930-8ca087d7c4b4', - 'f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce', - 'd52d6308-ec58-42b7-a2c9-de80e4837b2b', - '2a729ded-3296-423d-aec4-7dd55ed5fbb3', - 'b148e375-fe0b-4bec-90e7-38632b0d73c2', - 'a746add2-cab0-4740-ac36-c3769d9bfb46', - '7b7617bd-57cc-468e-9c91-40c4ec2bcb3d', - 'd16c99dc-2a1e-46f2-b350-d97c86c85c15', - '06ca5602-62ca-47f6-ad4f-da151cde54cc', - 'e2dd0213-26db-4349-abe5-d5667bfd725c', - 'f723c744-e62c-4ae6-98d1-750d3cd7d79d', - '72f83cdc-bf76-4531-9a1b-eb893a13f8aa', - '7767eef2-56a3-4cea-8c9f-48c070c7d65b', - '734d6579-c07d-47a8-9ae2-13339795476b', - 'e19bd559-633b-4b02-940f-d946248f088e', - '38f48d40-764e-4e77-a7cf-51dfce880291', - 'fbb548ca-c2a6-4601-9204-e39a2efc507b', - '5ca86c6f-f317-49d8-b6a7-b527541caae8', - '62f7fd55-0687-4a43-b6e1-3eda16fc6252', - '8ea73f6f-9689-42ad-8c60-195bbf06a7ba', - '58d3eeeb-e9d0-499f-962e-fd0db2a744d8', - '2e6f678f-472d-4c55-99cc-8e7c5c402a71', - '045bf3ff-9077-4b86-b483-a1040a949cff', - 'dbbf4b99-2253-4b10-9274-45f246af2466' - ] - - thunderbird_list = [ - 'bb5e4c0d-f964-439c-97b6-bdb9747de3f4', - '7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3', - '12086550-11c0-466b-b367-1d9e75b3910e', - '06fe7178-4491-4589-810f-2e2bc9502122', - '6766f2b8-8a72-417f-a9e5-56fcaa735837', - 'e1e75309-3ddb-4d09-92ec-de869c928143', - '3d1682a7-0fb0-49ae-a4dc-a73afd2d06d5', - '35253b65-1c19-4304-8aa4-6884b8218fc0', - 'd088f539-cab4-4f9a-ac92-9999fc3a656e', - '2ad9387a-65d8-4e33-ad5b-7580065a27ca', - '480bcfea-d68f-4aaa-a0a9-2589ef319381', - '030eeff7-b492-4218-b312-701ec99ee0cc', - '94760984-3ff5-41ee-8347-cf1af709fea0', - '99146c54-4f37-4ab8-9327-5f3291665e1e', - 'c9e7eaf2-b1a1-4efc-a982-721972fa9f02'] - - multiple_list = [ - '2b9493d7-49b8-493a-a71b-56cd1f4d6908', - '2c9fc0de-3ee7-45e1-a5df-c86206ad78b5', - '2fe4b718-3bd7-46ec-bdce-b184f5653624', - '3680a5ee-6870-426a-a997-eba929a0d25c', - '46407397-a7d5-4c6b-92c6-dbe038b1457b', - '4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc', - '510f64c8-9bcc-4be1-8d30-638705850618', - '51f5801c-18b3-4f25-b0c3-02f85507a078', - '58565672-7bfe-48ab-b828-db349231de6b', - '78aed49a-a710-4321-a793-b611a7c5b56b', - '897e3b53-5d4d-444b-85cb-2cdc8a97d903', - '937087b6-f668-4ba6-9110-60682ee33441', - 'a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb', - 'b52b40a5-ad70-4c53-b5b0-5650a8387052', - 'c867c42d-a52d-4a24-8ae3-f75d256b5618', - 'd9b7c649-c975-4f53-88f5-940b29c47247', - 'e135df7c-7687-4ac0-a5f0-76b74438b53e', - 'ee9a3c83-f437-4879-8918-be5efbb9fac7', - 'f7dfbef3-7697-431c-883a-db8583a4e4f9', - 'f8cfa149-d1c1-4215-8dac-4a0932bad3c2', - '6d72aad6-187a-4392-a4c4-ed87269c51cf', - 'f918266a-b3e0-4914-865d-4faa564f1aef', - 'da52d699-e8d2-4dc5-9191-a2199e0b6a9b', - 'bc2b57f3-686d-4ec9-87ce-edf850b7e442', - '74d5859f-ed66-4d3e-aa0e-93d7a592ce41', - 'b5062e3e-641c-4e3a-907b-ac864d2e7652', - '00fa164e-2612-4439-992e-157d019a8436', - 'acb0f96b-e27c-44d8-b55f-7cb76609dfcd', - '69acbb55-d945-4927-a87b-8480e1a5bb7e', - '48d05431-6cd5-4e76-82eb-12b60d823f7d', - '68a25bd4-59c7-4f4d-975e-da0c8509c848', - 'eb303e01-261e-4972-8c07-c9b4e7a4922a', - '0c825995-5b70-4526-b663-113f4c999dd2', - 'c7c1e4c3-9e92-4eba-a4b8-689953975ea4', - 'd1acdb87-bb67-4f30-84aa-990e56a09c92', - 'deec51c9-3b1e-4b9e-993c-4776f20e8bb2', - '8e116af7-7db7-4e35-a68b-b0939c066c78', - '337d318b-aa07-4f4f-b763-89d9a2dd013f', - '82e3c869-49f6-4305-a7ce-f3e64a0618e7', - '185f29bd-5da0-40a6-b69c-ba7f4e0324ef', - '869de13e-bef9-4b91-ba51-f6708c40b096', - '2c1ebcd7-9c6d-4c9a-afad-900e381ecd5e', - '3a93cae4-ad3e-403e-8c12-65303b271818', - '1f18aa87-af6f-41ef-9853-cdb8f32ebdea', - '26150609-0da3-4a7d-8868-0faf9c5f01bb', - '9219480b-3aed-47fc-8bac-d2cffc5849f7', - '881deb30-9549-4583-a841-8270c65f2a17', - '7e287123-70ca-47b9-8521-47db09b69b14', - 'e2392362-125e-4f76-a2ee-524b183a3412', - '5bc63fb9-276a-4439-a7c1-9dc76401737f', - '26660ad1-6ebb-4f59-8cba-a8432dfe8d38', - 'a82b78bb-7fde-4cb3-94a4-035baf10bcf0', - '36037439-2044-4b50-b9d1-875b5a332143', - '716a6079-22da-47f1-ba73-c9d58f986a38', - '873cafdd-a581-47f6-8b33-b9696ddb7b05', - 'a74b607e-6bb5-4ea8-8a7c-5d97c7bbcd2a', - '6f4073b8-d8ea-4ade-8a18-c5d1d5d5aa9a', - 'da922383-bfa4-4cd3-bbad-6bebab3d7742', - '2373b66a-092d-44cb-bfd7-82e86e7a3b4d', - '81c425f5-78f3-4771-afd6-3d2973825947', - 'bb83cab4-e5c7-42c7-a67b-e46068032b86', - '227d2f97-562b-4ccb-ae47-a5ec9e142fbb', - 'b337d106-053f-4d37-8da0-7f9c4043a66b', - '20236825-b5df-46e7-89bf-62e1d640a897', - '8df7e444-8e06-4f93-8a1a-c5c974269d82', - 'aad10cd7-9337-4b62-b704-a857848cedf2', - '02ce9a50-7af2-47ed-8596-af0c230501f8', - '4c26e3f3-3a14-4d86-b44a-d3cedebbb487', - 'a503b07f-9119-456b-b75d-f5146737d24f', - '09a37c51-e625-49f4-a514-20a773797a8a', - '3e3fc409-bff3-4905-bf16-c968eee3f807', - 'f5c13cdd-205c-4719-a562-348ae5cd1d91', - '5990457f-2adb-467b-a4af-5c857c92d762', - '415ef462-bed3-493a-ac36-ca8c6d23bf1b', - '7ff48d5b-2df2-49da-b500-a5150ffc7f18', - '9f3bb592-209d-43bc-bb47-d77d9df56504', - 'dd60633f-2c72-42ba-8547-6f2c8cb0fdb0', - 'ce2b64a2-ddc1-4f91-8c7d-a88be7121aac', - '3f05f3b9-29ba-4b6b-95aa-2204697ffc06', - 'e1fc0df3-c8b9-4ee7-864c-d0b590d3aa56', - 'f8369178-fafe-40c2-adc4-b9b08a125456', - '778efd0a-153f-4842-9214-f05fc176b877', - '47f7c0ce-a5fb-4100-a5e6-65cd0e7429e5', - 'c2751594-0cd5-4088-be1b-b5f2f9ec97c4', - '788b3701-3ec9-4b67-b679-418bfa726c22', - '48c46dc7-fe04-4505-ade7-723cba1aa6f6', - '42d25c08-fb87-4927-8b65-93631280a26f', - 'bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108', - 'e8172110-ec08-421b-a6f5-842e6451911f', - '42f4d1c7-4521-4161-b646-0a8934e36081', - '3c8f201a-009d-4bbe-8b65-a6f8b35bb57f', - 'd68204bf-11c1-4b13-b48b-d303c73d4bf6', - '91190194-f406-4cd6-b3f9-c43fac942b22', - '7f35355e-02a6-45b5-b140-f0be698bcf85', - '98e8e339-5f91-4ed2-b2b2-12647cb134f4', - '0e5303d4-8820-42f6-b18d-daf7e633de21', - 'df67aebb-fb3a-44fd-b75b-51b6012df509', - '5df7b33a-9f77-4101-823e-02f863e1c1ae', - 'aceb0368-56b8-4073-b70e-3dc9aee184e0', - '22a4636f-8179-4357-8e87-d1743ece1f81', - '236833a3-5704-47fc-888c-4f298f09f799', - '67890eb6-6ce5-4c00-9e3d-fb4972699b06', - ] - - - # for example_id in calc_list: - # try: - # with eventlet.Timeout(600, False): - # main("libreoffice_calc", example_id, gpt4_model="gemini-pro-vision") - # except Exception as e: - # logger.error("An error occurred while running the example: %s", e) - # continue - - # for example_id in vs_code_list: - # main("vs_code", example_id, gpt4_model="gemini-pro-vision") - # - # for example_id in gimp_list: - # main("gimp", example_id, gpt4_model="gemini-pro-vision") - - # todo: specify the class of the example automatically by the example info, rather than hardcoding it - for example_id in chrome_list: - main("chrome", example_id, "gemini-pro-vision") - - for example_id in chrome_list: - main("chrome", example_id) + for domain in test_all_meta: + for example_id in test_all_meta[domain]: + main(domain, example_id, args.model)