run webrl chat ok

This commit is contained in:
yuyr 2025-04-23 17:01:18 +08:00
parent 698e9147ea
commit 873b6c9462
209 changed files with 108828 additions and 17 deletions

8
.gitignore vendored
View File

@ -26,3 +26,11 @@ src/server/tasks/minecraft/vab_minecraft_src/jarvis/stark_tech/MCP-Reborn
src/server/tasks/minecraft/vab_minecraft_src/jarvis/steveI/weights src/server/tasks/minecraft/vab_minecraft_src/jarvis/steveI/weights
src/server/tasks/minecraft/vab_minecraft_src/jarvis/global_configs/envs/* src/server/tasks/minecraft/vab_minecraft_src/jarvis/global_configs/envs/*
!src/server/tasks/minecraft/vab_minecraft_src/jarvis/global_configs/envs/jarvis.yaml !src/server/tasks/minecraft/vab_minecraft_src/jarvis/global_configs/envs/jarvis.yaml
VAB-WebArena-Lite/config_files/wa/test_webarena
VAB-WebArena-Lite/config_files/wa/test_webarena_lite
VAB-WebArena-Lite/log_files
VAB-WebArena-Lite/results

View File

@ -0,0 +1 @@
{"cookies": [{"name": "_cookie_check", "value": "1", "domain": "localhost", "path": "/login", "expires": -1, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "REMEMBERME", "value": "QXBwXEVudGl0eVxVc2VyOlRXRnlkbVZzYzBkeVlXNTBUV0Z1TVRNMjoxNzc2OTA5MzI5OlltVTFZalF5Tldaa05qaG1Oamt6WWpJMllXRTVOVEE0TW1GaFl6ZGpaV0V4WVdFMk1HSmlNamsyTjJVd01qVXdaakkwTVRSaVlURTNNRGhpTVROa1lnPT0%3D", "domain": "localhost", "path": "/", "expires": 1776909329.082441, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "PHPSESSID", "value": "c002205f152b213deaedc0800c372f3f", "domain": "localhost", "path": "/", "expires": -1, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "known_sign_in", "value": "UlVrNWNuN1g3cDRWdjNiUmJZUVl3OGNZNmJGQmlhYll6WFN0ZDdXNFpaZnBWNk1VLzBFKyt0eDJWQ2F1VkhkUndNR1hjeHIybnNzYlY5RC8wbFVWejJrclpUSUZZR2g1T0ZJQTRVVlZMUGVjKzdiMmhWVk1yR2czb1pzYXAzQWstLW8vYnlFdTlMYmZHUXBTZENLRDVBNlE9PQ%3D%3D--4b1888d4ebbdb533cf162356c2630ea53dd95286", "domain": "localhost", "path": "/", "expires": 1746582930.098554, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "_gitlab_session", "value": "037c2d46106a9cea55da705dce5b5708", "domain": "localhost", "path": "/", "expires": -1, "httpOnly": true, "secure": false, "sameSite": "Lax"}], "origins": []}

View File

@ -0,0 +1 @@
{"cookies": [{"name": "admin", "value": "1087c582d275f78834da637c08d2dd0c", "domain": "localhost", "path": "/admin", "expires": 1745733329.394644, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "known_sign_in", "value": "UW1ZRyt2cTd2VmltKzMzWGVDOWovVi9HUkwyU2lYQ3BteUdGNUFyNHpNNHpMVldsWEdTZnJ6TDM5ekpxaDFpSTNGTXAvWTBDOGtWRUp6YWlWbE5UdEJXZzM5a0FvUGQ2V0c5dHJib1J6b0VrNWhVQTgvOTZxTnVtd3NHMjBWQTktLXJub3VnR0RISHFkV1Y2QmtzRlg3Nnc9PQ%3D%3D--20fe4b8ba28a2bb065bc6691d0cc2733e25633c4", "domain": "localhost", "path": "/", "expires": 1746582930.401271, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "_gitlab_session", "value": "f98c1667e8eacdb4484308adbc9212fd", "domain": "localhost", "path": "/", "expires": -1, "httpOnly": true, "secure": false, "sameSite": "Lax"}], "origins": []}

View File

@ -0,0 +1 @@
{"cookies": [{"name": "mage-cache-storage", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "mage-cache-storage-section-invalidation", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "mage-messages", "value": "", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Strict"}, {"name": "recently_viewed_product", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "recently_viewed_product_previous", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "recently_compared_product", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "recently_compared_product_previous", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "product_data_storage", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "section_data_ids", "value": "{%22messages%22:null%2C%22customer%22:null%2C%22compare-products%22:null%2C%22last-ordered-items%22:null%2C%22cart%22:null%2C%22directory-data%22:null%2C%22captcha%22:null%2C%22instant-purchase%22:null%2C%22loggedAsCustomer%22:null%2C%22persistent%22:null%2C%22review%22:null%2C%22wishlist%22:null%2C%22recently_viewed_product%22:null%2C%22recently_compared_product%22:null%2C%22product_data_storage%22:null%2C%22paypal-billing-agreement%22:null}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": 
"private_content_version", "value": "5a1361e59223072bebd2793394207496", "domain": "localhost", "path": "/", "expires": 1779933329.571442, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "PHPSESSID", "value": "177a7b68734a2083018b3fb119634917", "domain": "localhost", "path": "/", "expires": 1776909330.114376, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "X-Magento-Vary", "value": "9bf9a599123e6402b85cde67144717a08b817412", "domain": "localhost", "path": "/", "expires": 1776909330.114451, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "known_sign_in", "value": "VjEvQms0R1Y0WFJwNzhWcXJmWUQzTlM5aEZhS1FLd3pIaGcrRHZoUUMxb2RtK2gzWFVMK1N1YjA0QTJiV1NPL1NwQWYrc3dpRlU3U3Q1Q3p4T0RNbWd4UVl2Vk5pR2dSbVBFTUhrQUhQMnBJOXlxaGVsdG5ycXFkVDQ2dDBBejYtLWNYZ3pJREcxbkVXWjVsZE9LMnIvZkE9PQ%3D%3D--860cb1881ab774ae0647d031b6a58789ca2dc43f", "domain": "localhost", "path": "/", "expires": 1746582931.071908, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "_gitlab_session", "value": "23ff9045fd7ed259aa91a5056469be23", "domain": "localhost", "path": "/", "expires": -1, "httpOnly": true, "secure": false, "sameSite": "Lax"}], "origins": [{"origin": "http://localhost:28082", "localStorage": [{"name": "mage-cache-storage", "value": "{}"}, {"name": "product_data_storage", "value": "{}"}, {"name": "mage-cache-storage-section-invalidation", "value": "{\"messages\":true,\"customer\":true,\"compare-products\":true,\"last-ordered-items\":true,\"cart\":true,\"directory-data\":true,\"captcha\":true,\"instant-purchase\":true,\"loggedAsCustomer\":true,\"persistent\":true,\"review\":true,\"wishlist\":true,\"recently_viewed_product\":true,\"recently_compared_product\":true,\"product_data_storage\":true,\"paypal-billing-agreement\":true}"}, {"name": "mage-cache-timeout", "value": "\"2026-04-23T01:55:28.860Z\""}, {"name": "recently_compared_product_previous", "value": "{}"}, {"name": "recently_viewed_product", "value": "{}"}, {"name": 
"recently_compared_product", "value": "{}"}, {"name": "recently_viewed_product_previous", "value": "{}"}]}]}

View File

@ -0,0 +1 @@
{"cookies": [{"name": "known_sign_in", "value": "WHpKbmQyNkhlWWhHcGljQmZmVGE2aFl0WFh3VWxGMTRRYjBsa3dNbVJZLzIrYWM5RUpwa0IxYU85Q291WTE5cE9SWVcvdC9xTjNBTFp2ZHdCTGxaaEx6RXpDVm0zS1BCVldyODdMYlZuTWRHNFF0K0JQZXVqcGhPWWVkMWZWMmktLVNCc3F6VlFFQjBFSGN0TnlKUnd2MEE9PQ%3D%3D--752f2d15a69617f851d3140acb5500b2564af08f", "domain": "localhost", "path": "/", "expires": 1746582928.781901, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "_gitlab_session", "value": "85d65f4d47be2142eee2a21e8bbcec79", "domain": "localhost", "path": "/", "expires": -1, "httpOnly": true, "secure": false, "sameSite": "Lax"}], "origins": []}

View File

@ -0,0 +1 @@
{"cookies": [{"name": "_cookie_check", "value": "1", "domain": "localhost", "path": "/login", "expires": -1, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "REMEMBERME", "value": "QXBwXEVudGl0eVxVc2VyOlRXRnlkbVZzYzBkeVlXNTBUV0Z1TVRNMjoxNzc2OTA5MzI4OlpUQTRNV1F3WldNMU1HSmhOalUwTTJaaE1qWmxOVGs1TldNMlpUVTFNVEUyT1RRNFpqTXdaR1F3TVRRelpUTXhPVEl3TjJSa09USm1PREF4Wm1WaE5nPT0%3D", "domain": "localhost", "path": "/", "expires": 1776909328.956723, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "PHPSESSID", "value": "4a07cf057d5020f791451af6c2bc29e0", "domain": "localhost", "path": "/", "expires": -1, "httpOnly": true, "secure": false, "sameSite": "Lax"}], "origins": []}

View File

@ -0,0 +1 @@
{"cookies": [{"name": "admin", "value": "732e6b7c1a7ab1de0d1a4ab105b549b6", "domain": "localhost", "path": "/admin", "expires": 1745733331.781848, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "mage-cache-storage", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "mage-cache-storage-section-invalidation", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "mage-messages", "value": "", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Strict"}, {"name": "recently_viewed_product", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "recently_viewed_product_previous", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "recently_compared_product", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "recently_compared_product_previous", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "product_data_storage", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "section_data_ids", "value": 
"{%22messages%22:null%2C%22customer%22:null%2C%22compare-products%22:null%2C%22last-ordered-items%22:null%2C%22cart%22:null%2C%22directory-data%22:null%2C%22captcha%22:null%2C%22instant-purchase%22:null%2C%22loggedAsCustomer%22:null%2C%22persistent%22:null%2C%22review%22:null%2C%22wishlist%22:null%2C%22recently_viewed_product%22:null%2C%22recently_compared_product%22:null%2C%22product_data_storage%22:null%2C%22paypal-billing-agreement%22:null}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "private_content_version", "value": "c694b83ea1426b503effa8c412f7ae6d", "domain": "localhost", "path": "/", "expires": 1779933329.584947, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "PHPSESSID", "value": "ac8e0a869234bb4b31a917ff6a016ce9", "domain": "localhost", "path": "/", "expires": 1776909330.149523, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "X-Magento-Vary", "value": "9bf9a599123e6402b85cde67144717a08b817412", "domain": "localhost", "path": "/", "expires": 1776909330.149613, "httpOnly": true, "secure": false, "sameSite": "Lax"}], "origins": [{"origin": "http://localhost:28082", "localStorage": [{"name": "mage-cache-storage", "value": "{}"}, {"name": "product_data_storage", "value": "{}"}, {"name": "mage-cache-storage-section-invalidation", "value": "{\"messages\":true,\"customer\":true,\"compare-products\":true,\"last-ordered-items\":true,\"cart\":true,\"directory-data\":true,\"captcha\":true,\"instant-purchase\":true,\"loggedAsCustomer\":true,\"persistent\":true,\"review\":true,\"wishlist\":true,\"recently_viewed_product\":true,\"recently_compared_product\":true,\"product_data_storage\":true,\"paypal-billing-agreement\":true}"}, {"name": "mage-cache-timeout", "value": "\"2026-04-23T01:55:28.879Z\""}, {"name": "recently_compared_product_previous", "value": "{}"}, {"name": "recently_viewed_product", "value": "{}"}, {"name": "recently_compared_product", 
"value": "{}"}, {"name": "recently_viewed_product_previous", "value": "{}"}]}]}

View File

@ -0,0 +1 @@
{"cookies": [{"name": "admin", "value": "753cf7bedc7f97223f77af0369ec4ccf", "domain": "localhost", "path": "/admin", "expires": 1745733329.396607, "httpOnly": true, "secure": false, "sameSite": "Lax"}], "origins": []}

View File

@ -0,0 +1 @@
{"cookies": [{"name": "mage-cache-storage", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "mage-cache-storage-section-invalidation", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "mage-messages", "value": "", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Strict"}, {"name": "recently_viewed_product", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "recently_viewed_product_previous", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "recently_compared_product", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "recently_compared_product_previous", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "product_data_storage", "value": "{}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "section_data_ids", "value": "{%22messages%22:null%2C%22customer%22:null%2C%22compare-products%22:null%2C%22last-ordered-items%22:null%2C%22cart%22:null%2C%22directory-data%22:null%2C%22captcha%22:null%2C%22instant-purchase%22:null%2C%22loggedAsCustomer%22:null%2C%22persistent%22:null%2C%22review%22:null%2C%22wishlist%22:null%2C%22recently_viewed_product%22:null%2C%22recently_compared_product%22:null%2C%22product_data_storage%22:null%2C%22paypal-billing-agreement%22:null}", "domain": "localhost", "path": "/", "expires": 1776909328, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": 
"private_content_version", "value": "db7c8357df43bdda4423b1d0be6cd1c0", "domain": "localhost", "path": "/", "expires": 1779933329.561383, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "PHPSESSID", "value": "9d34565b19e00a4310aba625db3a7fd7", "domain": "localhost", "path": "/", "expires": 1776909330.076342, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "X-Magento-Vary", "value": "9bf9a599123e6402b85cde67144717a08b817412", "domain": "localhost", "path": "/", "expires": 1776909330.076442, "httpOnly": true, "secure": false, "sameSite": "Lax"}], "origins": [{"origin": "http://localhost:28082", "localStorage": [{"name": "mage-cache-storage", "value": "{}"}, {"name": "product_data_storage", "value": "{}"}, {"name": "mage-cache-storage-section-invalidation", "value": "{\"messages\":true,\"customer\":true,\"compare-products\":true,\"last-ordered-items\":true,\"cart\":true,\"directory-data\":true,\"captcha\":true,\"instant-purchase\":true,\"loggedAsCustomer\":true,\"persistent\":true,\"review\":true,\"wishlist\":true,\"recently_viewed_product\":true,\"recently_compared_product\":true,\"product_data_storage\":true,\"paypal-billing-agreement\":true}"}, {"name": "mage-cache-timeout", "value": "\"2026-04-23T01:55:28.896Z\""}, {"name": "recently_compared_product_previous", "value": "{}"}, {"name": "recently_viewed_product", "value": "{}"}, {"name": "recently_compared_product", "value": "{}"}, {"name": "recently_viewed_product_previous", "value": "{}"}]}]}

View File

@ -0,0 +1,6 @@
@article{koh2024visualwebarena,
title={VisualWebArena: Evaluating Multimodal Agents on Realistic Visual Web Tasks},
author={Koh, Jing Yu and Lo, Robert and Jang, Lawrence and Duvvur, Vikram and Lim, Ming Chong and Huang, Po-Yu and Neubig, Graham and Zhou, Shuyan and Salakhutdinov, Ruslan and Fried, Daniel},
journal={arXiv preprint arXiv:2401.13649},
year={2024}
}

20
VAB-WebArena-Lite/LICENSE Normal file
View File

@ -0,0 +1,20 @@
Copyright (c) 2024 Jing Yu Koh, Robert Lo, Lawrence Jang, Vikram Duvvur, Ming Chong Lim, and Po-Yu Huang
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@ -0,0 +1,8 @@
from .agent import (
Agent,
PromptAgent,
TeacherForcingAgent,
construct_agent,
)
__all__ = ["Agent", "TeacherForcingAgent", "PromptAgent", "construct_agent"]

View File

@ -0,0 +1 @@
from .prompt_constructor import *

View File

@ -0,0 +1,27 @@
{
"intro": "You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe current web page's URL: This is the page you're currently navigating.\nThe open tabs: These are the tabs you have open.\nThe previous action: This is the action you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"Enter\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [direction=down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"N/A\" in the bracket.\n\nHomepage:\nIf you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit.\nhttp://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. Generate the action in the correct format. Start with a \"In summary, the next action I will perform is\" phrase, followed by action inside ``````. For example, \"In summary, the next action I will perform is ```click [1234]```\".\n5. Issue stop action when you think you have achieved the objective. Don't generate anything after stop.",
"examples": [
[
"OBSERVATION:\n[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)'\n\t\t[1749] StaticText '$279.49'\n\t\t[1757] button 'Add to Cart'\n\t\t[1760] button 'Add to Wish List'\n\t\t[1761] button 'Add to Compare'\nURL: http://onestopmarket.com/office-products/office-electronics.html\nOBJECTIVE: What is the price of HP Inkjet Fax Machine\nPREVIOUS ACTION: None",
"Let's think step-by-step. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I think I have achieved the objective. I will issue the stop action with the answer. In summary, the next action I will perform is ```stop [$279.49]```"
],
[
"OBSERVATION:\n[164] textbox 'Search' focused: True required: False\n[171] button 'Go'\n[174] link 'Find directions between two points'\n[212] heading 'Search Results'\n[216] button 'Close'\nURL: http://openstreetmap.org\nOBJECTIVE: Show me the restaurants near CMU\nPREVIOUS ACTION: None",
"Let's think step-by-step. This page has a search box whose ID is [164]. According to the nominatim rule of openstreetmap, I can search for the restaurants near a location by \"restaurants near\". I can submit my typing by pressing the Enter afterwards. In summary, the next action I will perform is ```type [164] [restaurants near CMU] [1]```"
]
],
"template": "OBSERVATION:\n{observation}\nURL: {url}\nOBJECTIVE: {objective}\nPREVIOUS ACTION: {previous_action}",
"meta_data": {
"observation": "accessibility_tree",
"action_type": "id_accessibility_tree",
"keywords": [
"url",
"objective",
"observation",
"previous_action"
],
"prompt_constructor": "CoTPromptConstructor",
"answer_phrase": "In summary, the next action I will perform is",
"action_splitter": "```"
}
}

View File

@ -0,0 +1,27 @@
{
"intro": "You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe current web page's URL: This is the page you're currently navigating.\nThe open tabs: These are the tabs you have open.\nThe previous action: This is the action you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"Enter\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [direction=down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket.\n\nHomepage:\nIf you want to visit other websites, check out the homepage at http://homepage.com. 
It has a list of websites you can visit.\nhttp://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. Generate the action in the correct format. Start with a \"In summary, the next action I will perform is\" phrase, followed by action inside ``````. For example, \"In summary, the next action I will perform is ```click [1234]```\".\n5. Issue stop action when you think you have achieved the objective. Don't generate anything after stop.",
"examples": [
[
"OBSERVATION:\n[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)'\n\t\t[1749] StaticText '$279.49'\n\t\t[1757] button 'Add to Cart'\n\t\t[1760] button 'Add to Wish List'\n\t\t[1761] button 'Add to Compare'\nURL: http://onestopmarket.com/office-products/office-electronics.html\nOBJECTIVE: What is the price of HP Inkjet Fax Machine\nPREVIOUS ACTION: None",
"Let's think step-by-step. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I think I have achieved the objective. I will issue the stop action with the answer. In summary, the next action I will perform is ```stop [$279.49]```"
],
[
"OBSERVATION:\n[164] textbox 'Search' focused: True required: False\n[171] button 'Go'\n[174] link 'Find directions between two points'\n[212] heading 'Search Results'\n[216] button 'Close'\nURL: http://openstreetmap.org\nOBJECTIVE: Show me the restaurants near CMU\nPREVIOUS ACTION: None",
"Let's think step-by-step. This page has a search box whose ID is [164]. According to the nominatim rule of openstreetmap, I can search for the restaurants near a location by \"restaurants near\". I can submit my typing by pressing the Enter afterwards. In summary, the next action I will perform is ```type [164] [restaurants near CMU] [1]```"
]
],
"template": "OBSERVATION:\n{observation}\nURL: {url}\nOBJECTIVE: {objective}\nPREVIOUS ACTION: {previous_action}",
"meta_data": {
"observation": "accessibility_tree",
"action_type": "id_accessibility_tree",
"keywords": [
"url",
"objective",
"observation",
"previous_action"
],
"prompt_constructor": "CoTPromptConstructor",
"answer_phrase": "In summary, the next action I will perform is",
"action_splitter": "```"
}
}

View File

@ -0,0 +1,31 @@
{
"intro": "You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe current web page's URL: This is the page you're currently navigating.\nThe open tabs: These are the tabs you have open.\nThe previous action: This is the action you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n```click [id]```: This action clicks on an element with a specific id on the webpage.\n```type [id] [content]```: Use this to type the content into the field with id. By default, the \"Enter\" key is pressed after typing unless press_enter_after is set to 0, i.e., ```type [id] [content] [0]```.\n```hover [id]```: Hover over an element with id.\n```press [key_comb]```: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n```scroll [down]``` or ```scroll [up]```: Scroll the page up or down.\n\nTab Management Actions:\n```new_tab```: Open a new, empty browser tab.\n```tab_focus [tab_index]```: Switch the browser's focus to a specific tab using its index.\n```close_tab```: Close the currently active tab.\n\nURL Navigation Actions:\n```goto [url]```: Navigate to a specific URL.\n```go_back```: Navigate to the previously viewed page.\n```go_forward```: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n```stop [answer]```: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket.\n\nHomepage:\nIf you want to visit other websites, check out the homepage at http://homepage.com. 
It has a list of websites you can visit.\nhttp://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. Generate the action in the correct format. Start with a \"In summary, the next action I will perform is\" phrase, followed by action inside ``````. For example, \"In summary, the next action I will perform is ```click [1234]```\".\n5. Issue stop action when you think you have achieved the objective. Don't generate anything after stop.",
"examples": [
[
"OBSERVATION:\n[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)'\n[1749] StaticText '$279.49'\n[1757] button 'Add to Cart'\n[1760] button 'Add to Wish List'\n[1761] button 'Add to Compare'\nURL: http://onestopmarket.com/office-products/office-electronics.html\nOBJECTIVE: What is the price of HP Inkjet Fax Machine?\nPREVIOUS ACTION: None",
"Let's think step-by-step. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I think I have achieved the objective. I will issue the stop action with the answer. In summary, the next action I will perform is ```stop [$279.49]```"
],
[
"OBSERVATION:\n[204] heading '/f/food'\n[593] heading '[homemade] Obligatory Halloween Pumpkin Loaf!'\n\t[942] link '[homemade] Obligatory Halloween Pumpkin Loaf!'\n[945] StaticText 'Submitted by '\n[30] link 'kneechalice' expanded: False\n[1484] StaticText 't3_yid9lu'\n[949] time 'October 31, 2022 at 10:10:03 AM EDT'\n\t[1488] StaticText '1 year ago'\n[1489] link '45 comments'\n[605] heading '[I ate] Maple Pecan Croissant'\n\t[963] link '[I ate] Maple Pecan Croissant'\n[966] StaticText 'Submitted by '\n[37] link 'AccordingtoJP' expanded: False\n[1494] StaticText 't3_y3hrpn'\n[970] time 'October 13, 2022 at 10:41:09 PM EDT'\n\t[1498] StaticText '1 year ago'\n[1499] link '204 comments'\nURL: http://reddit.com\nOBJECTIVE: Tell me what the top comment on the croissant post says.\nPREVIOUS ACTION: None",
"Let's think step-by-step. This page has a post titled '[I ate] Maple Pecan Croissant', which is the post mentioned in the objective. In order to find the top comment, I will navigate into the comments section of the post. In summary, the next action I will perform is ```click [1499]```"
],
[
"OBSERVATION:\n[42] link 'My account'\n[43] link 'Logout'\n[44] link 'Publish Ad'\n[25] heading 'What are you looking for today?'\n[143] StaticText 'Keyword'\n[81] textbox 'e.g., a blue used car' required: False\n[146] StaticText 'Category'\n[28] heading 'Latest Listings'\n[86] link 'Atlas Powered Audio System w/ Tripod'\n\t[176] img 'Atlas Powered Audio System w/ Tripod'\n[511] StaticText '150.00 $'\n[88] link 'Neptune Gaming Console'\n\t[178] img 'Neptune Gaming Console'\n[515] StaticText '350.00 $'\nURL: http://classifieds.com\nOBJECTIVE: Help me find the cheapest dark colored guitar.\nPREVIOUS ACTION: None",
"Let's think step-by-step. The objective is to find the cheapest dark colored guitar on the site. The site has a search box whose ID is [81]. I can search for guitars by entering \"guitar\". I can submit this by pressing the Enter afterwards. In summary, the next action I will perform is ```type [81] [guitar] [1]```"
]
],
"template": "OBSERVATION:\n{observation}\nURL: {url}\nOBJECTIVE: {objective}\nPREVIOUS ACTION: {previous_action}",
"meta_data": {
"observation": "accessibility_tree",
"action_type": "id_accessibility_tree",
"keywords": [
"url",
"objective",
"observation",
"previous_action"
],
"prompt_constructor": "CoTPromptConstructor",
"answer_phrase": "In summary, the next action I will perform is",
"action_splitter": "```"
}
}

View File

@ -0,0 +1,34 @@
{
"intro": "You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe current web page's URL: This is the page you're currently navigating.\nThe open tabs: These are the tabs you have open.\nThe previous action: This is the action you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n```click [id]```: This action clicks on an element with a specific id on the webpage.\n```type [id] [content]```: Use this to type the content into the field with id. By default, the \"Enter\" key is pressed after typing unless press_enter_after is set to 0, i.e., ```type [id] [content] [0]```.\n```hover [id]```: Hover over an element with id.\n```press [key_comb]```: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n```scroll [down]``` or ```scroll [up]```: Scroll the page up or down.\n\nTab Management Actions:\n```new_tab```: Open a new, empty browser tab.\n```tab_focus [tab_index]```: Switch the browser's focus to a specific tab using its index.\n```close_tab```: Close the currently active tab.\n\nURL Navigation Actions:\n```goto [url]```: Navigate to a specific URL.\n```go_back```: Navigate to the previously viewed page.\n```go_forward```: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n```stop [answer]```: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket.\n\nHomepage:\nIf you want to visit other websites, check out the homepage at http://homepage.com. 
It has a list of websites you can visit.\nhttp://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. Generate the action in the correct format. Start with a \"In summary, the next action I will perform is\" phrase, followed by action inside ``````. For example, \"In summary, the next action I will perform is ```click [1234]```\".\n5. Issue stop action when you think you have achieved the objective. Don't generate anything after stop.",
"examples": [
[
"OBSERVATION:\n[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)'\n[1749] StaticText '$279.49'\n[1757] button 'Add to Cart'\n[1760] button 'Add to Wish List'\n[1761] button 'Add to Compare'\nURL: http://onestopmarket.com/office-products/office-electronics.html\nOBJECTIVE: What is the price of HP Inkjet Fax Machine?\nPREVIOUS ACTION: None",
"Let's think step-by-step. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I think I have achieved the objective. I will issue the stop action with the answer. In summary, the next action I will perform is ```stop [$279.49]```",
"agent/prompts/multimodal_examples/multimodal_example1.png"
],
[
"OBSERVATION:\n[204] heading '/f/food'\n[593] heading '[homemade] Obligatory Halloween Pumpkin Loaf!'\n\t[942] link '[homemade] Obligatory Halloween Pumpkin Loaf!'\n[945] StaticText 'Submitted by '\n[30] link 'kneechalice' expanded: False\n[1484] StaticText 't3_yid9lu'\n[949] time 'October 31, 2022 at 10:10:03 AM EDT'\n\t[1488] StaticText '1 year ago'\n[1489] link '45 comments'\n[605] heading '[I ate] Maple Pecan Croissant'\n\t[963] link '[I ate] Maple Pecan Croissant'\n[966] StaticText 'Submitted by '\n[37] link 'AccordingtoJP' expanded: False\n[1494] StaticText 't3_y3hrpn'\n[970] time 'October 13, 2022 at 10:41:09 PM EDT'\n\t[1498] StaticText '1 year ago'\n[1499] link '204 comments'\nURL: http://reddit.com\nOBJECTIVE: Tell me what the top comment on the croissant post says.\nPREVIOUS ACTION: None",
"Let's think step-by-step. This page has a post titled '[I ate] Maple Pecan Croissant', which is the post mentioned in the objective. In order to find the top comment, I will navigate into the comments section of the post. In summary, the next action I will perform is ```click [1499]```",
"agent/prompts/multimodal_examples/multimodal_example2.png"
],
[
"OBSERVATION:\n[42] link 'My account'\n[43] link 'Logout'\n[44] link 'Publish Ad'\n[25] heading 'What are you looking for today?'\n[143] StaticText 'Keyword'\n[81] textbox 'e.g., a blue used car' required: False\n[146] StaticText 'Category'\n[28] heading 'Latest Listings'\n[86] link 'Atlas Powered Audio System w/ Tripod'\n\t[176] img 'Atlas Powered Audio System w/ Tripod'\n[511] StaticText '150.00 $'\n[88] link 'Neptune Gaming Console'\n\t[178] img 'Neptune Gaming Console'\n[515] StaticText '350.00 $'\nURL: http://classifieds.com\nOBJECTIVE: Help me find the cheapest dark colored guitar.\nPREVIOUS ACTION: None",
"Let's think step-by-step. The objective is to find the cheapest dark colored guitar on the site. The site has a search box whose ID is [81]. I can search for guitars by entering \"guitar\". I can submit this by pressing the Enter afterwards. In summary, the next action I will perform is ```type [81] [guitar] [1]```",
"agent/prompts/multimodal_examples/multimodal_example3.png"
]
],
"template": "OBSERVATION:\n{observation}\nURL: {url}\nOBJECTIVE: {objective}\nPREVIOUS ACTION: {previous_action}",
"meta_data": {
"observation": "accessibility_tree",
"action_type": "id_accessibility_tree",
"keywords": [
"url",
"objective",
"observation",
"previous_action"
],
"prompt_constructor": "MultimodalCoTPromptConstructor",
"answer_phrase": "In summary, the next action I will perform is",
"action_splitter": "```"
}
}

View File

@ -0,0 +1,34 @@
{
"intro": "You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page screenshot: This is a screenshot of the webpage, with each interactable element assigned a unique numerical id. Each bounding box and its respective id shares the same color.\nThe observation, which lists the IDs of all interactable elements on the current web page with their text content if any, in the format [id] [tagType] [text content]. tagType is the type of the element, such as button, link, or textbox. text content is the text content of the element. For example, [1234] [button] ['Add to Cart'] means that there is a button with id 1234 and text content 'Add to Cart' on the current web page. [] [StaticText] [text] means that the element is of some text that is not interactable.\nThe current web page's URL: This is the page you're currently navigating.\nThe open tabs: These are the tabs you have open.\nThe previous action: This is the action you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n```click [id]```: This action clicks on an element with a specific id on the webpage.\n```type [id] [content]```: Use this to type the content into the field with id. 
By default, the \"Enter\" key is pressed after typing unless press_enter_after is set to 0, i.e., ```type [id] [content] [0]```.\n```hover [id]```: Hover over an element with id.\n```press [key_comb]```: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n```scroll [down]``` or ```scroll [up]```: Scroll the page up or down.\n\nTab Management Actions:\n```new_tab```: Open a new, empty browser tab.\n```tab_focus [tab_index]```: Switch the browser's focus to a specific tab using its index.\n```close_tab```: Close the currently active tab.\n\nURL Navigation Actions:\n```goto [url]```: Navigate to a specific URL.\n```go_back```: Navigate to the previously viewed page.\n```go_forward```: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n```stop [answer]```: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket.\n\nHomepage:\nIf you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit.\nhttp://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. Generate the action in the correct format. Start with a \"In summary, the next action I will perform is\" phrase, followed by action inside ``````. For example, \"In summary, the next action I will perform is ```click [1234]```\".\n5. Issue stop action when you think you have achieved the objective. Don't generate anything after stop.",
"examples": [
[
"OBSERVATION:\n[31] [IMG] [Image, description: hp fx-7010dn fax machine, url: http://ec2-3-13-232-171.us-east-2.compute.amazonaws.com:7770/media/catalog/product/cache/89ff578b9cd87e0600daac45c9e1ea98/B/0/B08GKZ3ZKD.0.jpg]\n[32] [A] [HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)]\n[] [StaticText] [$279.49]\n[33] [BUTTON] [Add to Cart]\n[34] [A] [Add to Wish List]\n[35] [A] [Add to Compare]\nURL: http://onestopmarket.com/office-products/office-electronics.html\nOBJECTIVE: What is the price of HP Inkjet Fax Machine?\nPREVIOUS ACTION: None",
"Let's think step-by-step. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I think I have achieved the objective. I will issue the stop action with the answer. In summary, the next action I will perform is ```stop [$279.49]```",
"agent/prompts/som_examples/som_example1.png"
],
[
"OBSERVATION:\n[] [StaticText] [/f/food]\n[] [StaticText] [[homemade] Obligatory Halloween Pumpkin Loaf!\tSubmitted by\tkneechalice\tt3_yid9lu\t1 year ago]\n[9] [IMG] []\n[] [StaticText] [Submitted by\tkneechalice\tt3_yid9lu\t1 year ago]\n[10] [A] [kneechalice]\n[11] [A] [45 comments]\n[] [StaticText] [[I ate] Maple Pecan Croissant\tSubmitted by\tAccordingtoJP\tt3_y3hrpn\t1 year ago]\n[14] [IMG] []\n[] [StaticText] [Submitted by\tAccordingtoJP\tt3_y3hrpn\t1 year ago]\n[15] [A] [AccordingtoJP]\n[16] [A] [204 comments]\nURL: http://reddit.com\nOBJECTIVE: Tell me what the top comment on the croissant post says.\nPREVIOUS ACTION: None",
"Let's think step-by-step. This page has a post titled '[I ate] Maple Pecan Croissant', which is the post mentioned in the objective. In order to find the top comment, I will navigate into the comments section of the post. In summary, the next action I will perform is ```click [16]```",
"agent/prompts/som_examples/som_example2.png"
],
[
"OBSERVATION:\n[] [StaticText] [What are you looking for today?]\n[5] [INPUT] []\n[6] [SELECT] [Select a category]\n[7] [BUTTON] [Search]\n[] [StaticText] [Latest Listings]\n[] [StaticText] [Atlas Powered Audio System w/ Tripod\t150.00 $\tMusic instruments\tBorough of Red Lion (Pennsylvania)\t2023/11/16]\n[8] [IMG] [Atlas Powered Audio System w/ Tripod]\n[9] [A] [Atlas Powered Audio System w/ Tripod]\n[] [StaticText] [150.00 $]\n[] [StaticText] [Neptune Gaming Console\t350.00 $\tVideo gaming\tPennwyn (Pennsylvania)\t2023/11/16]\n[10] [IMG] [Neptune Gaming Console]\n[11] [A] [Neptune Gaming Console]\n[] [StaticText] [350.00 $]\nURL: http://classifieds.com\nOBJECTIVE: Help me find the cheapest dark colored guitar.\nPREVIOUS ACTION: None",
"Let's think step-by-step. The objective is to find the cheapest dark colored guitar on the site. The site has a search box whose ID is [5]. I can search for guitars by entering \"guitar\". I can submit this by pressing the Enter afterwards. In summary, the next action I will perform is ```type [5] [guitar] [1]```",
"agent/prompts/som_examples/som_example3.png"
]
],
"template": "OBSERVATION: {observation}\nURL: {url}\nOBJECTIVE: {objective}\nPREVIOUS ACTION: {previous_action}",
"meta_data": {
"observation": "image_som",
"action_type": "som",
"keywords": [
"url",
"objective",
"observation",
"previous_action"
],
"prompt_constructor": "MultimodalCoTPromptConstructor",
"answer_phrase": "In summary, the next action I will perform is",
"action_splitter": "```"
}
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 253 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 210 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 312 KiB

View File

@ -0,0 +1,82 @@
# Chain-of-thought prompt definition for an agent that observes an
# accessibility tree and addresses page elements by id. It carries two
# few-shot examples and instructs the model to answer "N/A" when it believes
# a task cannot be completed. The module's public name is `prompt`, a dict
# with the keys "intro", "examples", "template" and "meta_data".

# Instruction text describing the inputs, the action vocabulary and the
# output rules.
# NOTE(review): the action list uses single backticks, while rule 4, the
# examples and meta_data["action_splitter"] use triple backticks — confirm
# this difference is intended.
_INTRO = """You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue.
Here's the information you'll have:
The user's objective: This is the task you're trying to complete.
The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.
The current web page's URL: This is the page you're currently navigating.
The open tabs: These are the tabs you have open.
The previous action: This is the action you just performed. It may be helpful to track your progress.
The actions you can perform fall into several categories:
Page Operation Actions:
`click [id]`: This action clicks on an element with a specific id on the webpage.
`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0.
`hover [id]`: Hover over an element with id.
`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).
`scroll [direction=down|up]`: Scroll the page up or down.
Tab Management Actions:
`new_tab`: Open a new, empty browser tab.
`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.
`close_tab`: Close the currently active tab.
URL Navigation Actions:
`goto [url]`: Navigate to a specific URL.
`go_back`: Navigate to the previously viewed page.
`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).
Completion Action:
`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket.
Homepage:
If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit.
http://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites.
To be successful, it is very important to follow the following rules:
1. You should only issue an action that is valid given the current observation
2. You should only issue one action at a time.
3. You should follow the examples to reason step by step and then issue the next action.
4. Generate the action in the correct format. Start with a "In summary, the next action I will perform is" phrase, followed by action inside ``````. For example, "In summary, the next action I will perform is ```click [1234]```".
5. Issue stop action when you think you have achieved the objective. Don't generate anything after stop."""

# Few-shot example 1: the answer is already on the page, so the response
# ends with a stop action carrying the price.
_PRICE_LOOKUP_EXAMPLE = (
    "OBSERVATION:\n"
    "[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)'\n"
    "[1749] StaticText '$279.49'\n"
    "[1757] button 'Add to Cart'\n"
    "[1760] button 'Add to Wish List'\n"
    "[1761] button 'Add to Compare'\n"
    "URL: http://onestopmarket.com/office-products/office-electronics.html\n"
    "OBJECTIVE: What is the price of HP Inkjet Fax Machine\n"
    "PREVIOUS ACTION: None",
    "Let's think step-by-step. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I think I have achieved the objective. I will issue the stop action with the answer. In summary, the next action I will perform is ```stop [$279.49]```",
)

# Few-shot example 2: the response types a query into the search box and
# passes [1] as the press-enter flag.
_MAP_SEARCH_EXAMPLE = (
    "OBSERVATION:\n"
    "[164] textbox 'Search' focused: True required: False\n"
    "[171] button 'Go'\n"
    "[174] link 'Find directions between two points'\n"
    "[212] heading 'Search Results'\n"
    "[216] button 'Close'\n"
    "URL: http://openstreetmap.org\n"
    "OBJECTIVE: Show me the restaurants near CMU\n"
    "PREVIOUS ACTION: None",
    "Let's think step-by-step. This page has a search box whose ID is [164]. According to the nominatim rule of openstreetmap, I can search for the restaurants near a location by \"restaurants near\". I can submit my typing by pressing the Enter afterwards. In summary, the next action I will perform is ```type [164] [restaurants near CMU] [1]```",
)

# Per-step message skeleton. Its four placeholders carry the same names as
# the entries of meta_data["keywords"].
_TEMPLATE = (
    "OBSERVATION:\n"
    "{observation}\n"
    "URL: {url}\n"
    "OBJECTIVE: {objective}\n"
    "PREVIOUS ACTION: {previous_action}"
)

# Settings stored alongside the prompt text. "prompt_constructor" is a name
# defined outside this module — presumably looked up by the agent code;
# verify against the consumer.
_META_DATA = dict(
    observation="accessibility_tree",
    action_type="id_accessibility_tree",
    keywords=["url", "objective", "observation", "previous_action"],
    prompt_constructor="CoTPromptConstructor",
    answer_phrase="In summary, the next action I will perform is",
    action_splitter="```",
)

prompt = {
    "intro": _INTRO,
    "examples": [_PRICE_LOOKUP_EXAMPLE, _MAP_SEARCH_EXAMPLE],
    "template": _TEMPLATE,
    "meta_data": _META_DATA,
}

View File

@ -0,0 +1,82 @@
# Chain-of-thought prompt definition for an agent that observes an
# accessibility tree and addresses page elements by id. It carries two
# few-shot examples; the stop action's description here gives no instruction
# for tasks the model considers impossible. The module's public name is
# `prompt`, a dict with the keys "intro", "examples", "template" and
# "meta_data".

# Instruction text describing the inputs, the action vocabulary and the
# output rules.
# NOTE(review): the action list uses single backticks, while rule 4, the
# examples and meta_data["action_splitter"] use triple backticks — confirm
# this difference is intended.
_INTRO = """You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue.
Here's the information you'll have:
The user's objective: This is the task you're trying to complete.
The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.
The current web page's URL: This is the page you're currently navigating.
The open tabs: These are the tabs you have open.
The previous action: This is the action you just performed. It may be helpful to track your progress.
The actions you can perform fall into several categories:
Page Operation Actions:
`click [id]`: This action clicks on an element with a specific id on the webpage.
`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0.
`hover [id]`: Hover over an element with id.
`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).
`scroll [direction=down|up]`: Scroll the page up or down.
Tab Management Actions:
`new_tab`: Open a new, empty browser tab.
`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.
`close_tab`: Close the currently active tab.
URL Navigation Actions:
`goto [url]`: Navigate to a specific URL.
`go_back`: Navigate to the previously viewed page.
`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).
Completion Action:
`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket.
Homepage:
If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit.
http://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites.
To be successful, it is very important to follow the following rules:
1. You should only issue an action that is valid given the current observation
2. You should only issue one action at a time.
3. You should follow the examples to reason step by step and then issue the next action.
4. Generate the action in the correct format. Start with a "In summary, the next action I will perform is" phrase, followed by action inside ``````. For example, "In summary, the next action I will perform is ```click [1234]```".
5. Issue stop action when you think you have achieved the objective. Don't generate anything after stop."""

# Few-shot example 1: the answer is already on the page, so the response
# ends with a stop action carrying the price.
_PRICE_LOOKUP_EXAMPLE = (
    "OBSERVATION:\n"
    "[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)'\n"
    "[1749] StaticText '$279.49'\n"
    "[1757] button 'Add to Cart'\n"
    "[1760] button 'Add to Wish List'\n"
    "[1761] button 'Add to Compare'\n"
    "URL: http://onestopmarket.com/office-products/office-electronics.html\n"
    "OBJECTIVE: What is the price of HP Inkjet Fax Machine\n"
    "PREVIOUS ACTION: None",
    "Let's think step-by-step. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I think I have achieved the objective. I will issue the stop action with the answer. In summary, the next action I will perform is ```stop [$279.49]```",
)

# Few-shot example 2: the response types a query into the search box and
# passes [1] as the press-enter flag.
_MAP_SEARCH_EXAMPLE = (
    "OBSERVATION:\n"
    "[164] textbox 'Search' focused: True required: False\n"
    "[171] button 'Go'\n"
    "[174] link 'Find directions between two points'\n"
    "[212] heading 'Search Results'\n"
    "[216] button 'Close'\n"
    "URL: http://openstreetmap.org\n"
    "OBJECTIVE: Show me the restaurants near CMU\n"
    "PREVIOUS ACTION: None",
    "Let's think step-by-step. This page has a search box whose ID is [164]. According to the nominatim rule of openstreetmap, I can search for the restaurants near a location by \"restaurants near\". I can submit my typing by pressing the Enter afterwards. In summary, the next action I will perform is ```type [164] [restaurants near CMU] [1]```",
)

# Per-step message skeleton. Its four placeholders carry the same names as
# the entries of meta_data["keywords"].
_TEMPLATE = (
    "OBSERVATION:\n"
    "{observation}\n"
    "URL: {url}\n"
    "OBJECTIVE: {objective}\n"
    "PREVIOUS ACTION: {previous_action}"
)

# Settings stored alongside the prompt text. "prompt_constructor" is a name
# defined outside this module — presumably looked up by the agent code;
# verify against the consumer.
_META_DATA = dict(
    observation="accessibility_tree",
    action_type="id_accessibility_tree",
    keywords=["url", "objective", "observation", "previous_action"],
    prompt_constructor="CoTPromptConstructor",
    answer_phrase="In summary, the next action I will perform is",
    action_splitter="```",
)

prompt = {
    "intro": _INTRO,
    "examples": [_PRICE_LOOKUP_EXAMPLE, _MAP_SEARCH_EXAMPLE],
    "template": _TEMPLATE,
    "meta_data": _META_DATA,
}

View File

@ -0,0 +1,115 @@
# Chain-of-thought prompt definition for an agent that observes an
# accessibility tree and addresses page elements by id. It carries three
# few-shot examples, and its action list is written with triple backticks,
# the same delimiter stored in meta_data["action_splitter"]. The module's
# public name is `prompt`, a dict with the keys "intro", "examples",
# "template" and "meta_data".

# Instruction text describing the inputs, the action vocabulary and the
# output rules.
_INTRO = """You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue.
Here's the information you'll have:
The user's objective: This is the task you're trying to complete.
The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.
The current web page's URL: This is the page you're currently navigating.
The open tabs: These are the tabs you have open.
The previous action: This is the action you just performed. It may be helpful to track your progress.
The actions you can perform fall into several categories:
Page Operation Actions:
```click [id]```: This action clicks on an element with a specific id on the webpage.
```type [id] [content]```: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0, i.e., ```type [id] [content] [0]```.
```hover [id]```: Hover over an element with id.
```press [key_comb]```: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).
```scroll [down]``` or ```scroll [up]```: Scroll the page up or down.
Tab Management Actions:
```new_tab```: Open a new, empty browser tab.
```tab_focus [tab_index]```: Switch the browser's focus to a specific tab using its index.
```close_tab```: Close the currently active tab.
URL Navigation Actions:
```goto [url]```: Navigate to a specific URL.
```go_back```: Navigate to the previously viewed page.
```go_forward```: Navigate to the next page (if a previous 'go_back' action was performed).
Completion Action:
```stop [answer]```: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket.
Homepage:
If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit.
http://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites.
To be successful, it is very important to follow the following rules:
1. You should only issue an action that is valid given the current observation
2. You should only issue one action at a time.
3. You should follow the examples to reason step by step and then issue the next action.
4. Generate the action in the correct format. Start with a "In summary, the next action I will perform is" phrase, followed by action inside ``````. For example, "In summary, the next action I will perform is ```click [1234]```".
5. Issue stop action when you think you have achieved the objective. Don't generate anything after stop."""

# Few-shot example 1: the answer is already on the page, so the response
# ends with a stop action carrying the price.
_PRICE_LOOKUP_EXAMPLE = (
    "OBSERVATION:\n"
    "[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)'\n"
    "[1749] StaticText '$279.49'\n"
    "[1757] button 'Add to Cart'\n"
    "[1760] button 'Add to Wish List'\n"
    "[1761] button 'Add to Compare'\n"
    "URL: http://onestopmarket.com/office-products/office-electronics.html\n"
    "OBJECTIVE: What is the price of HP Inkjet Fax Machine?\n"
    "PREVIOUS ACTION: None",
    "Let's think step-by-step. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I think I have achieved the objective. I will issue the stop action with the answer. In summary, the next action I will perform is ```stop [$279.49]```",
)

# Few-shot example 2: two forum posts are listed; the response clicks
# [1499], the '204 comments' link listed after the croissant post's heading.
_FORUM_COMMENTS_EXAMPLE = (
    "OBSERVATION:\n"
    "[204] heading '/f/food'\n"
    "[593] heading '[homemade] Obligatory Halloween Pumpkin Loaf!'\n"
    "[942] link '[homemade] Obligatory Halloween Pumpkin Loaf!'\n"
    "[945] StaticText 'Submitted by '\n"
    "[30] link 'kneechalice' expanded: False\n"
    "[1484] StaticText 't3_yid9lu'\n"
    "[949] time 'October 31, 2022 at 10:10:03 AM EDT'\n"
    "[1488] StaticText '1 year ago'\n"
    "[1489] link '45 comments'\n"
    "[605] heading '[I ate] Maple Pecan Croissant'\n"
    "[963] link '[I ate] Maple Pecan Croissant'\n"
    "[966] StaticText 'Submitted by '\n"
    "[37] link 'AccordingtoJP' expanded: False\n"
    "[1494] StaticText 't3_y3hrpn'\n"
    "[970] time 'October 13, 2022 at 10:41:09 PM EDT'\n"
    "[1498] StaticText '1 year ago'\n"
    "[1499] link '204 comments'\n"
    "URL: http://reddit.com\n"
    "OBJECTIVE: Tell me what the top comment on the croissant post says.\n"
    "PREVIOUS ACTION: None",
    "Let's think step-by-step. This page has a post titled '[I ate] Maple Pecan Croissant', which is the post mentioned in the objective. In order to find the top comment, I will navigate into the comments section of the post. In summary, the next action I will perform is ```click [1499]```",
)

# Few-shot example 3: the response types a query into the keyword textbox
# [81] and passes [1] as the press-enter flag.
_CLASSIFIEDS_SEARCH_EXAMPLE = (
    "OBSERVATION:\n"
    "[42] link 'My account'\n"
    "[43] link 'Logout'\n"
    "[44] link 'Publish Ad'\n"
    "[25] heading 'What are you looking for today?'\n"
    "[143] StaticText 'Keyword'\n"
    "[81] textbox 'e.g., a blue used car' required: False\n"
    "[146] StaticText 'Category'\n"
    "[28] heading 'Latest Listings'\n"
    "[86] link 'Atlas Powered Audio System w/ Tripod'\n"
    "[176] img 'Atlas Powered Audio System w/ Tripod'\n"
    "[511] StaticText '150.00 $'\n"
    "[88] link 'Neptune Gaming Console'\n"
    "[178] img 'Neptune Gaming Console'\n"
    "[515] StaticText '350.00 $'\n"
    "URL: http://classifieds.com\n"
    "OBJECTIVE: Help me find the cheapest dark colored guitar.\n"
    "PREVIOUS ACTION: None",
    "Let's think step-by-step. The objective is to find the cheapest dark colored guitar on the site. The site has a search box whose ID is [81]. I can search for guitars by entering \"guitar\". I can submit this by pressing the Enter afterwards. In summary, the next action I will perform is ```type [81] [guitar] [1]```",
)

# Per-step message skeleton. Its four placeholders carry the same names as
# the entries of meta_data["keywords"].
_TEMPLATE = (
    "OBSERVATION:\n"
    "{observation}\n"
    "URL: {url}\n"
    "OBJECTIVE: {objective}\n"
    "PREVIOUS ACTION: {previous_action}"
)

# Settings stored alongside the prompt text. "prompt_constructor" is a name
# defined outside this module — presumably looked up by the agent code;
# verify against the consumer.
_META_DATA = dict(
    observation="accessibility_tree",
    action_type="id_accessibility_tree",
    keywords=["url", "objective", "observation", "previous_action"],
    prompt_constructor="CoTPromptConstructor",
    answer_phrase="In summary, the next action I will perform is",
    action_splitter="```",
)

prompt = {
    "intro": _INTRO,
    "examples": [
        _PRICE_LOOKUP_EXAMPLE,
        _FORUM_COMMENTS_EXAMPLE,
        _CLASSIFIEDS_SEARCH_EXAMPLE,
    ],
    "template": _TEMPLATE,
    "meta_data": _META_DATA,
}

View File

@ -0,0 +1,118 @@
# Few-shot chain-of-thought prompt definition for the accessibility-tree observation
# with an accompanying example image per demonstration.
# Top-level keys:
#   "intro"     - instruction text describing the available actions and rules.
#   "examples"  - list of (observation text, expected reply, example image path) tuples.
#   "template"  - per-step text with {observation}, {url}, {objective} and
#                 {previous_action} placeholders.
#   "meta_data" - settings naming the observation/action types, the placeholder
#                 keywords, the prompt constructor and the answer delimiters.
# NOTE(review): how each key is consumed is not visible in this file; presumably the
# class named in meta_data["prompt_constructor"] reads them - confirm against the agent code.
prompt = {
"intro": """You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue.
Here's the information you'll have:
The user's objective: This is the task you're trying to complete.
The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.
The current web page's URL: This is the page you're currently navigating.
The open tabs: These are the tabs you have open.
The previous action: This is the action you just performed. It may be helpful to track your progress.
The actions you can perform fall into several categories:
Page Operation Actions:
```click [id]```: This action clicks on an element with a specific id on the webpage.
```type [id] [content]```: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0, i.e., ```type [id] [content] [0]```.
```hover [id]```: Hover over an element with id.
```press [key_comb]```: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).
```scroll [down]``` or ```scroll [up]```: Scroll the page up or down.
Tab Management Actions:
```new_tab```: Open a new, empty browser tab.
```tab_focus [tab_index]```: Switch the browser's focus to a specific tab using its index.
```close_tab```: Close the currently active tab.
URL Navigation Actions:
```goto [url]```: Navigate to a specific URL.
```go_back```: Navigate to the previously viewed page.
```go_forward```: Navigate to the next page (if a previous 'go_back' action was performed).
Completion Action:
```stop [answer]```: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket.
Homepage:
If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit.
http://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites.
To be successful, it is very important to follow the following rules:
1. You should only issue an action that is valid given the current observation
2. You should only issue one action at a time.
3. You should follow the examples to reason step by step and then issue the next action.
4. Generate the action in the correct format. Start with a "In summary, the next action I will perform is" phrase, followed by action inside ``````. For example, "In summary, the next action I will perform is ```click [1234]```".
5. Issue stop action when you think you have achieved the objective. Don't generate anything after stop.""",
# Three demonstrations; each tuple is (observation text, expected reply, image path).
"examples": [
(
"""OBSERVATION:
[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)'
[1749] StaticText '$279.49'
[1757] button 'Add to Cart'
[1760] button 'Add to Wish List'
[1761] button 'Add to Compare'
URL: http://onestopmarket.com/office-products/office-electronics.html
OBJECTIVE: What is the price of HP Inkjet Fax Machine?
PREVIOUS ACTION: None""",
"Let's think step-by-step. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I think I have achieved the objective. I will issue the stop action with the answer. In summary, the next action I will perform is ```stop [$279.49]```",
"agent/prompts/multimodal_examples/multimodal_example1.png"
),
(
"""OBSERVATION:
[204] heading '/f/food'
[593] heading '[homemade] Obligatory Halloween Pumpkin Loaf!'
[942] link '[homemade] Obligatory Halloween Pumpkin Loaf!'
[945] StaticText 'Submitted by '
[30] link 'kneechalice' expanded: False
[1484] StaticText 't3_yid9lu'
[949] time 'October 31, 2022 at 10:10:03 AM EDT'
[1488] StaticText '1 year ago'
[1489] link '45 comments'
[605] heading '[I ate] Maple Pecan Croissant'
[963] link '[I ate] Maple Pecan Croissant'
[966] StaticText 'Submitted by '
[37] link 'AccordingtoJP' expanded: False
[1494] StaticText 't3_y3hrpn'
[970] time 'October 13, 2022 at 10:41:09 PM EDT'
[1498] StaticText '1 year ago'
[1499] link '204 comments'
URL: http://reddit.com
OBJECTIVE: Tell me what the top comment on the croissant post says.
PREVIOUS ACTION: None""",
"Let's think step-by-step. This page has a post titled '[I ate] Maple Pecan Croissant', which is the post mentioned in the objective. In order to find the top comment, I will navigate into the comments section of the post. In summary, the next action I will perform is ```click [1499]```",
"agent/prompts/multimodal_examples/multimodal_example2.png"
),
(
"""OBSERVATION:
[42] link 'My account'
[43] link 'Logout'
[44] link 'Publish Ad'
[25] heading 'What are you looking for today?'
[143] StaticText 'Keyword'
[81] textbox 'e.g., a blue used car' required: False
[146] StaticText 'Category'
[28] heading 'Latest Listings'
[86] link 'Atlas Powered Audio System w/ Tripod'
[176] img 'Atlas Powered Audio System w/ Tripod'
[511] StaticText '150.00 $'
[88] link 'Neptune Gaming Console'
[178] img 'Neptune Gaming Console'
[515] StaticText '350.00 $'
URL: http://classifieds.com
OBJECTIVE: Help me find the cheapest dark colored guitar.
PREVIOUS ACTION: None""",
"Let's think step-by-step. The objective is to find the cheapest dark colored guitar on the site. The site has a search box whose ID is [81]. I can search for guitars by entering \"guitar\". I can submit this by pressing the Enter afterwards. In summary, the next action I will perform is ```type [81] [guitar] [1]```",
"agent/prompts/multimodal_examples/multimodal_example3.png"
),
],
# Per-step message; its placeholders are exactly the names listed in meta_data["keywords"].
# NOTE(review): the intro mentions "The open tabs" but no tab placeholder exists here.
"template": """OBSERVATION:
{observation}
URL: {url}
OBJECTIVE: {objective}
PREVIOUS ACTION: {previous_action}""",
"meta_data": {
"observation": "accessibility_tree",
"action_type": "id_accessibility_tree",
"keywords": ["url", "objective", "observation", "previous_action"],
"prompt_constructor": "MultimodalCoTPromptConstructor",
# The same phrase and delimiter that rule 4 of the intro tells the model to emit.
"answer_phrase": "In summary, the next action I will perform is",
"action_splitter": "```"
},
}

View File

@ -0,0 +1,112 @@
# Few-shot chain-of-thought prompt definition for the set-of-marks (SoM) observation:
# a screenshot with numbered bounding boxes plus a text listing of the marked elements.
# Top-level keys:
#   "intro"     - instruction text describing the available actions and rules.
#   "examples"  - list of (observation text, expected reply, example image path) tuples.
#   "template"  - per-step text with {observation}, {url}, {objective} and
#                 {previous_action} placeholders.
#   "meta_data" - settings naming the observation/action types, the placeholder
#                 keywords, the prompt constructor and the answer delimiters.
# NOTE(review): how each key is consumed is not visible in this file; presumably the
# class named in meta_data["prompt_constructor"] reads them - confirm against the agent code.
prompt = {
    "intro": """You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue.
Here's the information you'll have:
The user's objective: This is the task you're trying to complete.
The current web page screenshot: This is a screenshot of the webpage, with each interactable element assigned a unique numerical id. Each bounding box and its respective id shares the same color.
The observation, which lists the IDs of all interactable elements on the current web page with their text content if any, in the format [id] [tagType] [text content]. tagType is the type of the element, such as button, link, or textbox. text content is the text content of the element. For example, [1234] [button] ['Add to Cart'] means that there is a button with id 1234 and text content 'Add to Cart' on the current web page. [] [StaticText] [text] means that the element is of some text that is not interactable.
The current web page's URL: This is the page you're currently navigating.
The open tabs: These are the tabs you have open.
The previous action: This is the action you just performed. It may be helpful to track your progress.
The actions you can perform fall into several categories:
Page Operation Actions:
```click [id]```: This action clicks on an element with a specific id on the webpage.
```type [id] [content]```: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0, i.e., ```type [id] [content] [0]```.
```hover [id]```: Hover over an element with id.
```press [key_comb]```: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).
```scroll [down]``` or ```scroll [up]```: Scroll the page up or down.
Tab Management Actions:
```new_tab```: Open a new, empty browser tab.
```tab_focus [tab_index]```: Switch the browser's focus to a specific tab using its index.
```close_tab```: Close the currently active tab.
URL Navigation Actions:
```goto [url]```: Navigate to a specific URL.
```go_back```: Navigate to the previously viewed page.
```go_forward```: Navigate to the next page (if a previous 'go_back' action was performed).
Completion Action:
```stop [answer]```: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket.
Homepage:
If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit.
http://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites.
To be successful, it is very important to follow the following rules:
1. You should only issue an action that is valid given the current observation
2. You should only issue one action at a time.
3. You should follow the examples to reason step by step and then issue the next action.
4. Generate the action in the correct format. Start with a "In summary, the next action I will perform is" phrase, followed by action inside ``````. For example, "In summary, the next action I will perform is ```click [1234]```".
5. Issue stop action when you think you have achieved the objective. Don't generate anything after stop.""",
    # Three demonstrations; each tuple is (observation text, expected reply, image path).
    "examples": [
        (
            """OBSERVATION:
[31] [IMG] [Image, description: hp fx-7010dn fax machine, url: http://ec2-3-13-232-171.us-east-2.compute.amazonaws.com:7770/media/catalog/product/cache/89ff578b9cd87e0600daac45c9e1ea98/B/0/B08GKZ3ZKD.0.jpg]
[32] [A] [HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)]
[] [StaticText] [$279.49]
[33] [BUTTON] [Add to Cart]
[34] [A] [Add to Wish List]
[35] [A] [Add to Compare]
URL: http://onestopmarket.com/office-products/office-electronics.html
OBJECTIVE: What is the price of HP Inkjet Fax Machine?
PREVIOUS ACTION: None""",
            "Let's think step-by-step. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I think I have achieved the objective. I will issue the stop action with the answer. In summary, the next action I will perform is ```stop [$279.49]```",
            "agent/prompts/som_examples/som_example1.png",
        ),
        (
            """OBSERVATION:
[] [StaticText] [/f/food]
[] [StaticText] [[homemade] Obligatory Halloween Pumpkin Loaf! Submitted by kneechalice t3_yid9lu 1 year ago]
[9] [IMG] []
[] [StaticText] [Submitted by kneechalice t3_yid9lu 1 year ago]
[10] [A] [kneechalice]
[11] [A] [45 comments]
[] [StaticText] [[I ate] Maple Pecan Croissant Submitted by AccordingtoJP t3_y3hrpn 1 year ago]
[14] [IMG] []
[] [StaticText] [Submitted by AccordingtoJP t3_y3hrpn 1 year ago]
[15] [A] [AccordingtoJP]
[16] [A] [204 comments]
URL: http://reddit.com
OBJECTIVE: Tell me what the top comment on the croissant post says.
PREVIOUS ACTION: None""",
            # The croissant post's comments link is [16] ("204 comments"); [11] belongs to
            # the pumpkin-loaf post, so the demonstrated action must click [16].
            "Let's think step-by-step. This page has a post titled '[I ate] Maple Pecan Croissant', which is the post mentioned in the objective. In order to find the top comment, I will navigate into the comments section of the post. In summary, the next action I will perform is ```click [16]```",
            "agent/prompts/som_examples/som_example2.png",
        ),
        (
            """OBSERVATION:
[] [StaticText] [What are you looking for today?]
[5] [INPUT] []
[6] [SELECT] [Select a category]
[7] [BUTTON] [Search]
[] [StaticText] [Latest Listings]
[] [StaticText] [Atlas Powered Audio System w/ Tripod 150.00 $ Music instruments Borough of Red Lion (Pennsylvania) 2023/11/16]
[8] [IMG] [Atlas Powered Audio System w/ Tripod]
[9] [A] [Atlas Powered Audio System w/ Tripod]
[] [StaticText] [150.00 $]
[] [StaticText] [Neptune Gaming Console 350.00 $ Video gaming Pennwyn (Pennsylvania) 2023/11/16]
[10] [IMG] [Neptune Gaming Console]
[11] [A] [Neptune Gaming Console]
[] [StaticText] [350.00 $]
URL: http://classifieds.com
OBJECTIVE: Help me find the cheapest dark colored guitar.
PREVIOUS ACTION: None""",
            "Let's think step-by-step. The objective is to find the cheapest dark colored guitar on the site. The site has a search box whose ID is [5]. I can search for guitars by entering \"guitar\". I can submit this by pressing the Enter afterwards. In summary, the next action I will perform is ```type [5] [guitar] [1]```",
            "agent/prompts/som_examples/som_example3.png",
        ),
    ],
    # Per-step message; its placeholders are exactly the names listed in meta_data["keywords"].
    "template": """OBSERVATION: {observation}
URL: {url}
OBJECTIVE: {objective}
PREVIOUS ACTION: {previous_action}""",
    "meta_data": {
        "observation": "image_som",
        "action_type": "som",
        "keywords": ["url", "objective", "observation", "previous_action"],
        "prompt_constructor": "MultimodalCoTPromptConstructor",
        # The same phrase and delimiter that rule 4 of the intro tells the model to emit.
        "answer_phrase": "In summary, the next action I will perform is",
        "action_splitter": "```",
    },
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 282 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 222 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 310 KiB

View File

@ -0,0 +1,26 @@
import glob
import importlib
import json
import os
# use the current directory as the root
def run() -> None:
    """Convert every prompt module in agent/prompts/raw to a JSON file.

    Each ``agent/prompts/raw/<name>.py`` is imported as
    ``agent.prompts.raw.<name>`` and its module-level ``prompt`` object is
    dumped to ``agent/prompts/jsons/<name>.json``. All paths are relative to
    the current working directory, which must also be importable as the
    package root. Python files are easier to edit than raw JSON.

    Raises:
        AttributeError: if a prompt module does not define ``prompt``.
    """
    out_dir = "agent/prompts/jsons"
    # Loop-invariant: create the output directory once instead of once per file.
    os.makedirs(out_dir, exist_ok=True)
    # Sorted so the conversion order does not depend on the filesystem.
    for p_file in sorted(glob.glob("agent/prompts/raw/*.py")):
        # Strip only the trailing extension (str.replace would remove every ".py").
        base_name = os.path.splitext(os.path.basename(p_file))[0]
        if base_name == "__init__":
            # A package marker is not a prompt module and has no `prompt` attribute.
            continue
        # import the file as a module
        module = importlib.import_module(f"agent.prompts.raw.{base_name}")
        prompt = module.prompt
        # save the prompt as a json file
        with open(f"{out_dir}/{base_name}.json", "w") as f:
            json.dump(prompt, f, indent=2)
    print("Done convert python files to json")
if __name__ == "__main__":
run()

View File

@ -0,0 +1,76 @@
import asyncio
from .actions import (
Action,
ActionParsingError,
ActionTypes,
action2create_function,
action2str,
create_check_action,
create_click_action,
create_focus_and_click_action,
create_focus_and_type_action,
create_go_back_action,
create_go_forward_action,
create_goto_url_action,
create_hover_action,
create_id_based_action,
create_key_press_action,
create_keyboard_type_action,
create_mouse_click_action,
create_mouse_hover_action,
create_new_tab_action,
create_none_action,
create_page_close_action,
create_page_focus_action,
create_playwright_action,
create_random_action,
create_scroll_action,
create_select_option_action,
create_stop_action,
create_type_action,
is_equivalent,
)
from .async_envs import AsyncScriptBrowserEnv
from .envs import ScriptBrowserEnv
from .processors import ObservationMetadata
from .trajectory import Trajectory
from .utils import DetachedPage, StateInfo
# Public names of this package: the two environment classes, the helper types,
# and every action type / action factory imported above from the submodules.
__all__ = [
"ScriptBrowserEnv",
"AsyncScriptBrowserEnv",
"DetachedPage",
"StateInfo",
"ObservationMetadata",
"Action",
"ActionTypes",
"action2str",
"create_random_action",
"create_focus_and_click_action",
"create_focus_and_type_action",
"is_equivalent",
"create_mouse_click_action",
"create_mouse_hover_action",
"create_none_action",
"create_keyboard_type_action",
"create_page_focus_action",
"create_new_tab_action",
"create_go_back_action",
"create_go_forward_action",
"create_goto_url_action",
"create_page_close_action",
"action2create_function",
"create_playwright_action",
"create_id_based_action",
"create_scroll_action",
"create_key_press_action",
"create_check_action",
"create_click_action",
"create_type_action",
"create_hover_action",
"create_select_option_action",
"create_stop_action",
"ActionParsingError",
"Trajectory",
]

View File

@ -0,0 +1,160 @@
import asyncio
import json
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import numpy.typing as npt
from beartype import beartype
from gymnasium import Env
from gymnasium.spaces import Box, Text
from playwright.async_api import Page, ViewportSize, async_playwright
from .actions import Action, aexecute_action, get_action_space
from .utils import DetachedPage, png_bytes_to_numpy
class AsyncScriptBrowserEnv(Env[npt.NDArray[np.uint8], Action]):
    """
    Prototype asynchronous browser environment driven by Playwright.

    In the end, we want to support a fully configurable browser environment with wide
    range of action spaces and observation spaces, both structured and unstructured.
    In this prototype the action space is the one returned by ``get_action_space()``,
    the observation is a screenshot of the page as a uint8 array, and the page URL
    and HTML content are returned in the ``info`` dict under the ``"page"`` key.
    """

    @beartype
    def __init__(
        self,
        max_page_length: int = 2048,
        headless: bool = True,
        slow_mo: int = 0,
        timeout: int = 30000,
        viewport_size: ViewportSize = {"width": 1280, "height": 720},
    ):
        """Store the browser settings; the browser itself is launched by ``setup``.

        :param max_page_length: accepted for interface compatibility; not used
            by any code in this class.
        :param headless: passed to ``chromium.launch``.
        :param slow_mo: passed to ``chromium.launch``.
        :param timeout: stored on the instance; not read elsewhere in this class.
        :param viewport_size: viewport of the browser context; also fixes the
            height/width of the observation space.
        """
        # Screenshots are declared as height x width x 4 uint8 arrays.
        self.observation_space = Box(
            0,
            255,
            (viewport_size["height"], viewport_size["width"], 4),
            np.uint8,
        )
        # TODO: make Space[Action] = ActionSpace
        self.action_space = get_action_space()  # type: ignore[assignment]
        self.headless = headless
        self.slow_mo = slow_mo
        self.reset_finished = False
        self.timeout = timeout
        self.viewport_size = viewport_size

    @beartype
    async def setup(self, config_file: Path | None = None) -> None:
        """Launch Chromium, create a context and a page, and open the start URL.

        :param config_file: optional JSON file; its ``storage_state``,
            ``start_url`` and ``geolocation`` entries are used when present.
        """
        self.context_manager = async_playwright()
        self.playwright = await self.context_manager.__aenter__()
        self.browser = await self.playwright.chromium.launch(
            headless=self.headless, slow_mo=self.slow_mo
        )
        if config_file:
            with open(config_file, "r") as f:
                instance_config = json.load(f)
        else:
            instance_config = {}

        storage_state = instance_config.get("storage_state", None)
        start_url = instance_config.get("start_url", None)
        geolocation = instance_config.get("geolocation", None)

        self.context = await self.browser.new_context(
            viewport=self.viewport_size,
            storage_state=storage_state,
            geolocation=geolocation,
            device_scale_factor=1,
        )
        self.page = await self.context.new_page()
        if start_url:
            await self.page.goto(start_url)

    @beartype
    async def areset(
        self,
        *,
        seed: int | None = None,
        options: dict[str, str] | None = None,
    ) -> tuple[npt.NDArray[np.uint8], dict[str, object]]:
        """
        Reset the environment.
        :param seed: forwarded to ``Env.reset``.
        :param options: options for the environment. The options are:
            - config_file: the path to the JSON config file handed to ``setup``
        :return: ``(screenshot, info)`` where ``info["page"]`` is a
            ``DetachedPage`` built from the page URL and HTML content.
        :raises ValueError: if ``config_file`` is given but does not exist.
        """
        super().reset(seed=seed, options=options)
        # Tear down the Playwright session of the previous episode, if any.
        if self.reset_finished:
            await self.context_manager.__aexit__()
        if options is not None and "config_file" in options:
            config_file = Path(options["config_file"])
            if config_file.exists():
                await self.setup(config_file=config_file)
            else:
                raise ValueError(f"Config state {config_file} does not exist.")
        else:
            await self.setup()
        self.reset_finished = True
        content = await self.page.content()
        screenshot = png_bytes_to_numpy(await self.page.screenshot())
        return (
            screenshot,
            {"page": DetachedPage(self.page.url, content)},
        )

    @beartype
    def reset(
        self,
        *,
        seed: int | None = None,
        options: dict[str, str] | None = None,
    ) -> tuple[npt.NDArray[np.uint8], dict[str, object]]:
        """Synchronous wrapper that runs ``areset`` with ``asyncio.run``."""
        return asyncio.run(self.areset(seed=seed, options=options))

    async def aclose(self) -> None:
        """Exit the Playwright context manager if a reset has completed."""
        if self.reset_finished:
            await self.context_manager.__aexit__()

    def close(self) -> None:
        """Synchronous wrapper that runs ``aclose`` with ``asyncio.run``."""
        asyncio.run(self.aclose())

    @beartype
    async def astep(
        self, action: Action
    ) -> tuple[npt.NDArray[np.uint8], float, bool, bool, dict[str, object]]:
        """Execute one action and return the resulting observation.

        :return: ``(screenshot, reward, terminated, truncated, info)``. The
            reward is 1.0 when the action executed without raising and 0.0
            otherwise; both flags are always False; ``info`` holds the
            ``DetachedPage`` and the error text (empty on success).
        :raises RuntimeError: if called before a reset has completed.
        """
        if not self.reset_finished:
            raise RuntimeError("Call reset first before calling step.")
        success = False
        fail_error = ""
        try:
            self.page = await aexecute_action(action, self.page, self.context)
            success = True
        except Exception as e:
            # A failed action is reported through the reward and info, not raised.
            fail_error = str(e)
        try:
            content = await self.page.content()
            screenshot = png_bytes_to_numpy(await self.page.screenshot())
        except Exception:
            # Retry once after the page has loaded. `Exception` (not a bare
            # except) so task cancellation and KeyboardInterrupt still propagate.
            await self.page.wait_for_load_state("load")
            content = await self.page.content()
            screenshot = png_bytes_to_numpy(await self.page.screenshot())
        return (
            screenshot,
            float(success),
            False,
            False,
            {
                "page": DetachedPage(self.page.url, content),
                "fail_error": fail_error,
            },
        )

    @beartype
    def step(
        self, action: Action
    ) -> tuple[npt.NDArray[np.uint8], float, bool, bool, dict[str, object]]:
        """Synchronous wrapper that runs ``astep`` with ``asyncio.run`` in debug mode.

        NOTE(review): each ``asyncio.run`` call starts a fresh event loop while
        the Playwright objects were created inside the loop used by ``reset``;
        confirm the sync wrappers are actually usable across calls.
        """
        return asyncio.run(self.astep(action), debug=True)

View File

@ -0,0 +1,182 @@
"""Script to automatically login each website"""
import argparse
import glob
import os
import time
from concurrent.futures import ThreadPoolExecutor
from itertools import combinations
from pathlib import Path
from playwright.sync_api import sync_playwright
from browser_env.env_config import ACCOUNTS
DATASET = os.environ["DATASET"]

# One row per site: (site name, URL that needs a logged-in session,
# exact-URL-match flag, keyword expected in the page content).
if DATASET == "webarena":
    from browser_env.env_config import GITLAB, REDDIT, SHOPPING, SHOPPING_ADMIN

    _SITE_CHECKS = [
        ("gitlab", f"{GITLAB}/-/profile", True, ""),
        ("shopping", f"{SHOPPING}/wishlist/", True, ""),
        ("shopping_admin", f"{SHOPPING_ADMIN}/dashboard", True, "Dashboard"),
        (
            "reddit",
            f"{REDDIT}/user/{ACCOUNTS['reddit']['username']}/account",
            True,
            "Delete",
        ),
    ]
elif DATASET == "visualwebarena":
    from browser_env.env_config import CLASSIFIEDS, REDDIT, SHOPPING

    _SITE_CHECKS = [
        ("shopping", f"{SHOPPING}/wishlist/", True, ""),
        (
            "reddit",
            f"{REDDIT}/user/{ACCOUNTS['reddit']['username']}/account",
            True,
            "Delete",
        ),
        (
            "classifieds",
            f"{CLASSIFIEDS}/index.php?page=user&action=items",
            True,
            "My listings",
        ),
    ]
else:
    raise ValueError(f"Dataset not implemented: {DATASET}")

# Parallel lists indexed by site position, as the rest of the script expects.
SITES = [check[0] for check in _SITE_CHECKS]
URLS = [check[1] for check in _SITE_CHECKS]
EXACT_MATCH = [check[2] for check in _SITE_CHECKS]
KEYWORDS = [check[3] for check in _SITE_CHECKS]

# Browser launch settings shared by the helpers below.
HEADLESS = True
SLOW_MO = 0

assert len(SITES) == len(URLS) == len(EXACT_MATCH) == len(KEYWORDS)
def is_expired(
    storage_state: Path, url: str, keyword: str, url_exact: bool = True
) -> bool:
    """Test whether the cookie is expired.

    Opens ``url`` in a browser context restored from ``storage_state`` and
    inspects the page reached after one second.

    :param storage_state: path of the saved browser state file.
    :param url: page to open with the saved state.
    :param keyword: when non-empty, the state counts as valid only if this
        text occurs in the page content; the URL is then not compared.
    :param url_exact: with an empty ``keyword``, require the final URL to equal
        ``url`` (True) or merely to contain it (False).
    :return: True when the state file is missing or the check fails.
    """
    if not storage_state.exists():
        return True

    # `with` shuts the Playwright driver and browser down even if navigation
    # raises; calling __enter__/__exit__ by hand leaked them on errors.
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=HEADLESS, slow_mo=SLOW_MO)
        context = browser.new_context(storage_state=storage_state)
        page = context.new_page()
        page.goto(url)
        time.sleep(1)  # give any redirect a moment to happen
        d_url = page.url
        content = page.content()

    if keyword:
        return keyword not in content
    if url_exact:
        return d_url != url
    return url not in d_url
def renew_comb(comb: list[str], auth_folder: str = "./.auth") -> None:
    """Log in to every site in ``comb`` and save the resulting browser state.

    All logins happen on one page of one browser context, so the saved state
    covers the whole combination. The state is written to
    ``<auth_folder>/<site1>.<site2>..._state.json``.

    :param comb: site names; names not handled below are ignored.
    :param auth_folder: folder receiving the state file (created if missing).
    """
    # The state file is written inside auth_folder; make sure it exists first.
    os.makedirs(auth_folder, exist_ok=True)

    # `with` shuts the Playwright driver and browser down even if a login step
    # raises; calling __enter__/__exit__ by hand leaked them on errors.
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=HEADLESS)
        context = browser.new_context()
        page = context.new_page()

        if "shopping" in comb:
            username = ACCOUNTS["shopping"]["username"]
            password = ACCOUNTS["shopping"]["password"]
            page.goto(f"{SHOPPING}/customer/account/login/")
            page.get_by_label("Email", exact=True).fill(username)
            page.get_by_label("Password", exact=True).fill(password)
            page.get_by_role("button", name="Sign In").click()

        if "reddit" in comb:
            username = ACCOUNTS["reddit"]["username"]
            password = ACCOUNTS["reddit"]["password"]
            page.goto(f"{REDDIT}/login")
            page.get_by_label("Username").fill(username)
            page.get_by_label("Password").fill(password)
            page.get_by_role("button", name="Log in").click()

        if "classifieds" in comb:
            username = ACCOUNTS["classifieds"]["username"]
            password = ACCOUNTS["classifieds"]["password"]
            page.goto(f"{CLASSIFIEDS}/index.php?page=login")
            page.locator("#email").fill(username)
            page.locator("#password").fill(password)
            page.get_by_role("button", name="Log in").click()

        if "shopping_admin" in comb:
            username = ACCOUNTS["shopping_admin"]["username"]
            password = ACCOUNTS["shopping_admin"]["password"]
            page.goto(f"{SHOPPING_ADMIN}")
            page.get_by_placeholder("user name").fill(username)
            page.get_by_placeholder("password").fill(password)
            page.get_by_role("button", name="Sign in").click()

        if "gitlab" in comb:
            username = ACCOUNTS["gitlab"]["username"]
            password = ACCOUNTS["gitlab"]["password"]
            page.goto(f"{GITLAB}/users/sign_in")
            page.get_by_test_id("username-field").click()
            page.get_by_test_id("username-field").fill(username)
            page.get_by_test_id("username-field").press("Tab")
            page.get_by_test_id("password-field").fill(password)
            page.get_by_test_id("sign-in-button").click()

        context.storage_state(path=f"{auth_folder}/{'.'.join(comb)}_state.json")
def get_site_comb_from_filepath(file_path: str) -> list[str]:
    """Recover the site names encoded in a state file's name.

    The file name is cut at its last underscore and the part before it is
    split on dots, e.g. ``gitlab.reddit_state.json`` -> ``["gitlab", "reddit"]``.
    """
    file_name = os.path.basename(file_path)
    sites_part, *_ = file_name.rsplit("_", 1)
    return sites_part.split(".")
def main(auth_folder: str = "./.auth") -> None:
    """Renew the login state of every site and site pair, then verify them.

    State files are first regenerated in parallel for each single site and
    each pair of sites. Every ``*.json`` file in ``auth_folder`` is then
    checked, once per site it covers, with ``is_expired``.

    :param auth_folder: folder holding the state files.
    :raises AssertionError: if any state file fails the expiry check.
    """
    pairs = list(combinations(SITES, 2))
    renewals = []
    with ThreadPoolExecutor(max_workers=8) as executor:
        for pair in pairs:
            # Auth doesn't work on this pair as they share the same cookie
            if "reddit" in pair and (
                "shopping" in pair or "shopping_admin" in pair
            ):
                continue
            comb = list(sorted(pair))
            renewals.append(
                (comb, executor.submit(renew_comb, comb, auth_folder=auth_folder))
            )
        for site in SITES:
            renewals.append(
                ([site], executor.submit(renew_comb, [site], auth_folder=auth_folder))
            )

    # Exceptions raised inside worker threads stay hidden unless the futures are
    # inspected; report them, but let the expiry check below decide the outcome.
    for comb, renewal in renewals:
        error = renewal.exception()
        if error is not None:
            print(f"Renewing {'.'.join(comb)} failed: {error!r}")

    # parallel checking if the cookies are expired
    checks = []
    cookie_files = list(glob.glob(f"{auth_folder}/*.json"))
    with ThreadPoolExecutor(max_workers=8) as executor:
        for c_file in cookie_files:
            comb = get_site_comb_from_filepath(c_file)
            for cur_site in comb:
                site_idx = SITES.index(cur_site)
                future = executor.submit(
                    is_expired,
                    Path(c_file),
                    URLS[site_idx],
                    KEYWORDS[site_idx],
                    EXACT_MATCH[site_idx],
                )
                # Keep the file next to its future: one file yields one future per
                # site, so a future's position is not an index into cookie_files.
                checks.append((c_file, future))

    for c_file, future in checks:
        assert not future.result(), f"Cookie {c_file} expired."
if __name__ == "__main__":
    # With no --site_list, renew and verify every site combination; otherwise
    # renew only the given combination.
    parser = argparse.ArgumentParser()
    parser.add_argument("--site_list", nargs="+", default=[])
    parser.add_argument("--auth_folder", type=str, default="./.auth")
    args = parser.parse_args()
    if not args.site_list:
        # Forward --auth_folder; it was previously ignored on this path.
        main(auth_folder=args.auth_folder)
    else:
        renew_comb(args.site_list, auth_folder=args.auth_folder)

View File

@ -0,0 +1,324 @@
import re
from typing import Literal
ROLES = (
"alert",
"alertdialog",
"application",
"article",
"banner",
"blockquote",
"button",
"caption",
"cell",
"checkbox",
"code",
"columnheader",
"combobox",
"complementary",
"contentinfo",
"definition",
"deletion",
"dialog",
"directory",
"document",
"emphasis",
"feed",
"figure",
"form",
"generic",
"grid",
"gridcell",
"group",
"heading",
"img",
"insertion",
"link",
"list",
"listbox",
"listitem",
"log",
"main",
"marquee",
"math",
"meter",
"menu",
"menubar",
"menuitem",
"menuitemcheckbox",
"menuitemradio",
"navigation",
"none",
"note",
"option",
"paragraph",
"presentation",
"progressbar",
"radio",
"radiogroup",
"region",
"row",
"rowgroup",
"rowheader",
"scrollbar",
"search",
"searchbox",
"separator",
"slider",
"spinbutton",
"status",
"strong",
"subscript",
"superscript",
"switch",
"tab",
"table",
"tablist",
"tabpanel",
"term",
"textbox",
"time",
"timer",
"toolbar",
"tooltip",
"tree",
"treegrid",
"treeitem",
)
SPECIAL_LOCATORS = (
"alt_text",
"label",
"placeholder",
)
ASCII_CHARSET = "".join(chr(x) for x in range(32, 128))
FREQ_UNICODE_CHARSET = "".join(chr(x) for x in range(129, 130000))
UTTERANCE_MAX_LENGTH = 8192
ATTRIBUTE_MAX_LENGTH = 256
TEXT_MAX_LENGTH = 256
TYPING_MAX_LENGTH = 64
URL_MAX_LENGTH = 256
MAX_ELEMENT_INDEX_IN_VIEWPORT = 10
MAX_ELEMENT_ID = 1000
MAX_ANSWER_LENGTH = 512
MIN_REF = -1000000
MAX_REF = 1000000
WINDOW_WIDTH = 500
WINDOW_HEIGHT = 240
TASK_WIDTH = 160
TASK_HEIGHT = 210
FLIGHT_WINDOW_WIDTH = 600
FLIGHT_WINDOW_HEIGHT = 700
FLIGHT_TASK_WIDTH = 375
FLIGHT_TASK_HEIGHT = 667
MAX_PAGE_NUMBER = 10
SPECIAL_KEYS = (
"Enter",
"Tab",
"Control",
"Shift",
"Meta",
"Backspace",
"Delete",
"Escape",
"ArrowUp",
"ArrowDown",
"ArrowLeft",
"ArrowRight",
"PageDown",
"PageUp",
"Meta+a",
)
SPECIAL_KEY_MAPPINGS = {
"backquote": "Backquote",
"minus": "Minus",
"equal": "Equal",
"backslash": "Backslash",
"backspace": "Backspace",
"meta": "Meta",
"tab": "Tab",
"delete": "Delete",
"escape": "Escape",
"arrowdown": "ArrowDown",
"end": "End",
"enter": "Enter",
"home": "Home",
"insert": "Insert",
"pagedown": "PageDown",
"pageup": "PageUp",
"arrowright": "ArrowRight",
"arrowup": "ArrowUp",
"f1": "F1",
"f2": "F2",
"f3": "F3",
"f4": "F4",
"f5": "F5",
"f6": "F6",
"f7": "F7",
"f8": "F8",
"f9": "F9",
"f10": "F10",
"f11": "F11",
"f12": "F12",
}
RolesType = Literal[
"alert",
"alertdialog",
"application",
"article",
"banner",
"blockquote",
"button",
"caption",
"cell",
"checkbox",
"code",
"columnheader",
"combobox",
"complementary",
"contentinfo",
"definition",
"deletion",
"dialog",
"directory",
"document",
"emphasis",
"feed",
"figure",
"form",
"generic",
"grid",
"gridcell",
"group",
"heading",
"img",
"insertion",
"link",
"list",
"listbox",
"listitem",
"log",
"main",
"marquee",
"math",
"meter",
"menu",
"menubar",
"menuitem",
"menuitemcheckbox",
"menuitemradio",
"navigation",
"none",
"note",
"option",
"paragraph",
"presentation",
"progressbar",
"radio",
"radiogroup",
"region",
"row",
"rowgroup",
"rowheader",
"scrollbar",
"search",
"searchbox",
"separator",
"slider",
"spinbutton",
"status",
"strong",
"subscript",
"superscript",
"switch",
"tab",
"table",
"tablist",
"tabpanel",
"term",
"textbox",
"time",
"timer",
"toolbar",
"tooltip",
"tree",
"treegrid",
"treeitem",
"alt_text",
"label",
"placeholder",
]
MAX_VANILLA_STR_LENGTH = 1000
PLAYWRIGHT_LOCATORS = (
"get_by_role",
"get_by_text",
"get_by_label",
"get_by_placeholder",
"get_by_alt_text",
"get_by_title",
"get_by_test_id",
"filter",
"frame_locator",
"locator",
)
PLAYWRIGHT_ACTIONS = (
"fill",
"check",
"select_option",
"click",
"hover",
"dclick",
"type",
"focus",
"goto",
"press",
"scroll",
)
IGNORED_ACTREE_PROPERTIES = (
"focusable",
"editable",
"readonly",
"level",
"settable",
"multiline",
"invalid",
)
INJECTED_ATTR_NAME = "aria-roledescription"
BID_ATTR = "bid"  # the attribute name for extra meta data

# Capture-group fragments used to assemble DATA_REGEXP.
BID_EXPR = r"([-0-9]+)"
FLOAT_EXPR = r"([+-]?(?:[0-9]*[.])?[0-9]+)"
BOOL_EXPR = r"([01])"

# Underscore-separated record: one id, six floats, one 0/1 flag, then the
# remaining text as a final catch-all group.
DATA_REGEXP = re.compile(
    r"_".join([BID_EXPR, *([FLOAT_EXPR] * 6), BOOL_EXPR, r"(.*)"])
)

IN_VIEWPORT_RATIO_THRESHOLD = 0.6

View File

@ -0,0 +1,97 @@
# websites domain
import os
# Name of the benchmark whose site URLs are configured below. Read with .get()
# so that an unset variable reaches the explanatory error instead of a bare
# KeyError.
DATASET = os.environ.get("DATASET", "")
if DATASET not in ["webarena", "visualwebarena"]:
    # Only the two datasets handled below are listed as options.
    raise ValueError(
        "Please set the DATASET environment variable, the possible options are `webarena` and `visualwebarena`"
    )
# WebArena
if DATASET == "webarena":
    # Base URL of every site, taken from the environment ("" when unset).
    REDDIT = os.environ.get("REDDIT", "")
    SHOPPING = os.environ.get("SHOPPING", "")
    SHOPPING_ADMIN = os.environ.get("SHOPPING_ADMIN", "")
    GITLAB = os.environ.get("GITLAB", "")
    WIKIPEDIA = os.environ.get("WIKIPEDIA", "")
    MAP = os.environ.get("MAP", "")
    HOMEPAGE = os.environ.get("HOMEPAGE", "")

    assert (
        REDDIT
        and SHOPPING
        and SHOPPING_ADMIN
        and GITLAB
        and WIKIPEDIA
        and MAP
        and HOMEPAGE
    ), (
        f"Please setup the URLs to each site. Current: \n"
        + f"Reddit: {REDDIT}\n"
        + f"Shopping: {SHOPPING}\n"
        + f"Shopping Admin: {SHOPPING_ADMIN}\n"
        + f"Gitlab: {GITLAB}\n"
        + f"Wikipedia: {WIKIPEDIA}\n"
        + f"Map: {MAP}\n"
        + f"Homepage: {HOMEPAGE}\n"
    )

    # Configured site URL -> canonical public-looking URL.
    URL_MAPPINGS = {
        REDDIT: "http://reddit.com",
        SHOPPING: "http://onestopmarket.com",
        SHOPPING_ADMIN: "http://luma.com/admin",
        GITLAB: "http://gitlab.com",
        WIKIPEDIA: "http://wikipedia.org",
        MAP: "http://openstreetmap.org",
        HOMEPAGE: "http://homepage.com",
    }

elif DATASET == "visualwebarena":
    # Base URL of every site, taken from the environment ("" when unset).
    REDDIT = os.environ.get("REDDIT", "")
    SHOPPING = os.environ.get("SHOPPING", "")
    WIKIPEDIA = os.environ.get("WIKIPEDIA", "")
    HOMEPAGE = os.environ.get("HOMEPAGE", "")
    CLASSIFIEDS = os.environ.get("CLASSIFIEDS", "")
    CLASSIFIEDS_RESET_TOKEN = os.environ.get("CLASSIFIEDS_RESET_TOKEN", "")
    # NOTE(review): REDDIT_RESET_URL is read but not part of the check below;
    # presumably optional - confirm.
    REDDIT_RESET_URL = os.environ.get("REDDIT_RESET_URL", "")

    assert (
        REDDIT
        and SHOPPING
        and WIKIPEDIA
        and HOMEPAGE
        and CLASSIFIEDS
        and CLASSIFIEDS_RESET_TOKEN
    ), (
        # One field per line, like the webarena branch; previously the fields
        # were concatenated with no separator and ran into each other.
        f"Please setup the URLs and tokens to each site. Current: \n"
        + f"Reddit: {REDDIT}\n"
        + f"Shopping: {SHOPPING}\n"
        + f"Wikipedia: {WIKIPEDIA}\n"
        + f"Homepage: {HOMEPAGE}\n"
        + f"Classifieds: {CLASSIFIEDS}\n"
        + f"Classifieds reset token: {CLASSIFIEDS_RESET_TOKEN}\n"
    )

    # Configured site URL -> canonical public-looking URL.
    URL_MAPPINGS = {
        REDDIT: "http://reddit.com",
        SHOPPING: "http://onestopmarket.com",
        WIKIPEDIA: "http://wikipedia.org",
        HOMEPAGE: "http://homepage.com",
        CLASSIFIEDS: "http://classifieds.com",
    }
else:
    raise ValueError(f"Dataset not implemented: {DATASET}")
# Login credentials per site, keyed by site name; each entry holds a
# "username" and a "password".
# NOTE(review): credentials are hard-coded; presumably the default accounts of
# the benchmark's test sites - confirm they are never real secrets.
ACCOUNTS = {
"reddit": {"username": "MarvelsGrantMan136", "password": "test1234"},
"shopping": {
"username": "emma.lopez@gmail.com",
"password": "Password.123",
},
"classifieds": {
"username": "blake.sullivan@gmail.com",
"password": "Password.123",
},
# "shopping_site_admin" and "shopping_admin" carry the same credentials.
"shopping_site_admin": {"username": "admin", "password": "admin1234"},
"shopping_admin": {"username": "admin", "password": "admin1234"},
"gitlab": {"username": "byteblaze", "password": "hello1234"},
}

View File

@ -94,6 +94,7 @@ class ScriptBrowserEnv(Env[dict[str, Observation], Action]):
save_trace_enabled: bool = False, save_trace_enabled: bool = False,
sleep_after_execution: float = 0.0, sleep_after_execution: float = 0.0,
captioning_fn=None, captioning_fn=None,
proxy_url: str = "",
): ):
# TODO: make Space[Action] = ActionSpace # TODO: make Space[Action] = ActionSpace
self.action_space = get_action_space() # type: ignore[assignment] self.action_space = get_action_space() # type: ignore[assignment]
@ -104,6 +105,7 @@ class ScriptBrowserEnv(Env[dict[str, Observation], Action]):
self.viewport_size = viewport_size self.viewport_size = viewport_size
self.save_trace_enabled = save_trace_enabled self.save_trace_enabled = save_trace_enabled
self.sleep_after_execution = sleep_after_execution self.sleep_after_execution = sleep_after_execution
self.proxy_url = proxy_url
match observation_type: match observation_type:
case "html" | "accessibility_tree" | "accessibility_tree_with_captioner" | "webrl": case "html" | "accessibility_tree" | "accessibility_tree_with_captioner" | "webrl":
@ -187,6 +189,12 @@ class ScriptBrowserEnv(Env[dict[str, Observation], Action]):
storage_state=storage_state, storage_state=storage_state,
geolocation=geolocation, geolocation=geolocation,
device_scale_factor=1, device_scale_factor=1,
proxy={
"server": self.proxy_url,
"bypass": "127.0.0.1,localhost",
}
if self.proxy_url
else None,
) )
if self.save_trace_enabled: if self.save_trace_enabled:
self.context.tracing.start(screenshots=True, snapshots=True) self.context.tracing.start(screenshots=True, snapshots=True)

View File

@ -0,0 +1,192 @@
/**
* Go through all DOM elements in the frame (including shadowDOMs), give them unique browsergym
* identifiers (bid), and store custom data in the aria-roledescription attribute.
*/
// Viewport size and scroll offsets, read once when this script is evaluated.
// They are script-level `var`s, so the functions below read them as shared variables.
var { innerWidth: windowWidth, innerHeight: windowHeight } = window;
// Fall back to the root element's scroll position when window.scrollX / window.scrollY
// is falsy (undefined, or 0).
var scrollX = window.scrollX || document.documentElement.scrollLeft;
var scrollY = window.scrollY || document.documentElement.scrollTop;
// Entry point: tag every standard HTML element of this frame (shadow DOMs included)
// with a browsergym id and with position / visibility attributes.
//
// Takes ONE array argument:
//   parent_bid          - id prefix; "" means ids are the bare local counter value
//   bid_attr_name       - name of the attribute that receives the element id
//   iframe_position     - object with numeric `x` / `y`, added to every coordinate
//   super_iframe_offset - null/undefined, or an object with `right` / `bottom`
//                         (presumably the visible region of the enclosing frame —
//                         TODO confirm against the caller)
// Returns the visible-region object { x, y, right, bottom } computed for this frame.
([parent_bid, bid_attr_name, iframe_position, super_iframe_offset]) => {

    // standard html tags
    // https://www.w3schools.com/tags/
    // (kept in a Set so the per-element membership test is a constant-time lookup)
    const html_tags = new Set([
        "a", "abbr", "acronym", "address", "applet", "area", "article", "aside", "audio",
        "b", "base", "basefont", "bdi", "bdo", "big", "blockquote", "body", "br", "button",
        "canvas", "caption", "center", "cite", "code", "col", "colgroup", "data", "datalist",
        "dd", "del", "details", "dfn", "dialog", "dir", "div", "dl", "dt", "em", "embed",
        "fieldset", "figcaption", "figure", "font", "footer", "form", "frame", "frameset",
        "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", "i",
        "iframe", "img", "input", "ins", "kbd", "label", "legend", "li", "link", "main",
        "map", "mark", "menu", "meta", "meter", "nav", "noframes", "noscript", "object",
        "ol", "optgroup", "option", "output", "p", "param", "picture", "pre", "progress",
        "q", "rp", "rt", "ruby", "s", "samp", "script", "search", "section", "select",
        "small", "source", "span", "strike", "strong", "style", "sub", "summary", "sup",
        "svg", "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead",
        "time", "title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr"
    ]);

    // Region against which element centres are tested for "in viewport".
    // Declared locally: the previous undeclared assignments leaked globals
    // (and would throw a ReferenceError in strict mode).
    let iframe_offset;
    if (super_iframe_offset == null) {
        // no enclosing region given: use the window's own scroll offsets and size
        iframe_offset = { x: scrollX, y: scrollY, right: windowWidth, bottom: windowHeight };
    }
    else {
        // clip this frame's region by the enclosing one
        const super_right = super_iframe_offset["right"];
        const super_bottom = super_iframe_offset["bottom"];
        iframe_offset = {
            x: Math.max(-iframe_position.x, 0),
            y: Math.max(-iframe_position.y, 0),
            right: Math.min(super_right, windowWidth, super_right - iframe_position.x),
            bottom: Math.min(super_bottom, windowHeight, super_bottom - iframe_position.y)
        };
    }

    let browsergym_first_visit = false;
    // if not yet set, set the frame (local) element counter to 0
    if (!("browsergym_frame_elem_counter" in window)) {
        window.browsergym_frame_elem_counter = 0;
        browsergym_first_visit = true;
    }

    // get all DOM elements in the current frame (does not include elements in shadowDOMs)
    let elements = Array.from(document.querySelectorAll('*'));
    let i = 0;
    while (i < elements.length) {
        const elem = elements[i];
        // add shadowDOM elements to the elements array, in such a way that order is preserved
        // TODO: do we really need the order preserved?
        if (elem.shadowRoot !== null) {
            // concat() instead of `new Array(...spread)`: spreading a very large
            // element list into call arguments can exceed the engine's argument limit
            elements = elements.slice(0, i + 1).concat(
                Array.from(elem.shadowRoot.querySelectorAll("*")),
                elements.slice(i + 1)
            );
        }
        i++;

        // we will mark only standard HTML tags
        if (!elem.tagName || !html_tags.has(elem.tagName.toLowerCase())) {
            continue;  // stop and move on to the next element
        }

        // write dynamic element values to the DOM
        if (typeof elem.value !== 'undefined') {
            elem.setAttribute("value", elem.value);
        }
        // write dynamic checked properties to the DOM
        if (typeof elem.checked !== 'undefined') {
            if (elem.checked === true) {
                elem.setAttribute("checked", "");
            }
            else {
                elem.removeAttribute("checked");
            }
        }

        // add the element global id to a custom HTML attribute
        // https://playwright.dev/docs/locators#locate-by-test-id
        // recover the element id if it has one already, else compute a new element id
        let elem_global_bid;
        if (elem.hasAttribute(bid_attr_name)) {
            // throw an error if the attribute is already set while this is the first visit of the page
            if (browsergym_first_visit) {
                throw new Error(`Attribute ${bid_attr_name} already used in element ${elem.outerHTML}`);
            }
            elem_global_bid = elem.getAttribute(bid_attr_name);
        }
        else {
            const elem_local_id = window.browsergym_frame_elem_counter++;
            if (parent_bid == "") {
                elem_global_bid = `${elem_local_id}`;
            }
            else {
                elem_global_bid = `${parent_bid}-${elem_local_id}`;
            }
            elem.setAttribute(bid_attr_name, `${elem_global_bid}`);
        }

        // Hack: store custom data inside the aria-roledescription attribute (will be available in DOM and AXTree)
        // - elem_global_bid: global element identifier (unique over multiple frames)
        // TODO: add more data if needed (x, y coordinates, bounding box, is_visible, is_clickable etc.)
        const [rect, is_in_viewport] = getElementPositionInfo(elem, iframe_offset, iframe_position);
        // coordinates are shifted by iframe_position before being written out
        const left = (rect.left + iframe_position.x).toString();
        const top = (rect.top + iframe_position.y).toString();
        const right = (rect.right + iframe_position.x).toString();
        const bottom = (rect.bottom + iframe_position.y).toString();
        const center_x = ((rect.left + rect.right) / 2 + iframe_position.x).toString();
        const center_y = ((rect.top + rect.bottom) / 2 + iframe_position.y).toString();

        elem.setAttribute("browsergym_center", `(${center_x}, ${center_y})`);
        elem.setAttribute("browsergym_bounding_box", `(${left}, ${top}, ${right}, ${bottom})`);
        elem.setAttribute("browsergym_is_in_viewport", `${is_in_viewport}`);

        // keep any pre-existing aria-roledescription as the last "_"-separated field
        let original_content = "";
        if (elem.hasAttribute("aria-roledescription")) {
            original_content = elem.getAttribute("aria-roledescription");
        }
        const new_content = `${elem_global_bid}_${left}_${top}_${center_x}_${center_y}_${right}_${bottom}_${is_in_viewport}_${original_content}`;
        elem.setAttribute("aria-roledescription", new_content);
    }
    return iframe_offset;
}
/**
 * Return the client rectangle of `element` together with a 1/0 visibility flag.
 *
 * The flag is 1 only when all of the following hold:
 *   - the centre of the rectangle lies inside `iframe_offset` (x/y/right/bottom),
 *   - the element has non-zero offsetWidth and offsetHeight (elements that do not
 *     expose those properties are treated as having a size),
 *   - IsInFront(element) reports that the element is the topmost one at its centre.
 *
 * @param element          DOM element to inspect
 * @param iframe_offset    object with numeric x, y, right, bottom
 * @param iframe_position  kept for interface compatibility; not read here
 * @returns [rect, flag]   rect from getBoundingClientRect(), flag is 1 or 0
 */
function getElementPositionInfo(element, iframe_offset, iframe_position) {
    const rect = element.getBoundingClientRect();
    const x = (rect.left + rect.right) / 2;
    const y = (rect.top + rect.bottom) / 2;

    // NOTE: an earlier "not behind parent" check intersected the rectangles of all
    // ancestors, but its result was never used (it was flagged as broken). It cost
    // one getBoundingClientRect() per ancestor per element and wrote to an
    // undeclared global, so it is intentionally not computed here.

    const is_in_viewport = (
        x >= iframe_offset["x"] &&
        y >= iframe_offset["y"] &&
        x <= iframe_offset["right"] &&
        y <= iframe_offset["bottom"]
    );

    const isVisible = (typeof element.offsetWidth === 'undefined' || typeof element.offsetHeight === 'undefined') || (element.offsetWidth > 0 && element.offsetHeight > 0);

    // 1 only if the element is in the viewport, has non-zero dimensions and is in front
    return [rect, (is_in_viewport && isVisible && IsInFront(element)) ? 1 : 0];
}
/**
 * Tell whether `element` is the topmost element at the centre of its own
 * bounding rectangle, as reported by elementFromPoint().
 * Returns false when nothing is found at that point.
 */
function IsInFront(element) {
    const box = element.getBoundingClientRect();
    const midX = (box.left + box.right) / 2;
    const midY = (box.top + box.bottom) / 2;
    // element in the foreground at (midX, midY)
    const topmost = elementFromPoint(midX, midY);
    return Boolean(topmost) && topmost === element;
}
/**
 * Like document.elementFromPoint(x, y), but keeps descending into open shadow
 * roots: while the current hit has a shadowRoot that yields a different element
 * at the same point, that inner element becomes the current hit.
 * Returns whatever document.elementFromPoint returned when there is no deeper hit.
 */
function elementFromPoint(x, y) {
    let current = document.elementFromPoint(x, y);
    for (;;) {
        const inner = current?.shadowRoot?.elementFromPoint(x, y);
        if (!inner || inner === current) {
            // no deeper hit (or the shadow root hands back the host itself)
            return inner || current;
        }
        current = inner;
    }
}

View File

@ -0,0 +1,41 @@
/**
* Go through all DOM elements in the frame (including shadowDOMs),
* and cleanup previously stored data in the aria-roledescription attribute.
*/
() => {
    // Walk every element of the frame (shadow DOMs included) and strip the 8
    // "_"-separated data fields that the marking script prepended to
    // aria-roledescription, restoring (or removing) the original attribute.

    // get all DOM elements in the current frame (does not include elements in shadowDOMs)
    let elements = Array.from(document.querySelectorAll('*'));
    let i = 0;
    while (i < elements.length) {
        const elem = elements[i];
        // add shadowDOM elements to the elements array, in such a way that order is preserved
        // TODO: do we really need the order preserved?
        if (elem.shadowRoot !== null) {
            // concat() instead of `new Array(...spread)`: spreading a very large
            // element list into call arguments can exceed the engine's argument limit
            elements = elements.slice(0, i + 1).concat(
                Array.from(elem.shadowRoot.querySelectorAll("*")),
                elements.slice(i + 1)
            );
        }
        i++;

        // Hack: remove custom data stored inside the aria-roledescription tag
        // - elem_global_id: global browsergym identifier
        if (!elem.hasAttribute("aria-roledescription")) {
            continue;
        }
        const content = elem.getAttribute("aria-roledescription");
        // TODO: handle more data if needed
        const n_data_items = 8;  // bid, bbox_left, bbox_top, center_x, center_y, bbox_right, bbox_bottom, is_in_viewport
        let post_data_index = 0;
        let has_all_fields = true;
        for (let j = 0; j < n_data_items; j++) {
            const separator_index = content.indexOf("_", post_data_index);
            if (separator_index === -1) {
                // Fewer separators than the marker writes: this value was not
                // produced by the marking script. Without this guard the scan
                // wrapped back to index 0 and could cut off part of the value.
                has_all_fields = false;
                break;
            }
            post_data_index = separator_index + 1;
        }
        if (!has_all_fields) {
            continue;  // leave the attribute untouched
        }
        // declared locally (was an undeclared, leaked global)
        const original_content = content.substring(post_data_index);
        if (original_content) {
            elem.setAttribute("aria-roledescription", original_content);
        }
        else {
            elem.removeAttribute("aria-roledescription");
        }
    }
}

View File

@ -1114,7 +1114,7 @@ class ImageObservationProcessor(ObservationProcessor):
try: try:
browser_info = self.fetch_browser_info(page) browser_info = self.fetch_browser_info(page)
except Exception: except Exception:
page.wait_for_load_state("load", timeout=500) page.wait_for_load_state("load", timeout=30000) # 500->30000, modified by yuyr
browser_info = self.fetch_browser_info(page) browser_info = self.fetch_browser_info(page)
self.browser_config = browser_info["config"] self.browser_config = browser_info["config"]

View File

View File

@ -0,0 +1,6 @@
from typing import Union
from .actions import Action
from .utils import StateInfo
# A trajectory is a list whose items are each either a StateInfo or an Action.
Trajectory = list[Union[StateInfo, Action]]

View File

@ -0,0 +1,106 @@
import base64
from dataclasses import dataclass
from io import BytesIO
from typing import Any, Dict, TypedDict, Union
import numpy as np
import numpy.typing as npt
from beartype import beartype
from PIL import Image
# Optional dependency: the Vertex AI image type is only needed by pil_to_vertex().
# Catch Exception rather than using a bare `except:` so that KeyboardInterrupt and
# SystemExit are not swallowed, while any import-time failure of the package is
# still treated as "not set up".
try:
    from vertexai.preview.generative_models import Image as VertexImage
except Exception:
    print('Google Cloud not set up, skipping import of vertexai.preview.generative_models.Image')
@dataclass
class DetachedPage:
    # Plain record made of two strings; no browser object is stored on it.
    url: str  # address of the page
    content: str  # html
@beartype
def png_bytes_to_numpy(png: bytes) -> npt.NDArray[np.uint8]:
    """Decode PNG-encoded bytes into a numpy array.

    Args:
        png: raw bytes of an image file that PIL can open.

    Returns:
        The array produced by ``np.array`` on the opened PIL image.

    Example:
        >>> fig = go.Figure(go.Scatter(x=[1], y=[1]))
        >>> plt.imshow(png_bytes_to_numpy(fig.to_image('png')))
    """
    stream = BytesIO(png)
    decoded = Image.open(stream)
    return np.array(decoded)
def pil_to_b64(img: Image.Image) -> str:
    """Serialize ``img`` as PNG and return it as a ``data:image/png;base64,`` URI string.

    Args:
        img: PIL image to encode.

    Returns:
        The data-URI prefix followed by the base64 text of the PNG bytes.
    """
    with BytesIO() as png_stream:
        img.save(png_stream, format="PNG")
        encoded = base64.b64encode(png_stream.getvalue())
    return "data:image/png;base64," + encoded.decode("utf-8")
def pil_to_vertex(img: Image.Image) -> Any:
    """Serialize ``img`` as PNG and wrap the bytes in a Vertex AI image object.

    Args:
        img: PIL image to convert.

    Returns:
        The object returned by ``VertexImage.from_bytes`` for the PNG bytes
        (not a ``str``, which the previous annotation claimed).

    Raises:
        ImportError: if the optional ``vertexai`` import at module load was
            skipped, so ``VertexImage`` is not available.
    """
    # The module-level import of VertexImage is best-effort; look the name up
    # explicitly so a missing dependency gives a clear error instead of a NameError.
    vertex_image_cls = globals().get("VertexImage")
    if vertex_image_cls is None:
        raise ImportError(
            "vertexai.preview.generative_models.Image is not available; "
            "install and configure Google Cloud Vertex AI to use pil_to_vertex()"
        )
    with BytesIO() as image_buffer:
        img.save(image_buffer, format="PNG")
        byte_data = image_buffer.getvalue()
    return vertex_image_cls.from_bytes(byte_data)
class DOMNode(TypedDict):
    # Typed-dict schema for one DOM node record.
    # NOTE(review): the field names resemble a browser DOM snapshot — confirm
    # the exact source against the code that builds these dicts.
    nodeId: str
    nodeType: str
    nodeName: str
    nodeValue: str
    attributes: str
    backendNodeId: str
    parentId: str
    childIds: list[str]
    cursor: int
    union_bound: list[float] | None  # may be None
    center: list[float] | None  # may be None
class AccessibilityTreeNode(TypedDict):
    # Typed-dict schema for one accessibility-tree node record.
    # NOTE(review): role/name/properties are loosely typed dicts here; their
    # inner keys are not visible in this file — verify against the producer.
    nodeId: str
    ignored: bool
    role: dict[str, Any]
    chromeRole: dict[str, Any]
    name: dict[str, Any]
    properties: list[dict[str, Any]]
    childIds: list[str]
    parentId: str
    backendDOMNodeId: int
    frameId: str
    # The geometry fields below may each be None.
    bound: list[float] | None
    union_bound: list[float] | None
    offsetrect_bound: list[float] | None
    center: list[float] | None
class BrowserConfig(TypedDict):
    # Typed-dict schema holding window bounds/size and the device pixel ratio,
    # all as floats.
    # NOTE(review): units (CSS px vs device px) are not visible here — confirm.
    win_upper_bound: float
    win_left_bound: float
    win_width: float
    win_height: float
    win_right_bound: float
    win_lower_bound: float
    device_pixel_ratio: float
class BrowserInfo(TypedDict):
    # Typed-dict schema pairing a DOM tree payload with the browser configuration.
    DOMTree: dict[str, Any]  # loosely typed payload; keys are not specified here
    config: BrowserConfig
# Convenience aliases: a tree is represented as a flat list of node records.
AccessibilityTree = list[AccessibilityTreeNode]
DOMTree = list[DOMNode]
# An observation is either text or a uint8 numpy array.
Observation = str | npt.NDArray[np.uint8]
class StateInfo(TypedDict):
    # Typed-dict schema for one state entry: named observations plus free-form info.
    observation: dict[str, Observation]  # observation values keyed by string
    info: Dict[str, Any]  # arbitrary extra data; keys are not specified here

Binary file not shown.

After

Width:  |  Height:  |  Size: 328 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 42 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 150 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 238 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 194 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 180 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 150 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 225 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 145 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 229 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 161 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 139 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 251 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 136 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 119 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 191 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 170 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 153 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 103 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 172 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 83 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 208 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 109 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 133 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 680 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 125 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 147 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 150 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 122 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 119 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 147 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 118 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 114 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 96 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 119 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 92 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 127 KiB

Some files were not shown because too many files have changed in this diff Show More