增加环境变量指向g14部署webarena环境;

map网站使用socks5连接到官网openstreetmap;
模型改成gpt-4o; llm fuzzy match也使用4o;
增加generate_test_data.py脚本
This commit is contained in:
yuyr 2025-04-15 18:16:52 +08:00
parent c078ba6292
commit bb7393bcb3
11 changed files with 115 additions and 22 deletions

11
.gitignore vendored Normal file
View File

@ -0,0 +1,11 @@
config_files/tasks/
*.pyc
.auth/*
trash-*.txt
.env

View File

@ -1,7 +1,7 @@
logging: True logging: True
verbose: 1 verbose: 1
debug: False debug: False
logdir: "../AgentOccam-Trajectories" logdir: "../AgentOccam-Trajectories-shopping-admin"
logname: "AgentOccam" logname: "AgentOccam"
max_steps: 20 max_steps: 20
agent: agent:
@ -16,7 +16,7 @@ agent:
debug: 0 debug: 0
verbose: 1 verbose: 1
number: 1 number: 1
model: "gpt-4-turbo" model: "gpt-4o"
documented_interaction_elements: ["url", "plan", "reason", "observation summary", "retained element ids", "observation highlight"] documented_interaction_elements: ["url", "plan", "reason", "observation summary", "retained element ids", "observation highlight"]
online_interaction_elements: [] online_interaction_elements: []
input: ["step", "objective", "previous plans", "interaction history", "current observation"] input: ["step", "objective", "previous plans", "interaction history", "current observation"]
@ -35,7 +35,7 @@ agent:
mode: false mode: false
debug: 0 debug: 0
verbose: 1 verbose: 1
model: "gpt-4-turbo" model: "gpt-4o-mini"
documented_interaction_elements: [] documented_interaction_elements: []
online_interaction_elements: [] online_interaction_elements: []
character: "normal" character: "normal"
@ -52,7 +52,7 @@ agent:
mode: false mode: false
debug: 0 debug: 0
verbose: 1 verbose: 1
model: "gpt-4-turbo" model: "gpt-4o-mini"
documented_interaction_elements: [] documented_interaction_elements: []
online_interaction_elements: [] online_interaction_elements: []
strict: false strict: false
@ -70,7 +70,11 @@ env:
prune: true prune: true
max_browser_rows: 500 max_browser_rows: 500
headless: True headless: True
task_ids: ["Allrecipes--3", 65] proxy_url: "socks5://104.248.187.88:1001"
# task_ids: ["Allrecipes--3", 65]
# task_ids: [7]
task_ids: [0, 1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 41, 42, 43, 62, 63, 64, 65, 77, 78, 79, 94, 95, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 119, 120, 121, 122, 123, 127, 128, 129, 130, 131, 157, 183, 184, 185, 186, 187, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 243, 244, 245, 246, 247, 288, 289, 290, 291, 292, 344, 345, 346, 347, 348, 374, 375, 423, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 470, 471, 472, 473, 474, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 676, 677, 678, 679, 680, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 790]
# a. "SHOPPING_ADMIN": [0, 1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 41, 42, 43, 62, 63, 64, 65, 77, 78, 79, 94, 95, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 119, 120, 121, 122, 123, 127, 128, 129, 130, 131, 157, 183, 184, 185, 186, 187, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 243, 244, 245, 246, 247, 288, 289, 290, 291, 292, 344, 345, 346, 347, 348, 374, 375, 423, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 470, 471, 472, 473, 474, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 676, 677, 678, 679, 680, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 790] # a. "SHOPPING_ADMIN": [0, 1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 41, 42, 43, 62, 63, 64, 65, 77, 78, 79, 94, 95, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 119, 120, 121, 122, 123, 127, 128, 129, 130, 131, 157, 183, 184, 185, 186, 187, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 243, 244, 245, 246, 247, 288, 289, 290, 291, 292, 344, 345, 346, 347, 348, 374, 375, 423, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 470, 471, 472, 473, 474, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 676, 677, 678, 679, 680, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 790]
# b. "MAP": [7, 8, 9, 10, 16, 17, 18, 19, 20, 32, 33, 34, 35, 36, 37, 38, 39, 40, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 70, 71, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 97, 98, 99, 100, 101, 137, 138, 139, 140, 151, 152, 153, 154, 155, 218, 219, 220, 221, 222, 223, 224, 236, 237, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 265, 266, 267, 268, 287, 356, 363, 364, 365, 366, 367, 369, 370, 371, 372, 373, 377, 378, 379, 380, 381, 382, 383, 424, 425, 426, 427, 428, 429, 430, 737, 738, 739, 740, 741, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767] # b. "MAP": [7, 8, 9, 10, 16, 17, 18, 19, 20, 32, 33, 34, 35, 36, 37, 38, 39, 40, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 70, 71, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 97, 98, 99, 100, 101, 137, 138, 139, 140, 151, 152, 153, 154, 155, 218, 219, 220, 221, 222, 223, 224, 236, 237, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 265, 266, 267, 268, 287, 356, 363, 364, 365, 366, 367, 369, 370, 371, 372, 373, 377, 378, 379, 380, 381, 382, 383, 424, 425, 426, 427, 428, 429, 430, 737, 738, 739, 740, 741, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767]
# c. "SHOPPING": [21, 22, 23, 24, 25, 26, 47, 48, 49, 50, 51, 96, 117, 118, 124, 125, 126, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 188, 189, 190, 191, 192, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 238, 239, 240, 241, 242, 260, 261, 262, 263, 264, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 298, 299, 300, 301, 302, 313, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 351, 352, 353, 354, 355, 358, 359, 360, 361, 362, 368, 376, 384, 385, 386, 387, 388, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 465, 466, 467, 468, 469, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 528, 529, 530, 531, 532, 571, 572, 573, 574, 575, 585, 586, 587, 588, 589, 653, 654, 655, 656, 657, 671, 672, 673, 674, 675, 689, 690, 691, 692, 693, 792, 793, 794, 795, 796, 797, 798] # c. "SHOPPING": [21, 22, 23, 24, 25, 26, 47, 48, 49, 50, 51, 96, 117, 118, 124, 125, 126, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 188, 189, 190, 191, 192, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 238, 239, 240, 241, 242, 260, 261, 262, 263, 264, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 298, 299, 300, 301, 302, 313, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 351, 352, 353, 354, 355, 358, 359, 360, 361, 362, 368, 376, 384, 385, 386, 387, 388, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 465, 466, 467, 468, 469, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 528, 529, 530, 531, 532, 571, 572, 573, 574, 575, 585, 586, 587, 588, 589, 653, 654, 655, 656, 657, 671, 672, 673, 674, 675, 689, 690, 691, 692, 693, 792, 793, 794, 795, 796, 797, 798]

View File

@ -15,14 +15,15 @@ from AgentOccam.obs_opt import (
class WebArenaEnvironmentWrapper(): class WebArenaEnvironmentWrapper():
def __init__(self, config_file, max_browser_rows=300, max_steps=50, slow_mo=1, observation_type="accessibility_tree", current_viewport_only=False, viewport_size={"width": 1280, "height": 720}, headless=False, global_config=None): def __init__(self, config_file, max_browser_rows=300, max_steps=50, slow_mo=1, observation_type="accessibility_tree", current_viewport_only=False, viewport_size={"width": 1280, "height": 720}, headless=False, global_config=None, proxy_url=""):
self.webarena_env = ScriptBrowserEnv( self.webarena_env = ScriptBrowserEnv(
headless=headless, headless=headless,
slow_mo=slow_mo, slow_mo=slow_mo,
observation_type=observation_type, observation_type=observation_type,
current_viewport_only=current_viewport_only, current_viewport_only=current_viewport_only,
viewport_size=viewport_size, viewport_size=viewport_size,
global_config=global_config global_config=global_config,
proxy_url=proxy_url
) )
self.config_file = config_file self.config_file = config_file
with open(self.config_file, "r") as f: with open(self.config_file, "r") as f:

View File

@ -87,6 +87,7 @@ class ScriptBrowserEnv(Env[dict[str, Observation], Action]):
save_trace_enabled: bool = False, save_trace_enabled: bool = False,
sleep_after_execution: float = 5.0, sleep_after_execution: float = 5.0,
global_config = None, global_config = None,
proxy_url: str = "",
): ):
# TODO: make Space[Action] = ActionSpace # TODO: make Space[Action] = ActionSpace
self.action_space = get_action_space() # type: ignore[assignment] self.action_space = get_action_space() # type: ignore[assignment]
@ -98,6 +99,9 @@ class ScriptBrowserEnv(Env[dict[str, Observation], Action]):
self.save_trace_enabled = save_trace_enabled self.save_trace_enabled = save_trace_enabled
self.sleep_after_execution = sleep_after_execution self.sleep_after_execution = sleep_after_execution
self.global_config = global_config self.global_config = global_config
self.proxy_url = proxy_url
print(f"ScriptBrowserEnv proxy_url: {self.proxy_url}")
match observation_type: match observation_type:
case "html" | "accessibility_tree": case "html" | "accessibility_tree":
@ -151,6 +155,12 @@ class ScriptBrowserEnv(Env[dict[str, Observation], Action]):
storage_state=storage_state, storage_state=storage_state,
geolocation=geolocation, geolocation=geolocation,
device_scale_factor=1, device_scale_factor=1,
proxy={
"server": self.proxy_url,
"bypass": "127.0.0.1,localhost",
}
if self.proxy_url
else None,
) )
if self.save_trace_enabled: if self.save_trace_enabled:
self.context.tracing.start(screenshots=True, snapshots=True) self.context.tracing.start(screenshots=True, snapshots=True)
@ -165,7 +175,7 @@ class ScriptBrowserEnv(Env[dict[str, Observation], Action]):
if self.text_observation_type == "accessibility_tree": if self.text_observation_type == "accessibility_tree":
client.send("Accessibility.enable") client.send("Accessibility.enable")
page.client = client # type: ignore # TODO[shuyanzh], fix this hackey client page.client = client # type: ignore # TODO[shuyanzh], fix this hackey client
page.goto(url) page.goto(url, timeout=10000)
# set the first page as the current page # set the first page as the current page
self.page = self.context.pages[0] self.page = self.context.pages[0]
self.page.bring_to_front() self.page.bring_to_front()

14
env.sh Normal file
View File

@ -0,0 +1,14 @@
# WebArena endpoint and model-API settings; source this file before running
# the agent.
#
# The server address, API keys and API base URL below are defaults only: a
# variable that is already set and non-empty in the calling shell is kept, so
# sourcing this file never replaces a real key with the placeholder.

# Host that serves the self-hosted WebArena sites.
webarena_server_address="${webarena_server_address:-localhost}"

# Self-hosted sites, one port per site.
export SHOPPING="http://${webarena_server_address}:28082"
export SHOPPING_ADMIN="http://${webarena_server_address}:28083/admin"
export REDDIT="http://${webarena_server_address}:28080"
export GITLAB="http://${webarena_server_address}:28084"
# MAP points at the public OpenStreetMap site instead of a self-hosted port.
export MAP="https://www.openstreetmap.org"
export WIKIPEDIA="http://${webarena_server_address}:28081/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing"
export HOMEPAGE="http://${webarena_server_address}:20080"

# Model API settings. "sk-xxx" is a placeholder: export your own key before
# sourcing this file, or replace the default here.
export OPENAI_API_KEY="${OPENAI_API_KEY:-sk-xxx}"
export OPENAI_BASE_URL="${OPENAI_BASE_URL:-https://aiproxy.lmzgc.cn:8080/v1}"
# Optional, we provide several other agent base models, such as Claude and LLaMa.
export GEMINI_API_KEY="${GEMINI_API_KEY:-AIzaSyBwE1234567890}"

View File

@ -102,7 +102,8 @@ def run():
current_viewport_only=current_viewport_only, current_viewport_only=current_viewport_only,
viewport_size={"width": 1920, "height": 1080}, viewport_size={"width": 1920, "height": 1080},
headless=config.env.headless, headless=config.env.headless,
global_config=config) global_config=config,
proxy_url=config.env.proxy_url)
agent = agent_init() agent = agent_init()
objective = env.get_objective() objective = env.get_objective()

View File

@ -11,7 +11,7 @@ from typing import Any, Tuple, Union, Optional
from beartype import beartype from beartype import beartype
import nltk import nltk
nltk.download('punkt') # nltk.download('punkt') # NOTE: you need to download the punkt model first
from nltk.tokenize import word_tokenize # type: ignore from nltk.tokenize import word_tokenize # type: ignore
from playwright.sync_api import CDPSession, Page from playwright.sync_api import CDPSession, Page

View File

@ -158,7 +158,7 @@ def llm_fuzzy_match(pred: str, reference: str, question: str) -> float:
{"role": "user", "content": message}, {"role": "user", "content": message},
] ]
response = generate_from_openai_chat_completion( response = generate_from_openai_chat_completion(
model="gpt-4-1106-preview", model="gpt-4o",
messages=messages, messages=messages,
temperature=0, temperature=0,
max_tokens=768, max_tokens=768,
@ -193,7 +193,7 @@ def llm_ua_match(pred: str, reference: str, question: str) -> float:
] ]
response = generate_from_openai_chat_completion( response = generate_from_openai_chat_completion(
model="gpt-4-1106-preview", model="gpt-4o",
messages=messages, messages=messages,
temperature=0, temperature=0,
max_tokens=768, max_tokens=768,

18
play-AgentOccam.txt Normal file
View File

@ -0,0 +1,18 @@
STEP:
7
OBJECTIVE:
Add a new size XXS to blue and purple Nona Fitness Tank
PREVIOUS PLANS:
[0] (Active Plan) Find the solution to "Add a new size XXS to blue and purple Nona Fitness Tank"
OBSERVATION DESCRIPTION:
The current page is the 'Products / Inventory / Catalog' section of the Magento Admin Panel. It displays a list of products with various details such as Name, Type, SKU, and more. The Nona Fitness Tank is listed as a configurable product with an option to edit it. The page includes a search textbox and several buttons for actions like adding products, filters, and search. There is no direct option visible for adding new size attributes.
REASON:
To add the XXS size, it is necessary to create or enable it as an attribute. Since the XXS size was not available in the previous configuration steps, I need to investigate how to add or enable this size attribute. The logical next step is to explore the 'Attributes' section within the 'STORES' menu, as this is where product attributes are typically managed.
ACTION:
click [20159]

View File

@ -0,0 +1,27 @@
"""Replace the website placeholders with website domains from env_config
Generate the test data"""
import json
from browser_env.env_config import *
def main() -> None:
    """Fill in site placeholders in the raw task file and write the task configs.

    Reads ``config_files/test.raw.json``, replaces each ``__SITE__`` placeholder
    with the matching URL constant imported from ``browser_env.env_config``,
    writes the combined result to ``config_files/tasks/test.json``, and then
    writes every task to its own ``config_files/tasks/<index>.json`` file.

    Raises:
        FileNotFoundError: if ``config_files/test.raw.json`` does not exist.
        json.JSONDecodeError: if the substituted text is not valid JSON.
    """
    import os  # local import so the block stays self-contained

    with open("config_files/test.raw.json", "r") as f:
        raw = f.read()

    # Placeholder -> URL pairs, applied in the same order as before.
    substitutions = (
        ("__GITLAB__", GITLAB),
        ("__REDDIT__", REDDIT),
        ("__SHOPPING__", SHOPPING),
        ("__SHOPPING_ADMIN__", SHOPPING_ADMIN),
        ("__WIKIPEDIA__", WIKIPEDIA),
        ("__MAP__", MAP),
    )
    for placeholder, url in substitutions:
        raw = raw.replace(placeholder, url)

    # The output directory may not exist yet (e.g. on a fresh checkout);
    # create it so the writes below do not fail with FileNotFoundError.
    os.makedirs("config_files/tasks", exist_ok=True)

    with open("config_files/tasks/test.json", "w") as f:
        f.write(raw)

    # Split into one file per task, named by the task's position in the list.
    data = json.loads(raw)
    for idx, item in enumerate(data):
        with open(f"config_files/tasks/{idx}.json", "w") as f:
            json.dump(item, f, indent=2)


if __name__ == "__main__":
    main()

View File

@ -1,11 +1,18 @@
export SHOPPING="http://<webarena_server_address>:7770" webarena_server_address="localhost"
export SHOPPING_ADMIN="http://<webarena_server_address>:7780/admin"
export REDDIT="http://<webarena_server_address>:9999" export SHOPPING="http://${webarena_server_address}:28082"
export GITLAB="http://<webarena_server_address>:8023" export SHOPPING_ADMIN="http://${webarena_server_address}:28083/admin"
export MAP="http://<webarena_server_address>:3000" export REDDIT="http://${webarena_server_address}:28080"
export WIKIPEDIA="http://<webarena_server_address>:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing" export GITLAB="http://${webarena_server_address}:28084"
export HOMEPAGE="http://<webarena_server_address>:4399" export MAP="https://www.openstreetmap.org"
export OPENAI_API_KEY=<openai_api_key> export WIKIPEDIA="http://${webarena_server_address}:28081/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing"
conda activate webarena export HOMEPAGE="http://${webarena_server_address}:20080"
export OPENAI_API_KEY="sk-xxx" # 改成你的key
export OPENAI_BASE_URL="https://aiproxy.lmzgc.cn:8080/v1"
export GEMINI_API_KEY="AIzaSyBwE1234567890" # Optional, we provide several other agent base models, such as Claude and LLaMa.
python browser_env/auto_login.py python browser_env/auto_login.py
python eval_webarena.py --config AgentOccam/configs/AgentOccam.yml python eval_webarena.py --config AgentOccam/configs/AgentOccam.yml