add webrl chat mode
parent 521d7e999a, commit a1a6cbd209

13  VAB-WebArena-Lite/new/p_webrl_chat.json  Normal file
@@ -0,0 +1,13 @@
{
"intro": "# Setup\nYou are a professional web browsing agent assistant that can fulfill user's high-level instructions. Given Simplified html of the browsed webpage at each step, you plan operations in python-style pseudo code using provided functions, or customize functions (if necessary) and then provide their implementations. \n# More details about the code\nYour code should be readable, simple, and only **ONE-LINE-OF-CODE** at a time, avoid using loop statement and only use if-else control if necessary. Predefined functions are as follow:\n\n```\ndef do(action, argument, element):\n\t\"\"\"A single browsing operation on the webpage.\n\tArgs:\n\t\t:param action: one of the actions from [\"Click\", \"Right Click\", \"Type\", \"Search\", \"Hover\", \"Scroll Up\", \"Scroll Down\", \"Press Enter\", \"Switch Tab\", \"Select Dropdown Option\", \"Wait\"].\n\t\t:param argument: optional. Only for \"Type\", \"Search\", \"Switch Page\", and \"Select Dropdown Option\", indicating the content to type in, page number(start from 0) to switch, or key to press.\n\t\t \"Search\" action is equivalent to \"Type\" action plus \"Enter\" key press.\n\t\t:param element: optional. Only for \"Click\", \"Right Click\", \"Type\", \"Search\", \"Select Dropdown Option\", and \"Hover\". Should be specific element id in the html.\n\tReturns:\n\t\tNone. The webpage will be updated after executing the action.\n\t\"\"\"\n\ndef exit(message):\n\t\"\"\"Ending the browsing process if the assistant think it has fulfilled the goal.\n\tArgs:\n\t\t:param message: optional. If user's instruction is a question, return assistant's answer in the message based on the browsing content.\n\tReturns:\n\t\tNone.\n\t\"\"\"\n\ndef go_backward():\n\t\"\"\"Go back to the previous page.\n\t\"\"\"\n\ndef go_forward():\n \"\"\"Go forward to the next page.\n \"\"\"\n```\n\nHere are some examples:\n- # Element: the 'REPORTS' section on the left sidebar\ndo(action=\"Click\", element=\"7\")\n- # Element: the 'Period' dropdown, middle center\ndo(action=\"Select Dropdown Option\", argument=\"Month\", element=\"20\")\n- # Element: the 'From' date picker input field, middle center\ndo(action=\"Type\", argument=\"01/01/2023\", element=\"22\")\n- do(action=\"Scroll Down\")\n- exit(message=\"The top-3 best-selling products in January 2023 are: 1\")\n- # Element: The search bar\ndo(action=\"Search\", argument=\"international airport near Carnegie Mellon University within a driving distance of 50 km\", element=\"13\")\n- # Note: Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States\n# Element: The field labeled 'Pittsburgh International Airport' in the top left corner\ndo(action=\"Type\", argument=\"Cleveland Hopkins International Airport\", element=\"14\")\n\nREMEMBER: \n- only **ONE-LINE-OF-CODE** at a time\n- Don't generate an operation element that you do not see in the screenshot.\n- Use \"# Element\" to describe the element you choose in the html.\n- Use '# Note\" to record information useful to answer the instruction if needed.\n- If you find yourself fallen into some sort of loop, try to use another method or change your action.\n- If you think a page is still loading or still playing animation and you want to wait a while, use \"Wait\" action.\n- You are acting in a real world, try your best not to reject user's demand. 
Solve all the problem you encounter.\n- If you think you didn't get expected webpage, you should try using more precise and locative description of the element.\n- You must make sure the target element of `find_element*` exists on current screenshot, if not, you should navigate to the target place first.\n- You must identify potential errors or mistakes made by `find_element*` function and correct them. If the webpage is not as expected, you should try to re-do or un-do the operation.\n- You should **NEVER** try to use the browser's address bar at the top of the page to navigate.\n- Your answer shouldn't be in a code snippet format. Just write the function name and its arguments.\n- For quote, exit, go_backward, go_forward request, you should strictly obey the format of quote, exit, go_backward, go_forward functions, answers like do(\"Quote\", xxx, None) or do(\"quote\", xxx, None)are not allowed.\n- If you use do function to perform \"Click\", \"Right Click\", \"Type\", \"Search\", \"Select Dropdown Option\", and \"Hover\", the param element must not be None.\n",
    "examples": [],
    "template": "",
    "meta_data": {
        "observation": "webrl",
        "action_type": "webrl_id",
        "keywords": [],
        "prompt_constructor": "WebRLChatPromptConstructor",
        "answer_phrase": "",
        "action_splitter": ""
    }
}
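For reference, this prompt file is what the new chat-mode run consumes: the shell script below passes agent/prompts/jsons/p_webrl_chat.json as --instruction_path, and its "intro" string becomes the system turn built by WebRLChatPromptConstructor. A minimal sketch of loading and inspecting it (the path and field names come from this diff; everything else is illustrative):

```
import json

# Minimal sketch: inspect the chat-mode prompt config added in this commit.
with open("agent/prompts/jsons/p_webrl_chat.json") as fp:
    instruction = json.load(fp)

print(instruction["meta_data"]["prompt_constructor"])  # "WebRLChatPromptConstructor"
print(instruction["meta_data"]["action_type"])         # "webrl_id"
print(instruction["intro"][:60], "...")                # system prompt used as the first chat turn
```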
@@ -553,5 +553,65 @@ class WebRLPromptConstructor(PromptConstructor):
        return prompt

    def extract_action(self, response: str) -> str:
        return response


class WebRLChatPromptConstructor(PromptConstructor):
    """The agent will directly predict the action."""

    def __init__(
        self,
        instruction_path: str | Path,
        lm_config: lm_config.LMConfig,
        tokenizer: Tokenizer,
    ):
        super().__init__(instruction_path, lm_config, tokenizer)

    def construct(
        self,
        trajectory: Trajectory,
        intent: str,
        meta_data: dict[str, Any] = {},
    ) -> APIInput:
        """Construct the prompt given the trajectory."""
        state_info: StateInfo = trajectory[-1]  # type: ignore[assignment]

        obs = state_info["observation"][self.obs_modality]
        max_obs_length = self.lm_config.gen_config["max_obs_length"]
        if max_obs_length:
            if self.lm_config.provider == "google":
                print("NOTE: This is a Gemini model, so we use characters instead of tokens for max_obs_length.")
                obs = obs[:max_obs_length]
            else:
                try:
                    obs = self.tokenizer.decode(self.tokenizer.encode(obs)[:max_obs_length])  # type: ignore[arg-type]
                except:
                    print("NOTE: There is no available tokenizer, so we use characters instead of tokens for max_obs_length.")
                    obs = obs[:max_obs_length]

        turn_num = len(meta_data["action_history"])
        if turn_num == 1:
            previous_action_str = []
        else:
            previous_action_str = meta_data["action_history"][1:]

        index = turn_num - 1
        conversations = []
        for i in range(index - 1, -1, -1):
            if i == 0:
                content_user = f"Task Instruction: {intent}\n\nRound {i}\n{intent}"
                content_assistant = f"{previous_action_str[i]}"
            else:
                content_user = f"Round {i}\n** Simplified html **"
                content_assistant = f"{previous_action_str[i]}"
            conversation = [{'role': 'user', 'content': content_user}, {'role': 'assistant', 'content': content_assistant}]
            conversations = conversation + conversations

        system_turn = [{'role': 'system', 'content': self.instruction['intro']}]
        current_turn = [{'role': 'user', 'content': f'Round {index}\n\n{obs}'}]
        conversations = system_turn + conversations + current_turn

        return conversations

    def extract_action(self, response: str) -> str:
        return response
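To make the message layout concrete, here is a hypothetical illustration of what construct() returns for a task that has already taken two actions. The role/content structure mirrors the code above (system turn from the JSON "intro", one user/assistant pair per past round, then the current observation; the leading "None" entry of action_history is skipped). The intent, actions, and html are invented placeholders:

```
# Hypothetical example of the conversation list built by
# WebRLChatPromptConstructor.construct(); all concrete values are placeholders.
messages = [
    {"role": "system",    "content": "<'intro' string from p_webrl_chat.json>"},
    {"role": "user",      "content": "Task Instruction: <intent>\n\nRound 0\n<intent>"},
    {"role": "assistant", "content": 'do(action="Click", element="7")'},
    {"role": "user",      "content": "Round 1\n** Simplified html **"},
    {"role": "assistant", "content": 'do(action="Scroll Down")'},
    {"role": "user",      "content": "Round 2\n\n<current simplified html observation>"},
]
```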
@@ -84,3 +84,28 @@ for src in file_list:
if len(file_list) > 1:
    get_result(all_result)
    export_result(all_result, show_all=True)

with open('./config_files/wa/test_webarena_lite.raw.json') as fp:
    configs = json.load(fp)
sub_results = {}
sub_ids = {}
for item in configs:
    web = tuple(item['sites'])
    task_id = int(item['task_id'])
    old_task_id = int(item['old_task_id'])
    if web not in sub_results:
        sub_results[web] = []
    if web not in sub_ids:
        sub_ids[web] = []
    if task_id in all_result:
        sub_results[web].append(all_result[task_id])
        if all_result[task_id] == 1:
            sub_ids[web].append(old_task_id)
    else:
        sub_results[web].append(0)
for web in sub_results:
    print(web, round(sum(sub_results[web]) / len(sub_results[web]) * 100, 1))

print('\n\n')
for web in sub_ids:
    print(web, sorted(sub_ids[web]), len(sub_ids[web]))
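The block added above breaks the overall score down per site: a success rate for each site tuple plus the list of old_task_ids that succeeded. A toy re-statement of the same aggregation (using dict.setdefault for brevity) shows what the printed breakdown looks like; the config entries and scores below are invented:

```
# Toy data for the per-site breakdown; all values are invented for illustration.
configs = [
    {"sites": ["shopping"], "task_id": 0, "old_task_id": 21},
    {"sites": ["shopping"], "task_id": 1, "old_task_id": 43},
    {"sites": ["gitlab"],   "task_id": 2, "old_task_id": 7},
]
all_result = {0: 1, 1: 0, 2: 1}

sub_results, sub_ids = {}, {}
for item in configs:
    web = tuple(item["sites"])
    score = all_result.get(int(item["task_id"]), 0)
    sub_results.setdefault(web, []).append(score)
    if score == 1:
        sub_ids.setdefault(web, []).append(int(item["old_task_id"]))

for web in sub_results:
    print(web, round(sum(sub_results[web]) / len(sub_results[web]) * 100, 1))
# ('shopping',) 50.0
# ('gitlab',) 100.0
```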
97  VAB-WebArena-Lite/new/wa_parallel_run_webrl_chat.sh  Normal file
@@ -0,0 +1,97 @@
#!/bin/bash
DATASET='webarena' # TODO: select from ['webarena', 'visualwebarena']
result_dir='' # TODO: set your result_dir
provider='openai' # TODO: select from ['openai', 'finetune', ...]
model='' # TODO: assign model name, which is used for action generation
planner_ip='' # TODO: ip address of the model you are deploying (only if you are deploying your own model using e.g. vllm)
instruction_path='agent/prompts/jsons/p_webrl_chat.json' # e.g., agent/prompts/jsons/p_cot_id_actree_2s.json
test_config_base_dir='config_files/wa/test_webarena_lite' # e.g., config_files/wa/test_webarena_lite
temperature=0.0

SERVER='' # TODO: your server address
MAP_SERVER='' # TODO: the server address for MAP tasks
OPENAI_API_KEY='' # TODO: if you test OpenAI APIs
OPENAI_ORGANIZATION=''
CONDA_ENV_NAME='' # TODO: the name of your conda environment for testing WebArena

ENV_VARIABLES="export DATASET=${DATASET}; export SHOPPING='http://${SERVER}:7770';export SHOPPING_ADMIN='http://${SERVER}:7780/admin';export REDDIT='http://${SERVER}:9999';export GITLAB='http://${SERVER}:8023';export MAP='http://${MAP_SERVER}:3000';export WIKIPEDIA='http://${SERVER}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing';export HOMEPAGE='http://${SERVER}:4399';export OPENAI_API_KEY=${OPENAI_API_KEY};export OPENAI_ORGANIZATION=${OPENAI_ORGANIZATION}"

# get the number of tmux panes
num_panes=$(tmux list-panes | wc -l)

# calculate how many panes need to be created
let "panes_to_create = 7 - num_panes"

# array of tmux commands to create each pane
tmux_commands=(
    'tmux split-window -h'
    'tmux split-window -v'
    'tmux select-pane -t 0; tmux split-window -v'
    'tmux split-window -v'
    'tmux select-pane -t 3; tmux split-window -v'
    'tmux select-pane -t 5; tmux split-window -v'
)

# create panes up to 7
for ((i=0; i<$panes_to_create; i++)); do
    eval ${tmux_commands[$i]}
done

#!/bin/bash

# Function to run a job
run_job() {
    tmux select-pane -t $1
    COMMAND="python run.py \
        --instruction_path ${instruction_path} \
        --test_start_idx $2 \
        --test_end_idx $3 \
        --result_dir ${result_dir} \
        --test_config_base_dir ${test_config_base_dir} \
        --provider ${provider} \
        --model ${model} \
        --mode chat \
        --planner_ip ${planner_ip} \
        --stop_token \"<|eot_id|>\" \
        --temperature ${temperature} \
        --max_obs_length 0 \
        --max_tokens 2048 \
        --viewport_width 1280 \
        --viewport_height 720 \
        --parsing_failure_th 5 \
        --repeating_action_failure_th 5 \
        --action_set_tag webrl_id --observation_type webrl"
    tmux send-keys "tmux set mouse on; conda activate ${CONDA_ENV_NAME}; ${ENV_VARIABLES}; until ${COMMAND}; do echo 'crashed' >&2; sleep 1; done" C-m
    sleep 3
}

TOLERANCE=2
run_batch() {
    args=("$@") # save all arguments in an array
    num_jobs=${#args[@]} # get number of arguments

    for ((i=1; i<$num_jobs; i++)); do
        run_job $i ${args[i-1]} ${args[i]}
    done

    # Wait for all jobs to finish
    while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do
        sleep 100 # wait for 100 seconds before checking again
    done

    # Run checker
    while ! python scripts/check_error_runs.py ${result_dir} --delete_errors --tolerance ${TOLERANCE}; do
        echo "Check failed, rerunning jobs..."
        for ((i=1; i<$num_jobs; i++)); do
            run_job $i ${args[i-1]} ${args[i]}
        done

        # Wait for all jobs to finish
        while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do
            sleep 100 # wait for 100 seconds before checking again
        done
    done

}
run_batch 0 28 56 84 112 140 165
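The boundary list passed to run_batch splits the 165 WebArena-Lite test configs into six contiguous index ranges, one job per worker pane. A quick sketch of that arithmetic (plain Python, only to illustrate how the arguments pair up):

```
# Each adjacent pair of run_batch arguments becomes one (test_start_idx, test_end_idx)
# job launched by run_job in its own tmux pane.
bounds = [0, 28, 56, 84, 112, 140, 165]
jobs = list(zip(bounds[:-1], bounds[1:]))
print(jobs)       # [(0, 28), (28, 56), (56, 84), (84, 112), (112, 140), (140, 165)]
print(len(jobs))  # 6 parallel jobs covering test indices 0..164
```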
@@ -15,6 +15,7 @@ cp -f new/run.py visualwebarena/run.py
cp -f new/agent.py visualwebarena/agent/agent.py
cp -f new/prompt_constructor.py visualwebarena/agent/prompts/prompt_constructor.py
cp -f new/p_webrl.json visualwebarena/agent/prompts/jsons/p_webrl.json
cp -f new/p_webrl_chat.json visualwebarena/agent/prompts/jsons/p_webrl_chat.json

# browser_env
cp -f new/actions.py visualwebarena/browser_env/actions.py
@@ -42,6 +43,7 @@ cp -f new/wa_parallel_run.sh visualwebarena/wa_parallel_run.sh

cp -f new/score.py visualwebarena/score.py
cp -f new/wa_parallel_run_webrl.sh visualwebarena/wa_parallel_run_webrl.sh
cp -f new/wa_parallel_run_webrl_chat.sh visualwebarena/wa_parallel_run_webrl_chat.sh

# 3. remove temporary files
mv visualwebarena/* .