add webrl chat mode
parent 521d7e999a, commit a1a6cbd209

13  VAB-WebArena-Lite/new/p_webrl_chat.json  Normal file
@@ -0,0 +1,13 @@
{
"intro": "# Setup\nYou are a professional web browsing agent assistant that can fulfill user's high-level instructions. Given Simplified html of the browsed webpage at each step, you plan operations in python-style pseudo code using provided functions, or customize functions (if necessary) and then provide their implementations. \n# More details about the code\nYour code should be readable, simple, and only **ONE-LINE-OF-CODE** at a time, avoid using loop statement and only use if-else control if necessary. Predefined functions are as follow:\n\n```\ndef do(action, argument, element):\n\t\"\"\"A single browsing operation on the webpage.\n\tArgs:\n\t\t:param action: one of the actions from [\"Click\", \"Right Click\", \"Type\", \"Search\", \"Hover\", \"Scroll Up\", \"Scroll Down\", \"Press Enter\", \"Switch Tab\", \"Select Dropdown Option\", \"Wait\"].\n\t\t:param argument: optional. Only for \"Type\", \"Search\", \"Switch Page\", and \"Select Dropdown Option\", indicating the content to type in, page number(start from 0) to switch, or key to press.\n\t\t \"Search\" action is equivalent to \"Type\" action plus \"Enter\" key press.\n\t\t:param element: optional. Only for \"Click\", \"Right Click\", \"Type\", \"Search\", \"Select Dropdown Option\", and \"Hover\". Should be specific element id in the html.\n\tReturns:\n\t\tNone. The webpage will be updated after executing the action.\n\t\"\"\"\n\ndef exit(message):\n\t\"\"\"Ending the browsing process if the assistant think it has fulfilled the goal.\n\tArgs:\n\t\t:param message: optional. If user's instruction is a question, return assistant's answer in the message based on the browsing content.\n\tReturns:\n\t\tNone.\n\t\"\"\"\n\ndef go_backward():\n\t\"\"\"Go back to the previous page.\n\t\"\"\"\n\ndef go_forward():\n \"\"\"Go forward to the next page.\n \"\"\"\n```\n\nHere are some examples:\n- # Element: the 'REPORTS' section on the left sidebar\ndo(action=\"Click\", element=\"7\")\n- # Element: the 'Period' dropdown, middle center\ndo(action=\"Select Dropdown Option\", argument=\"Month\", element=\"20\")\n- # Element: the 'From' date picker input field, middle center\ndo(action=\"Type\", argument=\"01/01/2023\", element=\"22\")\n- do(action=\"Scroll Down\")\n- exit(message=\"The top-3 best-selling products in January 2023 are: 1\")\n- # Element: The search bar\ndo(action=\"Search\", argument=\"international airport near Carnegie Mellon University within a driving distance of 50 km\", element=\"13\")\n- # Note: Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States\n# Element: The field labeled 'Pittsburgh International Airport' in the top left corner\ndo(action=\"Type\", argument=\"Cleveland Hopkins International Airport\", element=\"14\")\n\nREMEMBER: \n- only **ONE-LINE-OF-CODE** at a time\n- Don't generate an operation element that you do not see in the screenshot.\n- Use \"# Element\" to describe the element you choose in the html.\n- Use '# Note\" to record information useful to answer the instruction if needed.\n- If you find yourself fallen into some sort of loop, try to use another method or change your action.\n- If you think a page is still loading or still playing animation and you want to wait a while, use \"Wait\" action.\n- You are acting in a real world, try your best not to reject user's demand. 
Solve all the problem you encounter.\n- If you think you didn't get expected webpage, you should try using more precise and locative description of the element.\n- You must make sure the target element of `find_element*` exists on current screenshot, if not, you should navigate to the target place first.\n- You must identify potential errors or mistakes made by `find_element*` function and correct them. If the webpage is not as expected, you should try to re-do or un-do the operation.\n- You should **NEVER** try to use the browser's address bar at the top of the page to navigate.\n- Your answer shouldn't be in a code snippet format. Just write the function name and its arguments.\n- For quote, exit, go_backward, go_forward request, you should strictly obey the format of quote, exit, go_backward, go_forward functions, answers like do(\"Quote\", xxx, None) or do(\"quote\", xxx, None)are not allowed.\n- If you use do function to perform \"Click\", \"Right Click\", \"Type\", \"Search\", \"Select Dropdown Option\", and \"Hover\", the param element must not be None.\n",
    "examples": [],
    "template": "",
    "meta_data": {
        "observation": "webrl",
        "action_type": "webrl_id",
        "keywords": [],
        "prompt_constructor": "WebRLChatPromptConstructor",
        "answer_phrase": "",
        "action_splitter": ""
    }
}
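For reference, this prompt file is what the new chat-mode run consumes: the shell script below passes agent/prompts/jsons/p_webrl_chat.json as --instruction_path, and its "intro" string becomes the system turn built by WebRLChatPromptConstructor. A minimal sketch of loading and inspecting it (the path and field names come from this diff; everything else is illustrative):

```
import json

# Minimal sketch: inspect the chat-mode prompt config added in this commit.
with open("agent/prompts/jsons/p_webrl_chat.json") as fp:
    instruction = json.load(fp)

print(instruction["meta_data"]["prompt_constructor"])  # "WebRLChatPromptConstructor"
print(instruction["meta_data"]["action_type"])         # "webrl_id"
print(instruction["intro"][:60], "...")                # system prompt used as the first chat turn
```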
@@ -553,5 +553,65 @@ class WebRLPromptConstructor(PromptConstructor):
        return prompt

    def extract_action(self, response: str) -> str:
        return response


class WebRLChatPromptConstructor(PromptConstructor):
    """The agent will directly predict the action."""

    def __init__(
        self,
        instruction_path: str | Path,
        lm_config: lm_config.LMConfig,
        tokenizer: Tokenizer,
    ):
        super().__init__(instruction_path, lm_config, tokenizer)

    def construct(
        self,
        trajectory: Trajectory,
        intent: str,
        meta_data: dict[str, Any] = {},
    ) -> APIInput:
        """Construct the prompt given the trajectory."""
        state_info: StateInfo = trajectory[-1]  # type: ignore[assignment]

        obs = state_info["observation"][self.obs_modality]
        max_obs_length = self.lm_config.gen_config["max_obs_length"]
        if max_obs_length:
            if self.lm_config.provider == "google":
                print("NOTE: This is a Gemini model, so we use characters instead of tokens for max_obs_length.")
                obs = obs[:max_obs_length]
            else:
                try:
                    obs = self.tokenizer.decode(self.tokenizer.encode(obs)[:max_obs_length])  # type: ignore[arg-type]
                except:
                    print("NOTE: There is no available tokenizer, so we use characters instead of tokens for max_obs_length.")
                    obs = obs[:max_obs_length]

        turn_num = len(meta_data["action_history"])
        if turn_num == 1:
            previous_action_str = []
        else:
            previous_action_str = meta_data["action_history"][1:]

        index = turn_num - 1
        conversations = []
        for i in range(index - 1, -1, -1):
            if i == 0:
                content_user = f"Task Instruction: {intent}\n\nRound {i}\n{intent}"
                content_assistant = f"{previous_action_str[i]}"
            else:
                content_user = f"Round {i}\n** Simplified html **"
                content_assistant = f"{previous_action_str[i]}"
            conversation = [{'role': 'user', 'content': content_user}, {'role': 'assistant', 'content': content_assistant}]
            conversations = conversation + conversations

        system_turn = [{'role': 'system', 'content': self.instruction['intro']}]
        current_turn = [{'role': 'user', 'content': f'Round {index}\n\n{obs}'}]
        conversations = system_turn + conversations + current_turn

        return conversations

    def extract_action(self, response: str) -> str:
        return response
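To make the message layout concrete, here is a hypothetical illustration of what construct() returns for a task that has already taken two actions. The role/content structure mirrors the code above (system turn from the JSON "intro", one user/assistant pair per past round, then the current observation; the leading "None" entry of action_history is skipped). The intent, actions, and html are invented placeholders:

```
# Hypothetical example of the conversation list built by
# WebRLChatPromptConstructor.construct(); all concrete values are placeholders.
messages = [
    {"role": "system",    "content": "<'intro' string from p_webrl_chat.json>"},
    {"role": "user",      "content": "Task Instruction: <intent>\n\nRound 0\n<intent>"},
    {"role": "assistant", "content": 'do(action="Click", element="7")'},
    {"role": "user",      "content": "Round 1\n** Simplified html **"},
    {"role": "assistant", "content": 'do(action="Scroll Down")'},
    {"role": "user",      "content": "Round 2\n\n<current simplified html observation>"},
]
```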
@@ -84,3 +84,28 @@ for src in file_list:
if len(file_list) > 1:
    get_result(all_result)
    export_result(all_result, show_all=True)

with open('./config_files/wa/test_webarena_lite.raw.json') as fp:
    configs = json.load(fp)
sub_results = {}
sub_ids = {}
for item in configs:
    web = tuple(item['sites'])
    task_id = int(item['task_id'])
    old_task_id = int(item['old_task_id'])
    if web not in sub_results:
        sub_results[web] = []
    if web not in sub_ids:
        sub_ids[web] = []
    if task_id in all_result:
        sub_results[web].append(all_result[task_id])
        if all_result[task_id] == 1:
            sub_ids[web].append(old_task_id)
    else:
        sub_results[web].append(0)
for web in sub_results:
    print(web, round(sum(sub_results[web]) / len(sub_results[web]) * 100, 1))

print('\n\n')
for web in sub_ids:
    print(web, sorted(sub_ids[web]), len(sub_ids[web]))
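The block added above breaks the overall score down per site: a success rate for each site tuple plus the list of old_task_ids that succeeded. A toy re-statement of the same aggregation (using dict.setdefault for brevity) shows what the printed breakdown looks like; the config entries and scores below are invented:

```
# Toy data for the per-site breakdown; all values are invented for illustration.
configs = [
    {"sites": ["shopping"], "task_id": 0, "old_task_id": 21},
    {"sites": ["shopping"], "task_id": 1, "old_task_id": 43},
    {"sites": ["gitlab"],   "task_id": 2, "old_task_id": 7},
]
all_result = {0: 1, 1: 0, 2: 1}

sub_results, sub_ids = {}, {}
for item in configs:
    web = tuple(item["sites"])
    score = all_result.get(int(item["task_id"]), 0)
    sub_results.setdefault(web, []).append(score)
    if score == 1:
        sub_ids.setdefault(web, []).append(int(item["old_task_id"]))

for web in sub_results:
    print(web, round(sum(sub_results[web]) / len(sub_results[web]) * 100, 1))
# ('shopping',) 50.0
# ('gitlab',) 100.0
```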
97  VAB-WebArena-Lite/new/wa_parallel_run_webrl_chat.sh  Normal file
@@ -0,0 +1,97 @@
#!/bin/bash
DATASET='webarena' # TODO: select from ['webarena', 'visualwebarena']
result_dir='' # TODO: set your result_dir
provider='openai' # TODO: select from ['openai', 'finetune', ...]
model='' # TODO: assign model name, which is used for action generation
planner_ip='' # TODO: ip address of the model you are deploying (only if you are deploying your own model using e.g. vllm)
instruction_path='agent/prompts/jsons/p_webrl_chat.json' # e.g., agent/prompts/jsons/p_cot_id_actree_2s.json
test_config_base_dir='config_files/wa/test_webarena_lite' # e.g., config_files/wa/test_webarena_lite
temperature=0.0

SERVER='' # TODO: your server address
MAP_SERVER='' # TODO: the server address for MAP tasks
OPENAI_API_KEY='' # TODO: if you test OpenAI APIs
OPENAI_ORGANIZATION=''
CONDA_ENV_NAME='' # TODO: the name of your conda environment for testing WebArena

ENV_VARIABLES="export DATASET=${DATASET}; export SHOPPING='http://${SERVER}:7770';export SHOPPING_ADMIN='http://${SERVER}:7780/admin';export REDDIT='http://${SERVER}:9999';export GITLAB='http://${SERVER}:8023';export MAP='http://${MAP_SERVER}:3000';export WIKIPEDIA='http://${SERVER}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing';export HOMEPAGE='http://${SERVER}:4399';export OPENAI_API_KEY=${OPENAI_API_KEY};export OPENAI_ORGANIZATION=${OPENAI_ORGANIZATION}"

# get the number of tmux panes
num_panes=$(tmux list-panes | wc -l)

# calculate how many panes need to be created
let "panes_to_create = 7 - num_panes"

# array of tmux commands to create each pane
tmux_commands=(
    'tmux split-window -h'
    'tmux split-window -v'
    'tmux select-pane -t 0; tmux split-window -v'
    'tmux split-window -v'
    'tmux select-pane -t 3; tmux split-window -v'
    'tmux select-pane -t 5; tmux split-window -v'
)

# create panes up to 7
for ((i=0; i<$panes_to_create; i++)); do
    eval ${tmux_commands[$i]}
done

#!/bin/bash

# Function to run a job
run_job() {
    tmux select-pane -t $1
    COMMAND="python run.py \
        --instruction_path ${instruction_path} \
        --test_start_idx $2 \
        --test_end_idx $3 \
        --result_dir ${result_dir} \
        --test_config_base_dir ${test_config_base_dir} \
        --provider ${provider} \
        --model ${model} \
        --mode chat \
        --planner_ip ${planner_ip} \
        --stop_token \"<|eot_id|>\" \
        --temperature ${temperature} \
        --max_obs_length 0 \
        --max_tokens 2048 \
        --viewport_width 1280 \
        --viewport_height 720 \
        --parsing_failure_th 5 \
        --repeating_action_failure_th 5 \
        --action_set_tag webrl_id --observation_type webrl"
    tmux send-keys "tmux set mouse on; conda activate ${CONDA_ENV_NAME}; ${ENV_VARIABLES}; until ${COMMAND}; do echo 'crashed' >&2; sleep 1; done" C-m
    sleep 3
}

TOLERANCE=2
run_batch() {
    args=("$@") # save all arguments in an array
    num_jobs=${#args[@]} # get number of arguments

    for ((i=1; i<$num_jobs; i++)); do
        run_job $i ${args[i-1]} ${args[i]}
    done

    # Wait for all jobs to finish
    while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do
        sleep 100 # wait for 100 seconds before checking again
    done

    # Run checker
    while ! python scripts/check_error_runs.py ${result_dir} --delete_errors --tolerance ${TOLERANCE}; do
        echo "Check failed, rerunning jobs..."
        for ((i=1; i<$num_jobs; i++)); do
            run_job $i ${args[i-1]} ${args[i]}
        done

        # Wait for all jobs to finish
        while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do
            sleep 100 # wait for 100 seconds before checking again
        done
    done

}
run_batch 0 28 56 84 112 140 165
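The boundary list passed to run_batch splits the 165 WebArena-Lite test configs into six contiguous index ranges, one job per worker pane. A quick sketch of that arithmetic (plain Python, only to illustrate how the arguments pair up):

```
# Each adjacent pair of run_batch arguments becomes one (test_start_idx, test_end_idx)
# job launched by run_job in its own tmux pane.
bounds = [0, 28, 56, 84, 112, 140, 165]
jobs = list(zip(bounds[:-1], bounds[1:]))
print(jobs)       # [(0, 28), (28, 56), (56, 84), (84, 112), (112, 140), (140, 165)]
print(len(jobs))  # 6 parallel jobs covering test indices 0..164
```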
@@ -15,6 +15,7 @@ cp -f new/run.py visualwebarena/run.py
cp -f new/agent.py visualwebarena/agent/agent.py
cp -f new/prompt_constructor.py visualwebarena/agent/prompts/prompt_constructor.py
cp -f new/p_webrl.json visualwebarena/agent/prompts/jsons/p_webrl.json
cp -f new/p_webrl_chat.json visualwebarena/agent/prompts/jsons/p_webrl_chat.json

# browser_env
cp -f new/actions.py visualwebarena/browser_env/actions.py
@@ -42,6 +43,7 @@ cp -f new/wa_parallel_run.sh visualwebarena/wa_parallel_run.sh

cp -f new/score.py visualwebarena/score.py
cp -f new/wa_parallel_run_webrl.sh visualwebarena/wa_parallel_run_webrl.sh
cp -f new/wa_parallel_run_webrl_chat.sh visualwebarena/wa_parallel_run_webrl_chat.sh

# 3. remove temporary files
mv visualwebarena/* .