From e50dd34bee78d9861b9886b50c75f7721d769973 Mon Sep 17 00:00:00 2001 From: yuyr Date: Sun, 27 Apr 2025 18:04:39 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=B5=8B=E8=AF=95webrl?= =?UTF-8?q?=E5=92=8Corm=E7=A8=8B=E5=BA=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- VAB-WebArena-Lite/README_yuyr.md | 48 +++++ VAB-WebArena-Lite/export.txt | 165 ++++++++++++++++++ VAB-WebArena-Lite/score.py | 50 +++++- .../wa_parallel_run_webrl_chat.sh | 13 +- .../wa_parallel_run_webrl_completion.sh | 104 +++++++++++ analyze_case_time.py | 128 ++++++++++++++ 6 files changed, 495 insertions(+), 13 deletions(-) create mode 100644 VAB-WebArena-Lite/README_yuyr.md create mode 100644 VAB-WebArena-Lite/export.txt create mode 100644 VAB-WebArena-Lite/wa_parallel_run_webrl_completion.sh create mode 100644 analyze_case_time.py diff --git a/VAB-WebArena-Lite/README_yuyr.md b/VAB-WebArena-Lite/README_yuyr.md new file mode 100644 index 0000000..5c824ef --- /dev/null +++ b/VAB-WebArena-Lite/README_yuyr.md @@ -0,0 +1,48 @@ + +# evaluation +1. 在lm2上部署vllm,加载webrl-llama-3.1-8b模型 +```bash +MODEL_PATH=/data1/yuyr/webrl-llama-3.1-8b + +export HF_ENDPOINT=https://hf-mirror.com + +PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server --served-model-name webrl-llama-3.1-8b \ + --model $MODEL_PATH \ + --gpu-memory-utilization 0.9 \ + --max-num-seqs 32 \ + --dtype half \ + --port 18080 +``` +2. g14上已经部署好shopping/shopping_admin/gitlab/reddit四个网站,map使用官网(使用socks5代理); +3. 运行`wa_parallel_run_webrl_completion.sh`,注意!一定是这个completion,不要chat,否则webrl模型SR会降低很多。 +```bash +# 需要先建好vab conda环境 +tmux new -t webrl + +bash wa_parallel_run_webrl_completion.sh +``` + +# orm 测试 +1. 
lm2上部署vllm,加载webrl-orm-llama-3.1-8b模型 +```bash +MODEL_PATH=/data1/yuyr/webrl-orm-llama-3.1-8b + +export HF_ENDPOINT=https://hf-mirror.com + +PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server --served-model-name webrl-orm-llama-3.1-8b \ + --model $MODEL_PATH \ + --gpu-memory-utilization 0.9 \ + --max-num-seqs 32 \ + --dtype half \ + --port 18081 +``` +2. 执行分析脚本 +```bash + +cd result + +python orm_test.py --data_dir webrl_chat_completion/ + +``` +3. 修改.env 配置使用vllm或者aiproxy api +3. 修改`orm_test.py`中model的值,指定模型 \ No newline at end of file diff --git a/VAB-WebArena-Lite/export.txt b/VAB-WebArena-Lite/export.txt new file mode 100644 index 0000000..ce04a76 --- /dev/null +++ b/VAB-WebArena-Lite/export.txt @@ -0,0 +1,165 @@ +1.0 +0.0 +1.0 +0.0 +1.0 +1.0 +0.0 +0.0 +1.0 +1.0 +0.0 +0.0 +0.0 +1.0 +1.0 +1.0 +0.0 +1.0 +0.0 +0.0 +1.0 +1.0 +1.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +1.0 +0.0 +0.0 +0.0 +0.0 +0.0 +1.0 +1.0 +1.0 +0.0 +0.0 +1.0 +0.0 +0.0 +1.0 +0.0 +0.0 +1.0 +0.0 +0.0 +0.0 +0.0 +0.0 +1.0 +0.0 +0.0 +1.0 +0.0 +0.0 +0.0 +1.0 +1.0 +0.0 +0.0 +1.0 +0.0 +0.0 +0.0 +0.0 +0.0 +1.0 +1.0 +1.0 +1.0 +0.0 +0.0 +0.0 +0.0 +1.0 +0.0 +1.0 +1.0 +1.0 +0.0 +1.0 +1.0 +1.0 +1.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +1.0 +1.0 +1.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +1.0 +0.0 +1.0 +0.0 +1.0 +0.0 +1.0 +0.0 +1.0 +0.0 +0.0 +0.0 +0.0 +1.0 +1.0 +0.0 +0.0 +1.0 +1.0 +0.0 +0.0 +0.0 +0.0 +0.0 +1.0 +1.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +1.0 +0.0 +0.0 +1.0 +0.0 +1.0 +1.0 +0.0 +0.0 +0.0 +1.0 +1.0 +0.0 +0.0 +0.0 +0.0 +0.0 +1.0 +1.0 +1.0 diff --git a/VAB-WebArena-Lite/score.py b/VAB-WebArena-Lite/score.py index 915c3e7..39d165a 100644 --- a/VAB-WebArena-Lite/score.py +++ b/VAB-WebArena-Lite/score.py @@ -89,23 +89,57 @@ with open('./config_files/wa/test_webarena_lite.raw.json') as fp: configs = json.load(fp) sub_results = {} sub_ids = {} +sub_failed_ids = {} +sub_counts = {} +non_map_success = 0 +non_map_total = 
0 + for item in configs: web = tuple(item['sites']) task_id = int(item['task_id']) old_task_id = int(item['old_task_id']) + if web not in sub_results: sub_results[web] = [] - if web not in sub_ids: sub_ids[web] = [] + sub_failed_ids[web] = [] + sub_counts[web] = 0 + + sub_counts[web] += 1 + + is_success = False if task_id in all_result: - sub_results[web].append(all_result[task_id]) - if all_result[task_id] == 1: + score = all_result[task_id] + sub_results[web].append(score) + if score >= 1.0: sub_ids[web].append(old_task_id) + is_success = True + else: + sub_failed_ids[web].append(old_task_id) else: sub_results[web].append(0) -for web in sub_results: - print(web, round(sum(sub_results[web]) / len(sub_results[web]) * 100, 1)) + sub_failed_ids[web].append(old_task_id) -print('\n\n') -for web in sub_ids: - print(web, sorted(sub_ids[web]), len(sub_ids[web])) \ No newline at end of file + is_map_task = any('map' in site for site in web) + if not is_map_task: + non_map_total += 1 + if is_success: + non_map_success += 1 + +print("\n--- Category Statistics ---") +for web in sub_results: + total_cases = sub_counts[web] + success_cases = len(sub_ids[web]) + success_rate = round(sum(sub_results[web]) / len(sub_results[web]) * 100, 1) if len(sub_results[web]) > 0 else 0.0 + print(f"Category: {web}") + print(f" Total Cases: {total_cases}") + print(f" Success Rate: {success_rate}% ({success_cases}/{total_cases})") + print(f" Successful old_task_ids: {sorted(sub_ids[web])}") + print(f" Failed old_task_ids: {sorted(sub_failed_ids[web])}") + +print('\n--- Overall Accuracy without Map ---') +if non_map_total > 0: + overall_acc_without_map = round(non_map_success / non_map_total * 100, 2) + print(f"Accuracy (excluding map tasks): {overall_acc_without_map}% ({non_map_success}/{non_map_total})") +else: + print("No non-map tasks found.") \ No newline at end of file diff --git a/VAB-WebArena-Lite/wa_parallel_run_webrl_chat.sh b/VAB-WebArena-Lite/wa_parallel_run_webrl_chat.sh index 
37d81c0..0a498e4 100644 --- a/VAB-WebArena-Lite/wa_parallel_run_webrl_chat.sh +++ b/VAB-WebArena-Lite/wa_parallel_run_webrl_chat.sh @@ -1,8 +1,9 @@ #!/bin/bash DATASET='webarena' # TODO: select from ['webarena', 'visualwebarena'] -result_dir='./results/webrl_chat' # TODO: set your result_dir +result_dir='./results/webrl_chat_r1' # TODO: set your result_dir provider='openai' # TODO: select from ['openai', 'finetune', ...] -model='webrl-llama-3.1-8b' # TODO: assign model name, which is used for action generation +# model='webrl-llama-3.1-8b' # TODO: assign model name, which is used for action generation +model='aiproxy/deepseek-reasoner' planner_ip='192.168.16.116' # TODO: ip address of the model you are deploying (only if you are deploying your own model using e.g. vllm) instruction_path='agent/prompts/jsons/p_webrl_chat.json' # e.g., agent/prompts/jsons/p_cot_id_actree_2s.json test_config_base_dir='config_files/wa/test_webarena_lite' # e.g., config_files/wa/test_webarena_lite @@ -11,8 +12,10 @@ proxy_url='socks5://98.152.200.61:8081' SERVER='localhost' # TODO: your server address MAP_SERVER='https://www.openstreetmap.org' # TODO: the server address for MAP tasks -OPENAI_API_KEY='none' # TODO: if you test OpenAI APIs -OPENAI_API_URL='http://192.168.16.116:18080/v1' # TODO: if you test OpenAI APIs +# OPENAI_API_KEY='none' # TODO: if you test OpenAI APIs +# OPENAI_API_URL='http://192.168.16.116:18080/v1' # TODO: if you test OpenAI APIs +OPENAI_API_KEY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f" +OPENAI_API_URL="https://aiproxy.lmzgc.cn:8080/v1/" OPENAI_API_KEY_FUZZY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f" OPENAI_API_URL_FUZZY="https://aiproxy.lmzgc.cn:8080/v1/" OPENAI_ORGANIZATION='' @@ -49,7 +52,7 @@ done # Function to run a job run_job() { tmux select-pane -t $1 - COMMAND="python run.py \ + COMMAND="conda activate vab; python run.py \ --instruction_path ${instruction_path} \ --test_start_idx $2 \ --test_end_idx $3 \ diff --git 
a/VAB-WebArena-Lite/wa_parallel_run_webrl_completion.sh b/VAB-WebArena-Lite/wa_parallel_run_webrl_completion.sh new file mode 100644 index 0000000..ac4d648 --- /dev/null +++ b/VAB-WebArena-Lite/wa_parallel_run_webrl_completion.sh @@ -0,0 +1,104 @@ +#!/bin/bash +DATASET='webarena' # TODO: select from ['webarena', 'visualwebarena'] +result_dir='./results/webrl_chat_completion' # TODO: set your result_dir +provider='openai' # TODO: select from ['openai', 'finetune', ...] +model='webrl-llama-3.1-8b' # TODO: assign model name, which is used for action generation +# model='aiproxy/deepseek-reasoner' +planner_ip='' # TODO: ip address of the model you are deploying (only if you are deploying your own model using e.g. vllm) +instruction_path='agent/prompts/jsons/p_webrl.json' # e.g., agent/prompts/jsons/p_cot_id_actree_2s.json +test_config_base_dir='config_files/wa/test_webarena_lite' # e.g., config_files/wa/test_webarena_lite +temperature=0.0 +proxy_url='socks5://98.152.200.61:8081' + +SERVER='localhost' # TODO: your server address +MAP_SERVER='https://www.openstreetmap.org' # TODO: the server address for MAP tasks +OPENAI_API_KEY='none' # TODO: if you test OpenAI APIs +OPENAI_API_URL='http://192.168.16.116:18080/v1' # TODO: if you test OpenAI APIs +# OPENAI_API_KEY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f" +# OPENAI_API_URL="https://aiproxy.lmzgc.cn:8080/v1/" +OPENAI_API_KEY_FUZZY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f" +OPENAI_API_URL_FUZZY="https://aiproxy.lmzgc.cn:8080/v1/" +OPENAI_ORGANIZATION='' +CONDA_ENV_NAME='vab' # TODO: the name of your conda environment for testing WebArena + + +ENV_VARIABLES="export DATASET=${DATASET}; export SHOPPING='http://${SERVER}:28082';export SHOPPING_ADMIN='http://${SERVER}:28083/admin';export REDDIT='http://${SERVER}:28080';export GITLAB='http://${SERVER}:28084';export MAP='${MAP_SERVER}';export WIKIPEDIA='http://${SERVER}:28081/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing';export 
HOMEPAGE='http://${SERVER}:20080';export OPENAI_API_KEY=${OPENAI_API_KEY};export OPENAI_API_URL=${OPENAI_API_URL};export OPENAI_ORGANIZATION=${OPENAI_ORGANIZATION};export OPENAI_API_KEY_FUZZY=${OPENAI_API_KEY_FUZZY};export OPENAI_API_URL_FUZZY=${OPENAI_API_URL_FUZZY}" +echo $ENV_VARIABLES + +# get the number of tmux panes +num_panes=$(tmux list-panes | wc -l) + +# calculate how many panes need to be created +let "panes_to_create = 7 - num_panes" +# let "panes_to_create = 1 - num_panes" + +# array of tmux commands to create each pane +tmux_commands=( + 'tmux split-window -h' + 'tmux split-window -v' + 'tmux select-pane -t 0; tmux split-window -v' + 'tmux split-window -v' + 'tmux select-pane -t 3; tmux split-window -v' + 'tmux select-pane -t 5; tmux split-window -v' +) + +# create panes up to 7 +for ((i=0; i<$panes_to_create; i++)); do + eval ${tmux_commands[$i]} +done + +#!/bin/bash + +# Function to run a job +run_job() { + tmux select-pane -t $1 + COMMAND="conda activate vab; python run.py \ + --instruction_path ${instruction_path} \ + --test_start_idx $2 \ + --test_end_idx $3 \ + --result_dir ${result_dir} \ + --test_config_base_dir ${test_config_base_dir} \ + --provider ${provider} \ + --mode completion \ + --model ${model} \ + --stop_token \"<|eot_id|>\" \ + --max_obs_length 0 \ + --max_tokens 2048 \ + --viewport_width 1280 \ + --viewport_height 720 \ + --proxy_url ${proxy_url} \ + --action_set_tag webrl_id --observation_type webrl" + tmux send-keys "tmux set mouse on; conda activate ${CONDA_ENV_NAME}; ${ENV_VARIABLES}; until ${COMMAND}; do echo 'crashed' >&2; sleep 1; done" C-m + sleep 3 +} + +TOLERANCE=2 +run_batch() { + args=("$@") # save all arguments in an array + num_jobs=${#args[@]} # get number of arguments + + for ((i=1; i<$num_jobs; i++)); do + run_job $i ${args[i-1]} ${args[i]} + done + + # Wait for all jobs to finish + while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do + sleep 100 # wait for 100 seconds before checking 
again + done + + # Run checker + while ! python scripts/check_error_runs.py ${result_dir} --delete_errors --tolerance ${TOLERANCE}; do + echo "Check failed, rerunning jobs..." + for ((i=1; i<$num_jobs; i++)); do + run_job $i ${args[i-1]} ${args[i]} + done + + # Wait for all jobs to finish + while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do + sleep 100 # wait for 100 seconds before checking again + done + done + +} +run_batch 0 28 56 84 112 140 165 + diff --git a/analyze_case_time.py b/analyze_case_time.py new file mode 100644 index 0000000..b9e3571 --- /dev/null +++ b/analyze_case_time.py @@ -0,0 +1,128 @@ +import re +import datetime +import numpy as np +from collections import defaultdict, Counter + +log_file = '/home2/yuyr/VisualAgentBench/VAB-WebArena-Lite/results/webrl_chat/tmp_merged_log.txt' +with open(log_file, 'r', encoding='utf-8') as f: + lines = f.readlines() + +# 收集所有用例的开始和结束时间 +case_times = {} +case_results = {} +case_intents = {} +current_case = None +start_time = None +current_intent = None + +for line in lines: + if '[Config file]:' in line: + # 从行中提取时间和用例ID + match = re.match(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}).+\[Config file\]: (.+)', line) + if match: + time_str, config_file = match.groups() + # 提取用例ID + case_id = config_file.split('/')[-1].replace('.json', '') + current_case = case_id + start_time = datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S,%f') + + elif '[Intent]:' in line and current_case: + # 提取意图 + match = re.match(r'.+\[Intent\]: (.+)', line) + if match: + current_intent = match.group(1) + case_intents[current_case] = current_intent[:50] + '...' 
if len(current_intent) > 50 else current_intent + + elif '[Result]' in line and current_case and start_time: + # 从行中提取时间和结果 + match = re.match(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}).+\[Result\] \(([A-Z]+)\)', line) + if match: + time_str, result = match.groups() + end_time = datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S,%f') + duration = (end_time - start_time).total_seconds() + + case_times[current_case] = duration + case_results[current_case] = result + + current_case = None + start_time = None + current_intent = None + +# 计算总体统计信息 +if case_times: + total_cases = len(case_times) + durations = list(case_times.values()) + + # 基本统计信息 + avg_duration = np.mean(durations) + median_duration = np.median(durations) + min_duration = min(durations) + max_duration = max(durations) + std_duration = np.std(durations) + + # 按结果分类 + pass_cases = [case_id for case_id, result in case_results.items() if result == 'PASS'] + fail_cases = [case_id for case_id, result in case_results.items() if result == 'FAIL'] + + pass_durations = [case_times[case_id] for case_id in pass_cases] + fail_durations = [case_times[case_id] for case_id in fail_cases] + + # 时间分布 + time_bins = [0, 30, 60, 90, 120, 150, 180, 210, 240] + time_distribution = {f"{low}-{high}秒": 0 for low, high in zip(time_bins[:-1], time_bins[1:])} + time_distribution[f"{time_bins[-1]}+秒"] = 0 + + for duration in durations: + placed = False + for i in range(len(time_bins) - 1): + if time_bins[i] <= duration < time_bins[i+1]: + time_distribution[f"{time_bins[i]}-{time_bins[i+1]}秒"] += 1 + placed = True + break + if not placed: + time_distribution[f"{time_bins[-1]}+秒"] += 1 + + # 打印结果 + print("=" * 60) + print(f"测试用例执行时间分析报告") + print("=" * 60) + + print(f"\n总测试用例数: {total_cases}") + print(f"平均执行时长: {avg_duration:.2f}秒 ({avg_duration/60:.2f}分钟)") + print(f"中位数执行时长: {median_duration:.2f}秒 ({median_duration/60:.2f}分钟)") + print(f"最短执行时长: {min_duration:.2f}秒 ({min_duration/60:.2f}分钟)") + print(f"最长执行时长: 
{max_duration:.2f}秒 ({max_duration/60:.2f}分钟)") + print(f"标准差: {std_duration:.2f}秒") + + print(f"\n成功用例数: {len(pass_cases)} ({len(pass_cases)/total_cases*100:.2f}%)") + if pass_durations: + print(f"成功用例平均执行时长: {np.mean(pass_durations):.2f}秒 ({np.mean(pass_durations)/60:.2f}分钟)") + print(f"成功用例中位数执行时长: {np.median(pass_durations):.2f}秒 ({np.median(pass_durations)/60:.2f}分钟)") + + print(f"\n失败用例数: {len(fail_cases)} ({len(fail_cases)/total_cases*100:.2f}%)") + if fail_durations: + print(f"失败用例平均执行时长: {np.mean(fail_durations):.2f}秒 ({np.mean(fail_durations)/60:.2f}分钟)") + print(f"失败用例中位数执行时长: {np.median(fail_durations):.2f}秒 ({np.median(fail_durations)/60:.2f}分钟)") + + # 时间分布 + print("\n执行时长分布:") + for bin_name, count in time_distribution.items(): + percentage = count / total_cases * 100 + print(f" {bin_name}: {count} 个用例 ({percentage:.2f}%)") + + # 打印前10长和前10短的用例 + print("\n执行时长最长的10个用例:") + for i, (case_id, duration) in enumerate(sorted(case_times.items(), key=lambda x: x[1], reverse=True)[:10], 1): + result = case_results[case_id] + intent = case_intents.get(case_id, "未知意图") + print(f"{i}. 用例 {case_id}: {duration:.2f}秒 ({duration/60:.2f}分钟) - {result}") + print(f" 意图: {intent}") + + print("\n执行时长最短的10个用例:") + for i, (case_id, duration) in enumerate(sorted(case_times.items(), key=lambda x: x[1])[:10], 1): + result = case_results[case_id] + intent = case_intents.get(case_id, "未知意图") + print(f"{i}. 用例 {case_id}: {duration:.2f}秒 ({duration/60:.2f}分钟) - {result}") + print(f" 意图: {intent}") +else: + print("未找到有效的测试用例数据") \ No newline at end of file