修复测试webrl和orm程序

This commit is contained in:
yuyr 2025-04-27 18:04:39 +08:00
parent 543543218c
commit e50dd34bee
6 changed files with 495 additions and 13 deletions

View File

@ -0,0 +1,48 @@
# evaluation
1. 在lm2上部署vllm加载webrl-llama-3.1-8b模型
```bash
MODEL_PATH=/data1/yuyr/webrl-llama-3.1-8b
export HF_ENDPOINT=https://hf-mirror.com
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server --served-model-name webrl-llama-3.1-8b \
--model $MODEL_PATH \
--gpu-memory-utilization 0.9 \
--max-num-seqs 32 \
--dtype half \
--port 18080
```
2. g14上已经部署好shopping/shopping_admin/gitlab/reddit四个网站;map使用官网,通过socks5代理访问;
3. 运行`wa_parallel_run_webrl_completion.sh`。注意一定要用这个completion版本,不要用chat版本,否则webrl模型SR会降低很多。
```bash
# 需要先建好vab conda环境
tmux new -t webrl
bash wa_parallel_run_webrl_completion.sh
```
# orm 测试
1. lm2上部署vllm加载webrl-orm-llama-3.1-8b模型
```bash
MODEL_PATH=/data1/yuyr/webrl-orm-llama-3.1-8b
export HF_ENDPOINT=https://hf-mirror.com
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server --served-model-name webrl-orm-llama-3.1-8b \
--model $MODEL_PATH \
--gpu-memory-utilization 0.9 \
--max-num-seqs 32 \
--dtype half \
--port 18081
```
2. 执行分析脚本
```bash
cd result
python orm_test.py --data_dir webrl_chat_completion/
```
3. 修改.env配置,选择使用vllm或者aiproxy api
4. 修改`orm_test.py`中model的值,指定要使用的模型

View File

@ -0,0 +1,165 @@
1.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
1.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0

View File

@ -89,23 +89,57 @@ with open('./config_files/wa/test_webarena_lite.raw.json') as fp:
configs = json.load(fp)
sub_results = {}
sub_ids = {}
sub_failed_ids = {}
sub_counts = {}
non_map_success = 0
non_map_total = 0
for item in configs:
web = tuple(item['sites'])
task_id = int(item['task_id'])
old_task_id = int(item['old_task_id'])
if web not in sub_results:
sub_results[web] = []
if web not in sub_ids:
sub_ids[web] = []
sub_failed_ids[web] = []
sub_counts[web] = 0
sub_counts[web] += 1
is_success = False
if task_id in all_result:
sub_results[web].append(all_result[task_id])
if all_result[task_id] == 1:
score = all_result[task_id]
sub_results[web].append(score)
if score >= 1.0:
sub_ids[web].append(old_task_id)
is_success = True
else:
sub_failed_ids[web].append(old_task_id)
else:
sub_results[web].append(0)
for web in sub_results:
print(web, round(sum(sub_results[web]) / len(sub_results[web]) * 100, 1))
sub_failed_ids[web].append(old_task_id)
print('\n\n')
for web in sub_ids:
print(web, sorted(sub_ids[web]), len(sub_ids[web]))
is_map_task = any('map' in site for site in web)
if not is_map_task:
non_map_total += 1
if is_success:
non_map_success += 1
print("\n--- Category Statistics ---")
for web in sub_results:
total_cases = sub_counts[web]
success_cases = len(sub_ids[web])
success_rate = round(sum(sub_results[web]) / len(sub_results[web]) * 100, 1) if len(sub_results[web]) > 0 else 0.0
print(f"Category: {web}")
print(f" Total Cases: {total_cases}")
print(f" Success Rate: {success_rate}% ({success_cases}/{total_cases})")
print(f" Successful old_task_ids: {sorted(sub_ids[web])}")
print(f" Failed old_task_ids: {sorted(sub_failed_ids[web])}")
print('\n--- Overall Accuracy without Map ---')
if non_map_total > 0:
overall_acc_without_map = round(non_map_success / non_map_total * 100, 2)
print(f"Accuracy (excluding map tasks): {overall_acc_without_map}% ({non_map_success}/{non_map_total})")
else:
print("No non-map tasks found.")

View File

@ -1,8 +1,9 @@
#!/bin/bash
DATASET='webarena' # TODO: select from ['webarena', 'visualwebarena']
result_dir='./results/webrl_chat' # TODO: set your result_dir
result_dir='./results/webrl_chat_r1' # TODO: set your result_dir
provider='openai' # TODO: select from ['openai', 'finetune', ...]
model='webrl-llama-3.1-8b' # TODO: assign model name, which is used for action generation
# model='webrl-llama-3.1-8b' # TODO: assign model name, which is used for action generation
model='aiproxy/deepseek-reasoner'
planner_ip='192.168.16.116' # TODO: ip address of the model you are deploying (only if you are deploying your own model using e.g. vllm)
instruction_path='agent/prompts/jsons/p_webrl_chat.json' # e.g., agent/prompts/jsons/p_cot_id_actree_2s.json
test_config_base_dir='config_files/wa/test_webarena_lite' # e.g., config_files/wa/test_webarena_lite
@ -11,8 +12,10 @@ proxy_url='socks5://98.152.200.61:8081'
SERVER='localhost' # TODO: your server address
MAP_SERVER='https://www.openstreetmap.org' # TODO: the server address for MAP tasks
OPENAI_API_KEY='none' # TODO: if you test OpenAI APIs
OPENAI_API_URL='http://192.168.16.116:18080/v1' # TODO: if you test OpenAI APIs
# OPENAI_API_KEY='none' # TODO: if you test OpenAI APIs
# OPENAI_API_URL='http://192.168.16.116:18080/v1' # TODO: if you test OpenAI APIs
OPENAI_API_KEY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
OPENAI_API_URL="https://aiproxy.lmzgc.cn:8080/v1/"
OPENAI_API_KEY_FUZZY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
OPENAI_API_URL_FUZZY="https://aiproxy.lmzgc.cn:8080/v1/"
OPENAI_ORGANIZATION=''
@ -49,7 +52,7 @@ done
# Function to run a job
run_job() {
tmux select-pane -t $1
COMMAND="python run.py \
COMMAND="conda activate vab; python run.py \
--instruction_path ${instruction_path} \
--test_start_idx $2 \
--test_end_idx $3 \

View File

@ -0,0 +1,104 @@
#!/bin/bash
# Parallel WebArena-Lite evaluation of webrl-llama-3.1-8b in *completion* mode
# (see README: completion mode is required; chat mode degrades the model's SR).
# Splits the current tmux window into 7 panes, runs run.py over disjoint task
# index ranges in 6 of them, and keeps re-running crashed/failed slices until
# scripts/check_error_runs.py reports the result_dir is clean.
DATASET='webarena' # TODO: select from ['webarena', 'visualwebarena']
result_dir='./results/webrl_chat_completion' # TODO: set your result_dir
provider='openai' # TODO: select from ['openai', 'finetune', ...]
model='webrl-llama-3.1-8b' # TODO: assign model name, which is used for action generation
# model='aiproxy/deepseek-reasoner'
planner_ip='' # TODO: ip address of the model you are deploying (only if you are deploying your own model using e.g. vllm)
instruction_path='agent/prompts/jsons/p_webrl.json' # e.g., agent/prompts/jsons/p_cot_id_actree_2s.json
test_config_base_dir='config_files/wa/test_webarena_lite' # e.g., config_files/wa/test_webarena_lite
temperature=0.0
proxy_url='socks5://98.152.200.61:8081'
SERVER='localhost' # TODO: your server address
MAP_SERVER='https://www.openstreetmap.org' # TODO: the server address for MAP tasks
# Action-generation endpoint: the locally-deployed vllm server (see README).
OPENAI_API_KEY='none' # TODO: if you test OpenAI APIs
OPENAI_API_URL='http://192.168.16.116:18080/v1' # TODO: if you test OpenAI APIs
# OPENAI_API_KEY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
# OPENAI_API_URL="https://aiproxy.lmzgc.cn:8080/v1/"
# NOTE(review): hardcoded API credential committed to the repo — rotate this
# key and load it from the environment or a .env file instead.
OPENAI_API_KEY_FUZZY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
OPENAI_API_URL_FUZZY="https://aiproxy.lmzgc.cn:8080/v1/"
OPENAI_ORGANIZATION=''
CONDA_ENV_NAME='vab' # TODO: the name of your conda environment for testing WebArena
# Single string of exports replayed in every tmux pane so each worker shares
# the same site URLs and API configuration.
ENV_VARIABLES="export DATASET=${DATASET}; export SHOPPING='http://${SERVER}:28082';export SHOPPING_ADMIN='http://${SERVER}:28083/admin';export REDDIT='http://${SERVER}:28080';export GITLAB='http://${SERVER}:28084';export MAP='${MAP_SERVER}';export WIKIPEDIA='http://${SERVER}:28081/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing';export HOMEPAGE='http://${SERVER}:20080';export OPENAI_API_KEY=${OPENAI_API_KEY};export OPENAI_API_URL=${OPENAI_API_URL};export OPENAI_ORGANIZATION=${OPENAI_ORGANIZATION};export OPENAI_API_KEY_FUZZY=${OPENAI_API_KEY_FUZZY};export OPENAI_API_URL_FUZZY=${OPENAI_API_URL_FUZZY}"
echo $ENV_VARIABLES
# get the number of tmux panes
num_panes=$(tmux list-panes | wc -l)
# calculate how many panes need to be created
let "panes_to_create = 7 - num_panes"
# let "panes_to_create = 1 - num_panes"
# array of tmux commands to create each pane
tmux_commands=(
'tmux split-window -h'
'tmux split-window -v'
'tmux select-pane -t 0; tmux split-window -v'
'tmux split-window -v'
'tmux select-pane -t 3; tmux split-window -v'
'tmux select-pane -t 5; tmux split-window -v'
)
# create panes up to 7
for ((i=0; i<$panes_to_create; i++)); do
eval ${tmux_commands[$i]}
done
#!/bin/bash
# Function to run a job: launch run.py in pane $1 over task indices [$2, $3),
# auto-restarting on crash (`until ... do echo 'crashed' ...`).
run_job() {
tmux select-pane -t $1
COMMAND="conda activate vab; python run.py \
--instruction_path ${instruction_path} \
--test_start_idx $2 \
--test_end_idx $3 \
--result_dir ${result_dir} \
--test_config_base_dir ${test_config_base_dir} \
--provider ${provider} \
--mode completion \
--model ${model} \
--stop_token \"<|eot_id|>\" \
--max_obs_length 0 \
--max_tokens 2048 \
--viewport_width 1280 \
--viewport_height 720 \
--proxy_url ${proxy_url} \
--action_set_tag webrl_id --observation_type webrl"
tmux send-keys "tmux set mouse on; conda activate ${CONDA_ENV_NAME}; ${ENV_VARIABLES}; until ${COMMAND}; do echo 'crashed' >&2; sleep 1; done" C-m
sleep 3
}
# Max number of per-case errors tolerated by the result checker.
TOLERANCE=2
# Run one pane per consecutive index pair in the argument list, wait for all
# python processes to exit, then re-run until the checker passes.
run_batch() {
args=("$@") # save all arguments in an array
num_jobs=${#args[@]} # get number of arguments
for ((i=1; i<$num_jobs; i++)); do
run_job $i ${args[i-1]} ${args[i]}
done
# Wait for all jobs to finish
while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do
sleep 100 # wait 100 seconds before checking again
done
# Run checker
while ! python scripts/check_error_runs.py ${result_dir} --delete_errors --tolerance ${TOLERANCE}; do
echo "Check failed, rerunning jobs..."
for ((i=1; i<$num_jobs; i++)); do
run_job $i ${args[i-1]} ${args[i]}
done
# Wait for all jobs to finish
while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do
sleep 100 # wait 100 seconds before checking again
done
done
}
# 165 total tasks split into 6 slices across panes 1..6.
run_batch 0 28 56 84 112 140 165

128
analyze_case_time.py Normal file
View File

@ -0,0 +1,128 @@
"""Analyze per-test-case execution time from a merged WebArena run log.

Pairs each case's timestamped "[Config file]:" line (start) with its
"[Result] (PASS|FAIL)" line (end), records the task intent, then prints
duration statistics split by result plus a duration histogram and the
10 longest/shortest cases.
"""
import re
import datetime
import numpy as np
from collections import defaultdict, Counter

# NOTE(review): hardcoded absolute path — consider making this a CLI argument.
log_file = '/home2/yuyr/VisualAgentBench/VAB-WebArena-Lite/results/webrl_chat/tmp_merged_log.txt'

with open(log_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Collect start/end times, results, and intents for every test case.
case_times = {}    # case_id -> duration in seconds
case_results = {}  # case_id -> 'PASS' / 'FAIL'
case_intents = {}  # case_id -> intent text (truncated to 50 chars)
current_case = None
start_time = None
current_intent = None
for line in lines:
    if '[Config file]:' in line:
        # Extract the timestamp and config path; this marks a case's start.
        match = re.match(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}).+\[Config file\]: (.+)', line)
        if match:
            time_str, config_file = match.groups()
            # Case id is the config file name without the .json suffix.
            case_id = config_file.split('/')[-1].replace('.json', '')
            current_case = case_id
            start_time = datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S,%f')
    elif '[Intent]:' in line and current_case:
        # Extract the task intent for the currently open case.
        match = re.match(r'.+\[Intent\]: (.+)', line)
        if match:
            current_intent = match.group(1)
            case_intents[current_case] = current_intent[:50] + '...' if len(current_intent) > 50 else current_intent
    elif '[Result]' in line and current_case and start_time:
        # Extract the timestamp and PASS/FAIL verdict; close out the case.
        match = re.match(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}).+\[Result\] \(([A-Z]+)\)', line)
        if match:
            time_str, result = match.groups()
            end_time = datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S,%f')
            duration = (end_time - start_time).total_seconds()
            case_times[current_case] = duration
            case_results[current_case] = result
            current_case = None
            start_time = None
            current_intent = None

# Compute and print overall statistics (only if any case was parsed).
if case_times:
    total_cases = len(case_times)
    durations = list(case_times.values())
    # Basic summary statistics over all case durations.
    avg_duration = np.mean(durations)
    median_duration = np.median(durations)
    min_duration = min(durations)
    max_duration = max(durations)
    std_duration = np.std(durations)
    # Split cases by result.
    pass_cases = [case_id for case_id, result in case_results.items() if result == 'PASS']
    fail_cases = [case_id for case_id, result in case_results.items() if result == 'FAIL']
    pass_durations = [case_times[case_id] for case_id in pass_cases]
    fail_durations = [case_times[case_id] for case_id in fail_cases]
    # Duration histogram: 30-second buckets up to 240s, plus an overflow bucket.
    time_bins = [0, 30, 60, 90, 120, 150, 180, 210, 240]
    time_distribution = {f"{low}-{high}": 0 for low, high in zip(time_bins[:-1], time_bins[1:])}
    time_distribution[f"{time_bins[-1]}+秒"] = 0
    for duration in durations:
        placed = False
        for i in range(len(time_bins) - 1):
            if time_bins[i] <= duration < time_bins[i+1]:
                time_distribution[f"{time_bins[i]}-{time_bins[i+1]}"] += 1
                placed = True
                break
        if not placed:
            # Longer than the last bin edge -> overflow bucket.
            time_distribution[f"{time_bins[-1]}+秒"] += 1
    # Print the report (overall, PASS subset, FAIL subset).
    print("=" * 60)
    print(f"测试用例执行时间分析报告")
    print("=" * 60)
    print(f"\n总测试用例数: {total_cases}")
    print(f"平均执行时长: {avg_duration:.2f}秒 ({avg_duration/60:.2f}分钟)")
    print(f"中位数执行时长: {median_duration:.2f}秒 ({median_duration/60:.2f}分钟)")
    print(f"最短执行时长: {min_duration:.2f}秒 ({min_duration/60:.2f}分钟)")
    print(f"最长执行时长: {max_duration:.2f}秒 ({max_duration/60:.2f}分钟)")
    print(f"标准差: {std_duration:.2f}")
    print(f"\n成功用例数: {len(pass_cases)} ({len(pass_cases)/total_cases*100:.2f}%)")
    if pass_durations:
        print(f"成功用例平均执行时长: {np.mean(pass_durations):.2f}秒 ({np.mean(pass_durations)/60:.2f}分钟)")
        print(f"成功用例中位数执行时长: {np.median(pass_durations):.2f}秒 ({np.median(pass_durations)/60:.2f}分钟)")
    print(f"\n失败用例数: {len(fail_cases)} ({len(fail_cases)/total_cases*100:.2f}%)")
    if fail_durations:
        print(f"失败用例平均执行时长: {np.mean(fail_durations):.2f}秒 ({np.mean(fail_durations)/60:.2f}分钟)")
        print(f"失败用例中位数执行时长: {np.median(fail_durations):.2f}秒 ({np.median(fail_durations)/60:.2f}分钟)")
    # Duration distribution.
    print("\n执行时长分布:")
    for bin_name, count in time_distribution.items():
        percentage = count / total_cases * 100
        print(f"  {bin_name}: {count} 个用例 ({percentage:.2f}%)")
    # Top-10 longest and shortest cases with their intents.
    print("\n执行时长最长的10个用例:")
    for i, (case_id, duration) in enumerate(sorted(case_times.items(), key=lambda x: x[1], reverse=True)[:10], 1):
        result = case_results[case_id]
        intent = case_intents.get(case_id, "未知意图")
        print(f"{i}. 用例 {case_id}: {duration:.2f}秒 ({duration/60:.2f}分钟) - {result}")
        print(f"   意图: {intent}")
    print("\n执行时长最短的10个用例:")
    for i, (case_id, duration) in enumerate(sorted(case_times.items(), key=lambda x: x[1])[:10], 1):
        result = case_results[case_id]
        intent = case_intents.get(case_id, "未知意图")
        print(f"{i}. 用例 {case_id}: {duration:.2f}秒 ({duration/60:.2f}分钟) - {result}")
        print(f"   意图: {intent}")
else:
    print("未找到有效的测试用例数据")