Fix the webrl and orm test programs
parent 543543218c
commit e50dd34bee
VAB-WebArena-Lite/README_yuyr.md (new file, 48 lines)
@@ -0,0 +1,48 @@
# evaluation

1. Deploy vLLM on lm2 and load the webrl-llama-3.1-8b model (a quick endpoint check is sketched right after this list):

```bash
MODEL_PATH=/data1/yuyr/webrl-llama-3.1-8b

export HF_ENDPOINT=https://hf-mirror.com

PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server --served-model-name webrl-llama-3.1-8b \
    --model $MODEL_PATH \
    --gpu-memory-utilization 0.9 \
    --max-num-seqs 32 \
    --dtype half \
    --port 18080
```

2. The four sites shopping/shopping_admin/gitlab/reddit are already deployed on g14; map uses the official site (through the socks5 proxy).

3. Run `wa_parallel_run_webrl_completion.sh`. Important: it must be this completion script, not the chat one, otherwise the webrl model's SR (success rate) drops considerably.

```bash
# The vab conda environment must be created first
tmux new -s webrl

bash wa_parallel_run_webrl_completion.sh
```
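Before launching the parallel run, it can help to confirm that the vLLM endpoint from step 1 is reachable. A minimal smoke test, assuming the server is bound to the lm2 address used elsewhere in these scripts (the host below is an assumption; substitute your own):

```bash
# Host of the vLLM server from step 1 (assumed; replace with the actual lm2 address).
VLLM_HOST=192.168.16.116

# The OpenAI-compatible server should list webrl-llama-3.1-8b here ...
curl http://${VLLM_HOST}:18080/v1/models

# ... and return a short completion as a basic sanity check.
curl http://${VLLM_HOST}:18080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "webrl-llama-3.1-8b", "prompt": "Hello", "max_tokens": 8}'
```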

# orm test

1. Deploy vLLM on lm2 and load the webrl-orm-llama-3.1-8b model:

```bash
MODEL_PATH=/data1/yuyr/webrl-orm-llama-3.1-8b

export HF_ENDPOINT=https://hf-mirror.com

PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server --served-model-name webrl-orm-llama-3.1-8b \
    --model $MODEL_PATH \
    --gpu-memory-utilization 0.9 \
    --max-num-seqs 32 \
    --dtype half \
    --port 18081
```

2. Run the analysis script:

```bash
cd result

python orm_test.py --data_dir webrl_chat_completion/
```

3. Edit the .env configuration to use either the vLLM or the aiproxy API (see the sketch after this list).

4. Set the value of `model` in `orm_test.py` to select the model to evaluate.
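A minimal sketch of steps 3 and 4. The variable names below mirror the OPENAI_API_* settings used by the run scripts in this repo, but whether `orm_test.py` reads exactly these keys from .env, and how it names its `model` variable, are assumptions; adapt them to the actual code:

```bash
# .env — hypothetical keys; point them at the local vLLM ORM server (port 18081) ...
OPENAI_API_URL=http://192.168.16.116:18081/v1
OPENAI_API_KEY=none

# ... or at the aiproxy endpoint instead:
# OPENAI_API_URL=https://aiproxy.lmzgc.cn:8080/v1/
# OPENAI_API_KEY=<your aiproxy key>

# In orm_test.py, set the model to match the served / proxied model, e.g.:
#   model = "webrl-orm-llama-3.1-8b"    (or "aiproxy/deepseek-reasoner" via aiproxy)
```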
VAB-WebArena-Lite/export.txt (new file, 165 lines)
@@ -0,0 +1,165 @@
1.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
1.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
@@ -89,23 +89,57 @@ with open('./config_files/wa/test_webarena_lite.raw.json') as fp:
     configs = json.load(fp)

 sub_results = {}
 sub_ids = {}
+sub_failed_ids = {}
+sub_counts = {}
+non_map_success = 0
+non_map_total = 0
+
 for item in configs:
     web = tuple(item['sites'])
     task_id = int(item['task_id'])
     old_task_id = int(item['old_task_id'])

     if web not in sub_results:
         sub_results[web] = []
-    if web not in sub_ids:
         sub_ids[web] = []
+        sub_failed_ids[web] = []
+        sub_counts[web] = 0
+
+    sub_counts[web] += 1
+
+    is_success = False
     if task_id in all_result:
-        sub_results[web].append(all_result[task_id])
-        if all_result[task_id] == 1:
+        score = all_result[task_id]
+        sub_results[web].append(score)
+        if score >= 1.0:
             sub_ids[web].append(old_task_id)
+            is_success = True
+        else:
+            sub_failed_ids[web].append(old_task_id)
     else:
         sub_results[web].append(0)
-for web in sub_results:
-    print(web, round(sum(sub_results[web]) / len(sub_results[web]) * 100, 1))
-print('\n\n')
-for web in sub_ids:
-    print(web, sorted(sub_ids[web]), len(sub_ids[web]))
+        sub_failed_ids[web].append(old_task_id)
+
+    is_map_task = any('map' in site for site in web)
+    if not is_map_task:
+        non_map_total += 1
+        if is_success:
+            non_map_success += 1
+
+print("\n--- Category Statistics ---")
+for web in sub_results:
+    total_cases = sub_counts[web]
+    success_cases = len(sub_ids[web])
+    success_rate = round(sum(sub_results[web]) / len(sub_results[web]) * 100, 1) if len(sub_results[web]) > 0 else 0.0
+    print(f"Category: {web}")
+    print(f"  Total Cases: {total_cases}")
+    print(f"  Success Rate: {success_rate}% ({success_cases}/{total_cases})")
+    print(f"  Successful old_task_ids: {sorted(sub_ids[web])}")
+    print(f"  Failed old_task_ids: {sorted(sub_failed_ids[web])}")
+
+print('\n--- Overall Accuracy without Map ---')
+if non_map_total > 0:
+    overall_acc_without_map = round(non_map_success / non_map_total * 100, 2)
+    print(f"Accuracy (excluding map tasks): {overall_acc_without_map}% ({non_map_success}/{non_map_total})")
+else:
+    print("No non-map tasks found.")
@@ -1,8 +1,9 @@
 #!/bin/bash
 DATASET='webarena' # TODO: select from ['webarena', 'visualwebarena']
-result_dir='./results/webrl_chat' # TODO: set your result_dir
+result_dir='./results/webrl_chat_r1' # TODO: set your result_dir
 provider='openai' # TODO: select from ['openai', 'finetune', ...]
-model='webrl-llama-3.1-8b' # TODO: assign model name, which is used for action generation
+# model='webrl-llama-3.1-8b' # TODO: assign model name, which is used for action generation
+model='aiproxy/deepseek-reasoner'
 planner_ip='192.168.16.116' # TODO: ip address of the model you are deploying (only if you are deploying your own model using e.g. vllm)
 instruction_path='agent/prompts/jsons/p_webrl_chat.json' # e.g., agent/prompts/jsons/p_cot_id_actree_2s.json
 test_config_base_dir='config_files/wa/test_webarena_lite' # e.g., config_files/wa/test_webarena_lite
@@ -11,8 +12,10 @@ proxy_url='socks5://98.152.200.61:8081'

 SERVER='localhost' # TODO: your server address
 MAP_SERVER='https://www.openstreetmap.org' # TODO: the server address for MAP tasks
-OPENAI_API_KEY='none' # TODO: if you test OpenAI APIs
-OPENAI_API_URL='http://192.168.16.116:18080/v1' # TODO: if you test OpenAI APIs
+# OPENAI_API_KEY='none' # TODO: if you test OpenAI APIs
+# OPENAI_API_URL='http://192.168.16.116:18080/v1' # TODO: if you test OpenAI APIs
+OPENAI_API_KEY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
+OPENAI_API_URL="https://aiproxy.lmzgc.cn:8080/v1/"
 OPENAI_API_KEY_FUZZY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
 OPENAI_API_URL_FUZZY="https://aiproxy.lmzgc.cn:8080/v1/"
 OPENAI_ORGANIZATION=''
@@ -49,7 +52,7 @@ done
 # Function to run a job
 run_job() {
     tmux select-pane -t $1
-    COMMAND="python run.py \
+    COMMAND="conda activate vab; python run.py \
         --instruction_path ${instruction_path} \
         --test_start_idx $2 \
         --test_end_idx $3 \
VAB-WebArena-Lite/wa_parallel_run_webrl_completion.sh (new file, 104 lines)
@@ -0,0 +1,104 @@
#!/bin/bash
DATASET='webarena' # TODO: select from ['webarena', 'visualwebarena']
result_dir='./results/webrl_chat_completion' # TODO: set your result_dir
provider='openai' # TODO: select from ['openai', 'finetune', ...]
model='webrl-llama-3.1-8b' # TODO: assign model name, which is used for action generation
# model='aiproxy/deepseek-reasoner'
planner_ip='' # TODO: ip address of the model you are deploying (only if you are deploying your own model using e.g. vllm)
instruction_path='agent/prompts/jsons/p_webrl.json' # e.g., agent/prompts/jsons/p_cot_id_actree_2s.json
test_config_base_dir='config_files/wa/test_webarena_lite' # e.g., config_files/wa/test_webarena_lite
temperature=0.0
proxy_url='socks5://98.152.200.61:8081'

SERVER='localhost' # TODO: your server address
MAP_SERVER='https://www.openstreetmap.org' # TODO: the server address for MAP tasks
OPENAI_API_KEY='none' # TODO: if you test OpenAI APIs
OPENAI_API_URL='http://192.168.16.116:18080/v1' # TODO: if you test OpenAI APIs
# OPENAI_API_KEY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
# OPENAI_API_URL="https://aiproxy.lmzgc.cn:8080/v1/"
OPENAI_API_KEY_FUZZY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
OPENAI_API_URL_FUZZY="https://aiproxy.lmzgc.cn:8080/v1/"
OPENAI_ORGANIZATION=''
CONDA_ENV_NAME='vab' # TODO: the name of your conda environment for testing WebArena

ENV_VARIABLES="export DATASET=${DATASET}; export SHOPPING='http://${SERVER}:28082';export SHOPPING_ADMIN='http://${SERVER}:28083/admin';export REDDIT='http://${SERVER}:28080';export GITLAB='http://${SERVER}:28084';export MAP='${MAP_SERVER}';export WIKIPEDIA='http://${SERVER}:28081/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing';export HOMEPAGE='http://${SERVER}:20080';export OPENAI_API_KEY=${OPENAI_API_KEY};export OPENAI_API_URL=${OPENAI_API_URL};export OPENAI_ORGANIZATION=${OPENAI_ORGANIZATION};export OPENAI_API_KEY_FUZZY=${OPENAI_API_KEY_FUZZY};export OPENAI_API_URL_FUZZY=${OPENAI_API_URL_FUZZY}"
echo $ENV_VARIABLES

# get the number of tmux panes
num_panes=$(tmux list-panes | wc -l)

# calculate how many panes need to be created
let "panes_to_create = 7 - num_panes"
# let "panes_to_create = 1 - num_panes"

# array of tmux commands to create each pane
tmux_commands=(
    'tmux split-window -h'
    'tmux split-window -v'
    'tmux select-pane -t 0; tmux split-window -v'
    'tmux split-window -v'
    'tmux select-pane -t 3; tmux split-window -v'
    'tmux select-pane -t 5; tmux split-window -v'
)

# create panes up to 7
for ((i=0; i<$panes_to_create; i++)); do
    eval ${tmux_commands[$i]}
done

#!/bin/bash

# Function to run a job
run_job() {
    tmux select-pane -t $1
    COMMAND="conda activate vab; python run.py \
        --instruction_path ${instruction_path} \
        --test_start_idx $2 \
        --test_end_idx $3 \
        --result_dir ${result_dir} \
        --test_config_base_dir ${test_config_base_dir} \
        --provider ${provider} \
        --mode completion \
        --model ${model} \
        --stop_token \"<|eot_id|>\" \
        --max_obs_length 0 \
        --max_tokens 2048 \
        --viewport_width 1280 \
        --viewport_height 720 \
        --proxy_url ${proxy_url} \
        --action_set_tag webrl_id --observation_type webrl"
    tmux send-keys "tmux set mouse on; conda activate ${CONDA_ENV_NAME}; ${ENV_VARIABLES}; until ${COMMAND}; do echo 'crashed' >&2; sleep 1; done" C-m
    sleep 3
}

TOLERANCE=2
run_batch() {
    args=("$@") # save all arguments in an array
    num_jobs=${#args[@]} # get number of arguments

    for ((i=1; i<$num_jobs; i++)); do
        run_job $i ${args[i-1]} ${args[i]}
    done

    # Wait for all jobs to finish
    while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do
        sleep 100 # wait for 100 seconds before checking again
    done

    # Run checker
    while ! python scripts/check_error_runs.py ${result_dir} --delete_errors --tolerance ${TOLERANCE}; do
        echo "Check failed, rerunning jobs..."
        for ((i=1; i<$num_jobs; i++)); do
            run_job $i ${args[i-1]} ${args[i]}
        done

        # Wait for all jobs to finish
        while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do
            sleep 100 # wait for 100 seconds before checking again
        done
    done
}
run_batch 0 28 56 84 112 140 165
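The closing `run_batch 0 28 56 84 112 140 165` call splits the task indices into six contiguous slices, one per tmux pane (pane 1 runs [0, 28), pane 2 runs [28, 56), and so on up to 165). A single slice can also be re-run by hand with the same helper; a sketch, assuming the script's functions and variables are already defined in the current shell:

```bash
# Sketch: re-run only tasks [84, 112) in pane 4, e.g. after a crash.
# Assumes run_job and the variables above are already sourced in this shell.
run_job 4 84 112
```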
analyze_case_time.py (new file, 128 lines)
@@ -0,0 +1,128 @@
import re
import datetime
import numpy as np
from collections import defaultdict, Counter

log_file = '/home2/yuyr/VisualAgentBench/VAB-WebArena-Lite/results/webrl_chat/tmp_merged_log.txt'
with open(log_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Collect the start and end times of every test case
case_times = {}
case_results = {}
case_intents = {}
current_case = None
start_time = None
current_intent = None

for line in lines:
    if '[Config file]:' in line:
        # Extract the timestamp and case ID from the line
        match = re.match(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}).+\[Config file\]: (.+)', line)
        if match:
            time_str, config_file = match.groups()
            # Extract the case ID
            case_id = config_file.split('/')[-1].replace('.json', '')
            current_case = case_id
            start_time = datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S,%f')

    elif '[Intent]:' in line and current_case:
        # Extract the intent
        match = re.match(r'.+\[Intent\]: (.+)', line)
        if match:
            current_intent = match.group(1)
            case_intents[current_case] = current_intent[:50] + '...' if len(current_intent) > 50 else current_intent

    elif '[Result]' in line and current_case and start_time:
        # Extract the timestamp and result from the line
        match = re.match(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}).+\[Result\] \(([A-Z]+)\)', line)
        if match:
            time_str, result = match.groups()
            end_time = datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S,%f')
            duration = (end_time - start_time).total_seconds()

            case_times[current_case] = duration
            case_results[current_case] = result

            current_case = None
            start_time = None
            current_intent = None

# Compute overall statistics
if case_times:
    total_cases = len(case_times)
    durations = list(case_times.values())

    # Basic statistics
    avg_duration = np.mean(durations)
    median_duration = np.median(durations)
    min_duration = min(durations)
    max_duration = max(durations)
    std_duration = np.std(durations)

    # Group cases by result
    pass_cases = [case_id for case_id, result in case_results.items() if result == 'PASS']
    fail_cases = [case_id for case_id, result in case_results.items() if result == 'FAIL']

    pass_durations = [case_times[case_id] for case_id in pass_cases]
    fail_durations = [case_times[case_id] for case_id in fail_cases]

    # Duration distribution
    time_bins = [0, 30, 60, 90, 120, 150, 180, 210, 240]
    time_distribution = {f"{low}-{high}s": 0 for low, high in zip(time_bins[:-1], time_bins[1:])}
    time_distribution[f"{time_bins[-1]}+s"] = 0

    for duration in durations:
        placed = False
        for i in range(len(time_bins) - 1):
            if time_bins[i] <= duration < time_bins[i+1]:
                time_distribution[f"{time_bins[i]}-{time_bins[i+1]}s"] += 1
                placed = True
                break
        if not placed:
            time_distribution[f"{time_bins[-1]}+s"] += 1

    # Print the report
    print("=" * 60)
    print(f"Test case execution time analysis report")
    print("=" * 60)

    print(f"\nTotal test cases: {total_cases}")
    print(f"Average duration: {avg_duration:.2f}s ({avg_duration/60:.2f} min)")
    print(f"Median duration: {median_duration:.2f}s ({median_duration/60:.2f} min)")
    print(f"Minimum duration: {min_duration:.2f}s ({min_duration/60:.2f} min)")
    print(f"Maximum duration: {max_duration:.2f}s ({max_duration/60:.2f} min)")
    print(f"Standard deviation: {std_duration:.2f}s")

    print(f"\nPassed cases: {len(pass_cases)} ({len(pass_cases)/total_cases*100:.2f}%)")
    if pass_durations:
        print(f"Average duration of passed cases: {np.mean(pass_durations):.2f}s ({np.mean(pass_durations)/60:.2f} min)")
        print(f"Median duration of passed cases: {np.median(pass_durations):.2f}s ({np.median(pass_durations)/60:.2f} min)")

    print(f"\nFailed cases: {len(fail_cases)} ({len(fail_cases)/total_cases*100:.2f}%)")
    if fail_durations:
        print(f"Average duration of failed cases: {np.mean(fail_durations):.2f}s ({np.mean(fail_durations)/60:.2f} min)")
        print(f"Median duration of failed cases: {np.median(fail_durations):.2f}s ({np.median(fail_durations)/60:.2f} min)")

    # Duration distribution
    print("\nDuration distribution:")
    for bin_name, count in time_distribution.items():
        percentage = count / total_cases * 100
        print(f"  {bin_name}: {count} cases ({percentage:.2f}%)")

    # Print the 10 longest and 10 shortest cases
    print("\n10 longest-running cases:")
    for i, (case_id, duration) in enumerate(sorted(case_times.items(), key=lambda x: x[1], reverse=True)[:10], 1):
        result = case_results[case_id]
        intent = case_intents.get(case_id, "unknown intent")
        print(f"{i}. Case {case_id}: {duration:.2f}s ({duration/60:.2f} min) - {result}")
        print(f"   Intent: {intent}")

    print("\n10 shortest-running cases:")
    for i, (case_id, duration) in enumerate(sorted(case_times.items(), key=lambda x: x[1])[:10], 1):
        result = case_results[case_id]
        intent = case_intents.get(case_id, "unknown intent")
        print(f"{i}. Case {case_id}: {duration:.2f}s ({duration/60:.2f} min) - {result}")
        print(f"   Intent: {intent}")
else:
    print("No valid test case data found")
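A usage sketch for the analyzer above. The merged-log path is the one hard-coded in the script; how the per-run logs get concatenated into it is an assumption about the result layout, so adjust the glob to whatever log files your run actually produced:

```bash
# Assumed log layout: merge the run logs into the file analyze_case_time.py expects.
cd /home2/yuyr/VisualAgentBench
cat VAB-WebArena-Lite/results/webrl_chat/log*.txt \
    > VAB-WebArena-Lite/results/webrl_chat/tmp_merged_log.txt   # glob is an assumption
python analyze_case_time.py
```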