修复测试webrl和orm程序

This commit is contained in:
yuyr 2025-04-27 18:04:39 +08:00
parent 543543218c
commit e50dd34bee
6 changed files with 495 additions and 13 deletions

View File

@ -0,0 +1,48 @@
# evaluation
1. 在lm2上部署vllm加载webrl-llama-3.1-8b模型
```bash
MODEL_PATH=/data1/yuyr/webrl-llama-3.1-8b
export HF_ENDPOINT=https://hf-mirror.com
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server --served-model-name webrl-llama-3.1-8b \
--model $MODEL_PATH \
--gpu-memory-utilization 0.9 \
--max-num-seqs 32 \
--dtype half \
--port 18080
```
2. g14上已经部署好shopping/shopping_admin/gitlab/reddit四个网站;map使用官网,通过socks5代理访问;
3. 运行`wa_parallel_run_webrl_completion.sh`。注意一定要用这个completion版本,不要用chat版本,否则webrl模型SR会降低很多。
```bash
# 需要先建好vab conda环境
tmux new -t webrl
bash wa_parallel_run_webrl_completion.sh
```
# orm 测试
1. lm2上部署vllm加载webrl-orm-llama-3.1-8b模型
```bash
MODEL_PATH=/data1/yuyr/webrl-orm-llama-3.1-8b
export HF_ENDPOINT=https://hf-mirror.com
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server --served-model-name webrl-orm-llama-3.1-8b \
--model $MODEL_PATH \
--gpu-memory-utilization 0.9 \
--max-num-seqs 32 \
--dtype half \
--port 18081
```
2. 执行分析脚本
```bash
cd result
python orm_test.py --data_dir webrl_chat_completion/
```
3. 修改.env配置,选择使用vllm或者aiproxy api
4. 修改`orm_test.py`中model的值,指定要使用的模型

View File

@ -0,0 +1,165 @@
1.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
1.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0

View File

@ -89,23 +89,57 @@ with open('./config_files/wa/test_webarena_lite.raw.json') as fp:
configs = json.load(fp)
sub_results = {}
sub_ids = {}
sub_failed_ids = {}
sub_counts = {}
non_map_success = 0
non_map_total = 0
for item in configs:
web = tuple(item['sites'])
task_id = int(item['task_id'])
old_task_id = int(item['old_task_id'])
if web not in sub_results:
sub_results[web] = []
if web not in sub_ids:
sub_ids[web] = []
sub_failed_ids[web] = []
sub_counts[web] = 0
sub_counts[web] += 1
is_success = False
if task_id in all_result:
sub_results[web].append(all_result[task_id])
if all_result[task_id] == 1:
score = all_result[task_id]
sub_results[web].append(score)
if score >= 1.0:
sub_ids[web].append(old_task_id)
is_success = True
else:
sub_failed_ids[web].append(old_task_id)
else:
sub_results[web].append(0)
for web in sub_results:
print(web, round(sum(sub_results[web]) / len(sub_results[web]) * 100, 1))
sub_failed_ids[web].append(old_task_id)
print('\n\n')
for web in sub_ids:
print(web, sorted(sub_ids[web]), len(sub_ids[web]))
is_map_task = any('map' in site for site in web)
if not is_map_task:
non_map_total += 1
if is_success:
non_map_success += 1
print("\n--- Category Statistics ---")
for web in sub_results:
total_cases = sub_counts[web]
success_cases = len(sub_ids[web])
success_rate = round(sum(sub_results[web]) / len(sub_results[web]) * 100, 1) if len(sub_results[web]) > 0 else 0.0
print(f"Category: {web}")
print(f" Total Cases: {total_cases}")
print(f" Success Rate: {success_rate}% ({success_cases}/{total_cases})")
print(f" Successful old_task_ids: {sorted(sub_ids[web])}")
print(f" Failed old_task_ids: {sorted(sub_failed_ids[web])}")
print('\n--- Overall Accuracy without Map ---')
if non_map_total > 0:
overall_acc_without_map = round(non_map_success / non_map_total * 100, 2)
print(f"Accuracy (excluding map tasks): {overall_acc_without_map}% ({non_map_success}/{non_map_total})")
else:
print("No non-map tasks found.")

View File

@ -1,8 +1,9 @@
#!/bin/bash
DATASET='webarena' # TODO: select from ['webarena', 'visualwebarena']
result_dir='./results/webrl_chat' # TODO: set your result_dir
result_dir='./results/webrl_chat_r1' # TODO: set your result_dir
provider='openai' # TODO: select from ['openai', 'finetune', ...]
model='webrl-llama-3.1-8b' # TODO: assign model name, which is used for action generation
# model='webrl-llama-3.1-8b' # TODO: assign model name, which is used for action generation
model='aiproxy/deepseek-reasoner'
planner_ip='192.168.16.116' # TODO: ip address of the model you are deploying (only if you are deploying your own model using e.g. vllm)
instruction_path='agent/prompts/jsons/p_webrl_chat.json' # e.g., agent/prompts/jsons/p_cot_id_actree_2s.json
test_config_base_dir='config_files/wa/test_webarena_lite' # e.g., config_files/wa/test_webarena_lite
@ -11,8 +12,10 @@ proxy_url='socks5://98.152.200.61:8081'
SERVER='localhost' # TODO: your server address
MAP_SERVER='https://www.openstreetmap.org' # TODO: the server address for MAP tasks
OPENAI_API_KEY='none' # TODO: if you test OpenAI APIs
OPENAI_API_URL='http://192.168.16.116:18080/v1' # TODO: if you test OpenAI APIs
# OPENAI_API_KEY='none' # TODO: if you test OpenAI APIs
# OPENAI_API_URL='http://192.168.16.116:18080/v1' # TODO: if you test OpenAI APIs
OPENAI_API_KEY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
OPENAI_API_URL="https://aiproxy.lmzgc.cn:8080/v1/"
OPENAI_API_KEY_FUZZY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
OPENAI_API_URL_FUZZY="https://aiproxy.lmzgc.cn:8080/v1/"
OPENAI_ORGANIZATION=''
@ -49,7 +52,7 @@ done
# Function to run a job
run_job() {
tmux select-pane -t $1
COMMAND="python run.py \
COMMAND="conda activate vab; python run.py \
--instruction_path ${instruction_path} \
--test_start_idx $2 \
--test_end_idx $3 \

View File

@ -0,0 +1,104 @@
#!/bin/bash
# Parallel WebArena-Lite evaluation of webrl-llama-3.1-8b in *completion* mode
# (see README: completion mode is required; chat mode degrades the model's SR).
# Splits the current tmux window into 7 panes, runs run.py over disjoint task
# index ranges in 6 of them, and keeps re-running crashed/failed slices until
# scripts/check_error_runs.py reports the result_dir is clean.
DATASET='webarena' # TODO: select from ['webarena', 'visualwebarena']
result_dir='./results/webrl_chat_completion' # TODO: set your result_dir
provider='openai' # TODO: select from ['openai', 'finetune', ...]
model='webrl-llama-3.1-8b' # TODO: assign model name, which is used for action generation
# model='aiproxy/deepseek-reasoner'
planner_ip='' # TODO: ip address of the model you are deploying (only if you are deploying your own model using e.g. vllm)
instruction_path='agent/prompts/jsons/p_webrl.json' # e.g., agent/prompts/jsons/p_cot_id_actree_2s.json
test_config_base_dir='config_files/wa/test_webarena_lite' # e.g., config_files/wa/test_webarena_lite
temperature=0.0
proxy_url='socks5://98.152.200.61:8081'
SERVER='localhost' # TODO: your server address
MAP_SERVER='https://www.openstreetmap.org' # TODO: the server address for MAP tasks
# Action-generation endpoint: the locally-deployed vllm server (see README).
OPENAI_API_KEY='none' # TODO: if you test OpenAI APIs
OPENAI_API_URL='http://192.168.16.116:18080/v1' # TODO: if you test OpenAI APIs
# OPENAI_API_KEY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
# OPENAI_API_URL="https://aiproxy.lmzgc.cn:8080/v1/"
# NOTE(review): hardcoded API credential committed to the repo — rotate this
# key and load it from the environment or a .env file instead.
OPENAI_API_KEY_FUZZY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
OPENAI_API_URL_FUZZY="https://aiproxy.lmzgc.cn:8080/v1/"
OPENAI_ORGANIZATION=''
CONDA_ENV_NAME='vab' # TODO: the name of your conda environment for testing WebArena
# Single string of exports replayed in every tmux pane so each worker shares
# the same site URLs and API configuration.
ENV_VARIABLES="export DATASET=${DATASET}; export SHOPPING='http://${SERVER}:28082';export SHOPPING_ADMIN='http://${SERVER}:28083/admin';export REDDIT='http://${SERVER}:28080';export GITLAB='http://${SERVER}:28084';export MAP='${MAP_SERVER}';export WIKIPEDIA='http://${SERVER}:28081/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing';export HOMEPAGE='http://${SERVER}:20080';export OPENAI_API_KEY=${OPENAI_API_KEY};export OPENAI_API_URL=${OPENAI_API_URL};export OPENAI_ORGANIZATION=${OPENAI_ORGANIZATION};export OPENAI_API_KEY_FUZZY=${OPENAI_API_KEY_FUZZY};export OPENAI_API_URL_FUZZY=${OPENAI_API_URL_FUZZY}"
echo $ENV_VARIABLES
# get the number of tmux panes
num_panes=$(tmux list-panes | wc -l)
# calculate how many panes need to be created
let "panes_to_create = 7 - num_panes"
# let "panes_to_create = 1 - num_panes"
# array of tmux commands to create each pane
tmux_commands=(
'tmux split-window -h'
'tmux split-window -v'
'tmux select-pane -t 0; tmux split-window -v'
'tmux split-window -v'
'tmux select-pane -t 3; tmux split-window -v'
'tmux select-pane -t 5; tmux split-window -v'
)
# create panes up to 7
for ((i=0; i<$panes_to_create; i++)); do
eval ${tmux_commands[$i]}
done
#!/bin/bash
# Function to run a job: launch run.py in pane $1 over task indices [$2, $3),
# auto-restarting on crash (`until ... do echo 'crashed' ...`).
run_job() {
tmux select-pane -t $1
COMMAND="conda activate vab; python run.py \
--instruction_path ${instruction_path} \
--test_start_idx $2 \
--test_end_idx $3 \
--result_dir ${result_dir} \
--test_config_base_dir ${test_config_base_dir} \
--provider ${provider} \
--mode completion \
--model ${model} \
--stop_token \"<|eot_id|>\" \
--max_obs_length 0 \
--max_tokens 2048 \
--viewport_width 1280 \
--viewport_height 720 \
--proxy_url ${proxy_url} \
--action_set_tag webrl_id --observation_type webrl"
tmux send-keys "tmux set mouse on; conda activate ${CONDA_ENV_NAME}; ${ENV_VARIABLES}; until ${COMMAND}; do echo 'crashed' >&2; sleep 1; done" C-m
sleep 3
}
# Max number of per-case errors tolerated by the result checker.
TOLERANCE=2
# Run one pane per consecutive index pair in the argument list, wait for all
# python processes to exit, then re-run until the checker passes.
run_batch() {
args=("$@") # save all arguments in an array
num_jobs=${#args[@]} # get number of arguments
for ((i=1; i<$num_jobs; i++)); do
run_job $i ${args[i-1]} ${args[i]}
done
# Wait for all jobs to finish
while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do
sleep 100 # wait 100 seconds before checking again
done
# Run checker
while ! python scripts/check_error_runs.py ${result_dir} --delete_errors --tolerance ${TOLERANCE}; do
echo "Check failed, rerunning jobs..."
for ((i=1; i<$num_jobs; i++)); do
run_job $i ${args[i-1]} ${args[i]}
done
# Wait for all jobs to finish
while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do
sleep 100 # wait 100 seconds before checking again
done
done
}
# 165 total tasks split into 6 slices across panes 1..6.
run_batch 0 28 56 84 112 140 165

128
analyze_case_time.py Normal file
View File

@ -0,0 +1,128 @@
"""Analyze per-test-case execution time from a merged WebArena run log.

Pairs each case's timestamped "[Config file]:" line (start) with its
"[Result] (PASS|FAIL)" line (end), records the task intent, then prints
duration statistics split by result plus a duration histogram and the
10 longest/shortest cases.
"""
import re
import datetime
import numpy as np
from collections import defaultdict, Counter

# NOTE(review): hardcoded absolute path — consider making this a CLI argument.
log_file = '/home2/yuyr/VisualAgentBench/VAB-WebArena-Lite/results/webrl_chat/tmp_merged_log.txt'

with open(log_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Collect start/end times, results, and intents for every test case.
case_times = {}    # case_id -> duration in seconds
case_results = {}  # case_id -> 'PASS' / 'FAIL'
case_intents = {}  # case_id -> intent text (truncated to 50 chars)
current_case = None
start_time = None
current_intent = None
for line in lines:
    if '[Config file]:' in line:
        # Extract the timestamp and config path; this marks a case's start.
        match = re.match(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}).+\[Config file\]: (.+)', line)
        if match:
            time_str, config_file = match.groups()
            # Case id is the config file name without the .json suffix.
            case_id = config_file.split('/')[-1].replace('.json', '')
            current_case = case_id
            start_time = datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S,%f')
    elif '[Intent]:' in line and current_case:
        # Extract the task intent for the currently open case.
        match = re.match(r'.+\[Intent\]: (.+)', line)
        if match:
            current_intent = match.group(1)
            case_intents[current_case] = current_intent[:50] + '...' if len(current_intent) > 50 else current_intent
    elif '[Result]' in line and current_case and start_time:
        # Extract the timestamp and PASS/FAIL verdict; close out the case.
        match = re.match(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}).+\[Result\] \(([A-Z]+)\)', line)
        if match:
            time_str, result = match.groups()
            end_time = datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S,%f')
            duration = (end_time - start_time).total_seconds()
            case_times[current_case] = duration
            case_results[current_case] = result
            current_case = None
            start_time = None
            current_intent = None

# Compute and print overall statistics (only if any case was parsed).
if case_times:
    total_cases = len(case_times)
    durations = list(case_times.values())
    # Basic summary statistics over all case durations.
    avg_duration = np.mean(durations)
    median_duration = np.median(durations)
    min_duration = min(durations)
    max_duration = max(durations)
    std_duration = np.std(durations)
    # Split cases by result.
    pass_cases = [case_id for case_id, result in case_results.items() if result == 'PASS']
    fail_cases = [case_id for case_id, result in case_results.items() if result == 'FAIL']
    pass_durations = [case_times[case_id] for case_id in pass_cases]
    fail_durations = [case_times[case_id] for case_id in fail_cases]
    # Duration histogram: 30-second buckets up to 240s, plus an overflow bucket.
    time_bins = [0, 30, 60, 90, 120, 150, 180, 210, 240]
    time_distribution = {f"{low}-{high}": 0 for low, high in zip(time_bins[:-1], time_bins[1:])}
    time_distribution[f"{time_bins[-1]}+秒"] = 0
    for duration in durations:
        placed = False
        for i in range(len(time_bins) - 1):
            if time_bins[i] <= duration < time_bins[i+1]:
                time_distribution[f"{time_bins[i]}-{time_bins[i+1]}"] += 1
                placed = True
                break
        if not placed:
            # Longer than the last bin edge -> overflow bucket.
            time_distribution[f"{time_bins[-1]}+秒"] += 1
    # Print the report (overall, PASS subset, FAIL subset).
    print("=" * 60)
    print(f"测试用例执行时间分析报告")
    print("=" * 60)
    print(f"\n总测试用例数: {total_cases}")
    print(f"平均执行时长: {avg_duration:.2f}秒 ({avg_duration/60:.2f}分钟)")
    print(f"中位数执行时长: {median_duration:.2f}秒 ({median_duration/60:.2f}分钟)")
    print(f"最短执行时长: {min_duration:.2f}秒 ({min_duration/60:.2f}分钟)")
    print(f"最长执行时长: {max_duration:.2f}秒 ({max_duration/60:.2f}分钟)")
    print(f"标准差: {std_duration:.2f}")
    print(f"\n成功用例数: {len(pass_cases)} ({len(pass_cases)/total_cases*100:.2f}%)")
    if pass_durations:
        print(f"成功用例平均执行时长: {np.mean(pass_durations):.2f}秒 ({np.mean(pass_durations)/60:.2f}分钟)")
        print(f"成功用例中位数执行时长: {np.median(pass_durations):.2f}秒 ({np.median(pass_durations)/60:.2f}分钟)")
    print(f"\n失败用例数: {len(fail_cases)} ({len(fail_cases)/total_cases*100:.2f}%)")
    if fail_durations:
        print(f"失败用例平均执行时长: {np.mean(fail_durations):.2f}秒 ({np.mean(fail_durations)/60:.2f}分钟)")
        print(f"失败用例中位数执行时长: {np.median(fail_durations):.2f}秒 ({np.median(fail_durations)/60:.2f}分钟)")
    # Duration distribution.
    print("\n执行时长分布:")
    for bin_name, count in time_distribution.items():
        percentage = count / total_cases * 100
        print(f"  {bin_name}: {count} 个用例 ({percentage:.2f}%)")
    # Top-10 longest and shortest cases with their intents.
    print("\n执行时长最长的10个用例:")
    for i, (case_id, duration) in enumerate(sorted(case_times.items(), key=lambda x: x[1], reverse=True)[:10], 1):
        result = case_results[case_id]
        intent = case_intents.get(case_id, "未知意图")
        print(f"{i}. 用例 {case_id}: {duration:.2f}秒 ({duration/60:.2f}分钟) - {result}")
        print(f"   意图: {intent}")
    print("\n执行时长最短的10个用例:")
    for i, (case_id, duration) in enumerate(sorted(case_times.items(), key=lambda x: x[1])[:10], 1):
        result = case_results[case_id]
        intent = case_intents.get(case_id, "未知意图")
        print(f"{i}. 用例 {case_id}: {duration:.2f}秒 ({duration/60:.2f}分钟) - {result}")
        print(f"   意图: {intent}")
else:
    print("未找到有效的测试用例数据")