Fix the webrl and orm test programs
parent 543543218c
commit e50dd34bee
VAB-WebArena-Lite/README_yuyr.md (new file, 48 lines)
@@ -0,0 +1,48 @@
# evaluation

1. Deploy vLLM on lm2 and load the webrl-llama-3.1-8b model (a quick endpoint check is sketched right after this list):

```bash
MODEL_PATH=/data1/yuyr/webrl-llama-3.1-8b

export HF_ENDPOINT=https://hf-mirror.com

PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server --served-model-name webrl-llama-3.1-8b \
    --model $MODEL_PATH \
    --gpu-memory-utilization 0.9 \
    --max-num-seqs 32 \
    --dtype half \
    --port 18080
```

2. The four sites shopping/shopping_admin/gitlab/reddit are already deployed on g14; map uses the official site (through the socks5 proxy).

3. Run `wa_parallel_run_webrl_completion.sh`. Important: it must be this completion script, not the chat one, otherwise the webrl model's SR (success rate) drops considerably.

```bash
# The vab conda environment must be created first
tmux new -s webrl

bash wa_parallel_run_webrl_completion.sh
```
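Before launching the parallel run, it can help to confirm that the vLLM endpoint from step 1 is reachable. A minimal smoke test, assuming the server is bound to the lm2 address used elsewhere in these scripts (the host below is an assumption; substitute your own):

```bash
# Host of the vLLM server from step 1 (assumed; replace with the actual lm2 address).
VLLM_HOST=192.168.16.116

# The OpenAI-compatible server should list webrl-llama-3.1-8b here ...
curl http://${VLLM_HOST}:18080/v1/models

# ... and return a short completion as a basic sanity check.
curl http://${VLLM_HOST}:18080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "webrl-llama-3.1-8b", "prompt": "Hello", "max_tokens": 8}'
```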

# orm test

1. Deploy vLLM on lm2 and load the webrl-orm-llama-3.1-8b model:

```bash
MODEL_PATH=/data1/yuyr/webrl-orm-llama-3.1-8b

export HF_ENDPOINT=https://hf-mirror.com

PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server --served-model-name webrl-orm-llama-3.1-8b \
    --model $MODEL_PATH \
    --gpu-memory-utilization 0.9 \
    --max-num-seqs 32 \
    --dtype half \
    --port 18081
```

2. Run the analysis script:

```bash
cd result

python orm_test.py --data_dir webrl_chat_completion/
```

3. Edit the .env configuration to use either the vLLM or the aiproxy API (see the sketch after this list).

4. Set the value of `model` in `orm_test.py` to select the model to evaluate.
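A minimal sketch of steps 3 and 4. The variable names below mirror the OPENAI_API_* settings used by the run scripts in this repo, but whether `orm_test.py` reads exactly these keys from .env, and how it names its `model` variable, are assumptions; adapt them to the actual code:

```bash
# .env — hypothetical keys; point them at the local vLLM ORM server (port 18081) ...
OPENAI_API_URL=http://192.168.16.116:18081/v1
OPENAI_API_KEY=none

# ... or at the aiproxy endpoint instead:
# OPENAI_API_URL=https://aiproxy.lmzgc.cn:8080/v1/
# OPENAI_API_KEY=<your aiproxy key>

# In orm_test.py, set the model to match the served / proxied model, e.g.:
#   model = "webrl-orm-llama-3.1-8b"    (or "aiproxy/deepseek-reasoner" via aiproxy)
```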
VAB-WebArena-Lite/export.txt (new file, 165 lines)
@@ -0,0 +1,165 @@
1.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
1.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
@@ -89,23 +89,57 @@ with open('./config_files/wa/test_webarena_lite.raw.json') as fp:
     configs = json.load(fp)

 sub_results = {}
 sub_ids = {}
+sub_failed_ids = {}
+sub_counts = {}
+non_map_success = 0
+non_map_total = 0
+
 for item in configs:
     web = tuple(item['sites'])
     task_id = int(item['task_id'])
     old_task_id = int(item['old_task_id'])

     if web not in sub_results:
         sub_results[web] = []
-    if web not in sub_ids:
         sub_ids[web] = []
+        sub_failed_ids[web] = []
+        sub_counts[web] = 0
+
+    sub_counts[web] += 1
+
+    is_success = False
     if task_id in all_result:
-        sub_results[web].append(all_result[task_id])
-        if all_result[task_id] == 1:
+        score = all_result[task_id]
+        sub_results[web].append(score)
+        if score >= 1.0:
             sub_ids[web].append(old_task_id)
+            is_success = True
+        else:
+            sub_failed_ids[web].append(old_task_id)
     else:
         sub_results[web].append(0)
-for web in sub_results:
-    print(web, round(sum(sub_results[web]) / len(sub_results[web]) * 100, 1))
-print('\n\n')
-for web in sub_ids:
-    print(web, sorted(sub_ids[web]), len(sub_ids[web]))
+        sub_failed_ids[web].append(old_task_id)
+
+    is_map_task = any('map' in site for site in web)
+    if not is_map_task:
+        non_map_total += 1
+        if is_success:
+            non_map_success += 1
+
+print("\n--- Category Statistics ---")
+for web in sub_results:
+    total_cases = sub_counts[web]
+    success_cases = len(sub_ids[web])
+    success_rate = round(sum(sub_results[web]) / len(sub_results[web]) * 100, 1) if len(sub_results[web]) > 0 else 0.0
+    print(f"Category: {web}")
+    print(f"  Total Cases: {total_cases}")
+    print(f"  Success Rate: {success_rate}% ({success_cases}/{total_cases})")
+    print(f"  Successful old_task_ids: {sorted(sub_ids[web])}")
+    print(f"  Failed old_task_ids: {sorted(sub_failed_ids[web])}")
+
+print('\n--- Overall Accuracy without Map ---')
+if non_map_total > 0:
+    overall_acc_without_map = round(non_map_success / non_map_total * 100, 2)
+    print(f"Accuracy (excluding map tasks): {overall_acc_without_map}% ({non_map_success}/{non_map_total})")
+else:
+    print("No non-map tasks found.")
@@ -1,8 +1,9 @@
 #!/bin/bash
 DATASET='webarena' # TODO: select from ['webarena', 'visualwebarena']
-result_dir='./results/webrl_chat' # TODO: set your result_dir
+result_dir='./results/webrl_chat_r1' # TODO: set your result_dir
 provider='openai' # TODO: select from ['openai', 'finetune', ...]
-model='webrl-llama-3.1-8b' # TODO: assign model name, which is used for action generation
+# model='webrl-llama-3.1-8b' # TODO: assign model name, which is used for action generation
+model='aiproxy/deepseek-reasoner'
 planner_ip='192.168.16.116' # TODO: ip address of the model you are deploying (only if you are deploying your own model using e.g. vllm)
 instruction_path='agent/prompts/jsons/p_webrl_chat.json' # e.g., agent/prompts/jsons/p_cot_id_actree_2s.json
 test_config_base_dir='config_files/wa/test_webarena_lite' # e.g., config_files/wa/test_webarena_lite
@@ -11,8 +12,10 @@ proxy_url='socks5://98.152.200.61:8081'

 SERVER='localhost' # TODO: your server address
 MAP_SERVER='https://www.openstreetmap.org' # TODO: the server address for MAP tasks
-OPENAI_API_KEY='none' # TODO: if you test OpenAI APIs
-OPENAI_API_URL='http://192.168.16.116:18080/v1' # TODO: if you test OpenAI APIs
+# OPENAI_API_KEY='none' # TODO: if you test OpenAI APIs
+# OPENAI_API_URL='http://192.168.16.116:18080/v1' # TODO: if you test OpenAI APIs
+OPENAI_API_KEY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
+OPENAI_API_URL="https://aiproxy.lmzgc.cn:8080/v1/"
 OPENAI_API_KEY_FUZZY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
 OPENAI_API_URL_FUZZY="https://aiproxy.lmzgc.cn:8080/v1/"
 OPENAI_ORGANIZATION=''
@@ -49,7 +52,7 @@ done
 # Function to run a job
 run_job() {
     tmux select-pane -t $1
-    COMMAND="python run.py \
+    COMMAND="conda activate vab; python run.py \
         --instruction_path ${instruction_path} \
         --test_start_idx $2 \
         --test_end_idx $3 \
VAB-WebArena-Lite/wa_parallel_run_webrl_completion.sh (new file, 104 lines)
@@ -0,0 +1,104 @@
#!/bin/bash
DATASET='webarena' # TODO: select from ['webarena', 'visualwebarena']
result_dir='./results/webrl_chat_completion' # TODO: set your result_dir
provider='openai' # TODO: select from ['openai', 'finetune', ...]
model='webrl-llama-3.1-8b' # TODO: assign model name, which is used for action generation
# model='aiproxy/deepseek-reasoner'
planner_ip='' # TODO: ip address of the model you are deploying (only if you are deploying your own model using e.g. vllm)
instruction_path='agent/prompts/jsons/p_webrl.json' # e.g., agent/prompts/jsons/p_cot_id_actree_2s.json
test_config_base_dir='config_files/wa/test_webarena_lite' # e.g., config_files/wa/test_webarena_lite
temperature=0.0
proxy_url='socks5://98.152.200.61:8081'

SERVER='localhost' # TODO: your server address
MAP_SERVER='https://www.openstreetmap.org' # TODO: the server address for MAP tasks
OPENAI_API_KEY='none' # TODO: if you test OpenAI APIs
OPENAI_API_URL='http://192.168.16.116:18080/v1' # TODO: if you test OpenAI APIs
# OPENAI_API_KEY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
# OPENAI_API_URL="https://aiproxy.lmzgc.cn:8080/v1/"
OPENAI_API_KEY_FUZZY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
OPENAI_API_URL_FUZZY="https://aiproxy.lmzgc.cn:8080/v1/"
OPENAI_ORGANIZATION=''
CONDA_ENV_NAME='vab' # TODO: the name of your conda environment for testing WebArena

ENV_VARIABLES="export DATASET=${DATASET}; export SHOPPING='http://${SERVER}:28082';export SHOPPING_ADMIN='http://${SERVER}:28083/admin';export REDDIT='http://${SERVER}:28080';export GITLAB='http://${SERVER}:28084';export MAP='${MAP_SERVER}';export WIKIPEDIA='http://${SERVER}:28081/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing';export HOMEPAGE='http://${SERVER}:20080';export OPENAI_API_KEY=${OPENAI_API_KEY};export OPENAI_API_URL=${OPENAI_API_URL};export OPENAI_ORGANIZATION=${OPENAI_ORGANIZATION};export OPENAI_API_KEY_FUZZY=${OPENAI_API_KEY_FUZZY};export OPENAI_API_URL_FUZZY=${OPENAI_API_URL_FUZZY}"
echo $ENV_VARIABLES

# get the number of tmux panes
num_panes=$(tmux list-panes | wc -l)

# calculate how many panes need to be created
let "panes_to_create = 7 - num_panes"
# let "panes_to_create = 1 - num_panes"

# array of tmux commands to create each pane
tmux_commands=(
    'tmux split-window -h'
    'tmux split-window -v'
    'tmux select-pane -t 0; tmux split-window -v'
    'tmux split-window -v'
    'tmux select-pane -t 3; tmux split-window -v'
    'tmux select-pane -t 5; tmux split-window -v'
)

# create panes up to 7
for ((i=0; i<$panes_to_create; i++)); do
    eval ${tmux_commands[$i]}
done

#!/bin/bash

# Function to run a job
run_job() {
    tmux select-pane -t $1
    COMMAND="conda activate vab; python run.py \
        --instruction_path ${instruction_path} \
        --test_start_idx $2 \
        --test_end_idx $3 \
        --result_dir ${result_dir} \
        --test_config_base_dir ${test_config_base_dir} \
        --provider ${provider} \
        --mode completion \
        --model ${model} \
        --stop_token \"<|eot_id|>\" \
        --max_obs_length 0 \
        --max_tokens 2048 \
        --viewport_width 1280 \
        --viewport_height 720 \
        --proxy_url ${proxy_url} \
        --action_set_tag webrl_id --observation_type webrl"
    tmux send-keys "tmux set mouse on; conda activate ${CONDA_ENV_NAME}; ${ENV_VARIABLES}; until ${COMMAND}; do echo 'crashed' >&2; sleep 1; done" C-m
    sleep 3
}

TOLERANCE=2
run_batch() {
    args=("$@") # save all arguments in an array
    num_jobs=${#args[@]} # get number of arguments

    for ((i=1; i<$num_jobs; i++)); do
        run_job $i ${args[i-1]} ${args[i]}
    done

    # Wait for all jobs to finish
    while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do
        sleep 100 # wait for 100 seconds before checking again
    done

    # Run checker
    while ! python scripts/check_error_runs.py ${result_dir} --delete_errors --tolerance ${TOLERANCE}; do
        echo "Check failed, rerunning jobs..."
        for ((i=1; i<$num_jobs; i++)); do
            run_job $i ${args[i-1]} ${args[i]}
        done

        # Wait for all jobs to finish
        while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do
            sleep 100 # wait for 100 seconds before checking again
        done
    done
}
run_batch 0 28 56 84 112 140 165
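The closing `run_batch 0 28 56 84 112 140 165` call splits the task indices into six contiguous slices, one per tmux pane (pane 1 runs [0, 28), pane 2 runs [28, 56), and so on up to 165). A single slice can also be re-run by hand with the same helper; a sketch, assuming the script's functions and variables are already defined in the current shell:

```bash
# Sketch: re-run only tasks [84, 112) in pane 4, e.g. after a crash.
# Assumes run_job and the variables above are already sourced in this shell.
run_job 4 84 112
```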
analyze_case_time.py (new file, 128 lines)
@@ -0,0 +1,128 @@
import re
import datetime
import numpy as np
from collections import defaultdict, Counter

log_file = '/home2/yuyr/VisualAgentBench/VAB-WebArena-Lite/results/webrl_chat/tmp_merged_log.txt'
with open(log_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Collect the start and end times of every test case
case_times = {}
case_results = {}
case_intents = {}
current_case = None
start_time = None
current_intent = None

for line in lines:
    if '[Config file]:' in line:
        # Extract the timestamp and case ID from the line
        match = re.match(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}).+\[Config file\]: (.+)', line)
        if match:
            time_str, config_file = match.groups()
            # Extract the case ID
            case_id = config_file.split('/')[-1].replace('.json', '')
            current_case = case_id
            start_time = datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S,%f')

    elif '[Intent]:' in line and current_case:
        # Extract the intent
        match = re.match(r'.+\[Intent\]: (.+)', line)
        if match:
            current_intent = match.group(1)
            case_intents[current_case] = current_intent[:50] + '...' if len(current_intent) > 50 else current_intent

    elif '[Result]' in line and current_case and start_time:
        # Extract the timestamp and result from the line
        match = re.match(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}).+\[Result\] \(([A-Z]+)\)', line)
        if match:
            time_str, result = match.groups()
            end_time = datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S,%f')
            duration = (end_time - start_time).total_seconds()

            case_times[current_case] = duration
            case_results[current_case] = result

            current_case = None
            start_time = None
            current_intent = None

# Compute overall statistics
if case_times:
    total_cases = len(case_times)
    durations = list(case_times.values())

    # Basic statistics
    avg_duration = np.mean(durations)
    median_duration = np.median(durations)
    min_duration = min(durations)
    max_duration = max(durations)
    std_duration = np.std(durations)

    # Group cases by result
    pass_cases = [case_id for case_id, result in case_results.items() if result == 'PASS']
    fail_cases = [case_id for case_id, result in case_results.items() if result == 'FAIL']

    pass_durations = [case_times[case_id] for case_id in pass_cases]
    fail_durations = [case_times[case_id] for case_id in fail_cases]

    # Duration distribution
    time_bins = [0, 30, 60, 90, 120, 150, 180, 210, 240]
    time_distribution = {f"{low}-{high}s": 0 for low, high in zip(time_bins[:-1], time_bins[1:])}
    time_distribution[f"{time_bins[-1]}+s"] = 0

    for duration in durations:
        placed = False
        for i in range(len(time_bins) - 1):
            if time_bins[i] <= duration < time_bins[i+1]:
                time_distribution[f"{time_bins[i]}-{time_bins[i+1]}s"] += 1
                placed = True
                break
        if not placed:
            time_distribution[f"{time_bins[-1]}+s"] += 1

    # Print the report
    print("=" * 60)
    print(f"Test case execution time analysis report")
    print("=" * 60)

    print(f"\nTotal test cases: {total_cases}")
    print(f"Average duration: {avg_duration:.2f}s ({avg_duration/60:.2f} min)")
    print(f"Median duration: {median_duration:.2f}s ({median_duration/60:.2f} min)")
    print(f"Minimum duration: {min_duration:.2f}s ({min_duration/60:.2f} min)")
    print(f"Maximum duration: {max_duration:.2f}s ({max_duration/60:.2f} min)")
    print(f"Standard deviation: {std_duration:.2f}s")

    print(f"\nPassed cases: {len(pass_cases)} ({len(pass_cases)/total_cases*100:.2f}%)")
    if pass_durations:
        print(f"Average duration of passed cases: {np.mean(pass_durations):.2f}s ({np.mean(pass_durations)/60:.2f} min)")
        print(f"Median duration of passed cases: {np.median(pass_durations):.2f}s ({np.median(pass_durations)/60:.2f} min)")

    print(f"\nFailed cases: {len(fail_cases)} ({len(fail_cases)/total_cases*100:.2f}%)")
    if fail_durations:
        print(f"Average duration of failed cases: {np.mean(fail_durations):.2f}s ({np.mean(fail_durations)/60:.2f} min)")
        print(f"Median duration of failed cases: {np.median(fail_durations):.2f}s ({np.median(fail_durations)/60:.2f} min)")

    # Duration distribution
    print("\nDuration distribution:")
    for bin_name, count in time_distribution.items():
        percentage = count / total_cases * 100
        print(f"  {bin_name}: {count} cases ({percentage:.2f}%)")

    # Print the 10 longest and 10 shortest cases
    print("\n10 longest-running cases:")
    for i, (case_id, duration) in enumerate(sorted(case_times.items(), key=lambda x: x[1], reverse=True)[:10], 1):
        result = case_results[case_id]
        intent = case_intents.get(case_id, "unknown intent")
        print(f"{i}. Case {case_id}: {duration:.2f}s ({duration/60:.2f} min) - {result}")
        print(f"   Intent: {intent}")

    print("\n10 shortest-running cases:")
    for i, (case_id, duration) in enumerate(sorted(case_times.items(), key=lambda x: x[1])[:10], 1):
        result = case_results[case_id]
        intent = case_intents.get(case_id, "unknown intent")
        print(f"{i}. Case {case_id}: {duration:.2f}s ({duration/60:.2f} min) - {result}")
        print(f"   Intent: {intent}")
else:
    print("No valid test case data found")
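A usage sketch for the analyzer above. The merged-log path is the one hard-coded in the script; how the per-run logs get concatenated into it is an assumption about the result layout, so adjust the glob to whatever log files your run actually produced:

```bash
# Assumed log layout: merge the run logs into the file analyze_case_time.py expects.
cd /home2/yuyr/VisualAgentBench
cat VAB-WebArena-Lite/results/webrl_chat/log*.txt \
    > VAB-WebArena-Lite/results/webrl_chat/tmp_merged_log.txt   # glob is an assumption
python analyze_case_time.py
```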