修复测试webrl和orm程序
This commit is contained in:
parent
543543218c
commit
e50dd34bee
48
VAB-WebArena-Lite/README_yuyr.md
Normal file
48
VAB-WebArena-Lite/README_yuyr.md
Normal file
|
@ -0,0 +1,48 @@
|
|||
|
||||
# evaluation
|
||||
1. 在lm2上部署vllm,加载webrl-llama-3.1-8b模型
|
||||
```bash
|
||||
MODEL_PATH=/data1/yuyr/webrl-llama-3.1-8b
|
||||
|
||||
export HF_ENDPOINT=https://hf-mirror.com
|
||||
|
||||
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server --served-model-name webrl-llama-3.1-8b \
|
||||
--model $MODEL_PATH \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--max-num-seqs 32 \
|
||||
--dtype half \
|
||||
--port 18080
|
||||
```
|
||||
2. g14上已经部署好shopping/shopping_admin/gitlab/reddit四个网站,map使用官网(使用socks5代理);
|
||||
3. 运行`wa_parallel_run_webrl_completion.sh`,注意!一定是这个completion,不要chat,否则webrl模型SR会降低很多。
|
||||
```bash
|
||||
# 需要先建好vab conda环境
|
||||
tmux new -t webrl
|
||||
|
||||
bash wa_parallel_run_webrl_completion.sh
|
||||
```
|
||||
|
||||
# orm 测试
|
||||
1. lm2上部署vllm,加载webrl-orm-llama-3.1-8b模型
|
||||
```bash
|
||||
MODEL_PATH=/data1/yuyr/webrl-orm-llama-3.1-8b
|
||||
|
||||
export HF_ENDPOINT=https://hf-mirror.com
|
||||
|
||||
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server --served-model-name webrl-orm-llama-3.1-8b \
|
||||
--model $MODEL_PATH \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--max-num-seqs 32 \
|
||||
--dtype half \
|
||||
--port 18081
|
||||
```
|
||||
2. 执行分析脚本
|
||||
```bash
|
||||
|
||||
cd result
|
||||
|
||||
python orm_test.py --data_dir webrl_chat_completion/
|
||||
|
||||
```
|
||||
3. 修改.env 配置使用vllm或者aiproxy api
|
||||
4. 修改`orm_test.py`中model的值,指定模型
|
165
VAB-WebArena-Lite/export.txt
Normal file
165
VAB-WebArena-Lite/export.txt
Normal file
|
@ -0,0 +1,165 @@
|
|||
1.0
|
||||
0.0
|
||||
1.0
|
||||
0.0
|
||||
1.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
1.0
|
||||
1.0
|
||||
0.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
1.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
1.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
1.0
|
||||
1.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
0.0
|
||||
1.0
|
||||
1.0
|
||||
1.0
|
||||
0.0
|
||||
1.0
|
||||
1.0
|
||||
1.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
1.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
0.0
|
||||
1.0
|
||||
0.0
|
||||
1.0
|
||||
0.0
|
||||
1.0
|
||||
0.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
0.0
|
||||
1.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
1.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
0.0
|
||||
1.0
|
||||
1.0
|
||||
1.0
|
|
@ -89,23 +89,57 @@ with open('./config_files/wa/test_webarena_lite.raw.json') as fp:
|
|||
configs = json.load(fp)
|
||||
sub_results = {}
|
||||
sub_ids = {}
|
||||
sub_failed_ids = {}
|
||||
sub_counts = {}
|
||||
non_map_success = 0
|
||||
non_map_total = 0
|
||||
|
||||
for item in configs:
|
||||
web = tuple(item['sites'])
|
||||
task_id = int(item['task_id'])
|
||||
old_task_id = int(item['old_task_id'])
|
||||
|
||||
if web not in sub_results:
|
||||
sub_results[web] = []
|
||||
if web not in sub_ids:
|
||||
sub_ids[web] = []
|
||||
sub_failed_ids[web] = []
|
||||
sub_counts[web] = 0
|
||||
|
||||
sub_counts[web] += 1
|
||||
|
||||
is_success = False
|
||||
if task_id in all_result:
|
||||
sub_results[web].append(all_result[task_id])
|
||||
if all_result[task_id] == 1:
|
||||
score = all_result[task_id]
|
||||
sub_results[web].append(score)
|
||||
if score >= 1.0:
|
||||
sub_ids[web].append(old_task_id)
|
||||
is_success = True
|
||||
else:
|
||||
sub_failed_ids[web].append(old_task_id)
|
||||
else:
|
||||
sub_results[web].append(0)
|
||||
for web in sub_results:
|
||||
print(web, round(sum(sub_results[web]) / len(sub_results[web]) * 100, 1))
|
||||
sub_failed_ids[web].append(old_task_id)
|
||||
|
||||
print('\n\n')
|
||||
for web in sub_ids:
|
||||
print(web, sorted(sub_ids[web]), len(sub_ids[web]))
|
||||
is_map_task = any('map' in site for site in web)
|
||||
if not is_map_task:
|
||||
non_map_total += 1
|
||||
if is_success:
|
||||
non_map_success += 1
|
||||
|
||||
print("\n--- Category Statistics ---")
|
||||
for web in sub_results:
|
||||
total_cases = sub_counts[web]
|
||||
success_cases = len(sub_ids[web])
|
||||
success_rate = round(sum(sub_results[web]) / len(sub_results[web]) * 100, 1) if len(sub_results[web]) > 0 else 0.0
|
||||
print(f"Category: {web}")
|
||||
print(f" Total Cases: {total_cases}")
|
||||
print(f" Success Rate: {success_rate}% ({success_cases}/{total_cases})")
|
||||
print(f" Successful old_task_ids: {sorted(sub_ids[web])}")
|
||||
print(f" Failed old_task_ids: {sorted(sub_failed_ids[web])}")
|
||||
|
||||
print('\n--- Overall Accuracy without Map ---')
|
||||
if non_map_total > 0:
|
||||
overall_acc_without_map = round(non_map_success / non_map_total * 100, 2)
|
||||
print(f"Accuracy (excluding map tasks): {overall_acc_without_map}% ({non_map_success}/{non_map_total})")
|
||||
else:
|
||||
print("No non-map tasks found.")
|
|
@ -1,8 +1,9 @@
|
|||
#!/bin/bash
|
||||
DATASET='webarena' # TODO: select from ['webarena', 'visualwebarena']
|
||||
result_dir='./results/webrl_chat' # TODO: set your result_dir
|
||||
result_dir='./results/webrl_chat_r1' # TODO: set your result_dir
|
||||
provider='openai' # TODO: select from ['openai', 'finetune', ...]
|
||||
model='webrl-llama-3.1-8b' # TODO: assign model name, which is used for action generation
|
||||
# model='webrl-llama-3.1-8b' # TODO: assign model name, which is used for action generation
|
||||
model='aiproxy/deepseek-reasoner'
|
||||
planner_ip='192.168.16.116' # TODO: ip address of the model you are deploying (only if you are deploying your own model using e.g. vllm)
|
||||
instruction_path='agent/prompts/jsons/p_webrl_chat.json' # e.g., agent/prompts/jsons/p_cot_id_actree_2s.json
|
||||
test_config_base_dir='config_files/wa/test_webarena_lite' # e.g., config_files/wa/test_webarena_lite
|
||||
|
@ -11,8 +12,10 @@ proxy_url='socks5://98.152.200.61:8081'
|
|||
|
||||
SERVER='localhost' # TODO: your server address
|
||||
MAP_SERVER='https://www.openstreetmap.org' # TODO: the server address for MAP tasks
|
||||
OPENAI_API_KEY='none' # TODO: if you test OpenAI APIs
|
||||
OPENAI_API_URL='http://192.168.16.116:18080/v1' # TODO: if you test OpenAI APIs
|
||||
# OPENAI_API_KEY='none' # TODO: if you test OpenAI APIs
|
||||
# OPENAI_API_URL='http://192.168.16.116:18080/v1' # TODO: if you test OpenAI APIs
|
||||
OPENAI_API_KEY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
|
||||
OPENAI_API_URL="https://aiproxy.lmzgc.cn:8080/v1/"
|
||||
OPENAI_API_KEY_FUZZY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
|
||||
OPENAI_API_URL_FUZZY="https://aiproxy.lmzgc.cn:8080/v1/"
|
||||
OPENAI_ORGANIZATION=''
|
||||
|
@ -49,7 +52,7 @@ done
|
|||
# Function to run a job
|
||||
run_job() {
|
||||
tmux select-pane -t $1
|
||||
COMMAND="python run.py \
|
||||
COMMAND="conda activate vab; python run.py \
|
||||
--instruction_path ${instruction_path} \
|
||||
--test_start_idx $2 \
|
||||
--test_end_idx $3 \
|
||||
|
|
104
VAB-WebArena-Lite/wa_parallel_run_webrl_completion.sh
Normal file
104
VAB-WebArena-Lite/wa_parallel_run_webrl_completion.sh
Normal file
|
@ -0,0 +1,104 @@
|
|||
#!/bin/bash
# Parallel WebArena-Lite evaluation driver for the WebRL model in *completion*
# mode (completion, not chat, is required — chat mode lowers WebRL's SR).
# Splits the 165 test cases across 7 tmux panes, restarts crashed runs, and
# loops until scripts/check_error_runs.py reports a clean result set.

DATASET='webarena' # TODO: select from ['webarena', 'visualwebarena']
result_dir='./results/webrl_chat_completion' # TODO: set your result_dir
provider='openai' # TODO: select from ['openai', 'finetune', ...]
model='webrl-llama-3.1-8b' # TODO: assign model name, which is used for action generation
# model='aiproxy/deepseek-reasoner'
planner_ip='' # TODO: ip address of the model you are deploying (only if you are deploying your own model using e.g. vllm)
instruction_path='agent/prompts/jsons/p_webrl.json' # e.g., agent/prompts/jsons/p_cot_id_actree_2s.json
test_config_base_dir='config_files/wa/test_webarena_lite' # e.g., config_files/wa/test_webarena_lite
temperature=0.0
proxy_url='socks5://98.152.200.61:8081'

SERVER='localhost' # TODO: your server address
MAP_SERVER='https://www.openstreetmap.org' # TODO: the server address for MAP tasks
OPENAI_API_KEY='none' # TODO: if you test OpenAI APIs
OPENAI_API_URL='http://192.168.16.116:18080/v1' # TODO: if you test OpenAI APIs
# OPENAI_API_KEY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
# OPENAI_API_URL="https://aiproxy.lmzgc.cn:8080/v1/"
# SECURITY(review): hard-coded API credential committed to the repo — rotate this
# key and load it from the environment or a git-ignored .env file instead.
OPENAI_API_KEY_FUZZY="sk-uG8JLGz3wlXTAmiC8337A02f5d2946F2Ba3dE64427B90c2f"
OPENAI_API_URL_FUZZY="https://aiproxy.lmzgc.cn:8080/v1/"
OPENAI_ORGANIZATION=''
CONDA_ENV_NAME='vab' # TODO: the name of your conda environment for testing WebArena


# Environment exported into every tmux pane before run.py starts.
ENV_VARIABLES="export DATASET=${DATASET}; export SHOPPING='http://${SERVER}:28082';export SHOPPING_ADMIN='http://${SERVER}:28083/admin';export REDDIT='http://${SERVER}:28080';export GITLAB='http://${SERVER}:28084';export MAP='${MAP_SERVER}';export WIKIPEDIA='http://${SERVER}:28081/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing';export HOMEPAGE='http://${SERVER}:20080';export OPENAI_API_KEY=${OPENAI_API_KEY};export OPENAI_API_URL=${OPENAI_API_URL};export OPENAI_ORGANIZATION=${OPENAI_ORGANIZATION};export OPENAI_API_KEY_FUZZY=${OPENAI_API_KEY_FUZZY};export OPENAI_API_URL_FUZZY=${OPENAI_API_URL_FUZZY}"
# Quoted so the string is printed verbatim (no word splitting / globbing).
echo "$ENV_VARIABLES"

# get the number of tmux panes
num_panes=$(tmux list-panes | wc -l)

# calculate how many panes need to be created
let "panes_to_create = 7 - num_panes"
# let "panes_to_create = 1 - num_panes"

# array of tmux commands to create each pane
tmux_commands=(
    'tmux split-window -h'
    'tmux split-window -v'
    'tmux select-pane -t 0; tmux split-window -v'
    'tmux split-window -v'
    'tmux select-pane -t 3; tmux split-window -v'
    'tmux select-pane -t 5; tmux split-window -v'
)

# create panes up to 7
for ((i=0; i<$panes_to_create; i++)); do
    eval ${tmux_commands[$i]}
done

# Run one evaluation job: pane index $1, test case range [$2, $3).
# The `until` loop restarts run.py automatically if it crashes.
run_job() {
    tmux select-pane -t $1
    COMMAND="conda activate vab; python run.py \
    --instruction_path ${instruction_path} \
    --test_start_idx $2 \
    --test_end_idx $3 \
    --result_dir ${result_dir} \
    --test_config_base_dir ${test_config_base_dir} \
    --provider ${provider} \
    --mode completion \
    --model ${model} \
    --stop_token \"<|eot_id|>\" \
    --max_obs_length 0 \
    --max_tokens 2048 \
    --viewport_width 1280 \
    --viewport_height 720 \
    --proxy_url ${proxy_url} \
    --action_set_tag webrl_id --observation_type webrl"
    tmux send-keys "tmux set mouse on; conda activate ${CONDA_ENV_NAME}; ${ENV_VARIABLES}; until ${COMMAND}; do echo 'crashed' >&2; sleep 1; done" C-m
    sleep 3
}

TOLERANCE=2
# Launch one job per index-range pair, wait for completion, then rerun all
# jobs until the error checker passes.
run_batch() {
    args=("$@") # save all arguments in an array
    num_jobs=${#args[@]} # get number of arguments

    for ((i=1; i<$num_jobs; i++)); do
        run_job $i ${args[i-1]} ${args[i]}
    done

    # Wait for all jobs to finish
    while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do
        sleep 100 # wait 100 seconds before checking again
    done

    # Run checker; rerun every job until it reports success
    while ! python scripts/check_error_runs.py ${result_dir} --delete_errors --tolerance ${TOLERANCE}; do
        echo "Check failed, rerunning jobs..."
        for ((i=1; i<$num_jobs; i++)); do
            run_job $i ${args[i-1]} ${args[i]}
        done

        # Wait for all jobs to finish
        while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do
            sleep 100 # wait 100 seconds before checking again
        done
    done

}
run_batch 0 28 56 84 112 140 165
|
||||
|
128
analyze_case_time.py
Normal file
128
analyze_case_time.py
Normal file
|
@ -0,0 +1,128 @@
|
|||
"""Analyze per-test-case execution time from a merged WebArena-Lite run log.

Pairs every ``[Config file]:`` line (case start) with the following
``[Result] (...)`` line (case end), then prints duration statistics overall
and split by PASS/FAIL, a duration histogram, and the 10 slowest/fastest
cases with their (truncated) intents.
"""
import re
import datetime
import numpy as np
# NOTE(review): defaultdict and Counter are currently unused in this script.
from collections import defaultdict, Counter

log_file = '/home2/yuyr/VisualAgentBench/VAB-WebArena-Lite/results/webrl_chat/tmp_merged_log.txt'
with open(log_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Patterns compiled once instead of re-parsed on every log line.
_CONFIG_RE = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}).+\[Config file\]: (.+)')
_INTENT_RE = re.compile(r'.+\[Intent\]: (.+)')
_RESULT_RE = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}).+\[Result\] \(([A-Z]+)\)')
_TIME_FMT = '%Y-%m-%d %H:%M:%S,%f'

# Collected per case: duration in seconds, PASS/FAIL result, display intent.
case_times = {}
case_results = {}
case_intents = {}
current_case = None
start_time = None
current_intent = None

for line in lines:
    if '[Config file]:' in line:
        # Extract timestamp and config path; the case id is the config file
        # name without its .json suffix.
        match = _CONFIG_RE.match(line)
        if match:
            time_str, config_file = match.groups()
            current_case = config_file.split('/')[-1].replace('.json', '')
            start_time = datetime.datetime.strptime(time_str, _TIME_FMT)

    elif '[Intent]:' in line and current_case:
        # Remember the intent, truncated to 50 chars for display.
        match = _INTENT_RE.match(line)
        if match:
            current_intent = match.group(1)
            case_intents[current_case] = (
                current_intent[:50] + '...' if len(current_intent) > 50 else current_intent
            )

    elif '[Result]' in line and current_case and start_time:
        # Case finished: record its duration and result, then reset state.
        match = _RESULT_RE.match(line)
        if match:
            time_str, result = match.groups()
            end_time = datetime.datetime.strptime(time_str, _TIME_FMT)
            case_times[current_case] = (end_time - start_time).total_seconds()
            case_results[current_case] = result

            current_case = None
            start_time = None
            current_intent = None

# Report only when at least one complete case was parsed.
if case_times:
    total_cases = len(case_times)
    durations = list(case_times.values())

    # Overall statistics.
    avg_duration = np.mean(durations)
    median_duration = np.median(durations)
    min_duration = min(durations)
    max_duration = max(durations)
    std_duration = np.std(durations)

    # Split by result.
    pass_cases = [case_id for case_id, result in case_results.items() if result == 'PASS']
    fail_cases = [case_id for case_id, result in case_results.items() if result == 'FAIL']

    pass_durations = [case_times[case_id] for case_id in pass_cases]
    fail_durations = [case_times[case_id] for case_id in fail_cases]

    # Duration histogram: fixed 30-second bins plus an open-ended last bucket.
    time_bins = [0, 30, 60, 90, 120, 150, 180, 210, 240]
    time_distribution = {f"{low}-{high}秒": 0 for low, high in zip(time_bins[:-1], time_bins[1:])}
    time_distribution[f"{time_bins[-1]}+秒"] = 0

    for duration in durations:
        for low, high in zip(time_bins[:-1], time_bins[1:]):
            if low <= duration < high:
                time_distribution[f"{low}-{high}秒"] += 1
                break
        else:
            # No bin matched: past the last edge (or out of range).
            time_distribution[f"{time_bins[-1]}+秒"] += 1

    # Print the report.
    print("=" * 60)
    print("测试用例执行时间分析报告")
    print("=" * 60)

    print(f"\n总测试用例数: {total_cases}")
    print(f"平均执行时长: {avg_duration:.2f}秒 ({avg_duration/60:.2f}分钟)")
    print(f"中位数执行时长: {median_duration:.2f}秒 ({median_duration/60:.2f}分钟)")
    print(f"最短执行时长: {min_duration:.2f}秒 ({min_duration/60:.2f}分钟)")
    print(f"最长执行时长: {max_duration:.2f}秒 ({max_duration/60:.2f}分钟)")
    print(f"标准差: {std_duration:.2f}秒")

    print(f"\n成功用例数: {len(pass_cases)} ({len(pass_cases)/total_cases*100:.2f}%)")
    if pass_durations:
        print(f"成功用例平均执行时长: {np.mean(pass_durations):.2f}秒 ({np.mean(pass_durations)/60:.2f}分钟)")
        print(f"成功用例中位数执行时长: {np.median(pass_durations):.2f}秒 ({np.median(pass_durations)/60:.2f}分钟)")

    print(f"\n失败用例数: {len(fail_cases)} ({len(fail_cases)/total_cases*100:.2f}%)")
    if fail_durations:
        print(f"失败用例平均执行时长: {np.mean(fail_durations):.2f}秒 ({np.mean(fail_durations)/60:.2f}分钟)")
        print(f"失败用例中位数执行时长: {np.median(fail_durations):.2f}秒 ({np.median(fail_durations)/60:.2f}分钟)")

    # Histogram.
    print("\n执行时长分布:")
    for bin_name, count in time_distribution.items():
        percentage = count / total_cases * 100
        print(f" {bin_name}: {count} 个用例 ({percentage:.2f}%)")

    # Ten slowest cases.
    print("\n执行时长最长的10个用例:")
    for i, (case_id, duration) in enumerate(sorted(case_times.items(), key=lambda x: x[1], reverse=True)[:10], 1):
        result = case_results[case_id]
        intent = case_intents.get(case_id, "未知意图")
        print(f"{i}. 用例 {case_id}: {duration:.2f}秒 ({duration/60:.2f}分钟) - {result}")
        print(f" 意图: {intent}")

    # Ten fastest cases.
    print("\n执行时长最短的10个用例:")
    for i, (case_id, duration) in enumerate(sorted(case_times.items(), key=lambda x: x[1])[:10], 1):
        result = case_results[case_id]
        intent = case_intents.get(case_id, "未知意图")
        print(f"{i}. 用例 {case_id}: {duration:.2f}秒 ({duration/60:.2f}分钟) - {result}")
        print(f" 意图: {intent}")
else:
    print("未找到有效的测试用例数据")
|
Loading…
Reference in New Issue
Block a user