# webrl/analyze_case_time.py
# (file listing metadata: 128 lines, 5.5 KiB, Python)

import datetime
import re
import sys
from collections import defaultdict, Counter

import numpy as np
# Path of the merged evaluation log to analyze. A different log can be chosen
# by passing its path as the first command-line argument; when no argument is
# given the original hard-coded location is used, so existing invocations of
# the script keep working unchanged.
DEFAULT_LOG_FILE = '/home2/yuyr/VisualAgentBench/VAB-WebArena-Lite/results/webrl_chat/tmp_merged_log.txt'
log_file = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_LOG_FILE

# Read the whole log up front as a list of lines (line endings retained);
# a missing file still raises FileNotFoundError, as before.
with open(log_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()
# Collect each case's duration, outcome, and (shortened) intent from the log.
# A case opens at a "[Config file]:" line and closes at the next "[Result]"
# line; its duration is the difference between those two lines' timestamps.
_TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S,%f'
_CONFIG_LINE = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}).+\[Config file\]: (.+)')
_INTENT_LINE = re.compile(r'.+\[Intent\]: (.+)')
_RESULT_LINE = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}).+\[Result\] \(([A-Z]+)\)')
_INTENT_PREVIEW_CHARS = 50

case_times = {}     # case id -> elapsed seconds between config and result lines
case_results = {}   # case id -> label captured from the "[Result] (...)" line
case_intents = {}   # case id -> intent text, cut to a short preview

# State of the case currently being read (all None while between cases).
current_case = None
start_time = None
current_intent = None

for line in lines:
    if '[Config file]:' in line:
        opened = _CONFIG_LINE.match(line)
        if opened is not None:
            opened_at, config_path = opened.groups()
            # The case id is the config file's base name without ".json".
            current_case = config_path.split('/')[-1].replace('.json', '')
            start_time = datetime.datetime.strptime(opened_at, _TIMESTAMP_FORMAT)
        # A config line is never also examined as an intent or result line,
        # even when its timestamp/path could not be parsed.
        continue

    if not current_case:
        # Intent and result lines only count while a case is open.
        continue

    if '[Intent]:' in line:
        described = _INTENT_LINE.match(line)
        if described is not None:
            current_intent = described.group(1)
            if len(current_intent) > _INTENT_PREVIEW_CHARS:
                case_intents[current_case] = current_intent[:_INTENT_PREVIEW_CHARS] + '...'
            else:
                case_intents[current_case] = current_intent
        continue

    if '[Result]' in line and start_time:
        closed = _RESULT_LINE.match(line)
        if closed is not None:
            closed_at, outcome = closed.groups()
            finished_at = datetime.datetime.strptime(closed_at, _TIMESTAMP_FORMAT)
            case_times[current_case] = (finished_at - start_time).total_seconds()
            case_results[current_case] = outcome
            # Close the case so later lines are ignored until the next config line.
            current_case = None
            start_time = None
            current_intent = None
# Summarize the collected durations and print the report, or print a notice
# when the log contained no complete case.
def _fmt_seconds(seconds):
    """Render a duration as seconds followed by the same value in minutes."""
    return f"{seconds:.2f}秒 ({seconds/60:.2f}分钟)"


def _print_ranking(heading, ranked):
    """Print a numbered list of (case id, duration) pairs with result and intent."""
    print(heading)
    for rank, (name, seconds) in enumerate(ranked, 1):
        outcome = case_results[name]
        intent_text = case_intents.get(name, "未知意图")
        print(f"{rank}. 用例 {name}: {_fmt_seconds(seconds)} - {outcome}")
        print(f" 意图: {intent_text}")


if not case_times:
    print("未找到有效的测试用例数据")
else:
    total_cases = len(case_times)
    durations = list(case_times.values())

    # Overall statistics (np.std is NumPy's default, i.e. population, std).
    avg_duration = np.mean(durations)
    median_duration = np.median(durations)
    min_duration = min(durations)
    max_duration = max(durations)
    std_duration = np.std(durations)

    # Split by result label; only 'PASS' and 'FAIL' are broken out.
    pass_cases = [name for name, outcome in case_results.items() if outcome == 'PASS']
    fail_cases = [name for name, outcome in case_results.items() if outcome == 'FAIL']
    pass_durations = [case_times[name] for name in pass_cases]
    fail_durations = [case_times[name] for name in fail_cases]

    # Histogram over half-open 30-second ranges [low, high); any duration
    # that fits none of them is counted in the final open-ended bucket.
    time_bins = [0, 30, 60, 90, 120, 150, 180, 210, 240]
    bin_ranges = list(zip(time_bins[:-1], time_bins[1:]))
    overflow_label = f"{time_bins[-1]}+秒"
    time_distribution = {f"{low}-{high}": 0 for low, high in bin_ranges}
    time_distribution[overflow_label] = 0
    for seconds in durations:
        for low, high in bin_ranges:
            if low <= seconds < high:
                time_distribution[f"{low}-{high}"] += 1
                break
        else:
            time_distribution[overflow_label] += 1

    # Report header and overall figures.
    banner = "=" * 60
    print(banner)
    print("测试用例执行时间分析报告")
    print(banner)
    print(f"\n总测试用例数: {total_cases}")
    print(f"平均执行时长: {_fmt_seconds(avg_duration)}")
    print(f"中位数执行时长: {_fmt_seconds(median_duration)}")
    print(f"最短执行时长: {_fmt_seconds(min_duration)}")
    print(f"最长执行时长: {_fmt_seconds(max_duration)}")
    print(f"标准差: {std_duration:.2f}")

    # Per-result figures; mean/median are skipped for an empty group.
    print(f"\n成功用例数: {len(pass_cases)} ({len(pass_cases)/total_cases*100:.2f}%)")
    if pass_durations:
        print(f"成功用例平均执行时长: {_fmt_seconds(np.mean(pass_durations))}")
        print(f"成功用例中位数执行时长: {_fmt_seconds(np.median(pass_durations))}")
    print(f"\n失败用例数: {len(fail_cases)} ({len(fail_cases)/total_cases*100:.2f}%)")
    if fail_durations:
        print(f"失败用例平均执行时长: {_fmt_seconds(np.mean(fail_durations))}")
        print(f"失败用例中位数执行时长: {_fmt_seconds(np.median(fail_durations))}")

    # Duration histogram, in bucket order.
    print("\n执行时长分布:")
    for bin_name, count in time_distribution.items():
        print(f" {bin_name}: {count} 个用例 ({count / total_cases * 100:.2f}%)")

    # Ten slowest and ten fastest cases.
    slowest_first = sorted(case_times.items(), key=lambda item: item[1], reverse=True)
    _print_ranking("\n执行时长最长的10个用例:", slowest_first[:10])
    fastest_first = sorted(case_times.items(), key=lambda item: item[1])
    _print_ranking("\n执行时长最短的10个用例:", fastest_first[:10])