128 lines
5.5 KiB
Python
128 lines
5.5 KiB
Python
import datetime
import re
import sys
from collections import Counter, defaultdict

import numpy as np

# ---------------------------------------------------------------------------
# Test-case execution-time report.
#
# Scans a run log for, per test case, a "[Config file]:" line (case start),
# an optional "[Intent]:" line and a "[Result] (XXX)" line (case end), then
# prints duration statistics for all cases that have both a start and an end.
#
# Usage: python <this script> [path/to/log.txt]
# When no path is given, DEFAULT_LOG_FILE is used.
# ---------------------------------------------------------------------------

DEFAULT_LOG_FILE = '/home2/yuyr/VisualAgentBench/VAB-WebArena-Lite/results/webrl_chat/tmp_merged_log.txt'

# Timestamp layout at the start of each relevant log line, e.g.
# "2024-01-01 10:00:00,123" (the part after the comma is milliseconds).
TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S,%f'

# Patterns are compiled once here instead of being rebuilt for every line.
CONFIG_RE = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}).+\[Config file\]: (.+)')
INTENT_RE = re.compile(r'.+\[Intent\]: (.+)')
RESULT_RE = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}).+\[Result\] \(([A-Z]+)\)')

# Edges (in seconds) of the duration histogram; the last edge opens the
# unbounded "N+" bucket.
TIME_BINS = (0, 30, 60, 90, 120, 150, 180, 210, 240)

# Intents longer than this many characters are truncated and suffixed "...".
INTENT_PREVIEW_LEN = 50

# How many slowest / fastest cases are listed at the end of the report.
TOP_N = 10


def parse_log(lines):
    """Extract per-case duration, result and intent from log lines.

    Args:
        lines: Iterable of log lines (an open text file works and is
            consumed lazily).

    Returns:
        A tuple ``(case_times, case_results, case_intents)`` of dicts keyed
        by case ID (the config file's base name without ``.json``):
        duration in seconds, result label (e.g. ``'PASS'``), and the
        possibly truncated intent text. A case only appears in
        ``case_times``/``case_results`` once its result line has been seen;
        a repeated case ID overwrites the earlier entry.
    """
    case_times = {}
    case_results = {}
    case_intents = {}
    current_case = None
    start_time = None

    for line in lines:
        if '[Config file]:' in line:
            match = CONFIG_RE.match(line)
            if match:
                time_str, config_file = match.groups()
                # Case ID is the file name without directory or extension.
                current_case = config_file.split('/')[-1].replace('.json', '')
                start_time = datetime.datetime.strptime(time_str, TIMESTAMP_FORMAT)

        elif '[Intent]:' in line and current_case:
            match = INTENT_RE.match(line)
            if match:
                intent = match.group(1)
                if len(intent) > INTENT_PREVIEW_LEN:
                    intent = intent[:INTENT_PREVIEW_LEN] + '...'
                case_intents[current_case] = intent

        elif '[Result]' in line and current_case and start_time:
            match = RESULT_RE.match(line)
            if match:
                time_str, result = match.groups()
                end_time = datetime.datetime.strptime(time_str, TIMESTAMP_FORMAT)
                case_times[current_case] = (end_time - start_time).total_seconds()
                case_results[current_case] = result
                # Close the case so stray lines are not attributed to it.
                current_case = None
                start_time = None

    return case_times, case_results, case_intents


def bucket_durations(durations, bins=TIME_BINS):
    """Count durations per histogram bucket.

    Args:
        durations: Iterable of durations in seconds.
        bins: Ascending bucket edges. Each consecutive pair forms a
            half-open bucket ``[low, high)``.

    Returns:
        Dict mapping bucket label to count, in bucket order, ending with
        the open-ended ``"<last edge>+秒"`` bucket. Any value that fits no
        bounded bucket is counted in that last bucket.
    """
    edges = list(zip(bins[:-1], bins[1:]))
    overflow_label = f"{bins[-1]}+秒"

    distribution = {f"{low}-{high}秒": 0 for low, high in edges}
    distribution[overflow_label] = 0

    for duration in durations:
        for low, high in edges:
            if low <= duration < high:
                distribution[f"{low}-{high}秒"] += 1
                break
        else:
            distribution[overflow_label] += 1

    return distribution


def print_outcome_stats(label, durations, total_cases):
    """Print count, share and mean/median duration for one outcome group."""
    print(f"\n{label}用例数: {len(durations)} ({len(durations)/total_cases*100:.2f}%)")
    if durations:
        mean = np.mean(durations)
        median = np.median(durations)
        print(f"{label}用例平均执行时长: {mean:.2f}秒 ({mean/60:.2f}分钟)")
        print(f"{label}用例中位数执行时长: {median:.2f}秒 ({median/60:.2f}分钟)")


def print_ranked(title, ranked, case_results, case_intents):
    """Print a numbered list of ``(case_id, duration)`` pairs under a title."""
    print(title)
    for rank, (case_id, duration) in enumerate(ranked, 1):
        result = case_results[case_id]
        intent = case_intents.get(case_id, "未知意图")
        print(f"{rank}. 用例 {case_id}: {duration:.2f}秒 ({duration/60:.2f}分钟) - {result}")
        print(f" 意图: {intent}")


def print_report(case_times, case_results, case_intents):
    """Print the full timing report for a non-empty set of parsed cases.

    Args:
        case_times: Case ID -> duration in seconds (must be non-empty).
        case_results: Case ID -> result label.
        case_intents: Case ID -> intent text (may lack some case IDs).
    """
    total_cases = len(case_times)
    durations = list(case_times.values())

    avg_duration = np.mean(durations)
    median_duration = np.median(durations)
    min_duration = min(durations)
    max_duration = max(durations)
    std_duration = np.std(durations)

    pass_durations = [case_times[cid] for cid, res in case_results.items() if res == 'PASS']
    fail_durations = [case_times[cid] for cid, res in case_results.items() if res == 'FAIL']
    # The result pattern accepts any upper-case label, so keep track of
    # labels other than PASS/FAIL instead of silently dropping them.
    other_results = Counter(
        res for res in case_results.values() if res not in ('PASS', 'FAIL')
    )

    print("=" * 60)
    print("测试用例执行时间分析报告")
    print("=" * 60)

    print(f"\n总测试用例数: {total_cases}")
    print(f"平均执行时长: {avg_duration:.2f}秒 ({avg_duration/60:.2f}分钟)")
    print(f"中位数执行时长: {median_duration:.2f}秒 ({median_duration/60:.2f}分钟)")
    print(f"最短执行时长: {min_duration:.2f}秒 ({min_duration/60:.2f}分钟)")
    print(f"最长执行时长: {max_duration:.2f}秒 ({max_duration/60:.2f}分钟)")
    print(f"标准差: {std_duration:.2f}秒")

    print_outcome_stats("成功", pass_durations, total_cases)
    print_outcome_stats("失败", fail_durations, total_cases)

    # Only emitted when such results exist, so PASS/FAIL-only logs produce
    # exactly the same report as before.
    for result, count in other_results.items():
        print(f"\n其他结果 {result} 用例数: {count} ({count/total_cases*100:.2f}%)")

    print("\n执行时长分布:")
    for bin_name, count in bucket_durations(durations).items():
        percentage = count / total_cases * 100
        print(f" {bin_name}: {count} 个用例 ({percentage:.2f}%)")

    slowest = sorted(case_times.items(), key=lambda item: item[1], reverse=True)[:TOP_N]
    fastest = sorted(case_times.items(), key=lambda item: item[1])[:TOP_N]
    print_ranked("\n执行时长最长的10个用例:", slowest, case_results, case_intents)
    print_ranked("\n执行时长最短的10个用例:", fastest, case_results, case_intents)


def main(argv=None):
    """Parse the log file and print the report.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument, if present, is the log
            file path; otherwise ``DEFAULT_LOG_FILE`` is read.

    Raises:
        OSError: If the log file cannot be opened.
    """
    args = sys.argv[1:] if argv is None else argv
    log_file = args[0] if args else DEFAULT_LOG_FILE

    # Iterate the file object directly so the log is streamed rather than
    # loaded into memory as a whole.
    with open(log_file, 'r', encoding='utf-8') as f:
        case_times, case_results, case_intents = parse_log(f)

    if case_times:
        print_report(case_times, case_results, case_intents)
    else:
        print("未找到有效的测试用例数据")


if __name__ == "__main__":
    main()