"""Summarise agent evaluation results.

Usage:
    python <this script> <result_dir>[,<result_dir>...]

Every result directory must contain an ``actions`` sub-directory holding one
JSON file per finished task. The script prints an accuracy summary per
directory, writes ``export.txt`` next to each ``actions`` directory, and then
prints per-category statistics based on the task configuration file.
"""
import copy
import json
import os
import sys

# Total task count; the denominator of the "overall accuracy" figure.
TASKS = 165

# Task ids, in order, that each get one line in export.txt.
USE_TASKS = list(range(TASKS))

# Task configuration read for the per-category report. Every entry is accessed
# through its 'sites', 'task_id' and 'old_task_id' keys.
CONFIG_PATH = './config_files/wa/test_webarena_lite.raw.json'


def get_result(res_dict, src="all"):
    """Print the solved task ids and an accuracy summary for one result set.

    Args:
        res_dict: Mapping of task id -> score. A score >= 1.0 counts as solved.
        src: Label printed on the "src file" line of the summary.

    Returns:
        '' when ``res_dict`` is empty (nothing is printed), otherwise None.
    """
    if not res_dict:
        return ''

    success_id = sorted(k for k, v in res_dict.items() if v >= 1.0)
    score = len(success_id)
    finish_count = len(res_dict)
    # Partial accuracy is relative to the finished tasks only, overall
    # accuracy to the whole task set.
    pacc = score / finish_count * 100
    acc = score / TASKS * 100

    print(success_id)

    # The total shown in parentheses comes from TASKS so that it always agrees
    # with the denominator used for the overall accuracy.
    meta = """
--------
src file: {}
successed: {:3} / {:4} ({})
partial accuracy: {:7}
overall accuracy: {:7}
--------
""".format(src, score, finish_count, TASKS, round(pacc, 2), round(acc, 2))

    print(meta)


def export_result(res_dict, src=".", note=("1.0", "0.0"), show_all=False):
    """Write one line per task in USE_TASKS to ``<src>/export.txt``.

    Args:
        res_dict: Mapping of task id -> score.
        src: Directory that receives ``export.txt``.
        note: Two-item sequence ``(solved_text, unsolved_text)``.
        show_all: When true, tasks missing from ``res_dict`` are written as
            ``note[1]``; otherwise their line is left empty.
    """
    solved_text, unsolved_text = note[0], note[1]
    rows = []
    for task_id in USE_TASKS:
        if task_id in res_dict:
            rows.append(solved_text if res_dict[task_id] >= 1.0 else unsolved_text)
        elif show_all:
            rows.append(unsolved_text)
        else:
            # Keep an empty line so that line N always belongs to task N.
            rows.append("")

    out_string = "".join(row + "\n" for row in rows)
    with open(os.path.join(src, 'export.txt'), 'w') as f:
        f.write(out_string)


def _load_run_results(src, all_result):
    """Return {task_id: score} read from ``<src>/actions/*.json``.

    Each accepted score is also merged into ``all_result`` (mutated in place),
    which keeps the highest score seen for every task id.
    """
    path = os.path.join(src, 'actions')
    result = {}
    for file_name in os.listdir(path):
        if not file_name.endswith('.json'):
            continue
        with open(os.path.join(path, file_name), 'r') as f:
            data = json.load(f)

        if not isinstance(data, dict):
            continue

        # Records without a task id fall back to 1000, which is outside
        # USE_TASKS and therefore never reaches export.txt.
        task_id = data.get('task_id', 1000)
        task_score = data.get('score', 0)
        if task_score < 0:
            # Negative scores are skipped entirely.
            continue

        result[task_id] = task_score
        if task_id not in all_result or task_score > all_result[task_id]:
            all_result[task_id] = task_score
    return result


def _report_categories(all_result, config_path):
    """Print per-category statistics and the accuracy excluding map tasks."""
    with open(config_path) as fp:
        configs = json.load(fp)

    # Keyed by the tuple of sites; dict order keeps categories in the order
    # in which they first appear in the configuration.
    categories = {}
    non_map_success = 0
    non_map_total = 0

    for item in configs:
        web = tuple(item['sites'])
        task_id = int(item['task_id'])
        old_task_id = int(item['old_task_id'])

        stats = categories.setdefault(
            web, {'scores': [], 'solved': [], 'failed': []}
        )
        # A task without any recorded result counts as score 0 / failed.
        score = all_result.get(task_id, 0)
        is_success = score >= 1.0
        stats['scores'].append(score)
        stats['solved' if is_success else 'failed'].append(old_task_id)

        if not any('map' in site for site in web):
            non_map_total += 1
            if is_success:
                non_map_success += 1

    print("\n--- Category Statistics ---")
    for web, stats in categories.items():
        # Every category holds at least one score, so the division is safe.
        total_cases = len(stats['scores'])
        success_cases = len(stats['solved'])
        # NOTE: the rate is the mean raw score, whereas the count next to it
        # is the number of tasks with score >= 1.0; they differ when partial
        # scores are present.
        success_rate = round(sum(stats['scores']) / total_cases * 100, 1)
        print(f"Category: {web}")
        print(f" Total Cases: {total_cases}")
        print(f" Success Rate: {success_rate}% ({success_cases}/{total_cases})")
        print(f" Successful old_task_ids: {sorted(stats['solved'])}")
        print(f" Failed old_task_ids: {sorted(stats['failed'])}")

    print('\n--- Overall Accuracy without Map ---')
    if non_map_total > 0:
        overall_acc_without_map = round(non_map_success / non_map_total * 100, 2)
        print(f"Accuracy (excluding map tasks): {overall_acc_without_map}% ({non_map_success}/{non_map_total})")
    else:
        print("No non-map tasks found.")


def main(argv=None, config_path=CONFIG_PATH):
    """Run the report.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument is a comma-separated list of
            result directories.
        config_path: Task configuration file used for the category report.
    """
    argv = sys.argv[1:] if argv is None else argv
    if not argv:
        sys.exit(
            "usage: {} <result_dir>[,<result_dir>...]".format(
                os.path.basename(sys.argv[0])
            )
        )

    file_list = argv[0].split(',')
    all_result = {}

    for src in file_list:
        result = _load_run_results(src, all_result)
        get_result(result, src)
        export_result(result, src=src)

    # A combined summary is only meaningful when several runs were given.
    if len(file_list) > 1:
        get_result(all_result)
        export_result(all_result, show_all=True)

    _report_categories(all_result, config_path)


if __name__ == "__main__":
    main()