# File viewer metadata: 145 lines, 4.0 KiB, Python
import os, json, sys, copy


# Task ids 0..164; export_result writes one output line per id, in this order.
USE_TASKS = list(range(165))
def get_result(res_dict, src="all", total=None):
    """Print a success summary for one set of task scores.

    Args:
        res_dict: Mapping of task id to score. A score >= 1.0 counts as a
            success.
        src: Label printed on the "src file" line of the report.
        total: Size of the full task set, used as the denominator of the
            overall accuracy and shown in parentheses in the report.
            Defaults to the module-level ``TASKS``.

    Returns:
        ``''`` when ``res_dict`` is empty (nothing is printed); otherwise
        ``None`` after printing the sorted successful ids and the report.
    """
    if not res_dict:
        return ''

    if total is None:
        # Looked up at call time, so TASKS may be defined further down the file.
        total = TASKS

    success_id = [k for k, v in res_dict.items() if v >= 1.0]
    score = len(success_id)
    finish_count = len(res_dict)
    # Partial accuracy is relative to the tasks that have a score; overall
    # accuracy is relative to the whole task set.
    pacc = score / finish_count * 100
    acc = score / total * 100

    print(sorted(success_id))

    # The figure in parentheses is the actual task total, not a fixed literal.
    meta = """
--------
src file: {}
successed: {:3} / {:4} ({})
partial accuracy: {:7}
overall accuracy: {:7}
--------
""".format(src, score, finish_count, total, round(pacc, 2), round(acc, 2))

    print(meta)
def export_result(res_dict, src=".", note=("1.0", "0.0"), show_all=False, task_ids=None):
    """Write one line per task to ``<src>/export.txt`` marking success or failure.

    Args:
        res_dict: Mapping of task id to score. A score >= 1.0 counts as a
            success.
        src: Directory in which ``export.txt`` is written (opened in ``'w'``
            mode, so an existing file is overwritten).
        note: Two-item sequence: the text written for a success, then the
            text written for a failure.
        show_all: When true, tasks missing from ``res_dict`` get the failure
            text; otherwise their line is left empty.
        task_ids: Ordered task ids to export. Defaults to the module-level
            ``USE_TASKS``.
    """
    if task_ids is None:
        task_ids = USE_TASKS

    rows = []
    for task_id in task_ids:
        if task_id in res_dict:
            rows.append(note[0] if res_dict[task_id] >= 1.0 else note[1])
        elif show_all:
            rows.append(note[1])
        else:
            # Keep an empty row so line N always belongs to the N-th task id.
            rows.append("")

    with open(os.path.join(src, 'export.txt'), 'w') as f:
        f.write("".join(row + "\n" for row in rows))
# Total task count; get_result divides by it to compute the overall accuracy.
TASKS = 165

# The single CLI argument is a comma-separated list of run directories.
if len(sys.argv) < 2:
    sys.exit(f"usage: {sys.argv[0]} RUN_DIR[,RUN_DIR...]")

files = sys.argv[1]
# Drop empty entries (e.g. from a trailing comma); an empty entry would
# otherwise resolve to "./actions" in the current working directory.
file_list = [run_dir for run_dir in files.split(',') if run_dir]
if not file_list:
    sys.exit(f"usage: {sys.argv[0]} RUN_DIR[,RUN_DIR...]")

# Best score seen for each task id across every run directory.
all_result = {}

for src in file_list:
    path = os.path.join(src, 'actions')

    # Scores recorded in this run directory only.
    result = {}
    # Sorted so that, if two files carry the same task id, the one kept in
    # `result` does not depend on the filesystem's listing order.
    finished = sorted(os.listdir(path))

    for file_name in finished:
        if not file_name.endswith('.json'):
            continue
        with open(os.path.join(path, file_name), 'r') as f:
            data = json.load(f)

        # Skip files whose top-level JSON value is not an object.
        if not isinstance(data, dict):
            continue

        # Records without a task_id all share the placeholder id 1000.
        task_id = data.get('task_id', 1000)
        task_score = data.get('score', 0)
        # Records with a negative score are left out of the results entirely.
        if task_score < 0:
            continue

        result[task_id] = task_score
        if task_id not in all_result or task_score > all_result[task_id]:
            all_result[task_id] = task_score

    get_result(result, src)
    export_result(result, src=src)

# The combined report is only produced when several directories were given.
if len(file_list) > 1:
    get_result(all_result)
    export_result(all_result, show_all=True)
# Per-site breakdown of the best scores collected in all_result.
with open('./config_files/wa/test_webarena_lite.raw.json') as fp:
    configs = json.load(fp)

sub_results = {}     # site tuple -> scores (0 for tasks with no recorded score)
sub_ids = {}         # site tuple -> old_task_ids whose score is >= 1.0
sub_failed_ids = {}  # site tuple -> old_task_ids that scored lower or have no score
sub_counts = {}      # site tuple -> number of configured tasks
non_map_success = 0
non_map_total = 0

for item in configs:
    web = tuple(item['sites'])
    task_id = int(item['task_id'])
    old_task_id = int(item['old_task_id'])

    # All four tables gain the key together the first time a site tuple shows up.
    site_scores = sub_results.setdefault(web, [])
    site_passed = sub_ids.setdefault(web, [])
    site_failed = sub_failed_ids.setdefault(web, [])
    sub_counts[web] = sub_counts.get(web, 0) + 1

    is_success = False
    if task_id not in all_result:
        # A task with no recorded score counts as a failure with score 0.
        site_scores.append(0)
        site_failed.append(old_task_id)
    else:
        score = all_result[task_id]
        site_scores.append(score)
        if score >= 1.0:
            site_passed.append(old_task_id)
            is_success = True
        else:
            site_failed.append(old_task_id)

    # A task is a map task when any of its site names contains "map".
    is_map_task = any('map' in site for site in web)
    if is_map_task:
        continue
    non_map_total += 1
    if is_success:
        non_map_success += 1

print("\n--- Category Statistics ---")
for web, site_scores in sub_results.items():
    total_cases = sub_counts[web]
    success_cases = len(sub_ids[web])
    # NOTE(review): the percentage is the mean score, while the fraction
    # printed next to it counts scores >= 1.0; the two differ if a score is
    # anything other than 0 or 1.
    if site_scores:
        success_rate = round(sum(site_scores) / len(site_scores) * 100, 1)
    else:
        success_rate = 0.0
    print(f"Category: {web}")
    print(f" Total Cases: {total_cases}")
    print(f" Success Rate: {success_rate}% ({success_cases}/{total_cases})")
    print(f" Successful old_task_ids: {sorted(sub_ids[web])}")
    print(f" Failed old_task_ids: {sorted(sub_failed_ids[web])}")

print('\n--- Overall Accuracy without Map ---')
if non_map_total <= 0:
    print("No non-map tasks found.")
else:
    overall_acc_without_map = round(non_map_success / non_map_total * 100, 2)
    print(f"Accuracy (excluding map tasks): {overall_acc_without_map}% ({non_map_success}/{non_map_total})")