# webrl/src/analysis.py

import argparse
import datetime
import json
import os
import re
import time
from collections import OrderedDict
import yaml
from .configs import ConfigLoader
from .utils import ColorMessage
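
# Map agent keys as they appear in the run config / output folders to the
# canonical model names used in reports.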
MODEL_MAP = {
"gpt-4o-2024-05-13": "gpt-4o-2024-05-13",
"gpt-4": "gpt-4",
"gpt-3.5-turbo-0613": "gpt-3.5-turbo",
"llama-2-13b": "llama2-13b",
"llama-2-7b": "llama2-7b",
"chatglm-6b": "chatglm-6b",
"wizard-30b": "wizardlm-30b",
"vicuna-33b": "vicuna-33b",
"oasst-12b": "oasst-12b",
"guanaco-65b": "guanaco-65b",
"koala-13b": "koala-13b",
"text-davinci-003": "text-davinci-003",
"wizard-13b": "wizardlm-13b",
"guanaco-33b": "guanaco-33b",
"text-davinci-002": "text-davinci-002",
"llama2-70b": "llama2-70b",
"codellama": "codellama-34b",
"openchat": "openchat-13b",
"claude-ins": "claude-instant",
"claude-v1.3": "claude",
"claude-2": "claude-2",
"codellama-13b": "codellama-13b",
"codellama-7b": "codellama-7b",
"codegeex2-6b": "codegeex2-6b",
"dolly": "dolly-12b",
"vicuna-7b": "vicuna-7b",
"vicuna-13b": "vicuna-13b",
"chat-bison": "chat-bison-001",
}
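
# Map each report label to a function that pulls its count out of the raw
# "validation" status counters stored in overall.json.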
VALIDATION_MAP_FUNC = {
"Completed": lambda x: x["COMPLETED"],
"Context Limit Exceeded": lambda x: x["AGENT_CONTEXT_LIMIT"],
"Invalid Format": lambda x: x["AGENT_VALIDATION_FAILED"],
"Invalid Action": lambda x: x["AGENT_INVALID_ACTION"],
    # Aggregate of the statuses not covered by the labels above
"Task Limit Exceeded": lambda x: sum(
[x[t] for t in x if t in ["UNKNOWN", "TASK_ERROR", "TASK_LIMIT_REACHED"]]
),
}


def analyze_output(config: str, output: str, since_timestamp: float):
"""
Walk through the output folder (including sub-dir) and analyze the overall.json file
Rule:
- valid overall file: **/{agent}/{task}/overall.json
- if a same (agent, task) pair, select the latest one
"""
loader = ConfigLoader()
config: dict = loader.load_from(config)
assert "definition" in config, "definition not found in config"
assert "agent" in config["definition"], "agent not found in config.definition"
assert "task" in config["definition"], "task not found in config.definition"
agents = set(config["definition"]["agent"].keys()).intersection(
set(MODEL_MAP.keys())
)
tasks = list(config["definition"]["task"].keys())
print(
ColorMessage.cyan(
f"Available Agents ({len(agents)}):\n "
+ "\n ".join(agents)
+ "\n\n"
+ f"Available Tasks ({len(tasks)}):\n "
+ "\n ".join(tasks)
+ "\n"
)
)
overall_dict = OrderedDict() # agent -> task -> {file: str, time: float}
for root, dirs, files in os.walk(output):
if "overall.json" in files:
            # the last two components of the absolute path are {agent}/{task}
            root = os.path.abspath(root)
            pattern = root.split(os.sep)
            if len(pattern) < 2:
                continue
            agent = pattern[-2]
            task = pattern[-1]
ct = os.path.getmtime(os.path.join(root, "overall.json"))
if agent not in agents:
continue
elif task not in tasks:
continue
elif ct < since_timestamp:
continue
agent = MODEL_MAP[agent]
            if agent in overall_dict and task in overall_dict[agent]:
                # keep only the most recently modified result per (agent, task)
                if ct < overall_dict[agent][task]["time"]:
                    continue
overall_dict.setdefault(agent, OrderedDict())
overall_dict[agent][task] = {
"file": os.path.join(root, "overall.json"),
"time": os.path.getmtime(os.path.join(root, "overall.json")),
}
# agent -> task -> {file: str, time: str(YYYY-MM-DD HH:MM:SS), overall: dict}
agent_names = []
task_names = []
validation_names = []
for agent in overall_dict:
if agent not in agent_names:
agent_names.append(agent)
for task in overall_dict[agent]:
if task not in task_names:
task_names.append(task)
overall_dict[agent][task]["time"] = datetime.datetime.fromtimestamp(
overall_dict[agent][task]["time"]
).strftime("%Y-%m-%d %H:%M:%S")
with open(overall_dict[agent][task]["file"], "r", encoding="utf-8") as f:
overall_dict[agent][task]["overall"] = json.load(f)
if "validation" in overall_dict[agent][task]["overall"]:
overall_dict[agent][task]["overall"]["validation"] = {
validation: VALIDATION_MAP_FUNC[validation](
overall_dict[agent][task]["overall"]["validation"]
)
for validation in VALIDATION_MAP_FUNC
}
for validation in overall_dict[agent][task]["overall"]["validation"]:
if validation not in validation_names:
validation_names.append(validation)
return agent_names, task_names, validation_names, overall_dict
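

# One handler per task family: each matches task names, knows where the
# task's main metric lives in overall.json, and assigns a column-ordering
# priority for the summary CSV.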
class TaskHandler:
def match(self, task_name) -> bool:
raise NotImplementedError()
def get_main_metric(self, overall_result):
raise NotImplementedError()
def get_order_priority(self):
return 100000
@staticmethod
def get_handler(task_name) -> "TaskHandler":
handlers = [DCG(), HH(), OS(), DB(), KG(), LTP(), WB(), WS()]
for handler in handlers:
if handler.match(task_name):
return handler
raise ValueError(f"Unknown task: {task_name}")
class DCG(TaskHandler):
def match(self, task_name) -> bool:
task_name = task_name.lower()
return (
"card" in task_name
or task_name.startswith("cg")
or task_name.startswith("dcg")
)
def get_main_metric(self, overall_result):
        try:
            return overall_result["custom"]["score"]
        except KeyError:
            # fall back for result files that expose win_rate instead of score
            return {"win_rate(legacy)": overall_result["custom"]["win_rate"]}
def get_order_priority(self):
return 4
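

# House-Holding (ALFWorld-style "alf*" tasks).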
class HH(TaskHandler):
def match(self, task_name) -> bool:
task_name = task_name.lower()
return task_name.startswith("alf")
def get_main_metric(self, overall_result):
return overall_result["custom"]["overall"]["success_rate"]
def get_order_priority(self):
return 6
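

# Operating System interaction tasks.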
class OS(TaskHandler):
def match(self, task_name) -> bool:
task_name = task_name.lower()
return task_name.startswith("os") or task_name.startswith("operating")
def get_main_metric(self, overall_result):
return overall_result["custom"]["overall"]["acc"]
def get_order_priority(self):
return 1
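

# Database querying tasks.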
class DB(TaskHandler):
def match(self, task_name) -> bool:
task_name = task_name.lower()
return task_name.startswith("db") or task_name.startswith("database")
def get_main_metric(self, overall_result):
return overall_result["custom"]["overall_cat_accuracy"]
def get_order_priority(self):
return 2
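

# Knowledge Graph tasks.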
class KG(TaskHandler):
def match(self, task_name) -> bool:
task_name = task_name.lower()
return task_name.startswith("kg") or task_name.startswith("knowledge")
def get_main_metric(self, overall_result):
return overall_result["custom"]["main"]
def get_order_priority(self):
return 3
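

# Lateral Thinking Puzzle ("ltp"/"literal" prefixes).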
class LTP(TaskHandler):
def match(self, task_name) -> bool:
task_name = task_name.lower()
return task_name.startswith("ltp") or task_name.startswith("literal")
def get_main_metric(self, overall_result):
return overall_result["custom"]["main"]
def get_order_priority(self):
return 5
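

# Web Browsing (Mind2Web); step_sr is reported as a percentage, hence /100.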
class WB(TaskHandler):
def match(self, task_name) -> bool:
task_name = task_name.lower()
return task_name.startswith("m2w") or task_name.startswith("mind2web")
def get_main_metric(self, overall_result):
return overall_result["custom"]["step_sr"] / 100
def get_order_priority(self):
return 8
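

# Web Shopping (WebShop); the main metric is the episode reward.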
class WS(TaskHandler):
def match(self, task_name) -> bool:
task_name = task_name.lower()
return task_name.startswith("ws") or task_name.startswith("webshop")
def get_main_metric(self, overall_result):
return overall_result["custom"]["reward"]
def get_order_priority(self):
return 7


def parse_timestamp(time_str: str) -> float:
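    """
    Parse time_str as epoch seconds, a datetime ("%Y-%m-%d %H:%M:%S",
    "%Y-%m-%d", or "%Y-%m"), or a relative offset such as "7d", "12h",
    "30m", "45s", and return the corresponding epoch timestamp.
    """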
    # plain int or float (epoch seconds)
    try:
        return float(time_str)
    except ValueError:
        pass
    # datetime in one of the supported formats
    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y-%m"):
        try:
            return datetime.datetime.strptime(time_str, fmt).timestamp()
        except ValueError:
            pass
    # relative offset, e.g. "1d", "12h", "30m", "45s"
    nums = re.findall(r"[\d\.]+", time_str)
    units = re.findall(r"[a-zA-Z]+", time_str)
    if not nums or not units:
        raise ValueError(f"Unrecognized time format: {time_str!r}")
    num = float(nums[0])
    unit = units[0]
    if unit == "d":
        delta = num * 24 * 60 * 60
    elif unit == "h":
        delta = num * 60 * 60
    elif unit == "m":
        delta = num * 60
    elif unit == "s":
        delta = num
    else:
        raise ValueError(f"Unknown time unit: {unit!r}")
return time.time() - delta


def main(args):
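    # Gather per-(agent, task) results, compute each task's main metric, and
    # emit JSON/YAML dumps plus CSV summaries for metrics and validation stats.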
agent_names, task_names, validation_names, details = analyze_output(
args.config, args.output, parse_timestamp(args.time)
)
task_names.sort(key=lambda x: TaskHandler.get_handler(x).get_order_priority())
summary = OrderedDict()
for agent in details:
summary[agent] = OrderedDict()
for task in details[agent]:
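            # get_handler raises ValueError for unknown task names, so the
            # None fallback below is purely defensive.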
handler = TaskHandler.get_handler(task)
if handler is not None:
summary[agent][task] = handler.get_main_metric(
details[agent][task]["overall"]
)
else:
summary[agent][task] = details[agent][task]["overall"]
for agent in details:
for task in details[agent]:
print(
ColorMessage.cyan(
f"Agent: {agent:20} Task: {task:20} Path: {details[agent][task]['file']}"
)
)
final_result = {
"summary": summary,
"details": details,
}
os.makedirs(args.save, exist_ok=True)
# Overall Calculation
with open(os.path.join(args.save, "result.json"), "w", encoding="utf-8") as f:
json.dump(final_result, f, indent=4, ensure_ascii=False, sort_keys=True)
with open(os.path.join(args.save, "result.yaml"), "w", encoding="utf-8") as f:
yaml.dump(final_result, f, indent=4, allow_unicode=True, sort_keys=True)
with open(os.path.join(args.save, "summary.csv"), "w", encoding="utf-8") as f:
"""
Format:
Agent\\Task, Task1, Task2, ...
Agent1, MainMetric(Agent1,Task1), MainMetric(Agent1,Task2), ...
......
"""
f.write("Agent\\Task," + ",".join(task_names) + "\n")
for agent in summary:
f.write(
agent
+ ","
+ ",".join(
[
(str(summary[agent][task]) if task in summary[agent] else "")
for task in task_names
]
)
+ "\n"
)
# Validation Analysis
agent_validations = {
agent: {validation: [] for validation in validation_names}
for agent in agent_names
}
task_validations = {
task: {validation: [] for validation in validation_names} for task in task_names
}
for agent in summary:
for task in summary[agent]:
if "validation" in details[agent][task]["overall"]:
for validation in details[agent][task]["overall"]["validation"]:
agent_validations[agent][validation].append(
details[agent][task]["overall"]["validation"][validation]
)
task_validations[task][validation].append(
details[agent][task]["overall"]["validation"][validation]
)
# Agent-Centric Validation Analysis
with open(
os.path.join(args.save, "agent_validation.csv"), "w", encoding="utf-8"
) as f:
"""
Format:
Agent\\Validation, Validation1, Validation2, ...
Agent1, Avg(Agent1,Validation1), Avg(Agent1,Validation2), ...
......
"""
f.write("Agent\\Validation," + ",".join(validation_names) + "\n")
for agent in agent_validations:
f.write(
agent
+ ","
+ ",".join(
[
(
str(
sum(agent_validations[agent][validation])
/ len(agent_validations[agent][validation])
)
if validation in agent_validations[agent]
and len(agent_validations[agent][validation]) > 0
else "--"
)
for validation in validation_names
]
)
+ "\n"
)
# Task-Centric Validation Analysis
with open(
os.path.join(args.save, "task_validation.csv"), "w", encoding="utf-8"
) as f:
"""
Format:
Task\\Validation, Validation1, Validation2, ...
Task1, Avg(Task1,Validation1), Avg(Task1,Validation2), ...
......
"""
f.write("Task\\Validation," + ",".join(validation_names) + "\n")
for task in task_validations:
f.write(
task
+ ","
+ ",".join(
[
(
str(
sum(task_validations[task][validation])
/ len(task_validations[task][validation])
)
if validation in task_validations[task]
and len(task_validations[task][validation]) > 0
else "--"
)
for validation in validation_names
]
)
+ "\n"
)
print(ColorMessage.green(f"Analysis result saved to {os.path.abspath(args.save)}"))
if __name__ == "__main__":
arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-c", "--config", type=str, default="configs/assignments/definition.yaml",
        help="assignment config defining the available agents and tasks",
    )
    arg_parser.add_argument("-o", "--output", type=str, default="outputs",
                            help="root folder to scan for overall.json files")
    arg_parser.add_argument("-s", "--save", type=str, default="analysis",
                            help="folder to write analysis artifacts into")
    arg_parser.add_argument("-t", "--time", type=str, default="0",
                            help="ignore results older than this "
                                 "(epoch, datetime, or offset like 7d)")
args = arg_parser.parse_args()
main(args)
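
# Example invocation (exact module path depends on the package layout; the
# relative imports above mean this script must run as part of its package):
#   python -m webrl.src.analysis -c configs/assignments/definition.yaml \
#       -o outputs -s analysis -t 7d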