# AgentOccam/Agent_E/test/tests_processor.py
import asyncio
import json
import os
import time
from Agent_E.ae.core.agents_llm_config import AgentsLLMConfig
from Agent_E.test.test_utils import get_formatted_current_timestamp
from Agent_E.test.test_utils import load_config
from Agent_E.test.test_utils import task_config_validator
from typing import Any
import Agent_E.ae.core.playwright_manager as browserManager
import nltk # type: ignore
from Agent_E.ae.config import PROJECT_TEST_ROOT
from Agent_E.ae.core.autogen_wrapper import AutogenWrapper
from Agent_E.ae.core.playwright_manager import PlaywrightManager
from Agent_E.ae.utils.logger import logger
from Agent_E.ae.utils.response_parser import parse_response
from autogen.agentchat.chat import ChatResult # type: ignore
from playwright.async_api import Page
from tabulate import tabulate
from termcolor import colored
from evaluation_harness.evaluators import evaluator_router
nltk.download('punkt')  # type: ignore  # tokenizer data used by the text-matching evaluators

# Directory for aggregated result files; the 'results' subdirectory name under
# PROJECT_TEST_ROOT is an assumption (TEST_RESULTS is referenced by the helpers below).
TEST_RESULTS = os.path.join(PROJECT_TEST_ROOT, 'results')

last_agent_response = ""


def check_top_level_test_folders(test_log_dir, test_result_dir):
    if not os.path.exists(test_log_dir):
        os.makedirs(test_log_dir)
        logger.info(f"Created log folder at: {test_log_dir}")

    if not os.path.exists(test_result_dir):
        os.makedirs(test_result_dir)
        logger.info(f"Created scores folder at: {test_result_dir}")


def create_task_log_folders(test_log_dir, task_id):
    task_log_dir = os.path.join(test_log_dir, task_id)
    task_screenshots_dir = os.path.join(task_log_dir, 'snapshots')
    if not os.path.exists(task_log_dir):
        os.makedirs(task_log_dir)
        logger.info(f"Created log dir for task {task_id} at: {task_log_dir}")
    if not os.path.exists(task_screenshots_dir):
        os.makedirs(task_screenshots_dir)
        logger.info(f"Created screenshots dir for task {task_id} at: {task_screenshots_dir}")

    return {"task_log_folder": task_log_dir, "task_screenshots_folder": task_screenshots_dir}

def create_results_dir(test_file: str, test_results_id: str | None) -> str:
    results_dir = ""
    if test_results_id:
        results_dir = os.path.join(TEST_RESULTS, f"results_for_{test_results_id}")
    else:
        test_file_base = os.path.basename(test_file)
        test_file_name = os.path.splitext(test_file_base)[0]
        results_dir = os.path.join(TEST_RESULTS, f"results_for_test_file_{test_file_name}")

    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
        logger.info(f"Created results directory: {results_dir}")

    return results_dir
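
# Example (illustrative file name): create_results_dir("config_files/tasks/sample.json", None)
# would create and return <PROJECT_TEST_ROOT>/results/results_for_test_file_sample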

def dump_log(task_id: str, messages_str_keys: dict[str, str], logs_dir: str):
    file_name = os.path.join(logs_dir, f'execution_logs_{task_id}.json')
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(messages_str_keys, f, ensure_ascii=False, indent=4)


def save_test_results(test_results: list[dict[str, str | int | float | None]], test_results_id: str):
    file_name = os.path.join(TEST_RESULTS, f'test_results_{test_results_id}.json')
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(test_results, f, ensure_ascii=False, indent=4)
    logger.info(f"Test results dumped to: {file_name}")


def save_individual_test_result(test_result: dict[str, str | int | float | None], results_dir: str):
    task_id = test_result["task_id"]
    file_name = os.path.join(results_dir, f'{task_id}.json')
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(test_result, f, ensure_ascii=False, indent=4)
    logger.info(f"Test result for task {task_id} dumped to: {file_name}")

def extract_last_response(messages: list[dict[str, Any]]) -> str:
    """Extract the last final response from the chat history."""
    try:
        # Iterate over the messages in reverse order and return the first
        # non-empty 'final_response' field found in a parsed message.
        for message in reversed(messages):
            if message and 'content' in message:
                content = message.get('content', "")
                content_json = parse_response(content)
                final_answer = content_json.get('final_response', None)
                if final_answer:
                    return final_answer
        return ""
    except Exception:
        logger.error("Error extracting last response from chat history.")
        return ""

def print_progress_bar(current: int, total: int, bar_length: int = 50) -> None:
    """
    Prints a progress bar to the console.

    Parameters:
    - current (int): The current progress of the task.
    - total (int): The total number of tasks to complete.
    - bar_length (int): The character length of the progress bar (default is 50).

    This function dynamically updates a single line in the console to reflect current progress.
    """
    percent = float(current) * 100 / total
    arrow = '-' * int(percent / 100 * bar_length - 1) + '>'
    spaces = ' ' * (bar_length - len(arrow))

    print(f'\rProgress: [{arrow}{spaces}] {current}/{total} ({percent:.2f}%)', end='')
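
# Example console output (illustrative) for print_progress_bar(25, 50):
#   Progress: [------------------------>                         ] 25/50 (50.00%)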

def determine_status_and_color(score: float) -> tuple[str, str]:
    """
    Determines the status and color for a test result based on the score.

    Parameters:
    - score (float): The score of the test task, indicating success (1), failure (0), or skip (-0.1).

    Returns:
    - tuple[str, str]: A tuple containing the status ('Pass', 'Fail', or 'Skip') and the corresponding color ('green', 'red', or 'yellow').
    """
    if score == 1:
        return 'Pass', 'green'
    elif score < 0:
        return 'Skip', 'yellow'
    else:
        return 'Fail', 'red'

def print_test_result(task_result: dict[str, str | int | float | None], index: int, total: int) -> None:
    """
    Prints the result of a single test task in a tabulated format.

    Parameters:
    - task_result (dict): A dictionary containing the task's evaluation results, including task ID, intent, score, and total command time.
    - index (int): The current index of the test in the sequence of all tests being run.
    - total (int): The total number of tests to be run.

    The function determines the test status (Pass/Fail/Skip) based on the 'score' key in task_result and prints the result with colored status.
    """
    status, color = determine_status_and_color(task_result['score'])  # type: ignore

    cost = task_result.get("compute_cost", None)
    total_cost = None if cost is None else round(cost.get("cost", -1), 4)  # type: ignore
    total_tokens = None if cost is None else cost.get("total_tokens", -1)  # type: ignore

    result_table = [  # type: ignore
        ['Test Index', 'Task ID', 'Intent', 'Status', 'Time Taken (s)', 'Total Tokens', 'Total Cost ($)'],
        [index, task_result['task_id'], task_result['intent'], colored(status, color), round(task_result['tct'], 2), total_tokens, total_cost]  # type: ignore
    ]
    print('\n' + tabulate(result_table, headers='firstrow', tablefmt='grid'))  # type: ignore

def get_command_exec_cost(command_exec_result: ChatResult) -> dict[str, Any]:
    output: dict[str, Any] = {}
    try:
        cost = command_exec_result.cost  # type: ignore

        usage: dict[str, Any] | None = None
        if "usage_including_cached_inference" in cost:
            usage = cost["usage_including_cached_inference"]
        elif "usage_excluding_cached_inference" in cost:
            usage = cost["usage_excluding_cached_inference"]
        else:
            raise ValueError("Cost not found in the command execution result.")

        print("Usage: ", usage)

        # Copy token and cost figures from the per-model breakdown entries;
        # the isinstance check skips scalar fields such as aggregate totals.
        for key in usage.keys():
            if isinstance(usage[key], dict) and "prompt_tokens" in usage[key]:
                output["cost"] = usage[key]["cost"]
                output["prompt_tokens"] = usage[key]["prompt_tokens"]
                output["completion_tokens"] = usage[key]["completion_tokens"]
                output["total_tokens"] = usage[key]["total_tokens"]
    except Exception as e:
        logger.debug(f"Error getting command execution cost: {e}")
    return output
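
# Shape of ChatResult.cost that get_command_exec_cost expects (model name and
# numbers are illustrative):
#   {
#       "usage_including_cached_inference": {
#           "total_cost": 0.0123,
#           "gpt-4o": {"cost": 0.0123, "prompt_tokens": 2048,
#                      "completion_tokens": 256, "total_tokens": 2304}
#       },
#       "usage_excluding_cached_inference": {...}
#   }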

async def execute_single_task(task_config_file: str, browser_manager: PlaywrightManager, ag: AutogenWrapper, page: Page, logs_dir: str) -> dict[str, Any]:
    """
    Executes a single test task based on a specified task configuration and evaluates its performance.

    Parameters:
    - task_config_file (str): Path to the JSON task configuration containing all necessary parameters for the task.
    - browser_manager (PlaywrightManager): The manager handling browser interactions, responsible for page navigation and control.
    - ag (AutogenWrapper): The automation generator wrapper that processes commands and interacts with the web page.
    - page (Page): The Playwright page object representing the browser tab where the task is executed.
    - logs_dir (str): Directory where the execution logs for this task are written.

    Returns:
    - dict: A dictionary containing the task's evaluation results, including task ID, intent, score, total command time (tct),
      the last statement from the chat agent, and the last URL accessed during the task.
    """
    start_ts = get_formatted_current_timestamp()

    with open(task_config_file, "r") as f:
        task_config = json.load(f)
    task_config_validator(task_config)

    command: str = task_config.get('intent', "")
    task_id = task_config.get('task_id')
    task_index = task_config.get('task_index')
    start_url = task_config.get('start_url')
    logger.info(f"Intent: {command}, Task ID: {task_id}")

    if start_url:
        await page.goto(start_url, wait_until='load', timeout=30000)

    start_time = time.time()
    current_url = await browser_manager.get_current_url()
    command_exec_result = await ag.process_command(command, current_url)
    end_time = time.time()

    evaluator_result: dict[str, float | str] = {}
    last_agent_response: str = ""
    command_cost: dict[str, Any] = {}
    single_task_result: dict[str, Any] = {}
    try:
        single_task_result = {
            "task_id": task_id,
            "task_index": task_index,
            "start_url": start_url,
            "intent": str(command),
            "last_url": page.url,
            "tct": end_time - start_time,
            "start_ts": start_ts,
            "completion_ts": get_formatted_current_timestamp()
        }

        agent_name: str = "planner_agent" if ag.agents_map is not None and "planner_agent" in ag.agents_map else "browser_nav_agent"
        command_cost = get_command_exec_cost(command_exec_result)  # type: ignore
        print(f"Command cost: {command_cost}")
        single_task_result["compute_cost"] = command_cost

        logger.info(f"Command \"{command}\" took: {round(end_time - start_time, 2)} seconds.")
        logger.info(f"Task {task_id} completed.")

        messages = ag.agents_map[agent_name].chat_messages  # type: ignore
        messages_str_keys = {str(key): value for key, value in messages.items()}  # type: ignore
        agent_key = list(messages.keys())[0]  # type: ignore
        last_agent_response = extract_last_response(messages[agent_key])  # type: ignore

        dump_log(str(task_id), messages_str_keys, logs_dir)

        single_task_result["last_statement"] = last_agent_response

        evaluator = evaluator_router(task_config_file)
        # cdp_session = await page.context.new_cdp_session(page)
        evaluator_result = evaluator(
            config_file=task_config_file,
            page=None,
            client=None,
            trajectory=[{"answer": last_agent_response}]
        )

        single_task_result["score"] = evaluator_result
    except Exception as e:
        logger.error(f"Error processing or evaluating task {task_id}: {e}")
        command_cost = {"cost": -1, "total_tokens": -1}
        single_task_result["compute_cost"] = command_cost
        single_task_result["error"] = str(e)

    return single_task_result
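
# Shape of the dict returned by execute_single_task (values illustrative):
#   {"task_id": ..., "task_index": ..., "start_url": ..., "intent": ...,
#    "last_url": ..., "tct": <seconds>, "start_ts": ..., "completion_ts": ...,
#    "compute_cost": {"cost": ..., "prompt_tokens": ..., "completion_tokens": ..., "total_tokens": ...},
#    "last_statement": <agent's final answer>, "score": <evaluator output>,
#    "error": <only present if an exception occurred>}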

async def run_tests(ag: AutogenWrapper, browser_manager: PlaywrightManager, task_ids: list[str] | str,
                    logdir: str = "", logname: str = "", relative_task_dir: str = "", test_results_id: str = "",
                    wait_time_non_headless: int = 5, take_screenshots: bool = False) -> list[dict[str, Any]]:
    """
    Runs a specified range of test tasks using Playwright for browser interactions and AutogenWrapper for task automation.
    It initializes necessary components, processes each task, handles exceptions, and compiles test results into a structured list.

    Parameters:
    - ag (AutogenWrapper): The AutoGen wrapper that processes commands.
    - browser_manager (PlaywrightManager): The manager handling browser interactions, responsible for page navigation and control.
    - task_ids (list[str] | str): Task IDs to run, or "all" to run every task config found in the task directory.
    - logdir (str): Root directory for test logs.
    - logname (str): Name of the subfolder under logdir used for this test run.
    - relative_task_dir (str): Subdirectory of config_files containing the task JSON files (defaults to "tasks").
    - test_results_id (str): Identifier for aggregated result files (not used directly in this function).
    - wait_time_non_headless (int): Time to wait between tasks when running in non-headless mode, useful for live monitoring or debugging.
    - take_screenshots (bool): Whether to take screenshots during test execution. Defaults to False.

    Returns:
    - list[dict[str, Any]]: A list of dictionaries, each containing the results from executing a test task. Results include task ID, intent, score, total command time, etc.

    This function also manages logging and saving of test results, updates the progress bar to reflect test execution status, and prints a detailed summary report at the end of the testing session.
    """
    test_log_dir = os.path.join(logdir, logname)
    test_result_dir = os.path.join(logdir, logname, "results")
    check_top_level_test_folders(test_log_dir, test_result_dir)

    # Resolve the list of task config files; "all" expands to every JSON file in the task directory.
    config_file_list = []
    if not relative_task_dir or relative_task_dir == "":
        relative_task_dir = "tasks"
    if task_ids == "all" or task_ids == ["all"]:
        task_ids = [filename[:-len(".json")] for filename in os.listdir(f"config_files/{relative_task_dir}") if filename.endswith(".json")]
    for task_id in task_ids:
        config_file_list.append(f"config_files/{relative_task_dir}/{task_id}.json")

    test_results: list[dict[str, str | int | float | None]] = []

    llm_config = AgentsLLMConfig()

    if not ag:
        ag = await AutogenWrapper.create(llm_config.get_planner_agent_config(), llm_config.get_browser_nav_agent_config())

    if not browser_manager:
        browser_manager = browserManager.PlaywrightManager(headless=True)
        await browser_manager.async_initialize()

    page = await browser_manager.get_current_page()

    total_tests = len(config_file_list)

    for index, task_config_file in enumerate(config_file_list):
        with open(task_config_file, "r") as f:
            task_config = json.load(f)
        task_id = str(task_config.get('task_id'))

        # Skip tasks that already have a saved result, so interrupted runs can be resumed.
        if os.path.exists(os.path.join(test_result_dir, f'{task_id}.json')):
            continue

        log_folders = create_task_log_folders(test_log_dir, task_id)

        ag.set_chat_logs_dir(log_folders["task_log_folder"])

        browser_manager.set_take_screenshots(take_screenshots)
        if take_screenshots:
            browser_manager.set_screenshots_dir(log_folders["task_screenshots_folder"])

        print_progress_bar(index, total_tests)
        task_result = await execute_single_task(task_config_file, browser_manager, ag, page, log_folders["task_log_folder"])
        test_results.append(task_result)
        save_individual_test_result(task_result, test_result_dir)
        print_test_result(task_result, index + 1, total_tests)

        if not browser_manager.isheadless:  # no need to wait if we are running headless
            await asyncio.sleep(wait_time_non_headless)  # give time for switching between tasks in case there is a human observer

        await browser_manager.take_screenshots("final", None)
        await browser_manager.close_except_specified_tab(page)  # cleanup pages that are not the one we opened here

    print_progress_bar(total_tests, total_tests)  # Complete the progress bar
    print('\n\nAll tests completed.')

    # Aggregate and print individual test results
    print("\nDetailed Test Results:")
    detailed_results_table = [['Test Index', 'Task ID', 'Intent', 'Status', 'Time Taken (s)', 'Total Tokens', 'Total Cost ($)']]
    for idx, result in enumerate(test_results, 1):
        status, color = determine_status_and_color(result['score'])  # type: ignore
        cost = result.get("compute_cost", None)
        total_cost = None if cost is None else round(cost.get("cost", -1), 4)  # type: ignore
        total_tokens = None if cost is None else cost.get("total_tokens", -1)  # type: ignore
        detailed_results_table.append([
            idx, result['task_id'], result['intent'], colored(status, color), round(result['tct'], 2),  # type: ignore
            total_tokens, total_cost
        ])

    print(tabulate(detailed_results_table, headers='firstrow', tablefmt='grid'))

    # Summary report: aggregate cost and token totals over all tests that report a compute cost.
    total_cost = 0
    total_tokens = 0
    for result in test_results:
        compute_cost = result.get("compute_cost", 0)  # type: ignore
        if compute_cost is not None and isinstance(compute_cost, dict):
            total_cost += compute_cost.get("cost", 0)  # type: ignore
            total_tokens += compute_cost.get("total_tokens", 0)  # type: ignore

    passed_tests = []
    skipped_tests = []
    failed_tests = []
    for result in test_results:
        if result["score"] == 1:
            passed_tests.append(result)  # type: ignore
        elif result["score"] < 0:  # type: ignore
            skipped_tests.append(result)  # type: ignore
        else:
            failed_tests.append(result)  # type: ignore

    summary_table = [  # type: ignore
        ['Total Tests', 'Passed', 'Failed', 'Skipped', 'Average Time Taken (s)', 'Total Time Taken (s)', 'Total Tokens', 'Total Cost ($)'],
        [total_tests, len(passed_tests), len(failed_tests), len(skipped_tests),
         round(sum(test['tct'] for test in test_results) / total_tests, 2),  # type: ignore
         round(sum(test['tct'] for test in test_results), 2),  # type: ignore
         total_tokens, total_cost]
    ]

    print('\nSummary Report:')
    print(tabulate(summary_table, headers='firstrow', tablefmt='grid'))  # type: ignore

    return test_results
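
# Minimal usage sketch (illustrative paths and run name; not part of the module's
# entry point). run_tests builds its own AutogenWrapper and PlaywrightManager
# when None is passed for either argument:
#
#   asyncio.run(run_tests(
#       ag=None,
#       browser_manager=None,
#       task_ids=["all"],
#       logdir="test_logs",
#       logname="webarena_run",
#       relative_task_dir="tasks",
#   ))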