import asyncio
import json
import os
import time
from Agent_E.ae.core.agents_llm_config import AgentsLLMConfig
from Agent_E.test.test_utils import get_formatted_current_timestamp
from Agent_E.test.test_utils import load_config
from Agent_E.test.test_utils import task_config_validator
from typing import Any

import Agent_E.ae.core.playwright_manager as browserManager
import nltk  # type: ignore
from Agent_E.ae.config import PROJECT_TEST_ROOT
from Agent_E.ae.core.autogen_wrapper import AutogenWrapper
from Agent_E.ae.core.playwright_manager import PlaywrightManager
from Agent_E.ae.utils.logger import logger
from Agent_E.ae.utils.response_parser import parse_response
from autogen.agentchat.chat import ChatResult  # type: ignore
from playwright.async_api import Page
from tabulate import tabulate
from termcolor import colored

from evaluation_harness.evaluators import evaluator_router

nltk.download('punkt')  # type: ignore

# Folder where aggregated and per-test results are written. TEST_RESULTS is
# referenced by create_results_dir() and save_test_results() below; it is
# assumed here to live under the imported PROJECT_TEST_ROOT.
TEST_RESULTS = os.path.join(PROJECT_TEST_ROOT, 'results')

last_agent_response = ""

def check_top_level_test_folders(test_log_dir: str, test_result_dir: str) -> None:
    """Create the top-level log and results folders if they do not already exist."""
    if not os.path.exists(test_log_dir):
        os.makedirs(test_log_dir)
        logger.info(f"Created log folder at: {test_log_dir}")

    if not os.path.exists(test_result_dir):
        os.makedirs(test_result_dir)
        logger.info(f"Created scores folder at: {test_result_dir}")

def create_task_log_folders(test_log_dir: str, task_id: str) -> dict[str, str]:
    """Create the per-task log and screenshot folders and return their paths."""
    task_log_dir = os.path.join(test_log_dir, task_id)
    task_screenshots_dir = os.path.join(task_log_dir, 'snapshots')
    if not os.path.exists(task_log_dir):
        os.makedirs(task_log_dir)
        logger.info(f"Created log dir for task {task_id} at: {task_log_dir}")
    if not os.path.exists(task_screenshots_dir):
        os.makedirs(task_screenshots_dir)
        logger.info(f"Created screenshots dir for task {task_id} at: {task_screenshots_dir}")

    return {"task_log_folder": task_log_dir, "task_screenshots_folder": task_screenshots_dir}

def create_results_dir(test_file: str, test_results_id: str | None) -> str:
    results_dir = ""
    if test_results_id:
        results_dir = os.path.join(TEST_RESULTS, f"results_for_{test_results_id}")
    else:
        test_file_base = os.path.basename(test_file)
        test_file_name = os.path.splitext(test_file_base)[0]
        results_dir = os.path.join(TEST_RESULTS, f"results_for_test_file_{test_file_name}")

    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
        logger.info(f"Created results directory: {results_dir}")

    return results_dir

def dump_log(task_id: str, messages_str_keys: dict[str, str], logs_dir: str):
    file_name = os.path.join(logs_dir, f'execution_logs_{task_id}.json')
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(messages_str_keys, f, ensure_ascii=False, indent=4)

def save_test_results(test_results: list[dict[str, str | int | float | None]], test_results_id: str):
    file_name = os.path.join(TEST_RESULTS, f'test_results_{test_results_id}.json')
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(test_results, f, ensure_ascii=False, indent=4)
    logger.info(f"Test results dumped to: {file_name}")

def save_individual_test_result(test_result: dict[str, str | int | float | None], results_dir: str):
    task_id = test_result["task_id"]
    file_name = os.path.join(results_dir, f'{task_id}.json')
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(test_result, f, ensure_ascii=False, indent=4)
    logger.info(f"Test result for task {task_id} dumped to: {file_name}")

def extract_last_response(messages: list[dict[str, Any]]) -> str:
    """Extract the last response message from chat history."""
    try:
        # Iterate over the messages in reverse order
        for message in reversed(messages):
            if message and 'content' in message:
                content = message.get('content', "")
                content_json = parse_response(content)
                final_answer = content_json.get('final_response', None)
                if final_answer:
                    return final_answer
        return ""
    except Exception:
        logger.error("Error extracting last response from chat history.")
        return ""

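# Illustrative sketch (not part of the original script): the kind of chat
# message entry extract_last_response() handles, inferred from the 'content'
# and 'final_response' keys it reads. The answer text is a placeholder.
def _example_agent_message() -> dict[str, Any]:
    return {"content": '{"final_response": "The form was submitted successfully."}'}
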
def print_progress_bar(current: int, total: int, bar_length: int = 50) -> None:
    """
    Prints a progress bar to the console.

    Parameters:
    - current (int): The current progress of the task.
    - total (int): The total number of tasks to complete.
    - bar_length (int): The character length of the progress bar (default is 50).

    This function dynamically updates a single line in the console to reflect current progress.
    """
    percent = float(current) * 100 / total
    arrow = '-' * int(percent / 100 * bar_length - 1) + '>'
    spaces = ' ' * (bar_length - len(arrow))

    print(f'\rProgress: [{arrow}{spaces}] {current}/{total} ({percent:.2f}%)', end='')

def determine_status_and_color(score: float) -> tuple[str, str]:
    """
    Determines the status and color for a test result based on the score.

    Parameters:
    - score (float): The score of the test task, indicating success (1), failure (0), or skip (a negative value such as -0.1).

    Returns:
    - tuple[str, str]: A tuple containing the status ('Pass', 'Fail', or 'Skip') and the corresponding color ('green', 'red', or 'yellow').
    """
    if score == 1:
        return 'Pass', 'green'
    elif score < 0:
        return 'Skip', 'yellow'
    else:
        return 'Fail', 'red'

def print_test_result(task_result: dict[str, str | int | float | None], index: int, total: int) -> None:
    """
    Prints the result of a single test task in a tabulated format.

    Parameters:
    - task_result (dict): A dictionary containing the task's evaluation results, including task ID, intent, score, and total command time.
    - index (int): The current index of the test in the sequence of all tests being run.
    - total (int): The total number of tests to be run.

    The function determines the test status (Pass/Fail/Skip) based on the 'score' key in task_result and prints the result with a colored status.
    """
    status, color = determine_status_and_color(task_result['score'])  # type: ignore

    cost = task_result.get("compute_cost", None)
    total_cost = None if cost is None else round(cost.get("cost", -1), 4)  # type: ignore
    total_tokens = None if cost is None else cost.get("total_tokens", -1)  # type: ignore

    result_table = [  # type: ignore
        ['Test Index', 'Task ID', 'Intent', 'Status', 'Time Taken (s)', 'Total Tokens', 'Total Cost ($)'],
        [index, task_result['task_id'], task_result['intent'], colored(status, color), round(task_result['tct'], 2), total_tokens, total_cost]  # type: ignore
    ]
    print('\n' + tabulate(result_table, headers='firstrow', tablefmt='grid'))  # type: ignore

def get_command_exec_cost(command_exec_result: ChatResult):
    output: dict[str, Any] = {}
    try:
        cost = command_exec_result.cost  # type: ignore
        usage: dict[str, Any] | None = None
        if "usage_including_cached_inference" in cost:
            usage = cost["usage_including_cached_inference"]
        elif "usage_excluding_cached_inference" in cost:
            usage = cost["usage_excluding_cached_inference"]
        else:
            raise ValueError("Cost not found in the command execution result.")
        print("Usage: ", usage)

        for key in usage.keys():
            if isinstance(usage[key], dict) and "prompt_tokens" in usage[key]:
                output["cost"] = usage[key]["cost"]
                output["prompt_tokens"] = usage[key]["prompt_tokens"]
                output["completion_tokens"] = usage[key]["completion_tokens"]
                output["total_tokens"] = usage[key]["total_tokens"]
    except Exception as e:
        logger.debug(f"Error getting command execution cost: {e}")
    return output

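# Illustrative sketch (not part of the original script): the cost payload shape
# that get_command_exec_cost() expects, inferred from the keys it reads above.
# The model-name key and all numbers are placeholders for demonstration only.
def _example_compute_cost_shape() -> dict[str, Any]:
    from types import SimpleNamespace

    fake_chat_result = SimpleNamespace(cost={
        "usage_including_cached_inference": {
            "total_cost": 0.0042,   # aggregate figure; skipped by the parser above
            "gpt-4o": {             # hypothetical per-model entry
                "cost": 0.0042,
                "prompt_tokens": 1200,
                "completion_tokens": 150,
                "total_tokens": 1350,
            },
        }
    })
    # Expected result: {"cost": 0.0042, "prompt_tokens": 1200,
    #                   "completion_tokens": 150, "total_tokens": 1350}
    return get_command_exec_cost(fake_chat_result)  # type: ignore
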
async def execute_single_task(task_config_file: str, browser_manager: PlaywrightManager, ag: AutogenWrapper, page: Page, logs_dir: str) -> dict[str, Any]:
    """
    Executes a single test task based on a specified task configuration file and evaluates its performance.

    Parameters:
    - task_config_file (str): Path to the JSON task configuration file containing all necessary parameters for the task.
    - browser_manager (PlaywrightManager): The manager handling browser interactions, responsible for page navigation and control.
    - ag (AutogenWrapper): The automation generator wrapper that processes commands and interacts with the web page.
    - page (Page): The Playwright page object representing the browser tab where the task is executed.
    - logs_dir (str): Directory where the execution logs for this task are written.

    Returns:
    - dict: A dictionary containing the task's evaluation results, including task ID, intent, score, total command time (tct),
      the last statement from the chat agent, and the last URL accessed during the task.
    """
    command = ""
    start_url = None
    task_id = None

    start_ts = get_formatted_current_timestamp()

    with open(task_config_file, "r") as config_file:
        task_config = json.load(config_file)

    task_config_validator(task_config)

    command: str = task_config.get('intent', "")
    task_id = task_config.get('task_id')
    task_index = task_config.get('task_index')
    start_url = task_config.get('start_url')
    logger.info(f"Intent: {command}, Task ID: {task_id}")

    if start_url:
        await page.goto(start_url, wait_until='load', timeout=30000)

    start_time = time.time()
    current_url = await browser_manager.get_current_url()
    command_exec_result = await ag.process_command(command, current_url)
    end_time = time.time()

    evaluator_result: dict[str, float | str] = {}
    last_agent_response: str = ""
    command_cost: dict[str, Any] = {}
    single_task_result: dict[str, Any] = {}
    try:
        single_task_result = {
            "task_id": task_id,
            "task_index": task_index,
            "start_url": start_url,
            "intent": str(command),
            "last_url": page.url,
            "tct": end_time - start_time,
            "start_ts": start_ts,
            "completion_ts": get_formatted_current_timestamp()
        }

        agent_name: str = "planner_agent" if ag.agents_map is not None and "planner_agent" in ag.agents_map else "browser_nav_agent"

        command_cost = get_command_exec_cost(command_exec_result)  # type: ignore
        print(f"Command cost: {command_cost}")
        single_task_result["compute_cost"] = command_cost

        logger.info(f"Command \"{command}\" took: {round(end_time - start_time, 2)} seconds.")
        logger.info(f"Task {task_id} completed.")

        messages = ag.agents_map[agent_name].chat_messages  # type: ignore
        messages_str_keys = {str(key): value for key, value in messages.items()}  # type: ignore
        agent_key = list(messages.keys())[0]  # type: ignore
        last_agent_response = extract_last_response(messages[agent_key])  # type: ignore

        dump_log(str(task_id), messages_str_keys, logs_dir)

        single_task_result["last_statement"] = last_agent_response

        evaluator = evaluator_router(task_config_file)
        # cdp_session = await page.context.new_cdp_session(page)
        evaluator_result = evaluator(
            config_file=task_config_file,
            page=None,
            client=None,
            trajectory=[{"answer": last_agent_response}]
        )

        single_task_result["score"] = evaluator_result
    except Exception as e:
        logger.error(f"Error executing or evaluating task {task_id}: {e}")
        command_cost = {"cost": -1, "total_tokens": -1}
        single_task_result["compute_cost"] = command_cost
        single_task_result["error"] = str(e)

    return single_task_result

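# Illustrative sketch (not part of the original script): minimal shape of a
# task config file as read by execute_single_task(), inferred from the keys it
# accesses. Real config files typically carry additional evaluator-specific
# fields; all values below are placeholders.
def _example_task_config() -> dict[str, Any]:
    return {
        "task_id": "example_0",
        "task_index": 0,
        "intent": "Find the cheapest direct flight from A to B",
        "start_url": "https://example.com",
    }
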
async def run_tests(ag: AutogenWrapper, browser_manager: PlaywrightManager, task_ids,
                    logdir: str = "", logname: str = "", relative_task_dir: str = "", test_results_id: str = "", wait_time_non_headless: int = 5, take_screenshots: bool = False) -> list[dict[str, Any]]:
    """
    Runs a specified set of test tasks using Playwright for browser interactions and AutogenWrapper for task automation.
    It initializes necessary components, processes each task, handles exceptions, and compiles test results into a structured list.

    Parameters:
    - ag (AutogenWrapper): The AutoGen wrapper that processes commands. Created on demand if not provided.
    - browser_manager (PlaywrightManager): The manager handling browser interactions, responsible for page navigation and control. Created on demand if not provided.
    - task_ids (list[str] | str): The task IDs to run, or "all" to run every task in the task directory.
    - logdir (str): Root directory for test logs.
    - logname (str): Name of the subfolder under logdir used for this test run.
    - relative_task_dir (str): Subdirectory of config_files containing the task configuration files; defaults to "tasks".
    - test_results_id (str): Optional identifier for this test run's aggregated results.
    - wait_time_non_headless (int): Time to wait between tasks when running in non-headless mode, useful for live monitoring or debugging.
    - take_screenshots (bool): Whether to take screenshots during test execution. Defaults to False.

    Returns:
    - list[dict[str, Any]]: A list of dictionaries, each containing the results from executing a test task. Results include task ID, intent, score, total command time, etc.

    This function also manages logging and saving of test results, updates the progress bar to reflect test execution status, and prints a detailed summary report at the end of the testing session.
    """
    test_log_dir = os.path.join(logdir, logname)
    test_result_dir = os.path.join(logdir, logname, "results")
    check_top_level_test_folders(test_log_dir, test_result_dir)

    config_file_list = []
    if not relative_task_dir or relative_task_dir == "":
        relative_task_dir = "tasks"
    if task_ids == "all" or task_ids == ["all"]:
        task_ids = [filename[:-len(".json")] for filename in os.listdir(f"config_files/{relative_task_dir}") if filename.endswith(".json")]
    for task_id in task_ids:
        config_file_list.append(f"config_files/{relative_task_dir}/{task_id}.json")

    test_results: list[dict[str, str | int | float | None]] = []

    llm_config = AgentsLLMConfig()
    if not ag:
        ag = await AutogenWrapper.create(llm_config.get_planner_agent_config(), llm_config.get_browser_nav_agent_config())

    if not browser_manager:
        browser_manager = browserManager.PlaywrightManager(headless=True)
        await browser_manager.async_initialize()

    page = await browser_manager.get_current_page()
    total_tests = len(config_file_list)

    for index, task_config_file in enumerate(config_file_list):
        with open(task_config_file, "r") as config_file:
            task_config = json.load(config_file)
        task_id = str(task_config.get('task_id'))
        if os.path.exists(os.path.join(test_result_dir, f'{task_id}.json')):
            continue  # skip tasks that already have a saved result

        log_folders = create_task_log_folders(test_log_dir, task_id)

        ag.set_chat_logs_dir(log_folders["task_log_folder"])

        browser_manager.set_take_screenshots(take_screenshots)
        if take_screenshots:
            browser_manager.set_screenshots_dir(log_folders["task_screenshots_folder"])

        print_progress_bar(index, total_tests)
        task_result = await execute_single_task(task_config_file, browser_manager, ag, page, log_folders["task_log_folder"])
        test_results.append(task_result)
        save_individual_test_result(task_result, test_result_dir)
        print_test_result(task_result, index + 1, total_tests)

        if not browser_manager.isheadless:  # no need to wait if we are running headless
            await asyncio.sleep(wait_time_non_headless)  # give time for switching between tasks in case there is a human observer

        await browser_manager.take_screenshots("final", None)

        await browser_manager.close_except_specified_tab(page)  # cleanup pages that are not the one we opened here

    print_progress_bar(total_tests, total_tests)  # Complete the progress bar
    print('\n\nAll tests completed.')

    # Aggregate and print individual test results
    print("\nDetailed Test Results:")
    detailed_results_table = [['Test Index', 'Task ID', 'Intent', 'Status', 'Time Taken (s)', 'Total Tokens', 'Total Cost ($)']]
    for idx, result in enumerate(test_results, 1):
        status, color = determine_status_and_color(result['score'])  # type: ignore

        cost = result.get("compute_cost", None)
        total_cost = None if cost is None else round(cost.get("cost", -1), 4)  # type: ignore
        total_tokens = None if cost is None else cost.get("total_tokens", -1)  # type: ignore

        detailed_results_table.append([
            idx, result['task_id'], result['intent'], colored(status, color), round(result['tct'], 2),  # type: ignore
            total_tokens, total_cost
        ])

    print(tabulate(detailed_results_table, headers='firstrow', tablefmt='grid'))

    # Summary report

    # Calculate aggregated cost and token totals for all tests that have compute cost
    total_cost = 0
    total_tokens = 0

    for result in test_results:
        compute_cost = result.get("compute_cost", 0)  # type: ignore
        if compute_cost is not None and isinstance(compute_cost, dict):
            total_cost += compute_cost.get("cost", 0)  # type: ignore
            total_tokens += compute_cost.get("total_tokens", 0)  # type: ignore

    passed_tests = []
    skipped_tests = []
    failed_tests = []
    for result in test_results:
        if result["score"] == 1:
            passed_tests.append(result)  # type: ignore
        elif result["score"] < 0:  # type: ignore
            skipped_tests.append(result)  # type: ignore
        else:
            failed_tests.append(result)  # type: ignore

    summary_table = [  # type: ignore
        ['Total Tests', 'Passed', 'Failed', 'Skipped', 'Average Time Taken (s)', 'Total Time Taken (s)', 'Total Tokens', 'Total Cost ($)'],
        [total_tests, len(passed_tests), len(failed_tests), len(skipped_tests),
         round(sum(test['tct'] for test in test_results) / total_tests, 2),  # type: ignore
         round(sum(test['tct'] for test in test_results), 2),  # type: ignore
         total_tokens, total_cost]
    ]

    print('\nSummary Report:')
    print(tabulate(summary_table, headers='firstrow', tablefmt='grid'))  # type: ignore

    return test_results
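
# Illustrative entry-point sketch (not part of the original script): the repo
# provides its own runner, but run_tests() can be driven roughly like this.
# The log directory, run name, and task selection below are placeholder values.
if __name__ == "__main__":
    asyncio.run(
        run_tests(
            ag=None,               # type: ignore  # created inside run_tests() when falsy
            browser_manager=None,  # type: ignore  # likewise initialized on demand
            task_ids=["all"],      # or a list of specific task ids
            logdir="test/logs",
            logname="example_run",
            take_screenshots=False,
        )
    )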