596 lines
19 KiB
Python
596 lines
19 KiB
Python
import json
|
|
import os
|
|
import random
|
|
from glob import glob
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import pytest
|
|
import requests
|
|
from PIL import Image
|
|
from py import test
|
|
|
|
from agent import Agent, TeacherForcingAgent
|
|
from browser_env import ActionTypes, ScriptBrowserEnv
|
|
from browser_env.env_config import *
|
|
from evaluation_harness import (
|
|
HTMLContentExactEvaluator,
|
|
PageImageEvaluator,
|
|
StringEvaluator,
|
|
URLExactEvaluator,
|
|
image_utils,
|
|
)
|
|
from evaluation_harness.evaluators import EvaluatorComb
|
|
|
|
IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true"
|
|
HEADLESS = True
|
|
config_file_folder = "tests/test_evaluation_harness/configs"
|
|
|
|
|
|
def tf_roll_out(
|
|
agent: Agent, env: ScriptBrowserEnv, config_file: str
|
|
) -> list[Any]:
|
|
"""Roll out the agent using teacher forcing actions"""
|
|
obs, state_info = env.reset(options={"config_file": config_file})
|
|
|
|
trajectory: list[Any] = [{"observation": obs, "info": state_info}]
|
|
while True:
|
|
action = agent.next_action(
|
|
trajectory=trajectory, intent="", meta_data={}
|
|
)
|
|
trajectory.append(action)
|
|
if action["action_type"] == ActionTypes.STOP:
|
|
break
|
|
|
|
# preceed to next action
|
|
obs, reward, terminated, truncated, info = env.step(action)
|
|
state_info = {"observation": obs, "info": info}
|
|
trajectory.append(state_info)
|
|
|
|
return trajectory
|
|
|
|
|
|
def test_string_match_success(
|
|
script_browser_env: ScriptBrowserEnv,
|
|
) -> None:
|
|
config_file = f"{config_file_folder}/string_match.json"
|
|
|
|
agent = TeacherForcingAgent()
|
|
agent.set_action_set_tag(tag="playwright")
|
|
action_seq = """page.stop("The date is 1985/04/18")"""
|
|
agent.set_actions(action_seq)
|
|
|
|
env = script_browser_env
|
|
trajectory = tf_roll_out(agent, env, config_file)
|
|
|
|
evalutor = StringEvaluator()
|
|
score = evalutor(
|
|
trajectory, config_file, env.page
|
|
)
|
|
|
|
assert score == 1.0
|
|
|
|
|
|
def test_string_match_fail(script_browser_env: ScriptBrowserEnv) -> None:
|
|
config_file = f"{config_file_folder}/string_match.json"
|
|
|
|
agent = TeacherForcingAgent()
|
|
agent.set_action_set_tag(tag="playwright")
|
|
action_seq = """page.stop("The date is 1936/04/18")"""
|
|
agent.set_actions(action_seq)
|
|
|
|
env = script_browser_env
|
|
trajectory = tf_roll_out(agent, env, config_file)
|
|
|
|
evalutor = StringEvaluator()
|
|
score = evalutor(
|
|
trajectory, config_file, env.page
|
|
)
|
|
|
|
assert score == 0.0
|
|
|
|
|
|
def test_url_exact_match_success(script_browser_env: ScriptBrowserEnv) -> None:
|
|
config_file = f"{config_file_folder}/url_exact_match.json"
|
|
|
|
agent = TeacherForcingAgent()
|
|
agent.set_action_set_tag(tag="playwright")
|
|
action_seq = f"""page.goto("https://www.google.com/")
|
|
page.stop()"""
|
|
agent.set_actions(action_seq)
|
|
|
|
env = script_browser_env
|
|
|
|
trajectory = tf_roll_out(agent, env, config_file)
|
|
|
|
evalutor = URLExactEvaluator()
|
|
score = evalutor(
|
|
trajectory, config_file, env.page
|
|
)
|
|
assert score == 1.0
|
|
|
|
|
|
def test_url_exact_match_fail(script_browser_env: ScriptBrowserEnv) -> None:
|
|
config_file = f"{config_file_folder}/url_exact_match.json"
|
|
|
|
agent = TeacherForcingAgent()
|
|
agent.set_action_set_tag(tag="playwright")
|
|
action_seq = f"""page.goto("https://github.com/web-arena-x")
|
|
page.stop()"""
|
|
agent.set_actions(action_seq)
|
|
|
|
env = script_browser_env
|
|
|
|
trajectory = tf_roll_out(agent, env, config_file)
|
|
|
|
evalutor = URLExactEvaluator()
|
|
score = evalutor(
|
|
trajectory, config_file, env.page
|
|
)
|
|
print(env.page.url)
|
|
assert score == 0.0
|
|
|
|
|
|
def test_html_content_match_success(
|
|
script_browser_env: ScriptBrowserEnv,
|
|
) -> None:
|
|
config_file = f"{config_file_folder}/html_content_exact_match.json"
|
|
|
|
# randomly sample a string
|
|
agent = TeacherForcingAgent()
|
|
agent.set_action_set_tag(tag="playwright")
|
|
action_seq = f"""page.goto("https://russmaxdesign.github.io/exercise")
|
|
page.stop()"""
|
|
agent.set_actions(action_seq)
|
|
|
|
env = script_browser_env
|
|
|
|
trajectory = tf_roll_out(agent, env, config_file)
|
|
|
|
evalutor = HTMLContentExactEvaluator()
|
|
score = evalutor(
|
|
trajectory, config_file, env.page
|
|
)
|
|
assert score == 1.0
|
|
|
|
|
|
def test_html_content_match_fail(script_browser_env: ScriptBrowserEnv) -> None:
|
|
config_file = f"{config_file_folder}/html_content_exact_match.json"
|
|
|
|
# randomly sample a string
|
|
agent = TeacherForcingAgent()
|
|
agent.set_action_set_tag(tag="playwright")
|
|
action_seq = """page.goto("https://www.google.com/")
|
|
page.stop()"""
|
|
agent.set_actions(action_seq)
|
|
|
|
env = script_browser_env
|
|
|
|
trajectory = tf_roll_out(agent, env, config_file)
|
|
|
|
evalutor = HTMLContentExactEvaluator()
|
|
score = evalutor(
|
|
trajectory, config_file, env.page
|
|
)
|
|
assert score == 0.0
|
|
|
|
|
|
def test_html_content_element_match_success(
|
|
script_browser_env: ScriptBrowserEnv,
|
|
) -> None:
|
|
config_file = f"{config_file_folder}/html_content_element_exact_match.json"
|
|
|
|
agent = TeacherForcingAgent()
|
|
agent.set_action_set_tag(tag="playwright")
|
|
action_seq = f"""page.goto("https://russmaxdesign.github.io/exercise/")
|
|
page.get_by_label("Full name").fill("Hello World")
|
|
page.get_by_label("Email").click()
|
|
page.get_by_label("Email").fill("alexisxy@hotmail.com")
|
|
page.stop()"""
|
|
agent.set_actions(action_seq)
|
|
|
|
env = script_browser_env
|
|
|
|
trajectory = tf_roll_out(agent, env, config_file)
|
|
|
|
evalutor = HTMLContentExactEvaluator()
|
|
score = evalutor(
|
|
trajectory, config_file, env.page
|
|
)
|
|
assert score == 1.0
|
|
|
|
|
|
def test_html_content_element_match_fail(
|
|
script_browser_env: ScriptBrowserEnv,
|
|
) -> None:
|
|
config_file = f"{config_file_folder}/html_content_element_exact_match.json"
|
|
|
|
agent = TeacherForcingAgent()
|
|
agent.set_action_set_tag(tag="playwright")
|
|
action_seq = f"""page.goto("https://russmaxdesign.github.io/exercise/")
|
|
page.get_by_label("Full name").fill("Hello")
|
|
page.get_by_label("Email").click()
|
|
page.get_by_label("Email").fill("alexisxy@hotmail.com")
|
|
page.stop()"""
|
|
agent.set_actions(action_seq)
|
|
|
|
env = script_browser_env
|
|
|
|
trajectory = tf_roll_out(agent, env, config_file)
|
|
|
|
evalutor = HTMLContentExactEvaluator()
|
|
score = evalutor(
|
|
trajectory, config_file, env.page
|
|
)
|
|
assert score == 0.0
|
|
|
|
|
|
def test_html_content_url_comb_success(
|
|
script_browser_env: ScriptBrowserEnv,
|
|
) -> None:
|
|
config_file = f"{config_file_folder}/html_content_url_comb.json"
|
|
|
|
agent = TeacherForcingAgent()
|
|
agent.set_action_set_tag(tag="playwright")
|
|
action_seq = f"""page.goto("https://russmaxdesign.github.io/exercise/")
|
|
page.get_by_label("Full name").fill("Hello World")
|
|
page.get_by_label("Email").click()
|
|
page.get_by_label("Email").fill("alexisxy@hotmail.com")
|
|
page.stop()"""
|
|
agent.set_actions(action_seq)
|
|
|
|
env = script_browser_env
|
|
|
|
trajectory = tf_roll_out(agent, env, config_file)
|
|
|
|
evaluators = EvaluatorComb(
|
|
[URLExactEvaluator(), HTMLContentExactEvaluator()]
|
|
)
|
|
score = evaluators(
|
|
trajectory, config_file, env.page
|
|
)
|
|
assert score == 1.0
|
|
|
|
|
|
@pytest.mark.skipif(
|
|
IN_GITHUB_ACTIONS, reason="Won't work using the demo sites"
|
|
)
|
|
def test_func_success(
|
|
script_browser_env: ScriptBrowserEnv,
|
|
) -> None:
|
|
config_file = f"{config_file_folder}/func_eval_success.json"
|
|
|
|
agent = TeacherForcingAgent()
|
|
agent.set_action_set_tag(tag="playwright")
|
|
action_seq = f"""page.stop()"""
|
|
agent.set_actions(action_seq)
|
|
|
|
env = script_browser_env
|
|
trajectory = tf_roll_out(agent, env, config_file)
|
|
|
|
evalutor = HTMLContentExactEvaluator()
|
|
score = evalutor(
|
|
trajectory, config_file, env.page
|
|
)
|
|
assert score == 1.0
|
|
|
|
|
|
@pytest.mark.skipif(
|
|
IN_GITHUB_ACTIONS, reason="Won't work using the demo sites"
|
|
)
|
|
def test_func_fail(
|
|
script_browser_env: ScriptBrowserEnv,
|
|
) -> None:
|
|
config_file = f"{config_file_folder}/func_eval_fail.json"
|
|
|
|
agent = TeacherForcingAgent()
|
|
agent.set_action_set_tag(tag="playwright")
|
|
action_seq = f"""page.stop()"""
|
|
agent.set_actions(action_seq)
|
|
|
|
env = script_browser_env
|
|
trajectory = tf_roll_out(agent, env, config_file)
|
|
|
|
evalutor = HTMLContentExactEvaluator()
|
|
score = evalutor(
|
|
trajectory, config_file, env.page
|
|
)
|
|
assert score == 0.0
|
|
|
|
|
|
def test_func_url_func_last_success(
|
|
script_browser_env: ScriptBrowserEnv,
|
|
) -> None:
|
|
config_file = f"{config_file_folder}/func_url_func_1.json"
|
|
|
|
agent = TeacherForcingAgent()
|
|
agent.set_action_set_tag(tag="playwright")
|
|
action_seq = f"""page.goto("{REDDIT}/f/wallstreetbets/50431/-/comment/676875")
|
|
page.stop()"""
|
|
agent.set_actions(action_seq)
|
|
|
|
env = script_browser_env
|
|
trajectory = tf_roll_out(agent, env, config_file)
|
|
|
|
evalutor = HTMLContentExactEvaluator()
|
|
score = evalutor(
|
|
trajectory, config_file, env.page
|
|
)
|
|
assert score == 1.0
|
|
|
|
|
|
def test_html_required_values_success(
|
|
script_browser_env: ScriptBrowserEnv,
|
|
) -> None:
|
|
for config_file in glob(
|
|
f"{config_file_folder}/html_required_values_success_*.json"
|
|
):
|
|
# change the URL placeholder with the concrete URL
|
|
with open(config_file, "r") as f:
|
|
configs = json.load(f)
|
|
configs["eval"]["reference_url"] = configs["eval"][
|
|
"reference_url"
|
|
].replace("__SHOPPING__", SHOPPING)
|
|
tmp_config = config_file.replace(".json", ".json.tmp")
|
|
with open(tmp_config, "w+") as f:
|
|
json.dump(configs, f, indent=4)
|
|
|
|
# randomly sample a string
|
|
agent = TeacherForcingAgent()
|
|
agent.set_action_set_tag(tag="playwright")
|
|
gt_url = configs["eval"]["reference_url"]
|
|
action_seq = f"""page.goto("{gt_url}")
|
|
page.stop()"""
|
|
agent.set_actions(action_seq)
|
|
|
|
env = script_browser_env
|
|
|
|
trajectory = tf_roll_out(agent, env, tmp_config)
|
|
|
|
evalutor = HTMLContentExactEvaluator()
|
|
score = evalutor(
|
|
trajectory, tmp_config, env.page
|
|
)
|
|
os.remove(tmp_config)
|
|
assert score == 1.0
|
|
|
|
|
|
def test_page_image_evaluator(
|
|
script_browser_env: ScriptBrowserEnv,
|
|
) -> None:
|
|
for config_file in [
|
|
f"{config_file_folder}/image_evaluator_yes.json",
|
|
f"{config_file_folder}/image_evaluator_yes_direct_img.json",
|
|
]:
|
|
# change the URL placeholder with the concrete URL
|
|
with open(config_file, "r") as f:
|
|
configs = json.load(f)
|
|
configs["eval"]["reference_url"] = configs["eval"][
|
|
"reference_url"
|
|
].replace("__SHOPPING__", SHOPPING)
|
|
configs["start_url"] = configs["start_url"].replace(
|
|
"__SHOPPING__", SHOPPING
|
|
)
|
|
for e in configs["eval"]["page_image_query"]:
|
|
e["eval_image_url"] = e["eval_image_url"].replace("__SHOPPING__", SHOPPING)
|
|
tmp_config = config_file.replace(".json", ".json.tmp")
|
|
with open(tmp_config, "w+") as f:
|
|
json.dump(configs, f, indent=4)
|
|
|
|
# randomly sample a string
|
|
agent = TeacherForcingAgent()
|
|
agent.set_action_set_tag(tag="playwright")
|
|
gt_url = configs["eval"]["page_image_query"][0]["eval_image_url"]
|
|
action_seq = f"""page.goto("{gt_url}")
|
|
page.stop()"""
|
|
agent.set_actions(action_seq)
|
|
|
|
env = script_browser_env
|
|
|
|
trajectory = tf_roll_out(agent, env, tmp_config)
|
|
|
|
# Create a dummy captioning function that always returns "yes"
|
|
captioning_fn = lambda images, *args, **kwargs: ["yes"] * len(images)
|
|
evalutor = PageImageEvaluator(captioning_fn)
|
|
score = evalutor(
|
|
trajectory, tmp_config, env.page
|
|
)
|
|
|
|
# Create a dummy captioning function that always returns "yes"
|
|
captioning_fn = lambda images, *args, **kwargs: ["no"] * len(images)
|
|
evalutor = PageImageEvaluator(captioning_fn)
|
|
score_no = evalutor(
|
|
trajectory, tmp_config, env.page
|
|
)
|
|
os.remove(tmp_config)
|
|
assert score == 1.0
|
|
assert score_no == 0.0
|
|
|
|
|
|
def test_page_image_evaluator_yes_no(
|
|
script_browser_env: ScriptBrowserEnv,
|
|
) -> None:
|
|
config_file = f"{config_file_folder}/image_evaluator_yes_no.json"
|
|
# change the URL placeholder with the concrete URL
|
|
with open(config_file, "r") as f:
|
|
configs = json.load(f)
|
|
configs["eval"]["reference_url"] = configs["eval"][
|
|
"reference_url"
|
|
].replace("__SHOPPING__", SHOPPING)
|
|
configs["start_url"] = configs["start_url"].replace(
|
|
"__SHOPPING__", SHOPPING
|
|
)
|
|
for e in configs["eval"]["page_image_query"]:
|
|
e["eval_image_url"] = e["eval_image_url"].replace("__SHOPPING__", SHOPPING)
|
|
tmp_config = config_file.replace(".json", ".json.tmp")
|
|
with open(tmp_config, "w+") as f:
|
|
json.dump(configs, f, indent=4)
|
|
|
|
# randomly sample a string
|
|
agent = TeacherForcingAgent()
|
|
agent.set_action_set_tag(tag="playwright")
|
|
gt_url = configs["eval"]["page_image_query"][0]["eval_image_url"]
|
|
action_seq = f"""page.goto("{gt_url}")
|
|
page.stop()"""
|
|
agent.set_actions(action_seq)
|
|
|
|
env = script_browser_env
|
|
|
|
trajectory = tf_roll_out(agent, env, tmp_config)
|
|
|
|
# Create a dummy captioning function that always returns "yes"
|
|
captioning_fn = lambda images, *args, **kwargs: ["yes"] * len(images)
|
|
evalutor = PageImageEvaluator(captioning_fn)
|
|
score = evalutor(
|
|
trajectory, tmp_config, env.page
|
|
)
|
|
assert score == 0.0
|
|
|
|
|
|
def test_html_required_values_failure(
|
|
script_browser_env: ScriptBrowserEnv,
|
|
) -> None:
|
|
for config_file in glob(
|
|
f"{config_file_folder}/html_required_values_failure_*.json"
|
|
):
|
|
# change the URL placeholder with the concrete URL
|
|
with open(config_file, "r") as f:
|
|
configs = json.load(f)
|
|
configs["eval"]["reference_url"] = configs["eval"][
|
|
"reference_url"
|
|
].replace("__SHOPPING__", SHOPPING)
|
|
tmp_config = config_file.replace(".json", ".json.tmp")
|
|
with open(tmp_config, "w+") as f:
|
|
json.dump(configs, f, indent=4)
|
|
|
|
# randomly sample a string
|
|
agent = TeacherForcingAgent()
|
|
agent.set_action_set_tag(tag="playwright")
|
|
gt_url = configs["eval"]["reference_url"]
|
|
action_seq = f"""page.goto("{gt_url}")
|
|
page.stop()"""
|
|
agent.set_actions(action_seq)
|
|
|
|
env = script_browser_env
|
|
|
|
trajectory = tf_roll_out(agent, env, tmp_config)
|
|
|
|
evalutor = HTMLContentExactEvaluator()
|
|
score = evalutor(
|
|
trajectory, tmp_config, env.page
|
|
)
|
|
os.remove(tmp_config)
|
|
assert score == 0.0
|
|
|
|
|
|
def test_exact_image(
|
|
script_browser_env: ScriptBrowserEnv,
|
|
) -> None:
|
|
for config_file, expected_score in zip(
|
|
[
|
|
f"{config_file_folder}/exact_image_success.json",
|
|
f"{config_file_folder}/exact_image_failure.json",
|
|
f"{config_file_folder}/exact_image_resize_failure_1.json",
|
|
f"{config_file_folder}/exact_image_resize_failure_2.json",
|
|
],
|
|
[1.0, 0.0, 0.0, 0.0],
|
|
):
|
|
# change the URL placeholder with the concrete URL
|
|
with open(config_file, "r") as f:
|
|
configs = json.load(f)
|
|
configs["start_url"] = configs["start_url"].replace(
|
|
"__REDDIT__", REDDIT
|
|
)
|
|
configs["eval"]["reference_url"] = configs["eval"][
|
|
"reference_url"
|
|
].replace("__REDDIT__", REDDIT)
|
|
|
|
for e in configs["eval"]["page_image_query"]:
|
|
e["eval_fuzzy_image_match"] = e["eval_fuzzy_image_match"].replace("__REDDIT__", REDDIT)
|
|
configs["image"] = configs["image"].replace("__REDDIT__", REDDIT)
|
|
configs["intent"] = configs["intent"].replace("__REDDIT__", REDDIT)
|
|
tmp_config = config_file.replace(".json", ".tmp.json")
|
|
with open(tmp_config, "w+") as f:
|
|
json.dump(configs, f, indent=4)
|
|
|
|
gt_url = configs["eval"]["reference_url"]
|
|
agent = TeacherForcingAgent()
|
|
agent.set_action_set_tag(tag="playwright")
|
|
action_seq = f"""page.goto("{gt_url}")
|
|
page.stop()"""
|
|
agent.set_actions(action_seq)
|
|
|
|
env = script_browser_env
|
|
trajectory = tf_roll_out(agent, env, tmp_config)
|
|
|
|
captioning_fn = lambda images, *args, **kwargs: ["yes"] * len(
|
|
images
|
|
) # Unused for this task
|
|
evalutor = PageImageEvaluator(captioning_fn)
|
|
score = evalutor(
|
|
trajectory, tmp_config, env.page
|
|
)
|
|
assert score == expected_score, config_file
|
|
os.remove(tmp_config)
|
|
|
|
|
|
def test_exact_image_resize_success(
|
|
script_browser_env: ScriptBrowserEnv,
|
|
) -> None:
|
|
config_file = f"{config_file_folder}/exact_image_resize_template.json"
|
|
resized_img_path = "resized_img.tmp.png"
|
|
for downscale, expected_score in zip(
|
|
[2, 8], [1.0, 0.0]
|
|
): # 2x (should pass) and 8x (should fail) downscale
|
|
# change the URL placeholder with the concrete URL
|
|
with open(config_file, "r") as f:
|
|
configs = json.load(f)
|
|
configs["start_url"] = configs["start_url"].replace(
|
|
"__REDDIT__", REDDIT
|
|
)
|
|
configs["eval"]["reference_url"] = configs["eval"][
|
|
"reference_url"
|
|
].replace("__REDDIT__", REDDIT)
|
|
for e in configs["eval"]["page_image_query"]:
|
|
e["eval_fuzzy_image_match"] = e["eval_fuzzy_image_match"].replace("__REDDIT__", REDDIT)
|
|
configs["image"] = configs["image"].replace("__REDDIT__", REDDIT)
|
|
configs["intent"] = configs["intent"].replace("__REDDIT__", REDDIT)
|
|
|
|
# Download an image and resize
|
|
img_url = configs["instantiation_dict"]["image_url"].replace(
|
|
"__REDDIT__", REDDIT
|
|
)
|
|
img = Image.open(requests.get(img_url, stream=True).raw)
|
|
# Resize image to half its size
|
|
resized_img = img.resize(
|
|
(img.width // downscale, img.height // downscale)
|
|
)
|
|
with open(resized_img_path, "wb") as wf:
|
|
resized_img.save(wf, format="png")
|
|
|
|
configs["eval"]["page_image_query"][0]["eval_fuzzy_image_match"] = resized_img_path
|
|
|
|
tmp_config = config_file.replace(".json", ".tmp.json")
|
|
with open(tmp_config, "w+") as f:
|
|
json.dump(configs, f, indent=4)
|
|
|
|
gt_url = configs["eval"]["reference_url"]
|
|
agent = TeacherForcingAgent()
|
|
agent.set_action_set_tag(tag="playwright")
|
|
action_seq = f"""page.goto("{gt_url}")
|
|
page.stop()"""
|
|
agent.set_actions(action_seq)
|
|
|
|
env = script_browser_env
|
|
trajectory = tf_roll_out(agent, env, tmp_config)
|
|
|
|
captioning_fn = lambda images, *args, **kwargs: ["yes"] * len(
|
|
images
|
|
) # Unused for this task
|
|
evalutor = PageImageEvaluator(captioning_fn)
|
|
score = evalutor(
|
|
trajectory, tmp_config, env.page
|
|
)
|
|
assert score == expected_score, config_file
|
|
os.remove(tmp_config)
|
|
os.remove(resized_img_path) |