diff --git a/AgentOccam/AgentOccam.py b/AgentOccam/AgentOccam.py new file mode 100644 index 0000000..e00d806 --- /dev/null +++ b/AgentOccam/AgentOccam.py @@ -0,0 +1,1448 @@ +from AgentOccam.obs_opt import parse_node_descendants, parse_node_ancestors, parse_node_siblings, action_set_invisible, action_set_visible, action_set_visible_if_with_name, translate_node_to_str, construct_new_DOM_with_visible_nodes +from AgentOccam.llms.claude import call_claude, call_claude_with_messages, arrange_message_for_claude +from AgentOccam.llms.mistral import call_mistral, call_mistral_with_messages, arrange_message_for_mistral +from AgentOccam.llms.cohere import call_cohere, call_cohere_with_messages, arrange_message_for_cohere +from AgentOccam.llms.llama import call_llama, call_llama_with_messages, arrange_message_for_llama +from AgentOccam.llms.titan import call_titan, call_titan_with_messages, arrange_message_for_titan +from AgentOccam.llms.gpt import call_gpt, call_gpt_with_messages, arrange_message_for_gpt +from AgentOccam.llms.gemini import call_gemini, call_gemini_with_messages, arrange_message_for_gemini +from AgentOccam.utils import CURRENT_DIR, HOMEPAGE_URL + +from typing import Dict +import re +import copy +import os +from functools import partial +import random +import json + +import warnings +warnings.filterwarnings("ignore") + + +DEFAULT_DOCUMENTED_INTERACTION_ELEMENTS = ["observation", "action"] +DEFAULT_ONLINE_INTERACTION_ELEMENTS = ["url", "observation"] +MODEL_FAMILIES = ["claude", "mistral", "cohere", "llama", "titan", "gpt", "gemini"] +CALL_MODEL_MAP = { + "claude": call_claude, + "mistral": call_mistral, + "cohere": call_cohere, + "llama": call_llama, + "titan": call_titan, + "gpt": call_gpt, + "gemini": call_gemini, +} +CALL_MODEL_WITH_MESSAGES_FUNCTION_MAP = { + "claude": call_claude_with_messages, + "mistral": call_mistral_with_messages, + "cohere": call_cohere_with_messages, + "llama": call_llama_with_messages, + "titan": call_titan_with_messages, + "gpt": 
call_gpt_with_messages, + "gemini": call_gemini_with_messages, +} +ARRANGE_MESSAGE_FOR_MODEL_MAP = { + "claude": arrange_message_for_claude, + "mistral": arrange_message_for_mistral, + "cohere": arrange_message_for_cohere, + "llama": arrange_message_for_llama, + "titan": arrange_message_for_titan, + "gpt": arrange_message_for_gpt, + "gemini": arrange_message_for_gemini, +} + +class Agent: + def __init__(self, config, objective, prompt_template): + self.config = config + self.objective = objective + self.prompt_template = prompt_template + + if hasattr(self.config, "documented_interaction_elements"): + self.previous_interactions = {k: [] for k in set(DEFAULT_DOCUMENTED_INTERACTION_ELEMENTS+self.config.documented_interaction_elements)} + else: + self.previous_interactions = {k: [] for k in DEFAULT_DOCUMENTED_INTERACTION_ELEMENTS} + if hasattr(self.config, "online_interaction_elements"): + self.online_interaction = {k: None for k in set(DEFAULT_ONLINE_INTERACTION_ELEMENTS+self.config.online_interaction_elements)} + else: + self.online_interaction = {k: None for k in DEFAULT_ONLINE_INTERACTION_ELEMENTS} + + self.model_family = [model_family for model_family in MODEL_FAMILIES if model_family in self.config.model][0] + self.call_model = partial(CALL_MODEL_MAP[self.model_family], model_id=self.config.model) + self.call_model_with_message = partial(CALL_MODEL_WITH_MESSAGES_FUNCTION_MAP[self.model_family], model_id=self.config.model) + self.arrange_message_for_model = ARRANGE_MESSAGE_FOR_MODEL_MAP[self.model_family] + + def shift_model(self, model_id): + self.model_family = [model_family for model_family in MODEL_FAMILIES if model_family in model_id][0] + self.call_model = partial(CALL_MODEL_MAP[self.model_family], model_id=model_id) + self.call_model_with_message = partial(CALL_MODEL_WITH_MESSAGES_FUNCTION_MAP[self.model_family], model_id=model_id) + self.arrange_message_for_model = ARRANGE_MESSAGE_FOR_MODEL_MAP[self.model_family] + + def prune_message_list(self, 
message_list): + return self.merge_adjacent_text([m for m in message_list if not (m[0]=="text" and len(m[1])==0)]) + + def merge_adjacent_text(self, message_list): + merged_list = [] + current_tuple = None + + for tup in message_list: + if tup[0] == "text": + if current_tuple: + current_tuple = (current_tuple[0], current_tuple[1] + tup[1]) + else: + current_tuple = tup + else: + if current_tuple: + merged_list.append(current_tuple) + current_tuple = None + merged_list.append(tup) + + if current_tuple: + merged_list.append(current_tuple) + + return merged_list + + + def get_step(self): + return len(self.previous_interactions["action"]) + + def update_objective(self, objective): + self.objective = objective + + def update_online_state(self, **online_states): + for k in online_states.keys(): + if k in self.online_interaction.keys(): + self.online_interaction[k] = online_states[k] + + def update_history(self, **interaction_dict): + for k in interaction_dict.keys(): + if k in self.previous_interactions.keys(): + self.previous_interactions[k].append(interaction_dict[k]) + + def equal_history_length(self): + lengths = [len(self.previous_interactions[k]) for k in self.previous_interactions.keys()] + return (len(set(lengths)) == 1) + + def parse_elements(self, text, key_list): + element_dict = {} + for k in key_list: + # _match = re.search(rf'{k.upper()}:\s*(.*?)\s*(?=\n[A-Z\d\s\W]*: *\n|$)', text, re.DOTALL) + _match = re.search(rf'{k.upper()}:\s*(.*?)\s*(?=\n[A-Z\s]*:|$)', text, re.DOTALL) + element_dict[k] = _match.group(1).strip() if _match else "" + return element_dict + + def get_output_specifications(self): + output_specifications = "\n".join([f"{o.upper()}:\n" + "".join(open(os.path.join(CURRENT_DIR, "AgentOccam", "prompts", "output_specifications", "{}.txt".format(o.replace(" ", "_"))), "r").readlines()) for o in self.config.output]) + return output_specifications + + def parse_stipulated_action_list(self, text: str, action: str, actions: list) -> str: + pattern = 
rf'({re.escape(action)}\s*(.*?))(?=\n(?:{"|".join(map(re.escape, actions))})|$)' + return [match[0].strip() for match in re.findall(pattern, text, re.DOTALL)] + + def parse_str_to_action_list(self, text:str, actions: list): + remain_text = copy.deepcopy(text) + action_list = [] + while remain_text: + find_action = False + for action in actions: + if remain_text.startswith(action): + match = re.search(rf'({re.escape(action)}\s*(.*?))(?=\n(?:{"|".join(map(re.escape, actions))})|$)', remain_text, re.DOTALL) + action_list.append(match[0]) + remain_text = remain_text[len(match[0]):].strip() + find_action = True + if not find_action: + break + return action_list + + def get_observation_text(self, idx=None): + if isinstance(self.online_interaction["observation"], dict): + if idx: + return self.previous_interactions["observation"][idx]["text"] + return self.online_interaction["observation"]["text"] + elif isinstance(self.online_interaction["observation"], str): + if idx: + return self.previous_interactions["observation"][idx] + return self.online_interaction["observation"] + + def get_observation_image(self, idx=None): + if isinstance(self.online_interaction["observation"], dict): + if idx: + return self.previous_interactions["observation"][idx]["image"] + return self.online_interaction["observation"]["image"] + elif isinstance(self.online_interaction["observation"], str): + return None + + def get_observation_node(self, idx=None): + if isinstance(self.online_interaction["observation"], dict): + if idx != None: + return self.previous_interactions["observation"][idx]["node"] + return self.online_interaction["observation"]["node"] + elif isinstance(self.online_interaction["observation"], str): + return None + + def get_observation_node_str(self, idx=None): + if isinstance(self.online_interaction["observation"], dict): + if idx != None: + return self.previous_interactions["observation"][idx]["node_str"] + return 
class PlanTreeNode:
    """A node of the agent's plan tree.

    Each node stores its plan text, its depth (``level``), the URL and step at
    which it was created, plus bookkeeping lists (notes, hints, resume reasons
    and the steps taken while the node was active).
    """

    def __init__(self, id, type, text, level, url, step):
        # NOTE: `id` and `type` shadow builtins, but they are part of the
        # keyword interface used by callers, so the names are kept.
        self.visible = True
        self.id = id
        self.type = type
        self.text = text
        self.level = level
        self.url = url
        self.step = step
        self.children = []
        self.parent = None
        self.note = []
        self.hint = []
        self.resume_reason = []
        self.steps_taken = []

    def reset(self):
        """Make the node visible again and clear notes, hints and steps taken.

        ``resume_reason`` is deliberately left untouched.
        """
        self.visible = True
        self.note = []
        self.hint = []
        self.steps_taken = []

    def add_child(self, child):
        """Attach ``child`` under this node and set its ``parent`` link."""
        child.parent = self
        self.children.append(child)

    def search_node_by_id(self, target_id):
        """Depth-first search for a visible node whose id equals ``target_id``.

        Children of an invisible node are still searched.

        Returns:
            The first matching node in pre-order, or ``None``.
        """
        if self.visible and self.id == target_id:
            return self
        for child in self.children:
            result = child.search_node_by_id(target_id)
            if result:
                return result
        return None

    def traverse(self, action=None, tree_buffer=None):
        """Apply ``action`` to this node and every descendant in pre-order.

        Args:
            action: Callable taking a node. A truthy return value is collected:
                lists are extended into ``tree_buffer``, anything else is
                appended. Must be callable; the ``None`` default is kept only
                for signature compatibility.
            tree_buffer: List that receives the collected results. When omitted
                a fresh list is used for this traversal, so results never leak
                between unrelated calls (the previous ``[]`` default was shared
                by every call).
        """
        if tree_buffer is None:
            tree_buffer = []
        res_action = action(self)
        if res_action:
            if isinstance(res_action, list):
                tree_buffer.extend(res_action)
            else:
                tree_buffer.append(res_action)
        for child in self.children:
            child.traverse(action, tree_buffer=tree_buffer)
prompt_template) + def get_instruction(self): + return self.prompt_template["instruction_template"] + def get_online_input(self): + return [("text", self.prompt_template["input_template"].replace("{current_observation}", self.get_observation_text()).replace("{objective}", self.objective))] + def get_action(self, instruction, online_input): + model_response = self.call_model_with_message(system_prompt=instruction, messages=self.arrange_message_for_model(online_input)) + action_elements = self.parse_elements(text=model_response, key_list=self.config.output) + action = action_elements["response"] + action_elements["action"] = f"note [{action}]" + action_elements["instruction"] = instruction + action_elements["input"] = online_input + return model_response, action_elements + +class PlanningActor(Agent): + def __init__(self, config, objective, prompt_template): + super().__init__(config, objective, prompt_template) + self.instruction = None + + def get_planning_specifications(self): + return "\n".join(["- " + "".join(open(os.path.join(CURRENT_DIR, "AgentOccam", "prompts", "planning_specifications", f"{p}.txt"), "r").readlines()) for p in self.config.planning_command]) + + def get_instruction(self): + if self.instruction: + return self.instruction + output_specifications = self.get_output_specifications() + self.instruction = self.prompt_template["instruction_template"].replace("{output_specifications}", output_specifications).replace("{planning_specifications}", self.get_planning_specifications()) + return self.instruction + + def get_online_input(self): + return None + + def get_action(self, instruction, online_input): + model_response = self.call_model_with_message(system_prompt=instruction, messages=self.arrange_message_for_model(online_input)) + action_elements = self.parse_elements(text=model_response, key_list=self.config.output) + action_elements["action"] = copy.deepcopy(action_elements["plan"]) + del action_elements["plan"] + action_elements["reason"] = "N/A" + 
class ReflectionActor(Agent):
    """Agent identity whose instruction lists planning and navigation commands.

    The instruction is assembled once from the prompt template and cached on
    the instance.
    """

    def __init__(self, config, objective, prompt_template):
        super().__init__(config, objective, prompt_template)
        # Lazily built by get_instruction().
        self.instruction = None

    @staticmethod
    def _read_specifications(folder, commands):
        """Read ``<folder>/<command>.txt`` for each command as a ``- `` bullet list."""
        spec_dir = os.path.join(CURRENT_DIR, "AgentOccam", "prompts", folder)
        bullets = []
        for command in commands:
            # `with` closes each handle; the original never closed them.
            with open(os.path.join(spec_dir, f"{command}.txt"), "r") as spec_file:
                bullets.append("- " + spec_file.read())
        return "\n".join(bullets)

    def get_planning_specifications(self):
        """Return the bullet list for ``self.config.planning_command``."""
        return self._read_specifications("planning_specifications", self.config.planning_command)

    def get_navigation_specifications(self):
        """Return the bullet list for ``self.config.navigation_command``."""
        return self._read_specifications("navigation_specifications", self.config.navigation_command)

    def get_instruction(self):
        """Return the instruction text, building and caching it on first use.

        The ``{output_specifications}``, ``{planning_specifications}`` and
        ``{navigation_specifications}`` placeholders of
        ``prompt_template["instruction_template"]`` are substituted in that order.
        """
        if self.instruction:
            return self.instruction
        output_specifications = self.get_output_specifications()
        planning_specifications = self.get_planning_specifications()
        navigation_specifications = self.get_navigation_specifications()
        instruction = self.prompt_template["instruction_template"]
        instruction = instruction.replace("{output_specifications}", output_specifications)
        instruction = instruction.replace("{planning_specifications}", planning_specifications)
        instruction = instruction.replace("{navigation_specifications}", navigation_specifications)
        self.instruction = instruction
        return self.instruction

    def get_online_input(self):
        """This identity builds no online input of its own; always ``None``."""
        return None

    def get_action(self, instruction, online_input):
        """Query the model and parse its response into the configured elements.

        Args:
            instruction: System prompt passed to the model.
            online_input: Message list, arranged for the bound model family
                before the call.

        Returns:
            tuple: ``(model_response, action_elements)``. ``action_elements``
            holds the parsed ``self.config.output`` keys plus the
            ``"instruction"`` and ``"input"`` that produced them.
        """
        model_response = self.call_model_with_message(
            system_prompt=instruction,
            messages=self.arrange_message_for_model(online_input),
        )
        action_elements = self.parse_elements(text=model_response, key_list=self.config.output)
        action_elements["instruction"] = instruction
        action_elements["input"] = online_input
        return model_response, action_elements
Actor(Agent): + def __init__(self, config, objective, prompt_template, plan_tree_node): + super().__init__(config, objective, prompt_template) + self.plan_tree_root = plan_tree_node + self.active_node = plan_tree_node + self.output_specifications = None + self.planning_specifications = None + self.navigation_specifications = None + self.criticism_element_list = None + + self.output_play_path = os.path.join(CURRENT_DIR, f"play-{self.config.others.logname}.txt") if getattr(self.config.others, "logname", "") != "" else os.path.join(CURRENT_DIR, f"play.txt") + self.output_trash_path = os.path.join(CURRENT_DIR, f"trash-{self.config.others.logname}.txt") if getattr(self.config.others, "logname", "") != "" else os.path.join(CURRENT_DIR, f"trash.txt") + + self.identities = [] + if hasattr(self.config, "identities"): + i = 0 + while hasattr(self.config.identities, f"identity_{i}"): + identity_config = getattr(self.config.identities, f"identity_{i}") + self.identities.append(IDENTITY_CLASS_MAP[identity_config.name](identity_config, objective=objective, prompt_template=prompt_template[identity_config.name])) + i += 1 + + def update_online_state(self, **online_states): + super().update_online_state(**online_states) + for identity in self.identities: + identity.update_online_state(**online_states) + + def is_planning(self, action): + for c in self.config.planning_command: + if action.startswith(c): + return c + return False + + def is_navigation(self, action): + action_without_note = re.sub(rf'(note\s*(.*?))(?=\n(?:{"|".join(map(re.escape, self.config.navigation_command))})|$)', "", action).strip() + for c in self.config.navigation_command: + if action_without_note.startswith(c): + return c + return False + + def is_valid_action(self, action_str): + action = ( + action_str.split("[")[0].strip() + if "[" in action_str + else action_str.split()[0].strip() + ) + match action: + case "click": + match = re.search(r"click ?\[(\d+)\]", action_str) + if not match: + return False + 
element_id = match.group(1) + if element_id in self.get_observation_text(): + return True + return False + case "type": + if not (action_str.endswith("[0]") or action_str.endswith("[1]")): + action_str += " [1]" + + match = re.search( + r"type ?\[(\d+)\] ?\[(.*)\] ?\[(\d+)\]", action_str, re.DOTALL + ) + if not match: + return False + element_id, text, enter_flag = ( + match.group(1), + match.group(2), + match.group(3), + ) + enter_flag = True if enter_flag == "1" else False + if enter_flag: + text += "\n" + if element_id in self.get_observation_text(): + return True + case "go_back": + return True + case "go_home": + return True + case "note": + return True + case "stop": + return True + case "branch": + return True + case "prune": + return True + case "goto": + return True + case "scroll": + return True + + def are_valid_actions(self, actions): + action_list = self.parse_str_to_action_list(actions, self.config.planning_command+self.config.navigation_command+["goto"]) + if not action_list: + return False + for action in action_list: + if not self.is_valid_action(action): + return False + return True + + def get_previous_plans(self, verbose=False): + def action_return_visible_node(node, verbose=False): + if node.id == self.active_node.id: + basic = "\t" * node.level + f"[{node.id}] (Active Plan) {node.text}" if node.visible else None + else: + basic = "\t" * node.level + f"[{node.id}] {node.text}" if node.visible else None + if basic and len(node.resume_reason) > 0: + basic += f" # Was resumed to this step {len(node.resume_reason)} time(s) for:" + for i, reason in enumerate(node.resume_reason): + basic += f" {i}. {reason}" + if verbose and basic and len(node.note) > 0: + for i, note in enumerate(node.note): + basic += "\n" + "\t" * node.level + f"Note {i}. 
{note}" + return basic + plan_tree_buffer = [] + parse_node_descendants(self.plan_tree_root, partial(action_return_visible_node, verbose=verbose), tree_buffer=plan_tree_buffer) + return "\n".join(plan_tree_buffer) + + def get_active_plan(self): + return f"[{self.active_node.id}] {self.active_node.text}" + + def get_interaction_history(self, interaction_history_config=False, mode="highlight"): + interaction_history_config = interaction_history_config if interaction_history_config else self.config.interaction_history + + previous_observation = [] + for i in self.active_node.steps_taken: + if self.get_observation_node_str() and self.get_observation_node_str(i) and not self.get_observation_node_str() == self.get_observation_node_str(i): + if self.previous_interactions["observation highlight"][i] and mode == "highlight" and len(translate_node_to_str(self.previous_interactions["observation highlight"][i], mode="name_only", retained_ids=self.previous_interactions["retained element ids"][i]).split()) < 200: + try: + previous_observation.append({"text": translate_node_to_str(self.previous_interactions["observation highlight"][i], mode="name_only", retained_ids=self.previous_interactions["retained element ids"][i]), "image": self.get_observation_image(i)}) + except: + print(i, self.previous_interactions["observation"][i]["text"]) + raise ValueError("Cannot translate highlight node to text.") + else: + previous_observation.append({"text": self.previous_interactions["observation summary"][i], "image": self.get_observation_image(i)}) + elif not self.get_observation_node() or mode == "full": + if len(self.get_observation_text(i).split()) < 200: + previous_observation.append({"text": self.get_observation_text(i), "image": self.get_observation_image(i)}) + else: + previous_observation.append({"text": self.previous_interactions["observation summary"][i], "image": self.get_observation_image(i)}) + else: + previous_observation.append({"text": "The same as the CURRENT OBSERVATION (see 
below CURRENT OBSERVATION section).", "image": self.get_observation_image(i)}) + + previous_observation_summary = [self.previous_interactions["observation summary"][i] for i in self.active_node.steps_taken] + + def get_text(obs): + if isinstance(obs, dict): + return obs["text"] + elif isinstance(obs, str): + return obs + + def get_image(obs): + if isinstance(obs, dict): + return obs["image"] + elif isinstance(obs, str): + return obs + + if interaction_history_config.step_num == "all": + textual_observations = [get_text(obs) for obs in previous_observation] if interaction_history_config.verbose else previous_observation_summary + visual_observations = [get_image(obs) for obs in previous_observation] + else: + textual_observations = previous_observation_summary[:-interaction_history_config.step_num] + visual_observations = [None] * len(previous_observation_summary[:-interaction_history_config.step_num]) + textual_observations += [get_text(obs) for obs in previous_observation][-interaction_history_config.step_num:] if interaction_history_config.verbose else previous_observation_summary[-interaction_history_config.step_num:] + visual_observations += [get_image(obs) for obs in previous_observation][-interaction_history_config.step_num:] + + plans = [self.previous_interactions["plan"][i] for i in self.active_node.steps_taken] + reasons = [self.previous_interactions["reason"][i] for i in self.active_node.steps_taken] + actions = [self.previous_interactions["action"][i] for i in self.active_node.steps_taken] + + if "image" in interaction_history_config.type: + message_list = [] + for step, (obs, vi_obs, plan, reason, action) in enumerate(zip(textual_observations, visual_observations, plans, reasons, actions)): + message_list.append(("text", f"\n")) + if vi_obs: + message_list.append(("text", "VISUAL OBSERVATION:\n")) + message_list.append(("image", vi_obs)) + if self.active_node.id != 0: + message_list.append(("text", f"TEXTUAL OBSERVATION:\n{obs}\nACTIVE 
PLAN:\n{plan}\nREASON FOR ACTION:\n{reason}\nACTION:\n{action}\n\n")) + else: + message_list.append(("text", f"TEXTUAL OBSERVATION:\n{obs}\nREASON FOR ACTION:\n{reason}\nACTION:\n{action}\n\n")) + return self.prune_message_list(message_list=message_list) + else: + message = "" + for step, (obs, plan, reason, action) in enumerate(zip(textual_observations, plans, reasons, actions)): + if self.active_node.id != 0: + message += f"\nOBSERVATION:\n{obs}\nACTIVE PLAN:\n{plan}\nREASON FOR ACTION:\n{reason}\nACTION:\n{action}\n\n" # f"\nOBSERVATION:\n{obs}\nACTIVE PLAN:\n{plan}\nREASON FOR ACTION:\n{reason}\nACTION:\n{action}\n\n" + else: + message += f"\nOBSERVATION:\n{obs}\nREASON FOR ACTION:\n{reason}\nACTION:\n{action}\n\n" # f"\nOBSERVATION:\n{obs}\nREASON FOR ACTION:\n{reason}\nACTION:\n{action}\n\n" + return self.prune_message_list(message_list=[("text", message)]) + + def pre_process_atomic_actions(self, atomic_action_list=["combobox"]): + if self.get_observation_node() and "combobox" in atomic_action_list: + self.online_interaction["observation"]["text"] = translate_node_to_str(self.get_observation_node(), mode="concise", hidden_roles=["menu", "combobox", "listbox"]) + + def get_online_input(self, criticism_elements): + input_template = self.prompt_template["input_template"] + input_prefix, input_suffix = input_template.split("{input}") + INPUT_TYPE_TO_CONTENT_MAP = { + "step": self.get_step(), + "objective": self.objective, + "previous plans": self.get_previous_plans(verbose=True), + "interaction history": self.get_interaction_history(), + "current observation": self.get_observation_text(), + "current visual observation": self.get_observation_image() + } + input_list = [] + for input_type in self.config.input: + input_content = None + if input_type == "current visual observation": + continue + elif input_type in INPUT_TYPE_TO_CONTENT_MAP.keys(): + input_content = INPUT_TYPE_TO_CONTENT_MAP[input_type] + elif input_type.startswith("critic: ") and criticism_elements 
and input_type[len("critic: "):] in criticism_elements.keys() and criticism_elements[input_type[len("critic: "):]]: + input_type = input_type[len("critic: "):] + input_content = criticism_elements[input_type] + input_type = "FROM USER: " + input_type + if input_content and isinstance(input_content, str): + input_list.append(("text", f"{input_type.upper()}:\n{input_content}\n")) + elif input_content and isinstance(input_content, list): + input_list.append(("text", f"{input_type.upper()}:\n")) + input_list += input_content if len(input_content) > 0 else ["N/A"] + + if "image" in self.config.current_observation.type: + input_type = "current visual observation" + input_list.append(("text", f"{input_type.upper()}:\n")) + input_list.append(("image", INPUT_TYPE_TO_CONTENT_MAP["current visual observation"])) + + return self.prune_message_list(message_list=[("text", input_prefix)] + input_list + [("text", input_suffix)]) + + def get_planning_specifications(self): + if self.planning_specifications: + return self.planning_specifications + self.planning_specifications = "\n".join(["- " + "".join(open(os.path.join(CURRENT_DIR, "AgentOccam", "prompts", "planning_specifications", f"{p}.txt"), "r").readlines()) for p in self.config.planning_command]) + return self.planning_specifications + + def get_navigation_specifications(self): + if self.navigation_specifications: + return self.navigation_specifications + self.navigation_specifications = "\n".join(["- " + "".join(open(os.path.join(CURRENT_DIR, "AgentOccam", "prompts", "navigation_specifications", f"{n}.txt"), "r").readlines()) for n in self.config.navigation_command]) + return self.navigation_specifications + + def get_actor_instruction(self, examples=None): + if self.config.planning_command: + instruction = self.prompt_template["instruction_template"]["with_planning"] + else: + instruction = self.prompt_template["instruction_template"]["without_planning"] + output_specifications = self.get_output_specifications() + 
planning_specifications = self.get_planning_specifications() + navigation_specifications = self.get_navigation_specifications() + instruction = instruction.replace("{output_specifications}", output_specifications) + instruction = instruction.replace("{planning_specifications}", planning_specifications) + instruction = instruction.replace("{navigation_specifications}", navigation_specifications) + + example_source = examples if examples is not None else self.prompt_template.get("examples", []) + if len(example_source) > 0: + instruction += f"\n\n## Here are a few examples:" + for i, example in enumerate(example_source): + example_input = example["input"] + example_output = example["output"] + if "example_template" in self.prompt_template.keys(): + instruction += "\n\n" + instruction += self.prompt_template.get("example_template", "| Example {i}\n### Input:\n{example_input}\n### Response: Let's think step by step.\n{example_response}").replace("{i}", i).replace("{example_input}", example_input).replace("{example_output}", example_output) + else: + instruction += f"\n\n| Example {i}\n\n### Input:\n{example_input}\n\n### Response: Let's think step by step.\n{example_output}" + + if self.get_step() == self.config.others.max_steps - 1: + instruction += f"\n\nWARNING: You have a {self.config.others.max_steps}-step budget, and this would be your FINAL STEP. Wrap up your observations and return your answer with `stop [answer]` to maximize the reward." + # else: + # instruction += f"\n\nWARNING: You have a {self.config.others.max_steps}-step budget, and there are {self.config.others.max_steps-self.get_step()} remaining attempts." 
+ + return instruction + + def verbose(self, instruction, online_input, model_response_list, action_element_list): + action_element_keys = [k for k in self.config.play if k in action_element_list[0].keys()] + other_play_keys = [k for k in self.config.play if k not in action_element_list[0].keys()] + + VERBOSE_TO_CONTENT_MAP = { + "step": self.get_step(), + "objective": self.objective, + "previous plans": self.get_previous_plans(verbose=True), + "url": self.online_interaction["url"], + "observation": self.get_observation_text(), + "response": "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n".join([f"|\tAgent {i}:\n{model_response}" for i, model_response in enumerate(model_response_list[:self.config.number])]) if self.config.number > 1 else model_response_list[0], + "instruction": instruction, + "online input": "\n".join([i[1] for i in online_input if i[0]=="text"]), + "alter ego response": "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n".join(["|\tAgent {}:\n{}".format(identity.config.name, response) for identity, response in zip(self.identities, model_response_list[self.config.number:])]) + } + + if self.config.others.verbose > 0 and self.config.verbose > 0: + with open(self.output_trash_path, "a") as af: + af.write("-"*32+"ACTOR"+"-"*32+"\n") + for t in self.config.trash: + content = VERBOSE_TO_CONTENT_MAP.get(t, "") + with open(self.output_trash_path, "a") as af: + af.write(f"{t.upper()}:\n{content}\n\n") + with open(self.output_play_path, "w") as _: + pass + for p in other_play_keys: + content = VERBOSE_TO_CONTENT_MAP.get(p, "") + with open(self.output_play_path, "a") as af: + af.write(f"{p.upper()}:\n{content}\n\n") + for i, action_elements in enumerate(action_element_list): + if len(action_element_list) > 1: + with open(self.output_play_path, "a") as af: + af.write("-"*32+f"AGENT {i}"+"-"*32+"\n") + for action_element_key in action_element_keys: + content = action_elements.get(action_element_key, "N/A") + with 
open(self.output_play_path, "a") as af: + af.write(f"{action_element_key.upper()}:\n{content}\n\n") + + def parse_plan(self, planning): + planning_type = self.is_planning(action=planning) + match = re.search( + rf"{planning_type} ?\[(\d+)\] ?\[(.+)\]", planning, re.DOTALL + ) + if not match: + raise ValueError("Invalid planning command.") + node_id, planning_content = ( + int(match.group(1)), + match.group(2) + ) + return planning_type, node_id, planning_content + + def prune_planning(self, node:PlanTreeNode, planning_content): + def set_invisible(node:PlanTreeNode): + node.visible = False + def return_steps_taken(node:PlanTreeNode): + return [node.step] + node.steps_taken + after_node = False + if node.id > 0: + for child in node.parent.children: + if not after_node and child != node: + continue + elif child == node: + after_node = True + continue + child.visible = False + node.traverse(set_invisible) + node.reset() + steps_taken = [] + node.traverse(action=return_steps_taken, tree_buffer=steps_taken) + node.steps_taken = sorted(list(set(steps_taken)), reverse=False) + node.resume_reason.append(planning_content) + navigation = f"goto [{node.url}] [1]" + self.active_node = node + return navigation + + def branch_planning(self, node, planning_content): + new_node = PlanTreeNode(id=self.active_node.id+1, type=type, text=planning_content, level=node.level+1, url=self.online_interaction["url"], step=self.get_step()) + self.active_node = new_node + node.add_child(new_node) + + def planning(self, action): + if action and self.is_planning(action): + try: + planning_type, node_id, planning_content = self.parse_plan(planning=action) + node = self.plan_tree_root.search_node_by_id(node_id) + if not node: + raise ValueError(f"Invalid node id {node_id}: {action}.") + if planning_type == "prune": + navigation_action = self.prune_planning(node=node, planning_content=planning_content) + return navigation_action + elif planning_type == "branch": + self.branch_planning(node=node, 
planning_content=planning_content) + else: + raise ValueError(f"Invalid planning operation {planning_type}: {action}.") + except Exception as e: + print("Invalid plan node:", str(e)) + flaw_node = self.active_node + flaw_node.note.append(f"You previously generate plan \"{action}\", which has INVALID syntax. User planning command like `branch [parent_plan_id] [new_subplan_intent]` or `prune [resume_plan_id] [reason]`.") + else: + self.active_node.steps_taken.append(self.get_step()) + return None + + def go_home(self, action): + if "go_home" in action: + return f"goto [{HOMEPAGE_URL}] [1]" + return None + + def parse_action(self, action_str): + try: + DOM_root_node = self.get_observation_node() + action_str = action_str.strip() + action = ( + action_str.split("[")[0].strip() + if "[" in action_str + else action_str.split()[0].strip() + ) + match action: + case "click": + match = re.search(r"click ?\[(\d+)\]", action_str) + if not match: + raise ValueError(f"Invalid click action {action_str}") + element_id = match.group(1) + node = DOM_root_node.search_node_by_id(element_id) + return f"click [{element_id}] ({node.role} {node.name})" + case "hover": + match = re.search(r"hover ?\[(\d+)\]", action_str) + if not match: + raise ValueError(f"Invalid hover action {action_str}") + element_id = match.group(1) + node = DOM_root_node.search_node_by_id(element_id) + return f"hover [{element_id}] ({node.role} {node.name})" + case "type": + if not (action_str.endswith("[0]") or action_str.endswith("[1]")): + action_str += " [1]" + + match = re.search( + r"type ?\[(\d+)\] ?\[(.+)\] ?\[(\d+)\]", action_str + ) + if not match: + raise ValueError(f"Invalid type action {action_str}") + element_id, text, enter_flag = ( + match.group(1), + match.group(2), + match.group(3), + ) + enter_flag = True if enter_flag == "1" else False + if enter_flag: + text += "\n" + node = DOM_root_node.search_node_by_id(element_id) + return action + f" ({node.name})" + case "scroll": + return action_str + 
def parse_actions_to_element_ids(self, actions):
    """Collect the element ids targeted by the commands found in ``actions``.

    The action text is split into individual commands with
    ``self.parse_stipulated_action_list`` (once per configured navigation
    command). Every ``click``, ``hover`` and ``type`` command contributes the
    integer id from its first bracket group; all other commands contribute
    nothing.

    The previous implementation returned from inside the per-command loop, so
    only the first parseable command was ever inspected. Every command is now
    processed, and malformed ones are skipped.

    Args:
        actions: Raw action text, possibly holding several commands.

    Returns:
        list[int]: Element ids in the order the commands were extracted.
    """
    known_commands = (
        self.config.planning_command + self.config.navigation_command + ["goto"]
    )
    action_str_list = []
    for command in self.config.navigation_command:
        action_str_list += self.parse_stipulated_action_list(
            text=actions, action=command, actions=known_commands
        )

    # Only these commands reference an element id; the id is always group 1.
    element_id_patterns = {
        "click": r"click ?\[(\d+)\]",
        "hover": r"hover ?\[(\d+)\]",
        "type": r"type ?\[(\d+)\] ?\[(.+)\] ?\[(\d+)\]",
    }

    retained_element_ids = []
    for action_str in action_str_list:
        # Best effort: anything that is not a non-empty string is ignored.
        if not isinstance(action_str, str):
            continue
        action_str = action_str.strip()
        if not action_str:
            continue
        if "[" in action_str:
            command = action_str.split("[")[0].strip()
        else:
            command = action_str.split()[0].strip()
        pattern = element_id_patterns.get(command)
        if pattern is None:
            continue
        # A missing press-enter flag on ``type`` defaults to "[1]".
        if command == "type" and not action_str.endswith(("[0]", "[1]")):
            action_str += " [1]"
        found = re.search(pattern, action_str)
        if found is None:
            # Malformed command: skip it and keep scanning the rest.
            continue
        retained_element_ids.append(int(found.group(1)))

    return retained_element_ids
= [] + action_list = self.parse_str_to_action_list(action, actions=self.config.planning_command+self.config.navigation_command+["goto"]) + for a in action_list: + if "note [" in a: + note = re.search(r"note ?\[?(.+)", a, re.DOTALL).group(1) + if note.endswith("]"): + note = note[:-1] + self.active_node.note.append(f"STEP {self.get_step()}: {note}") + self.note_buffer = note + else: + none_note_action_list.append(a) + if note_as_action: + return action + return "\n".join(none_note_action_list) + # action_note = self.parse_action(action) + # if action_note: + # self.active_node.note.append(f"STEP {self.get_step()} ACTION: {action_note}") + return action + + def get_observation_highlight(self, action_elements:dict): + action_elements["observation highlight idxs"] = copy.deepcopy(action_elements.get("observation highlight", "")) + DOM_root_node = self.get_observation_node() + if not DOM_root_node: + action_elements["observation highlight"] = None + return + observation_highlight_idxs = [int(idx.strip()) for idx in action_elements.get("observation highlight", "").split(",") if idx.strip().isdigit()] + if observation_highlight_idxs: + parse_node_descendants(node=DOM_root_node, action=action_set_invisible) + for idx in observation_highlight_idxs: + try: + node = DOM_root_node.search_node_by_id(idx) + parse_node_descendants(node=node, action=action_set_visible) + parse_node_ancestors(node=node, action=action_set_visible) + parse_node_siblings(node=node, action=action_set_visible_if_with_name) + except: + pass + try: + assert DOM_root_node.get_visible_node_number() < 30 and construct_new_DOM_with_visible_nodes(DOM_root=DOM_root_node) + action_elements["observation highlight"] = construct_new_DOM_with_visible_nodes(DOM_root=DOM_root_node) + parse_node_descendants(node=DOM_root_node, action=action_set_visible) + except: + parse_node_descendants(node=DOM_root_node, action=action_set_visible) + action_elements["observation highlight"] = None + + action_elements["retained 
def parse_action_from_action_candidates(self, action_elements):
    """Turn a raw "action candidates" string into a list of reason/action dicts.

    When ``action_elements`` already has an ``"action"`` key it is returned
    untouched. Otherwise the first key containing ``"action candidates"`` is
    located (an ``AssertionError`` is raised when there is none) and its text
    value is replaced in place by a list of ``{'reason': ..., 'action': ...}``
    dicts. Entries whose reason or action is empty after stripping are dropped.
    """
    if "action" in action_elements:
        return action_elements

    candidate_keys = [key for key in action_elements if "action candidates" in key]
    assert candidate_keys
    target_key = candidate_keys[0]

    pair_pattern = r'- reason: \[(.*?)\]\s*(?:- action: \[(.*?)\])?\s*(?:\n|\Z)'
    raw_pairs = re.findall(pair_pattern, action_elements[target_key], re.DOTALL)
    cleaned_pairs = ((raw_reason.strip(), raw_action.strip()) for raw_reason, raw_action in raw_pairs)
    action_elements[target_key] = [
        {'reason': reason_text, 'action': action_text}
        for reason_text, action_text in cleaned_pairs
        if reason_text and action_text
    ]
    return action_elements
We don't have that element id in the current observation!".format(invalid_action_str), messages=self.arrange_message_for_model(online_input)) + else: + model_response = self.call_model_with_message(system_prompt=instruction, messages=self.arrange_message_for_model(online_input)) + action_elements = self.parse_elements(text=model_response, key_list=self.config.output) + action_elements = self.parse_action_from_action_candidates(action_elements=action_elements) + assert not ("action" in action_elements.keys() and any("action candidates" in k for k in action_elements.keys())) + if "action" in action_elements.keys(): + if self.are_valid_actions(action_elements["action"]): + note_buffer = getattr(self, "note_buffer", "") + if note_buffer and f"note [{note_buffer}" in action_elements["action"]: + print(f"Repetitive note: {note_buffer}") + repetitive_note = True + continue + get_valid_actions = True + action_elements["input"] = online_input + model_response_list.append(model_response) + action_element_list.append(action_elements) + else: + invalid_action_str = action_elements["action"] + print(f"Invalid actions: {invalid_action_str}") + invalid_actions = True + elif any("action candidates" in k for k in action_elements.keys()): + action_candidates_key = [k for k in action_elements.keys() if "action candidates" in k][0] + if isinstance(action_elements[action_candidates_key], str): + continue + filtered_action_candidates = [] + note_buffer = getattr(self, "note_buffer", "") + for action_reason_pair in action_elements[action_candidates_key]: + action = action_reason_pair["action"] + reason = action_reason_pair["reason"] + if self.are_valid_actions(action): + if note_buffer and f"note [{note_buffer}" in action: + print(f"Repetitive note: {note_buffer}") + repetitive_note = True + continue + filtered_action_candidates.append({'reason': reason, 'action': action}) + else: + invalid_action_str = action + print(f"Invalid actions: {invalid_action_str}") + invalid_actions = True + 
if filtered_action_candidates: + action_elements[action_candidates_key] = filtered_action_candidates + get_valid_actions = True + action_elements["input"] = online_input + model_response_list.append(model_response) + action_element_list.append(action_elements) + else: + raise NotImplementedError("You have to generate either action or action candidates.") + # if self.config.number != 1: + if True: + for identity in self.identities: + identity_instruction = identity.get_instruction() if identity.get_instruction() else instruction + identity_online_input = identity.get_online_input() if identity.get_online_input() else online_input + get_valid_actions = False + invalid_actions = False + while not get_valid_actions: + if invalid_actions: + model_response, action_elements = identity.get_action(identity_instruction+"\nGenerating the command `{}` will be severely punished! Don't generate invalid actions! We don't have that element id in the current observation!".format(invalid_action_str), identity_online_input) + else: + model_response, action_elements = identity.get_action(identity_instruction, identity_online_input) + if self.are_valid_actions(action_elements["action"]): + get_valid_actions = True + model_response_list.append(model_response) + action_element_list.append(action_elements) + else: + invalid_action_str = action_elements["action"] + print(f"Invalid actions: {invalid_action_str}") + invalid_actions = True + + self.verbose(instruction=instruction, online_input=online_input, model_response_list=model_response_list, action_element_list=action_element_list) + + if self.config.others.debug or self.config.debug: + for i in range(len(action_element_list)): + human_input = input(f"ACTION {i}: ") + if human_input != "": + action_element_list[i]["action"] = human_input + + return action_element_list + + def finalize_action(self, action_elements): + self.get_observation_highlight(action_elements=action_elements) + action = action_elements["action"] + navigation_action 
def get_output_specifications(self):
    """Build the output-specification section of the critic instruction.

    For every name in ``self.config.output`` a specification file is read from
    ``CURRENT_DIR/AgentOccam/prompts/output_specifications``. The
    character-specific file ``<name>_<character>.txt`` is preferred when it
    exists; otherwise ``<name>.txt`` is used (spaces in the name become
    underscores). Each file's content is prefixed with the upper-cased output
    name and a colon, and the sections are joined with newlines.

    Returns:
        str: The concatenated specification text (empty when no outputs are
        configured).

    Raises:
        OSError: If the chosen specification file cannot be opened.
    """
    sections = []
    for o in self.config.output:
        spec_dir = os.path.join(CURRENT_DIR, "AgentOccam", "prompts", "output_specifications")
        stem = o.replace(" ", "_")
        filepath = os.path.join(spec_dir, "{}_{}.txt".format(stem, self.config.character))
        if not os.path.exists(filepath):
            # No character-specific variant: fall back to the generic file.
            filepath = os.path.join(spec_dir, "{}.txt".format(stem))
        # The original opened the files inside a comprehension and never
        # closed them; the context manager releases each handle promptly.
        with open(filepath, "r") as spec_file:
            sections.append(f"{o.upper()}:\n" + spec_file.read())
    return "\n".join(sections)
INPUT_TYPE_TO_CONTENT_MAP[input_type] + if input_content and isinstance(input_content, str): + input_list.append(("text", f"{input_type.upper()}:\n{input_content}\n")) + elif input_content and isinstance(input_content, list): + input_list.append(("text", f"{input_type.upper()}:\n")) + input_list += input_content if len(input_content) > 0 else ["N/A"] + + if "image" in self.config.current_observation.type: + input_type = "current visual observation" + input_list.append(("text", f"{input_type.upper()}:\n")) + input_list.append(("image", INPUT_TYPE_TO_CONTENT_MAP["current visual observation"])) + + return self.prune_message_list(message_list=[("text", input_prefix)] + input_list + [("text", input_suffix)]) + + def get_criticism_elements(self): + if not self.config.mode: + return {} + if self.config.debug > 1: + criticism_elements = {k: random.choice(["I don't think the task is finished. Don't issue identical actions like taking the same notes. It's annoying. Continue.", "You have make a reasoning mistake. Continue.", "You have missed important details on this page. Continue.", "You don't follow the task requirements. Continue.", "The task assigner might just want to challenge you to answer no and there might be no answer for this brain teaser question. Who knows?", "You should break down the task by using the planning commands.", "You have not gone over all the relevant pages. 
def get_judge_instruction(self):
    """Return the judge's system instruction, building and caching it on first use.

    The instruction template from ``self.prompt_template`` has its three
    placeholders filled, in order, with the output specifications and with the
    planning and navigation specifications stored in
    ``self.actor_basic_info_dict``. The result is kept in ``self.instruction``
    and reused on later calls.
    """
    if not self.instruction:
        filled = self.prompt_template["instruction_template"]
        substitutions = (
            ("{output_specifications}", self.get_output_specifications()),
            ("{planning_specifications}", self.actor_basic_info_dict["planning_specifications"]),
            ("{navigation_specifications}", self.actor_basic_info_dict["navigation_specifications"]),
        )
        # Applied one after another, in the same order as the placeholders above.
        for placeholder, replacement in substitutions:
            filled = filled.replace(placeholder, replacement)
        self.instruction = filled
    return self.instruction
def flatten_action_element_list(self, action_element_list):
    """Expand "action candidates" entries into one element per candidate.

    An element with a key containing ``"action candidates"`` is deep-copied
    once per reason/action pair, with ``"action"`` and ``"reason"`` set from
    that pair. Elements without such a key are passed through as the same
    object. The resulting list is shuffled in place before being returned.
    """
    flattened = []
    for element in action_element_list:
        candidate_keys = [key for key in element if "action candidates" in key]
        if not candidate_keys:
            flattened.append(element)
            continue
        template = copy.deepcopy(element)
        for pair in element[candidate_keys[0]]:
            template["action"] = pair["action"]
            template["reason"] = pair["reason"]
            # Snapshot the template so later pairs do not overwrite this entry.
            flattened.append(copy.deepcopy(template))
    random.shuffle(flattened)

    return flattened
+ note_list.append([]) + none_note_stop_action_list = [] + for a in action_list: + if a.startswith("stop ["): + stop_list.append((a, i)) + elif a.startswith("note ["): + note_list[-1].append(a) + else: + none_note_stop_action_list.append(a) + item["action"] = "\n".join(none_note_stop_action_list) + if item["action"] and item["action"] not in seen: + seen.add(item["action"]) + deduplicated_list.append(item) + note_list = [("\n".join(notes), i) for i, notes in enumerate(note_list)] + return note_list, stop_list, deduplicated_list + + def deduplicate_action_element_list(lst): # deduplicate, remove action_elements with only note or stop command + seen = set() + deduplicated_list = [] + + for item in lst: + item = copy.deepcopy(item) + if item["action"] and item["action"] not in seen: + seen.add(item["action"]) + deduplicated_list.append(item) + return deduplicated_list + + if hasattr(self.config, "strict") and self.config.strict: + note_list, stop_list, deduplicated_action_element_list = deduplicate_action_element_list_strict(action_element_list) + if len(stop_list) >= 0.6 * len(action_element_list): + stop_action_choice = max([s[0] for s in stop_list], key=len) + stop_action_id = [s[1] for s in stop_list if s[0]==stop_action_choice][0] + return action_element_list[stop_action_id], {} + if not deduplicated_action_element_list: + note_action_choice = max([n[0] for n in note_list], key=len) + note_action_id = [n[1] for n in note_list if n[0]==note_action_choice][0] + action_elements = action_element_list[note_action_id] + action_elements["action"] = note_action_choice + return action_elements, {} + elif len(deduplicated_action_element_list) == 1: + action_elements = deduplicated_action_element_list[0] + note_action_choice = max([n[0] for n in note_list], key=len) + action_elements["action"] = note_action_choice + "\n" + action_elements["action"] + return action_elements, {} + else: + deduplicated_action_element_list = deduplicate_action_element_list(action_element_list) 
def get_observation_text(self):
    """Return the textual part of the latest observation.

    A dict observation yields its ``"text"`` entry; any other value is
    returned unchanged.
    """
    observation = self.online_observation
    return observation["text"] if isinstance(observation, dict) else observation
previous_plans=self.actor.get_previous_plans(verbose=True), planning_command=self.actor.config.planning_command, navigation_command=self.actor.config.navigation_command) + selected_action_elements, judgement_elements = self.judge.judge(action_element_list) + selected_action_elements = self.actor.finalize_action(selected_action_elements) + return {**selected_action_elements, **{"critic:"+k: criticism_elements[k] for k in criticism_elements.keys()}, **{"judge:"+k: judgement_elements[k] for k in judgement_elements.keys()}}, action_element_list + + def update_online_state(self, url, observation): + self.online_url = url + self.online_observation = observation + + def get_step(self): + return self.actor.get_step() + + def is_navigation(self, action): + return self.actor.is_navigation(action=action) + + def get_actor_active_plan(self): + return self.actor.get_active_plan() + + def get_trajectory(self): + return self.trajectory + + def act(self, objective, env): + self.objective = objective + self.sites = env.get_sites() + observation = env.observation() + url = env.get_url() + self.update_online_state(url=url, observation=observation) + self.init_actor() + self.init_critic() + self.init_judge() + while not env.done(): + observation = env.observation() + url = env.get_url() + self.update_online_state(url=url, observation=observation) + self.actor.update_online_state(url=url, observation=observation) + self.critic.update_online_state(url=url, observation=observation) + self.judge.update_online_state(url=url, observation=observation) + action_elements, action_element_list = self.predict_action() + action = action_elements["action"] + navigation_action = action_elements["action"] if not action_elements.get("navigation action", "") else action_elements.get("navigation action", "") + status = env.step(navigation_action) + if navigation_action and self.is_navigation(action=navigation_action) and status == False: # means invalid action + flaw_node = self.actor.active_node + 
def log_step(self, status, **kwargs):
    """Append one step record to ``self.trajectory``.

    The record starts with the objective, the current URL and the textual
    observation, is extended with every entry of ``status``, and then with
    each keyword argument whose value is JSON-serializable. Spaces in keyword
    names become underscores. Values stored under a key containing ``"input"``
    are message lists; they are collapsed into the concatenation of their
    ``"text"`` parts.

    Args:
        status: Mapping of environment status fields to copy into the record.
        **kwargs: Additional per-step fields to log on a best-effort basis.
    """
    def serialize_message_list(message_list):
        # Non-list values are logged as they are.
        if not isinstance(message_list, list):
            return message_list
        return "".join(m[1] for m in message_list if m[0] == "text")

    data_to_log = {
        'objective': self.objective,
        'url': self.online_url,
        'observation': self.get_observation_text(),
    }
    data_to_log.update(status.items())

    for key, value in kwargs.items():
        try:
            # Probe only: values that cannot be serialized are left out.
            json.dumps(value)
            logged_value = serialize_message_list(value) if "input" in key else value
        except Exception:
            # Logging stays best-effort, but unlike the former bare ``except``
            # this no longer swallows KeyboardInterrupt or SystemExit.
            continue
        data_to_log[key.replace(" ", "_")] = logged_value

    self.trajectory.append(data_to_log)
documented_interaction_elements: [] + online_interaction_elements: [] + character: "normal" + input: ["objective", "previous plans", "interaction history", "step", "current observation"] + interaction_history: + verbose: True + type: ["text"] + step_num: "all" + current_observation: + type: ["text"] + output: ["observation description", "mistakes"] + trash: ["instruction", "online input", "response"] + judge: + mode: true + debug: 0 + verbose: 1 + model: "gpt-4-turbo" + documented_interaction_elements: [] + online_interaction_elements: [] + strict: false + input: ["objective", "previous plans", "interaction history", "step", "current observation", "action choices"] + interaction_history: + verbose: True + type: ["text"] + step_num: "all" + current_observation: + type: ["text"] + output: ["plan progress assessment", "action assessment", "action selection"] + trash: ["instruction", "online input", "response"] +env: + fullpage: true + prune: true + max_browser_rows: 500 + headless: True + task_ids: ["stanford_cs_head", 65] + # a. "SHOPPING_ADMIN": [0, 1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 41, 42, 43, 62, 63, 64, 65, 77, 78, 79, 94, 95, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 119, 120, 121, 122, 123, 127, 128, 129, 130, 131, 157, 183, 184, 185, 186, 187, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 243, 244, 245, 246, 247, 288, 289, 290, 291, 292, 344, 345, 346, 347, 348, 374, 375, 423, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 470, 471, 472, 473, 474, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 676, 677, 678, 679, 680, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 790] + # b. 
"MAP": [7, 8, 9, 10, 16, 17, 18, 19, 20, 32, 33, 34, 35, 36, 37, 38, 39, 40, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 70, 71, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 97, 98, 99, 100, 101, 137, 138, 139, 140, 151, 152, 153, 154, 155, 218, 219, 220, 221, 222, 223, 224, 236, 237, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 265, 266, 267, 268, 287, 356, 363, 364, 365, 366, 367, 369, 370, 371, 372, 373, 377, 378, 379, 380, 381, 382, 383, 424, 425, 426, 427, 428, 429, 430, 737, 738, 739, 740, 741, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767] + # c. "SHOPPING": [21, 22, 23, 24, 25, 26, 47, 48, 49, 50, 51, 96, 117, 118, 124, 125, 126, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 188, 189, 190, 191, 192, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 238, 239, 240, 241, 242, 260, 261, 262, 263, 264, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 298, 299, 300, 301, 302, 313, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 351, 352, 353, 354, 355, 358, 359, 360, 361, 362, 368, 376, 384, 385, 386, 387, 388, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 465, 466, 467, 468, 469, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 528, 529, 530, 531, 532, 571, 572, 573, 574, 575, 585, 586, 587, 588, 589, 653, 654, 655, 656, 657, 671, 672, 673, 674, 675, 689, 690, 691, 692, 693, 792, 793, 794, 795, 796, 797, 798] + # d. 
"REDDIT": [27, 28, 29, 30, 31, 66, 67, 68, 69, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 580, 581, 582, 583, 584, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 681, 682, 683, 684, 685, 686, 687, 688, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735] + # e. "GITLAB": [44, 45, 46, 102, 103, 104, 105, 106, 132, 133, 134, 135, 136, 156, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 205, 206, 207, 258, 259, 293, 294, 295, 296, 297, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 314, 315, 316, 317, 318, 339, 340, 341, 342, 343, 349, 350, 357, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 522, 523, 524, 525, 526, 527, 533, 534, 535, 536, 537, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 576, 577, 578, 579, 590, 591, 592, 593, 594, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 736, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 783, 784, 785, 786, 787, 788, 789, 791, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811] \ No newline at end of file diff --git a/AgentOccam/configs/AgentOccam-SteP.yml b/AgentOccam/configs/AgentOccam-SteP.yml new file mode 100644 index 0000000..badf923 --- /dev/null +++ b/AgentOccam/configs/AgentOccam-SteP.yml @@ -0,0 +1,26 @@ +logging: True +verbose: 1 +debug: False +logdir: "../AgentOccam-Trajectories" +logname: "AgentOccam-SteP" +max_steps: 20 +agent: + type: "AgentOccam-SteP" + root_action: 
"shopping_admin_agent" # Need to be adapted to tasks + low_level_action_list: ['click', 'type', 'stop', 'goto', 'hover', 'note', 'go_back'] + model_name: "gpt-4-turbo" + model_host: "openai" + prompt_mode: "chat" + max_target_len: 100 +env: + fullpage: true + prune: true + max_env_steps: 20 + max_browser_rows: 500 + headless: True + task_ids: ["stanford_cs_head", 65] + # a. "SHOPPING_ADMIN": [0, 1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 41, 42, 43, 62, 63, 64, 65, 77, 78, 79, 94, 95, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 119, 120, 121, 122, 123, 127, 128, 129, 130, 131, 157, 183, 184, 185, 186, 187, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 243, 244, 245, 246, 247, 288, 289, 290, 291, 292, 344, 345, 346, 347, 348, 374, 375, 423, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 470, 471, 472, 473, 474, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 676, 677, 678, 679, 680, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 790] + # b. "MAP": [7, 8, 9, 10, 16, 17, 18, 19, 20, 32, 33, 34, 35, 36, 37, 38, 39, 40, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 70, 71, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 97, 98, 99, 100, 101, 137, 138, 139, 140, 151, 152, 153, 154, 155, 218, 219, 220, 221, 222, 223, 224, 236, 237, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 265, 266, 267, 268, 287, 356, 363, 364, 365, 366, 367, 369, 370, 371, 372, 373, 377, 378, 379, 380, 381, 382, 383, 424, 425, 426, 427, 428, 429, 430, 737, 738, 739, 740, 741, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767] + # c. 
"SHOPPING": [21, 22, 23, 24, 25, 26, 47, 48, 49, 50, 51, 96, 117, 118, 124, 125, 126, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 188, 189, 190, 191, 192, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 238, 239, 240, 241, 242, 260, 261, 262, 263, 264, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 298, 299, 300, 301, 302, 313, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 351, 352, 353, 354, 355, 358, 359, 360, 361, 362, 368, 376, 384, 385, 386, 387, 388, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 465, 466, 467, 468, 469, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 528, 529, 530, 531, 532, 571, 572, 573, 574, 575, 585, 586, 587, 588, 589, 653, 654, 655, 656, 657, 671, 672, 673, 674, 675, 689, 690, 691, 692, 693, 792, 793, 794, 795, 796, 797, 798] + # d. "REDDIT": [27, 28, 29, 30, 31, 66, 67, 68, 69, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 580, 581, 582, 583, 584, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 681, 682, 683, 684, 685, 686, 687, 688, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735] + # e. 
"GITLAB": [44, 45, 46, 102, 103, 104, 105, 106, 132, 133, 134, 135, 136, 156, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 205, 206, 207, 258, 259, 293, 294, 295, 296, 297, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 314, 315, 316, 317, 318, 339, 340, 341, 342, 343, 349, 350, 357, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 522, 523, 524, 525, 526, 527, 533, 534, 535, 536, 537, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 576, 577, 578, 579, 590, 591, 592, 593, 594, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 736, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 783, 784, 785, 786, 787, 788, 789, 791, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811] \ No newline at end of file diff --git a/AgentOccam/configs/AgentOccam-WebVoyager.yml b/AgentOccam/configs/AgentOccam-WebVoyager.yml new file mode 100644 index 0000000..d0583cf --- /dev/null +++ b/AgentOccam/configs/AgentOccam-WebVoyager.yml @@ -0,0 +1,74 @@ +logging: True +verbose: 1 +debug: False +logdir: "../AgentOccam-Trajectories" +logname: "AgentOccam-WebVoyager" +max_steps: 20 +agent: + type: "AgentOccam" + others: + max_steps: 20 + logname: "AgentOccam-WebVoyager" + logging: True + verbose: 1 + debug: False + actor: + debug: 0 + verbose: 1 + number: 1 + model: "gpt-4-turbo" + documented_interaction_elements: ["url", "plan", "reason", "observation summary", "retained element ids", "observation highlight"] + online_interaction_elements: [] + input: ["step", "objective", "previous plans", "interaction history", "current observation"] + interaction_history: + verbose: True + type: ["text"] + step_num: 3 + current_observation: + type: ["text"] + output: ["interaction history summary", 
"observation description", "reason", "action", "observation highlight"] + planning_command: ["branch", "prune"] + navigation_command: ["click", "type", "stop", "note", "go_back"] + play: ["step", "objective", "previous plans", "observation description", "reason", "action"] + trash: ["objective", "step", "url", "instruction", "online input", "response"] + critic: + mode: false + debug: 0 + verbose: 1 + model: "gpt-4-turbo" + documented_interaction_elements: [] + online_interaction_elements: [] + character: "normal" + input: ["objective", "previous plans", "interaction history", "step", "current observation"] + interaction_history: + verbose: True + type: ["text"] + step_num: "all" + current_observation: + type: ["text"] + output: ["observation description", "mistakes"] + trash: ["instruction", "online input", "response"] + judge: + mode: false + debug: 0 + verbose: 1 + model: "gpt-4-turbo" + documented_interaction_elements: [] + online_interaction_elements: [] + strict: false + input: ["objective", "previous plans", "interaction history", "step", "current observation", "action choices"] + interaction_history: + verbose: True + type: ["text"] + step_num: "all" + current_observation: + type: ["text"] + output: ["plan progress assessment", "action assessment", "action selection"] + trash: ["instruction", "online input", "response"] +env: + fullpage: true + prune: true + max_browser_rows: 500 + headless: True + relative_task_dir: "webvoyager" + task_ids: ["Allrecipes--3"] \ No newline at end of file diff --git a/AgentOccam/configs/AgentOccam.yml b/AgentOccam/configs/AgentOccam.yml new file mode 100644 index 0000000..77bf3fc --- /dev/null +++ b/AgentOccam/configs/AgentOccam.yml @@ -0,0 +1,78 @@ +logging: True +verbose: 1 +debug: False +logdir: "../AgentOccam-Trajectories" +logname: "AgentOccam" +max_steps: 20 +agent: + type: "AgentOccam" + others: + max_steps: 20 + logname: "AgentOccam" + logging: True + verbose: 1 + debug: False + actor: + debug: 0 + verbose: 1 + 
number: 1 + model: "gpt-4-turbo" + documented_interaction_elements: ["url", "plan", "reason", "observation summary", "retained element ids", "observation highlight"] + online_interaction_elements: [] + input: ["step", "objective", "previous plans", "interaction history", "current observation"] + interaction_history: + verbose: True + type: ["text"] + step_num: 3 + current_observation: + type: ["text"] + output: ["interaction history summary", "observation description", "reason", "action", "observation highlight"] + planning_command: ["branch", "prune"] + navigation_command: ["click", "type", "stop", "note", "go_back"] + play: ["step", "objective", "previous plans", "observation description", "reason", "action"] + trash: ["objective", "step", "url", "instruction", "online input", "response"] + critic: + mode: false + debug: 0 + verbose: 1 + model: "gpt-4-turbo" + documented_interaction_elements: [] + online_interaction_elements: [] + character: "normal" + input: ["objective", "previous plans", "interaction history", "step", "current observation"] + interaction_history: + verbose: True + type: ["text"] + step_num: "all" + current_observation: + type: ["text"] + output: ["observation description", "mistakes"] + trash: ["instruction", "online input", "response"] + judge: + mode: false + debug: 0 + verbose: 1 + model: "gpt-4-turbo" + documented_interaction_elements: [] + online_interaction_elements: [] + strict: false + input: ["objective", "previous plans", "interaction history", "step", "current observation", "action choices"] + interaction_history: + verbose: True + type: ["text"] + step_num: "all" + current_observation: + type: ["text"] + output: ["plan progress assessment", "action assessment", "action selection"] + trash: ["instruction", "online input", "response"] +env: + fullpage: true + prune: true + max_browser_rows: 500 + headless: True + task_ids: ["stanford_cs_head", 65] + # a. 
"SHOPPING_ADMIN": [0, 1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 41, 42, 43, 62, 63, 64, 65, 77, 78, 79, 94, 95, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 119, 120, 121, 122, 123, 127, 128, 129, 130, 131, 157, 183, 184, 185, 186, 187, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 243, 244, 245, 246, 247, 288, 289, 290, 291, 292, 344, 345, 346, 347, 348, 374, 375, 423, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 470, 471, 472, 473, 474, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 676, 677, 678, 679, 680, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 790] + # b. "MAP": [7, 8, 9, 10, 16, 17, 18, 19, 20, 32, 33, 34, 35, 36, 37, 38, 39, 40, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 70, 71, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 97, 98, 99, 100, 101, 137, 138, 139, 140, 151, 152, 153, 154, 155, 218, 219, 220, 221, 222, 223, 224, 236, 237, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 265, 266, 267, 268, 287, 356, 363, 364, 365, 366, 367, 369, 370, 371, 372, 373, 377, 378, 379, 380, 381, 382, 383, 424, 425, 426, 427, 428, 429, 430, 737, 738, 739, 740, 741, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767] + # c. 
"SHOPPING": [21, 22, 23, 24, 25, 26, 47, 48, 49, 50, 51, 96, 117, 118, 124, 125, 126, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 188, 189, 190, 191, 192, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 238, 239, 240, 241, 242, 260, 261, 262, 263, 264, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 298, 299, 300, 301, 302, 313, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 351, 352, 353, 354, 355, 358, 359, 360, 361, 362, 368, 376, 384, 385, 386, 387, 388, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 465, 466, 467, 468, 469, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 528, 529, 530, 531, 532, 571, 572, 573, 574, 575, 585, 586, 587, 588, 589, 653, 654, 655, 656, 657, 671, 672, 673, 674, 675, 689, 690, 691, 692, 693, 792, 793, 794, 795, 796, 797, 798] + # d. "REDDIT": [27, 28, 29, 30, 31, 66, 67, 68, 69, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 580, 581, 582, 583, 584, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 681, 682, 683, 684, 685, 686, 687, 688, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735] + # e. 
"GITLAB": [44, 45, 46, 102, 103, 104, 105, 106, 132, 133, 134, 135, 136, 156, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 205, 206, 207, 258, 259, 293, 294, 295, 296, 297, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 314, 315, 316, 317, 318, 339, 340, 341, 342, 343, 349, 350, 357, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 522, 523, 524, 525, 526, 527, 533, 534, 535, 536, 537, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 576, 577, 578, 579, 590, 591, 592, 593, 594, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 736, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 783, 784, 785, 786, 787, 788, 789, 791, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811] \ No newline at end of file diff --git a/AgentOccam/configs/SteP-replication.yml b/AgentOccam/configs/SteP-replication.yml new file mode 100644 index 0000000..b3c33cf --- /dev/null +++ b/AgentOccam/configs/SteP-replication.yml @@ -0,0 +1,26 @@ +logging: True +verbose: 1 +debug: False +logdir: "../AgentOccam-Trajectories" +logname: "SteP-replication" +max_steps: 20 +agent: + type: "SteP-replication" + root_action: "shopping_admin_agent" # Need to be adapted to tasks + low_level_action_list: ['click', 'type', 'scroll', 'stop', 'goto', 'hover', 'note', 'go_back'] + model_name: "gpt-4-turbo" + model_host: "openai" + prompt_mode: "chat" + max_target_len: 100 +env: + fullpage: false + prune: false + max_env_steps: 20 + max_browser_rows: 500 + headless: True + task_ids: ["stanford_cs_head", 65] + # a. 
"SHOPPING_ADMIN": [0, 1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 41, 42, 43, 62, 63, 64, 65, 77, 78, 79, 94, 95, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 119, 120, 121, 122, 123, 127, 128, 129, 130, 131, 157, 183, 184, 185, 186, 187, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 243, 244, 245, 246, 247, 288, 289, 290, 291, 292, 344, 345, 346, 347, 348, 374, 375, 423, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 470, 471, 472, 473, 474, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 676, 677, 678, 679, 680, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 790] + # b. "MAP": [7, 8, 9, 10, 16, 17, 18, 19, 20, 32, 33, 34, 35, 36, 37, 38, 39, 40, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 70, 71, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 97, 98, 99, 100, 101, 137, 138, 139, 140, 151, 152, 153, 154, 155, 218, 219, 220, 221, 222, 223, 224, 236, 237, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 265, 266, 267, 268, 287, 356, 363, 364, 365, 366, 367, 369, 370, 371, 372, 373, 377, 378, 379, 380, 381, 382, 383, 424, 425, 426, 427, 428, 429, 430, 737, 738, 739, 740, 741, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767] + # c. 
"SHOPPING": [21, 22, 23, 24, 25, 26, 47, 48, 49, 50, 51, 96, 117, 118, 124, 125, 126, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 188, 189, 190, 191, 192, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 238, 239, 240, 241, 242, 260, 261, 262, 263, 264, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 298, 299, 300, 301, 302, 313, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 351, 352, 353, 354, 355, 358, 359, 360, 361, 362, 368, 376, 384, 385, 386, 387, 388, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 465, 466, 467, 468, 469, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 528, 529, 530, 531, 532, 571, 572, 573, 574, 575, 585, 586, 587, 588, 589, 653, 654, 655, 656, 657, 671, 672, 673, 674, 675, 689, 690, 691, 692, 693, 792, 793, 794, 795, 796, 797, 798] + # d. "REDDIT": [27, 28, 29, 30, 31, 66, 67, 68, 69, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 580, 581, 582, 583, 584, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 681, 682, 683, 684, 685, 686, 687, 688, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735] + # e. 
"GITLAB": [44, 45, 46, 102, 103, 104, 105, 106, 132, 133, 134, 135, 136, 156, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 205, 206, 207, 258, 259, 293, 294, 295, 296, 297, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 314, 315, 316, 317, 318, 339, 340, 341, 342, 343, 349, 350, 357, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 522, 523, 524, 525, 526, 527, 533, 534, 535, 536, 537, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 576, 577, 578, 579, 590, 591, 592, 593, 594, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 736, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 783, 784, 785, 786, 787, 788, 789, 791, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811] \ No newline at end of file diff --git a/AgentOccam/configs/reduced_action-X_scrolling-obs_opt-history.yml b/AgentOccam/configs/reduced_action-X_scrolling-obs_opt-history.yml new file mode 100644 index 0000000..7cb45e8 --- /dev/null +++ b/AgentOccam/configs/reduced_action-X_scrolling-obs_opt-history.yml @@ -0,0 +1,78 @@ +logging: True +verbose: 1 +debug: False +logdir: "../AgentOccam-Trajectories" +logname: "reduced_action-X_scrolling-obs_opt-history" +max_steps: 20 +agent: + type: "AgentOccam" + others: + max_steps: 20 + logname: "reduced_action-X_scrolling-obs_opt-history" + logging: True + verbose: 1 + debug: False + actor: + debug: 0 + verbose: 1 + number: 1 + model: "gpt-4-turbo" + documented_interaction_elements: ["url", "plan", "reason", "observation summary", "retained element ids", "observation highlight"] + online_interaction_elements: [] + input: ["step", "objective", "interaction history", "current observation"] + interaction_history: + verbose: True + type: ["text"] + step_num: 3 + 
current_observation: + type: ["text"] + output: ["interaction history summary", "observation description", "reason", "action", "observation highlight"] + planning_command: [] + navigation_command: ["click", "type", "stop", "note", "go_back"] + play: ["step", "objective", "previous plans", "observation description", "reason", "action"] + trash: ["objective", "step", "url", "instruction", "online input", "response"] + critic: + mode: false + debug: 0 + verbose: 1 + model: "gpt-4-turbo" + documented_interaction_elements: [] + online_interaction_elements: [] + character: "normal" + input: ["objective", "previous plans", "interaction history", "step", "current observation"] + interaction_history: + verbose: True + type: ["text"] + step_num: "all" + current_observation: + type: ["text"] + output: ["observation description", "mistakes"] + trash: ["instruction", "online input", "response"] + judge: + mode: false + debug: 0 + verbose: 1 + model: "gpt-4-turbo" + documented_interaction_elements: [] + online_interaction_elements: [] + strict: false + input: ["objective", "previous plans", "interaction history", "step", "current observation", "action choices"] + interaction_history: + verbose: True + type: ["text"] + step_num: "all" + current_observation: + type: ["text"] + output: ["plan progress assessment", "action assessment", "action selection"] + trash: ["instruction", "online input", "response"] +env: + fullpage: true + prune: true + max_browser_rows: 500 + headless: True + task_ids: ["stanford_cs_head", 65] + # a. 
"SHOPPING_ADMIN": [0, 1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 41, 42, 43, 62, 63, 64, 65, 77, 78, 79, 94, 95, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 119, 120, 121, 122, 123, 127, 128, 129, 130, 131, 157, 183, 184, 185, 186, 187, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 243, 244, 245, 246, 247, 288, 289, 290, 291, 292, 344, 345, 346, 347, 348, 374, 375, 423, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 470, 471, 472, 473, 474, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 676, 677, 678, 679, 680, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 790] + # b. "MAP": [7, 8, 9, 10, 16, 17, 18, 19, 20, 32, 33, 34, 35, 36, 37, 38, 39, 40, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 70, 71, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 97, 98, 99, 100, 101, 137, 138, 139, 140, 151, 152, 153, 154, 155, 218, 219, 220, 221, 222, 223, 224, 236, 237, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 265, 266, 267, 268, 287, 356, 363, 364, 365, 366, 367, 369, 370, 371, 372, 373, 377, 378, 379, 380, 381, 382, 383, 424, 425, 426, 427, 428, 429, 430, 737, 738, 739, 740, 741, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767] + # c. 
"SHOPPING": [21, 22, 23, 24, 25, 26, 47, 48, 49, 50, 51, 96, 117, 118, 124, 125, 126, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 188, 189, 190, 191, 192, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 238, 239, 240, 241, 242, 260, 261, 262, 263, 264, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 298, 299, 300, 301, 302, 313, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 351, 352, 353, 354, 355, 358, 359, 360, 361, 362, 368, 376, 384, 385, 386, 387, 388, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 465, 466, 467, 468, 469, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 528, 529, 530, 531, 532, 571, 572, 573, 574, 575, 585, 586, 587, 588, 589, 653, 654, 655, 656, 657, 671, 672, 673, 674, 675, 689, 690, 691, 692, 693, 792, 793, 794, 795, 796, 797, 798] + # d. "REDDIT": [27, 28, 29, 30, 31, 66, 67, 68, 69, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 580, 581, 582, 583, 584, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 681, 682, 683, 684, 685, 686, 687, 688, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735] + # e. 
"GITLAB": [44, 45, 46, 102, 103, 104, 105, 106, 132, 133, 134, 135, 136, 156, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 205, 206, 207, 258, 259, 293, 294, 295, 296, 297, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 314, 315, 316, 317, 318, 339, 340, 341, 342, 343, 349, 350, 357, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 522, 523, 524, 525, 526, 527, 533, 534, 535, 536, 537, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 576, 577, 578, 579, 590, 591, 592, 593, 594, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 736, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 783, 784, 785, 786, 787, 788, 789, 791, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811] \ No newline at end of file diff --git a/AgentOccam/configs/reduced_action-X_scrolling-obs_opt.yml b/AgentOccam/configs/reduced_action-X_scrolling-obs_opt.yml new file mode 100644 index 0000000..3c0cbf2 --- /dev/null +++ b/AgentOccam/configs/reduced_action-X_scrolling-obs_opt.yml @@ -0,0 +1,78 @@ +logging: True +verbose: 1 +debug: False +logdir: "../AgentOccam-Trajectories" +logname: "reduced_action-X_scrolling-obs_opt" +max_steps: 20 +agent: + type: "AgentOccam" + others: + max_steps: 20 + logname: "reduced_action-X_scrolling-obs_opt" + logging: True + verbose: 1 + debug: False + actor: + debug: 0 + verbose: 1 + number: 1 + model: "gpt-4-turbo" + documented_interaction_elements: ["url", "plan", "reason", "observation summary", "retained element ids", "observation highlight"] + online_interaction_elements: [] + input: ["step", "objective", "current observation"] + interaction_history: + verbose: True + type: ["text"] + step_num: 3 + current_observation: + type: ["text"] + output: ["observation 
description", "reason", "action", "observation highlight"] + planning_command: [] + navigation_command: ["click", "type", "stop", "note", "go_back"] + play: ["step", "objective", "previous plans", "observation description", "reason", "action"] + trash: ["objective", "step", "url", "instruction", "online input", "response"] + critic: + mode: false + debug: 0 + verbose: 1 + model: "gpt-4-turbo" + documented_interaction_elements: [] + online_interaction_elements: [] + character: "normal" + input: ["objective", "previous plans", "interaction history", "step", "current observation"] + interaction_history: + verbose: True + type: ["text"] + step_num: "all" + current_observation: + type: ["text"] + output: ["observation description", "mistakes"] + trash: ["instruction", "online input", "response"] + judge: + mode: false + debug: 0 + verbose: 1 + model: "gpt-4-turbo" + documented_interaction_elements: [] + online_interaction_elements: [] + strict: false + input: ["objective", "previous plans", "interaction history", "step", "current observation", "action choices"] + interaction_history: + verbose: True + type: ["text"] + step_num: "all" + current_observation: + type: ["text"] + output: ["plan progress assessment", "action assessment", "action selection"] + trash: ["instruction", "online input", "response"] +env: + fullpage: true + prune: true + max_browser_rows: 500 + headless: True + task_ids: ["stanford_cs_head", 65] + # a. 
"SHOPPING_ADMIN": [0, 1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 41, 42, 43, 62, 63, 64, 65, 77, 78, 79, 94, 95, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 119, 120, 121, 122, 123, 127, 128, 129, 130, 131, 157, 183, 184, 185, 186, 187, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 243, 244, 245, 246, 247, 288, 289, 290, 291, 292, 344, 345, 346, 347, 348, 374, 375, 423, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 470, 471, 472, 473, 474, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 676, 677, 678, 679, 680, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 790] + # b. "MAP": [7, 8, 9, 10, 16, 17, 18, 19, 20, 32, 33, 34, 35, 36, 37, 38, 39, 40, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 70, 71, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 97, 98, 99, 100, 101, 137, 138, 139, 140, 151, 152, 153, 154, 155, 218, 219, 220, 221, 222, 223, 224, 236, 237, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 265, 266, 267, 268, 287, 356, 363, 364, 365, 366, 367, 369, 370, 371, 372, 373, 377, 378, 379, 380, 381, 382, 383, 424, 425, 426, 427, 428, 429, 430, 737, 738, 739, 740, 741, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767] + # c. 
"SHOPPING": [21, 22, 23, 24, 25, 26, 47, 48, 49, 50, 51, 96, 117, 118, 124, 125, 126, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 188, 189, 190, 191, 192, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 238, 239, 240, 241, 242, 260, 261, 262, 263, 264, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 298, 299, 300, 301, 302, 313, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 351, 352, 353, 354, 355, 358, 359, 360, 361, 362, 368, 376, 384, 385, 386, 387, 388, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 465, 466, 467, 468, 469, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 528, 529, 530, 531, 532, 571, 572, 573, 574, 575, 585, 586, 587, 588, 589, 653, 654, 655, 656, 657, 671, 672, 673, 674, 675, 689, 690, 691, 692, 693, 792, 793, 794, 795, 796, 797, 798] + # d. "REDDIT": [27, 28, 29, 30, 31, 66, 67, 68, 69, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 580, 581, 582, 583, 584, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 681, 682, 683, 684, 685, 686, 687, 688, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735] + # e. 
"GITLAB": [44, 45, 46, 102, 103, 104, 105, 106, 132, 133, 134, 135, 136, 156, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 205, 206, 207, 258, 259, 293, 294, 295, 296, 297, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 314, 315, 316, 317, 318, 339, 340, 341, 342, 343, 349, 350, 357, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 522, 523, 524, 525, 526, 527, 533, 534, 535, 536, 537, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 576, 577, 578, 579, 590, 591, 592, 593, 594, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 736, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 783, 784, 785, 786, 787, 788, 789, 791, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811] \ No newline at end of file diff --git a/AgentOccam/configs/reduced_action-X_scrolling.yml b/AgentOccam/configs/reduced_action-X_scrolling.yml new file mode 100644 index 0000000..bc49bd7 --- /dev/null +++ b/AgentOccam/configs/reduced_action-X_scrolling.yml @@ -0,0 +1,78 @@ +logging: True +verbose: 1 +debug: False +logdir: "../AgentOccam-Trajectories" +logname: "reduced_action-X_scrolling" +max_steps: 20 +agent: + type: "AgentOccam" + others: + max_steps: 20 + logname: "reduced_action-X_scrolling" + logging: True + verbose: 1 + debug: False + actor: + debug: 0 + verbose: 1 + number: 1 + model: "gpt-4-turbo" + documented_interaction_elements: ["url", "plan", "reason", "observation summary", "retained element ids", "observation highlight"] + online_interaction_elements: [] + input: ["step", "objective", "current observation"] + interaction_history: + verbose: True + type: ["text"] + step_num: 3 + current_observation: + type: ["text"] + output: ["observation description", "reason", "action", 
"observation highlight"] + planning_command: [] + navigation_command: ["click", "type", "stop", "note", "go_back"] + play: ["step", "objective", "previous plans", "observation description", "reason", "action"] + trash: ["objective", "step", "url", "instruction", "online input", "response"] + critic: + mode: false + debug: 0 + verbose: 1 + model: "gpt-4-turbo" + documented_interaction_elements: [] + online_interaction_elements: [] + character: "normal" + input: ["objective", "previous plans", "interaction history", "step", "current observation"] + interaction_history: + verbose: True + type: ["text"] + step_num: "all" + current_observation: + type: ["text"] + output: ["observation description", "mistakes"] + trash: ["instruction", "online input", "response"] + judge: + mode: false + debug: 0 + verbose: 1 + model: "gpt-4-turbo" + documented_interaction_elements: [] + online_interaction_elements: [] + strict: false + input: ["objective", "previous plans", "interaction history", "step", "current observation", "action choices"] + interaction_history: + verbose: True + type: ["text"] + step_num: "all" + current_observation: + type: ["text"] + output: ["plan progress assessment", "action assessment", "action selection"] + trash: ["instruction", "online input", "response"] +env: + fullpage: true + prune: false + max_browser_rows: 500 + headless: True + task_ids: ["stanford_cs_head", 65] + # a. 
"SHOPPING_ADMIN": [0, 1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 41, 42, 43, 62, 63, 64, 65, 77, 78, 79, 94, 95, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 119, 120, 121, 122, 123, 127, 128, 129, 130, 131, 157, 183, 184, 185, 186, 187, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 243, 244, 245, 246, 247, 288, 289, 290, 291, 292, 344, 345, 346, 347, 348, 374, 375, 423, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 470, 471, 472, 473, 474, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 676, 677, 678, 679, 680, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 790] + # b. "MAP": [7, 8, 9, 10, 16, 17, 18, 19, 20, 32, 33, 34, 35, 36, 37, 38, 39, 40, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 70, 71, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 97, 98, 99, 100, 101, 137, 138, 139, 140, 151, 152, 153, 154, 155, 218, 219, 220, 221, 222, 223, 224, 236, 237, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 265, 266, 267, 268, 287, 356, 363, 364, 365, 366, 367, 369, 370, 371, 372, 373, 377, 378, 379, 380, 381, 382, 383, 424, 425, 426, 427, 428, 429, 430, 737, 738, 739, 740, 741, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767] + # c. 
"SHOPPING": [21, 22, 23, 24, 25, 26, 47, 48, 49, 50, 51, 96, 117, 118, 124, 125, 126, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 188, 189, 190, 191, 192, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 238, 239, 240, 241, 242, 260, 261, 262, 263, 264, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 298, 299, 300, 301, 302, 313, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 351, 352, 353, 354, 355, 358, 359, 360, 361, 362, 368, 376, 384, 385, 386, 387, 388, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 465, 466, 467, 468, 469, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 528, 529, 530, 531, 532, 571, 572, 573, 574, 575, 585, 586, 587, 588, 589, 653, 654, 655, 656, 657, 671, 672, 673, 674, 675, 689, 690, 691, 692, 693, 792, 793, 794, 795, 796, 797, 798] + # d. "REDDIT": [27, 28, 29, 30, 31, 66, 67, 68, 69, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 580, 581, 582, 583, 584, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 681, 682, 683, 684, 685, 686, 687, 688, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735] + # e. 
"GITLAB": [44, 45, 46, 102, 103, 104, 105, 106, 132, 133, 134, 135, 136, 156, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 205, 206, 207, 258, 259, 293, 294, 295, 296, 297, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 314, 315, 316, 317, 318, 339, 340, 341, 342, 343, 349, 350, 357, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 522, 523, 524, 525, 526, 527, 533, 534, 535, 536, 537, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 576, 577, 578, 579, 590, 591, 592, 593, 594, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 736, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 783, 784, 785, 786, 787, 788, 789, 791, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811] \ No newline at end of file diff --git a/AgentOccam/configs/reduced_action.yml b/AgentOccam/configs/reduced_action.yml new file mode 100644 index 0000000..e2a1606 --- /dev/null +++ b/AgentOccam/configs/reduced_action.yml @@ -0,0 +1,78 @@ +logging: True +verbose: 1 +debug: False +logdir: "../AgentOccam-Trajectories" +logname: "reduced_action" +max_steps: 20 +agent: + type: "AgentOccam" + others: + max_steps: 20 + logname: "reduced_action" + logging: True + verbose: 1 + debug: False + actor: + debug: 0 + verbose: 1 + number: 1 + model: "gpt-4-turbo" + documented_interaction_elements: ["url", "plan", "reason", "observation summary", "retained element ids", "observation highlight"] + online_interaction_elements: [] + input: ["step", "objective", "current observation"] + interaction_history: + verbose: True + type: ["text"] + step_num: 3 + current_observation: + type: ["text"] + output: ["observation description", "reason", "action", "observation highlight"] + planning_command: [] + 
navigation_command: ["click", "type", "scroll", "stop", "note", "go_back"] + play: ["step", "objective", "previous plans", "observation description", "reason", "action"] + trash: ["objective", "step", "url", "instruction", "online input", "response"] + critic: + mode: false + debug: 0 + verbose: 1 + model: "gpt-4-turbo" + documented_interaction_elements: [] + online_interaction_elements: [] + character: "normal" + input: ["objective", "previous plans", "interaction history", "step", "current observation"] + interaction_history: + verbose: True + type: ["text"] + step_num: "all" + current_observation: + type: ["text"] + output: ["observation description", "mistakes"] + trash: ["instruction", "online input", "response"] + judge: + mode: false + debug: 0 + verbose: 1 + model: "gpt-4-turbo" + documented_interaction_elements: [] + online_interaction_elements: [] + strict: false + input: ["objective", "previous plans", "interaction history", "step", "current observation", "action choices"] + interaction_history: + verbose: True + type: ["text"] + step_num: "all" + current_observation: + type: ["text"] + output: ["plan progress assessment", "action assessment", "action selection"] + trash: ["instruction", "online input", "response"] +env: + fullpage: false + prune: false + max_browser_rows: 500 + headless: True + task_ids: ["stanford_cs_head", 65] + # a. 
"SHOPPING_ADMIN": [0, 1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 41, 42, 43, 62, 63, 64, 65, 77, 78, 79, 94, 95, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 119, 120, 121, 122, 123, 127, 128, 129, 130, 131, 157, 183, 184, 185, 186, 187, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 243, 244, 245, 246, 247, 288, 289, 290, 291, 292, 344, 345, 346, 347, 348, 374, 375, 423, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 470, 471, 472, 473, 474, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 676, 677, 678, 679, 680, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 790] + # b. "MAP": [7, 8, 9, 10, 16, 17, 18, 19, 20, 32, 33, 34, 35, 36, 37, 38, 39, 40, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 70, 71, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 97, 98, 99, 100, 101, 137, 138, 139, 140, 151, 152, 153, 154, 155, 218, 219, 220, 221, 222, 223, 224, 236, 237, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 265, 266, 267, 268, 287, 356, 363, 364, 365, 366, 367, 369, 370, 371, 372, 373, 377, 378, 379, 380, 381, 382, 383, 424, 425, 426, 427, 428, 429, 430, 737, 738, 739, 740, 741, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767] + # c. 
"SHOPPING": [21, 22, 23, 24, 25, 26, 47, 48, 49, 50, 51, 96, 117, 118, 124, 125, 126, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 188, 189, 190, 191, 192, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 238, 239, 240, 241, 242, 260, 261, 262, 263, 264, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 298, 299, 300, 301, 302, 313, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 351, 352, 353, 354, 355, 358, 359, 360, 361, 362, 368, 376, 384, 385, 386, 387, 388, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 465, 466, 467, 468, 469, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 528, 529, 530, 531, 532, 571, 572, 573, 574, 575, 585, 586, 587, 588, 589, 653, 654, 655, 656, 657, 671, 672, 673, 674, 675, 689, 690, 691, 692, 693, 792, 793, 794, 795, 796, 797, 798] + # d. "REDDIT": [27, 28, 29, 30, 31, 66, 67, 68, 69, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 580, 581, 582, 583, 584, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 681, 682, 683, 684, 685, 686, 687, 688, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735] + # e. 
class WebArenaEnvironmentWrapper():
    """Episode-level wrapper around a WebArena ``ScriptBrowserEnv``.

    The wrapper loads a task config (JSON with at least the ``intent``,
    ``start_url`` and ``sites`` keys), resets the browser environment for
    that task, and then tracks the step count, the trajectory of actions and
    states, and the final reward computed by the evaluation harness.
    """

    def __init__(self, config_file, max_browser_rows=300, max_steps=50, slow_mo=1, observation_type="accessibility_tree", current_viewport_only=False, viewport_size=None, headless=False, global_config=None):
        """Create the browser environment and load the task.

        Args:
            config_file: Path to the task's JSON config file.
            max_browser_rows: Maximum number of observation lines returned by
                ``observation()`` when pruning is disabled.
            max_steps: Number of ``step()`` calls allowed before the episode
                is force-stopped.
            slow_mo, observation_type, current_viewport_only, headless:
                Forwarded unchanged to ``ScriptBrowserEnv``.
            viewport_size: Forwarded to ``ScriptBrowserEnv``; defaults to
                ``{"width": 1280, "height": 720}`` when ``None``.
            global_config: Optional run configuration; ``env.prune`` is read
                from it in ``observation()``.
        """
        # Build the default per instance so that no dict is shared between
        # wrappers through the function's default-argument object.
        if viewport_size is None:
            viewport_size = {"width": 1280, "height": 720}

        self.webarena_env = ScriptBrowserEnv(
            headless=headless,
            slow_mo=slow_mo,
            observation_type=observation_type,
            current_viewport_only=current_viewport_only,
            viewport_size=viewport_size,
            global_config=global_config
        )
        self.config_file = config_file
        with open(self.config_file, "r") as f:
            self.config = json.load(f)
        self.global_config = global_config

        self.obs, self.info = self.webarena_env.reset(options={"config_file": self.config_file})
        self.terminated = False
        self.objective = self.config["intent"]
        self.url = self.config["start_url"]
        self.max_browser_rows = max_browser_rows
        self.max_steps = max_steps
        self.steps = 0
        self.is_done = False
        self.reward = 0.0

        self.trajectory: Trajectory = []
        # Record the initial state as the first trajectory entry.
        self.update_webarena_metrics()

    def reset(self):
        """Reset the underlying browser environment for the same task.

        NOTE(review): only ``obs`` and ``info`` are refreshed; the step
        counter, trajectory, done flag and reward are left as they are.
        Confirm that this is what callers expect.
        """
        self.obs, self.info = self.webarena_env.reset(options={"config_file": self.config_file})

    def close(self):
        """Close the underlying browser environment."""
        self.webarena_env.close()

    def get_url(self):
        """Return the most recently recorded page URL."""
        return self.url

    def get_objective(self):
        """Return the task objective (the config's ``intent``)."""
        return self.objective

    def get_sites(self):
        """Return the ``sites`` entry of the task config."""
        return self.config["sites"]

    def observation(self):
        """Return the current observation and refresh ``self.url``.

        Returns:
            With ``global_config.env.prune`` enabled: a dict with the pruned
            DOM rendered as text (``"text"``), the raw image (``"image"``)
            and the pruned root node (``"node"``).
            Otherwise: the text observation as a single string truncated to
            the first ``max_browser_rows`` lines.
        """
        self.url = self.webarena_env.page.url
        if self.global_config and self.global_config.env.prune:
            root_node = self.obs["text"][1]
            DOM_root_node = prune_tree(objective=self.objective, root_node=root_node, mode="node")
            DOM_str = translate_node_to_str(node=DOM_root_node, mode="concise")
            return {"text": DOM_str, "image": self.obs["image"], "node": DOM_root_node}

        rows = self.obs["text"][0].split("\n")[:self.max_browser_rows]
        return "\n".join(rows)

    def done(self):
        """Return ``True`` once the episode has finished."""
        return bool(self.is_done)

    def status(self):
        """Return a summary dict: done flag, reward, success and step count."""
        return {'done': self.is_done, 'reward': self.reward, 'success': float(self.reward > 0), 'num_actions': self.steps}

    def step(self, action):
        """Execute an action string in the browser.

        Args:
            action: Action text understood by ``create_id_based_actions``;
                ``None`` or ``""`` performs no browser command.

        Returns:
            The ``status()`` dict, or ``False`` when the action text parsed
            without error but produced no commands. The ``False`` sentinel is
            kept for backward compatibility with existing callers.
        """
        self.steps = self.steps + 1
        print(f"[Step {self.steps}] {action}")
        print("*"*100)
        if self.steps > self.max_steps:
            # Step budget exhausted: inject a stop action so the trajectory
            # is closed and evaluated.
            print(f"Steps {self.steps} exceeded maximum {self.max_steps}")
            self.is_done = True
            action_cmd = create_id_based_action(f"stop [Trajectory failed: Steps {self.steps} exceeded maximum {self.max_steps}.]")
            self.update_webarena_metrics(action_cmd)
            return self.status()

        if action is None or action == "":
            action_cmds = []
        else:
            try:
                action_cmds = create_id_based_actions(action)
                if not action_cmds:
                    return False
            except Exception as e:
                # Unparseable action text is treated as a no-op step.
                print(f"Invalid action syntax: {e}")
                action_cmds = []

        for action_cmd in action_cmds:
            try:
                self.obs, _, self.terminated, _, self.info = self.webarena_env.step(action_cmd)
                self.update_webarena_metrics(action_cmd)
            except Exception as e:
                # Best effort: a failing command does not abort the others.
                print(f"Error occurred while taking step: {e}")

        return self.status()

    def update_webarena_metrics(self, action_cmd=None):
        """Extend the trajectory and, once the episode is done, score it.

        Args:
            action_cmd: The command that was just executed, if any. A command
                whose ``action_type`` is ``ActionTypes.STOP`` ends the episode.
        """
        # Append action (if any) and resulting state
        if action_cmd:
            self.trajectory.append(action_cmd)
            if action_cmd["action_type"] == ActionTypes.STOP:
                self.is_done = True

        if not self.is_done:  # If we are done, no need to append state
            state_info: StateInfo = {"observation": self.obs, "info": self.info}
            self.trajectory.append(state_info)

        if self.is_done:
            try:
                evaluator = evaluator_router(self.config_file)
                self.reward = evaluator(trajectory=self.trajectory, config_file=self.config_file, page=self.webarena_env.page, client=self.webarena_env.get_page_client(self.webarena_env.page))
            except Exception as e:
                # An evaluation failure is scored as zero reward.
                print(f"Got excepetion: {e}")
                self.reward = 0
DEFAULT_SYSTEM_PROMPT = '''You are an AI assistant. Your goal is to provide informative and substantive responses to queries.'''


def call_claude(prompt, model_id="anthropic.claude-3-sonnet-20240229-v1:0", system_prompt=DEFAULT_SYSTEM_PROMPT):
    """Send a single text prompt to a Claude model on Bedrock.

    Args:
        prompt: User text to send.
        model_id: Bedrock model identifier.
        system_prompt: System prompt placed in the request.

    Returns:
        The text of the first content block of the model response.

    Raises:
        ValueError: If every attempt fails.
    """
    messages = [
        {
            "role": "user",
            "content": [{"type": "text", "text": prompt}],
        }
    ]
    # Same request body as before; the retry loop lives in one place only.
    return call_claude_with_messages(messages, model_id=model_id, system_prompt=system_prompt)


def _encode_image_for_claude(image):
    """Return ``(media_type, base64_data)`` for a file path or a numpy image."""
    if isinstance(image, str):
        # A string is a path; the file bytes are sent as-is and labelled PNG.
        with open(image, "rb") as image_file:
            image_bytes = image_file.read()
        return "image/png", base64.b64encode(image_bytes).decode("utf-8")
    if isinstance(image, np.ndarray):
        # Arrays are converted to RGB, halved in each dimension and
        # re-encoded as JPEG.
        pil_image = Image.fromarray(image).convert("RGB")
        width, height = pil_image.size
        pil_image = pil_image.resize((int(0.5*width), int(0.5*height)), Image.LANCZOS)
        jpeg_buffer = io.BytesIO()
        pil_image.save(jpeg_buffer, format='JPEG')
        return "image/jpeg", base64.b64encode(jpeg_buffer.getvalue()).decode("utf-8")
    # Previously an unsupported payload silently reused the previous image's
    # data (or hit an unbound local); fail loudly instead.
    raise TypeError(f"Unsupported image payload type: {type(image).__name__}")


def arrange_message_for_claude(item_list):
    """Convert ``(kind, payload)`` items into a Claude ``messages`` list.

    Consecutive non-image items are concatenated into one text block; each
    ``"image"`` item becomes a base64 image block. An empty ``item_list``
    yields a single user message with empty content.

    Args:
        item_list: Sequence of ``(kind, payload)`` pairs. ``kind == "image"``
            marks an image whose payload is a file path or a numpy array;
            any other kind is treated as text.

    Returns:
        A one-element list holding the user message.

    Raises:
        TypeError: If an image payload is neither ``str`` nor ``np.ndarray``.
    """
    content = []
    text_buffer = ""
    for item in item_list:
        if item[0] != "image":
            text_buffer += item[1]
            continue
        # Flush the text collected so far before emitting the image.
        if text_buffer:
            content.append({"type": "text", "text": text_buffer})
            text_buffer = ""
        media_type, image_data = _encode_image_for_claude(item[1])
        content.append({
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": media_type,
                "data": image_data,
            },
        })
    if text_buffer:
        content.append({"type": "text", "text": text_buffer})
    return [
        {
            "role": "user",
            "content": content
        }
    ]


def call_claude_with_messages(messages, model_id="anthropic.claude-3-sonnet-20240229-v1:0", system_prompt=DEFAULT_SYSTEM_PROMPT):
    """Send a prepared ``messages`` list to a Claude model on Bedrock.

    Makes up to 10 attempts, printing the error and sleeping 10 seconds
    after each failure.

    Args:
        messages: Messages in the format built by ``arrange_message_for_claude``.
        model_id: Bedrock model identifier.
        system_prompt: System prompt placed in the request.

    Returns:
        The text of the first content block of the model response.

    Raises:
        ValueError: If all attempts fail; the last error is chained as cause.
    """
    client = boto3.client("bedrock-runtime", region_name="us-east-1")

    request = json.dumps({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 1024,
        "temperature": 0.95,
        "system": system_prompt,
        "messages": messages,
    })

    last_error = None
    for _ in range(10):
        try:
            response = client.invoke_model(modelId=model_id, body=request)
            model_response = json.loads(response["body"].read())
            return model_response["content"][0]["text"]
        except Exception as e:
            last_error = e
            print(e)
            print("Sleeping for 10s...")
            time.sleep(10)
    raise ValueError("Claude request failed.") from last_error
Outdoors'; [3469] 'Clothing, Shoes & Jewelry'; [3483] 'Home & Kitchen'; [3520] 'Office Products'; [3528] 'Tools & Home Improvement'; [3533] 'Health & Household'; [3539] 'Patio, Lawn & Garden'; [3544] 'Electronics'; [3605] 'Cell Phones & Accessories'; [3620] 'Video Games'; [3633] 'Grocery & Gourmet Food'" + main + heading 'My Account' + text 'Contact Information' + text 'Emma Lopez' + text 'emma.lopezgmail.com' + link [3863] 'Change Password' + text 'Newsletters' + text "You aren't subscribed to our newsletter." + link [3877] 'Manage Addresses' + text 'Default Billing Address' + group [3885] + text 'Emma Lopez' + text '101 S San Mateo Dr' + text 'San Mateo, California, 94010' + text 'United States' + text 'T:' + link [3895] '6505551212' + text 'Default Shipping Address' + group [3902] + text 'Emma Lopez' + text '101 S San Mateo Dr' + text 'San Mateo, California, 94010' + text 'United States' + text 'T:' + link [3912] '6505551212' + link [3918] 'View All' + table 'Recent Orders' + row '| Order | Date | Ship To | Order Total | Status | Action |' + row '| --- | --- | --- | --- | --- | --- |' + row "| 000000170 | 5/17/23 | Emma Lopez | 365.42 | Canceled | View OrderReorder\tlink [4110] 'View Order'\tlink [4111] 'Reorder' |" + row "| 000000189 | 5/2/23 | Emma Lopez | 754.99 | Pending | View OrderReorder\tlink [4122] 'View Order'\tlink [4123] 'Reorder' |" + row "| 000000188 | 5/2/23 | Emma Lopez | 2,004.99 | Pending | View OrderReorder\tlink [4134] 'View Order'\tlink [4135] 'Reorder' |" + row "| 000000187 | 5/2/23 | Emma Lopez | 1,004.99 | Pending | View OrderReorder\tlink [4146] 'View Order'\tlink [4147] 'Reorder' |" + row "| 000000180 | 3/11/23 | Emma Lopez | 65.32 | Complete | View OrderReorder\tlink [4158] 'View Order'\tlink [4159] 'Reorder' |" + link [4165] 'My Orders' + link [4166] 'My Downloadable Products' + link [4167] 'My Wish List' + link [4169] 'Address Book' + link [4170] 'Account Information' + link [4171] 'Stored Payment Methods' + link [4173] 'My Product 
DEFAULT_SYSTEM_PROMPT = '''You are an AI assistant. Your goal is to provide informative and substantive responses to queries.'''


def call_cohere(prompt, model_id="cohere.command-r-plus-v1:0", system_prompt=DEFAULT_SYSTEM_PROMPT):
    """Send a prompt to a Cohere model on Bedrock and return its text reply.

    The system prompt and the user prompt are joined with a newline into a
    single ``message`` field.

    Args:
        prompt: User text to send.
        model_id: Bedrock model identifier.
        system_prompt: Text prepended to the prompt.

    Returns:
        The ``text`` field of the model response.

    Raises:
        Exception: Whatever ``invoke_model`` raised; the error is logged and
            re-raised instead of being masked by an unbound ``response``.
    """
    client = boto3.client("bedrock-runtime", region_name="us-east-1")

    request = json.dumps({
        "message": f"{system_prompt}\n{prompt}",
        "max_tokens": 512,
        "temperature": 0.5,
    })

    try:
        response = client.invoke_model(modelId=model_id, body=request)
    except Exception as e:
        print(f"ERROR: Can't invoke '{model_id}'. Reason: {e}")
        # Without a response there is nothing to parse; surface the real error.
        raise

    model_response = json.loads(response["body"].read())
    return model_response["text"]


def arrange_message_for_cohere(item_list):
    """Concatenate the payloads of text items into one prompt string.

    Args:
        item_list: Sequence of ``(kind, payload)`` pairs.

    Returns:
        All payloads joined without a separator.

    Raises:
        NotImplementedError: If any item has kind ``"image"``.
    """
    parts = []
    for item in item_list:
        if item[0] == "image":
            raise NotImplementedError()
        parts.append(item[1])
    return "".join(parts)


def call_cohere_with_messages(messages, model_id="cohere.command-r-plus-v1:0", system_prompt=DEFAULT_SYSTEM_PROMPT):
    """Forward ``messages`` unchanged to ``call_cohere`` as the prompt."""
    return call_cohere(prompt=messages, model_id=model_id, system_prompt=system_prompt)


if __name__ == "__main__":
    print(call_cohere('''Hi'''))
def call_gemini(prompt, model_id="gemini-1.5-flash", system_prompt=None):
    """Send a prompt to a Gemini model and return the response text.

    Makes up to 10 attempts, printing the error and sleeping 30 seconds
    after each failure.

    Args:
        prompt: User text to send.
        model_id: Gemini model name.
        system_prompt: Optional text prepended to the prompt, separated by a
            newline. ``None`` sends the prompt alone.

    Returns:
        The ``text`` attribute of the generation response.

    Raises:
        ValueError: If all attempts fail; the last error is chained as cause.
    """
    model = genai.GenerativeModel(model_id)

    # Build the prompt outside the retry loop. With the default
    # ``system_prompt=None`` the old concatenation raised TypeError inside
    # the ``try`` and was retried forever.
    if system_prompt is None:
        full_prompt = prompt
    else:
        full_prompt = system_prompt + "\n" + prompt

    last_error = None
    # A bounded loop replaces the attempt counter that was never incremented.
    for _ in range(10):
        try:
            response = model.generate_content(full_prompt)
            return response.text
        except Exception as e:
            last_error = e
            print(f"ERROR: Can't invoke '{model_id}'. Reason: {e}")
            time.sleep(30)
    raise ValueError("Gemini request failed.") from last_error


def arrange_message_for_gemini(item_list):
    """Concatenate the payloads of text items into one prompt string.

    Args:
        item_list: Sequence of ``(kind, payload)`` pairs.

    Returns:
        All payloads joined without a separator.

    Raises:
        NotImplementedError: If any item has kind ``"image"``.
    """
    parts = []
    for item in item_list:
        if item[0] == "image":
            raise NotImplementedError()
        parts.append(item[1])
    return "".join(parts)


def call_gemini_with_messages(messages, model_id="gemini-1.5-flash", system_prompt=None):
    """Forward ``messages`` unchanged to ``call_gemini`` as the prompt."""
    return call_gemini(prompt=messages, model_id=model_id, system_prompt=system_prompt)
+ link [3877] 'Manage Addresses' + text 'Default Billing Address' + group [3885] + text 'Emma Lopez' + text '101 S San Mateo Dr' + text 'San Mateo, California, 94010' + text 'United States' + text 'T:' + link [3895] '6505551212' + text 'Default Shipping Address' + group [3902] + text 'Emma Lopez' + text '101 S San Mateo Dr' + text 'San Mateo, California, 94010' + text 'United States' + text 'T:' + link [3912] '6505551212' + link [3918] 'View All' + table 'Recent Orders' + row '| Order | Date | Ship To | Order Total | Status | Action |' + row '| --- | --- | --- | --- | --- | --- |' + row "| 000000170 | 5/17/23 | Emma Lopez | 365.42 | Canceled | View OrderReorder\tlink [4110] 'View Order'\tlink [4111] 'Reorder' |" + row "| 000000189 | 5/2/23 | Emma Lopez | 754.99 | Pending | View OrderReorder\tlink [4122] 'View Order'\tlink [4123] 'Reorder' |" + row "| 000000188 | 5/2/23 | Emma Lopez | 2,004.99 | Pending | View OrderReorder\tlink [4134] 'View Order'\tlink [4135] 'Reorder' |" + row "| 000000187 | 5/2/23 | Emma Lopez | 1,004.99 | Pending | View OrderReorder\tlink [4146] 'View Order'\tlink [4147] 'Reorder' |" + row "| 000000180 | 3/11/23 | Emma Lopez | 65.32 | Complete | View OrderReorder\tlink [4158] 'View Order'\tlink [4159] 'Reorder' |" + link [4165] 'My Orders' + link [4166] 'My Downloadable Products' + link [4167] 'My Wish List' + link [4169] 'Address Book' + link [4170] 'Account Information' + link [4171] 'Stored Payment Methods' + link [4173] 'My Product Reviews' + link [4174] 'Newsletter Subscriptions' + heading 'Compare Products' + text 'You have no items to compare.' + heading 'My Wish List' + text 'You have no items in your wish list.' + contentinfo + textbox [4177] 'Sign Up for Our Newsletter:' [required: False] + button [4072] 'Subscribe' + link [4073] 'Privacy and Cookie Policy' + link [4074] 'Search Terms' + link [4075] 'Advanced Search' + link [4076] 'Contact Us' + text 'Copyright 2013-present Magento, Inc. All rights reserved.' 
def arrange_message_for_gpt(item_list):
    """Build a single-user-message chat payload from ``(kind, payload)`` items.

    Consecutive non-image items are concatenated into one text part. Every
    ``"image"`` item becomes an ``image_url`` part holding a base64 data URL.

    Args:
        item_list: Sequence of ``(kind, payload)`` pairs. ``kind == "image"``
            marks an image whose payload is either a file path (``str``) or a
            ``numpy.ndarray``; any other kind is treated as text.

    Returns:
        ``[{"role": "user", "content": parts}]``. ``parts`` is empty when
        ``item_list`` is empty or contains only empty text.

    Raises:
        TypeError: If an image payload is neither a ``str`` nor an ndarray.
    """
    def image_path_to_bytes(file_path):
        with open(file_path, "rb") as image_file:
            return image_file.read()

    def array_to_jpeg_bytes(array):
        image = Image.fromarray(array).convert("RGB")
        width, height = image.size
        # Halve each dimension but never go below one pixel, otherwise the
        # resize target would be zero-sized for 1-pixel inputs.
        new_size = (max(1, int(0.5*width)), max(1, int(0.5*height)))
        image = image.resize(new_size, Image.LANCZOS)
        jpeg_buffer = io.BytesIO()
        image.save(jpeg_buffer, format='JPEG')
        return jpeg_buffer.getvalue()

    def encode_image(payload):
        if isinstance(payload, str):
            # File bytes are forwarded unchanged; the data URL below still
            # labels them as JPEG, exactly as before.
            image_bytes = image_path_to_bytes(payload)
        elif isinstance(payload, np.ndarray):
            image_bytes = array_to_jpeg_bytes(payload)
        else:
            # Previously this fell through with ``image_data`` unbound (or
            # still holding the previous image's data).
            raise TypeError(
                f"Unsupported image payload type: {type(payload).__name__}"
            )
        return base64.b64encode(image_bytes).decode("utf-8")

    content = []
    text_parts = []

    def flush_text():
        text = "".join(text_parts)
        text_parts.clear()
        if text:
            content.append({
                "type": "text",
                "text": text
            })

    for item in item_list:
        if item[0] == "image":
            flush_text()
            image_data = encode_image(item[1])
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{image_data}"
                },
            })
        else:
            text_parts.append(item[1])
    # Flushing unconditionally also covers the empty-list case, which used to
    # raise IndexError on ``item_list[-1]``.
    flush_text()

    messages = [
        {
            "role": "user",
            "content": content
        }
    ]
    return messages
client.chat.completions.create( + model=model_id, + messages=messages if messages[0]["role"] == "system" else [{"role": "system", "content": system_prompt}] + messages, + temperature=0.5, + top_p=0.95, + frequency_penalty=0, + presence_penalty=0, + stop=None + ) + return response.choices[0].message.content.strip() + except openai.AuthenticationError as e: + print(e) + return None + except openai.RateLimitError as e: + print(e) + print("Sleeping for 10s...") + time.sleep(10) + num_attempts += 1 + except Exception as e: + print(e) + print("Sleeping for 10s...") + time.sleep(10) + num_attempts += 1 + +if __name__ == "__main__": + prompt = '''CURRENT OBSERVATION: +RootWebArea [2634] 'My Account' + link [3987] 'My Account' + link [3985] 'My Wish List' + link [3989] 'Sign Out' + text 'Welcome to One Stop Market' + link [3800] 'Skip to Content' + link [3809] 'store logo' + link [3996] 'My Cart' + combobox [4190] 'Search' [required: False] + link [4914] 'Advanced Search' + button [4193] 'Search' [disabled: True] + tablist [3699] + tabpanel + menu "[3394] 'Beauty & Personal Care'; [3459] 'Sports & Outdoors'; [3469] 'Clothing, Shoes & Jewelry'; [3483] 'Home & Kitchen'; [3520] 'Office Products'; [3528] 'Tools & Home Improvement'; [3533] 'Health & Household'; [3539] 'Patio, Lawn & Garden'; [3544] 'Electronics'; [3605] 'Cell Phones & Accessories'; [3620] 'Video Games'; [3633] 'Grocery & Gourmet Food'" + main + heading 'My Account' + text 'Contact Information' + text 'Emma Lopez' + text 'emma.lopezgmail.com' + link [3863] 'Change Password' + text 'Newsletters' + text "You aren't subscribed to our newsletter." 
+ link [3877] 'Manage Addresses' + text 'Default Billing Address' + group [3885] + text 'Emma Lopez' + text '101 S San Mateo Dr' + text 'San Mateo, California, 94010' + text 'United States' + text 'T:' + link [3895] '6505551212' + text 'Default Shipping Address' + group [3902] + text 'Emma Lopez' + text '101 S San Mateo Dr' + text 'San Mateo, California, 94010' + text 'United States' + text 'T:' + link [3912] '6505551212' + link [3918] 'View All' + table 'Recent Orders' + row '| Order | Date | Ship To | Order Total | Status | Action |' + row '| --- | --- | --- | --- | --- | --- |' + row "| 000000170 | 5/17/23 | Emma Lopez | 365.42 | Canceled | View OrderReorder\tlink [4110] 'View Order'\tlink [4111] 'Reorder' |" + row "| 000000189 | 5/2/23 | Emma Lopez | 754.99 | Pending | View OrderReorder\tlink [4122] 'View Order'\tlink [4123] 'Reorder' |" + row "| 000000188 | 5/2/23 | Emma Lopez | 2,004.99 | Pending | View OrderReorder\tlink [4134] 'View Order'\tlink [4135] 'Reorder' |" + row "| 000000187 | 5/2/23 | Emma Lopez | 1,004.99 | Pending | View OrderReorder\tlink [4146] 'View Order'\tlink [4147] 'Reorder' |" + row "| 000000180 | 3/11/23 | Emma Lopez | 65.32 | Complete | View OrderReorder\tlink [4158] 'View Order'\tlink [4159] 'Reorder' |" + link [4165] 'My Orders' + link [4166] 'My Downloadable Products' + link [4167] 'My Wish List' + link [4169] 'Address Book' + link [4170] 'Account Information' + link [4171] 'Stored Payment Methods' + link [4173] 'My Product Reviews' + link [4174] 'Newsletter Subscriptions' + heading 'Compare Products' + text 'You have no items to compare.' + heading 'My Wish List' + text 'You have no items in your wish list.' + contentinfo + textbox [4177] 'Sign Up for Our Newsletter:' [required: False] + button [4072] 'Subscribe' + link [4073] 'Privacy and Cookie Policy' + link [4074] 'Search Terms' + link [4075] 'Advanced Search' + link [4076] 'Contact Us' + text 'Copyright 2013-present Magento, Inc. All rights reserved.' 
def call_llama(prompt, model_id = "meta.llama3-8b-instruct-v1:0", system_prompt=DEFAULT_SYSTEM_PROMPT):
    """Invoke a Llama model through the Bedrock runtime client and return its text.

    The system prompt and the user prompt are both placed inside a single
    ``user`` turn of the prompt template.

    Args:
        prompt: User prompt text.
        model_id: Value passed as ``modelId`` to ``invoke_model``.
        system_prompt: Text inserted before ``prompt`` in the template.

    Returns:
        The ``"generation"`` field of the decoded JSON response body.

    Raises:
        KeyError: If ``invoke_model`` raises; the message carries the model id
            and the original error text.
    """
    bedrock = boto3.client("bedrock-runtime", region_name="us-east-1")

    payload = json.dumps({
        "prompt": f'''\n<|begin_of_text|>\n<|start_header_id|>user<|end_header_id|>\n{system_prompt}\n{prompt}\n<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n''',
        "max_gen_len": 512,
        "temperature": 0.5,
    })

    try:
        raw_response = bedrock.invoke_model(modelId=model_id, body=payload)
    except Exception as e:
        raise KeyError(f"ERROR: Can't invoke '{model_id}'. Reason: {e}")

    decoded = json.loads(raw_response["body"].read())
    return decoded["generation"]
def call_titan(prompt, model_id="amazon.titan-text-premier-v1:0", system_prompt=DEFAULT_SYSTEM_PROMPT):
    """Invoke a Titan text model through the Bedrock runtime client.

    Args:
        prompt: User prompt text.
        model_id: Value passed as ``modelId`` to ``invoke_model``.
        system_prompt: Text placed before ``prompt``, separated by a newline.

    Returns:
        ``results[0]["outputText"]`` from the decoded JSON response body.

    Raises:
        Exception: Whatever ``invoke_model`` raises, re-raised unchanged after
            the error line has been printed.
    """
    client = boto3.client("bedrock-runtime", region_name="us-east-1")

    formatted_prompt = f"{system_prompt}\n{prompt}"

    native_request = {
        "inputText": formatted_prompt,
        "textGenerationConfig": {
            "maxTokenCount": 512,
            "temperature": 0.5,
        },
    }

    request = json.dumps(native_request)
    try:
        response = client.invoke_model(modelId=model_id, body=request)
    except Exception as e:
        print(f"ERROR: Can't invoke '{model_id}'. Reason: {e}")
        # Previously execution continued past this handler and failed with
        # UnboundLocalError on ``response``, hiding the real cause.
        raise

    model_response = json.loads(response["body"].read())
    return model_response["results"][0]["outputText"]
def action_remove_unwanted_properties(node:TreeNode):
    """Drop properties listed in ``UNWANTED_PROPERTIES`` from ``node``.

    For a node whose parent has role ``"row"``, a falsy ``"required"`` entry is
    removed as well. If no properties remain, ``node.properties`` is set to
    ``None``. Nodes for which ``has_properties()`` is false are left untouched.
    """
    if not node.has_properties():
        return
    node.properties = {
        p: value for p, value in node.properties.items()
        if p not in UNWANTED_PROPERTIES
    }
    if node.parent and node.parent.role == "row" and not node.properties.get("required"):
        # ``pop`` with a default tolerates nodes that never had a "required"
        # entry; direct indexing raised KeyError for those.
        node.properties.pop("required", None)
    if len(node.properties) == 0:
        node.properties = None
def action_merge_description_list(node:TreeNode):
    """Fold the details of a ``DescriptionList`` node into their preceding term.

    Does nothing for invisible nodes or nodes whose role is not
    ``"DescriptionList"``. Otherwise each run of a term followed by leaf
    details is rewritten as ``"<term>: <detail>; <detail>"`` on the term node
    and the merged detail nodes are hidden.
    """
    if not node.visible:
        return
    def reformat_sublist(current_list_term_buffer):
        # Buffer layout: [term, detail, detail, ...]. A buffer holding fewer
        # than two nodes has nothing to merge.
        if len(current_list_term_buffer) > 1:
            list_term_node_appended_name = []
            for n in current_list_term_buffer[1:]:
                list_term_node_appended_name.append(n.name)
                n.visible = False
            current_list_term_buffer[0].name += ": " + "; ".join(list_term_node_appended_name)

    if not node.role == "DescriptionList":
        return
    # First pass: an unnamed detail with exactly one visible child takes that
    # child's rendered text as its own name, and the child is hidden.
    for child in node.visible_children():
        if child.role == "DescriptionListDetail" and not child.name and len(child.visible_children()) == 1:
            child.name = action_return_visible_node(child.visible_children()[0]).strip()
            child.visible_children()[0].visible = False
    # Second pass: group each leaf term with the leaf details that follow it.
    list_term_buffer = []
    for child in node.visible_children():
        if child.role == "DescriptionListTerm" and child.all_children_invisible():
            # A new term closes the previous group and starts a fresh one.
            reformat_sublist(current_list_term_buffer=list_term_buffer)
            list_term_buffer = [child]
        elif child.role == "DescriptionListDetail" and child.all_children_invisible() and list_term_buffer:
            list_term_buffer.append(child)
        elif child.role == "DescriptionListDetail" and not child.all_children_invisible():
            # A detail that still has visible children drops the pending group
            # without merging it.
            list_term_buffer = []
        else:
            reformat_sublist(current_list_term_buffer=list_term_buffer)
            list_term_buffer = []
    # Merge whatever group is still pending after the last child.
    reformat_sublist(current_list_term_buffer=list_term_buffer)
enumerate(row_list): + new_role_name = [] + for row_element in row.children: + new_role_name.append(row_element.name) + new_table_children.append(TreeNode(node_id=row.node_id, role="row", name="| "+" | ".join(new_role_name)+" |", depth=row.depth)) + if i == 0 and len(row_list) > 1: + new_table_children.append(TreeNode(node_id=row.node_id, role="row", name="| "+" | ".join(["---"]*len(new_role_name))+" |", depth=row.depth)) + elif row_list[0].children[0].role == "rowheader": + if new_table_children and any(n.visible for n in new_table_children): + new_table_children.append(TreeNode(node_id=row_list[0].node_id, role="row", name="", depth=row_list[0].depth)) + titles = [r.children[0].name for r in row_list] + values = [r.children[1].name for r in row_list] + new_table_children.append(TreeNode(node_id=row_list[0].node_id, role="row", name="| "+" | ".join(titles)+" |", depth=row_list[0].depth)) + new_table_children.append(TreeNode(node_id=row_list[0].node_id, role="row", name="| "+" | ".join(["---"]*len(titles))+" |", depth=row_list[0].depth)) + new_table_children.append(TreeNode(node_id=row_list[0].node_id, role="row", name="| "+" | ".join(values)+" |", depth=row_list[0].depth)) + elif row_list[0].children[0].role == "gridcell": + if new_table_children and any(n.visible for n in new_table_children): + new_table_children.append(TreeNode(node_id=row_list[0].node_id, role="row", name="", depth=row_list[0].depth)) + for row in row_list: + new_table_children.append(TreeNode(node_id=row.node_id, role="row", name="| "+" | ".join([row_element.name for row_element in row.children])+" |", depth=row.depth)) + else: + raise NotImplementedError("Unrecognized table format.") + return new_table_children + + new_table_children = [] + row_list = [] + row_mode = False + for child in node.children: + if child.role == "row": + for row_element in child.visible_children(): # TODO: Visible? 
+ merge_gridcell(row_element) + + # if child.role == "row" and child.children[0].role == "columnheader": + if child.role == "row" and any(row_child.role == "columnheader" for row_child in child.children): + row_list = [child] + row_mode = False + elif child.role == "row" and child.children[0].role == "rowheader": + if row_mode: + row_list.append(child) + else: + new_table_children = reformat_subtable(row_list=row_list, current_table_children=new_table_children) + row_list = [child] + row_mode = True + elif child.role == "row" and child.children[0].role == "gridcell": + row_list.append(child) + row_mode = False + elif child.role != "row": + new_table_children = reformat_subtable(row_list=row_list, current_table_children=new_table_children) + if child.role == "rowgroup": + for grandchild in child.visible_children(): # grandchild: row + for row_element in grandchild.visible_children(): # TODO: Visible? + merge_gridcell(row_element) + child.children = reformat_subtable(row_list=child.children, current_table_children=[]) + new_table_children.append(child) + row_list = [] + else: + raise NotImplementedError() + new_table_children = reformat_subtable(row_list=row_list, current_table_children=new_table_children) + node.children = new_table_children + elif node.role == "LayoutTable": + def merge_adjacent_text_nodes(nodes): + if not nodes: + return [] + + merged_nodes = [] + current_node = nodes[0] + + for i in range(1, len(nodes)): + if current_node.visible and current_node.role in ["LayoutTableCell", "StaticText", "generic"]+list(set(ROLE_REPLACEMENT_DICT.values())) and nodes[i].visible and nodes[i].role in ["LayoutTableCell", "StaticText", "generic"]+list(set(ROLE_REPLACEMENT_DICT.values())): + current_node.role = ROLE_REPLACEMENT_DICT["StaticText"] + current_node.name += " " + nodes[i].name # Merge text values + nodes[i].visible = False + else: + merged_nodes.append(current_node) + current_node = nodes[i] + + merged_nodes.append(current_node) + + return merged_nodes + 
def action_merge_duplicated_headings(node:TreeNode):
    """Collapse a node and its parent when one is a heading repeating the other's name.

    Only applies to a visible node that has a parent, no visible children and
    no visible siblings, and whose name equals the parent's name. A heading
    under a parent whose role is not in ``UNINTERACTIVE_ROLES`` is hidden. A
    node whose role is not in ``UNINTERACTIVE_ROLES`` under a heading parent
    overwrites the parent's id, role, properties and children, and is then
    hidden.
    """
    is_lone_visible_leaf = (
        node.visible
        and node.all_children_invisible()
        and node.parent
        and not node.visible_siblings()
    )
    if not is_lone_visible_leaf:
        return

    parent = node.parent
    if not (node.name == parent.name):
        return

    if node.role == "heading" and parent.role not in UNINTERACTIVE_ROLES:
        node.visible = False
    if parent.role == "heading" and node.role not in UNINTERACTIVE_ROLES:
        # The parent takes over the child's identity before the child is hidden.
        parent.node_id = node.node_id
        parent.role = node.role
        parent.properties = node.properties
        parent.children = node.children
        node.visible = False
def parse_node_siblings(node:TreeNode, action=action_print_tree, tree_buffer=None):
    """Apply ``action`` to every sibling of ``node``.

    Args:
        node: Node whose ``siblings()`` are visited.
        action: Callable invoked with each sibling.
        tree_buffer: List that receives every truthy result of ``action``.
            When omitted, a fresh list is used for this call only.
    """
    # A ``[]`` default would be one list shared by every call that omits it.
    if tree_buffer is None:
        tree_buffer = []
    for sibling in node.siblings():
        res_action = action(sibling)
        if res_action:
            tree_buffer.append(res_action)

def parse_node_ancestors(node:TreeNode, action=action_print_tree, tree_buffer=None):
    """Apply ``action`` to ``node`` and then to each ancestor via ``parent``.

    Args:
        node: Starting node; it is visited first.
        action: Callable invoked with each visited node.
        tree_buffer: List that receives every truthy result of ``action``.
            When omitted, a fresh list is used for this call only.
    """
    if tree_buffer is None:
        tree_buffer = []
    res_action = action(node)
    if res_action:
        tree_buffer.append(res_action)
    if node.parent:
        parse_node_ancestors(node=node.parent, action=action, tree_buffer=tree_buffer)

def parse_node_descendants(node:TreeNode, action=action_print_tree, tree_buffer=None):
    """Apply ``action`` to ``node`` and all its descendants, parents before children.

    Args:
        node: Root of the subtree to visit.
        action: Callable invoked with each visited node.
        tree_buffer: List that receives every truthy result of ``action``.
            When omitted, a fresh list is used for this call only.
    """
    if tree_buffer is None:
        tree_buffer = []
    res_action = action(node)
    if res_action:
        tree_buffer.append(res_action)
    for child in node.children:
        parse_node_descendants(node=child, action=action, tree_buffer=tree_buffer)
def construct_new_DOM_with_visible_nodes(DOM_root:TreeNode):
    """Return a copy of the tree under ``DOM_root`` containing only visible nodes.

    Each visible node is duplicated with ``copy()`` and its visible children
    are attached with ``add_child``. An invisible node yields ``None``, so the
    whole result is ``None`` when ``DOM_root`` itself is invisible.
    """
    def clone_visible(current:TreeNode):
        if not current.visible:
            return None
        clone = current.copy()
        if current.visible_children():
            for visible_child in current.visible_children():
                child_clone = clone_visible(visible_child)
                if child_clone:
                    clone.add_child(child_clone)
        return clone

    return clone_visible(DOM_root)
def random_color_generator():
    """Yield an endless, reproducible stream of ``#RRGGBB`` colour strings.

    The module-level ``random`` generator is seeded with 65 when iteration
    starts, so every new generator instance produces the same sequence.
    """
    import random
    random.seed(65)
    while True:
        r = random.randint(0, 255)
        g = random.randint(0, 255)
        b = random.randint(0, 255)
        yield f'#{r:02X}{g:02X}{b:02X}'

def generate_random_colors(color_num):
    """Return the first ``color_num`` colours from ``random_color_generator``.

    Args:
        color_num: Number of colours to produce.

    Returns:
        A list of ``color_num`` ``#RRGGBB`` strings.
    """
    # ``next`` needs a generator instance; calling it on the generator
    # function itself raised TypeError.
    color_source = random_color_generator()
    colors = [next(color_source) for _ in range(color_num)]
    return colors

def get_colors(trajectory_key_list):
    """Map each key to its ``COLOR_DICT`` colour, or to a generated fallback.

    Args:
        trajectory_key_list: Keys to look up in ``COLOR_DICT``.

    Returns:
        A list of colours in the same order as ``trajectory_key_list``. Keys
        missing from ``COLOR_DICT`` take successive generated colours.
    """
    # One shared generator instance so that several unknown keys receive
    # different colours instead of restarting the sequence each time.
    fallback_colors = random_color_generator()
    return [COLOR_DICT[k] if k in COLOR_DICT else next(fallback_colors) for k in trajectory_key_list]
if task_id in id_list: + total_tasks += 1 + total_reward += float(row['reward']) + net_total_reward += 1 if float(row['reward']) == 1. else 0 + + if total_tasks > 0: + return total_reward, net_total_reward, total_tasks + else: + return 0.0, 0.0, 0.0 + +def parse_json_files(root_dir, site_list, evaluator="after", mode="single_site"): + total_reward = 0 + total_tasks = 0 + net_total_reward = 0 + + id_list = [] + for site in site_list: + if mode == "multiple_site": + id_list += TASK_ID_DICT[site] + elif mode == "single_site": + id_list += MERGED_SITE_TASK_ID_DICT[site] + + for filename in os.listdir(root_dir): + if filename.endswith(".json"): + try: + trajectory_obj = json.load(open(os.path.join(root_dir, filename), "r")) + if trajectory_obj["id"] in id_list: + if (evaluator=="before" and trajectory_obj["id"] not in EVELUATOR_RECTIFICATIONS) or evaluator=="after": + if "trajectory" in trajectory_obj.keys(): + last_step = trajectory_obj["trajectory"][-1] + reward = float(last_step['reward']) if "reward" in last_step.keys() else last_step['success'] + else: + reward = trajectory_obj["score"] + total_tasks += 1 + total_reward += reward + net_total_reward += 1 if reward == 1. 
else 0 + except Exception as e: + print(os.path.join(root_dir, filename)) + print(e) + + if total_tasks > 0: + return total_reward, net_total_reward, total_tasks + else: + return 0.0, 0.0, 0.0 + +def find_summary_csv_files(directories): + summary_files = [] + for directory in directories: + for root, _, files in os.walk(directory): + for file in files: + if file == 'summary.csv': + summary_files.append(os.path.join(root, file)) + return summary_files + +def read_rewards_with_dir_names(summary_files): + rewards_with_dirs = {} + for file in summary_files: + directory_name = os.path.basename(os.path.dirname(file)) + df = pd.read_csv(file) + if 'reward' in df.columns: + rewards_with_dirs[directory_name] = df['reward'].tolist() + return rewards_with_dirs + +def write_rewards_to_csv(rewards, output_file): + with open(output_file, 'w') as f: + f.write('reward\n') + for reward in rewards: + f.write(f'{reward}\n') + +def load_reward(root_dir, evaluator="after"): + reward_dict = {} + net_reward_dict = {} + for filename in os.listdir(root_dir): + if filename.endswith(".json"): + trajectory_obj = json.load(open(os.path.join(root_dir, filename), "r")) + trajectory_id = trajectory_obj["id"] + if (evaluator=="before" and trajectory_obj["id"] not in EVELUATOR_RECTIFICATIONS) or evaluator=="after": + if "trajectory" in trajectory_obj.keys(): + last_step = trajectory_obj["trajectory"][-1] + reward_dict[trajectory_id] = float(last_step['reward']) if "reward" in last_step.keys() else last_step['success'] + else: + reward_dict[trajectory_id] = float(trajectory_obj["score"]) + net_reward_dict[trajectory_id] = 1. if reward_dict[trajectory_id] == 1. else 0. 
+ reward_list = [] + net_reward_list = [] + print("\n"+root_dir) + for i in range(812): + if i in reward_dict.keys(): + reward_list.append(reward_dict[i]) + else: + print(f"{i},", end="") + # reward_list.append(-1) + reward_list.append(0) + if i in net_reward_dict.keys(): + net_reward_list.append(net_reward_dict[i]) + else: + # net_reward_list.append(-1) + net_reward_list.append(0) + return reward_list, net_reward_list + +def compare_rewards(trajectory_key_list=None, evaluator="after"): + import pandas as pd + import matplotlib.pyplot as plt + + basenames = [RUN_NAME_DICT[k] for k in trajectory_key_list] + + tasks = list(range(812)) + labels = TASK_LABELS_MULTISITE + rewards = [load_reward(TRAJECTORY_DIR_DICT[k], evaluator=evaluator)[1] for k in trajectory_key_list] + + label_list = [] + label_index_dict = {} + for i, label in enumerate(labels): + if label not in label_list: + label_list.append(label) + label_index_dict[label] = [] + label_index_dict[label].append(i) + sorted_index_list = [] + for label in label_list: + sorted_index_list += label_index_dict[label] + tasks = [tasks[i] for i in sorted_index_list] + labels = [labels[i] for i in sorted_index_list] + for i in range(len(rewards)): + rewards[i] = [int(rewards[i][j]) for j in sorted_index_list] + + data = { + 'Task': tasks, + 'Site': labels, + **{basename: reward for basename, reward in zip(basenames, rewards)} + } + + df = pd.DataFrame(data) + + csvfile = open(os.path.join(OUTPUT_DIR, "compare.csv"), "w") + csv_writer = csv.writer(csvfile) + csv_writer.writerow(["task", "site"]+basenames) + for i, reward in enumerate(zip(*tuple(rewards))): + csv_writer.writerow([df['Task'][i], df['Site'][i]]+list(reward)) + +def plot_comparative_heatmap(): + import pandas as pd + import numpy as np + import matplotlib.pyplot as plt + + file_path = os.path.join(OUTPUT_DIR, 'compare.csv') + data = pd.read_csv(file_path) + + for site in ["shopping_admin", "shopping", "reddit", "gitlab", "map", "multisite"]: + site_data = 
data[data['site'] == site] + approach_keys = [k for k in site_data.keys() if k not in ["task", "site"]] + + heatmap_data = pd.DataFrame({ + k: site_data[k] for k in approach_keys + }) + + heatmap_values = heatmap_data.values + + colors = ['#EFEFEF', '#2A786C'] + cmap = mcolors.LinearSegmentedColormap.from_list("CustomCmap", colors) + plt.figure(figsize=(10, 20)) + plt.imshow(heatmap_values, cmap=cmap, aspect='auto') + + plt.xticks(ticks=[0.5 + k for k in list(range(len(approach_keys)))], labels=[]*len(approach_keys)) + plt.yticks([]) + + ax = plt.gca() + + ax.set_yticks([]) + + ax_left = plt.gca().twinx() + ax_left.set_yticks(np.arange(len(site_data))+1) + ax_left.set_yticklabels(site_data.iloc[::-1]["task"], fontsize=3) + + ax_right = plt.gca().twinx() + ax_right.set_yticks(np.arange(len(site_data))+1) + ax_right.set_yticklabels(site_data.iloc[::-1]["task"], fontsize=3) + ax_right.yaxis.set_label_position("right") + + plt.grid(color='white', linestyle='-', linewidth=5) + + plt.tight_layout() + plt.savefig(os.path.join(OUTPUT_DIR, f"figures/{site}_{len(approach_keys)}.png"), dpi=256) + +def plot_comparative_bar_chart(categories, data_list, labels, colors, title="Comparative Bar Chart", ylabel="Values", figure_name="bar"): + os.makedirs(os.path.join(OUTPUT_DIR, "figures"), exist_ok=True) + + bar_width = 1/(len(labels)+1) + x = np.arange(len(categories)) + + plt.rc('font', family='serif') + plt.figure(figsize=(9, 2)) + + for i, (data, label, color) in enumerate(zip(data_list, labels, colors)): + plt.bar(x + i * bar_width, data, width=bar_width, label=label, color=color) + + for i, (data, label) in enumerate(zip(data_list, labels)): + for j, value in enumerate(data): + plt.text(x[j] + i * bar_width, value, f"{value:.1f}" if isinstance(value, float) else f"{value}", ha='center', va='bottom', fontsize=5) + + if title: + plt.title(title) + plt.ylabel(ylabel, fontsize=11) + plt.xticks(x + bar_width * (len(labels) - 1) / 2, [c.replace("_", " ").capitalize() for c in 
categories], fontsize=11) + plt.legend(loc='lower center', fontsize=11, bbox_to_anchor=(0.5, 1.05), ncol=3) + plt.grid(axis='y') + + plt.ylim(0, 65) + + plt.tight_layout() + plt.savefig(os.path.join(OUTPUT_DIR, f"figures/{figure_name}.pdf"), dpi=256) + plt.close() + +def compute_success_rate(trajectory_key_list=None, evaluator="after"): + site_lists = ["ALL", "SHOPPING", "SHOPPING_ADMIN", "GITLAB", "MAP", "REDDIT", "MULTISITE"] + csvfile = open(os.path.join(OUTPUT_DIR, "result.csv"), "w") + csv_writer = csv.writer(csvfile) + csv_writer.writerow(["basename", "site", "total_reward", "net_total_reward", "total_tasks"]) + + categories = site_lists + + trajectory_key_list = trajectory_key_list if trajectory_key_list else [k for k in sorted(list(TRAJECTORY_DIR_DICT.keys()), reverse=False)] + labels = [RUN_NAME_DICT[i] for i in trajectory_key_list] + + colors = get_colors(trajectory_key_list) + + reward_percentage_list = {l:[] for l in labels} + net_reward_percentage_list = {l:[] for l in labels} + + for i, key in enumerate(trajectory_key_list): + root_directory = TRAJECTORY_DIR_DICT[key] + basename = labels[i] + for site_list in site_lists: + total_reward, net_total_reward, total_tasks = parse_json_files(root_directory, [site_list], evaluator=evaluator, mode="multiple_site") + total_tasks = TOTAL_TASK_NUM_DICT[site_list] + reward_percentage_list[basename].append(total_reward/total_tasks*100) + net_reward_percentage_list[basename].append(net_total_reward/total_tasks*100) + csv_writer.writerow([basename, site_list, total_reward, net_total_reward, total_tasks]) + csvfile.close() + plot_comparative_bar_chart(categories=categories, data_list=[reward_percentage_list[l] for l in labels], labels=labels, colors=colors, title="Reward Percentage", figure_name="reward_percentage") + plot_comparative_bar_chart(categories=categories, data_list=[net_reward_percentage_list[l] for l in labels], labels=labels, colors=colors, title="", ylabel="Success Rate", 
figure_name="net_reward_percentage") + +if __name__ == "__main__": + ablation_study_key_list = [7, 3, 4, 5, 6, 0] + compute_success_rate(ablation_study_key_list) \ No newline at end of file diff --git a/AgentOccam/prompts/AgentOccam_prompt.py b/AgentOccam/prompts/AgentOccam_prompt.py new file mode 100644 index 0000000..ee738e1 --- /dev/null +++ b/AgentOccam/prompts/AgentOccam_prompt.py @@ -0,0 +1,92 @@ +actor = { +"instruction_template": { + "with_planning": '''You are an AI assistant performing tasks on a web browser. You will be provided with task objective, current step, web page observations, previous plans, and interaction history. You need to issue an action for this step. + +Generate the response in the following format: +{output_specifications} + +You are ONLY allowed to use the following action commands. Strictly adheres to the given format. Only issue one single action. +If you think you should refine the plan, use the following actions: +{planning_specifications} +Otherwise, use the following actions: +{navigation_specifications}''', + + "without_planning": '''You are an AI assistant performing tasks on a web browser. You will be provided with task objective, current step, web page observations, and other relevant information. You need to issue an action for this step. + +Generate the response in the following format: +{output_specifications} + +You are ONLY allowed to use the following action commands. Strictly adheres to the given format. Only issue one single action. +{navigation_specifications}''' +}, + +"input_template":'''{input}''', + +"QA": { +"instruction_template": '''You are a proficient assistant good at answering web page related questions. Given the web page textual description, you are required to answer the question. + +Generate the response in the following format: +RESPONSE: +Your response here. 
+ +Adhere to the following response requirements: +* If you are not fully sure that you can answer the question correcly with the information given, only take note of crucial relevant information. +* Otherwise, if you are confident about the answer, return your full answer. Ensure that your response is correct and comprehensive that fully explain your conclusion.''', +"input_template": '''WEB PAGE CONTENT: +{current_observation} + +QUESTION: +{objective}''' +}, + +"planning": { +"instruction_template": '''You are an AI assistant performing tasks on a web browser. You will be provided with task objective, current step, url, web page observations, previous plans, and actions. You need to issue a plan for this step. + +Generate the response in the following format: +{output_specifications} + +You are ONLY allowed to use the following planning commands. Strictly adheres to the given format. Only issue one single planning command. +{planning_specifications}''', +"input_template": '''''' +}, + +"reflection": { +"instruction_template": '''You are an AI assistant performing tasks on a web browser. You will be provided with task objective, current step, url, web page observations, previous plans, and actions. You need to reflect on past mistakes, take corrective action, and maximize future rewards. + +Generate the response in the following format: +{output_specifications} + +You are ONLY allowed to use the following action commands. Strictly adheres to the given format. Only issue one single action. +If you think you should refine the plan, use the following actions: +{planning_specifications} +Otherwise, use the following actions: +{navigation_specifications}''', +"input_template": '''''' +}, +} +critic = { + +"harsh": {"instruction_template": '''Below are the objective (high-level goal) and corresponding web observations and actions I took to navigate the web and achieve the goal, which has proven to be **unsuccessful**. 
As the objective is fully achievable within the current environment, I am expecting skeptical feedback on why I failed based on my interaction history and the current state. + +Adhere to the following output format: +{output_specifications}''', + + +"input_template": '''The following is all my interaction history and current state: +{input}'''}, + +"normal": { + "instruction_template": '''You are a seasoned web navigator. You now assess the performance of another web navigation agent based on the objective, their previous interaction history and the web's current state.\nAdhere to the following output format:\n{output_specifications}''', + "input_template": '''The following is all my interaction history and current state:\n{input}''', +} + +} +judge = { +"instruction_template": '''You are a seasoned web navigator. You now assess the value and risk of serveral web navigation actions based on the objective, the previous interaction history and the web's current state. Then, you select the action with the most value and least risk with which you would earn the maximum objective fulfillment reward in the future. + +Adhere to the following output format: +{output_specifications} + +Note that `branch` and `prune` are planning actions that will modify the PREVIOUS PLAN section and won't interact with the web environment.''', +"input_template": '''The following is the interaction history, current state, and action choices.\n{input}''' +} \ No newline at end of file diff --git a/AgentOccam/prompts/navigation_specifications/click.txt b/AgentOccam/prompts/navigation_specifications/click.txt new file mode 100644 index 0000000..d31caf2 --- /dev/null +++ b/AgentOccam/prompts/navigation_specifications/click.txt @@ -0,0 +1 @@ +click [id]: To click on an element with its numerical ID on the webpage. E.g., `click [7]` If clicking on a specific element doesn't trigger the transition to your desired web state, this is due to the element's lack of interactivity or GUI visibility. 
In such cases, move on to interact with OTHER similar or relevant elements INSTEAD. \ No newline at end of file diff --git a/AgentOccam/prompts/navigation_specifications/go_back.txt b/AgentOccam/prompts/navigation_specifications/go_back.txt new file mode 100644 index 0000000..9226898 --- /dev/null +++ b/AgentOccam/prompts/navigation_specifications/go_back.txt @@ -0,0 +1 @@ +go_back: To return to the previously viewed page. \ No newline at end of file diff --git a/AgentOccam/prompts/navigation_specifications/go_home.txt b/AgentOccam/prompts/navigation_specifications/go_home.txt new file mode 100644 index 0000000..ba5c47f --- /dev/null +++ b/AgentOccam/prompts/navigation_specifications/go_home.txt @@ -0,0 +1 @@ +go_home: To return to the homepage where you can find other websites. \ No newline at end of file diff --git a/AgentOccam/prompts/navigation_specifications/note.txt b/AgentOccam/prompts/navigation_specifications/note.txt new file mode 100644 index 0000000..564b301 --- /dev/null +++ b/AgentOccam/prompts/navigation_specifications/note.txt @@ -0,0 +1 @@ +note [content]: To take note of all important info w.r.t. completing the task to enable reviewing it later. E.g., `note [Spent $10 on 4/1/2024]` \ No newline at end of file diff --git a/AgentOccam/prompts/navigation_specifications/scroll.txt b/AgentOccam/prompts/navigation_specifications/scroll.txt new file mode 100644 index 0000000..c6ca961 --- /dev/null +++ b/AgentOccam/prompts/navigation_specifications/scroll.txt @@ -0,0 +1 @@ +scroll [down/up] [reason]: To navigate the webpage content. E.g., `scroll [up] [Previous observations contain a link that might be useful.]` \ No newline at end of file diff --git a/AgentOccam/prompts/navigation_specifications/stop.txt b/AgentOccam/prompts/navigation_specifications/stop.txt new file mode 100644 index 0000000..ece0610 --- /dev/null +++ b/AgentOccam/prompts/navigation_specifications/stop.txt @@ -0,0 +1 @@ +stop [answer]: To stop interaction and return response. 
Present your answer within the brackets. If the task doesn't require a textual answer or appears insurmountable, indicate "N/A" and additional reasons and all relevant information you gather as the answer. E.g., `stop [5h 47min]` \ No newline at end of file diff --git a/AgentOccam/prompts/navigation_specifications/type.txt b/AgentOccam/prompts/navigation_specifications/type.txt new file mode 100644 index 0000000..72b87cd --- /dev/null +++ b/AgentOccam/prompts/navigation_specifications/type.txt @@ -0,0 +1 @@ +type [id] [content] [press_enter_after=0|1]: To type content into a field with a specific ID. By default, the "Enter" key is pressed after typing unless `press_enter_after` is set to 0. E.g., `type [15] [Carnegie Mellon University] [1]` If you can't find what you're looking for on your first attempt, consider refining your search keywords by breaking them down or trying related terms. \ No newline at end of file diff --git a/AgentOccam/prompts/output_specifications/action.txt b/AgentOccam/prompts/output_specifications/action.txt new file mode 100644 index 0000000..485b52a --- /dev/null +++ b/AgentOccam/prompts/output_specifications/action.txt @@ -0,0 +1 @@ +Select your action here. \ No newline at end of file diff --git a/AgentOccam/prompts/output_specifications/action_assessment.txt b/AgentOccam/prompts/output_specifications/action_assessment.txt new file mode 100644 index 0000000..e65d47a --- /dev/null +++ b/AgentOccam/prompts/output_specifications/action_assessment.txt @@ -0,0 +1 @@ +Assess the value and risk of each action. Consider both the best-case and worst-case outcomes resulting from its implementation. 
Itemize the assessment using this format: `- action [action_id]: [action value, including but not limited to what outcomes you can expect by executing the action, or whether the note is of the most correct and comprehensive content] [action risk, including but not limited to whether the note/stop content is correct, and whether you can gather more information by continuing playing rather than ending the trial] [{best_case}] [{worst_case}]`. \ No newline at end of file diff --git a/AgentOccam/prompts/output_specifications/action_candidates.txt b/AgentOccam/prompts/output_specifications/action_candidates.txt new file mode 100644 index 0000000..d1e2c48 --- /dev/null +++ b/AgentOccam/prompts/output_specifications/action_candidates.txt @@ -0,0 +1 @@ +Propose ALL potential actions at this step. Itemize the actions using this format: `- reason: [{reason_for_proposing_the_following_action0}]\n- action: [{action0_command}]\n\n- reason: [{reason_for_proposing_the_following_action1}]\n- action: [{action1_command}]\n\n...`. \ No newline at end of file diff --git a/AgentOccam/prompts/output_specifications/action_selection.txt b/AgentOccam/prompts/output_specifications/action_selection.txt new file mode 100644 index 0000000..4510590 --- /dev/null +++ b/AgentOccam/prompts/output_specifications/action_selection.txt @@ -0,0 +1 @@ +List the numerical id of your selected action here. You can only choose one action. E.g., `1`. \ No newline at end of file diff --git a/AgentOccam/prompts/output_specifications/interaction_history_summary.txt b/AgentOccam/prompts/output_specifications/interaction_history_summary.txt new file mode 100644 index 0000000..6817fc9 --- /dev/null +++ b/AgentOccam/prompts/output_specifications/interaction_history_summary.txt @@ -0,0 +1 @@ +Emphasize all important details in the INTERACTION HISTORY section. 
\ No newline at end of file diff --git a/AgentOccam/prompts/output_specifications/mistakes.txt b/AgentOccam/prompts/output_specifications/mistakes.txt new file mode 100644 index 0000000..f2a13af --- /dev/null +++ b/AgentOccam/prompts/output_specifications/mistakes.txt @@ -0,0 +1,26 @@ +Point out the major mistakes of previous steps by ONLY using the following templates: + - You have made a reasoning mistake by "{quote}". The correct reasoning should be "{correction}". + - You should check the "{link_name}" link first. + - You should know that the recent order table doesn't include all previous orders. Don't rush to a conclusion. + - You have missed important details on this page: {details}. + - I don't think your answer follows the task requirements. That's a fault I wouldn't expect. Reconsider seriously. + - You have employed different approaches/the same approach many times to do the task but failed. The task assigner might just want to challenge you to answer no and there might be no answer for this brain teaser question. + - If the task asks for the most extreme case (e.g., with highest price), I suggest you sort them by that key first. + - If there are multiple requirements for an item, break down the requirements and search them one by one. + - The active plan is a complex task. Don't rush. Further break down the task by using the planning commands. + - There might be multiple relevant orders to check before reaching the conclusion. First, view ALL previous orders to finalize the order checklist and take notes of orders to be checked with `note [note_content]` command while viewing. Second, view the order details one by one and take notes of all crucial information. Finally, view all notes and think step by step before concluding the answer. + - You have reasoned too much in one step which leads to errors. Break down the task with planning. + - You should change the "selected" state of the items in the combobox.
+ - From my observation and consideration, I suggest you conclude the task as there's no answer even though you have tried multiple times with different approaches. + - When the task mentions "category", it implies you can navigate to that category by selecting menus step by step. Select the most relevant first and the subcategories would appear. Select the appropriate subcategory then. + - You have not gone over all the reviews, {review_page_num} pages in total. + - You have not gone over all the items, {item_page_num} pages in total. + - Don't take the same notes multiple times. + - You should select and click the radio (required field) first. + - You should go over all relevant items and take notes of all crucial information with `note [note_content]`. Then finalize your choice by carefully considering your notes. + - Don't submit yet. Just show the form completion page. Retry. + - You missed a required field before submission, which leads to the failure of your last attempt. Retry. + - Canceled Orders and pending orders are not fulfilled orders. + - There are {order_num} relevant orders on this page, which is/are {order_ids}. You have viewed {order_ids} and taken notes, and {order_ids} still requires reviewing and taking notes. + - You have gone over all review/item/order pages. + - Except when keywords "category", "subcategories", etc. are specifically mentioned in the objective, the fastest way to find items is to use the `search` feature. diff --git a/AgentOccam/prompts/output_specifications/observation_description.txt b/AgentOccam/prompts/output_specifications/observation_description.txt new file mode 100644 index 0000000..e05ed36 --- /dev/null +++ b/AgentOccam/prompts/output_specifications/observation_description.txt @@ -0,0 +1 @@ +Describe information in the CURRENT OBSERVATION section. Emphasize elements and features that are relevant or potentially helpful for fulfilling the objective in detail.
\ No newline at end of file diff --git a/AgentOccam/prompts/output_specifications/observation_highlight.txt b/AgentOccam/prompts/output_specifications/observation_highlight.txt new file mode 100644 index 0000000..0c321e3 --- /dev/null +++ b/AgentOccam/prompts/output_specifications/observation_highlight.txt @@ -0,0 +1 @@ +List the numerical ids of elements on the current webpage based on which you would issue your action. Also include elements on the current webpage you would attend to if you fail in the future and have to restore to this step. Don't include elements from the previous pages. Select elements at a higher hierarchical level if most of their child nodes are considered crucial. Sort by relevance and potential values from high to low, and separate the ids with commas. E.g., `1321, 52, 756, 838`. \ No newline at end of file diff --git a/AgentOccam/prompts/output_specifications/plan_progress_assessment.txt b/AgentOccam/prompts/output_specifications/plan_progress_assessment.txt new file mode 100644 index 0000000..989fdc5 --- /dev/null +++ b/AgentOccam/prompts/output_specifications/plan_progress_assessment.txt @@ -0,0 +1 @@ +Review critically why the plans have not been fulfilled or the objective achieved. Justify your assessment with detailed evidence drawn from the objective, observations, and actions taken. Itemize the assessment using this format: `- plan [{plan_id}]\n\t[{step_ids_taken_for_this_milestone}] [{concrete_proof_from_observation}] [{why_milestone_a_not_successful}]\n\t[{step_ids_taken_for_this_milestone}] [{concrete_proof_from_observation}] [{why_milestone_b_not_successful}]\n\t...`. \ No newline at end of file diff --git a/AgentOccam/prompts/output_specifications/reason.txt b/AgentOccam/prompts/output_specifications/reason.txt new file mode 100644 index 0000000..5ba3c85 --- /dev/null +++ b/AgentOccam/prompts/output_specifications/reason.txt @@ -0,0 +1 @@ +Provide your rationale for proposing the subsequent action commands here.
\ No newline at end of file diff --git a/AgentOccam/prompts/planning_specifications/branch.txt b/AgentOccam/prompts/planning_specifications/branch.txt new file mode 100644 index 0000000..8e0345b --- /dev/null +++ b/AgentOccam/prompts/planning_specifications/branch.txt @@ -0,0 +1 @@ +branch [parent_plan_id] [new_subplan_intent]: To create a new subplan based on PREVIOUS PLANS. Ensure the new subplan is connected to the appropriate parent plan by using its ID. E.g., `branch [12] [Navigate to the "Issue" page to check all the issues.]` \ No newline at end of file diff --git a/AgentOccam/prompts/planning_specifications/prune.txt b/AgentOccam/prompts/planning_specifications/prune.txt new file mode 100644 index 0000000..419568a --- /dev/null +++ b/AgentOccam/prompts/planning_specifications/prune.txt @@ -0,0 +1 @@ +prune [resume_plan_id] [reason]: To return to a previous plan state when the current plan is deemed impractical. Enter the ID of the plan state you want to resume. E.g., `prune [5] [The current page lacks items "black speaker," prompting a return to the initial page to restart the item search.]` \ No newline at end of file diff --git a/AgentOccam/utils.py b/AgentOccam/utils.py new file mode 100644 index 0000000..4e70e14 --- /dev/null +++ b/AgentOccam/utils.py @@ -0,0 +1,401 @@ +import os +import csv +import json + +CURRENT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +PARENT_DIR = os.path.dirname(CURRENT_DIR) +EVALUATOR_DIR = os.path.join(CURRENT_DIR, "config_files") +TRAJECTORY_DIR = os.path.join(PARENT_DIR, "AgentOccam-Trajectories") +OUTPUT_DIR = os.path.join(CURRENT_DIR, "output") +HOMEPAGE_URL = "localhost:4399" + + +TASK_ID_DICT = { + "ALL": list(range(812)), + "SHOPPING_ADMIN": [0, 1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 41, 42, 43, 62, 63, 64, 65, 77, 78, 79, 94, 95, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 119, 120, 121, 122, 123, 127, 128, 129, 130, 131, 157, 183, 184, 185, 186, 187, 193, 194, 195, 196, 197, 198, 199, 200, 
201, 202, 203, 204, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 243, 244, 245, 246, 247, 288, 289, 290, 291, 292, 344, 345, 346, 347, 348, 374, 375, 423, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 470, 471, 472, 473, 474, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 676, 677, 678, 679, 680, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 790], + "MAP": [7, 8, 9, 10, 16, 17, 18, 19, 20, 32, 33, 34, 35, 36, 37, 38, 39, 40, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 70, 71, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 98, 99, 100, 101, 137, 138, 139, 140, 151, 152, 153, 154, 155, 218, 219, 220, 221, 222, 223, 224, 236, 237, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 287, 356, 363, 364, 365, 366, 367, 369, 370, 371, 372, 373, 377, 378, 379, 380, 381, 382, 383, 757, 758, 761, 762, 763, 764, 765, 766, 767], + "SHOPPING": [21, 22, 23, 24, 25, 26, 47, 48, 49, 50, 51, 96, 117, 118, 124, 125, 126, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 188, 189, 190, 191, 192, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 238, 239, 240, 241, 242, 260, 261, 262, 263, 264, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 298, 299, 300, 301, 302, 313, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 351, 352, 353, 354, 355, 358, 359, 360, 361, 362, 368, 376, 384, 385, 386, 387, 388, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 465, 466, 467, 468, 469, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 528, 529, 530, 531, 532, 571, 572, 573, 574, 575, 585, 586, 587, 588, 589, 653, 654, 655, 656, 
657, 689, 690, 691, 692, 693, 792, 793, 794, 795, 796, 797, 798], + "REDDIT": [27, 28, 29, 30, 31, 66, 67, 68, 69, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 580, 581, 582, 583, 584, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735], + "GITLAB": [44, 45, 46, 102, 103, 104, 105, 106, 132, 133, 134, 135, 136, 156, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 205, 206, 207, 258, 259, 293, 294, 295, 296, 297, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 314, 315, 316, 317, 318, 339, 340, 341, 342, 343, 349, 350, 357, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 522, 523, 524, 525, 526, 527, 533, 534, 535, 536, 537, 567, 568, 569, 570, 576, 577, 578, 579, 590, 591, 592, 593, 594, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 736, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 783, 784, 785, 786, 787, 788, 789, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811], + "MULTISITE": [97, 265, 266, 267, 268, 424, 425, 426, 427, 428, 429, 430, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 671, 672, 673, 674, 675, 681, 682, 683, 684, 685, 686, 687, 688, 737, 738, 739, 740, 741, 759, 760, 791], +} + +MERGED_SITE_TASK_ID_DICT = { + "ALL": list(range(812)), + "SHOPPING_ADMIN": [0, 1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 41, 42, 43, 62, 63, 64, 65, 77, 78, 79, 94, 95, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 119, 120, 
121, 122, 123, 127, 128, 129, 130, 131, 157, 183, 184, 185, 186, 187, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 243, 244, 245, 246, 247, 288, 289, 290, 291, 292, 344, 345, 346, 347, 348, 374, 375, 423, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 470, 471, 472, 473, 474, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 676, 677, 678, 679, 680, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 790], + "MAP": [7, 8, 9, 10, 16, 17, 18, 19, 20, 32, 33, 34, 35, 36, 37, 38, 39, 40, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 70, 71, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 97, 98, 99, 100, 101, 137, 138, 139, 140, 151, 152, 153, 154, 155, 218, 219, 220, 221, 222, 223, 224, 236, 237, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 265, 266, 267, 268, 287, 356, 363, 364, 365, 366, 367, 369, 370, 371, 372, 373, 377, 378, 379, 380, 381, 382, 383, 424, 425, 426, 427, 428, 429, 430, 737, 738, 739, 740, 741, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767], + "SHOPPING": [21, 22, 23, 24, 25, 26, 47, 48, 49, 50, 51, 96, 117, 118, 124, 125, 126, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 188, 189, 190, 191, 192, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 238, 239, 240, 241, 242, 260, 261, 262, 263, 264, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 298, 299, 300, 301, 302, 313, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 351, 352, 353, 354, 355, 358, 359, 360, 361, 362, 368, 376, 384, 385, 386, 387, 388, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 
465, 466, 467, 468, 469, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 528, 529, 530, 531, 532, 571, 572, 573, 574, 575, 585, 586, 587, 588, 589, 653, 654, 655, 656, 657, 671, 672, 673, 674, 675, 689, 690, 691, 692, 693, 792, 793, 794, 795, 796, 797, 798], + "REDDIT": [27, 28, 29, 30, 31, 66, 67, 68, 69, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 580, 581, 582, 583, 584, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 681, 682, 683, 684, 685, 686, 687, 688, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735], + "GITLAB": [44, 45, 46, 102, 103, 104, 105, 106, 132, 133, 134, 135, 136, 156, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 205, 206, 207, 258, 259, 293, 294, 295, 296, 297, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 314, 315, 316, 317, 318, 339, 340, 341, 342, 343, 349, 350, 357, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 522, 523, 524, 525, 526, 527, 533, 534, 535, 536, 537, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 576, 577, 578, 579, 590, 591, 592, 593, 594, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 736, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 783, 784, 785, 786, 787, 788, 789, 791, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811] +} + +TASK_LABELS = ['shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 
'shopping_admin', 'map', 'map', 'map', 'map', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'map', 'map', 'map', 'map', 'map', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'gitlab', 'gitlab', 'gitlab', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'reddit', 'reddit', 'reddit', 'reddit', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'shopping_admin', 'shopping_admin', 'shopping', 'map', 'map', 'map', 'map', 'map', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping', 'shopping', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping', 'shopping', 'shopping', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'map', 'map', 'map', 'map', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'map', 'map', 'map', 'map', 'map', 'gitlab', 'shopping_admin', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping_admin', 
'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'gitlab', 'gitlab', 'gitlab', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'map', 'map', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'gitlab', 'gitlab', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'map', 'map', 'map', 'map', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'map', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping_admin', 'shopping_admin', 
'shopping_admin', 'shopping_admin', 'shopping_admin', 'gitlab', 'gitlab', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'map', 'gitlab', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'map', 'map', 'map', 'map', 'map', 'shopping', 'map', 'map', 'map', 'map', 'map', 'shopping_admin', 'shopping_admin', 'shopping', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping_admin', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 
'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 
'reddit', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'gitlab', 'map', 'map', 'map', 'map', 'map', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping_admin', 'gitlab', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab'] +TASK_LABELS_MULTISITE = ['shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'map', 'map', 'map', 'map', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'map', 'map', 'map', 'map', 'map', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 
'shopping_admin', 'shopping_admin', 'shopping_admin', 'gitlab', 'gitlab', 'gitlab', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'reddit', 'reddit', 'reddit', 'reddit', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'shopping_admin', 'shopping_admin', 'shopping', 'multisite', 'map', 'map', 'map', 'map', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping', 'shopping', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping', 'shopping', 'shopping', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'map', 'map', 'map', 'map', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'map', 'map', 'map', 'map', 'map', 'gitlab', 'shopping_admin', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'gitlab', 
'gitlab', 'gitlab', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'map', 'map', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'gitlab', 'gitlab', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'multisite', 'multisite', 'multisite', 'multisite', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'map', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'gitlab', 'gitlab', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'map', 'gitlab', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'map', 'map', 'map', 'map', 'map', 'shopping', 'map', 'map', 'map', 'map', 'map', 'shopping_admin', 'shopping_admin', 'shopping', 
'map', 'map', 'map', 'map', 'map', 'map', 'map', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping_admin', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 
'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 
'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'reddit', 'gitlab', 'multisite', 'multisite', 'multisite', 'multisite', 'multisite', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'map', 'map', 'multisite', 'multisite', 'map', 'map', 'map', 'map', 'map', 'map', 'map', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'shopping_admin', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'shopping_admin', 'multisite', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab', 'gitlab'] + +TOTAL_TASK_NUM_DICT = { + "ALL": 812, + "SHOPPING_ADMIN": 182, + "MAP": 109, + "SHOPPING": 187, + "REDDIT": 106, + "GITLAB": 180, + "MULTISITE": 48 +} + +EVELUATOR_RECTIFICATIONS = [16, 17, 18, 19, 20, 97, 146, 178, 179, 180, 181, 182, 240, 254, 261, 262, 263, 264, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 308, 309, 310, 311, 312, 330, 351, 352, 353, 354, 355, 363, 364, 365, 366, 367, 415, 416, 417, 418, 489, 528, 529, 530, 531, 532, 583, 584, 601, 603, 606, 608, 629, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 653, 654, 655, 656, 657, 679, 707, 708, 709] + +TRAJECTORY_DIR_DICT = { + # Main Results + 0: os.path.join(TRAJECTORY_DIR, "AgentOccam"), + 1: 
os.path.join(TRAJECTORY_DIR, "AgentOccam-SteP"), + 2: os.path.join(TRAJECTORY_DIR, "AgentOccam-Judge"), + # Ablation Study + 3: os.path.join(TRAJECTORY_DIR, "reduced_action"), + 4: os.path.join(TRAJECTORY_DIR, "reduced_action-X_scrolling"), + 5: os.path.join(TRAJECTORY_DIR, "reduced_action-X_scrolling-obs_opt"), + 6: os.path.join(TRAJECTORY_DIR, "reduced_action-X_scrolling-obs_opt-history"), + # Replications + 7: os.path.join(TRAJECTORY_DIR, "WebArena-replication"), + 8: os.path.join(TRAJECTORY_DIR, "SteP-replication"), +} + +RUN_NAME_DICT = { + # Main Results + 0: "AgentOccam", + 1: "AgentOccam + SteP", + 2: "AgentOccam + Judge", + # Ablation Study + 3: "↓ Actions", + 4: "Above + X Scrolling", + 5: "Above + Obs Opt.", + 6: "Above + History", + # Replications + 7: "WebArena-replication", + 8: "SteP-replication", +} + +COLOR_DICT = { + # Main Results + 0: "#45C4B0", + 1: "#B68193", + 2: "#E7D5BF", + # Ablation Study + 3: "#594D47", + 4: "#6E8480", + 5: "#D98C6C", + 6: "#997E73", + # Replications + 7: "#203330", + 8: "#969696", +} + +def print_trajectory(json_path): + import json + item = json.load(open(json_path, "r")) + for step in item["trajectory"]: + obj = step["objective"] + url = step["url"] + obs = step["observation"] + reason = step["reason"] + action = step["action"] + if "plan" in step.keys(): + plan = step["plan"] + print(f"### Objective\n{obj}") + print(f"### Url\n{url}") + print(f"### Observation\n{obs}") + print(f"### Plan\n{plan}") + print(f"### Reason\n{reason}") + print(f"### Action\n{action}") + else: + print(f"### Objective\n{obj}") + print(f"### Url\n{url}") + print(f"### Observation\n{obs}") + print(f"### Reason\n{reason}") + print(f"### Action\n{action}") + +def find_task_by_intent_template_id(intent_template_id, task_config_data_dir=EVALUATOR_DIR): + import json + for filename in sorted([p for p in os.listdir(task_config_data_dir) if p[0].isdigit()], key=lambda item: int(os.path.basename(item)[:-len(".json")])): + if 
filename.endswith(".json"): + filepath = os.path.join(task_config_data_dir, filename) + with open(filepath, 'r', encoding='utf-8') as file: + try: + data = json.load(file) + if data.get('intent_template_id') == intent_template_id: + intent = data.get("intent") + print(f"File: {filename}\nIntent: {intent}") + except json.JSONDecodeError as e: + print(f"Error decoding JSON from file {filename}: {str(e)}") + +def traverse_task_configs(task_config_data_dir=EVALUATOR_DIR): + import json + task_id_dict = {} + for filename in sorted([p for p in os.listdir(task_config_data_dir) if p[0].isdigit()], key=lambda item: int(os.path.basename(item)[:-len(".json")])): + if filename.endswith(".json"): + filepath = os.path.join(task_config_data_dir, filename) + with open(filepath, 'r', encoding='utf-8') as file: + try: + data = json.load(file) + data_site = data["sites"][0] + if len(data["sites"]) > 1: + data_site = "multisite" + if data_site not in task_id_dict.keys(): + task_id_dict[data_site] = [] + task_id_dict[data_site].append(data["task_id"]) + except json.JSONDecodeError as e: + print(f"Error decoding JSON from file {filename}: {str(e)}") + for k in task_id_dict.keys(): + print(f"\"{k.upper()}\":", "["+", ".join([str(item) for item in sorted(task_id_dict[k], reverse=False)])+"],") + print() + for i in range(812): + for k in task_id_dict.keys(): + if i in task_id_dict[k]: + print(repr(k)+", ", end="") + +def load_json_obj_from_file(filepath): + with open(filepath, 'r') as f: + data = json.load(f) + return data + +def print_task_info_by_id(task_config_dir=EVALUATOR_DIR, task_ids=[65]): + for task_id in task_ids: + filepath = os.path.join(task_config_dir, f"{task_id}.json") + task_data = load_json_obj_from_file(filepath) + print(task_data["start_url"]) + +def clean_trajectory_files(input_dir, output_dir): + os.makedirs(output_dir, exist_ok=True) + for filename in [f"{i}.json" for i in range(812)]: + trajectory_obj = json.load(open(os.path.join(input_dir, filename), "r")) + 
new_obj = {} + for k in ["task", "id", "model"]: + new_obj[k] = trajectory_obj[k] + new_obj["type"] = "observation_action_space_refinement" + new_obj["objective"] = trajectory_obj["trajectory"][0]["objective"] + new_obj["trajectory"] = [] + for step in range(len(trajectory_obj["trajectory"])): + step_obj = {} + for k in ["url", "observation", "num_actions", "plan", "interaction_history_summary", "observation_description", "reason", "action", "observation_highlight_idxs", "done"]: + step_obj[k] = trajectory_obj["trajectory"][step][k] + step_obj["reward"] = trajectory_obj["trajectory"][step]["reward"] if trajectory_obj["trajectory"][step]["reward"] == 1. else 0. + new_obj["trajectory"].append(step_obj) + json.dump(new_obj, open(os.path.join(output_dir, filename), "w"), indent=4) + +def check_shopping_admin_login_expire(): + login_expire_dict = {} + for k in set(TRAJECTORY_DIR_DICT.keys())-set({16}): + for task_id in MERGED_SITE_TASK_ID_DICT["SHOPPING_ADMIN"]: + filepath = os.path.join(TRAJECTORY_DIR_DICT[k], f"{task_id}.json") + if not os.path.exists(filepath): + print(f"{filepath} doesn't exist.") + continue + trajectory_data = json.load(open(filepath, 'r')) + step_0_data = trajectory_data["trajectory"][0] + if "username" in step_0_data["observation"].lower() and "password" in step_0_data["observation"].lower() and "sign in" in step_0_data["observation"].lower(): + if k in login_expire_dict.keys(): + login_expire_dict[k].append(task_id) + else: + login_expire_dict[k] = [task_id] + for k in sorted(list(login_expire_dict.keys())): + print(k) + print(login_expire_dict[k]) + +def check_reddit_post_limit(): + post_limit_dict = {} + for k in set(TRAJECTORY_DIR_DICT.keys())-set({16}): + for task_id in MERGED_SITE_TASK_ID_DICT["REDDIT"]: + filepath = os.path.join(TRAJECTORY_DIR_DICT[k], f"{task_id}.json") + if not os.path.exists(filepath): + print(f"{filepath} doesn't exist.") + continue + trajectory_data = json.load(open(filepath, 'r')) + for step_data in 
trajectory_data["trajectory"]: + if "You cannot post more. Wait a while before trying again." in step_data["observation"]: + if k in post_limit_dict.keys(): + post_limit_dict[k].append(task_id) + else: + post_limit_dict[k] = [task_id] + break + for k in sorted(list(post_limit_dict.keys())): + print(k) + print(post_limit_dict[k]) + +def get_action_statistics(trajectory_list=[-1], action_list=["click", "type"]): + ACTION_WITH_ID_LIST = ["click", "type", "scroll", "goto", "note", "stop", "branch", "prune"] + trial_dict = {} + for k in trajectory_list: + trial_dict[k] = {} + for task_id in range(812): + if k != 16: + filepath = os.path.join(TRAJECTORY_DIR_DICT[k], f"{task_id}.json") + if not os.path.exists(filepath): + print(f"{filepath} doesn't exist.") + continue + trajectory_data = json.load(open(filepath, 'r')) + for step_data in trajectory_data["trajectory"]: + for action in action_list: + if (action in ACTION_WITH_ID_LIST and f"{action} [" in step_data["action"]) or (action not in ACTION_WITH_ID_LIST and action in step_data["action"]): + if action in trial_dict[k].keys(): + trial_dict[k][action] += 1 + else: + trial_dict[k][action] = 1 + else: + filepath = os.path.join(TRAJECTORY_DIR_DICT[k], f"trace_{task_id}.json") + if not os.path.exists(filepath): + print(f"{filepath} doesn't exist.") + continue + trajectory_data = json.load(open(filepath, 'r')) + for step_data in trajectory_data["trace"]: + for action in action_list: + if (action in ACTION_WITH_ID_LIST and f"{action} [" in step_data["target"]) or (action not in ACTION_WITH_ID_LIST and action in step_data["target"]): + if action in trial_dict[k].keys(): + trial_dict[k][action] += 1 + else: + trial_dict[k][action] = 1 + csvfile = open(os.path.join(OUTPUT_DIR, "action_statistics.csv"), "w") + csv_writer = csv.writer(csvfile) + csv_writer.writerow(["TRIAL NAME"] + action_list) + for k in trajectory_list: + print(k) + csv_writer.writerow([RUN_NAME_DICT[k]] + [trial_dict[k][a] if a in trial_dict[k].keys() else 0 
for a in action_list]) + for a in trial_dict[k].keys(): + print(a, trial_dict[k][a]) + +def get_avr_obs_token_num_statistics(trajectory_list=[-1]): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained("gpt2") + trial_dict = {} + for k in trajectory_list: + trial_dict[k] = {} + for task_id in range(812): + site_label = TASK_LABELS_MULTISITE[task_id] + trajectory_total_token_num = 0 + if k != 16: + filepath = os.path.join(TRAJECTORY_DIR_DICT[k], f"{task_id}.json") + if not os.path.exists(filepath): + print(f"{filepath} doesn't exist.") + continue + trajectory_data = json.load(open(filepath, 'r')) + step_num = len(trajectory_data["trajectory"]) + for step_data in trajectory_data["trajectory"]: + trajectory_total_token_num += len(tokenizer.tokenize(step_data["observation"])) + else: + filepath = os.path.join(TRAJECTORY_DIR_DICT[k], f"trace_{task_id}.json") + if not os.path.exists(filepath): + print(f"{filepath} doesn't exist.") + continue + trajectory_data = json.load(open(filepath, 'r')) + step_num = len(trajectory_data["trace"]) + for step_data in trajectory_data["trace"]: + for source_data in step_data["source"]: + trajectory_total_token_num += len(tokenizer.tokenize(source_data["content"])) + print(trajectory_total_token_num, step_num) + if site_label in trial_dict[k].keys(): + trial_dict[k][site_label]["total_token_num"] += trajectory_total_token_num + trial_dict[k][site_label]["total_step_num"] += step_num + else: + trial_dict[k][site_label] = {"total_token_num": trajectory_total_token_num, "total_step_num": step_num} + if "all" in trial_dict[k].keys(): + trial_dict[k]["all"]["total_token_num"] += trajectory_total_token_num + trial_dict[k]["all"]["total_step_num"] += step_num + else: + trial_dict[k]["all"] = {"total_token_num": trajectory_total_token_num, "total_step_num": step_num} + csvfile = open(os.path.join(OUTPUT_DIR, "avr_obs_token_num_statistics.csv"), "w") + csv_writer = csv.writer(csvfile) + SITES = ["ALL", "SHOPPING", 
"SHOPPING_ADMIN", "GITLAB", "MAP", "REDDIT", "MAP", "MULTISITE"] + csv_writer.writerow(["TRIAL NAME"] + SITES) + for k in trajectory_list: + csv_writer.writerow([RUN_NAME_DICT[k]] + ["{:.1f}".format(trial_dict[k][s.lower()]["total_token_num"]/trial_dict[k][s.lower()]["total_step_num"]) for s in SITES]) + +def get_avr_step_num_statistics(trajectory_list=[-1]): + trial_dict = {} + for k in trajectory_list: + trial_dict[k] = {} + for task_id in range(812): + site_label = TASK_LABELS_MULTISITE[task_id] + if k != 16: + filepath = os.path.join(TRAJECTORY_DIR_DICT[k], f"{task_id}.json") + if not os.path.exists(filepath): + print(f"{filepath} doesn't exist.") + continue + trajectory_data = json.load(open(filepath, 'r')) + if site_label in trial_dict[k].keys(): + trial_dict[k][site_label] += len(trajectory_data["trajectory"]) + else: + trial_dict[k][site_label] = len(trajectory_data["trajectory"]) + if "all" in trial_dict[k].keys(): + trial_dict[k]["all"] += len(trajectory_data["trajectory"]) + else: + trial_dict[k]["all"] = len(trajectory_data["trajectory"]) + else: + filepath = os.path.join(TRAJECTORY_DIR_DICT[k], f"trace_{task_id}.json") + if not os.path.exists(filepath): + print(f"{filepath} doesn't exist.") + continue + trajectory_data = json.load(open(filepath, 'r')) + if site_label in trial_dict[k].keys(): + trial_dict[k][site_label] += len(trajectory_data["trace"]) + else: + trial_dict[k][site_label] = len(trajectory_data["trace"]) + if "all" in trial_dict[k].keys(): + trial_dict[k]["all"] += len(trajectory_data["trace"]) + else: + trial_dict[k]["all"] = len(trajectory_data["trace"]) + csvfile = open(os.path.join(OUTPUT_DIR, "avr_step_num_statistics.csv"), "w") + csv_writer = csv.writer(csvfile) + SITES = ["ALL", "SHOPPING", "SHOPPING_ADMIN", "GITLAB", "MAP", "REDDIT", "MULTISITE"] + csv_writer.writerow(["TRIAL NAME"] + SITES) + for k in trajectory_list: + csv_writer.writerow([RUN_NAME_DICT[k]] + ["{:.1f}".format(trial_dict[k][s.lower()]/TOTAL_TASK_NUM_DICT[s]) for 
def _diff_json(json1, json2, path=""):
    """Return a flat ``{dotted.key.path: difference}`` mapping for two dicts.

    A key present on one side only maps to that side's value; a key whose
    values differ maps to the tuple ``(value_in_json1, value_in_json2)``.
    Nested dicts are compared recursively.
    """
    differences = {}

    for key in json1:
        new_path = f"{path}.{key}" if path else key
        if key not in json2:
            differences[new_path] = json1[key]
        elif isinstance(json1[key], dict) and isinstance(json2[key], dict):
            differences.update(_diff_json(json1[key], json2[key], new_path))
        elif json1[key] != json2[key]:
            differences[new_path] = (json1[key], json2[key])

    for key in json2:
        if key not in json1:
            new_path = f"{path}.{key}" if path else key
            differences[new_path] = json2[key]

    return differences


def compare_evaluators(dir1, dir2, task_num=812, output_path=None):
    """Write a report of the differences between two directories of task JSON files.

    For each task index ``i`` in ``range(task_num)`` the files ``<i>.json`` of
    both directories are loaded and compared key by key. Tasks with at least
    one difference get a ``# Task i`` section in the report.

    Args:
        dir1: First directory containing ``<i>.json`` files.
        dir2: Second directory containing ``<i>.json`` files.
        task_num: Number of task indices to compare (defaults to 812).
        output_path: Report file to write. Defaults to
            ``evaluator_differences.txt`` inside ``OUTPUT_DIR``.
    """
    if output_path is None:
        output_path = os.path.join(OUTPUT_DIR, "evaluator_differences.txt")

    # The context manager guarantees the report is flushed and closed.
    with open(output_path, "w") as report:
        for i in range(task_num):
            file1 = os.path.join(dir1, f"{i}.json")
            file2 = os.path.join(dir2, f"{i}.json")

            # Skip tasks that were not produced by both runs instead of
            # aborting the whole comparison on the first missing file.
            missing = [p for p in (file1, file2) if not os.path.exists(p)]
            if missing:
                for p in missing:
                    print(f"{p} doesn't exist.")
                continue

            with open(file1, 'r') as f:
                json1 = json.load(f)
            with open(file2, 'r') as f:
                json2 = json.load(f)

            differences = _diff_json(json1, json2)
            if not differences:
                continue

            print(f"# Task {i}", file=report)
            for key, value in differences.items():
                if isinstance(value, tuple):
                    print("### {}\n{}\n{}".format(key, value[0], value[1]), file=report)
                else:
                    print("### {}\n{}".format(key, value), file=report)
            print(file=report)
# Directory that contains this config module (the project source root).
PROJECT_SOURCE_ROOT = os.path.dirname(os.path.abspath(__file__))
SOURCE_LOG_FOLDER_PATH = os.path.join(PROJECT_SOURCE_ROOT, 'log_files')

# Parent of the source root.
PROJECT_ROOT = os.path.dirname(PROJECT_SOURCE_ROOT)

PROJECT_TEMP_PATH = os.path.join(PROJECT_ROOT, 'temp')

USER_PREFERENCES_PATH = os.path.join(PROJECT_SOURCE_ROOT, 'user_preferences')
PROJECT_TEST_ROOT = os.path.join(PROJECT_ROOT, 'test')


def _ensure_directory(path: str, label: str) -> None:
    """Create ``path`` if nothing exists there yet and report the creation.

    ``exist_ok=True`` keeps the call safe when another process creates the
    directory between the existence check and ``makedirs``.
    """
    if os.path.exists(path):
        return
    os.makedirs(path, exist_ok=True)
    print(f"Created {label} at: {path}")


# Create the log, user-preferences and temp folders on import if missing.
_ensure_directory(SOURCE_LOG_FOLDER_PATH, "log folder")
_ensure_directory(USER_PREFERENCES_PATH, "user preferences folder")
_ensure_directory(PROJECT_TEMP_PATH, "temp folder")
+from datetime import datetime +from string import Template +from typing import Any + +import autogen # type: ignore + +from Agent_E.ae.core.memory.static_ltm import get_user_ltm +from Agent_E.ae.core.prompts import LLM_PROMPTS +from Agent_E.ae.core.skills.click_using_selector import click as click_element + +# from Agent_E.ae.core.skills.enter_text_and_click import enter_text_and_click +from Agent_E.ae.core.skills.enter_text_using_selector import bulk_enter_text +from Agent_E.ae.core.skills.enter_text_using_selector import entertext +from Agent_E.ae.core.skills.get_dom_with_content_type import get_dom_with_content_type +from Agent_E.ae.core.skills.get_url import geturl +from Agent_E.ae.core.skills.open_url import openurl +from Agent_E.ae.core.skills.pdf_text_extractor import extract_text_from_pdf + +#from Agent_E.ae.core.skills.pdf_text_extractor import extract_text_from_pdf +from Agent_E.ae.core.skills.press_key_combination import press_key_combination +from Agent_E.ae.core.skills.skill_registry import skill_registry +from Agent_E.ae.utils.logger import logger + + +class BrowserNavAgent: + def __init__(self, model_config_list, llm_config_params: dict[str, Any], system_prompt: str|None, browser_nav_executor: autogen.UserProxyAgent): # type: ignore + """ + Initialize the BrowserNavAgent and store the AssistantAgent instance + as an instance attribute for external access. + + Parameters: + - model_config_list: A list of configuration parameters required for AssistantAgent. + - llm_config_params: A dictionary of configuration parameters for the LLM. + - system_prompt: The system prompt to be used for this agent or the default will be used if not provided. + - user_proxy_agent: An instance of the UserProxyAgent class. 
+ """ + self.browser_nav_executor = browser_nav_executor + user_ltm = self.__get_ltm() + + system_message = LLM_PROMPTS["BROWSER_AGENT_PROMPT"] + if system_prompt and len(system_prompt) > 0: + if isinstance(system_prompt, list): + system_message = "\n".join(system_prompt) + else: + system_message = system_prompt + logger.info(f"Using custom system prompt for BrowserNavAgent: {system_message}") + + system_message = system_message + "\n" + f"Today's date is {datetime.now().strftime('%d %B %Y')}" + if user_ltm: #add the user LTM to the system prompt if it exists + user_ltm = "\n" + user_ltm + system_message = Template(system_message).substitute(basic_user_information=user_ltm) + logger.info(f"Browser nav agent using model: {model_config_list[0]['model']}") + self.agent = autogen.ConversableAgent( + name="browser_navigation_agent", + system_message=system_message, + llm_config={ + "config_list": model_config_list, + **llm_config_params #unpack all the name value pairs in llm_config_params as is + }, + ) + self.__register_skills() + + + def __get_ltm(self): + """ + Get the the long term memory of the user. + returns: str | None - The user LTM or None if not found. + """ + return get_user_ltm() + + + def __register_skills(self): + """ + Register all the skills that the agent can perform. 
+ """ + + # Register each skill for LLM by assistant agent and for execution by user_proxy_agen + + self.agent.register_for_llm(description=LLM_PROMPTS["OPEN_URL_PROMPT"])(openurl) + self.browser_nav_executor.register_for_execution()(openurl) + + # self.agent.register_for_llm(description=LLM_PROMPTS["ENTER_TEXT_AND_CLICK_PROMPT"])(enter_text_and_click) + # self.browser_nav_executor.register_for_execution()(enter_text_and_click) + + self.agent.register_for_llm(description=LLM_PROMPTS["GET_DOM_WITH_CONTENT_TYPE_PROMPT"])(get_dom_with_content_type) + self.browser_nav_executor.register_for_execution()(get_dom_with_content_type) + + self.agent.register_for_llm(description=LLM_PROMPTS["CLICK_PROMPT"])(click_element) + self.browser_nav_executor.register_for_execution()(click_element) + + self.agent.register_for_llm(description=LLM_PROMPTS["GET_URL_PROMPT"])(geturl) + self.browser_nav_executor.register_for_execution()(geturl) + + self.agent.register_for_llm(description=LLM_PROMPTS["BULK_ENTER_TEXT_PROMPT"])(bulk_enter_text) + self.browser_nav_executor.register_for_execution()(bulk_enter_text) + + self.agent.register_for_llm(description=LLM_PROMPTS["ENTER_TEXT_PROMPT"])(entertext) + self.browser_nav_executor.register_for_execution()(entertext) + + self.agent.register_for_llm(description=LLM_PROMPTS["PRESS_KEY_COMBINATION_PROMPT"])(press_key_combination) + self.browser_nav_executor.register_for_execution()(press_key_combination) + + self.agent.register_for_llm(description=LLM_PROMPTS["EXTRACT_TEXT_FROM_PDF_PROMPT"])(extract_text_from_pdf) + self.browser_nav_executor.register_for_execution()(extract_text_from_pdf) + + ''' + # Register reply function for printing messages + self.browser_nav_executor.register_reply( # type: ignore + [autogen.Agent, None], + reply_func=print_message_from_user_proxy, + config={"callback": None}, + ) + self.agent.register_reply( # type: ignore + [autogen.Agent, None], + reply_func=print_message_from_browser_agent, + config={"callback": None}, + ) 
+ ''' + self.__load_additional_skills() + + #print(f">>> Function map: {self.browser_nav_executor.function_map}") # type: ignore + + + def __load_additional_skills(self): + """ + Dynamically load additional skills from directories or specific Python files + specified by an environment variable. + """ + # Get additional skill directories or files from environment variable + additional_skill_dirs: str = os.getenv('ADDITIONAL_SKILL_DIRS', "") + if len(additional_skill_dirs) == 0: + logger.debug("No additional skill directories or files specified.") + return + + additional_skill_paths: list[str] = additional_skill_dirs.split(',') + + for skill_path in additional_skill_paths: + skill_path = skill_path.strip() # Strip whitespace + + if os.path.isdir(skill_path): + # If the path is a directory, process all .py files in it + for filename in os.listdir(skill_path): + if filename.endswith(".py"): + module_name = filename[:-3] # Remove .py extension + module_path = f"{skill_path.replace('/', '.')}.{module_name}" + importlib.import_module(module_path) + + elif skill_path.endswith(".py") and os.path.isfile(skill_path): + # If the path is a specific .py file, load it directly + module_name = os.path.basename(skill_path)[:-3] # Strip .py extension + directory_path = os.path.dirname(skill_path).replace('/', '.') + module_path = f"{directory_path}.{module_name}" + importlib.import_module(module_path) + else: + logger.warning(f"Invalid skill path specified: {skill_path}") + + # Register the skills that were dynamically discovered + for skill in skill_registry: + self.agent.register_for_llm(description=skill['description'])(skill['func']) + self.browser_nav_executor.register_for_execution()(skill['func']) + logger.debug(f"Registered additional skill: {skill['name']}") + diff --git a/Agent_E/ae/core/agents/high_level_planner_agent.py b/Agent_E/ae/core/agents/high_level_planner_agent.py new file mode 100644 index 0000000..388886f --- /dev/null +++ 
b/Agent_E/ae/core/agents/high_level_planner_agent.py @@ -0,0 +1,77 @@ +import os +from datetime import datetime +from string import Template +from typing import Any + +import autogen # type: ignore +from autogen import ConversableAgent # type: ignore + +from Agent_E.ae.core.memory.static_ltm import get_user_ltm +from Agent_E.ae.core.post_process_responses import final_reply_callback_planner_agent as print_message_as_planner # type: ignore +from Agent_E.ae.core.prompts import LLM_PROMPTS +from Agent_E.ae.core.skills.get_user_input import get_user_input +from Agent_E.ae.utils.logger import logger + + +class PlannerAgent: + def __init__(self, model_config_list, llm_config_params: dict[str, Any], system_prompt: str|None, user_proxy_agent:ConversableAgent): # type: ignore + """ + Initialize the PlannerAgent and store the AssistantAgent instance + as an instance attribute for external access. + + Parameters: + - model_config_list: A list of configuration parameters required for AssistantAgent. + - llm_config_params: A dictionary of configuration parameters for the LLM. + - system_prompt: The system prompt to be used for this agent or the default will be used if not provided. + - user_proxy_agent: An instance of the UserProxyAgent class. 
+ """ + enable_user_input = os.getenv("PLANNER_USER_INPUT_SKILL_ENABLED", "false").lower() == "true" + + user_ltm = self.__get_ltm() + system_message = LLM_PROMPTS["PLANNER_AGENT_PROMPT"] + + if system_prompt and len(system_prompt) > 0: + if isinstance(system_prompt, list): + system_message = "\n".join(system_prompt) + else: + system_message = system_prompt + logger.info(f"Using custom system prompt for PlannerAgent: {system_message}") + + + if user_ltm: #add the user LTM to the system prompt if it exists + user_ltm = "\n" + user_ltm + system_message = Template(system_message).substitute(basic_user_information=user_ltm) + system_message = system_message + "\n" + f"Today's date is {datetime.now().strftime('%d %B %Y')}" + logger.info(f"Planner agent using model: {model_config_list[0]['model']}") + + self.agent = autogen.AssistantAgent( + name="planner_agent", + system_message=system_message, + llm_config={ + "config_list": model_config_list, + **llm_config_params #unpack all the name value pairs in llm_config_params as is + }, + ) + + if enable_user_input: + # Register get_user_input skill for LLM by assistant agent + self.agent.register_for_llm(description=LLM_PROMPTS["GET_USER_INPUT_PROMPT"])(get_user_input) + # Register get_user_input skill for execution by user_proxy_agent + user_proxy_agent.register_for_execution()(get_user_input) + else: + logger.debug("User input skill is disabled for PlannerAgent") + + self.agent.register_reply( # type: ignore + [autogen.AssistantAgent, None], + reply_func=print_message_as_planner, + config={"callback": None}, + ignore_async_in_sync_chat=True + ) + + def __get_ltm(self): + """ + Get the the long term memory of the user. + returns: str | None - The user LTM or None if not found. 
+ """ + return get_user_ltm() + diff --git a/Agent_E/ae/core/agents_llm_config.py b/Agent_E/ae/core/agents_llm_config.py new file mode 100644 index 0000000..42da739 --- /dev/null +++ b/Agent_E/ae/core/agents_llm_config.py @@ -0,0 +1,197 @@ + +import json +import os +from typing import Any + +from dotenv import load_dotenv + +from Agent_E.ae.utils.logger import logger + + +class AgentsLLMConfig: + # Mapping from environment keys to model config keys + KEY_MAPPING_ENV_MODEL: dict[str, str] = { + "AUTOGEN_MODEL_NAME": "model", + "AUTOGEN_MODEL_API_KEY": "api_key", + "AUTOGEN_MODEL_BASE_URL": "base_url", + "AUTOGEN_MODEL_API_TYPE": "api_type", + "AUTOGEN_MODEL_API_VERSION": "api_version", + } + + # Mapping from environment keys to LLM config keys + KEY_MAPPING_ENV_LLM: dict[str, str] = { + "AUTOGEN_LLM_TEMPERATURE": "temperature", + "AUTOGEN_LLM_TOP_P": "top_p", + } + + # Mapping from file keys to model config keys + KEY_MAPPING_FILE: dict[str, str] = { + "model_name": "model", + "model_api_key": "api_key", + "model_base_url": "base_url", + "model_api_type": "api_type", + } + + def __init__(self, env_file_path: str = ".env", llm_config: dict[str,Any] | None = None) -> None: + load_dotenv(env_file_path, verbose=True, override=True) + if llm_config: + self.config: dict[str, Any] = self.load_config_from_api(llm_config) + else: + self.config: dict[str, Any] = self._load_config() + + + def _load_config(self) -> dict[str, Any]: + config_file = os.getenv("AGENTS_LLM_CONFIG_FILE") + config_file_ref_key = os.getenv("AGENTS_LLM_CONFIG_FILE_REF_KEY") + + if config_file: + try: + with open(config_file, 'r') as file: # noqa: UP015 + file_config = json.load(file) + + if config_file_ref_key: + if config_file_ref_key in file_config: + logger.info(f"Loading configuration from: {config_file} with key: {config_file_ref_key}") + raw_config = file_config[config_file_ref_key] + + # Process configurations for both planner_agent and browser_nav_agent + planner_config = 
self._normalize_config(raw_config.get("planner_agent", {})) + browser_nav_config = self._normalize_config(raw_config.get("browser_nav_agent", {})) + + config = { + "planner_agent": planner_config, + "browser_nav_agent": browser_nav_config, + "other_settings": {k: v for k, v in raw_config.items() if k not in ["planner_agent", "browser_nav_agent"]}, + } + logger.info(f"Using configuration key '{config_file_ref_key}' from the config file.") + else: + logger.error(f"Key '{config_file_ref_key}' not found in the configuration file.") + raise KeyError(f"Key '{config_file_ref_key}' not found in the configuration file.") + else: + logger.error("AGENTS_LLM_CONFIG_FILE_REF_KEY is not provided.") + raise ValueError("AGENTS_LLM_CONFIG_FILE_REF_KEY must be provided if AGENTS_LLM_CONFIG_FILE is set.") + + except Exception as e: + logger.error(f"Error loading configuration file: {e}") + raise e + else: + logger.info("Loading configuration from environment variables") + # Load configurations from environment variables + normalized_config = self._normalize_config_from_env() + + config = { + "planner_agent": normalized_config, + "browser_nav_agent": normalized_config + } + + return config + + def load_config_from_api(self, llm_config: dict[str, Any]) -> dict[str, Any]: + """ + Load configuration from a JSON provided during execution. + + Parameters + ---------- + config_string : dict[str,Any] + A JSON representing the configuration. + + Returns + ------- + dict[str, Any] + The loaded and normalized configuration. 
+ """ + try: + + logger.info("Loading LLM configuration provided via API.") + + # Process configurations for both planner_agent and browser_nav_agent + planner_config = self._normalize_config(llm_config.get("planner_agent", {})) + browser_nav_config = self._normalize_config(llm_config.get("browser_nav_agent", {})) + + config = { + "planner_agent": planner_config, + "browser_nav_agent": browser_nav_config, + "other_settings": {k: v for k, v in llm_config.items() if k not in ["planner_agent", "browser_nav_agent"]}, + } + + return config + + except json.JSONDecodeError as e: + logger.error(f"Error decoding JSON string: {e}") + raise e + + def _normalize_config(self, agent_config: dict[str, Any]) -> dict[str, Any]: + """Normalize agent-specific config from a file, grouping keys into model_config_params, llm_config_params, and other_settings.""" + model_config = {} + llm_config_params = {} + other_settings = {} + + for k, v in agent_config.items(): + if k in self.KEY_MAPPING_FILE: + model_config[self.KEY_MAPPING_FILE[k]] = v + elif k == "llm_config_params": + llm_config_params = v # Keep llm_config_params as is + else: + other_settings[k] = v + + return { + "model_config_params": model_config, + "llm_config_params": llm_config_params, + "other_settings": other_settings, + } + + def _normalize_config_from_env(self) -> dict[str, Any]: + """Normalize config from environment variables, adding defaults for 'temperature', 'top_p', and 'seed' based on model name.""" + model_config = {} + llm_config_params = {} + other_settings = {} + + # Populate model_config_params + for original_key, mapped_key in self.KEY_MAPPING_ENV_MODEL.items(): + value = os.getenv(original_key) + if value is not None: + model_config[mapped_key] = value + + # Populate llm_config_params + for original_key, mapped_key in self.KEY_MAPPING_ENV_LLM.items(): + value = os.getenv(original_key) + if value is not None: + llm_config_params[mapped_key] = value + + # Capture other settings that start with 
'AUTOGEN_MODEL' + for original_key in os.environ: + if original_key.startswith("AUTOGEN_MODEL") and original_key not in self.KEY_MAPPING_ENV_MODEL: + other_settings[original_key] = os.getenv(original_key) + + # Apply defaults for 'temperature', 'top_p', 'seed' if not present + model_name:str = model_config.get("model", "").lower() # type: ignore + + if model_name.startswith("gpt"): # type: ignore + llm_config_params.setdefault("temperature", 0.0) # type: ignore + llm_config_params.setdefault("top_p", 0.001) # type: ignore + llm_config_params.setdefault("seed", 12345) # type: ignore + else: + llm_config_params.setdefault("temperature", 0.1) # type: ignore + llm_config_params.setdefault("top_p", 0.1) # type: ignore + + return { + "model_config_params": model_config, + "llm_config_params": llm_config_params, + "other_settings": other_settings, + } + + def get_planner_agent_config(self) -> dict[str, Any]: + return self.config["planner_agent"] + + def get_browser_nav_agent_config(self) -> dict[str, Any]: + return self.config["browser_nav_agent"] + + def get_full_config(self) -> dict[str, Any]: + return self.config + +# Example usage +if __name__ == "__main__": + config = AgentsLLMConfig() + + planner_config = config.get_planner_agent_config() + browser_nav_config = config.get_browser_nav_agent_config() diff --git a/Agent_E/ae/core/autogen_wrapper.py b/Agent_E/ae/core/autogen_wrapper.py new file mode 100644 index 0000000..d54e757 --- /dev/null +++ b/Agent_E/ae/core/autogen_wrapper.py @@ -0,0 +1,384 @@ +import asyncio +import json +import os +import tempfile +import traceback +from string import Template +from time import time_ns +from typing import Any + +import autogen # type: ignore +import nest_asyncio # type: ignore +import openai + +#from autogen import Cache +from Agent_E.ae.config import SOURCE_LOG_FOLDER_PATH +from Agent_E.ae.core.agents.browser_nav_agent import BrowserNavAgent +from Agent_E.ae.core.agents.high_level_planner_agent import PlannerAgent +from 
Agent_E.ae.core.post_process_responses import final_reply_callback_planner_agent as notify_planner_messages # type: ignore +from Agent_E.ae.core.prompts import LLM_PROMPTS +from Agent_E.ae.core.skills.get_url import geturl +from Agent_E.ae.utils.autogen_sequential_function_call import UserProxyAgent_SequentialFunctionExecution +from Agent_E.ae.utils.detect_llm_loops import is_agent_stuck_in_loop +from Agent_E.ae.utils.logger import logger +from Agent_E.ae.utils.response_parser import parse_response +from Agent_E.ae.utils.ui_messagetype import MessageType + +nest_asyncio.apply() # type: ignore + +class AutogenWrapper: + """ + A wrapper class for interacting with the Autogen library. + + Args: + planner_max_chat_round (int): The maximum number of chat rounds for the planner agent. + browser_nav_max_chat_round (int): The maximum number of chat rounds for the browser navigation agent. + + Attributes: + number_of_rounds (int): The maximum number of chat rounds. + agents_map (dict): A dictionary of the agents that are instantiated in this autogen instance. 
def __init__(self, save_chat_logs_to_files: bool = True, planner_max_chat_round: int = 50, browser_nav_max_chat_round: int = 10):
    """Store the round limits and chat-log settings; agents are created later.

    Args:
        save_chat_logs_to_files (bool): Stored as ``save_chat_logs_to_files``.
        planner_max_chat_round (int): Stored as ``planner_number_of_rounds``.
        browser_nav_max_chat_round (int): Stored as ``browser_number_of_rounds``.
    """
    # Round limits for the two conversations.
    self.planner_number_of_rounds = planner_max_chat_round
    self.browser_number_of_rounds = browser_nav_max_chat_round

    # Everything below starts out empty and is filled in outside __init__.
    self.agents_map: dict[str, UserProxyAgent_SequentialFunctionExecution | autogen.AssistantAgent | autogen.ConversableAgent ] | None = None

    self.planner_agent_model_config : list[dict[str, str]] | None = None
    self.browser_nav_agent_model_config : list[dict[str, str]] | None = None

    self.planner_agent_config: dict[str, Any] | None = None
    self.browser_nav_agent_config: dict[str, Any] | None = None

    # Chat-log persistence: target directory and on/off switch.
    self.chat_logs_dir: str = SOURCE_LOG_FOLDER_PATH
    self.save_chat_logs_to_files = save_chat_logs_to_files
@classmethod
async def create(cls, planner_agent_config: dict[str, Any], browser_nav_agent_config: dict[str, Any], agents_needed: list[str] | None = None,
                 save_chat_logs_to_files: bool = True, planner_max_chat_round: int = 50, browser_nav_max_chat_round: int = 10):
    """
    Build a fully wired AutogenWrapper: agents created and the nested browser chat registered.

    Args:
        planner_agent_config: dict[str, Any]: Normalized planner configuration. The keys read here and by
            the agent factories are "model_config_params", "llm_config_params" and "other_settings".
        browser_nav_agent_config: dict[str, Any]: Normalized browser navigation configuration, same layout.
        agents_needed (list[str], optional): The agents to create. If None, then
            ["user", "browser_nav_executor", "planner_agent", "browser_nav_agent"] is used.
        save_chat_logs_to_files (bool, optional): Whether to save chat logs to files. Defaults to True.
        planner_max_chat_round (int, optional): The maximum number of chat rounds for the planner. Defaults to 50.
        browser_nav_max_chat_round (int, optional): The maximum number of chat rounds for the browser navigation agent. Defaults to 10.

    Returns:
        AutogenWrapper: The initialized instance.

    """
    print(f">>> Creating AutogenWrapper with {agents_needed}, Planner max chat rounds: {planner_max_chat_round}, browser nav max chat rounds: {browser_nav_max_chat_round}. Save chat logs to files: {save_chat_logs_to_files}")
    if agents_needed is None:
        agents_needed = ["user", "browser_nav_executor", "planner_agent", "browser_nav_agent"]

    self = cls(
        save_chat_logs_to_files=save_chat_logs_to_files,
        planner_max_chat_round=planner_max_chat_round,
        browser_nav_max_chat_round=browser_nav_max_chat_round,
    )

    os.environ["AUTOGEN_USE_DOCKER"] = "False"

    self.planner_agent_config = planner_agent_config
    self.browser_nav_agent_config = browser_nav_agent_config

    planner_model_params = self.planner_agent_config["model_config_params"]
    self.planner_agent_model_config = self.convert_model_config_to_autogen_format(planner_model_params)
    browser_model_params = self.browser_nav_agent_config["model_config_params"]
    self.browser_nav_agent_model_config = self.convert_model_config_to_autogen_format(browser_model_params)

    self.agents_map = await self.__initialize_agents(agents_needed)

    def trigger_nested_chat(manager: autogen.ConversableAgent):
        # Start the nested browser chat only when the planner's last
        # message carries a next step; the plan (if any) is always reported.
        planner_reply = parse_response(manager.last_message()["content"]) # type: ignore
        upcoming_step = planner_reply.get('next_step', None)
        proposed_plan = planner_reply.get('plan', None)
        if proposed_plan is not None:
            notify_planner_messages(proposed_plan, message_type=MessageType.PLAN)

        if upcoming_step is not None:
            notify_planner_messages(upcoming_step, message_type=MessageType.STEP) # type: ignore
            return True
        notify_planner_messages("Received no response, terminating..", message_type=MessageType.INFO) # type: ignore
        return False

    def get_url() -> str:
        # Synchronous bridge to the async geturl skill.
        return asyncio.run(geturl())

    def my_custom_summary_method(sender: autogen.ConversableAgent,recipient: autogen.ConversableAgent, summary_args: dict ) : # type: ignore
        # Persist the nested chat, then turn its final message into the
        # summary handed back to the planner.
        logs_by_peer = {str(peer): log for peer, log in sender.chat_messages.items()} # type: ignore
        self.__save_chat_log(list(logs_by_peer.values())[0]) # type: ignore
        final_text = recipient.last_message(sender)["content"] # type: ignore
        if not final_text or not final_text.strip(): # type: ignore
            return "I received an empty message. This is not an error and is recoverable. Try to reformulate the task..."
        if "##TERMINATE TASK##" not in final_text:
            return recipient.last_message(sender)["content"] # type: ignore
        # Strip the marker and append the page the browser ended up on.
        final_text = final_text.replace("##TERMINATE TASK##", "") + " " + get_url() # type: ignore
        notify_planner_messages(final_text, message_type=MessageType.ACTION) # type: ignore
        return final_text # type: ignore

    def reflection_message(recipient, messages, sender, config): # type: ignore
        # The opening message of the nested chat: the planner's next step
        # followed by the current URL.
        planner_reply = parse_response(messages[-1]["content"]) # type: ignore
        upcoming_step = planner_reply.get('next_step', None)
        if upcoming_step is None:
            print ("Message to nested chat returned None")
            return None
        return upcoming_step.strip() + " " + get_url() # type: ignore

    nested_chat_spec = {
        "sender": self.agents_map["browser_nav_executor"],
        "recipient": self.agents_map["browser_nav_agent"],
        "message": reflection_message,
        "max_turns": self.browser_number_of_rounds,
        "summary_method": my_custom_summary_method,
    }
    self.agents_map["user"].register_nested_chats( # type: ignore
        [nested_chat_spec],
        trigger=trigger_nested_chat, # type: ignore
    )

    return self
Available agents: {self.agents_map}") + self.agents_map["user"].register_nested_chats( # type: ignore + [ + { + "sender": self.agents_map["browser_nav_executor"], + "recipient": self.agents_map["browser_nav_agent"], + "message":reflection_message, + "max_turns": self.browser_number_of_rounds, + "summary_method": my_custom_summary_method, + } + ], + trigger=trigger_nested_chat, # type: ignore + ) + + return self + + + def convert_model_config_to_autogen_format(self, model_config: dict[str, str]) -> list[dict[str, Any]]: + env_var: list[dict[str, str]] = [model_config] + with tempfile.NamedTemporaryFile(delete=False, mode='w') as temp: + json.dump(env_var, temp) + temp_file_path = temp.name + + return autogen.config_list_from_json(env_or_file=temp_file_path) + + def get_chat_logs_dir(self) -> str|None: + """ + Get the directory for saving chat logs. + + Returns: + str|None: The directory path or None if there is not one + + """ + return self.chat_logs_dir + + def set_chat_logs_dir(self, chat_logs_dir: str): + """ + Set the directory for saving chat logs. + + Args: + chat_logs_dir (str): The directory path. + + """ + self.chat_logs_dir = chat_logs_dir + + + def __save_chat_log(self, chat_log: list[dict[str, Any]]): + if not self.save_chat_logs_to_files: + logger.info("Nested chat logs", extra={"nested_chat_log": chat_log}) + else: + chat_logs_file = os.path.join(self.get_chat_logs_dir() or "", f"nested_chat_log_{str(time_ns())}.json") + # Save the chat log to a file + with open(chat_logs_file, "w") as file: + json.dump(chat_log, file, indent=4) + + + async def __initialize_agents(self, agents_needed: list[str]): + """ + Instantiate all agents with their appropriate prompts/skills. + + Args: + agents_needed (list[str]): The list of agents needed, this list must have user_proxy in it or an error will be generated. + + Returns: + dict: A dictionary of agent instances. 
async def __initialize_agents(self, agents_needed: list[str]):
    """
    Instantiate all agents with their appropriate prompts/skills.

    The "user" and "browser_nav_executor" agents are always created; the
    remaining names select which of "browser_nav_agent" and "planner_agent"
    are created in addition. The list passed in is not modified.

    Args:
        agents_needed (list[str]): The list of agents needed. It must contain
            "user" and "browser_nav_executor".

    Returns:
        dict: A dictionary of agent instances keyed by agent name.

    Raises:
        ValueError: A required agent name is missing, or a name is unknown.

    """
    # Work on a copy so the caller's list is left untouched.
    remaining = list(agents_needed)
    for required in ("user", "browser_nav_executor"):
        if required not in remaining:
            raise ValueError(f"agents_needed must contain '{required}'.")
        remaining.remove(required)

    agents_map: dict[str, UserProxyAgent_SequentialFunctionExecution | autogen.ConversableAgent]= {}

    user_delegate_agent = await self.__create_user_delegate_agent()
    agents_map["user"] = user_delegate_agent

    browser_nav_executor = self.__create_browser_nav_executor_agent()
    agents_map["browser_nav_executor"] = browser_nav_executor

    for agent_needed in remaining:
        if agent_needed == "browser_nav_agent":
            browser_nav_agent: autogen.ConversableAgent = self.__create_browser_nav_agent(agents_map["browser_nav_executor"] )
            agents_map["browser_nav_agent"] = browser_nav_agent
        elif agent_needed == "planner_agent":
            planner_agent = self.__create_planner_agent(user_delegate_agent)
            agents_map["planner_agent"] = planner_agent
        else:
            raise ValueError(f"Unknown agent type: {agent_needed}")
    return agents_map
async def __create_user_delegate_agent(self) -> autogen.ConversableAgent:
    """
    Create the "user" agent that relays the task to the planner.

    The agent never asks for human input, has no LLM of its own and stops
    the conversation according to ``is_planner_termination_message``.

    Returns:
        autogen.ConversableAgent: The created user proxy agent.

    """
    def is_planner_termination_message(x: dict[str, str])->bool: # type: ignore
        # Messages carrying a function call never terminate the chat.
        function: Any = x.get("function", None)
        if function is not None:
            return False

        content:Any = x.get("content", "")
        if content is None:
            # No content at all: nothing left to act on.
            return True

        should_terminate = False
        try:
            content_json = parse_response(content)
            _terminate = content_json.get('terminate', "no")
            final_response = content_json.get('final_response', None)
            if _terminate == "yes":
                should_terminate = True
                if final_response:
                    notify_planner_messages(final_response, message_type=MessageType.ANSWER)
        except json.JSONDecodeError:
            # f-string so the offending content actually shows up in the log.
            logger.error(f"Error decoding JSON response:\n{content}.\nTerminating..")
            should_terminate = True

        return should_terminate # type: ignore

    task_delegate_agent = UserProxyAgent_SequentialFunctionExecution(
        name="user",
        llm_config=False,
        system_message=LLM_PROMPTS["USER_AGENT_PROMPT"],
        is_termination_msg=is_planner_termination_message, # type: ignore
        human_input_mode="NEVER",
        max_consecutive_auto_reply=self.planner_number_of_rounds,
    )
    return task_delegate_agent
def __create_planner_agent(self, assistant_agent: autogen.ConversableAgent):
    """
    Build the planner and return its underlying agent object.

    This is mainly used for exploration at this point.

    Args:
        assistant_agent (autogen.ConversableAgent): Agent passed to
            ``PlannerAgent`` as its last constructor argument.

    Returns:
        The ``agent`` attribute of the newly created ``PlannerAgent``.

    """
    model_config = self.planner_agent_model_config
    llm_params = self.planner_agent_config["llm_config_params"]  # type: ignore
    # A missing "system_prompt" entry is passed on as None.
    system_prompt = self.planner_agent_config["other_settings"].get("system_prompt", None)  # type: ignore
    planner = PlannerAgent(model_config, llm_params, system_prompt, assistant_agent)  # type: ignore
    return planner.agent
class NotificationManager:
    """
    NotificationManager handles the dispatching of notifications to registered listeners.

    Attributes:
        listeners (list[Callable[[dict[str, str]], None]]): A list of listener callbacks to notify.
    """

    def __init__(self):
        """
        Initialize the NotificationManager with no listeners.
        """
        self.listeners: list[Callable[[dict[str, str]], None]] = []

    def notify(self, message: str, message_type: str) -> None:
        """
        Notify all registered listeners with a message and its type.

        Each listener receives the same dict with the keys ``"message"`` and
        ``"type"``. When no listener is registered the notification is
        printed and discarded.

        Args:
            message (str): The message to notify.
            message_type (str): The type of the message.
        """
        notification = {
            "message": message,
            "type": message_type,
        }

        if not self.listeners:
            print(f"No listeners available, discarding message: {notification}")
            return

        # Dispatch over a snapshot: a listener may register or unregister
        # listeners (including itself) while being notified, and mutating the
        # live list during iteration would silently skip its neighbours.
        for listener in tuple(self.listeners):
            listener(notification)

    def register_listener(self, listener: Callable[[dict[str, str]], None]) -> None:
        """
        Register a new listener to receive notifications.

        Args:
            listener (Callable[[dict[str, str]], None]): The listener callback to register.
        """
        self.listeners.append(listener)

    def unregister_listener(self, listener: Callable[[dict[str, str]], None]) -> None:
        """
        Unregister a listener from receiving notifications.

        Args:
            listener (Callable[[dict[str, str]], None]): The listener callback to unregister.

        Raises:
            ValueError: If the listener is not currently registered.
        """
        self.listeners.remove(listener)
def __new__(cls, *args, **kwargs):  # type: ignore
    """
    Return the shared PlaywrightManager, allocating it on the first call.

    The first call creates the instance, flags it as not yet initialized and
    caches it on the class; every later call returns that cached object.
    """
    existing = cls._instance
    if existing is not None:
        return existing

    created = super().__new__(cls)
    created.__initialized = False
    cls._instance = created
    logger.debug("Playwright instance created..")
    return created
async def async_initialize(self):
    """
    Run the one-time asynchronous setup of the browser session.

    In order: start Playwright, make sure a browser context exists, install
    the handlers, then navigate to the homepage. Once a run has completed,
    later calls return without doing anything.
    """
    if not self.__async_initialize_done:
        startup_steps = (
            self.start_playwright,        # Step 1: Playwright itself ...
            self.ensure_browser_context,  # ... and the browser context
            self.setup_handlers,          # Step 2: deferred handler setup
            self.go_to_homepage,          # Step 3: open the homepage
        )
        for step in startup_steps:
            await step()

        self.__async_initialize_done = True
async def create_browser_context(self):
    """
    Launch a persistent Chrome context and store it on ``PlaywrightManager``.

    The user data directory comes from the ``BROWSER_STORAGE_DIR``
    environment variable (empty string when unset). If the launch fails with
    a "Target page, context or browser has been closed" error, it is retried
    once with a freshly created temporary directory.

    Raises:
        ValueError: If Chrome is not installed, or if ``self.browser_type``
            is anything other than "chromium".
        Exception: Any other launch error is re-raised unchanged.
    """
    user_dir: str = os.environ.get('BROWSER_STORAGE_DIR', '')
    if self.browser_type != "chromium":
        raise ValueError(f"Unsupported browser type: {self.browser_type}")

    async def launch(profile_dir: str):
        # Single definition of the launch options, shared by the first
        # attempt and the temporary-directory retry.
        return await PlaywrightManager._playwright.chromium.launch_persistent_context(profile_dir,
            channel="chrome", headless=self.isheadless,
            args=["--disable-blink-features=AutomationControlled",
                  "--disable-session-crashed-bubble",  # disable the restore session bubble
                  "--disable-infobars",  # disable informational popups,
                  ],
            no_viewport=True
        )

    logger.info(f"User dir: {user_dir}")
    try:
        PlaywrightManager._browser_context = await launch(user_dir)
    except Exception as e:
        if "Target page, context or browser has been closed" in str(e):
            new_user_dir = tempfile.mkdtemp()
            logger.error(f"Failed to launch persistent context with user dir {user_dir}: {e} Trying to launch with a new user dir {new_user_dir}")
            PlaywrightManager._browser_context = await launch(new_user_dir)
        elif "Chromium distribution 'chrome' is not found " in str(e):
            raise ValueError("Chrome is not installed on this device. Install Google Chrome or install playwright using 'playwright install chrome'. Refer to the readme for more information.") from None
        else:
            raise e from None
async def close_except_specified_tab(self, page_to_keep: Page):
    """
    Close every tab of the browser context other than ``page_to_keep``.

    Args:
        page_to_keep (Page): The Playwright page object representing the tab that should remain open.
    """
    context = await self.get_browser_context()
    tabs_to_close = [tab for tab in context.pages if tab != page_to_keep]  # type: ignore
    for tab in tabs_to_close:
        await tab.close()  # type: ignore
async def notify_user(self, message: str, message_type: MessageType = MessageType.STEP):
    """
    Show a message in the overlay and forward it to notification listeners.

    A leading ":" and a trailing "," are stripped, then a label chosen by
    ``message_type`` is prepended (plan messages are first passed through
    ``beautify_plan_message``). The escaped message is always recorded with
    the UI manager. If the overlay's show-details setting filters the message
    type out, the method returns at that point, before the page is updated
    and before listeners are notified.

    Args:
        message (str): The message to notify the user with.
        message_type (MessageType, optional): PLAN, QUESTION, ANSWER, INFO or STEP. Defaults to STEP.
    """
    message = message.removeprefix(":").removesuffix(",")

    if message_type == MessageType.PLAN:
        message = "Plan:\n" + beautify_plan_message(message)
    elif message_type == MessageType.STEP:
        label = "Verify: " if "confirm" in message.lower() else "Next step: "
        message = label + message
    elif message_type == MessageType.QUESTION:
        message = "Question: " + message
    elif message_type == MessageType.ANSWER:
        message = "Response: " + message

    safe_message = escape_js_message(message)
    self.ui_manager.new_system_message(safe_message, message_type)

    # Types shown regardless of the details toggle; STEP is shown only when
    # details are enabled.
    always_shown = (MessageType.PLAN, MessageType.QUESTION, MessageType.ANSWER, MessageType.INFO)
    show_details = self.ui_manager.overlay_show_details
    if show_details == False and message_type not in always_shown:  # noqa: E712
        return
    if show_details == True and message_type not in (*always_shown, MessageType.STEP):  # noqa: E712
        return

    safe_message_type = escape_js_message(message_type.value)
    try:
        js_code = f"addSystemMessage({safe_message}, is_awaiting_user_response=false, message_type={safe_message_type});"
        current_page = await self.get_current_page()
        await current_page.evaluate(js_code)
    except Exception as e:
        # Best effort: a failure to update the page is logged, not raised.
        logger.error(f"Failed to notify user with message \"{message}\". However, most likey this will work itself out after the page loads: {e}")

    self.notification_manager.notify(message, message_type.value)
async def take_screenshots(self, name: str, page: Page|None, full_page: bool = True, include_timestamp: bool = True,
                           load_state: str = 'domcontentloaded', take_snapshot_timeout: int = 5*1000):
    """
    Save a PNG screenshot of a page, if screenshots are enabled.

    Does nothing when ``self._take_screenshots`` is falsy. Failures while
    waiting for the page or capturing it are logged, not raised.

    Args:
        name (str): Base file name (without extension) of the screenshot.
        page (Page | None): Page to capture; the current page is used when None.
        full_page (bool): Passed to ``page.screenshot`` as ``full_page``.
        include_timestamp (bool): Prefix the file name with ``time.time_ns()``.
        load_state (str): Load state awaited before the capture.
        take_snapshot_timeout (int): Timeout passed to both the load-state
            wait and the screenshot call.
    """
    if not self._take_screenshots:
        return
    if page is None:
        page = await self.get_current_page()

    screenshot_name = f"{time.time_ns()}_{name}" if include_timestamp else name
    screenshot_name += ".png"
    # os.path.join keeps an unset/empty directory relative to the working
    # directory; plain "dir/name" formatting turned "" into "/name.png",
    # i.e. the filesystem root.
    screenshot_path = os.path.join(self.get_screenshots_dir() or "", screenshot_name)
    try:
        await page.wait_for_load_state(state=load_state, timeout=take_snapshot_timeout)  # type: ignore
        await page.screenshot(path=screenshot_path, full_page=full_page, timeout=take_snapshot_timeout, caret="initial", scale="device")
        logger.debug(f"Screen shot saved to: {screenshot_path}")
    except Exception as e:
        logger.error(f"Failed to take screenshot and save to \"{screenshot_path}\". Error: {e}")
def final_reply_callback_planner_agent(message:str, message_type:MessageType = MessageType.STEP):  # type: ignore
    """
    Relay a planner message to the user through the Playwright manager.

    Obtains the ``PlaywrightManager`` (chromium, non-headless) and runs its
    ``notify_user`` coroutine to completion on the event loop returned by
    ``asyncio.get_event_loop()``.

    Args:
        message (str): The text to show to the user.
        message_type (MessageType, optional): Kind of message. Defaults to STEP.

    Returns:
        tuple[bool, None]: Always ``(False, None)``.
    """
    manager = PlaywrightManager(browser_type='chromium', headless=False)
    run_to_completion = asyncio.get_event_loop().run_until_complete
    run_to_completion(manager.notify_user(message, message_type=message_type))
    return False, None  # required to ensure the agent communication flow continues
+You will think step by step and break down the tasks into sequence of simple subtasks. Subtasks will be delegated to the helper to execute. + +Return Format: +Your reply will strictly be a well-fromatted JSON with four attributes. +"plan": This is a string that contains the high-level plan. This is optional and needs to be present only when a task starts and when the plan needs to be revised. +"next_step": This is a string that contains a detailed next step that is consistent with the plan. The next step will be delegated to the helper to execute. This needs to be present for every response except when terminating +"terminate": yes/no. Return yes when the exact task is complete without any compromises or you are absolutely convinced that the task cannot be completed, no otherwise. This is mandatory for every response. +"final_response": This is the final answer string that will be returned to the user. In search tasks, unless explicitly stated, you will provide the single best suited result in the response instead of listing multiple options. This attribute only needs to be present when terminate is true. + +Capabilities and limitation of the helper: +1. Helper can navigate to urls, perform simple interactions on a page or answer any question you may have about the current page. +2. Helper cannot perform complex planning, reasoning or analysis. You will not delegate any such tasks to helper, instead you will perform them based on information from the helper. +3. Helper is stateless and treats each step as a new task. Helper will not remember previous pages or actions. So, you will provide all necessary information as part of each step. +4. Very Important: Helper cannot go back to previous pages. If you need the helper to return to a previous page, you must explicitly add the URL of the previous page in the step (e.g. return to the search result page by navigating to the url https://www.google.com/search?q=Finland") + +Guidelines: +1. 
If you know the direct URL, use it directly instead of searching for it (e.g. go to www.espn.com). Optimise the plan to avoid unnecessary steps. +2. Do not assume any capability exists on the webpage. Ask questions to the helper to confirm the presence of features (e.g. is there a sort by price feature available on the page?). This will help you revise the plan as needed and also establish common ground with the helper. +3. Do not combine multiple steps into one. A step should be strictly as simple as interacting with a single element or navigating to a page. If you need to interact with multiple elements or perform multiple actions, you will break it down into multiple steps. +4. Important: You will NOT ask for any URLs of hyperlinks in the page from the helper, instead you will simply ask the helper to click on specific result. URL of the current page will be automatically provided to you with each helper response. +5. Very Important: Add verification as part of the plan, after each step and specifically before terminating to ensure that the task is completed successfully. Ask simple questions to verify the step completion (e.g. Can you confirm that White Nothing Phone 2 with 16GB RAM is present in the cart?). Do not assume the helper has performed the task correctly. +6. If the task requires multiple informations, all of them are equally important and should be gathered before terminating the task. You will strive to meet all the requirements of the task. +7. If one plan fails, you MUST revise the plan and try a different approach. You will NOT terminate a task untill you are absolutely convinced that the task is impossible to accomplish. + +Complexities of web navigation: +1. Many forms have mandatory fields that need to be filled up before they can be submitted. Ask the helper for what fields look mandatory. +2. In many websites, there are multiple options to filter or sort results. Ask the helper to list any elements on the page which will help the task (e.g. 
are there any links or interactive elements that may lead me to the support page?). +3. Always keep in mind complexities such as filtering, advanced search, sorting, and other features that may be present on the website. Ask the helper whether these features are available on the page when relevant and use them when the task requires it. +4. Very often list of items such as, search results, list of products, list of reviews, list of people etc. may be divided into multiple pages. If you need complete information, it is critical to explicitly ask the helper to go through all the pages. +5. Sometimes search capabilities available on the page will not yield the optimal results. Revise the search query to either more specific or more generic. +6. When a page refreshes or navigates to a new page, information entered in the previous page may be lost. Check that the information needs to be re-entered (e.g. what are the values in source and destination on the page?). +7. Sometimes some elements may not be visible or be disabled until some other action is performed. Ask the helper to confirm if there are any other fields that may need to be interacted for elements to appear or be enabled. + +Example 1: +Task: Find the cheapest premium economy flights from Helsinki to Stockholm on 15 March on Skyscanner. Current page: www.google.com +{"plan":"1. Go to www.skyscanner.com. +2. List the interaction options available on skyscanner page relevant for flight reservation along with their default values. +3. Select the journey option to one-way (if not default). +4. Set number of passengers to 1 (if not default). +5. Set the departure date to 15 March 2025 (since 15 March 2024 is already past). +6. Set ticket type to Economy Premium. +7. Set from airport to ""Helsinki". +8. Set destination airport to Stockhokm +9. Confirm that current values in the source airport, destination airport and departure date fields are Helsinki, Stockholm and 15 August 2024 respectively. +10. 
Click on the search button to get the search results. +11. Confirm that you are on the search results page. +12. Extract the price of the cheapest flight from Helsinki to Stokchol from the search results.", +"next_step": "Go to https://www.skyscanner.com", +"terminate":"no"}, +After the task is completed and when terminating: +Your reply: {"terminate":"yes", "final_response": "The cheapest premium economy flight from Helsinki to Stockholm on 15 March 2025 is ."} + +Notice above how there is confirmation after each step and how interaction (e.g. setting source and destination) with each element is a seperate step. Follow same pattern. +Remember: you are a very very persistent planner who will try every possible strategy to accomplish the task perfectly. +Revise search query if needed, ask for more information if needed, and always verify the results before terminating the task. +Some basic information about the user: $basic_user_information""", + + "BROWSER_AGENT_PROMPT": """You will perform web navigation tasks, which may include logging into websites and interacting with any web content using the functions made available to you. + Use the provided DOM representation for element location or text summarization. + Interact with pages using only the "mmid" attribute in DOM elements. + You must extract mmid value from the fetched DOM, do not conjure it up. + Execute function sequentially to avoid navigation timing issues. Once a task is completed, confirm completion with ##TERMINATE TASK##. + The given actions are NOT parallelizable. They are intended for sequential execution. + If you need to call multiple functions in a task step, call one function at a time. Wait for the function's response before invoking the next function. This is important to avoid collision. + Strictly for search fields, submit the field by pressing Enter key. For other forms, click on the submit button. + Unless otherwise specified, the task must be performed on the current page. 
Use openurl only when explicitly instructed to navigate to a new page with a url specified. If you do not know the URL ask for it. + You will NOT provide any URLs of links on webpage. If user asks for URLs, you will instead provide the text of the hyperlink on the page and offer to click on it. This is very very important. + When inputing information, remember to follow the format of the input field. For example, if the input field is a date field, you will enter the date in the correct format (e.g. YYYY-MM-DD), you may get clues from the placeholder text in the input field. + if the task is ambigous or there are multiple options to choose from, you will ask the user for clarification. You will not make any assumptions. + Individual function will reply with action success and if any changes were observed as a consequence. Adjust your approach based on this feedback. + Once the task is completed or cannot be completed, return a short summary of the actions you performed to accomplish the task, and what worked and what did not. This should be followed by ##TERMINATE TASK##. Your reply will not contain any other information. + Additionally, If task requires an answer, you will also provide a short and precise answer followed by ##TERMINATE TASK##. + Ensure that user questions are answered from the DOM and not from memory or assumptions. To answer a question about textual information on the page, prefer to use text_only DOM type. To answer a question about interactive elements, use all_fields DOM type. + Do not provide any mmid values in your response. + Important: If you encounter an issues or is unsure how to proceed, simply ##TERMINATE TASK## and provide a detailed summary of the exact issue encountered. + Do not repeat the same action multiple times if it fails. 
Instead, if something did not work after a few attempts, terminate the task.""", + + + "VERFICATION_AGENT": """Given a conversation and a task, your task is to analyse the conversation and tell if the task is completed. If not, you need to tell what is not completed and suggest next steps to complete the task.""", + "ENTER_TEXT_AND_CLICK_PROMPT": """This skill enters text into a specified element and clicks another element, both identified by their DOM selector queries. + Ideal for seamless actions like submitting search queries, this integrated approach ensures superior performance over separate text entry and click commands. + Successfully completes when both actions are executed without errors, returning True; otherwise, it provides False or an explanatory message of any failure encountered. + Always prefer this dual-action skill for tasks that combine text input and element clicking to leverage its streamlined operation.""", + + + "OPEN_URL_PROMPT": """Opens a specified URL in the web browser instance. Returns url of the new page if successful or appropriate error message if the page could not be opened.""", + + + "GO_BACK_PROMPT": """Goes back to previous page in the browser history. Useful when correcting an incorrect action that led to a new page or when needing to revisit a previous page for information. Returns the full URL of the page after the back action is performed.""", + + + "COMMAND_EXECUTION_PROMPT": """Execute the user task "$command" $current_url_prompt_segment""", + + + "GET_USER_INPUT_PROMPT": """Get clarification by asking the user or wait for user to perform an action on webpage. This is useful e.g. when you encounter a login or captcha and requires the user to intervene. This skill will also be useful when task is ambigious and you need more clarification from the user (e.g. ["which source website to use to accomplish a task"], ["Enter your credentials on your webpage and type done to continue"]). 
Use this skill very sparingly and only when absolutely needed.""", + + + "GET_DOM_WITHOUT_CONTENT_TYPE_PROMPT": """Retrieves the DOM of the current web browser page. + Each DOM element will have an \"mmid\" attribute injected for ease of DOM interaction. + Returns a minified representation of the HTML DOM where each HTML DOM Element has an attribute called \"mmid\" for ease of DOM query selection. When \"mmid\" attribute is available, use it for DOM query selectors.""", + + + # This one below had all three content types including input_fields + "GET_DOM_WITH_CONTENT_TYPE_PROMPT": """Retrieves the DOM of the current web site based on the given content type. + The DOM representation returned contains items ordered in the same way they appear on the page. Keep this in mind when executing user requests that contain ordinals or numbered items. + text_only - returns plain text representing all the text in the web site. Use this for any information retrieval task. This will contain the most complete textual information. + input_fields - returns a JSON string containing a list of objects representing text input html elements with mmid attribute. Use this strictly for interaction purposes with text input fields. + all_fields - returns a JSON string containing a list of objects representing all interactive elements and their attributes with mmid attribute. Use this strictly to identify and interact with any type of elements on page. + If information is not available in one content type, you must try another content_type.""", + + + "GET_ACCESSIBILITY_TREE": """Retrieves the accessibility tree of the current web site. + The DOM representation returned contains items ordered in the same way they appear on the page. Keep this in mind when executing user requests that contain ordinals or numbered items.""", + + + "CLICK_PROMPT": """Executes a click action on the element matching the given mmid attribute value. It is best to use mmid attribute as the selector. 
+ Returns Success if click was successful or appropriate error message if the element could not be clicked.""", + + + "CLICK_PROMPT_ACCESSIBILITY": """Executes a click action on the element a name and role. + Returns Success if click was successful or appropriate error message if the element could not be clicked.""", + + + "GET_URL_PROMPT": """Get the full URL of the current web page/site. If the user command seems to imply an action that would be suitable for an already open website in their browser, use this to fetch current website URL.""", + + + "ENTER_TEXT_PROMPT": """Single enter given text in the DOM element matching the given mmid attribute value. This will only enter the text and not press enter or anything else. + Returns Success if text entry was successful or appropriate error message if text could not be entered.""", + + + "CLICK_BY_TEXT_PROMPT": """Executes a click action on the element matching the text. If multiple text matches are found, it will click on all of them. Use this as last resort when all else fails.""", + + "BULK_ENTER_TEXT_PROMPT": """Bulk enter text in multiple DOM fields. To be used when there are multiple fields to be filled on the same page. + Enters text in the DOM elements matching the given mmid attribute value. + The input will receive a list of objects containing the DOM query selector and the text to enter. + This will only enter the text and not press enter or anything else. + Returns each selector and the result for attempting to enter text.""", + + + "PRESS_KEY_COMBINATION_PROMPT": """Presses the given key on the current web page. + This is useful for pressing the enter button to submit a search query, PageDown to scroll, ArrowDown to change selection in a focussed list etc.""", + + + "ADD_TO_MEMORY_PROMPT": """"Save any information that you may need later in this term memory. 
This could be useful for saving things to do, saving information for personalisation, or even saving information you may need in future for efficiency purposes E.g. Remember to call John at 5pm, This user likes Tesla company and considered buying shares, The user enrollment form is available in etc.""", + + "HOVER_PROMPT": """Hover on a element with the given mmid attribute value. Hovering on an element can reveal additional information such as a tooltip or trigger a dropdown menu with different navigation options.""", + "GET_MEMORY_PROMPT": """Retrieve all the information previously stored in the memory""", + + + "PRESS_ENTER_KEY_PROMPT": """Presses the enter key in the given html field. This is most useful on text input fields.""", + + + "EXTRACT_TEXT_FROM_PDF_PROMPT": """Extracts text from a PDF file hosted at the given URL.""", + + + "BROWSER_AGENT_NO_SKILLS_PROMPT": """You are an autonomous agent tasked with performing web navigation on a Playwright instance, including logging into websites and executing other web-based actions. + You will receive user commands, formulate a plan and then write the PYTHON code that is needed for the task to be completed. + It is possible that the code you are writing is for one step at a time in the plan. This will ensure proper execution of the task. + Your operations must be precise and efficient, adhering to the guidelines provided below: + 1. **Asynchronous Code Execution**: Your tasks will often be asynchronous in nature, requiring careful handling. Wrap asynchronous operations within an appropriate async structure to ensure smooth execution. + 2. **Sequential Task Execution**: To avoid issues related to navigation timing, execute your actions in a sequential order. This method ensures that each step is completed before the next one begins, maintaining the integrity of your workflow. Some steps like navigating to a site will require a small amount of wait time after them to ensure they load correctly. + 3. 
**Error Handling and Debugging**: Implement error handling to manage exceptions gracefully. Should an error occur or if the task doesn't complete as expected, review your code, adjust as necessary, and retry. Use the console or logging for debugging purposes to track the progress and issues. + 4. **Using HTML DOM**: Do not assume what a DOM selector (web elements) might be. Rather, fetch the DOM to look for the selectors or fetch DOM inner text to answer a questions. This is crucial for accurate task execution. When you fetch the DOM, reason about its content to determine appropriate selectors or text that should be extracted. To fetch the DOM using playwright you can: + - Fetch entire DOM using page.content() method. In the fetched DOM, consider if appropriate to remove entire sections of the DOM like `script`, `link` elements + - Fetch DOM inner text only text_content = await page.evaluate("() => document.body.innerText || document.documentElement.innerText"). This is useful for information retrieval. + 5. **DOM Handling**: Never ever substring the extracted HTML DOM. You can remove entire sections/elements of the DOM like `script`, `link` elements if they are not needed for the task. This is crucial for accurate task execution. + 6. **Execution Verification**: After executing the user the given code, ensure that you verify the completion of the task. If the task is not completed, revise your plan then rewrite the code for that step. + 7. **Termination Protocol**: Once a task is verified as complete or if it's determined that further attempts are unlikely to succeed, conclude the operation and respond with `##TERMINATE##`, to indicate the end of the session. This signal should only be used when the task is fully completed or if there's a consensus that continuation is futile. + 8. **Code Modification and Retry Strategy**: If your initial code doesn't achieve the desired outcome, revise your approach based on the insights gained during the process. 
When DOM selectors you are using fail, fetch the DOM and reason about it to discover the right selectors.If there are timeouts, adjust increase times. Add other error handling mechanisms before retrying as needed. + 9. **Code Generation**: Generated code does not need documentation or usage examples. Assume that it is being executed by an autonomous agent acting on behalf of the user. Do not add placeholders in the code. + 10. **Browser Handling**: Do not user headless mode with playwright. Do not close the browser after every step or even after task completion. Leave it open. + 11. **Reponse**: Remember that you are communicating with an autonomous agent that does not reason. All it does is execute code. Only respond with code that it can execute unless you are terminating. + 12. **Playwrite Oddities**: There are certain things that Playwright does not do well: + - page.wait_for_selector: When providing a timeout value, it will almost always timeout. Put that call in a try/except block and catch the timeout. If timeout occurs just move to the next statement in the code and most likely it will work. For example, if next statement is page.fill, just execute it. + + + By following these guidelines, you will enhance the efficiency, reliability, and user interaction of your web navigation tasks. + Always aim for clear, concise, and well-structured code that aligns with best practices in asynchronous programming and web automation. 
+ """, +} diff --git a/Agent_E/ae/core/skills/__init__.py b/Agent_E/ae/core/skills/__init__.py new file mode 100644 index 0000000..393fac4 --- /dev/null +++ b/Agent_E/ae/core/skills/__init__.py @@ -0,0 +1,18 @@ +from Agent_E.ae.core.skills.click_using_selector import click +from Agent_E.ae.core.skills.click_using_selector import do_click +from Agent_E.ae.core.skills.click_using_selector import is_element_present +from Agent_E.ae.core.skills.click_using_selector import perform_javascript_click +from Agent_E.ae.core.skills.click_using_selector import perform_playwright_click + +from Agent_E.ae.core.skills.enter_text_and_click import enter_text_and_click + +from Agent_E.ae.core.skills.enter_text_using_selector import bulk_enter_text +from Agent_E.ae.core.skills.enter_text_using_selector import custom_fill_element +from Agent_E.ae.core.skills.enter_text_using_selector import do_entertext + +from Agent_E.ae.core.skills.get_dom_with_content_type import get_dom_with_content_type +from Agent_E.ae.core.skills.get_url import geturl +from Agent_E.ae.core.skills.get_user_input import get_user_input +from Agent_E.ae.core.skills.open_url import openurl + +from Agent_E.ae.core.skills.press_key_combination import press_key_combination \ No newline at end of file diff --git a/Agent_E/ae/core/skills/click_using_selector.py b/Agent_E/ae/core/skills/click_using_selector.py new file mode 100644 index 0000000..1b1fd42 --- /dev/null +++ b/Agent_E/ae/core/skills/click_using_selector.py @@ -0,0 +1,217 @@ +import asyncio +import inspect +import traceback +from typing import Annotated + +from playwright.async_api import ElementHandle +from playwright.async_api import Page + +from Agent_E.ae.core.playwright_manager import PlaywrightManager +from Agent_E.ae.utils.dom_helper import get_element_outer_html +from Agent_E.ae.utils.dom_mutation_observer import subscribe # type: ignore +from Agent_E.ae.utils.dom_mutation_observer import unsubscribe # type: ignore +from Agent_E.ae.utils.logger import 
async def click(selector: Annotated[str, "The properly formed query selector string to identify the element for the click action (e.g. [mmid='114']). When \"mmid\" attribute is present, use it for the query selector."],
                wait_before_execution: Annotated[float, "Optional wait time in seconds before executing the click event logic.", float] = 0.0) -> Annotated[str, "A message indicating success or failure of the click."]:
    """
    Executes a click action on the element matching the given query selector string within the currently open web page.

    The click itself is delegated to do_click. While it runs, a DOM mutation callback is
    subscribed so that elements appearing as a consequence of the click can be reported back.

    Parameters:
    - selector: The query selector string to identify the element for the click action.
    - wait_before_execution: Optional wait time in seconds before executing the click event logic. Defaults to 0.0 seconds.

    Returns:
    - The detailed outcome message from do_click, or, when DOM changes were observed, a message
      describing the newly appeared elements.

    Raises:
    - ValueError: If there is no active page.
    """
    logger.info(f"Executing ClickElement with \"{selector}\" as the selector")

    # Initialize PlaywrightManager and get the active browser page
    browser_manager = PlaywrightManager(browser_type='chromium', headless=False)
    page = await browser_manager.get_current_page()

    if page is None: # type: ignore
        raise ValueError('No active page found. OpenURL command opens a new page.')

    function_name = inspect.currentframe().f_code.co_name # type: ignore

    await browser_manager.take_screenshots(f"{function_name}_start", page)

    await browser_manager.highlight_element(selector, True)

    dom_changes_detected = None

    def detect_dom_changes(changes: str): # type: ignore
        nonlocal dom_changes_detected
        dom_changes_detected = changes # type: ignore

    subscribe(detect_dom_changes)
    try:
        result = await do_click(page, selector, wait_before_execution)
        await asyncio.sleep(0.1) # sleep for 100ms to allow the mutation observer to detect changes
    finally:
        # Always detach the callback, even if the click is cancelled or raises,
        # so a stale closure is not left registered with the mutation observer.
        unsubscribe(detect_dom_changes)
    await browser_manager.take_screenshots(f"{function_name}_end", page)
    await browser_manager.notify_user(result["summary_message"], message_type=MessageType.ACTION)

    if dom_changes_detected:
        return f"Success: {result['summary_message']}.\n As a consequence of this action, new elements have appeared in view: {dom_changes_detected}. This means that the action to click {selector} is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction."
    return result["detailed_message"]


async def do_click(page: Page, selector: str, wait_before_execution: float) -> dict[str, str]:
    """
    Executes the click action on the element with the given selector within the provided page.

    Parameters:
    - page: The Playwright page instance.
    - selector: The query selector string to identify the element for the click action.
    - wait_before_execution: Optional wait time in seconds before executing the click event logic.

    Returns:
    dict[str,str] - Explanation of the outcome of this operation represented as a dictionary with 'summary_message' and 'detailed_message'.
    Failures are reported through these messages rather than raised.
    """
    logger.info(f"Executing ClickElement with \"{selector}\" as the selector. Wait time before execution: {wait_before_execution} seconds.")

    # Wait before execution if specified
    if wait_before_execution > 0:
        await asyncio.sleep(wait_before_execution)

    # Playwright timeouts are expressed in milliseconds, asyncio timeouts in seconds.
    # The asyncio guard is only a backstop, so it is set slightly above the Playwright
    # timeout to let Playwright's own (more descriptive) timeout error win.
    selector_timeout_ms = 2000
    guard_timeout_s = selector_timeout_ms / 1000 + 0.5

    # Wait for the selector to be present and ensure it's attached and visible. If timeout, try javascript click
    try:
        logger.info(f"Executing ClickElement with \"{selector}\" as the selector. Waiting for the element to be attached and visible.")

        element = await asyncio.wait_for(
            page.wait_for_selector(selector, state="attached", timeout=selector_timeout_ms),
            timeout=guard_timeout_s
        )
        if element is None:
            raise ValueError(f"Element with selector: \"{selector}\" not found")

        logger.info(f"Element with selector: \"{selector}\" is attached. scrolling it into view if needed.")
        try:
            await element.scroll_into_view_if_needed(timeout=200)
            logger.info(f"Element with selector: \"{selector}\" is attached and scrolled into view. Waiting for the element to be visible.")
        except Exception:
            # If scrollIntoView fails, just move on, not a big deal
            pass

        try:
            await element.wait_for_element_state("visible", timeout=200)
            logger.info(f"Executing ClickElement with \"{selector}\" as the selector. Element is attached and visibe. Clicking the element.")
        except Exception:
            # If the element is not visible, try to click it anyway
            pass

        element_tag_name = await element.evaluate("element => element.tagName.toLowerCase()")
        element_outer_html = await get_element_outer_html(element, page, element_tag_name)

        if element_tag_name == "option":
            element_value = await element.get_attribute("value") # get the text that is in the value of the option
            parent_element = await element.evaluate_handle("element => element.parentNode")
            await parent_element.select_option(value=element_value) # type: ignore

            logger.info(f'Select menu option "{element_value}" selected')

            return {"summary_message": f'Select menu option "{element_value}" selected',
                    "detailed_message": f'Select menu option "{element_value}" selected. The select element\'s outer HTML is: {element_outer_html}.'}

        #Playwright click seems to fail more often than not, disabling it for now and just going with JS click
        #await perform_playwright_click(element, selector)
        msg = await perform_javascript_click(page, selector)
        if msg is None:
            # perform_javascript_click logs and swallows evaluation errors and then returns None;
            # treat that as a failed click instead of formatting "None" into the messages.
            raise RuntimeError("JavaScript click did not return a result")
        return {"summary_message": msg, "detailed_message": f"{msg} The clicked element's outer HTML is: {element_outer_html}."} # type: ignore
    except Exception as e:
        logger.error(f"Unable to click element with selector: \"{selector}\". Error: {e}")
        traceback.print_exc()
        msg = f"Unable to click element with selector: \"{selector}\" since the selector is invalid. Proceed by retrieving DOM again."
        return {"summary_message": msg, "detailed_message": f"{msg}. Error: {e}"}


async def is_element_present(page: Page, selector: str) -> bool:
    """
    Checks if an element is present on the page.

    Parameters:
    - page: The Playwright page instance.
    - selector: The query selector string to identify the element.

    Returns:
    - True if the element is present, False otherwise.
    """
    element = await page.query_selector(selector)
    return element is not None


async def perform_playwright_click(element: ElementHandle, selector: str):
    """
    Performs a click action on the element using Playwright's click method.

    Parameters:
    - element: The Playwright ElementHandle instance representing the element to be clicked.
    - selector: The query selector string of the element (used for logging only).

    Returns:
    - None
    """
    logger.info(f"Performing first Step: Playwright Click on element with selector: {selector}")
    await element.click(force=False, timeout=200)


async def perform_javascript_click(page: Page, selector: str):
    """
    Performs a click action on the element using JavaScript evaluated in the page.

    Parameters:
    - page: The Playwright page instance.
    - selector: The query selector string of the element.

    Returns:
    - The message string produced by the in-page script (click executed, option selected,
      or element not found), or None if evaluating the script raised; in that case the
      error is logged and the traceback printed.
    """
    js_code = """(selector) => {
        let element = document.querySelector(selector);

        if (!element) {
            console.log(`perform_javascript_click: Element with selector ${selector} not found`);
            return `perform_javascript_click: Element with selector ${selector} not found`;
        }

        if (element.tagName.toLowerCase() === "option") {
            let value = element.text;
            let parent = element.parentElement;

            parent.value = element.value; // Directly set the value if possible
            // Trigger change event if necessary
            let event = new Event('change', { bubbles: true });
            parent.dispatchEvent(event);

            console.log("Select menu option", value, "selected");
            return "Select menu option: "+ value+ " selected";
        }
        else {
            console.log("About to click selector", selector);
            // If the element is a link, make it open in the same tab
            if (element.tagName.toLowerCase() === "a") {
                element.target = "_self";
            }
            let ariaExpandedBeforeClick = element.getAttribute('aria-expanded');
            element.click();
            let ariaExpandedAfterClick = element.getAttribute('aria-expanded');
            if (ariaExpandedBeforeClick === 'false' && ariaExpandedAfterClick === 'true') {
                return "Executed JavaScript Click on element with selector: "+selector +". Very important: As a consequence a menu has appeared where you may need to make further selction. Very important: Get all_fields DOM to complete the action.";
            }
            return "Executed JavaScript Click on element with selector: "+selector;
        }
    }"""
    try:
        logger.info(f"Executing JavaScript click on element with selector: {selector}")
        result:str = await page.evaluate(js_code, selector)
        logger.debug(f"Executed JavaScript Click on element with selector: {selector}")
        return result
    except Exception as e:
        # Best-effort: the caller decides how to report a missing (None) result.
        logger.error(f"Error executing JavaScript click on element with selector: {selector}. Error: {e}")
        traceback.print_exc()
        return None
async def enter_text_and_click(
    text_selector: Annotated[str, "The properly formatted DOM selector query, for example [mmid='1234'], where the text will be entered. Use mmid attribute."],
    text_to_enter: Annotated[str, "The text that will be entered into the element specified by text_selector."],
    click_selector: Annotated[str, "The properly formatted DOM selector query, for example [mmid='1234'], for the element that will be clicked after text entry."],
    wait_before_click_execution: Annotated[float, "Optional wait time in seconds before executing the click.", float] = 0.0
) -> Annotated[str, "A message indicating success or failure of the text entry and click."]:
    """
    Enters text into an element and then clicks on another element.

    When text_selector and click_selector are the same, the Enter key is pressed instead of
    clicking, because clicking the field that was just filled would not submit it.

    Parameters:
    - text_selector: The selector for the element to enter text into. It should be a properly formatted DOM selector query, for example [mmid='1234'], where the text will be entered. Use the mmid attribute.
    - text_to_enter: The text to enter into the element specified by text_selector.
    - click_selector: The selector for the element to click. It should be a properly formatted DOM selector query, for example [mmid='1234'].
    - wait_before_click_execution: Optional wait time in seconds before executing the click action. Default is 0.0.

    Returns:
    - A message indicating the success or failure of the text entry and click.

    Raises:
    - ValueError: If no active page is found. The OpenURL command opens a new page.

    Example usage:
    ```
    await enter_text_and_click("[mmid='1234']", "Hello, World!", "[mmid='5678']", wait_before_click_execution=1.5)
    ```
    """
    logger.info(f"Entering text '{text_to_enter}' into element with selector '{text_selector}' and then clicking element with selector '{click_selector}'.")

    # Initialize PlaywrightManager and get the active browser page
    browser_manager = PlaywrightManager(browser_type='chromium', headless=False)
    page = await browser_manager.get_current_page()
    if page is None: # type: ignore
        logger.error("No active page found")
        raise ValueError('No active page found. OpenURL command opens a new page.')

    await browser_manager.highlight_element(text_selector, True)

    function_name = inspect.currentframe().f_code.co_name # type: ignore
    await browser_manager.take_screenshots(f"{function_name}_start", page)

    text_entry_result = await do_entertext(page, text_selector, text_to_enter, use_keyboard_fill=True)

    # Abort before clicking when the text could not be entered.
    if not text_entry_result["summary_message"].startswith("Success"):
        await browser_manager.take_screenshots(f"{function_name}_end", page)
        return(f"Failed to enter text '{text_to_enter}' into element with selector '{text_selector}'. Check that the selctor is valid.")

    # The click/Enter outcome is appended to the text-entry result's detailed message.
    result = text_entry_result

    #if the text_selector is the same as the click_selector, press the Enter key instead of clicking
    if text_selector == click_selector:
        do_press_key_combination_result = await do_press_key_combination(browser_manager, page, "Enter")
        if do_press_key_combination_result:
            result["detailed_message"] += f" Instead of click, pressed the Enter key successfully on element: \"{click_selector}\"."
            await browser_manager.notify_user(f"Pressed the Enter key successfully on element: \"{click_selector}\".", message_type=MessageType.ACTION)
        else:
            result["detailed_message"] += f" Clicking the same element after entering text in it, is of no value. Tried pressing the Enter key on element \"{click_selector}\" instead of click and failed."
            # f-prefix added: without it the user saw the literal text "{click_selector}".
            await browser_manager.notify_user(f"Failed to press the Enter key on element \"{click_selector}\".", message_type=MessageType.ACTION)
    else:
        await browser_manager.highlight_element(click_selector, True)

        do_click_result = await do_click(page, click_selector, wait_before_click_execution)
        result["detailed_message"] += f' {do_click_result["detailed_message"]}'

    await asyncio.sleep(0.1) # sleep for 100ms to allow the mutation observer to detect changes

    await browser_manager.take_screenshots(f"{function_name}_end", page)

    return result["detailed_message"]
async def custom_fill_element(page: Page, selector: str, text_to_enter: str):
    """
    Set the ``value`` property of the first element matching ``selector``.

    The assignment happens inside the page through ``page.evaluate``; no
    keyboard typing is simulated. The text is trimmed (JavaScript ``trim()``)
    before it is assigned, and any existing value is overwritten.

    Args:
        page (Page): The Playwright page in which the script is evaluated.
        selector (str): CSS selector passed to ``document.querySelector``.
        text_to_enter (str): The value to assign to the element.

    Raises:
        Exception: Whatever ``page.evaluate`` raises (including the in-page
            "Element not found" error); it is logged and re-raised unchanged.

    Example:
        await custom_fill_element(page, '#username', 'test_user')
    """
    selector = f"{selector}"  # Ensures the selector is treated as a string
    # In-page script: look up the element, fail loudly if absent, assign value.
    fill_script = """(inputParams) => {
            const selector = inputParams.selector;
            let text_to_enter = inputParams.text_to_enter;
            text_to_enter = text_to_enter.trim();
            const element = document.querySelector(selector);
            if (!element) {
                throw new Error(`Element not found: ${selector}`);
            }
            element.value = text_to_enter;
            return `Value set for ${selector}`;
        }"""
    script_args = {"selector": selector, "text_to_enter": text_to_enter}
    try:
        outcome = await page.evaluate(fill_script, script_args)
        logger.debug(f"custom_fill_element result: {outcome}")
    except Exception as e:
        logger.error(f"Error in custom_fill_element, Selector: {selector}, Text: {text_to_enter}. Error: {str(e)}")
        raise
async def entertext(entry: Annotated[EnterTextEntry, "An object containing 'query_selector' (DOM selector query using mmid attribute e.g. [mmid='114']) and 'text' (text to enter on the element)."]) -> Annotated[str, "Explanation of the outcome of this operation."]:
    """
    Enter text into the DOM element identified by ``entry['query_selector']``.

    The element's current value is cleared in-page, then ``do_entertext`` is
    called (with its default fill mode) to enter the text. DOM mutations
    reported while the operation runs are appended to the returned message.

    Args:
        entry (EnterTextEntry): An object containing 'query_selector' (DOM selector
            query using mmid attribute) and 'text' (text to enter on the element).

    Returns:
        str: Explanation of the outcome of this operation. If no active page is
        found, an error message is returned instead of raising.

    Example:
        entry = EnterTextEntry(query_selector='#username', text='test_user')
        result = await entertext(entry)
    """
    logger.info(f"Entering text: {entry}")
    query_selector: str = entry['query_selector']
    text_to_enter: str = entry['text']

    # Create and use the PlaywrightManager
    browser_manager = PlaywrightManager(browser_type='chromium', headless=False)
    page = await browser_manager.get_current_page()
    if page is None: # type: ignore
        return "Error: No active page found. OpenURL command opens a new page."

    function_name = inspect.currentframe().f_code.co_name # type: ignore

    await browser_manager.take_screenshots(f"{function_name}_start", page)

    await browser_manager.highlight_element(query_selector, True)

    dom_changes_detected = None

    def detect_dom_changes(changes: str): # type: ignore
        nonlocal dom_changes_detected
        dom_changes_detected = changes # type: ignore

    subscribe(detect_dom_changes)
    try:
        # Clear any existing value before entering the new text.
        await page.evaluate(
            """
            (selector) => {
                const element = document.querySelector(selector);
                if (element) {
                    element.value = '';
                } else {
                    console.error('Element not found:', selector);
                }
            }
            """,
            query_selector,
        )

        result = await do_entertext(page, query_selector, text_to_enter)
        await asyncio.sleep(0.1) # sleep for 100ms to allow the mutation observer to detect changes
    finally:
        # Always drop the callback, even when the in-page script raises,
        # so a failed call does not leave a stale subscriber behind.
        unsubscribe(detect_dom_changes)

    await browser_manager.take_screenshots(f"{function_name}_end", page)

    await browser_manager.notify_user(result["summary_message"], message_type=MessageType.ACTION)
    if dom_changes_detected:
        return f"{result['detailed_message']}.\n As a consequence of this action, new elements have appeared in view: {dom_changes_detected}. This means that the action of entering text {text_to_enter} is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction."
    return result["detailed_message"]
async def do_entertext(page: Page, selector: str, text_to_enter: str, use_keyboard_fill: bool=True) -> dict[str, str]:
    """
    Perform the text entry on the element matching ``selector``.

    Args:
        page (Page): The Playwright Page object representing the browser tab in which the operation will be performed.
        selector (str): The CSS selector string used to locate the target DOM element.
        text_to_enter (str): The text value to be set in the target element.
        use_keyboard_fill (bool, optional): When True, the element is focused, its
            content is cleared with Control+A / Backspace, and the text is typed
            through ``page.keyboard.type``. When False, ``custom_fill_element``
            sets the value directly. Defaults to True.

    Returns:
        dict[str, str]: 'summary_message' and 'detailed_message' describing the
        outcome. Failures (selector not found, or any exception) are reported
        through these messages rather than raised.

    Example:
        result = await do_entertext(page, '#username', 'test_user')
    """
    try:
        logger.debug(f"Looking for selector {selector} to enter text: {text_to_enter}")

        elem = await page.query_selector(selector)
        if elem is None:
            error = f"Error: Selector {selector} not found. Unable to continue."
            return {"summary_message": error, "detailed_message": error}

        logger.info(f"Found selector {selector} to enter text")
        element_outer_html = await get_element_outer_html(elem, page)

        if use_keyboard_fill:
            await elem.focus()
            await asyncio.sleep(0.1)
            # Select everything in the focused element, then delete it.
            await press_key_combination("Control+A")
            await asyncio.sleep(0.1)
            await press_key_combination("Backspace")
            await asyncio.sleep(0.1)
            logger.debug(f"Focused element with selector {selector} to enter text")
            # Playwright's `delay` is in milliseconds between key presses.
            await page.keyboard.type(text_to_enter, delay=1)
        else:
            await custom_fill_element(page, selector, text_to_enter)
        # NOTE(review): the collapsed source does not show whether this re-focus
        # sits inside the else branch or after both branches - confirm.
        await elem.focus()

        success_msg = f"Success. Text \"{text_to_enter}\" set successfully in the element with selector {selector}"
        logger.info(success_msg)
        return {"summary_message": success_msg, "detailed_message": f"{success_msg} and outer HTML: {element_outer_html}."}

    except Exception as e:
        traceback.print_exc()
        error = f"Error entering text in selector {selector}."
        return {"summary_message": error, "detailed_message": f"{error} Error: {e}"}
async def bulk_enter_text(
    entries: Annotated[List[dict[str, str]], "List of objects, each containing 'query_selector' and 'text'."]  # noqa: UP006
) -> Annotated[List[dict[str, str]], "List of dictionaries, each containing 'query_selector' and the result of the operation."]:  # noqa: UP006
    """
    Enter text into several DOM elements, one after another.

    Each entry is forwarded to ``entertext`` in list order; entries are
    processed sequentially, not concurrently.

    Args:
        entries: List of objects, each containing 'query_selector' and 'text'.

    Returns:
        List of dictionaries, each containing 'query_selector' and the result of the operation.

    Example:
        entries = [
            {"query_selector": "#username", "text": "test_user"},
            {"query_selector": "#password", "text": "test_password"}
        ]
        results = await bulk_enter_text(entries)
    """
    logger.info("Executing bulk Enter Text Command")
    outcomes: List[dict[str, str]] = []  # noqa: UP006
    for item in entries:
        selector, text = item['query_selector'], item['text']
        logger.info(f"Entering text: {text} in element with selector: {selector}")
        outcome = await entertext(EnterTextEntry(query_selector=selector, text=text))
        outcomes.append({"query_selector": selector, "result": outcome})
    return outcomes
async def get_dom_with_content_type(
    content_type: Annotated[str, "The type of content to extract: 'text_only': Extracts the innerText of the highest element in the document and responds with text, or 'input_fields': Extracts the text input and button elements in the dom."]
    ) -> Annotated[dict[str, Any] | str | None, "The output based on the specified content type."]:
    """
    Retrieves and processes the DOM of the active page based on the specified content type.

    Parameters
    ----------
    content_type : str
        The type of content to extract. Possible values are:
        - 'text_only': Extracts the innerText of the highest element in the document and responds with text.
        - 'input_fields': Extracts the text input and button elements in the DOM and responds with a JSON object.
        - 'all_fields': Extracts all the fields in the DOM and responds with a JSON object.

    Returns
    -------
    dict[str, Any] | str | None
        - A JSON object for 'input_fields' (or an explanatory string when nothing could be fetched).
        - Plain text for 'text_only' (also written to 'text_only_dom.txt' in the source log folder).
        - A JSON object for 'all_fields'.

    Raises
    ------
    ValueError
        If an unsupported content_type is provided, or no active page is found.
    """

    logger.info(f"Executing Get DOM Command based on content_type: {content_type}")
    # Reject bad input before touching the browser or waiting on the DOM.
    if content_type not in ('all_fields', 'input_fields', 'text_only'):
        raise ValueError(f"Unsupported content_type: {content_type}")

    start_time = time.time()
    # Create and use the PlaywrightManager
    browser_manager = PlaywrightManager(browser_type='chromium', headless=False)
    page = await browser_manager.get_current_page()
    if page is None: # type: ignore
        raise ValueError('No active page found. OpenURL command opens a new page.')

    extracted_data = None
    await wait_for_non_loading_dom_state(page, 2000) # wait for the DOM to be ready, non loading means external resources do not need to be loaded
    user_success_message = ""
    if content_type == 'all_fields':
        user_success_message = "Fetched all the fields in the DOM"
        extracted_data = await do_get_accessibility_info(page, only_input_fields=False)
    elif content_type == 'input_fields':
        logger.debug('Fetching DOM for input_fields')
        extracted_data = await do_get_accessibility_info(page, only_input_fields=True)
        if extracted_data is None:
            return "Could not fetch input fields. Please consider trying with content_type all_fields."
        user_success_message = "Fetched only input fields in the DOM"
    else:  # 'text_only' - the only remaining value after validation
        # Extract text from the body or the highest-level element
        logger.debug('Fetching DOM for text_only')
        text_content = await get_filtered_text_content(page)
        with open(os.path.join(SOURCE_LOG_FOLDER_PATH, 'text_only_dom.txt'), 'w', encoding='utf-8') as f:
            f.write(text_content)
        extracted_data = text_content
        user_success_message = "Fetched the text content of the DOM"

    elapsed_time = time.time() - start_time
    logger.info(f"Get DOM Command executed in {elapsed_time} seconds")
    await browser_manager.notify_user(user_success_message, message_type=MessageType.ACTION)
    return extracted_data # type: ignore
async def get_filtered_text_content(page: Page) -> str:
    """
    Return the page's text followed by the alt text of its images.

    The extraction runs in the page as one JavaScript function. Elements
    matching '#agente-overlay' are hidden (visibility) while the text is read
    and restored afterwards. The result is the body's innerText (falling back
    to the document element's), a space, and then
    "Other Alt Texts in the page: " with all image alt texts joined by spaces.

    Parameters:
    - page: The Playwright page to read from.

    Returns:
    - The combined text as a single string.
    """
    text_content = await page.evaluate("""
        () => {
            // Array of query selectors to filter out
            const selectorsToFilter = ['#agente-overlay'];

            // Store the original visibility values to revert later
            const originalStyles = [];

            // Hide the elements matching the query selectors
            selectorsToFilter.forEach(selector => {
                const elements = document.querySelectorAll(selector);
                elements.forEach(element => {
                    originalStyles.push({ element: element, originalStyle: element.style.visibility });
                    element.style.visibility = 'hidden';
                });
            });

            // Get the text content of the page
            let textContent = document?.body?.innerText || document?.documentElement?.innerText || "";

            // Get all the alt text from images on the page
            let altTexts = Array.from(document.querySelectorAll('img')).map(img => img.alt);
            altTexts="Other Alt Texts in the page: " + altTexts.join(' ');

            // Revert the visibility changes
            originalStyles.forEach(entry => {
                entry.element.style.visibility = entry.originalStyle;
            });
            textContent=textContent+" "+altTexts;
            return textContent;
        }
    """)
    return text_content
async def geturl() -> Annotated[str, "Returns the full URL of the current active web site/page."]:
    """
    Return the URL (and, when available, the title) of the current page.

    URLs longer than 250 characters are truncated and suffixed with "...".

    Parameters:

    Returns:
    - "Current Page: <url>, Title: <title>", or "Current Page: <url>" when the
      title cannot be read.

    Raises:
    - ValueError: If there is no active page, or anything fails while waiting
      for the page to load.
    """
    try:
        # Create and use the PlaywrightManager
        browser_manager = PlaywrightManager(browser_type='chromium', headless=False)
        page = await browser_manager.get_current_page()

        if not page:
            raise ValueError('No active page found. OpenURL command opens a new page.')

        await page.wait_for_load_state("domcontentloaded")

        # Get the URL of the current page
        current_url = page.url
        if len(current_url) > 250:
            current_url = current_url[:250] + "..."
        try:
            title = await page.title()
        except Exception:
            # The title is best-effort. `except Exception` (not a bare except)
            # lets task cancellation and interpreter exit propagate.
            return f"Current Page: {current_url}"
        return f"Current Page: {current_url}, Title: {title}" # type: ignore

    except Exception as e:
        raise ValueError('No active page found. OpenURL command opens a new page.') from e
async def get_user_input(questions: Annotated[List[str], "List of questions to ask the user each one represented as a string"] ) -> dict[str, str]:  # noqa: UP006
    """
    Asks the user a list of questions and returns the answers in a dictionary.

    When the browser manager has a UI manager, each question is shown through
    ``prompt_user`` one at a time; otherwise the questions are asked over the
    command line.

    Parameters:
    - questions: A list of questions to ask the user ["What is Username?", "What is your password?"].

    Returns:
    - A dictionary mapping each question to the user's answer.
    """
    browser_manager = PlaywrightManager(browser_type='chromium', headless=False)
    if not browser_manager.ui_manager:
        # No in-browser UI available: fall back to the CLI prompt.
        return await answer_questions_over_cli(questions)

    answers: dict[str, str] = {}
    for question in questions:
        answers[question] = await browser_manager.prompt_user(f"Question: {question}")
    return answers
async def openurl(url: Annotated[str, "The URL to navigate to. Value must include the protocol (http:// or https://)."],
            timeout: Annotated[int, "Additional wait time in seconds after initial load."] = 3) -> Annotated[str, "Returns the result of this request in text form"]:
    """
    Opens a specified URL in the active browser instance.

    A missing protocol is added via ``ensure_protocol``. If the current page is
    already at that URL, no navigation happens. Navigation errors (including
    timeouts) are logged and the function still reports the page that is
    loaded afterwards.

    Parameters:
    - url: The URL to navigate to.
    - timeout: Seconds allowed for ``page.goto``; passed to Playwright as
      ``timeout * 1000`` milliseconds.

    Returns:
    - "Page already loaded: <url>, Title: <title>" when no navigation was needed,
      otherwise "Page loaded: <url>, Title: <title>" for the resulting page.
    """
    logger.info(f"Opening URL: {url}")
    browser_manager = PlaywrightManager(browser_type='chromium', headless=False)
    await browser_manager.get_browser_context()
    page = await browser_manager.get_current_page()
    # Bound before the try block: the "_end" screenshot below uses it even when
    # an exception is caught before the "_start" screenshot was reached.
    function_name = inspect.currentframe().f_code.co_name # type: ignore
    try:
        url = ensure_protocol(url)
        if page.url == url:
            logger.info(f"Current page URL is the same as the new URL: {url}. No need to refresh.")
            title = await page.title()
            return f"Page already loaded: {url}, Title: {title}" # type: ignore

        # Navigate to the URL with a short timeout to ensure the initial load starts
        await browser_manager.take_screenshots(f"{function_name}_start", page)

        await page.goto(url, timeout=timeout*1000) # type: ignore
    except PlaywrightTimeoutError as pte:
        # `warning` replaces the deprecated `warn` alias.
        logger.warning(f"Initial navigation to {url} failed: {pte}. Will try to continue anyway.") # happens more often than not, but does not seem to be a problem
    except Exception as e:
        logger.error(f"An error occurred while opening the URL: {url}. Error: {e}")
        import traceback
        traceback.print_exc()

    await browser_manager.take_screenshots(f"{function_name}_end", page)

    await browser_manager.notify_user(f"Opened URL: {url}", message_type=MessageType.ACTION)
    # Get the page title
    title = await page.title()
    url = page.url
    return f"Page loaded: {url}, Title: {title}" # type: ignore
def ensure_protocol(url: str) -> str:
    """
    Ensures that a URL has a protocol (http:// or https://). If it doesn't have one,
    https:// is added by default.

    The scheme check is case-insensitive, so 'HTTPS://example.com' is returned
    unchanged rather than being prefixed a second time.

    Parameters:
    - url: The URL to check and modify if necessary.

    Returns:
    - A URL string with a protocol.
    """
    if not url.lower().startswith(('http://', 'https://')):
        url = 'https://' + url  # Default to https if no protocol is specified
        logger.info(f"Added 'https://' protocol to URL because it was missing. New URL is: {url}")
    return url
async def extract_text_from_pdf(pdf_url: Annotated[str, "The URL of the PDF file to extract text from."]) -> Annotated[str, "All the text found in the PDF file."]:
    """
    Download a PDF and return the text of all its pages.

    pdf_url: str - The URL of the PDF file to extract text from.
    returns: str - "Text found in the PDF:" followed by the extracted text, or an
        error description if downloading or extraction fails. The temporary
        download is removed in every case.
    """
    file_path = os.path.join(PROJECT_TEMP_PATH, "downloaded_file.pdf")  # fixed file path for downloading the PDF

    try:
        # Create and use the PlaywrightManager
        browser_manager = PlaywrightManager(browser_type='chromium', headless=False)

        # Download the PDF
        download_result = await download_pdf(pdf_url, file_path)
        if not os.path.exists(download_result):
            return download_result  # Return error message if download failed

        # Open the PDF using pdfplumber and collect the text of non-empty pages
        with pdfplumber.open(download_result) as pdf:
            page_texts = (pdf_page.extract_text() for pdf_page in pdf.pages)
            extracted_text = "".join(f"{chunk}\n" for chunk in page_texts if chunk).strip()

        word_count = len(extracted_text.split())
        await browser_manager.notify_user(f"Extracted text from the PDF successfully. Found {word_count} words.", message_type=MessageType.ACTION)
        return "Text found in the PDF:\n" + extracted_text
    except httpx.HTTPStatusError as e:
        logger.error(f"An error occurred while downloading the PDF from {pdf_url}: {str(e)}")
        return f"An error occurred while downloading the PDF: {str(e)}"
    except Exception as e:
        logger.error(f"An error occurred while extracting text from the PDF that was downloaded from {pdf_url}: {str(e)}")
        return f"An error occurred while extracting text: {str(e)}"
    finally:
        # Cleanup: Ensure the downloaded file is removed
        cleanup_temp_files(file_path)


def cleanup_temp_files(*file_paths: str) -> None:
    """
    Remove the specified temporary files, logging (not raising) on failure.

    *file_paths: str - One or more file paths to be removed.
    """
    for file_path in file_paths:
        if not os.path.exists(file_path):
            logger.debug(f"File not found. Unable to clean it from the filesystem: {file_path}")
            continue
        try:
            os.remove(file_path)
            logger.debug(f"Cleaned file from the filesystem: {file_path}")
        except Exception as e:
            logger.error(f"Failed to remove {file_path}: {str(e)}")
async def download_pdf(pdf_url: str, file_path: str) -> str:
    """
    Download the PDF file from the given URL and save it to the specified path.

    pdf_url: str - The URL of the PDF file to download.
    file_path: str - The local path to save the downloaded PDF.

    returns: str - The file path of the downloaded PDF.
    raises: Exception - Any error from the request (including
        httpx.HTTPStatusError for a non-success status) or from writing the
        file propagates to the caller; no error message is returned.
    """
    logger.info(f"Downloading PDF from: {pdf_url} to: {file_path}")
    async with httpx.AsyncClient() as client:
        response = await client.get(pdf_url)
        response.raise_for_status()  # Ensure the request was successful
        with open(file_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
    return file_path
async def press_key_combination(key_combination: Annotated[str, "The key to press, e.g., Enter, PageDown etc"]) -> str:
    """
    Presses a key combination on the current active page managed by PlaywrightManager.

    The `key_combination` is split on '+': every key except the last is held
    down as a modifier, the last key is pressed, then the modifiers are
    released. For example, 'Control+C' to copy.

    Parameters:
    - key_combination (Annotated[str, "The key combination to press, e.g., 'Control+C'."]): The key combination to press, represented as a string. For combinations, use '+' as a separator.

    Raises:
    - ValueError: If no active page is found.

    Returns:
    str: status of the operation expressed as a string
    """

    logger.info(f"Executing press_key_combination with key combo: {key_combination}")
    # Create and use the PlaywrightManager
    browser_manager = PlaywrightManager()
    page = await browser_manager.get_current_page()

    if page is None: # type: ignore
        raise ValueError('No active page found. OpenURL command opens a new page.')

    # Split the key combination if it's a combination of keys
    *modifiers, final_key = key_combination.split('+')

    dom_changes_detected = None

    def detect_dom_changes(changes: str): # type: ignore
        nonlocal dom_changes_detected
        dom_changes_detected = changes # type: ignore

    subscribe(detect_dom_changes)
    held_modifiers: list[str] = []
    try:
        try:
            # Hold down the modifier keys, remembering which ones went down
            for key in modifiers:
                await page.keyboard.down(key)
                held_modifiers.append(key)

            # Press the last key in the combination
            await page.keyboard.press(final_key)
        finally:
            # Release whatever was pressed, even if a key press failed, so the
            # page is not left with a modifier stuck down.
            for key in held_modifiers:
                await page.keyboard.up(key)
        await asyncio.sleep(0.1) # sleep for 100ms to allow the mutation observer to detect changes
    finally:
        # Never leave the mutation callback subscribed after this call.
        unsubscribe(detect_dom_changes)

    if dom_changes_detected:
        return f"Key {key_combination} executed successfully.\n As a consequence of this action, new elements have appeared in view:{dom_changes_detected}. This means that the action is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction."

    await browser_manager.notify_user(f"Key {key_combination} executed successfully", message_type=MessageType.ACTION)
    return f"Key {key_combination} executed successfully"
async def do_press_key_combination(browser_manager: PlaywrightManager, page: Page, key_combination: str) -> bool:
    """
    Presses a key combination on the provided page.

    The `key_combination` is split on '+': every key except the last is held
    down as a modifier, the last key is pressed, then the modifiers are
    released. Screenshots are taken before and (on success) after.

    Parameters:
    - browser_manager (PlaywrightManager): The PlaywrightManager instance.
    - page (Page): The Playwright page instance.
    - key_combination (str): The key combination to press, represented as a string. For combinations, use '+' as a separator.

    Returns:
    bool: True if success and False if failed
    """

    logger.info(f"Executing press_key_combination with key combo: {key_combination}")
    try:
        function_name = inspect.currentframe().f_code.co_name # type: ignore
        await browser_manager.take_screenshots(f"{function_name}_start", page)
        # Split the key combination if it's a combination of keys
        *modifiers, final_key = key_combination.split('+')

        held_modifiers: list[str] = []
        try:
            # Hold down the modifier keys, remembering which ones went down
            for key in modifiers:
                await page.keyboard.down(key)
                held_modifiers.append(key)

            # Press the last key in the combination
            await page.keyboard.press(final_key)
        finally:
            # Release the modifiers even when a press fails, so a failed
            # attempt does not leave a modifier stuck down on the page.
            for key in held_modifiers:
                await page.keyboard.up(key)

    except Exception as e:
        logger.error(f"Error executing press_key_combination \"{key_combination}\": {e}")
        return False

    await browser_manager.take_screenshots(f"{function_name}_end", page)

    return True
def skill(description: str, name: str|None = None) -> Callable[[SkillType], SkillType]:
    """
    Decorator factory that records a function in the global skill registry.

    Parameters:
    - description: A string describing the skill's function.
    - name: Optional name to register the skill with. If not provided (or
      empty), the function's own name is used.

    Returns:
    - A decorator that appends the function's entry to ``skill_registry`` and
      returns the function unchanged.
    """
    def decorator(func: SkillType) -> SkillType:
        # A falsy name falls back to the function's own name.
        registered_name = name or func.__name__
        entry = {
            "name": registered_name,
            "func": func,
            "description": description,
        }
        skill_registry.append(entry)
        return func

    return decorator
+ autogen_wrapper (AutogenWrapper): The Autogen wrapper instance for agent-based command processing. + is_running (bool): Flag indicating whether the system is currently processing a command. + shutdown_event (asyncio.Event): Event to wait for an exit command to be processed. + """ + + def __init__(self, agent_scenario:str="user,planner_agent,browser_nav_agent,browser_nav_executor", input_mode:str="GUI_ONLY", + planner_max_chat_round: int = 50, browser_nav_max_chat_round: int = 10): + """ + Initializes the system orchestrator with the specified agent scenario and input mode. + + Args: + agent_scenario (str, optional): Comma-separated agent names to use for command processing. Defaults to "user,planner_agent,browser_nav_agent,browser_nav_executor". + input_mode (str, optional): The input mode of the system. Defaults to "GUI_ONLY". + planner_max_chat_round (int, optional): The maximum number of chat rounds for the planner. Defaults to 50. + browser_nav_max_chat_round (int, optional): The maximum number of chat rounds for the browser navigation agent. Defaults to 10. + """ + load_dotenv() + self.planner_number_of_rounds = planner_max_chat_round + self.browser_number_of_rounds = browser_nav_max_chat_round + + self.agent_scenario = agent_scenario + self.input_mode = input_mode + self.browser_manager = None + self.autogen_wrapper = None + self.is_running = False + + # NOTE(review): the fallback handed to os.getenv is the bool True, not a string - confirm str_to_bool accepts a bool + self.save_chat_logs_to_files = str_to_bool(os.getenv('SAVE_CHAT_LOGS_TO_FILE', True)) + + # The orchestrator is only enabled when both the API key and the gateway are present in the environment + if os.getenv('ORCHESTRATOR_API_KEY', None) is not None and os.getenv('ORCHESTRATOR_GATEWAY', None) is not None: + self.__populate_orchestrator_info() + logger.info(f"Orchestrator endpoint: {self.orchestrator_endpoint}") + else: + self.use_orchestrator = False + + self.__parse_user_and_browser_agent_names() + self.shutdown_event = asyncio.Event() #waits for an exit command to be processed + + + def __populate_orchestrator_info(self): + """ + Populates the orchestrator information by retrieving the API key, gateway, and endpoint from environment variables.
+ """ + self.orchestrator_api_key = os.getenv('ORCHESTRATOR_API_KEY') + self.orchestrator_gateway = os.getenv('ORCHESTRATOR_GATEWAY') + self.orchestrator_endpoint = f"{self.orchestrator_gateway}/api/orchestrate" + self.use_orchestrator = True + + + def __parse_user_and_browser_agent_names(self): + """ + Parse the user and browser agent names from agent_scenario + """ + self.agent_names = self.agent_scenario.split(',') + for agent_name in self.agent_names: + if 'user' in agent_name: + self.ser_agent_name = agent_name + elif 'planner' in agent_name: + self.planner_agent_name = agent_name + elif 'browser' in agent_name: + self.browser_agent_name = agent_name + + async def initialize(self): + """ + Initializes the components required for the system's operation, including the Autogen wrapper and the Playwright manager. + """ + # Load the configuration using AgentsLLMConfig + llm_config = AgentsLLMConfig() + + # Retrieve planner agent and browser nav agent configurations + self.planner_agent_config = llm_config.get_planner_agent_config() + self.browser_nav_agent_config = llm_config.get_browser_nav_agent_config() + + self.autogen_wrapper = await AutogenWrapper.create(self.planner_agent_config, self.browser_nav_agent_config, agents_needed=self.agent_names, + save_chat_logs_to_files=self.save_chat_logs_to_files, + planner_max_chat_round=self.planner_number_of_rounds, browser_nav_max_chat_round=self.browser_number_of_rounds) + + self.browser_manager = browserManager.PlaywrightManager(gui_input_mode=self.input_mode == "GUI_ONLY") + await self.browser_manager.async_initialize() + + if self.input_mode == "GUI_ONLY": + browser_context = await self.browser_manager.get_browser_context() + await browser_context.expose_function('process_task', self.receive_command) # type: ignore + + async def start(self): + """ + Starts the system orchestrator, initializing components and starting the command prompt loop if necessary. 
+ """ + await self.initialize() + + if self.input_mode != "GUI_ONLY": + await self.command_prompt_loop() + + await self.wait_for_exit() + + async def command_prompt_loop(self): + """ + Continuously reads and processes commands from the command prompt until an 'exit' command is received. + """ + while not self.is_running: + command: str = await async_input("Enter your command (or type 'exit' to quit): ") # type: ignore + await self.process_command(command) # type: ignore + + async def receive_command(self, command: str): + """ + Callback function to process commands received from the web interface. + + Args: + command (str): The command received from the web interface. + """ + await self.process_command(command) + + async def __orchestrate_command(self, command: str): + if not self.use_orchestrator: + return command + + orch_response = make_post_request(self.orchestrator_endpoint, {"query": command}, self.orchestrator_api_key, api_key_header_name="X-API-Key") # type: ignore + + if not orch_response: + return command + + if "user_notification" in orch_response: + await self.browser_manager.notify_user(orch_response["user_notification"]) # type: ignore + if "is_terminating" in orch_response and orch_response["is_terminating"]: + logger.info("Orchestrator indicated command execution completed.") + return None + if "reformulated_query" in orch_response: + logger.info(f"Orchestrator reformulated command to: {orch_response['reformulated_query']}") + return orch_response["reformulated_query"] + + + async def process_command(self, command: str): + """ + Processes a given command, coordinating with the Autogen wrapper for execution and handling special commands like 'exit'. + + Args: + command (str): The command to process. 
+ """ + logger.info(f"Received command: {command}") + if command.lower() == 'exit': + await self.shutdown() + return + + if command: + self.is_running = True + start_time = time.time() + current_url = await self.browser_manager.get_current_url() if self.browser_manager else None + self.browser_manager.ui_manager.clear_conversation_history() # type: ignore + self.browser_manager.log_user_message(command) # type: ignore + result = None + logger.info(f"Processing command: {command}") + if self.autogen_wrapper: + await self.browser_manager.update_processing_state("processing") # type: ignore + orchestrated_command = await self.__orchestrate_command(command) + if orchestrated_command is not None: + result = await self.autogen_wrapper.process_command(orchestrated_command, current_url) + else: + result = await self.autogen_wrapper.process_command(command, current_url) + + await self.browser_manager.update_processing_state("done") # type: ignore + end_time = time.time() + elapsed_time = round(end_time - start_time, 2) + logger.info(f"Command \"{command}\" took: {elapsed_time} seconds.") + await self.save_planner_chat_messages() + if result is not None: + chat_history= result.chat_history # type: ignore + last_message = chat_history[-1] if chat_history else None # type: ignore + if last_message and "terminate" in last_message and last_message["terminate"]=="yes": + await self.browser_manager.notify_user(last_message, "answer") # type: ignore + + await self.browser_manager.notify_user(f"Task Completed ({elapsed_time}s).", "info") # type: ignore + await self.browser_manager.command_completed(command, elapsed_time) # type: ignore + self.is_running = False + + async def save_planner_chat_messages(self): + """ + Saves the chat messages from the Autogen wrapper's agents to a JSON file. 
+ """ + + messages = self.autogen_wrapper.agents_map[self.planner_agent_name].chat_messages # type: ignore + messages_str_keys = {str(key): value for key, value in messages.items()} # type: ignore + if self.save_chat_logs_to_files: + with open(os.path.join(SOURCE_LOG_FOLDER_PATH, 'chat_messages.json'), 'w', encoding='utf-8') as f: + json.dump(messages_str_keys, f, ensure_ascii=False, indent=4) + logger.debug("Chat messages saved") + else: + logger.info("Planner chat log: ", extra={"planner_chat_log": messages_str_keys}) # type: ignore + + async def wait_for_exit(self): + """ + Waits for an exit command to be processed, keeping the system active in the meantime. + """ + await self.shutdown_event.wait() # Wait until the shutdown event is set + + async def shutdown(self): + """ + Shuts down the system orchestrator, stopping the Playwright manager and exiting the command prompt loop. + """ + logger.info("Shutting down System Orchestrator...") + if self.browser_manager: + await self.browser_manager.stop_playwright() + self.shutdown_event.set() # Signal the shutdown event to stop waiting in wait_for_exit diff --git a/Agent_E/ae/core/ui_manager.py b/Agent_E/ae/core/ui_manager.py new file mode 100644 index 0000000..106a447 --- /dev/null +++ b/Agent_E/ae/core/ui_manager.py @@ -0,0 +1,221 @@ + +import os +import traceback + +from playwright.async_api import Frame +from playwright.async_api import Page + +from Agent_E.ae.config import PROJECT_SOURCE_ROOT +from Agent_E.ae.utils.js_helper import escape_js_message +from Agent_E.ae.utils.logger import logger +from Agent_E.ae.utils.ui_messagetype import MessageType + + +class UIManager: + """ + Manages the UI overlay for this application. The application uses playwright for the browser driver. + This includes handling navigation events, showing or hiding overlays, and maintaining + a conversation history within the UI overlay. + + Attributes: + overlay_is_collapsed (bool): Indicates if the overlay is currently collapsed. 
+ conversation_history (list[dict[str, str]]): The chat history between user and system. Each entry contains 'from' and 'message' keys. + __update_overlay_chat_history_running (bool): A flag to prevent concurrent updates to the chat history. + """ + + overlay_is_collapsed: bool = True + + overlay_processing_state: str = "init" #init: initialised, processing: processing is ongoing, done: processing is done + overlay_show_details:bool = True + + conversation_history:list[dict[str, str]] = [] + __update_overlay_chat_history_running: bool = False + + + def __init__(self): + """ + Initializes the UIManager instance by adding default system messages to the conversation history. + """ + self.add_default_system_messages() + + + async def handle_navigation(self, frame: Frame): + """ + Handles navigation events by injecting JavaScript code into the frame to manage the overlay state + and updating the overlay chat history. + + Args: + frame (Frame): The Playwright Frame object to inject JavaScript into and manage. + """ + try: + await frame.wait_for_load_state("load") + overlay_injection_file = os.path.join(PROJECT_SOURCE_ROOT, "ui", "injectOverlay.js") + with open(overlay_injection_file, 'r') as file: # noqa: UP015 + js_code = file.read() + + # Inject the JavaScript code into the page + await frame.evaluate(js_code) + js_bool = str(self.overlay_show_details).lower() + if self.overlay_is_collapsed: + await frame.evaluate(f"showCollapsedOverlay('{self.overlay_processing_state}', {js_bool});") + else: + await frame.evaluate(f"showExpandedOverlay('{self.overlay_processing_state}', {js_bool});") + + #update chat history in the overlay + await self.update_overlay_chat_history(frame) + + except Exception as e: + if "Frame was detached" not in str(e): + raise e + + + async def show_overlay(self, page: Page): + """ + Displays the overlay in an expanded state on the given page if it's currently collapsed. 
+ + Args: + page (Page): The Playwright Page object on which to show the overlay. + """ + if not self.overlay_is_collapsed: + logger.debug("Overlay is already expanded, ignoring show_overlay call") + return + await page.evaluate("showExpandedOverlay();") + self.overlay_is_collapsed = True + + + def update_overlay_state(self, is_collapsed: bool): + """ + Updates the state of the overlay to either collapsed or expanded. + + Args: + is_collapsed (bool): True to collapse the overlay, False to expand it. + """ + self.overlay_is_collapsed = is_collapsed + + + + async def update_overlay_show_details(self, show_details: bool, page: Page): + """ + Updates the state of the overlay to either show steps or not. + + Args: + show_steps (bool): True to show steps, False to hide them. + """ + self.overlay_show_details = show_details + await self.update_overlay_chat_history(page) + + + async def update_processing_state(self, state: str, page: Page): + """ + Updates the processing state of the overlay. + + Args: + state (str): The processing state to update. + """ + self.overlay_processing_state = state + try: + js_bool = str(self.overlay_is_collapsed).lower() + await page.evaluate(f"updateOverlayState('{self.overlay_processing_state}', {js_bool});") + except Exception as e: + logger.debug(f"JavaScript error: {e}") + + async def update_overlay_chat_history(self, frame_or_page: Frame | Page): + """ + Updates the chat history in the overlay. If the overlay is expanded and not currently being updated, + it clears existing messages and adds them fresh from the conversation history. + + Args: + frame_or_page (Frame | Page): The Playwright Frame or Page object to update the chat history in. 
+ """ + logger.debug("Updating overlay chat history") + + if self.overlay_is_collapsed: + logger.debug("Overlay is collapsed, not updating chat history") + return + if self.__update_overlay_chat_history_running: + logger.debug("update_overlay_chat_history is already running, returning" + frame_or_page.url) + return + + self.__update_overlay_chat_history_running = True + #update chat history in the overlay by removing all messages and adding them again fresh + try: + await frame_or_page.evaluate("clearOverlayMessages();") + for message in self.conversation_history: + safe_message = escape_js_message(message["message"]) + safe_message_type = escape_js_message(message.get("message_type", MessageType.STEP.value)) + if message["from"] == "user": + await frame_or_page.evaluate(f"addUserMessage({safe_message});") + else: + #choose chich message types to be shown depending on UI setting + if self.overlay_show_details == False: # noqa: E712 + if message["message_type"] not in (MessageType.PLAN.value, MessageType.QUESTION.value, MessageType.ANSWER.value, MessageType.INFO.value): + continue + else: + if message["message_type"] not in (MessageType.PLAN.value, MessageType.QUESTION.value , MessageType.ANSWER.value, MessageType.INFO, MessageType.STEP.value): + continue + + js_code = f"addSystemMessage({safe_message}, is_awaiting_user_response=false, message_type={safe_message_type});" + await frame_or_page.evaluate(js_code) + logger.debug("Chat history updated in overlay, removing update lock flag") + except Exception: + traceback.print_exc() + finally: + self.__update_overlay_chat_history_running = False + + def clear_conversation_history(self): + """ + Clears the conversation history. + """ + self.conversation_history = [] + self.add_default_system_messages() + + def get_conversation_history(self): + """ + Returns the current conversation history. + + Returns: + list[dict[str, str]]: The conversation history. 
+ """ + return self.conversation_history + + + def new_user_message(self, message: str): + """ + Adds a new user message to the conversation history. + + Args: + message (str): The message text to add. + """ + + self.conversation_history.append({"from":"user", "message":message}) + + + def new_system_message(self, message: str, type:MessageType=MessageType.STEP): + """ + Adds a new system message to the conversation history. + + Args: + message (str): The message text to add. + """ + + self.conversation_history.append({"from":"system", "message":message, "message_type":type.value}) + print(f"Adding system message: {message}") + + def add_default_system_messages(self): + """ + Adds default system messages to the conversation history to greet the user or provide initial instructions. + """ + pass + + async def command_completed(self, page: Page, command: str, elapsed_time: float|None = None): + """ + Handles the completion of a command, focusing on the overlay input and indicating that the command has finished. + + Args: + page (Page): The Playwright Page object where the command was executed. + command (str): The command that was completed. + elapsed_time (float | None, optional): The time taken to complete the command, if relevant. 
+ """ + if not self.overlay_is_collapsed: + await page.evaluate("focusOnOverlayInput();") + await page.evaluate("commandExecutionCompleted();") diff --git a/Agent_E/ae/server/__init__.py b/Agent_E/ae/server/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Agent_E/ae/server/api_routes.py b/Agent_E/ae/server/api_routes.py new file mode 100644 index 0000000..4ec2735 --- /dev/null +++ b/Agent_E/ae/server/api_routes.py @@ -0,0 +1,191 @@ +import asyncio +import json +import logging +import os +import uuid +from queue import Empty +from queue import Queue +from typing import Any + +import uvicorn +from fastapi import FastAPI +from fastapi import Request +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import StreamingResponse +from pydantic import BaseModel +from pydantic import Field + +import Agent_E.ae.core.playwright_manager as browserManager +from Agent_E.ae.config import SOURCE_LOG_FOLDER_PATH +from Agent_E.ae.core.agents_llm_config import AgentsLLMConfig +from Agent_E.ae.core.autogen_wrapper import AutogenWrapper +from Agent_E.ae.utils.formatting_helper import is_terminating_message +from Agent_E.ae.utils.ui_messagetype import MessageType + +browser_manager = browserManager.PlaywrightManager(headless=False) + +APP_VERSION = "1.0.0" +APP_NAME = "Agent-E Web API" +API_PREFIX = "/api" +IS_DEBUG = False +HOST = os.getenv("HOST", "0.0.0.0") +PORT = int(os.getenv("PORT", 8080)) +WORKERS = 1 + +container_id = os.getenv("CONTAINER_ID", "") + +# Configure logging +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") +logger = logging.getLogger("uvicorn") + + +class CommandQueryModel(BaseModel): + command: str = Field(..., description="The command related to web navigation to execute.") # Required field with description + llm_config: dict[str,Any] | None = Field(None, description="The LLM configuration string to use for the agents.") + planner_max_chat_round: int = Field(50, 
description="The maximum number of chat rounds for the planner.") + browser_nav_max_chat_round: int = Field(10, description="The maximum number of chat rounds for the browser navigation agent.") + clientid: str | None = Field(None, description="Client identifier, optional") + request_originator: str | None = Field(None, description="Optional id of the request originator") + + +def get_app() -> FastAPI: + """Starts the Application""" + fast_app = FastAPI(title=APP_NAME, version=APP_VERSION, debug=IS_DEBUG) + + fast_app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]) + + return fast_app + + +app = get_app() + + +@app.on_event("startup") # type: ignore +async def startup_event(): + """ + Startup event handler to initialize browser manager asynchronously. + """ + global container_id + + if container_id.strip() == "": + container_id = str(uuid.uuid4()) + os.environ["CONTAINER_ID"] = container_id + await browser_manager.async_initialize() + + +@app.post("/execute_task", description="Execute a given command related to web navigation and return the result.") +async def execute_task(request: Request, query_model: CommandQueryModel): + notification_queue = Queue() # type: ignore + transaction_id = str(uuid.uuid4()) if query_model.clientid is None else query_model.clientid + register_notification_listener(notification_queue) + return StreamingResponse(run_task(request, transaction_id, query_model.command, browser_manager, notification_queue, query_model.request_originator,query_model.llm_config, + planner_max_chat_round=query_model.planner_max_chat_round, + browser_nav_max_chat_round=query_model.browser_nav_max_chat_round), media_type="text/event-stream") + + +def run_task(request: Request, transaction_id: str, command: str, playwright_manager: browserManager.PlaywrightManager, notification_queue: Queue, request_originator: str|None = None, llm_config: dict[str,Any]|None = None, # type: ignore + 
planner_max_chat_round: int = 50, browser_nav_max_chat_round: int = 10): + """ + Run the task to process the command and generate events. + + Args: + request (Request): The request object to detect client disconnect. + transaction_id (str): The transaction ID to identify the request. + command (str): The command to execute. + playwright_manager (PlaywrightManager): The manager handling browser interactions and notifications. + notification_queue (Queue): The queue to hold notifications for this request. + request_originator (str|None): The originator of the request. + llm_config (dict[str,Any]|None): The LLM configuration to use for the agents. + planner_max_chat_rounds (int, optional): The maximum number of chat rounds for the planner. Defaults to 50. + browser_nav_max_chat_round (int, optional): The maximum number of chat rounds for the browser navigation agent. Defaults to 10. + + Yields: + str: JSON-encoded string representing a notification. + """ + + async def event_generator(): + task = asyncio.create_task(process_command(command, playwright_manager, planner_max_chat_round, browser_nav_max_chat_round, llm_config)) + task_detail = f"transaction_id={transaction_id}, request_originator={request_originator}, command={command}" + + try: + while not task.done() or not notification_queue.empty(): + if await request.is_disconnected(): + logger.info(f"Client disconnected. Cancelling the task: {task_detail}") + task.cancel() + break + try: + notification = notification_queue.get_nowait() # type: ignore + notification["transaction_id"] = transaction_id # Include the transaction ID in the notification + notification["request_originator"] = request_originator # Include the request originator in the notification + yield f"data: {json.dumps(notification)}\n\n" # Using 'data: ' to follow the SSE format + except Empty: + await asyncio.sleep(0.1) + except asyncio.CancelledError: + logger.info(f"Task was cancelled due to client disconnection. 
{task_detail}") + except Exception as e: + logger.error(f"An error occurred while processing task: {task_detail}. Error: {e}") + + await task + except asyncio.CancelledError: + logger.info(f"Task was cancelled due to client disconnection. {task_detail}") + await task + + return event_generator() + + + +async def process_command(command: str, playwright_manager: browserManager.PlaywrightManager, planner_max_chat_round: int, browser_nav_max_chat_round: int, llm_config:dict[str,Any]|None = None): + """ + Process the command and send notifications. + + Args: + command (str): The command to process. + playwright_manager (PlaywrightManager): The manager handling browser interactions and notifications. + """ + await playwright_manager.go_to_homepage() # Go to the homepage before processing the command + current_url = await playwright_manager.get_current_url() + await playwright_manager.notify_user("Processing command", MessageType.INFO) + + # Load the configuration using AgentsLLMConfig + normalized_llm_config = None + if llm_config is None: + normalized_llm_config = AgentsLLMConfig() + else: + normalized_llm_config = AgentsLLMConfig(llm_config=llm_config) + logger.info("Applied LLM config received via API.") + + # Retrieve planner agent and browser nav agent configurations + planner_agent_config = normalized_llm_config.get_planner_agent_config() + browser_nav_agent_config = normalized_llm_config.get_browser_nav_agent_config() + + ag = await AutogenWrapper.create(planner_agent_config, browser_nav_agent_config, planner_max_chat_round=planner_max_chat_round, + browser_nav_max_chat_round=browser_nav_max_chat_round) + command_exec_result = await ag.process_command(command, current_url) # type: ignore + messages=ag.agents_map["planner_agent"].chat_messages + messages_str_keys = {str(key): value for key, value in messages.items()} # type: ignore + + with open(os.path.join(SOURCE_LOG_FOLDER_PATH, 'chat_messages.json'), 'w', encoding='utf-8') as f: + json.dump(messages_str_keys, 
f, ensure_ascii=False, indent=4) + logger.debug("Chat messages saved") + + if is_terminating_message(command_exec_result.summary): + await playwright_manager.notify_user("DONE", MessageType.DONE) + else: + await playwright_manager.notify_user("Max turns reached", MessageType.MAX_TURNS_REACHED) + + +def register_notification_listener(notification_queue: Queue): # type: ignore + """ + Register the event generator as a listener in the NotificationManager. + """ + + def listener(notification: dict[str, str]) -> None: + notification["container_id"] = container_id # Include the container ID (or UUID) in the notification + notification_queue.put(notification) # type: ignore + + browser_manager.notification_manager.register_listener(listener) + + +if __name__ == "__main__": + logger.info("**********Application Started**********") + uvicorn.run("main:app", host=HOST, port=PORT, workers=WORKERS, reload=IS_DEBUG, log_level="info") diff --git a/Agent_E/ae/ui/injectOverlay.js b/Agent_E/ae/ui/injectOverlay.js new file mode 100644 index 0000000..f707de3 --- /dev/null +++ b/Agent_E/ae/ui/injectOverlay.js @@ -0,0 +1,801 @@ +let awaitingUserResponse = false; // flag to check if the agent is awaiting user response + +// disabled and enabled styles as injected style element +function injectOveralyStyles() { + // Create a new style element + let style = document.createElement('style'); + // Set the styles + style.textContent = ` +@import url(https://fonts.googleapis.com/earlyaccess/notosanssc.css); + +::-webkit-scrollbar { + width: 6px; + border: solid 3px transparent; +} + +::-webkit-scrollbar-track { + background-color: transparent; +} + +::-webkit-scrollbar-thumb { + background-color: rgba(255, 255, 255, 0.4); + border-radius: 4px; +} + +::-webkit-scrollbar-thumb:hover { + background-color: rgba(255, 255, 255, 0.6); + } + + .agente-pre-line { + white-space: pre-line; !important; + } + + #agente-closebutton{ + width:30px; + height:30px; + min-width:30px; + min-height:30px; + 
margin-left: auto; + color:darkgray; + cursor: pointer; + background: transparent; + transition: transform 0.2s ease; + border: None; + } + #agente-closebutton:hover{ + transform: scale(1.1); + } + + #agente-closebutton:active{ + transform: scale(0.8); + } + + @keyframes agente-gradient-animation { + 0% {background-position: 100% 0%} + 100% {background-position: 15% 100%} + } + + @keyframes agente-rotate { + 100% { + transform: rotate(1turn); + } + } + + @keyframes automation_highlight_fadeout_animation { + 0% { border-color: rgba(128, 0, 128, 1); } + 50% { border-color: rgba(128, 0, 128, 1); } + 100% { border-color: rgba(128, 0, 128, 0); } + } + + .agente-ui-automation-highlight { + border-width: 2px !important; + border-style: solid !important; + animation: automation_highlight_fadeout_animation 5s linear 1 forwards !important; + } + + .agente-processing{ + background: linear-gradient(90deg, + rgba(255, 0, 255, 1) 0%, /* Bright Magenta */ + rgba(0, 191, 255, 1) 100% /* Deep Sky Blue */ + ); + background-size: 100% 200%; + animation: agente-rotate 1s linear infinite; + } + + .agente-init{ + background: darkgray; + box-shadow: rgba(120, 120, 120, 0.3) 0px 0px 20px + } + + .agente-done{ + background: lightgreen; + } + + .agente-processingLine { + background: linear-gradient(45deg, + rgba(255, 0, 0, 1) 0%, /* Red */ + rgba(255, 127, 0, 1) 25%, /* Orange */ + rgba(0, 255, 0, 1) 50%, /* Green */ + rgba(0, 0, 255, 1) 75%, /* Blue */ + rgba(255, 0, 0, 1) 90%, /* Red */ + rgba(255, 0, 0, 1) 100% /* Red */ + ); + background-size: 500% 100%; + animation: agente-gradient-animation 3s linear infinite; + } + + .agente-initStateLine{ + background: lightgray; + } + + .agente-doneStateLine{ + background: lightgreen; + } + + .agente-collapsed{ + cursor: pointer; + background-color: rgba(0, 0, 0, 0.1); + background-repeat: no-repeat; + background-position: center; + background-size: cover; + width: 6vh; + height: 6vh; + border-radius: 50%; + right: 1.5vw; + bottom: 1.5vw; + 
box-shadow: rgba(0, 0, 0, 0.3) 0px 0px 20px + } + + .agente-chat-container { + margin:1%,1%,1%,1%; + width: 30vw; + min-width: 350px; + height:70vh; + bottom: 2vh; + position: relative; + display: flex; + flex-direction: column; + top: 6%; + padding: 1%; + box-sizing: border-box; + } + + .agente-chat-input{ + display: flex; + flex-direction: row; + align-items: center; + width: 95%; + margin-top:1.5vh; + } + + .agente-agent{ + justify-content: flex-start; + } + + .agente-user{ + justify-content: flex-end; + } + + #agente-user-input { + flex: 1; + padding: 3px 3px; + border: transparent; + width:100%; + resize: none; + font-family: 'Noto Sans SC'; + font-size: 1.6vh; + min-font-size: 12px; + line-height: 1.5; + display: flex; + vertical-align: middle; + text-align: middle; + align-items: center; + justify-content: center; + border-color: #ccc; + background: white; + color:black; + min-height: calc(1.2em * 2); + scrollbar-width: thin; + } + + #agente-user-input:focus { + outline: none !important; + border:0px solid transparent !important; + box-shadow: none !important; + } + + #agente-send-btn { + cursor: pointer; + transition: transform 0.2s ease; + } + + #agente-send-btn:hover{ + transform: scale(1.1); + } + + .agente-highlight_overlay{ + box-shadow: 1px 1px 1px 1px rgb(50 50 50 / 40%); + border-radius: 16px; + border: 1px solid #E1DEE2; + bottom:3px; + right:5px; + background: #FBFAFA; + } + + #agente-chat-box { + overflow-y: auto; + scrollbar-width: thin; + height: 90%; + display: flex; + flex-direction: column; + gap:1%; + margin:1% 5%; + padding-bottom:1%; + margin-top:10%; + } + + #agente-overlay { + position: fixed; + min-width: 50px; + min-height: 50px; + margin-left: auto; + margin-right: auto; + z-index:20000000; + scrollbar-color: gray lightgray; + margin-bottom: 1%; + display: flex; + flex-direction: column; + } + + .agente-input-container { + display: flex; + flex-direction: column; + margin: 1% 3%; + padding: 1%; + height:20%; + background: white; + 
border: 1px solid #E1DEE2; + border-radius: 8px; + } + + .agente-chat{ + width: 80%; + color: black; + overflow-wrap: break-word; + font-family: 'Noto Sans SC'; + + } + + .agente-systemMessage{ + text-align: left; + justify-content: flex-start; + font-family: 'Noto Sans SC'; + padding: 2% 4%; + font-size: 1.5vh; + min-font-size: 12px; + min-height: 30px; + background: #EEEEEF; + line-height: 1.7; + border-radius: 10px; + width:auto; + max-width: 90%; + } + + .agente-usertext{ + text-align: right; + justify-content: flex-end; + align-items: flex-end; + font-family: 'Noto Sans SC'; + font-size: 1.5vh; + min-font-size: 12px; + padding: 2% 4%; + line-height: 1.7; + min-height: 30px; + width:auto; + background: #ECEBF3; + border-radius: 10px; + color: black; + } + + .agente-agentstep{ + color: #4B4B4B; + } + .agente-agentplan{ + color: #4B4B4B; + } + .agente-agentanswer{ + color: black; + } + + + .agente-toggle { + -webkit-appearance: none; + -moz-appearance: none; + appearance: none; + margin: 0; + display: inline-block; + position: relative; + border-radius: 50px; + overflow: hidden; + outline: none; + border: none; + cursor: pointer; + background-color: #E1DEE2; + transition: background-color ease 0.3s; + align-self: center; +} +.agente-toggle:focus { + border: none; !important; + outline: none; !important; +} +.agente-toggle:before { + content: ""; + display: block; + position: absolute; + z-index: 2; + width: 20px; + height: 20px; + background: #fff; + left: 2px; + top: 2px; + border-radius: 50%; + color: #fff; + text-shadow: -1px -1px rgba(0,0,0,0.15); + white-space: nowrap; + box-shadow: 0 1px 2px rgba(0,0,0,0.2); + transition: all cubic-bezier(0.3, 1.5, 0.7, 1) 0.3s; +} + +.agente-toggle:checked { + background-color: #786E96; +} + +.agente-toggle:checked:before { + left: 20px; +} +`; + // Append the style element to the head of the document + document.head.appendChild(style); +} +let savedSelection = null; +let show_details = true; + + +function 
/**
 * Replace whatever overlay is currently mounted with the small collapsed
 * launcher and wire up its hover/click behaviour.
 *
 * @param {string} processing_state - "init", "processing" or "done"; forwarded
 *   to updateOverlayState() to pick the border class.
 * @param {*} steps - stored in the shared `show_details` flag and handed back
 *   to showExpandedOverlay() when the launcher is clicked.
 */
function showCollapsedOverlay(processing_state = "processing", steps) {
  show_details = steps;
  removeOverlay();
  window.overlay_state_changed(true);

  const collapsed_agente = document.createElement("div");
  collapsed_agente.id = "agente-overlay";
  collapsed_agente.classList.add("agente-collapsed");
  collapsed_agente.setAttribute("aria-hidden", "true");
  Object.assign(collapsed_agente.style, {
    backgroundColor: "transparent",
    justifyContent: "center",
  });

  const wrapper = document.createElement("div");
  Object.assign(wrapper.style, {
    position: "relative",
    width: "100%",
    height: "100%",
    justifyContent: "center",
  });

  // Ring behind the logo; updateOverlayState() finds it by id and swaps its
  // state class.
  const borderdiv = document.createElement("div");
  borderdiv.id = "AgentEOverlayBorder";
  Object.assign(borderdiv.style, {
    width: "100%",
    height: "100%",
    borderRadius: "50%",
    position: "absolute",
    top: "0",
    left: "0",
  });

  // Inline SVG markup for the logo (empty in this revision of the file).
  const logo = ``;
  const svgUrl = 'data:image/svg+xml;utf8,' + encodeURIComponent(logo);

  const logodiv = document.createElement("div");
  Object.assign(logodiv.style, {
    width: "90%",
    height: "90%",
    left: "5%",
    top: "5%",
    backgroundImage: `url("${svgUrl}")`,
    backgroundRepeat: "no-repeat",
    backgroundSize: "contain",
    borderRadius: "50%",
    backgroundPosition: "center",
    backgroundColor: "white",
    alignSelf: "center",
    position: "absolute",
    justifySelf: "center",
  });

  wrapper.appendChild(borderdiv);
  wrapper.appendChild(logodiv);
  collapsed_agente.appendChild(wrapper);
  document.body.appendChild(collapsed_agente);

  updateOverlayState(processing_state, true);

  // Listeners are attached to the node we just created instead of looking it
  // up by id again.
  collapsed_agente.addEventListener('mouseover', function () {
    this.style.transform = 'scale(1.1)';
  });
  collapsed_agente.addEventListener('mouseout', function () {
    this.style.transform = 'scale(1)';
  });
  collapsed_agente.addEventListener('click', function () {
    // Recover the current state from the border's class so the expanded view
    // opens in the same state.
    let ui_state = "done";
    if (borderdiv.classList.contains("agente-init")) {
      ui_state = "init";
    } else if (borderdiv.classList.contains("agente-processing")) {
      ui_state = "processing";
    }
    showExpandedOverlay(ui_state, show_details);
  });
}
/**
 * Reflect the agent's processing state in the overlay.
 *
 * Collapsed overlay: swaps the state class on #AgentEOverlayBorder.
 * Expanded overlay: swaps the state class on #AgentEExpandedAnimation and
 * enables the input for "init"/"done" or disables it for "processing".
 *
 * Unknown states are ignored. A missing target element is skipped instead of
 * throwing, so a call that races with an overlay swap is harmless.
 *
 * @param {string} processing_state - "init", "processing" or "done".
 * @param {boolean} is_collapsed - which overlay variant is mounted.
 */
function updateOverlayState(processing_state, is_collapsed) {
  const classByState = is_collapsed
    ? { init: "agente-init", processing: "agente-processing", done: "agente-done" }
    : { init: "agente-initStateLine", processing: "agente-processingLine", done: "agente-doneStateLine" };

  if (!Object.prototype.hasOwnProperty.call(classByState, processing_state)) {
    return;
  }

  const target = document.getElementById(is_collapsed ? "AgentEOverlayBorder" : "AgentEExpandedAnimation");
  if (target) {
    // Exactly one state class is present afterwards: the one for processing_state.
    for (const [state, className] of Object.entries(classByState)) {
      target.classList.toggle(className, state === processing_state);
    }
  }

  if (!is_collapsed) {
    if (processing_state === "processing") {
      disableOverlay();
    } else {
      enableOverlay();
    }
  }
}
/**
 * Replace whatever overlay is mounted with the expanded chat panel: header
 * (icon, logo, close button), state animation line, chat box, input area with
 * the "Show Details" toggle and send button, and the disclaimer.
 *
 * @param {string} processing_state - "init", "processing" or "done"; forwarded
 *   to updateOverlayState().
 * @param {boolean} show_steps - initial value of the "Show Details" toggle.
 */
function showExpandedOverlay(processing_state = "init", show_steps=true) {
  // NOTE(review): `ui_state` is not declared in this function; it is assumed to
  // be a shared variable declared elsewhere in the script - confirm.
  ui_state = processing_state;
  show_details = show_steps;

  // Inline SVG markup (all empty in this revision of the file).
  const agente_logo = ``;
  const close_icon = ``;
  const icon = ``;
  const sendicon = ``;

  // Build the CSS background-image value for an inline SVG string.
  const svgBackground = (svg) => `url('data:image/svg+xml;utf8,${encodeURIComponent(svg)}')`;

  removeOverlay();
  window.overlay_state_changed(false);

  const newDiv = document.createElement("div");
  newDiv.id = "agente-overlay";
  newDiv.classList.add("agente-highlight_overlay");
  newDiv.setAttribute("aria-hidden", "true");
  newDiv.setAttribute("tabindex", "0");

  // ---- header -------------------------------------------------------------
  const header = document.createElement("div");
  Object.assign(header.style, {
    display: "flex",
    flexDirection: "row",
    margin: "4%",
  });

  const logoIcon = document.createElement("div");
  Object.assign(logoIcon.style, {
    width: "25px",
    height: "25px",
    backgroundImage: svgBackground(icon),
    backgroundRepeat: "no-repeat",
    backgroundSize: "contain",
    backgroundPosition: "bottom",
    order: 1,
    alignSelf: "flex-end",
    marginRight: "1%",
  });

  const logoDiv = document.createElement("div");
  Object.assign(logoDiv.style, {
    width: "100px",
    height: "25px",
    backgroundImage: svgBackground(agente_logo),
    backgroundRepeat: "no-repeat",
    backgroundSize: "contain",
    backgroundPosition: "bottom",
    order: 1,
  });

  // State line under the header; updateOverlayState() finds it by id.
  const animation = document.createElement("div");
  animation.id = "AgentEExpandedAnimation";
  animation.style.height = "2px";
  animation.style.width = "100%";

  const closeButton = document.createElement("button");
  closeButton.id = "agente-closebutton";
  Object.assign(closeButton.style, {
    backgroundImage: svgBackground(close_icon),
    backgroundRepeat: "no-repeat",
    backgroundSize: "contain",
    backgroundPosition: "bottom",
    order: 3,
  });
  closeButton.onclick = function () {
    // Carry the current state over to the collapsed launcher.
    let ui_state = "done";
    if (animation.classList.contains("agente-initStateLine")) {
      ui_state = "init";
    } else if (animation.classList.contains("agente-processingLine")) {
      ui_state = "processing";
    }
    showCollapsedOverlay(ui_state, show_details);
  };

  header.appendChild(logoIcon);
  header.appendChild(logoDiv);
  header.appendChild(closeButton);
  newDiv.appendChild(header);
  newDiv.appendChild(animation);

  // ---- chat area ------------------------------------------------------------
  const chatContainer = document.createElement("div");
  chatContainer.className = "agente-chat-container";

  const chatBox = document.createElement("div");
  chatBox.id = "agente-chat-box";

  const chatInput = document.createElement("div");
  chatInput.className = "agente-chat-input";
  chatBox.appendChild(chatInput);

  // ---- input area -----------------------------------------------------------
  const inputContainer = document.createElement("div");
  inputContainer.className = "agente-input-container";
  inputContainer.id = "agente-input-container";

  const userInput = document.createElement("textarea");
  userInput.id = "agente-user-input";
  userInput.placeholder = "What can I help you solve today?";

  const sendBtn = document.createElement("div");
  sendBtn.id = "agente-send-btn";
  Object.assign(sendBtn.style, {
    backgroundImage: svgBackground(sendicon),
    backgroundRepeat: "no-repeat",
    backgroundSize: "contain",
    backgroundPosition: "right",
    width: "8%",
    height: "100%",
    marginLeft: "auto",
  });

  // Swap the send icon between its disabled and enabled artwork as the user types.
  userInput.addEventListener('input', function (event) {
    const text = event.target.value;
    if (text.trim() === "") {
      const button_disabled_svg = ``;
      sendBtn.style.backgroundImage = svgBackground(button_disabled_svg);
    } else {
      const button_enabled_svg = ``;
      sendBtn.style.backgroundImage = svgBackground(button_enabled_svg);
    }
  });

  const userinput_footer = document.createElement("div");
  userinput_footer.id = "userinput_section";
  Object.assign(userinput_footer.style, {
    display: "flex",
    flexDirection: "row",
    justifyContent: "space-between",
    alignItems: "center",
    height: "40%",
    margin: "2% 1%",
  });

  const toggleLabel = document.createElement("label");
  toggleLabel.textContent = "Show Details";
  Object.assign(toggleLabel.style, {
    color: "#6B6673",
    fontFamily: "Noto Sans SC",
    fontSize: "14px",
    fontWeight: "400",
    margin: "0px",
    marginRight: "10px",
  });

  const toggleSwitch = document.createElement("input");
  toggleSwitch.type = "checkbox";
  toggleSwitch.className = "agente-toggle";
  toggleSwitch.style.width = "44px";
  toggleSwitch.style.height = "24px";
  toggleSwitch.style.margin = "0px";
  toggleSwitch.checked = Boolean(show_details);

  toggleSwitch.addEventListener('change', function () {
    show_details = this.checked;
    window.show_steps_state_changed(this.checked);
  });

  userinput_footer.appendChild(toggleLabel);
  userinput_footer.appendChild(toggleSwitch);
  userinput_footer.appendChild(sendBtn);

  inputContainer.appendChild(userInput);
  inputContainer.appendChild(userinput_footer);

  chatContainer.appendChild(chatBox);
  chatContainer.appendChild(inputContainer);
  newDiv.appendChild(chatContainer);

  // ---- disclaimer -------------------------------------------------------------
  const disclaimer = document.createElement("p");
  Object.assign(disclaimer.style, {
    fontFamily: "Noto Sans SC",
    fontSize: "12px",
    color: "#6B6673",
    alignSelf: "center",
    position: "absolute",
    bottom: "0%",
    margin: "0% 0% 1% 0%",
  });
  disclaimer.textContent = "Agent-E may make mistakes. Verify key info.";
  newDiv.appendChild(disclaimer);

  document.body.appendChild(newDiv);
  updateOverlayState(processing_state, false);

  // ---- behaviour --------------------------------------------------------------
  sendBtn.addEventListener('click', function () {
    const task = userInput.value;
    const task_trimmed = task.trim();
    if (task_trimmed && !isDisabled()) {
      if (awaitingUserResponse) {
        // The agent asked a question: addMessage() forwards this as the answer.
        addUserMessage(task);
        userInput.value = "";
      } else {
        // A fresh task: reset the transcript and hand the task over.
        clearOverlayMessages();
        addUserMessage(task);
        disableOverlay();
        window.process_task(task);
        userInput.value = "";
      }
    } else {
      console.log("Empty message no task to send");
    }
  });

  // Focusing the input while text is selected on the page pre-fills the input
  // with that selection and moves the caret to the end.
  userInput.addEventListener('focus', function () {
    if (window.getSelection().rangeCount > 0) {
      const selectedText = window.getSelection().toString();
      if (selectedText) {
        userInput.value = selectedText + '\n';
        setTimeout(function () {
          userInput.selectionStart = userInput.selectionEnd = userInput.value.length;
          userInput.scrollTop = userInput.scrollHeight;
        }, 0);
      }
    }
  });

  userInput.addEventListener('blur', function () {
    if (savedSelection) {
      const selection = window.getSelection();
      selection.removeAllRanges();
      selection.addRange(savedSelection);
    }
  });

  userInput.addEventListener('keydown', function (event) {
    // Enter submits - but not while an IME composition is in progress, where
    // Enter only confirms the candidate text and must not send the message.
    if (event.key === "Enter" && !event.isComposing) {
      event.preventDefault();
      sendBtn.dispatchEvent(new MouseEvent('click', {
        bubbles: true,
        cancelable: true
      }));
    }
  });

  focusOnOverlayInput();
}
/**
 * Append one chat bubble to the expanded overlay's chat box.
 *
 * @param {string} message - text to show; a JSON-encoded scalar (for example a
 *   quoted, escaped string) is decoded before display.
 * @param {string} sender - "system" or "user"; any other value produces an
 *   unstyled, empty bubble.
 * @param {string} message_type - "step", "plan", "question", "answer" or
 *   "info"; selects the colour class for system messages.
 *
 * Side effects: re-enables the input for questions and for "info" messages
 * containing "Task Completed"; if the agent was awaiting a reply and the
 * sender is the user, clears that flag and calls window.user_response().
 */
function addMessage(message, sender, message_type = "plan") {
  const newDiv = document.createElement("div");
  newDiv.classList.add("agente-chat-input");
  const chatDiv = document.createElement("div");
  chatDiv.classList.add("agente-chat");

  let parsedMessage = message;
  try {
    const decoded = JSON.parse(message);
    // Only unwrap JSON scalars. Objects, arrays and null would otherwise be
    // rendered as "[object Object]" or an empty bubble, so keep the raw text.
    if (decoded !== null && typeof decoded !== "object") {
      parsedMessage = decoded;
    }
  } catch (e) {
    console.log("Message is not in JSON format, using original message.");
  }

  if (sender === "system") {
    newDiv.classList.add("agente-agent");
    chatDiv.classList.add("agente-systemMessage", "agente-pre-line");
    if (message_type === "step") {
      chatDiv.classList.add("agente-agentstep");
    } else if (message_type === "plan" || message_type === "question") {
      chatDiv.classList.add("agente-agentplan");
    } else if (message_type === "answer") {
      chatDiv.classList.add("agente-agentanswer");
    }
    const taskCompleted = message_type === "info" && String(message).includes("Task Completed");
    if (taskCompleted || message_type === "question") {
      enableOverlay();
    }
    chatDiv.textContent = parsedMessage;
  } else if (sender === "user") {
    newDiv.classList.add("agente-user");
    chatDiv.classList.add("agente-usertext", "agente-pre-line");
    chatDiv.textContent = parsedMessage;
  }
  newDiv.appendChild(chatDiv);

  // The chat box only exists while the overlay is expanded. Guard like
  // clearOverlayMessages() does, so a message arriving while the overlay is
  // collapsed does not throw.
  const chatBox = document.getElementById('agente-chat-box');
  if (chatBox) {
    chatBox.appendChild(newDiv);
    chatBox.scrollTop = chatBox.scrollHeight;
    newDiv.scrollIntoView({ behavior: 'instant' });
  }

  if (sender === "user" && awaitingUserResponse) {
    awaitingUserResponse = false;
    // Notify the server that the user has responded to the agent's prompt
    window.user_response(message);
  }
}
// The overlay has no separate "disabled" flag: the state is carried by the
// input's placeholder text, so the three functions below must agree on it.
const AGENTE_PLACEHOLDER_BUSY = "Processing...";
const AGENTE_PLACEHOLDER_READY = "What can I help you solve today?";

/** Mark the overlay input as busy (no-op when the input is not mounted). */
function disableOverlay() {
  const input_field = document.getElementById("agente-user-input");
  if (input_field) {
    input_field.placeholder = AGENTE_PLACEHOLDER_BUSY;
  }
}

/**
 * @returns {boolean} true while the input shows the busy placeholder; false
 *   otherwise, including when the input is not mounted (previously undefined).
 */
function isDisabled() {
  const input_field = document.getElementById("agente-user-input");
  return Boolean(input_field) && input_field.placeholder === AGENTE_PLACEHOLDER_BUSY;
}

/** Mark the overlay input as ready (no-op when the input is not mounted). */
function enableOverlay() {
  const input_field = document.getElementById("agente-user-input");
  if (input_field) {
    input_field.placeholder = AGENTE_PLACEHOLDER_READY;
  }
}
class AnthropicLLMHelper:
    """Small async wrapper around the Anthropic Messages API."""

    def __init__(self):
        # Reads ANTHROPIC_API_KEY from the environment (after loading a .env file).
        load_dotenv()
        self.client = AsyncAnthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))

    async def get_chat_completion_response_async(self, system_msg:str, user_msgs:list[str], model_name:str="claude-3-opus-20240229", temperature:float=0.1, max_tokens:int=256, top_p:int=1, top_k: int=1) -> str:
        """
        Send one user turn (made of several text parts) and return the reply text.

        Parameters:
        - system_msg: The system prompt.
        - user_msgs: Text parts that together form the single user message.
        - model_name: Anthropic model id.
        - temperature: Sampling temperature forwarded to the API.
        - max_tokens: Maximum number of tokens to generate.
        - top_p, top_k: Accepted but currently not forwarded to the API.

        Returns:
        - The text of the first content block of the response.

        Raises:
        - Exception: If the API call fails (connection, rate limit or other API
          status error; the original error is chained as __cause__) or if the
          response carries no content blocks.
        """
        formatted_user_msgs: list[dict[str, str]] = [{"type": "text", "text": user_msg} for user_msg in user_msgs]

        try:
            message = await self.client.messages.create(
                model=model_name,
                max_tokens=max_tokens,
                temperature=temperature,
                system=system_msg,
                messages=[
                    {
                        "role": "user",
                        "content": formatted_user_msgs # type: ignore
                    }
                ]
            )
        except anthropic.APIConnectionError as e:
            print("The server could not be reached")
            print(e.__cause__)  # an underlying Exception, likely raised within httpx.
            raise Exception(f"Calling {model_name} LLM failed. The server could not be reached. Error: {e}") from e
        except anthropic.RateLimitError as e:
            print("A 429 status code was received; we should back off a bit.")
            raise Exception(f"Calling {model_name} LLM failed. Rate limit error. Error: {e}") from e
        except anthropic.APIStatusError as e:
            print(e.status_code)
            print(e.response)
            raise Exception(f"Calling {model_name} LLM failed. Error: {e}") from e

        print(message)
        if not message.content:
            # Surface an empty response as a descriptive error instead of a bare IndexError.
            raise Exception(f"Calling {model_name} LLM failed. The response contained no content blocks.")
        return message.content[0].text
Error: {e}") # noqa: B904 + +# async def main(): +# from ae.core.prompts import LLM_PROMPTS +# helper = AnthropicLLMHelper() +# response = await helper.get_chat_completion_response_async(LLM_PROMPTS["SKILLS_HARVESTING_PROMPT"], ["What is the weather like today?"], temperature=0, max_tokens=4000) +# print("*******\nResponse: ", response, "\n*******\n") + +# asyncio.run(main()) diff --git a/Agent_E/ae/utils/autogen_sequential_function_call.py b/Agent_E/ae/utils/autogen_sequential_function_call.py new file mode 100644 index 0000000..9d08443 --- /dev/null +++ b/Agent_E/ae/utils/autogen_sequential_function_call.py @@ -0,0 +1,85 @@ + +import asyncio +import inspect +from typing import Any + +from autogen import Agent # type: ignore +from autogen import UserProxyAgent # type: ignore + + +class UserProxyAgent_SequentialFunctionExecution(UserProxyAgent): + def __init__(self, *args, **kwargs): # type: ignore + super().__init__(*args, **kwargs) # type: ignore + #position = 2 allows termination check to be called earlier, this helps detect loops. 
+ self.register_reply(Agent, UserProxyAgent_SequentialFunctionExecution.sequential_generate_tool_calls_reply, position=2) # type: ignore + + + def sequential_generate_tool_calls_reply( # type: ignore + self, + messages: list[dict] | None = None, # type: ignore + sender: Agent | None = None, + config: Any | None = None, + ) -> tuple[bool, dict[str, Any] | None]: + """Generate a reply using tool call.""" + if config is None: + config = self + if messages is None: + messages = self._oai_messages[sender] # type: ignore + message = messages[-1] # type: ignore + tool_returns = [] + skip_flag:bool = False + for tool_call in message.get("tool_calls", []): # type: ignore + function_call = tool_call.get("function", {}) # type: ignore + func = self._function_map.get(function_call.get("name", None), None) # type: ignore + func_return = None + if inspect.iscoroutinefunction(func): # type: ignore + try: + # get the running loop if it was already created + loop = asyncio.get_running_loop() + close_loop = False + except RuntimeError: + # create a loop if there is no running loop + loop = asyncio.new_event_loop() + close_loop = True + if (not skip_flag): + _, func_return = loop.run_until_complete(self.a_execute_function(function_call)) # type: ignore + if close_loop: + loop.close() + else: + if (not skip_flag): + _, func_return = self.execute_function(function_call) # type: ignore + if func_return is None: # type: ignore + if skip_flag: + content = "VERY IMPORTANT: This function could not be executed since previous function resulted in a Webpage change. You must get all_fields DOM and repeat the function if needed." 
def async_input(prompt: str) -> Future: # type: ignore
    """
    Display a prompt to the user and wait for input in an asynchronous manner.

    The blocking input() call is run in the loop's default executor.

    Parameters:
    - prompt: The message to display to the user.

    Returns:
    - A Future object that will be fulfilled with the user's input.
    """
    try:
        # Normal case: called from a coroutine, so a loop is already running.
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # Called from synchronous code: keep the previous behaviour.
        loop = asyncio.get_event_loop()
    return loop.run_in_executor(None, input, prompt)


async def answer_questions_over_cli(questions: list[str]) -> dict[str, str]:
    """
    Asks each question over the command line, one at a time, and awaits the user's response.

    Parameters:
    - questions: A list of questions to ask the user, e.g., ["What is your favorite site?", "What do you want to search for?"].

    Returns:
    - A dictionary where each key is a question and each value is the user's response.
    """
    answers: dict[str, str] = {}
    print("*********************************")
    for question in questions:
        answers[question] = await async_input("Question: "+str(question)+" : ")
    print("*********************************")
    return answers
def _first_tool_function(message: dict[str, Any]) -> Any:
    """Return the "function" entry of the message's first tool call, or None if it has no tool calls."""
    # `or [{}]` covers a missing key as well as an explicit None or empty list.
    tool_calls = message.get("tool_calls") or [{}]
    return tool_calls[0].get("function")


def is_agent_stuck_in_loop(messages: list[dict[str, Any]]) -> bool:
    """
    Detects loops in the agent's responses by iterating over the last N responses.

    A loop is reported when, within the last six messages, every assistant
    message carries the same first tool call and every tool message carries
    the same content.

    Parameters
    ----------
    messages : list[dict[str, Any]]
        A list of dictionaries representing the agent's messages.

    Returns
    -------
    bool
        True if a loop is detected, False otherwise.
    """
    number_of_turns_to_check_for_loops: int = 6
    # integer division since we are checking for pairs of messages and can't have fractions
    number_of_rounds_to_check_for_loops: int = number_of_turns_to_check_for_loops // 2

    if len(messages) <= number_of_turns_to_check_for_loops:
        return False

    recent_messages = messages[-number_of_turns_to_check_for_loops:]
    logger.debug(f"More than {number_of_turns_to_check_for_loops} messages in the conversation. Checking for loops..")

    # Step 1: all recent assistant messages must request the same tool call.
    tool_calls = [item for item in recent_messages if item.get("role") == "assistant"]
    if not tool_calls:
        return False
    tool_functions = [_first_tool_function(item) for item in tool_calls]
    logger.debug(f"Last {number_of_rounds_to_check_for_loops} tool calls: {tool_functions}")
    if any(func != tool_functions[0] for func in tool_functions):
        return False
    logger.debug(f"Last {number_of_rounds_to_check_for_loops} tool calls are identical. Checking Tool responses..")

    # Step 2: all recent tool responses must have the same content.
    tool_responses = [item for item in recent_messages if item.get("role") == "tool"]
    if not tool_responses:
        return False
    response_contents = [item.get("content") for item in tool_responses]
    logger.debug(f"Last N tool responses: {response_contents}")
    if any(content != response_contents[0] for content in response_contents):
        return False

    logger.debug(f"Last {number_of_rounds_to_check_for_loops} tool responses are identical. Terminating")
    logger.info("Terminating browser executor since a loop was detected...")
    return True
Checking Tool responses..") + # Check if content attributes are the same for assistant items + tool_responses = [item for item in last_six_items if item.get("role") == "tool"] + + if tool_responses: + assistant_contents = [item.get("content") for item in tool_responses] + logger.debug(f"Last N tool responses: {assistant_contents}") + if all(content == assistant_contents[0] for content in assistant_contents): + logger.debug(f"Last {number_of_rounds_to_check_for_loops} tool responses are identical. Terminating") + logger.info("Terminating browser executor since a loop was detected...") + return True + + return False diff --git a/Agent_E/ae/utils/dom_helper.py b/Agent_E/ae/utils/dom_helper.py new file mode 100644 index 0000000..40c371c --- /dev/null +++ b/Agent_E/ae/utils/dom_helper.py @@ -0,0 +1,45 @@ +import asyncio + +from playwright.async_api import ElementHandle +from playwright.async_api import Page + +from Agent_E.ae.utils.logger import logger + + +async def wait_for_non_loading_dom_state(page: Page, max_wait_millis: int): + max_wait_seconds = max_wait_millis / 1000 + end_time = asyncio.get_event_loop().time() + max_wait_seconds + while asyncio.get_event_loop().time() < end_time: + dom_state = await page.evaluate("document.readyState") + if dom_state != "loading": + logger.debug(f"DOM state is not 'loading': {dom_state}") + break # Exit the loop if the DOM state is not 'loading' + + await asyncio.sleep(0.05) + + +async def get_element_outer_html(element: ElementHandle, page: Page, element_tag_name: str|None = None) -> str: + """ + Constructs the opening tag of an HTML element along with its attributes. + + Args: + element (ElementHandle): The element to retrieve the opening tag for. + page (Page): The page object associated with the element. + element_tag_name (str, optional): The tag name of the element. Defaults to None. If not passed, it will be retrieved from the element. 
# Callbacks notified by dom_mutation_change_detected(); managed via subscribe()/unsubscribe().
DOM_change_callback: list[Callable[[str], None]] = []

def subscribe(callback: Callable[[str], None]) -> None:
    """Register a callback (plain or coroutine function) for DOM change notifications."""
    DOM_change_callback.append(callback)

def unsubscribe(callback: Callable[[str], None]) -> None:
    """Remove a previously registered callback. Raises ValueError if it is not registered."""
    DOM_change_callback.remove(callback)


async def add_mutation_observer(page:Page):
    """
    Adds a mutation observer to the page to detect changes in the DOM.
    When changes are detected, the observer calls the dom_mutation_change_detected function in the browser context.
    These changes can be detected by subscribing to the dom_mutation_change_detected function by individual skills.

    The observer reports newly added element nodes that have text, and text (characterData) changes
    whose parent element is not display:none. It does not observe attribute changes, so a change in
    the style or class of an existing node (e.g. toggling the visibility of a hidden node) is not reported.
    Nodes inside the agent's own overlay are ignored.
    """

    await page.evaluate("""
        console.log('Adding a mutation observer for DOM changes');
        new MutationObserver((mutationsList, observer) => {
            // '#agente-overlay' is the id used by the injected overlay; the older id is kept for compatibility.
            const overlaySelector = '#agentDriveAutoOverlay, #agente-overlay';
            let changes_detected = [];
            for(let mutation of mutationsList) {
                if (mutation.type === 'childList') {
                    let allAddedNodes=mutation.addedNodes;
                    for(let node of allAddedNodes) {
                        if(node.tagName && !['SCRIPT', 'NOSCRIPT', 'STYLE'].includes(node.tagName) && !node.closest(overlaySelector)) {
                            // innerText is undefined on some element types (e.g. SVG); treat that as no text
                            // instead of throwing and losing the whole batch.
                            let content = (node.innerText || '').trim();
                            if(content) {
                                changes_detected.push({tag: node.tagName, content: content});
                            }
                        }
                    }
                } else if (mutation.type === 'characterData') {
                    let node = mutation.target;
                    if(node.parentNode && !['SCRIPT', 'NOSCRIPT', 'STYLE'].includes(node.parentNode.tagName) && !node.parentNode.closest(overlaySelector)) {
                        let content = node.data.trim();
                        if(content && window.getComputedStyle(node.parentNode).display !== 'none'){
                            if(!changes_detected.some(change => change.content.includes(content))) {
                                changes_detected.push({tag: node.parentNode.tagName, content: content});
                            }
                        }
                    }
                }
            }
            if(changes_detected.length > 0) {
                window.dom_mutation_change_detected(JSON.stringify(changes_detected));
            }
        }).observe(document, {subtree: true, childList: true, characterData: true});
    """)


async def handle_navigation_for_mutation_observer(page:Page):
    """Install the mutation observer on the given page."""
    await add_mutation_observer(page)

async def dom_mutation_change_detected(changes_detected: str):
    """
    Detects changes in the DOM (new nodes added) and emits the event to all subscribed callbacks.
    The changes_detected is a string in JSON format containing the tag and content of the new nodes added to the DOM.
    Callbacks receive the parsed list, not the raw string.

    e.g. The following will be detected when autocomplete recommendations show up when one types Nelson Mandela on google search
    [{'tag': 'SPAN', 'content': 'nelson mandela wikipedia'}, {'tag': 'SPAN', 'content': 'nelson mandela movies'}]
    """
    changes = json.loads(changes_detected.replace('\t', '').replace('\n', ''))
    if len(changes) > 0:
        # Emit the event to all subscribed callbacks. Iterate over a snapshot:
        # callbacks may subscribe/unsubscribe while we are (awaiting) inside the loop.
        for callback in list(DOM_change_callback):
            # If the callback is a coroutine function
            if asyncio.iscoroutinefunction(callback):
                await callback(changes)
            # If the callback is a regular function
            else:
                callback(changes)
The following will be detected when autocomplete recommendations show up when one types Nelson Mandela on google search + [{'tag': 'SPAN', 'content': 'nelson mandela wikipedia'}, {'tag': 'SPAN', 'content': 'nelson mandela movies'}] + """ + changes_detected = json.loads(changes_detected.replace('\t', '').replace('\n', '')) + if len(changes_detected) > 0: + # Emit the event to all subscribed callbacks + for callback in DOM_change_callback: + # If the callback is a coroutine function + if asyncio.iscoroutinefunction(callback): + await callback(changes_detected) + # If the callback is a regular function + else: + callback(changes_detected) diff --git a/Agent_E/ae/utils/formatting_helper.py b/Agent_E/ae/utils/formatting_helper.py new file mode 100644 index 0000000..9ff2f12 --- /dev/null +++ b/Agent_E/ae/utils/formatting_helper.py @@ -0,0 +1,56 @@ + +import json +import re +from typing import Any + + +def str_to_bool(s: str | bool) -> bool: + """ + Convert a string representation of truth to True or False. + + Parameters: + s (str | bool): The string to convert, or a boolean. + + Returns: + bool: True if the string represents a truth value, False otherwise. + """ + if isinstance(s, bool): + return s + return s.lower() in ['true', '1', 't', 'y', 'yes'] + +def str_to_json(s: str) -> dict[str, Any] | None: + """ + Convert a string representation of a JSON object to a dictionary. + + Parameters: + s (str): The string to convert. + + Returns: + dict[str, Any] | None: The dictionary representation of the JSON object. If the parsing fails, returns None. + """ + s_fixed = re.sub(r'(? bool: + """ + Check if a message is a terminating message. + + Parameters: + message (str): The message to check. + + Returns: + bool: True if the message is a terminating message, False otherwise. 
class GeminiLLMHelper:
    """Helper for requesting a single chat completion from Gemini via ``google.generativeai``."""

    def __init__(self):
        """Load variables from a ``.env`` file and configure the SDK with ``GEMINI_API_KEY``."""
        load_dotenv()
        genai.configure(api_key=os.environ.get("GEMINI_API_KEY")) # type: ignore

    def process_llm_response(self, response: str):
        """
        Remove code-fence markers from an LLM reply.

        A leading three-backtick marker (optionally followed by ``python`` or ``json`` and a
        newline) and a trailing three-backtick marker are removed using the module-level
        pre-compiled patterns. Falsy input (``None`` or an empty string) is returned unchanged.
        """
        if response:
            # Use the compiled regex to replace the patterns with an empty string
            response = llm_json_or_python_begin_response_pattern.sub("", response)
            response = llm_end_response_pattern.sub("", response)
        return response

    async def get_chat_completion_response_async(self, system_msg:str, user_msgs:list[str], model_name:str="gemini-1.5-pro-latest", temperature:float=0.1,
                                                 max_tokens:int=256, top_p:int=1, top_k: int=1, safety_settings:list[dict[str, str]]=GCP_BLOCK_NONE_SAFETY_SETTINGS) -> str|None:
        """
        Request one completion from the given Gemini model.

        The system message is sent as a first ``user`` turn, and all ``user_msgs`` are sent
        together as the parts of a second ``user`` turn.

        Parameters:
        - system_msg (str): Instruction text placed in the first turn.
        - user_msgs (list[str]): User messages, sent as the parts of the second turn.
        - model_name (str): Name passed to ``genai.GenerativeModel``.
        - temperature, max_tokens, top_p, top_k: Forwarded to ``GenerationConfig``.
        - safety_settings: Forwarded as-is to the SDK; the shared default list is never mutated here.

        Returns:
        - str | None: The reply text with code fences stripped, or None when the SDK raised
          ``ValueError`` (for example because the response carried no text).
        """
        formatted_msgs: list[dict[str, Any]] = [
            {"role": "user", "parts": [system_msg]},
            {"role": "user", "parts": list(user_msgs)},
        ]
        response = None
        try:
            model = genai.GenerativeModel(model_name)
            # Use the SDK's coroutine variant: the synchronous generate_content() would block
            # the event loop for the whole network round trip inside this async method.
            response = await model.generate_content_async(formatted_msgs, stream=False, # type: ignore
                                                          generation_config=genai.types.GenerationConfig(
                                                              max_output_tokens=max_tokens,
                                                              temperature=temperature,
                                                              top_p=top_p,
                                                              top_k=top_k),
                                                          safety_settings=safety_settings)
            return self.process_llm_response(response.text)
        except ValueError:
            if response:
                logger.error(f"Response from GCP Gen AI did not contain text. prompt: {system_msg} and user messages: {user_msgs}. Candidates: {response.candidates}")
            else:
                logger.error(f"There was no response from GCP Gen AI for prompt: {system_msg} and user messages: {user_msgs}")
            return None
async def __inject_attributes(page: Page):
    """
    Injects 'mmid' and 'aria-keyshortcuts' into all DOM elements. If an element already has an 'aria-keyshortcuts',
    its original value is copied to 'orig-aria-keyshortcuts' before the new 'aria-keyshortcuts' is injected.
    This will be captured in the accessibility tree and thus make it easier to reconcile the tree with the DOM.
    'aria-keyshortcuts' is chosen because it is not a widely used aria attribute.

    The injected value is a counter that starts at 1 and increases in `querySelectorAll('*')` order;
    the evaluated script returns the last value assigned, which is only used for the debug log.
    """

    last_mmid = await page.evaluate("""() => {
        const allElements = document.querySelectorAll('*');
        let id = 0;
        allElements.forEach(element => {
            const origAriaAttribute = element.getAttribute('aria-keyshortcuts');
            const mmid = `${++id}`;
            element.setAttribute('mmid', mmid);
            element.setAttribute('aria-keyshortcuts', mmid);
            //console.log(`Injected 'mmid'into element with tag: ${element.tagName} and mmid: ${mmid}`);
            if (origAriaAttribute) {
                element.setAttribute('orig-aria-keyshortcuts', origAriaAttribute);
            }
        });
        return id;
    }""")
    logger.debug(f"Added MMID into {last_mmid} elements")
+ """ + + logger.debug("Reconciling the Accessibility Tree with the DOM") + # Define the attributes to fetch for each element + attributes = ['name', 'aria-label', 'placeholder', 'mmid', "id", "for", "data-testid"] + backup_attributes = [] #if the attributes are not found, then try to get these attributes + tags_to_ignore = ['head','style', 'script', 'link', 'meta', 'noscript', 'template', 'iframe', 'g', 'main', 'c-wiz','svg', 'path'] + attributes_to_delete = ["level", "multiline", "haspopup", "id", "for"] + ids_to_ignore = ['agentDriveAutoOverlay'] + + # Recursive function to process each node in the accessibility tree + async def process_node(node: dict[str, Any]): + if 'children' in node: + for child in node['children']: + await process_node(child) + + # Use 'name' attribute from the accessibility node as 'mmid' + mmid_temp: str = node.get('keyshortcuts') # type: ignore + + # If the name has multiple mmids, take the last one + if(mmid_temp and is_space_delimited_mmid(mmid_temp)): + #TODO: consider if we should grab each of the mmids and process them separately as seperate nodes copying this node's attributes + mmid_temp = mmid_temp.split(' ')[-1] + + #focusing on nodes with mmid, which is the attribute we inject + try: + mmid = int(mmid_temp) + except (ValueError, TypeError): + #logger.error(f"'name attribute contains \"{node.get('name')}\", which is not a valid numeric mmid. Adding node as is: {node}") + return node.get('name') + + if node['role'] == 'menuitem': + return node.get('name') + + if node.get('role') == 'dialog' and node.get('modal') == True: # noqa: E712 + node["important information"] = "This is a modal dialog. Please interact with this dialog and close it to be able to interact with the full page (e.g. by pressing the close button or selecting an option)." 
+ + if mmid: + # Determine if we need to fetch 'innerText' based on the absence of 'children' in the accessibility node + should_fetch_inner_text = 'children' not in node + + js_code = """ + (input_params) => { + const should_fetch_inner_text = input_params.should_fetch_inner_text; + const mmid = input_params.mmid; + const attributes = input_params.attributes; + const tags_to_ignore = input_params.tags_to_ignore; + const ids_to_ignore = input_params.ids_to_ignore; + + const element = document.querySelector(`[mmid="${mmid}"]`); + + if (!element) { + console.log(`No element found with mmid: ${mmid}`); + return null; + } + + if (ids_to_ignore.includes(element.id)) { + console.log(`Ignoring element with id: ${element.id}`, element); + return null; + } + //Ignore "option" because it would have been processed with the select element + if (tags_to_ignore.includes(element.tagName.toLowerCase()) || element.tagName.toLowerCase() === "option") return null; + + let attributes_to_values = { + 'tag': element.tagName.toLowerCase() // Always include the tag name + }; + + // If the element is an input, include its type as well + if (element.tagName.toLowerCase() === 'input') { + attributes_to_values['tag_type'] = element.type; // This will capture 'checkbox', 'radio', etc. 
+ } + else if (element.tagName.toLowerCase() === 'select') { + attributes_to_values["mmid"] = element.getAttribute('mmid'); + attributes_to_values["role"] = "combobox"; + attributes_to_values["options"] = []; + + for (const option of element.options) { + let option_attributes_to_values = { + "mmid": option.getAttribute('mmid'), + "text": option.text, + "value": option.value, + "selected": option.selected + }; + attributes_to_values["options"].push(option_attributes_to_values); + } + return attributes_to_values; + } + + for (const attribute of attributes) { + let value = element.getAttribute(attribute); + + if(value){ + /* + if(attribute === 'href'){ + value = value.split('?')[0] + } + */ + attributes_to_values[attribute] = value; + } + } + + if (should_fetch_inner_text && element.innerText) { + attributes_to_values['description'] = element.innerText; + } + + let role = element.getAttribute('role'); + if(role==='listbox' || element.tagName.toLowerCase()=== 'ul'){ + let children=element.children; + let filtered_children = Array.from(children).filter(child => child.getAttribute('role') === 'option'); + console.log("Listbox or ul found: ", filtered_children); + let attributes_to_include = ['mmid', 'role', 'aria-label','value']; + attributes_to_values["additional_info"]=[] + for (const child of children) { + let children_attributes_to_values = {}; + + for (let attr of child.attributes) { + // If the attribute is not in the predefined list, add it to children_attributes_to_values + if (attributes_to_include.includes(attr.name)) { + children_attributes_to_values[attr.name] = attr.value; + } + } + + attributes_to_values["additional_info"].push(children_attributes_to_values); + } + } + // Check if attributes_to_values contains more than just 'name', 'role', and 'mmid' + const keys = Object.keys(attributes_to_values); + const minimalKeys = ['tag', 'mmid']; + const hasMoreThanMinimalKeys = keys.length > minimalKeys.length || keys.some(key => !minimalKeys.includes(key)); + + 
if (!hasMoreThanMinimalKeys) { + //If there were no attributes found, then try to get the backup attributes + for (const backupAttribute of input_params.backup_attributes) { + let value = element.getAttribute(backupAttribute); + if(value){ + attributes_to_values[backupAttribute] = value; + } + } + + //if even the backup attributes are not found, then return null, which will cause this element to be skipped + if(Object.keys(attributes_to_values).length <= minimalKeys.length) { + if (element.tagName.toLowerCase() === 'button') { + attributes_to_values["mmid"] = element.getAttribute('mmid'); + attributes_to_values["role"] = "button"; + attributes_to_values["additional_info"] = []; + let children=element.children; + let attributes_to_exclude = ['width', 'height', 'path', 'class', 'viewBox', 'mmid'] + + // Check if the button has no text and no attributes + if (element.innerText.trim() === '') { + + for (const child of children) { + let children_attributes_to_values = {}; + + for (let attr of child.attributes) { + // If the attribute is not in the predefined list, add it to children_attributes_to_values + if (!attributes_to_exclude.includes(attr.name)) { + children_attributes_to_values[attr.name] = attr.value; + } + } + + attributes_to_values["additional_info"].push(children_attributes_to_values); + } + console.log("Button with no text and no attributes: ", attributes_to_values); + return attributes_to_values; + } + } + + return null; // Return null if only minimal keys are present + } + } + return attributes_to_values; + } + """ + + # Fetch attributes and possibly 'innerText' from the DOM element by 'mmid' + element_attributes = await page.evaluate(js_code, + {"mmid": mmid, "attributes": attributes, "backup_attributes": backup_attributes, + "should_fetch_inner_text": should_fetch_inner_text, + "tags_to_ignore": tags_to_ignore, + "ids_to_ignore": ids_to_ignore}) + + if 'keyshortcuts' in node: + del node['keyshortcuts'] #remove keyshortcuts since it is not needed + + 
node["mmid"]=mmid + + # Update the node with fetched information + if element_attributes: + node.update(element_attributes) + + # check if 'name' and 'mmid' are the same + if node.get('name') == node.get('mmid') and node.get('role') != "textbox": + del node['name'] # Remove 'name' from the node + + if 'name' in node and 'description' in node and (node['name'] == node['description'] or node['name'] == node['description'].replace('\n', ' ') or node['description'].replace('\n', '') in node['name']): + del node['description'] #if the name is same as description, then remove the description to avoid duplication + + if 'name' in node and 'aria-label' in node and node['aria-label'] in node['name']: + del node['aria-label'] #if the name is same as the aria-label, then remove the aria-label to avoid duplication + + if 'name' in node and 'text' in node and node['name'] == node['text']: + del node['text'] #if the name is same as the text, then remove the text to avoid duplication + + if node.get('tag') == "select": #children are not needed for select menus since "options" attriburte is already added + node.pop("children", None) + node.pop("role", None) + node.pop("description", None) + + #role and tag can have the same info. 
async def __cleanup_dom(page: Page):
    """
    Cleans up the DOM by removing the injected 'aria-keyshortcuts' attribute from every element that
    carries an 'mmid', and restoring any original 'aria-keyshortcuts' value saved in 'orig-aria-keyshortcuts'.

    The 'mmid' attribute itself is left in place; only the aria attribute is reverted.
    """
    logger.debug("Cleaning up the DOM's previous injections")
    await page.evaluate("""() => {
        const allElements = document.querySelectorAll('*[mmid]');
        allElements.forEach(element => {
            element.removeAttribute('aria-keyshortcuts');
            const origAriaLabel = element.getAttribute('orig-aria-keyshortcuts');
            if (origAriaLabel) {
                element.setAttribute('aria-keyshortcuts', origAriaLabel);
                element.removeAttribute('orig-aria-keyshortcuts');
            }
        });
    }""")
    logger.debug("DOM cleanup complete")
def __should_prune_node(node: dict[str, Any], only_input_fields: bool):
    """
    Determines if a node should be pruned based on its 'role', 'tag', 'name' and 'children' entries.

    A node is pruned when any of the following holds:
    - `only_input_fields` is set, the role is not "WebArea", and the node is neither an
      input/button/textarea tag nor a node with role "button";
    - it is a 'generic' node with no children and no (non-empty) name;
    - its role is 'separator' or 'LineBreak';
    - it consists of exactly 'name' and 'role', unless it is a "text" node whose name still has
      at least 3 characters after removing commas, colons, newlines and surrounding whitespace.

    Args:
        node (dict[str, Any]): The node to be evaluated.
        only_input_fields (bool): Flag indicating whether only input fields should be considered.

    Returns:
        bool: True if the node should be pruned, False otherwise.
    """
    role = node.get("role")

    # If the request is for only input fields and this is not an input field, prune the node
    if only_input_fields and role != "WebArea":
        is_input_like = node.get("tag") in ("input", "button", "textarea") or role == "button"
        if not is_input_like:
            return True

    # 'children' has already been deleted by the caller when it ended up empty
    if role == 'generic' and 'children' not in node and not node.get('name'):
        return True

    if role in ('separator', 'LineBreak'):
        return True

    # A node that only has a name and a role carries no actionable information,
    # except for text nodes with a meaningful amount of text.
    if len(node) == 2 and 'name' in node and 'role' in node:
        if role != "text":
            return True
        raw_name = node.get('name')
        # Tolerate a missing/None or non-string name instead of raising AttributeError
        name_text = "" if raw_name is None else str(raw_name)
        name_text = name_text.translate(str.maketrans('', '', ',:\n')).strip()
        return len(name_text) < 3

    return False
async def do_get_accessibility_info(page: Page, only_input_fields: bool = False):
    """
    Retrieves the accessibility information of a web page and saves it as JSON files.

    Steps: inject 'mmid'/'aria-keyshortcuts' attributes, take the accessibility snapshot, write it to
    'json_accessibility_dom.json', revert the injected aria attribute, enrich the tree with DOM details,
    and write the result to 'json_accessibility_dom_enriched.json'. Both files are written under
    SOURCE_LOG_FOLDER_PATH.

    Args:
        page (Page): The page object representing the web page.
        only_input_fields (bool, optional): If True, only retrieves accessibility information for input fields.
            Defaults to False.

    Returns:
        dict[str, Any] or None: The enhanced accessibility tree as a dictionary, or None if an error occurred
        while enriching the tree or writing the enriched file. Errors raised by the snapshot or by writing
        the raw snapshot file propagate to the caller.
    """
    await __inject_attributes(page)
    try:
        accessibility_tree: dict[str, Any] = await page.accessibility.snapshot(interesting_only=True) # type: ignore

        with open(os.path.join(SOURCE_LOG_FOLDER_PATH, 'json_accessibility_dom.json'), 'w', encoding='utf-8') as f:
            f.write(json.dumps(accessibility_tree, indent=2))
            logger.debug("json_accessibility_dom.json saved")
    finally:
        # Always revert the injected 'aria-keyshortcuts', even when the snapshot or the log write
        # fails; otherwise the page keeps the overwritten attribute values.
        await __cleanup_dom(page)

    try:
        enhanced_tree = await __fetch_dom_info(page, accessibility_tree, only_input_fields)

        logger.debug("Enhanced Accessibility Tree ready")

        with open(os.path.join(SOURCE_LOG_FOLDER_PATH, 'json_accessibility_dom_enriched.json'), 'w', encoding='utf-8') as f:
            f.write(json.dumps(enhanced_tree, indent=2))
            logger.debug("json_accessibility_dom_enriched.json saved")

        return enhanced_tree
    except Exception as e:
        # Best-effort: callers receive None instead of an exception from the enrichment stage
        logger.error(f"Error while fetching DOM info: {e}")
        traceback.print_exc()
        return None
def escape_js_message(message: str) -> str:
    """
    Turn a message into a quoted, escaped string literal suitable for embedding in JavaScript code.

    Args:
        message (str): The raw message text.

    Returns:
        str: The JSON encoding of the message, including the surrounding double quotes.
    """
    js_literal = json.dumps(message)
    return js_literal
# Function to set log level
def set_log_level(level: str) -> None:
    """
    Set the log level for the logger.

    Delegates to `configure_logger`, so besides changing the level this also rebuilds the
    logger's handler and formatter.

    Parameters:
    - level (str): A logging level such as 'debug', 'info', 'warning', 'error', or 'critical'.
    """
    configure_logger(level)
def parse_response(message: str) -> dict[str, Any]:
    """
    Parse the response from the browser agent and return the response as a dictionary.

    Surrounding whitespace and an optional Markdown code fence (three backticks, optionally tagged
    "json") are removed before the payload is decoded as JSON. If decoding fails, a best-effort
    string-matching fallback extracts the "plan", "next_step", "terminate" and "final_response"
    fields from the raw text.

    Args:
        message (str): The raw LLM reply.

    Returns:
        dict[str, Any]: The decoded JSON, or the fields recovered by the fallback (possibly empty).
    """
    json_response: dict[str, Any] = {}
    # Strip first so that a fence preceded or followed by whitespace (e.g. a trailing newline
    # after the closing fence) is still recognised.
    message = message.strip()
    #if message starts with ``` and ends with ``` then remove them
    if message.startswith("```"):
        message = message[3:]
    if message.endswith("```"):
        message = message[:-3]
    if message.startswith("json"):
        message = message[4:]

    message = message.strip()
    try:
        json_response = json.loads(message)
    except Exception as e:
        # If the response is not a valid JSON, try to parse it using string matching.
        # This should seldom be triggered, and is deliberately best-effort.
        logger.warning(f"LLM response was not properly formed JSON. Will try to use it as is. LLM response: \"{message}\". Error: {e}")
        message = message.replace("\\n", "\n")
        message = message.replace("\n", " ") # type: ignore
        if ("plan" in message and "next_step" in message):
            start = message.index("plan") + len("plan")
            end = message.index("next_step")
            json_response["plan"] = message[start:end].replace('"', '').strip()
        if ("next_step" in message and "terminate" in message):
            start = message.index("next_step") + len("next_step")
            end = message.index("terminate")
            json_response["next_step"] = message[start:end].replace('"', '').strip()
        if ("terminate" in message and "final_response" in message):
            start = message.index("terminate") + len("terminate")
            end = message.index("final_response")
            matched_string = message[start:end].replace('"', '').strip()
            json_response["terminate"] = "yes" if "yes" in matched_string else "no"

            start = message.index("final_response") + len("final_response")
            # NOTE(review): the last character is dropped, presumably the closing brace - confirm
            end = len(message) - 1
            json_response["final_response"] = message[start:end].replace('"', '').strip()

        elif ("terminate" in message):
            start = message.index("terminate") + len("terminate")
            end = len(message) - 1
            matched_string = message[start:end].replace('"', '').strip()
            json_response["terminate"] = "yes" if "yes" in matched_string else "no"

    return json_response
+"""base class for evaluation""" +import collections +import html +import time +import urllib +import urllib.parse +from Agent_E.test.test_utils import clean_answer +from Agent_E.test.test_utils import evaluate_exact_match +from Agent_E.test.test_utils import evaluate_fuzzy_match +from Agent_E.test.test_utils import evaluate_must_include +from Agent_E.test.test_utils import evaluate_ua_match +from typing import Any + +from Agent_E.ae.utils.logger import logger +from playwright.sync_api import CDPSession +from playwright.sync_api import Page +from termcolor import colored + + +class Evaluator: + """Base class for evaluation strategies. + + Attributes: + eval_tag (str): A tag to identify or categorize the evaluator. + """ + + def __init__(self, eval_tag: str = "") -> None: + """Initialize the evaluator with an optional evaluation tag.""" + self.eval_tag = eval_tag + + async def __call__(self, task_config: dict[str, Any], page: Page, client: CDPSession, answer: str) -> dict[str, float|str]: + """Abstract method to be implemented by subclasses for evaluation. + + Raises: + NotImplementedError: This method should be overridden by subclasses. + """ + raise NotImplementedError("This method should be overridden by subclasses.") + + +class StringEvaluator(Evaluator): + """Evaluates string-based answers using various matching criteria. + + Supports exact matches, some matches, fuzzy matching using LLM, and unachievable task matching. 
+ """ + + async def __call__( + self, + task_config: dict[str, Any], + page: Page | None = None, + client: CDPSession | None = None, + answer: str | None = None, + + ) -> dict[str, float|str]: + last_action = answer or "" + pred = clean_answer(last_action) + + score = 1.0 + for approach, value in task_config["eval"]["reference_answers"].items(): + + match approach: + case "exact_match": + logger.info(f"Evaluating exact_match for answer: Predicted: {pred} , Reference: {value}") + score *= evaluate_exact_match(ref=value, pred=pred) + + case "must_include": + logger.info(f"Evaluating must_include for answer: \"{answer}\" to see if it includes the expeced values: \"{value}\"\n") + assert isinstance(value, list) + for must_value in value: # type: ignore + score *= evaluate_must_include( + ref=must_value, # type: ignore + pred=pred, + tokenize=(len(value) == 1), # type: ignore + ) + case "some_matches": + min_required_matches = value.get("min_required", 1) + matches = sum(evaluate_must_include(ref=phrase, pred=pred, tokenize=False) for phrase in value["phrases"]) + score *= float(matches >= min_required_matches) + case "fuzzy_match": + logger.info(f"Evaluating fuzzy_match for answer: {answer}") + intent = task_config["intent"] + if value == "N/A": + # if the instruction only asks the model to generate N/A when encountering an unachievable task + # without more concrete reasons + score *= evaluate_exact_match(ref=value, pred=pred) + # if the instruction also asks the model to generate the reason why the task is unachievable + # this should be the default as it will prevent false positive N/A` + if score != 1: + score = 1.0 * evaluate_ua_match( + intent=task_config["intent"], + ref=task_config["eval"]["string_note"], + pred=pred, + ) + else: + logger.info(f"Evaluating generic for answer: {answer}") + assert isinstance(value, list) + for reference in value: # type: ignore + score *= evaluate_fuzzy_match( + ref=reference, pred=pred, intent=intent # type: ignore + ) + case _: 
+ logger.info(f"Unknown approach value received: {approach}") + return {"score": score} + + +class URLEvaluator(Evaluator): + """Evaluates if the given URL matches the expected URL criteria defined in the configuration. + + This includes checking if the base path of the URL and its query parameters match those specified in the reference URLs. + """ + + async def __call__( + self, + task_config: dict[str, Any], + page: Page, + client: CDPSession | None = None, + answer: str | None = None + ) -> dict[str, float|str]: + """Evaluates the current page URL against reference URLs specified in the config file. + + Parameters: + task_config (dict[str, Any]): The task configuration containing evaluation criteria. + page (Page): The Playwright page object for the current webpage. + client (CDPSession | None, optional): The Chrome DevTools Protocol session object. Not used in this evaluator. + answer (str | None, optional): Not used in this evaluator. + + Returns: + dict[str, float|str]: "score" 1.0 if the page URL matches any of the reference URLs, considering the matching rule; otherwise 0.0. + + Raises: + ValueError: If an unknown matching rule is specified in the config file. 
+ """ + + def clean_url(url: str) -> str: + url = str(url) + url = url.rstrip("/") + url = url.lower() + return url + + def parse_url(url: str) -> tuple[str, dict[str, list[str]]]: + """Parse a URL into its base, path, and query components.""" + parsed_url = urllib.parse.urlparse(url) + base_path = parsed_url.netloc + parsed_url.path + query = urllib.parse.parse_qs(parsed_url.query) + return base_path, query + + def parse_urls( + urls: list[str], + ) -> tuple[list[str], dict[str, set[str]]]: + """Parse a list of URLs.""" + base_paths: list[str] = [] + queries: dict[str, set[str]] = collections.defaultdict(set) + for url in urls: + base_path, query = parse_url(url) + base_paths.append(base_path) + for k, v in query.items(): + queries[k].update(v) + return base_paths, queries + + pred = clean_url(page.url) + ref_urls = task_config["eval"]["reference_url"].split(" |OR| ") + ref_urls = [clean_url(url) for url in ref_urls] + matching_rule = task_config["eval"].get("url_note", "GOLD in PRED") + if matching_rule == "GOLD in PRED": + for ref_url in ref_urls: + ref_base_path, ref_query = parse_url(ref_url) + pred_base_paths, pred_query = parse_url(pred) + # Calculate base score for each ref_url + base_score = float(ref_base_path in pred_base_paths) + # Calculate query score for each ref_url + query_score = 1.0 + for k, possible_values in ref_query.items(): # type: ignore + if k in pred_query: + query_score *= float( + any( + possible_ref_value in pred_query.get(k, []) # type: ignore + for possible_ref_value in possible_values # type: ignore + ) + ) + else: + # If the key is not in pred_query, check if the reference URL has no query parameters + if not possible_values: + query_score *= 1.0 # No query parameters to match, so consider it a match + else: + query_score *= 0.0 # Reference URL has query parameters but predicted URL does not + # Calculate final score for each ref_url + score = base_score * query_score + # Return immediately if any score is 1 + if score == 1.0: + 
class HTMLContentEvaluator(Evaluator):
    """Evaluates if specified HTML content or elements appear on the webpage.

    This involves navigating to URLs specified in the configuration and checking for the presence of HTML elements or content using various strategies.
    """

    async def __call__(
        self,
        task_config: dict[str, Any],
        page: Page,
        client: CDPSession | None = None,
        answer: str | None = None
    ) -> dict[str, float|str]:
        """Evaluates the presence of specified HTML content on the webpage.

        Parameters:
            task_config (dict[str, Any]): The task configuration containing evaluation criteria
                (reads ``eval.program_html``).
            page (Page): The Playwright page object for the current webpage. It is awaited
                throughout, so it must be an async-API page.
            client (CDPSession | None, optional): The Chrome DevTools Protocol session object. Not used in this evaluator.
            answer (str | None, optional): Not used in this evaluator.

        Returns:
            dict[str, float|str]: "score" A score between 0.0 and 1.0 representing the presence of required HTML content on the webpage.

        Raises:
            ValueError: If an unknown locator strategy or an unknown ``required_contents`` key is specified in the config file.
        """
        # Function-scope import keeps the block self-contained (standard library only).
        import asyncio

        targets = task_config["eval"]["program_html"]

        score = 1.0
        for target in targets:
            target_url: str = target["url"]  # which url to check
            if target_url.startswith("func"):
                func = target_url.split("func:")[1]
                func = func.replace("__last_url__", page.url)
                # SECURITY: eval() runs text from the task config; only use trusted config files.
                target_url = eval(func)

            locator: str = target["locator"]  # js element locator

            # navigate to that url
            if target_url != "last":
                # The navigation must be awaited, otherwise it never happens on an async page.
                await page.goto(target_url)
                # Non-blocking pause so the event loop keeps running while the page settles.
                await asyncio.sleep(3)

            # empty, use the full page
            if not locator.strip():
                selected_element = await page.content()
            # use JS to select the element
            elif locator.startswith(("document.", "[...document.", "jsblock:")):
                if "prep_actions" in target:
                    try:
                        for prep_action in target["prep_actions"]:
                            await page.evaluate(f"() => {prep_action}")
                    except Exception as e:
                        # Best effort: a failing prep action must not abort the evaluation,
                        # but it should leave a trace instead of vanishing silently.
                        logger.warning(f"Prep action failed and was skipped: {e}")
                try:
                    if locator.startswith("jsblock:"):
                        locator = locator.split("jsblock:")[1]

                    selected_element = str(await page.evaluate(f"() => {locator}"))
                    if not selected_element:
                        selected_element = ""
                except Exception:
                    # the page is wrong, return empty
                    selected_element = ""
            # run program to call API
            elif locator.startswith("func:"):  # a helper function
                func = locator.split("func:")[1]
                func = func.replace("__page__", "page")
                # SECURITY: eval() runs text from the task config; only use trusted config files.
                # NOTE(review): if the helper is a coroutine function its result would need awaiting - confirm.
                selected_element = eval(func)
            else:
                raise ValueError(f"Unknown locator: {locator}")

            selected_element = html.unescape(selected_element)

            if "exact_match" in target["required_contents"]:
                required_contents = target["required_contents"]["exact_match"]
                cur_score = evaluate_exact_match(
                    ref=required_contents, pred=selected_element
                )
                score *= float(cur_score)
            elif "must_include" in target["required_contents"]:
                required_contents = target["required_contents"]["must_include"]
                assert isinstance(required_contents, list)
                for content in required_contents:  # type: ignore
                    # Each entry may list alternatives separated by " |OR| "; any one suffices.
                    content_or = content.split(" |OR| ")  # type: ignore
                    cur_score = any(
                        evaluate_must_include(
                            ref=alternative,  # type: ignore
                            pred=selected_element,
                            tokenize=False,
                        )
                        for alternative in content_or  # type: ignore
                    )
                    score *= float(cur_score)
            else:
                raise ValueError(
                    f"Unknown required_contents: {target['required_contents'].keys()}"
                )
        return {"score": score}
class EvaluatorComb(Evaluator):
    """Combines multiple evaluators to perform a comprehensive evaluation based on different criteria.

    Attributes:
        evaluators (list[Evaluator]): A list of evaluator instances to be used for evaluation.
    """

    def __init__(self, evaluators: list[Evaluator]) -> None:
        """Initializes the composite evaluator with a list of individual evaluators.

        Parameters:
            evaluators (list[Evaluator]): The list of evaluators to include in the composite evaluation.
        """
        # Run the base initializer so the attributes it sets (eval_tag) exist here too.
        super().__init__()
        self.evaluators = evaluators

    async def __call__(
        self,
        task_config: dict[str, Any],
        page: Page,
        client: CDPSession,
        answer: str,
    ) -> dict[str, float|str]:
        """Performs the evaluation using all included evaluators and aggregates their scores.

        Parameters:
            task_config (dict[str, Any]): The task configuration containing evaluation criteria.
            page (Page): The Playwright page object for the current webpage.
            client (CDPSession): The Chrome DevTools Protocol session object.
            answer (str): The answer or content to be evaluated.

        Returns:
            dict[str, float|str]: "score" - The product of the scores of all evaluators. "reason" - The reasons reported by the evaluators joined by newlines, or None when no evaluator reported one.
        """
        score: float = 1.0
        reason: str | None = None
        for evaluator in self.evaluators:
            eval_result = await evaluator(task_config, page, client, answer)
            score = score * eval_result["score"]  # type: ignore
            if "reason" in eval_result:
                if reason is None:
                    reason = eval_result["reason"]  # type: ignore
                else:
                    reason += f"\n{eval_result['reason']}"
        return {"score": score, "reason": reason}  # type: ignore
def llm_fuzzy_match(pred: str, reference: str, question: str) -> float:
    """
    Evaluates if a predicted answer matches a reference answer semantically, considering the context of a question.

    The grading prompt is sent to the "gpt-4-turbo-preview" model through
    generate_from_openai_chat_completion and the lower-cased reply is scanned
    for the verdict keywords.

    Parameters:
        pred (str): The student's predicted answer.
        reference (str): The reference answer to compare against.
        question (str): The question related to the answers.

    Returns:
        float: Returns 1.0 if the reply says "correct", 0.0 if it says "incorrect" or "partially correct".

    Raises:
        AssertionError: If the reply contains none of the expected verdict keywords.
    """
    # construct the question to ask
    message = "Help a teacher to grade the answer of a student given a question. Keep in mind that the student may use different phrasing or wording to answer the question. The goal is to evaluate whether the answer is semantically equivalent to the reference answer.\n"
    message += f"question: {question}\n"
    message += f"reference answer: {reference}\n"
    message += "all the string 'N/A' that you see is a special sequence that means 'not achievable'\n"
    message += f"student answer: {pred}\n"
    message += "Conclude the judgement by correct/incorrect/partially correct."
    messages: list[dict[str, Any]] = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": message},
    ]

    response = generate_from_openai_chat_completion(
        model="gpt-4-turbo-preview",
        messages=messages,
        temperature=0,
        max_tokens=768,
        top_p=1.0,
        context_length=0,
    ).lower()
    # "incorrect" and "partially correct" both contain "correct", so test them first.
    if "partially correct" in response or "incorrect" in response:
        return 0.0
    # Explicit raise (same exception type as the former assert) so the guard
    # survives `python -O`; otherwise an unparseable reply would be graded 1.0.
    if "correct" not in response:
        raise AssertionError(f"Unexpected grading reply: {response!r}")
    return 1.0


def llm_ua_match(pred: str, reference: str, question: str) -> float:
    """
    Evaluates the alignment between a reported reason for a task being unachievable and the actual reason.

    The comparison prompt is sent to the "gpt-4-turbo-preview" model through
    generate_from_openai_chat_completion and the lower-cased reply is scanned
    for "same" or "different".

    Parameters:
        pred (str): The reported unachievable reason by an individual.
        reference (str): The actual reason why the task is unachievable.
        question (str): The task in question.

    Returns:
        float: Returns 1.0 if the reply says "same", 0.0 if it says "different".

    Raises:
        AssertionError: If the reply contains neither keyword.
    """
    # construct the question to ask
    message = ""
    message += f"task: {question}\n"
    message += f"actual unachievable reason: {reference}\n"
    message += f"reported unachievable reason: {pred}\n"
    message += (
        "The task described above is inherently unachievable due to the reason specified under 'actual unachievable reason'. "
        "An individual previously attempted this task and was unable to complete it. They provided a reason for their failure, "
        "which is listed under 'reported unachievable reason'. Your role is to review both the actual and reported reasons. "
        "Determine if the reported reason aligns with the actual reason, even if implicitly. "
        "If the stated reason is in line with the actual reason, respond with 'same'. Otherwise, respond with 'different'."
    )
    messages: list[dict[str, Any]] = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": message},
    ]

    response = generate_from_openai_chat_completion(
        model="gpt-4-turbo-preview",
        messages=messages,
        temperature=0,
        max_tokens=768,
        top_p=1.0,
        context_length=0,
    ).lower()
    if "different" in response:
        return 0.0
    # Explicit raise (same exception type as the former assert) so the guard
    # survives `python -O`; otherwise an unparseable reply would be graded 1.0.
    if "same" not in response:
        raise AssertionError(f"Unexpected comparison reply: {response!r}")
    return 1.0
def clean_answer(answer: str) -> str:
    """Normalize an answer string before comparison.

    Parameters:
        answer (str): The raw answer.

    Returns:
        str: The answer with surrounding whitespace removed, then surrounding
        double quotes, then surrounding single quotes, finally lowercased.
    """
    trimmed = answer.strip()
    unquoted = trimmed.strip('"').strip("'")
    return unquoted.lower()


def evaluate_exact_match(ref: str, pred: str) -> float:
    """Tell whether prediction and reference are equal after cleaning.

    Parameters:
        ref (str): The reference answer.
        pred (str): The predicted answer.

    Returns:
        float: 1.0 when both cleaned strings are equal, otherwise 0.0.
    """
    is_same = clean_answer(pred) == clean_answer(ref)
    return 1.0 if is_same else 0.0


def evaluate_must_include(ref: str, pred: str, tokenize: bool = False) -> float:
    """Tell whether the cleaned reference occurs in the cleaned prediction.

    Parameters:
        ref (str): The phrase that must be present.
        pred (str): The predicted answer to search.
        tokenize (bool, optional): When True and the cleaned reference is a
            single character, the prediction is tokenized and the reference
            must equal one of the tokens. Default is False.

    Returns:
        float: 1.0 if the reference is found, otherwise 0.0.
    """
    needle = clean_answer(ref)
    haystack = clean_answer(pred)
    if not (tokenize and len(needle) == 1):
        # Plain substring containment.
        return float(needle in haystack)
    # Token-level match for a one-character reference.
    return float(needle in word_tokenize(haystack))


def evaluate_fuzzy_match(ref: str, pred: str, intent: str) -> float:
    """Delegate semantic comparison of prediction and reference to the LLM grader.

    Parameters:
        ref (str): The reference answer.
        pred (str): The predicted answer.
        intent (str): The intent or context of the question.

    Returns:
        float: Whatever llm_fuzzy_match returns for these arguments.
    """
    return llm_fuzzy_match(pred=pred, reference=ref, question=intent)
+ """ + return llm_ua_match(pred, ref, intent) + + +def load_config(config_file: Path | str) -> list[dict[str, Any]]: + """Load the confiufiguration for the test cases + + Args: + config_file (Path | str): Path to the config file + + Returns: + list[dict[str, Any]]: All the test cases in the config file + """ + with open(config_file, "r") as f: # noqa: UP015 + configs = json.load(f) + return configs + +def task_config_validator(task_config: dict[str, Any]) -> bool: + # Access the attributes + command = task_config.get('intent') + + if not command: + raise ValueError("Intent is missing in the task config file. Without it the task cannot be run.") + + return True + +def get_formatted_current_timestamp(format: str = "%Y-%m-%d %H:%M:%S") -> str: + """Get the current timestamp in the specified format. + + Args: + format (str, optional): The format of the timestamp. Defaults to "%Y-%m-%d %H:%M:%S". + + Returns: + str: The current timestamp in the specified format. + """ + # Get the current time + current_time = datetime.now() + + # Format the timestamp as a human-readable string + timestamp_str = current_time.strftime(format) + return timestamp_str diff --git a/Agent_E/test/tests_processor.py b/Agent_E/test/tests_processor.py new file mode 100644 index 0000000..28b6393 --- /dev/null +++ b/Agent_E/test/tests_processor.py @@ -0,0 +1,409 @@ +import asyncio +import json +import os +import time +from Agent_E.ae.core.agents_llm_config import AgentsLLMConfig +from Agent_E.test.test_utils import get_formatted_current_timestamp +from Agent_E.test.test_utils import load_config +from Agent_E.test.test_utils import task_config_validator +from typing import Any + +import Agent_E.ae.core.playwright_manager as browserManager +import nltk # type: ignore +from Agent_E.ae.config import PROJECT_TEST_ROOT +from Agent_E.ae.core.autogen_wrapper import AutogenWrapper +from Agent_E.ae.core.playwright_manager import PlaywrightManager +from Agent_E.ae.utils.logger import logger +from 
Agent_E.ae.utils.response_parser import parse_response +from autogen.agentchat.chat import ChatResult # type: ignore +from playwright.async_api import Page +from tabulate import tabulate +from termcolor import colored + +from evaluation_harness.evaluators import evaluator_router + +nltk.download('punkt') # type: ignore + +last_agent_response = "" + +def check_top_level_test_folders(test_log_dir, test_result_dir): + if not os.path.exists(test_log_dir): + os.makedirs(test_log_dir) + logger.info(f"Created log folder at: {test_log_dir}") + + if not os.path.exists(test_result_dir): + os.makedirs(test_result_dir) + logger.info(f"Created scores folder at: {test_result_dir}") + +def create_task_log_folders(test_log_dir, task_id): + task_log_dir = os.path.join(test_log_dir, task_id) + task_screenshots_dir = os.path.join(task_log_dir, 'snapshots') + if not os.path.exists(task_log_dir): + os.makedirs(task_log_dir) + logger.info(f"Created log dir for task {task_id} at: {task_log_dir}") + if not os.path.exists(task_screenshots_dir): + os.makedirs(task_screenshots_dir) + logger.info(f"Created screenshots dir for task {task_id} at: {task_screenshots_dir}") + + return {"task_log_folder": task_log_dir, "task_screenshots_folder": task_screenshots_dir} + + +def create_results_dir(test_file: str, test_results_id: str|None) -> str: + results_dir = "" + if test_results_id: + results_dir = os.path.join(TEST_RESULTS, f"results_for_{test_results_id}") + else: + test_file_base = os.path.basename(test_file) + test_file_name = os.path.splitext(test_file_base)[0] + results_dir = os.path.join(TEST_RESULTS, f"results_for_test_file_{test_file_name}") + + if not os.path.exists(results_dir): + os.makedirs(results_dir) + logger.info(f"Created results directory: {results_dir}") + + return results_dir + + +def dump_log(task_id: str, messages_str_keys: dict[str, str], logs_dir: str): + file_name = os.path.join(logs_dir, f'execution_logs_{task_id}.json') + with open(file_name, 'w', encoding='utf-8') as 
f: + json.dump(messages_str_keys, f, ensure_ascii=False, indent=4) + + +def save_test_results(test_results: list[dict[str, str | int | float | None]], test_results_id: str): + file_name = os.path.join(TEST_RESULTS, f'test_results_{test_results_id}.json') + with open(file_name, 'w', encoding='utf-8') as f: + json.dump(test_results, f, ensure_ascii=False, indent=4) + logger.info(f"Test results dumped to: {file_name}") + + +def save_individual_test_result(test_result: dict[str, str | int | float | None], results_dir: str): + task_id = test_result["task_id"] + file_name = os.path.join(results_dir, f'{task_id}.json') + with open(file_name, 'w', encoding='utf-8') as f: + json.dump(test_result, f, ensure_ascii=False, indent=4) + logger.info(f"Test result for task {task_id} dumped to: {file_name}") + + +def extract_last_response(messages: list[dict[str, Any]]) -> str: + """Extract the last response message from chat history.""" + try: + # Iterate over the messages in reverse order + for message in reversed(messages): + if message and 'content' in message: + content=message.get('content', "") + content_json = parse_response(content) + final_answer = content_json.get('final_response', None) + if final_answer: + return final_answer + return "" + except: + logger.error("Error extracting last response from chat history.") + return "" + + +def print_progress_bar(current: int, total: int, bar_length: int = 50) -> None: + """ + Prints a progress bar to the console. + + Parameters: + - current (int): The current progress of the task. + - total (int): The total number of tasks to complete. + - bar_length (int): The character length of the progress bar (default is 50). + + This function dynamically updates a single line in the console to reflect current progress. 
+ + """ + percent = float(current) * 100 / total + arrow = '-' * int(percent/100 * bar_length - 1) + '>' + spaces = ' ' * (bar_length - len(arrow)) + + print(f'\rProgress: [{arrow}{spaces}] {current}/{total} ({percent:.2f}%)', end='') + +def determine_status_and_color(score: float) -> tuple[str, str]: + """ + Determines the status and color for a test result based on the score. + + Parameters: + - score (float): The score of the test task, indicating success (1), failure (0), or skip (-0.1). + + Returns: + - tuple[str, str]: A tuple containing the status ('Pass', 'Fail', or 'Skip') and the corresponding color ('green', 'red', or 'yellow'). + + """ + if score == 1: + return 'Pass', 'green' + elif score < 0: + return 'Skip', 'yellow' + else: + return 'Fail', 'red' + + +def print_test_result(task_result: dict[str, str | int | float | None], index: int, total: int) -> None: + """ + Prints the result of a single test task in a tabulated format. + + Parameters: + - task_result (dict): A dictionary containing the task's evaluation results, including task ID, intent, score, and total command time. + - index (int): The current index of the test in the sequence of all tests being run. + - total (int): The total number of tests to be run. + + The function determines the test status (Pass/Fail) based on the 'score' key in task_result and prints the result with colored status. 
def print_test_result(task_result: dict[str, str | int | float | None], index: int, total: int) -> None:
    """
    Prints the result of a single test task in a tabulated format.

    Parameters:
    - task_result (dict): A dictionary containing the task's evaluation results, including task ID, intent, score, and total command time.
    - index (int): The current index of the test in the sequence of all tests being run.
    - total (int): The total number of tests to be run.

    The function determines the test status (Pass/Fail/Skip) based on the 'score' key in task_result and prints the result with colored status.
    A result without a score (for example a task that errored before evaluation) is reported as 'Fail'.

    """
    # Errored tasks may carry no score at all; treat them as failed instead of raising KeyError.
    score = task_result.get('score')
    status, color = determine_status_and_color(0 if score is None else score)  # type: ignore

    cost = task_result.get("compute_cost", None)
    total_cost = None if cost is None else round(cost.get("cost", -1), 4)  # type: ignore
    total_tokens = None if cost is None else cost.get("total_tokens", -1)  # type: ignore
    result_table = [  # type: ignore
        ['Test Index', 'Task ID', 'Intent', 'Status', 'Time Taken (s)', 'Total Tokens', 'Total Cost ($)'],
        [index, task_result['task_id'], task_result['intent'], colored(status, color), round(task_result['tct'], 2), total_tokens, total_cost]  # type: ignore
    ]
    print('\n' + tabulate(result_table, headers='firstrow', tablefmt='grid'))  # type: ignore


def get_command_exec_cost(command_exec_result: ChatResult):
    """
    Extracts cost and token usage from the result of a command execution.

    The usage is read from ``command_exec_result.cost``, preferring the entry
    ``usage_including_cached_inference`` over ``usage_excluding_cached_inference``.
    Every nested dictionary that has a ``prompt_tokens`` key is treated as one model's
    usage, and the figures of all such entries are summed.

    Parameters:
    - command_exec_result (ChatResult): Object exposing a ``cost`` mapping.

    Returns:
    - dict: Keys ``cost``, ``prompt_tokens``, ``completion_tokens`` and ``total_tokens``;
      empty (or partial) when the usage information is missing or malformed.
    """
    output: dict[str, Any] = {}
    try:
        cost = command_exec_result.cost  # type: ignore
        usage: dict[str, Any] | None = None
        for usage_key in ("usage_including_cached_inference", "usage_excluding_cached_inference"):
            if usage_key in cost:
                usage = cost[usage_key]
                break
        if usage is None:
            raise ValueError("Cost not found in the command execution result.")
        print("Usage: ", usage)

        for model_usage in usage.values():
            # Scalar entries are skipped; only per-model dictionaries are counted.
            if isinstance(model_usage, dict) and "prompt_tokens" in model_usage:
                # Accumulate so that a run spanning several models is not reduced to the last one.
                for field in ("cost", "prompt_tokens", "completion_tokens", "total_tokens"):
                    output[field] = output.get(field, 0) + model_usage[field]
    except Exception as e:
        # Cost reporting is best effort; the caller receives whatever was collected so far.
        logger.debug(f"Error getting command execution cost: {e}")
    return output
async def execute_single_task(task_config_file: str, browser_manager: PlaywrightManager, ag: AutogenWrapper, page: Page, logs_dir: str) -> dict[str, Any]:
    """
    Executes a single test task based on a specified task configuration and evaluates its performance.

    Parameters:
    - task_config_file (str): Path to the JSON task configuration (keys read here: intent, task_id, task_index, start_url).
    - browser_manager (PlaywrightManager): The manager handling browser interactions, responsible for page navigation and control.
    - ag (AutogenWrapper): The automation generator wrapper that processes commands and interacts with the web page.
    - page (Page): The Playwright page object representing the browser tab where the task is executed.
    - logs_dir (str): Directory that receives the dumped chat log of the task.

    Returns:
    - dict: A dictionary containing the task's evaluation results, including task ID, intent, score, total command time (tct),
      the last statement from the chat agent, and the last URL accessed during the task. When post-processing fails the
      dictionary additionally holds an "error" entry, and "score" defaults to 0 so the task is reported as failed.
    """
    start_ts = get_formatted_current_timestamp()

    # Context manager closes the config file right away instead of leaking the handle.
    with open(task_config_file, "r", encoding="utf-8") as config_fp:
        task_config = json.load(config_fp)

    task_config_validator(task_config)

    command: str = task_config.get('intent', "")
    task_id = task_config.get('task_id')
    task_index = task_config.get('task_index')
    start_url = task_config.get('start_url')
    logger.info(f"Intent: {command}, Task ID: {task_id}")

    if start_url:
        await page.goto(start_url, wait_until='load', timeout=30000)

    start_time = time.time()
    current_url = await browser_manager.get_current_url()
    command_exec_result = await ag.process_command(command, current_url)
    end_time = time.time()

    last_agent_response: str = ""
    # Built before the try block so that even a failed post-processing step returns
    # the fields the reporting code relies on (task_id, intent, tct).
    single_task_result: dict[str, Any] = {
        "task_id": task_id,
        "task_index": task_index,
        "start_url": start_url,
        "intent": str(command),
        "last_url": None,
        "tct": end_time - start_time,
        "start_ts": start_ts,
        "completion_ts": get_formatted_current_timestamp()
    }
    try:
        single_task_result["last_url"] = page.url

        agent_name: str = "planner_agent" if ag.agents_map is not None and "planner_agent" in ag.agents_map else "browser_nav_agent"

        command_cost = get_command_exec_cost(command_exec_result)  # type: ignore
        print(f"Command cost: {command_cost}")
        single_task_result["compute_cost"] = command_cost

        logger.info(f"Command \"{command}\" took: {round(end_time - start_time, 2)} seconds.")
        logger.info(f"Task {task_id} completed.")

        messages = ag.agents_map[agent_name].chat_messages  # type: ignore
        messages_str_keys = {str(key): value for key, value in messages.items()}  # type: ignore
        agent_key = list(messages.keys())[0]  # type: ignore
        last_agent_response = extract_last_response(messages[agent_key])  # type: ignore

        dump_log(str(task_id), messages_str_keys, logs_dir)

        single_task_result["last_statement"] = last_agent_response

        evaluator = evaluator_router(task_config_file)
        # Only the final answer is handed to the evaluator; page and client are passed as None.
        evaluator_result = evaluator(
            config_file=task_config_file,
            page=None,
            client=None,
            trajectory=[{"answer": last_agent_response}]
        )

        single_task_result["score"] = evaluator_result
    except Exception as e:
        logger.error(f"Error while collecting results for task {task_id}: {e}")
        # Keep a cost that was already computed; fall back to the -1 sentinel otherwise.
        single_task_result.setdefault("compute_cost", {"cost": -1, "total_tokens": -1})
        # Without a score the reporting code cannot classify the task; count it as failed.
        single_task_result.setdefault("score", 0)
        single_task_result["error"] = str(e)

    return single_task_result
async def run_tests(ag: AutogenWrapper, browser_manager: PlaywrightManager, task_ids,
                    logdir: str="", logname: str="", relative_task_dir: str="", test_results_id: str = "", wait_time_non_headless: int=5, take_screenshots: bool = False) -> list[dict[str, Any]]:
    """
    Runs a specified range of test tasks using Playwright for browser interactions and AutogenWrapper for task automation.
    It initializes necessary components, processes each task, handles exceptions, and compiles test results into a structured list.

    Parameters:
    - ag (AutogenWrapper): The AutoGen wrapper that processes commands. A new one is created when falsy.
    - browser_manager (PlaywrightManager): The manager handling browser interactions, responsible for page navigation and control.
      A headless one is created when falsy.
    - task_ids (list[str] | str | None): Task IDs to run. "all" or ["all"] selects every JSON file in the task directory,
      a single string is treated as one ID, and None runs nothing.
    - logdir (str): Root directory for logs and results.
    - logname (str): Sub-directory of logdir used for this run.
    - relative_task_dir (str): Directory below "config_files" that holds the task configs (defaults to "tasks").
    - test_results_id (str): Accepted for interface compatibility; not used by this function.
    - wait_time_non_headless (int): Time to wait between tasks when running in non-headless mode, useful for live monitoring or debugging.
    - take_screenshots (bool): Whether to take screenshots during test execution. Defaults to False.

    Returns:
    - list[dict[str, Any]]: A list of dictionaries, each containing the results from executing a test task. Results include task ID, intent, score, total command time, etc.
      Tasks whose result file already exists are skipped and do not appear in the list.

    This function also manages logging and saving of test results, updates the progress bar to reflect test execution status, and prints a detailed summary report at the end of the testing session.
    """
    test_log_dir = os.path.join(logdir, logname)
    test_result_dir = os.path.join(logdir, logname, "results")
    check_top_level_test_folders(test_log_dir, test_result_dir)

    if not relative_task_dir:
        relative_task_dir = "tasks"
    task_dir = f"config_files/{relative_task_dir}"
    if task_ids is None:
        # e.g. the CLI was started without --task_ids; run nothing instead of crashing.
        task_ids = []
    elif task_ids == "all" or task_ids == ["all"]:
        # Sorted so that the execution order does not depend on the file system.
        task_ids = sorted(filename[:-len(".json")] for filename in os.listdir(task_dir) if filename.endswith(".json"))
    elif isinstance(task_ids, str):
        # A bare string would otherwise be iterated character by character.
        task_ids = [task_ids]
    config_file_list = [f"{task_dir}/{task_id}.json" for task_id in task_ids]

    llm_config = AgentsLLMConfig()
    if not ag:
        ag = await AutogenWrapper.create(llm_config.get_planner_agent_config(), llm_config.get_browser_nav_agent_config())

    if not browser_manager:
        browser_manager = browserManager.PlaywrightManager(headless=True)
        await browser_manager.async_initialize()

    page = await browser_manager.get_current_page()
    test_results: list[dict[str, Any]] = []
    total_tests = len(config_file_list)

    def _score_of(result: dict[str, Any]) -> float:
        # Results of errored tasks may lack a score; count them as failed (0).
        score = result.get("score")
        return 0 if score is None else score

    for index, task_config_file in enumerate(config_file_list):
        with open(task_config_file, "r", encoding="utf-8") as config_fp:
            task_config = json.load(config_fp)
        task_id = str(task_config.get('task_id'))
        if os.path.exists(os.path.join(test_result_dir, f'{task_id}.json')):
            # A stored result means the task was completed in an earlier run.
            continue

        log_folders = create_task_log_folders(test_log_dir, task_id)

        ag.set_chat_logs_dir(log_folders["task_log_folder"])

        browser_manager.set_take_screenshots(take_screenshots)
        if take_screenshots:
            browser_manager.set_screenshots_dir(log_folders["task_screenshots_folder"])

        print_progress_bar(index, total_tests)
        task_result = await execute_single_task(task_config_file, browser_manager, ag, page, log_folders["task_log_folder"])
        test_results.append(task_result)
        save_individual_test_result(task_result, test_result_dir)
        print_test_result(task_result, index + 1, total_tests)

        if not browser_manager.isheadless:  # no need to wait if we are running headless
            await asyncio.sleep(wait_time_non_headless)  # give time for switching between tasks in case there is a human observer

        await browser_manager.take_screenshots("final", None)

        await browser_manager.close_except_specified_tab(page)  # cleanup pages that are not the one we opened here

    print_progress_bar(total_tests, total_tests)  # Complete the progress bar
    print('\n\nAll tests completed.')

    # Aggregate and print individual test results
    print("\nDetailed Test Results:")
    detailed_results_table = [['Test Index', 'Task ID', 'Intent', 'Status', 'Time Taken (s)', 'Total Tokens', 'Total Cost ($)']]
    for idx, result in enumerate(test_results, 1):
        status, color = determine_status_and_color(_score_of(result))  # type: ignore

        cost = result.get("compute_cost", None)
        total_cost = None if cost is None else round(cost.get("cost", -1), 4)  # type: ignore
        total_tokens = None if cost is None else cost.get("total_tokens", -1)  # type: ignore

        detailed_results_table.append([
            idx, result['task_id'], result['intent'], colored(status, color), round(result['tct'], 2),  # type: ignore
            total_tokens, total_cost
        ])

    print(tabulate(detailed_results_table, headers='firstrow', tablefmt='grid'))

    # Summary report

    # Calculate aggregated cost and token totals for all tests that have compute cost
    total_cost = 0
    total_tokens = 0

    for result in test_results:
        compute_cost = result.get("compute_cost", 0)  # type: ignore
        if isinstance(compute_cost, dict):
            # Negative values are "unknown" sentinels and must not reduce the totals.
            total_cost += max(compute_cost.get("cost", 0), 0)  # type: ignore
            total_tokens += max(compute_cost.get("total_tokens", 0), 0)  # type: ignore

    passed_tests = []
    skipped_tests = []
    failed_tests = []
    for result in test_results:
        score = _score_of(result)
        if score == 1:
            passed_tests.append(result)  # type: ignore
        elif score < 0:  # type: ignore
            skipped_tests.append(result)  # type: ignore
        else:
            failed_tests.append(result)  # type: ignore

    total_time = sum(test['tct'] for test in test_results)  # type: ignore
    # Average over the tasks that actually ran; guards against an empty run.
    average_time = total_time / len(test_results) if test_results else 0

    summary_table = [  # type: ignore
        ['Total Tests', 'Passed', 'Failed', 'Skipped', 'Average Time Taken (s)', 'Total Time Taken (s)', 'Total Tokens', 'Total Cost ($)'],
        [total_tests, len(passed_tests), len(failed_tests), len(skipped_tests),
         round(average_time, 2),  # type: ignore
         round(total_time, 2),  # type: ignore
         total_tokens, total_cost]
    ]

    print('\nSummary Report:')
    print(tabulate(summary_table, headers='firstrow', tablefmt='grid'))  # type: ignore

    return test_results
-* Change the title in this README -* Edit your repository description on GitHub +The newly designed agent AgentOccam significantly surpasses previous state-of-the-art methods and concurrent work on [WebArena](https://webarena.dev), a benchmark featuring general-purpose web tasks, without in-context examples, new agent roles, online feedback or search strategies. 🍺 -## Security +We shed light on LLMs' impressive zero-shot performance on web tasks, and the critical role of carefully tuning observation and action spaces for LLM-based agents. 🧙 -See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. +You can let AgentOccam interact with other websites like Google per your requests by defining the task config files, as seen in the example in `config_files/tasks/standford_cs_head.json`. Have fun playing with it! :) -## License +*When you finish one round, please check whether the Reddit posting limit was exceeded, a login expired, or any other WebArena simulator/website failure occurred. You should restart the simulator/re-login to the websites and rerun those tasks before reporting your final success rate. Additionally, the LLM policy varies even given the same task, as the generation temperature is set to >0 for more diverse exploration. Therefore, it is expected that you get different traces when starting the same task multiple times. Try it out with the basic `config_files/tasks/standford_cs_head.json`!* -This project is licensed under the Apache-2.0 License. +## WebArena Replication +### Environment Setup +```bash +git clone https://github.com/web-arena-x/webarena.git +cd webarena +conda create -n webarena python=3.10; conda activate webarena +pip install -r requirements.txt +pip install --upgrade transformers +pip install --upgrade openai +pip install numpy==1.26.4 +playwright install +pip install -e . 
+cd ../AgentOccam +pip install -r requirements.txt +mkdir .auth +``` +### Experiments +#### AgentOccam-Series and SteP-Replication +* Connect to the WebArena host server. +* Export the env configs: +```bash +export SHOPPING="http://:7770" +export SHOPPING_ADMIN="http://:7780/admin" +export REDDIT="http://:9999" +export GITLAB="http://:8023" +export MAP="http://:3000" +export WIKIPEDIA="http://:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing" +export HOMEPAGE="http://:4399" +export OPENAI_API_KEY="" +export GEMINI_API_KEY="" +``` +* Log in: +```bash +python browser_env/auto_login.py +``` +* Test AgentOccam: +```bash +python eval_webarena.py --config AgentOccam/configs/AgentOccam.yml # Replace the yml config with your target one. +``` +*You can directly run `bash scripts/run_config.sh` after replacing the experiment configurations.* +#### WebArena-Replication +```bash +bash scripts/run_webarena.sh +``` + +## WebVoyager Replication +### Environment Setup +```bash +git clone https://github.com/EmergenceAI/Agent-E.git +cd Agent-E +./install.sh +source .venv/bin/activate +uv pip install beartype +uv pip install gymnasium +uv pip install lxml +uv pip install text_generation +uv pip install aiolimiter +uv pip install boto3 +uv pip install transformers +export OPENAI_API_KEY="" +export AUTOGEN_MODEL_NAME="gpt-4-turbo" +cd ../AgentOccam +``` +### Experiments +#### AgentOccam +```bash +python eval_webarena.py --config AgentOccam/configs/AgentOccam-WebVoyager.yml +``` +#### Agent-E +```bash +python -m agente_replication --task_ids Allrecipes--3 +``` + + + +## Agent Configuration Explanation + +The following explanations are based on `AgentOccam/configs/AgentOccam.yml`. + +### General + +```yml +logdir: "../AgentOccam-Trajectories" +``` + +This determines where the trajectories will be saved. Use a relative path. 
+ +```yml +logname: "AgentOccam" +agent: + others: + logname: "AgentOccam" +``` + +All relevant online files (play series, trash series, and output/screenshot series) use this log name to tell runs apart. Change both entries together. + +### Agent +#### Base +```yml +agent: + actor: + debug: 0 + verbose: 1 + number: 1 + critic: + mode: false + debug: 0 + verbose: 1 + judge: + mode: false + debug: 0 + verbose: 1 +``` + +All roles have a `debug` key. When `debug==1`, the role plays and you decide whether to take its action. When `debug==2`, you will have to generate the action yourself. The actor is always playing, so there's no `mode` key for it. For other roles, you can disable them by changing `mode` to false. + +```yml +agent: + actor: + model: "gpt-4-turbo" +``` + +determines which model to use. + +```yml +agent: + actor: + input: ["step", "objective", "previous plans", "interaction history", "current observation"] +``` +arranges the input. The list element order matters here, and this applies to all the following list input/output specifications. + +```yml +agent: + actor: + interaction_history: + verbose: True + type: ["text"] + step_num: 3 +``` +determines the interaction history section input type and modality. You can use `type: ["text", "image"]` to enable multimodal inputs. + +```yml +agent: + actor: + current_observation: + type: ["text"] +``` +defines the current observation type. + +```yml +agent: + actor: + output: ["interaction history summary", "observation description", "reason", "action", "observation highlight"] +``` +organizes the output specifications. Capable LLMs should generate this content, which is then parsed automatically by the code. You only need to add the description for each entry under `AgentOccam/prompts/output_specifications`. + +```yml +agent: + actor: + planning_command: ["branch", "prune"] + navigation_command: ["click", "type", "stop", "note", "go_back"] +``` +defines the valid actions. 
+ +```yml +agent: + actor: + play: ["step", "objective", "previous plans", "observation description", "reason", "action"] + trash: ["objective", "step", "url", "instruction", "online input", "response", "alter ego response"] +``` +designates the broadcasting content. + +#### Advanced +```yml +agent: + actor: + number: 1 +``` +If you use best-of-**N**-actions with the judge, the `number` here defines the **N**. + +```yml +agent: + actor: + identities: + identity_0: + name: "QA" + model: "gpt-4-turbo" + output: ["response"] + identity_1: + name: "planning" + model: "gpt-4-turbo" + planning_command: ["branch", "prune"] + output: ["interaction history summary", "observation description", "reason", "plan", "observation highlight"] + identity_2: + name: "reflection" + model: "gpt-4-turbo" + planning_command: ["branch", "prune"] + navigation_command: ["click", "type", "stop", "note", "go_back"] + output: ["interaction history summary", "observation description", "reflection", "reason", "action", "observation highlight"] +``` +defines different actors. If you don't want them, comment them out. + +## Environment +```yml +env: + fullpage: true + prune: true +``` +If `fullpage==True`, the agent takes the entire page as the input. Remember to add `scroll` to the `navigation_command` list if `fullpage` is disabled. + +If `prune==True`, the pipeline carries out observation space alignment. 
def parse_bool_flag(value: str) -> bool:
    """Convert a command-line string such as "true"/"false" into a bool.

    ``type=bool`` cannot be used with argparse because ``bool("False")`` is True:
    every non-empty string is truthy.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean word.
    """
    normalized = value.strip().lower()
    if normalized in {"1", "true", "t", "yes", "y", "on"}:
        return True
    if normalized in {"0", "false", "f", "no", "n", "off"}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser of the Agent-E replication script."""
    # Create the parser
    parser = argparse.ArgumentParser(description='Run test suite for specified range of test tasks.')

    # Add arguments
    # nargs='?' with const=True accepts both a bare "-s" and an explicit "-s true/false".
    parser.add_argument('-s', '--take_screenshots', type=parse_bool_flag, nargs='?', const=True, default=False,
                        help='Take screenshots after every operation performed (default: False)')
    parser.add_argument('-wait', '--wait_time_non_headless', type=int, default=5,
                        help='Time to wait between test tasks when running in non-headless mode (default: 5 seconds)')
    # Required: without task IDs there is nothing to run.
    parser.add_argument("-ids", "--task_ids", type=str, nargs='+', required=True,
                        help='List of task IDs to execute; pass "all" to run every task in the task directory')
    parser.add_argument('-dir', '--logdir', type=str, default="../AgentOccam-Trajectories",
                        help='Logdir.')
    parser.add_argument('-log', '--logname', type=str, default="Agent-E",
                        help='Logname.')
    parser.add_argument('-id', '--test_results_id', type=str, default="",
                        help='A unique identifier for the test results. If not provided, a timestamp is used.')
    parser.add_argument('-config', '--relative_task_dir', type=str, default="webvoyager",
                        help='Directory below config_files that contains the task configuration files.')
    return parser


if __name__ == "__main__":
    # Parse the command line arguments
    args = build_arg_parser().parse_args()

    # Run the main function with the provided or default arguments, not passing browser_manager or AutoGenWrapper will cause the test processor to create new instances of them
    asyncio.run(run_tests(None, None, args.task_ids, logdir=args.logdir, logname=args.logname, relative_task_dir=args.relative_task_dir,
                          test_results_id=args.test_results_id,
                          take_screenshots=args.take_screenshots, wait_time_non_headless=args.wait_time_non_headless))
"create_random_action", + "create_focus_and_click_action", + "create_focus_and_type_action", + "is_equivalent", + "create_mouse_click_action", + "create_mouse_hover_action", + "create_none_action", + "create_keyboard_type_action", + "create_page_focus_action", + "create_new_tab_action", + "create_go_back_action", + "create_go_forward_action", + "create_goto_url_action", + "create_page_close_action", + "action2create_function", + "create_playwright_action", + "create_id_based_action", + "create_id_based_actions", + "create_scroll_action", + "create_key_press_action", + "create_check_action", + "create_click_action", + "create_type_action", + "create_hover_action", + "create_select_option_action", + "create_stop_action", + "ActionParsingError", + "Trajectory", +] diff --git a/browser_env/actions.py b/browser_env/actions.py new file mode 100644 index 0000000..9de1c18 --- /dev/null +++ b/browser_env/actions.py @@ -0,0 +1,1948 @@ +""" +Browser Env action space. +Inspited by Farama-Foundation/miniwob-plusplus +""" +import ast +import random +import re +import string +from enum import IntEnum +from itertools import chain +from typing import Any, TypedDict, Union, cast +import time + +import numpy as np +import numpy.typing as npt +from beartype import beartype +from gymnasium import spaces +from playwright._impl._api_structures import ViewportSize +from playwright.async_api import BrowserContext as ABrowserContext +from playwright.async_api import Locator as ALocator +from playwright.async_api import Page as APage +from playwright.sync_api import BrowserContext, Locator, Page + +from browser_env.constants import ( + ASCII_CHARSET, + FREQ_UNICODE_CHARSET, + MAX_ANSWER_LENGTH, + MAX_ELEMENT_ID, + MAX_ELEMENT_INDEX_IN_VIEWPORT, + MAX_PAGE_NUMBER, + MAX_VANILLA_STR_LENGTH, + PLAYWRIGHT_ACTIONS, + PLAYWRIGHT_LOCATORS, + ROLES, + SPECIAL_KEY_MAPPINGS, + SPECIAL_KEYS, + SPECIAL_LOCATORS, + TEXT_MAX_LENGTH, + TYPING_MAX_LENGTH, + URL_MAX_LENGTH, + RolesType, +) +from 
def _viewport_overlap_ratio(box, viewport: ViewportSize) -> float:
    """Return the fraction of ``box`` that lies inside ``viewport``.

    ``box`` is a mapping with "x", "y", "width" and "height" (or None). A missing
    box or a box without area yields 0.0, so callers never divide by zero.
    """
    if box is None:
        return 0.0
    area = box["width"] * box["height"]
    if area <= 0:
        return 0.0
    boxx0 = box["x"]
    boxx1 = box["x"] + box["width"]
    boxy0 = box["y"]
    boxy1 = box["y"] + box["height"]
    # The viewport always starts at the origin.
    viewportx0, viewporty0 = 0, 0
    viewportx1, viewporty1 = viewport["width"], viewport["height"]
    inter = max(0, min(boxx1, viewportx1) - max(boxx0, viewportx0)) * max(
        0, min(boxy1, viewporty1) - max(boxy0, viewporty0)
    )
    return inter / area


def is_in_viewport(
    element: Locator, viewport: ViewportSize, threshold: float = 0.3
) -> bool:
    """Given a playwright locator, check if it is in the viewport

    Returns True when more than ``threshold`` of the element's bounding box
    overlaps the viewport. An element without a bounding box, or with an empty
    one, is reported as not in the viewport.
    """
    box = element.bounding_box()
    return _viewport_overlap_ratio(box, viewport) > threshold


async def async_is_in_viewport(
    element: ALocator, viewport: ViewportSize, threshold: float = 0.3
) -> bool:
    """Async counterpart of ``is_in_viewport`` for async playwright locators."""
    box = await element.bounding_box()
    return _viewport_overlap_ratio(box, viewport) > threshold
@beartype
def action2str(
    action: Action, action_set_tag: str, semantic_element: str = ""
) -> str:
    """Return the string representation of an action

    action_set_tag selects the output syntax: "id_accessibility_tree",
    "id_html_tree" or "id_html_nasc_tree".

    semantic_element: the semantic information of the element
    such as a line in an accessibility tree (only used by the
    "id_accessibility_tree" syntax)

    Raises ValueError for an action type the chosen syntax does not cover and
    NotImplementedError for an unknown action_set_tag.
    """
    if action_set_tag == "id_accessibility_tree":
        element_id = action["element_id"]
        match action["action_type"]:
            case ActionTypes.CLICK:
                # [ID=X] xxxxx
                action_str = f"click [{element_id}] where [{element_id}] is {semantic_element}"
            case ActionTypes.TYPE:
                # The text is stored as key ids; map them back to characters.
                text = "".join([_id2key[i] for i in action["text"]])
                text = text.replace("\n", " ")
                action_str = f"type [{element_id}] [{text}] where [{element_id}] is {semantic_element}"
            case ActionTypes.HOVER:
                action_str = f"hover [{element_id}] where [{element_id}] is {semantic_element}"
            case ActionTypes.SCROLL:
                action_str = f"scroll [{action['direction']}]"
            case ActionTypes.KEY_PRESS:
                action_str = f"press [{action['key_comb']}]"
            case ActionTypes.GOTO_URL:
                action_str = f"goto [{action['url']}]"
            case ActionTypes.NEW_TAB:
                action_str = "new_tab"
            case ActionTypes.PAGE_CLOSE:
                action_str = "close_tab"
            case ActionTypes.GO_BACK:
                action_str = "go_back"
            case ActionTypes.GO_FORWARD:
                action_str = "go_forward"
            case ActionTypes.PAGE_FOCUS:
                action_str = f"page_focus [{action['page_number']}]"
            case ActionTypes.STOP:
                action_str = f"stop [{action['answer']}]"
            case ActionTypes.NONE:
                action_str = "none"
            case _:
                raise ValueError(
                    f"Unknown action type {action['action_type']}"
                )
    elif action_set_tag == "id_html_tree":
        element_id = action["element_id"]
        # "label" is optional in this syntax and defaults to an empty string.
        label = action.get("label", "")
        match action["action_type"]:
            case ActionTypes.CLICK:
                # [ID=X] xxxxx
                action_str = f"#Click# {label}"
            case ActionTypes.TYPE:
                text = "".join([_id2key[i] for i in action["text"]])
                text = text.replace("\n", " ")
                action_str = f"#Type# {label} {text}"
            case ActionTypes.HOVER:
                action_str = f"#Hover# {label}"
            case ActionTypes.SCROLL:
                action_str = f"#Scroll_{action['direction']}#"
            case ActionTypes.KEY_PRESS:
                action_str = f"#Press# {action['key_comb']}"
            case ActionTypes.GOTO_URL:
                action_str = f"#Goto# {action['url']}"
            case ActionTypes.NEW_TAB:
                action_str = "new_tab"
            case ActionTypes.PAGE_CLOSE:
                action_str = "close_tab"
            case ActionTypes.GO_BACK:
                action_str = "#Go_backward#"
            case ActionTypes.GO_FORWARD:
                action_str = "#Go_forward#"
            case ActionTypes.PAGE_FOCUS:
                action_str = f"page_focus [{action['page_number']}]"
            case ActionTypes.STOP:
                # A stop with an answer is rendered as an answer, otherwise as a plain exit.
                if len(action['answer']) > 0:
                    action_str = f"#Answer# {action['answer']}"
                else:
                    action_str = f"#Exit#"
            case ActionTypes.SELECT_OPTION:
                option = action.get("option", "")
                action_str = f"#Select# {label} {option}"
            case ActionTypes.NONE:
                action_str = "None"
            case _:
                raise ValueError(
                    f"Unknown action type {action['action_type']}"
                )
    elif action_set_tag == "id_html_nasc_tree":
        element_id = action["element_id"]
        # Unlike the "id_html_tree" branch, "label" and "flag" are mandatory here:
        # a missing key raises KeyError.
        label = action["label"]
        flag = action["flag"]

        match action["action_type"]:
            case ActionTypes.CLICK:
                # [ID=X] xxxxx
                action_str = f"click('{label}')"
            case ActionTypes.TYPE:
                text = "".join([_id2key[i] for i in action["text"]])
                text = text.replace("\n", " ")
                action_str = f"type_string('{label}', '{text}', {flag})"
            case ActionTypes.HOVER:
                action_str = f"hover('{label}')"
            case ActionTypes.SCROLL:
                action_str = f"scroll_page('{action['direction']}')"
            case ActionTypes.KEY_PRESS:
                action_str = f"press('{action['key_comb']}')"
            case ActionTypes.GOTO_URL:
                action_str = f"jump_to('{action['url']}', {flag})"
            case ActionTypes.NEW_TAB:
                action_str = "new_tab"
            case ActionTypes.PAGE_CLOSE:
                action_str = "close_tab"
            case ActionTypes.GO_BACK:
                action_str = "go('backward')"
            case ActionTypes.GO_FORWARD:
                action_str = "go('forward')"
            case ActionTypes.PAGE_FOCUS:
                action_str = f"switch_tab('{action['page_number']}')"
            case ActionTypes.STOP:
                if len(action['answer']) > 0:
                    action_str = f"finish('{action['answer']}')"
                else:
                    action_str = f"finish()"
            case ActionTypes.SELECT_OPTION:
                option = action.get("option", "")
                action_str = f"select('{label}', '{option}')"
            case ActionTypes.NONE:
                action_str = "no_op()"
            case _:
                raise ValueError(
                    f"Unknown action type {action['action_type']}"
                )
    else:
        raise NotImplementedError(f"Unknown action set tag {action_set_tag}")

    return action_str
@beartype
def action2create_function(action: Action) -> str:
    """Render an action as the source text of the ``create_*_action`` call that builds it.

    Raises ValueError when the action type is not one of the known ActionTypes.
    """
    kind = action["action_type"]

    def element_args() -> list[str]:
        # Keyword arguments shared by the element-based actions (click, hover, type).
        return [
            f"element_id={repr(action['element_id'])}",
            f"element_role={repr(_id2role[action['element_role']])}",
            f"element_name={repr(action['element_name'])}",
            f"pw_code={repr(action['pw_code'])}",
        ]

    if kind == ActionTypes.NONE:
        return "create_none_action()"

    # mouse wheel and keyboard action
    if kind == ActionTypes.SCROLL:
        direction = "up" if "up" in action["direction"] else "down"
        return f"create_scroll_action({repr(direction)})"
    if kind == ActionTypes.KEY_PRESS:
        return f"create_key_press_action({repr(action['key_comb'])})"

    # inter-page actions
    if kind == ActionTypes.PAGE_FOCUS:
        return f"create_page_focus_action({action['page_number']})"
    if kind == ActionTypes.NEW_TAB:
        return "create_new_tab_action()"
    if kind == ActionTypes.GO_BACK:
        return "create_go_back_action()"
    if kind == ActionTypes.GO_FORWARD:
        return "create_go_forward_action()"
    if kind == ActionTypes.GOTO_URL:
        return f"create_goto_url_action({repr(action['url'])})"
    if kind == ActionTypes.PAGE_CLOSE:
        return "create_page_close_action()"

    # low-level keyboard and mouse actions
    if kind == ActionTypes.MOUSE_CLICK:
        return f"create_mouse_click_action({action['coords'][0]}, {action['coords'][1]})"
    if kind == ActionTypes.MOUSE_HOVER:
        return f"create_mouse_hover_action({action['coords'][0]}, {action['coords'][1]})"
    if kind == ActionTypes.KEYBOARD_TYPE:
        typed_keys = [_id2key[key_id] for key_id in action['text']]
        return f"create_keyboard_type_action({typed_keys})"

    # mid-level keyboard and mouse actions
    if kind == ActionTypes.CLICK:
        args_str = ", ".join(element_args())
        return f"create_click_action({args_str})"
    if kind == ActionTypes.HOVER:
        args_str = ", ".join(element_args())
        return f"create_hover_action({args_str})"
    if kind == ActionTypes.TYPE:
        text = "".join(_id2key[key_id] for key_id in action["text"])
        args_str = ", ".join([f"text={repr(text)}", *element_args()])
        return f"create_type_action({args_str})"

    # high-level actions, only support locators from playwright
    if kind == ActionTypes.CHECK:
        return f"create_check_action(pw_code={repr(action['pw_code'])})"
    if kind == ActionTypes.SELECT_OPTION:
        return f"create_select_option_action(pw_code={repr(action['pw_code'])})"
    if kind == ActionTypes.STOP:
        return f'create_stop_action({repr(action["answer"])})'

    raise ValueError(f"Invalid action type: {action['action_type']}")


class ActionTypes(IntEnum):
    """Integer codes of every action the browser environment understands."""

    NONE = 0

    # universal across all action spaces: mouse wheel and keyboard
    SCROLL = 1
    KEY_PRESS = 2

    # low level: raw mouse and keyboard
    MOUSE_CLICK = 3
    KEYBOARD_TYPE = 4
    MOUSE_HOVER = 5

    # mid level: element-based mouse and keyboard
    CLICK = 6
    TYPE = 7
    HOVER = 8

    # universal across all action spaces: page level
    PAGE_FOCUS = 9
    NEW_TAB = 10
    GO_BACK = 11
    GO_FORWARD = 12
    GOTO_URL = 13
    PAGE_CLOSE = 14

    # high level: actions that playwright supports
    CHECK = 15
    SELECT_OPTION = 16

    STOP = 17

    def __str__(self) -> str:
        # Rendered with an "ACTION_TYPES." prefix rather than the class name.
        return f"ACTION_TYPES.{self.name}"
def _keys2ids(keys: list[int | str] | str) -> list[int]:
    """Convert a sequence of keys into their integer ids.

    String entries are looked up in ``_key2id``; every other entry is
    coerced with ``int()``. Iterating a plain string converts it character
    by character.
    """
    return [
        _key2id[str(key)] if isinstance(key, str) else int(key)
        for key in keys
    ]
@beartype
def create_key_press_action(key_comb: str) -> Action:
    """Build a KEY_PRESS action for the ``+``-separated combination ``key_comb``.

    Each part of the combination is lower-cased and looked up in
    ``SPECIAL_KEY_MAPPINGS``; parts without an entry are kept exactly as
    given. The parts are then re-joined with ``+``.
    """
    action = create_none_action()
    mapped_key_comb = "+".join(
        SPECIAL_KEY_MAPPINGS.get(part.lower(), part)
        for part in key_comb.split("+")
    )
    action["action_type"] = ActionTypes.KEY_PRESS
    action["key_comb"] = mapped_key_comb
    return action
@beartype
def create_mouse_click_action(
    left: float | None = None, top: float | None = None
) -> Action:
    """Build a click action from optional coordinates.

    Args:
        left: Horizontal coordinate, or ``None``.
        top: Vertical coordinate, or ``None``.

    Returns:
        A MOUSE_CLICK action carrying ``coords`` when both coordinates are
        given, or a CLICK action with no locator when both are ``None``.

    Raises:
        ValueError: If exactly one of ``left`` / ``top`` is ``None``.
    """
    has_left = left is not None
    has_top = top is not None
    if has_left != has_top:
        raise ValueError("left and top must be both None or both not None")

    action = create_none_action()
    if has_left:
        # Compare against None rather than relying on truthiness: 0.0 is a
        # legitimate coordinate (the left/top edge) but is falsy.
        action.update(
            {
                "action_type": ActionTypes.MOUSE_CLICK,
                "coords": np.array([left, top], dtype=np.float32),
            }
        )
    else:
        action.update(
            {
                "action_type": ActionTypes.CLICK,
            }
        )
    return action
"element_role": _role2id[element_role], + "element_name": element_name, + "nth": nth, + "text": _keys2ids(text), + "pw_code": pw_code, + } + ) + return action + +@beartype +def create_our_type_action( + text: str, + element_id: str = "", + element_role: RolesType = "link", + element_name: str = "", + pw_code: str = "", + nth: int = 0, + flag: bool = True, +) -> Action: + action = create_none_action() + action.update( + { + "action_type": ActionTypes.TYPE, + "element_id": element_id, + "element_role": _role2id[element_role], + "element_name": element_name, + "nth": nth, + "text": _keys2ids(text), + "pw_code": pw_code, + "flag": flag, + } + ) + return action + + +@beartype +def create_check_action(pw_code: str) -> Action: + action = create_none_action() + action.update( + { + "action_type": ActionTypes.CHECK, + "pw_code": pw_code, + } + ) + return action + +@beartype +def create_record_action(text: str) -> Action: + action = create_none_action() + action.update( + { + "action_type": ActionTypes.NONE, + "answer": f"#Record# {text}", + } + ) + return action + +def create_select_option_action( + pw_code: str, +) -> Action: + action = create_none_action() + action.update( + { + "action_type": ActionTypes.SELECT_OPTION, + "pw_code": pw_code, + } + ) + return action + +def create_our_select_option_action( + pw_code: str, + label: str, + text: str, +) -> Action: + action = create_none_action() + action.update( + { + "action_type": ActionTypes.SELECT_OPTION, + "pw_code": pw_code, + "label": label, + "option": text, + } + ) + return action + +@beartype +def create_focus_action( + element_role: RolesType, element_name: str = "", nth: int = 0 +) -> Action: + """Return a valid action object with type CLICK. 
def execute_scroll(direction: str, page: Page) -> None:
    """Scroll ``page`` by 80% of the window height in ``direction``.

    ``direction`` is expected to be ``"up"`` or ``"down"``; any other value
    leaves the page untouched.
    """
    # Scroll snippets (originally from natbot), keyed by direction.
    scripts = {
        "up": "(document.scrollingElement || document.body).scrollTop = (document.scrollingElement || document.body).scrollTop - window.innerHeight * 0.8;",
        "down": "(document.scrollingElement || document.body).scrollTop = (document.scrollingElement || document.body).scrollTop + window.innerHeight * 0.8;",
    }
    script = scripts.get(direction)
    if script is None:
        return
    page.evaluate(script)
def execute_mouse_hover(left: float, top: float, page: Page) -> None:
    """Move the mouse pointer to the position (left, top) without clicking.

    ``left`` and ``top`` are multiplied by the viewport width and height
    respectively to obtain the target position passed to ``page.mouse.move``.
    The page must have a viewport size set.
    """
    size = page.viewport_size
    assert size
    target_x = left * size["width"]
    target_y = top * size["height"]
    page.mouse.move(target_x, target_y)
def execute_type(keys: list[int], page: Page) -> None:
    """Send keystrokes to the focused element.

    ``keys`` holds key ids that are decoded through ``_id2key``. When the
    last key decodes to a newline, the preceding text is typed first and
    Enter is pressed after a one-second pause; otherwise the whole text is
    typed as-is.

    Args:
        keys: Key ids to type. An empty sequence is a no-op.
        page: Page whose keyboard receives the keystrokes.
    """
    # Nothing to type: return before ``keys[-1]`` can raise IndexError.
    # ``len`` is used instead of truthiness so array-like inputs work too.
    if len(keys) == 0:
        return

    submit = _id2key[keys[-1]] == "\n"
    typed_ids = keys[:-1] if submit else keys
    page.keyboard.type("".join(_id2key[key] for key in typed_ids))
    if submit:
        # Pause between typing and Enter, as the original implementation does.
        time.sleep(1)
        page.keyboard.press("Enter")
async def aexecute_focus(
    element_role: int, element_name: str, nth: int, page: APage
) -> None:
    """Focus the ``nth`` in-viewport element matching a role/name locator.

    Async variant. Every frame of ``page`` is searched. The role id is
    decoded through ``_id2role``; the special roles ``alt_text``, ``label``
    and ``placeholder`` use the corresponding ``get_by_*`` lookup, and any
    other role uses ``get_by_role``. Matches inside the viewport are ordered
    top-to-bottom, then left-to-right, before ``nth`` is applied.

    Raises:
        ValueError: If the page has no viewport size, or fewer than
            ``nth + 1`` matching elements are in the viewport.
    """
    role_name = _id2role[element_role]
    if page.viewport_size is None:
        raise ValueError("Viewport size is not set for the current page")

    element_location_list: list[tuple[ALocator, float, float]] = []
    for frame in page.frames:
        if role_name == "alt_text":
            matches = frame.get_by_alt_text(element_name)
        elif role_name == "label":
            matches = frame.get_by_label(element_name)
        elif role_name == "placeholder":
            matches = frame.get_by_placeholder(element_name)
        else:
            matches = frame.get_by_role(role=role_name, name=element_name)

        match_count = await matches.count()
        for index in range(match_count):
            candidate = matches.nth(index)
            if not await async_is_in_viewport(candidate, page.viewport_size):
                continue
            box = await candidate.bounding_box()
            assert box
            element_location_list.append((candidate, box["x"], box["y"]))

    if len(element_location_list) <= nth:
        raise ValueError(
            f"There are only {len(element_location_list)} elements found in viewport, but {nth + 1} is requested"
        )

    # Row-major order: sort by y first, then by x.
    ordered = sorted(element_location_list, key=lambda entry: (entry[2], entry[1]))
    await ordered[nth][0].focus()
def execute_playwright_type(
    text: str,
    locator_code: list[ParsedPlaywrightCode],
    page: Page,
    pw_action_args: list[str] | None = None,
    pw_action_kwargs: dict[str, Any] | None = None,
) -> None:
    """Type ``text`` into the element resolved from ``locator_code``.

    Args:
        text: Text to type; always passed as the first positional argument
            of the locator's ``type`` call.
        locator_code: Parsed locator chain, resolved with ``locate``.
        page: Page the locator chain starts from.
        pw_action_args: Extra positional arguments for ``type``, placed
            after ``text``. Defaults to none.
        pw_action_kwargs: Extra keyword arguments for ``type``. Defaults to
            none.
    """
    # ``None`` sentinels replace the former ``[]`` / ``{}`` defaults so that
    # no mutable object is shared between calls.
    extra_args = [] if pw_action_args is None else pw_action_args
    extra_kwargs = {} if pw_action_kwargs is None else pw_action_kwargs

    locator = locate(locator_code, page)
    # text is the first argument
    locator.type(*([text] + extra_args), **extra_kwargs)
list[ParsedPlaywrightCode], + page: APage, + pw_action_args: list[str] = [], + pw_action_kwargs: dict[str, Any] = {}, +) -> None: + locator = await alocate(locator_code, page) + # perform the action + pw_action_args = [text] + pw_action_args # text is the first argument + await locator.type(*pw_action_args, **pw_action_kwargs) + + +def execute_playwright_select_option( + locator_code: list[ParsedPlaywrightCode], + page: Page, + pw_action_args: list[str] = [], + pw_action_kwargs: dict[str, Any] = {}, +) -> None: + locator = locate(locator_code, page) + # perform the action + locator.select_option(*pw_action_args, **pw_action_kwargs) + + +async def aexecute_playwright_select_option( + locator_code: list[ParsedPlaywrightCode], + page: APage, + pw_action_args: list[str] = [], + pw_action_kwargs: dict[str, Any] = {}, +) -> None: + locator = await alocate(locator_code, page) + # perform the action + await locator.select_option(*pw_action_args, **pw_action_kwargs) + + +def execute_playwright_check( + locator_code: list[ParsedPlaywrightCode], page: Page +) -> None: + locator = locate(locator_code, page) + # perform the action + locator.check() + + +async def aexecute_playwright_check( + locator_code: list[ParsedPlaywrightCode], page: APage +) -> None: + locator = await alocate(locator_code, page) + # perform the action + await locator.check() + + +def execute_action( + action: Action, + page: Page, + browser_ctx: BrowserContext, + obseration_processor: ObservationProcessor, +) -> Page: + """Execute the action on the ChromeDriver.""" + action_type = action["action_type"] + def is_at_bottom_of_page(page): + result = page.evaluate('(window.scrollY + window.innerHeight) >= document.documentElement.scrollHeight') + return result + def is_at_top_of_page(page): + result = page.evaluate('window.scrollY === 0') + return result + + if "element_id" in action.keys() and action["element_id"]: + element_id = action["element_id"] + node = 
obseration_processor.get_node_info_by_element_id(int(element_id)) + while not obseration_processor.element_is_visible(page, element_id) and not is_at_bottom_of_page(page): + execute_scroll("down", page) + while not obseration_processor.element_is_visible(page, element_id) and not is_at_top_of_page(page): + execute_scroll("up", page) + + match action_type: + case ActionTypes.NONE: + pass + + case ActionTypes.SCROLL: + direction = "up" if "up" in action["direction"] else "down" + execute_scroll(direction, page) + case ActionTypes.KEY_PRESS: + keys = action["key_comb"] + execute_key_press(keys, page) + + case ActionTypes.MOUSE_CLICK: + execute_mouse_click(action["coords"][0], action["coords"][1], page) + case ActionTypes.MOUSE_HOVER: + execute_mouse_hover(action["coords"][0], action["coords"][1], page) + case ActionTypes.KEYBOARD_TYPE: + execute_type(action["text"], page) + + case ActionTypes.CLICK: + # check each kind of locator in order + # TODO[shuyanzh]: order is temp now + if action["element_id"]: + node = obseration_processor.get_node_info_by_element_id(int(element_id)) + if node and (node.role=="menuitem" or node.role=="option"): + try: + page.get_by_role(node.role, name=node.name, exact=True).click() + except: + try: + page.get_by_role(node.role, name=node.name).click() + except: + try: + page.get_by_role(node.parent.role, name=node.parent.name, exact=True).select_option(node.name) + except: + page.get_by_role(node.parent.role, name=node.parent.name).select_option(node.name) + else: + try: + page.get_by_role(node.role, name=node.name, exact=True).click() + except Exception as e: + try: + page.get_by_role(node.role, name=node.name).click() + except Exception as e: + element_id = action["element_id"] + element_center = obseration_processor.get_element_center(element_id, page) # type: ignore[attr-defined] + execute_mouse_click(element_center[0], element_center[1], page) + elif action["element_role"] and action["element_name"]: + element_role = 
int(action["element_role"]) + element_name = action["element_name"] + nth = action["nth"] + execute_focus(element_role, element_name, nth, page) + execute_click_current(page) + elif action["pw_code"]: + parsed_code = parse_playwright_code(action["pw_code"]) + locator_code = parsed_code[:-1] + # [shuyanzh], don't support action args and kwargs now + execute_playwright_click(locator_code=locator_code, page=page) + else: + raise ValueError("No proper locator found for click action") + case ActionTypes.HOVER: + if action["element_id"]: + element_id = action["element_id"] + element_center = obseration_processor.get_element_center(element_id, page) # type: ignore[attr-defined] + execute_mouse_hover(element_center[0], element_center[1], page) + elif action["element_role"] and action["element_name"]: + element_role = int(action["element_role"]) + element_name = action["element_name"] + nth = action["nth"] + execute_focus(element_role, element_name, nth, page) + elif action["pw_code"]: + parsed_code = parse_playwright_code(action["pw_code"]) + locator_code = parsed_code[:-1] + # [shuyanzh], don't support action args and kwargs now + execute_playwright_hover(locator_code=locator_code, page=page) + else: + raise NotImplementedError( + "No proper locator found for hover action" + ) + case ActionTypes.TYPE: + if action["element_id"]: + if not obseration_processor.element_is_visible(page, element_id): + press_enter = True if _id2key[action["text"][-1]] == "\n" else False + node = obseration_processor.get_node_info_by_element_id(int(element_id)) + try: + if press_enter: + page.get_by_role(node.role, name=node.name, exact=True).fill("".join([_id2key[idx] for idx in action["text"][:-1]])) + time.sleep(1) + page.keyboard.press("Enter") + else: + page.get_by_role(node.role, name=node.name, exact=True).fill("".join([_id2key[idx] for idx in action["text"]])) + except: + if press_enter: + page.get_by_role(node.role, name=node.name).fill("".join([_id2key[idx] for idx in 
action["text"][:-1]])) + time.sleep(1) + page.keyboard.press("Enter") + else: + page.get_by_role(node.role, name=node.name).fill("".join([_id2key[idx] for idx in action["text"]])) + else: + element_id = action["element_id"] + element_center = obseration_processor.get_element_center(element_id, page) # type: ignore[attr-defined] + execute_mouse_click(element_center[0], element_center[1], page) + page.keyboard.press("Control+A") + for _ in range(1): + page.keyboard.press("Backspace") + execute_type(action["text"], page) + elif action["element_role"] and action["element_name"]: + element_role = int(action["element_role"]) + element_name = action["element_name"] + nth = action["nth"] + execute_focus(element_role, element_name, nth, page) + execute_type(action["text"], page) + elif action["pw_code"]: + parsed_code = parse_playwright_code(action["pw_code"]) + locator_code = parsed_code[:-1] + text = parsed_code[-1]["arguments"][0] + # [shuyanzh], don't support action args and kwargs now + execute_playwright_type( + text=text, locator_code=locator_code, page=page + ) + else: + raise NotImplementedError( + "No proper locator found for type action" + ) + + case ActionTypes.PAGE_FOCUS: + page = browser_ctx.pages[action["page_number"]] + page.bring_to_front() + case ActionTypes.NEW_TAB: + page = browser_ctx.new_page() + page.client = page.context.new_cdp_session(page) # type: ignore[attr-defined] + case ActionTypes.GO_BACK: + page.go_back() + if "about:blank" in page.url: + page.go_forward() + case ActionTypes.GO_FORWARD: + page.go_forward() + case ActionTypes.GOTO_URL: + if action["answer"] == "1": + page = browser_ctx.new_page() + page.client = page.context.new_cdp_session(page) + page.goto(action["url"]) + case ActionTypes.PAGE_CLOSE: + page.close() + if len(browser_ctx.pages) > 0: + page = browser_ctx.pages[-1] + else: + page = browser_ctx.new_page() + + case ActionTypes.SELECT_OPTION: + if action["pw_code"]: + parsed_code = parse_playwright_code(action["pw_code"]) + 
locator_code = parsed_code[:-1] + pw_action_args = parsed_code[-1].get('arguments', []) + execute_playwright_select_option(locator_code, page, pw_action_args) + else: + raise NotImplementedError( + "No proper locator found for select option action" + ) + case ActionTypes.CHECK: + if action["pw_code"]: + parsed_code = parse_playwright_code(action["pw_code"]) + locator_code = parsed_code[:-1] + execute_playwright_check(locator_code, page) + else: + raise NotImplementedError( + "No proper locator found for select option action" + ) + case ActionTypes.STOP: + pass + + case _: + raise ValueError(f"Unknown action type: {action_type}") + + return page + + +async def aexecute_action( + action: Action, page: APage, browser_ctx: ABrowserContext +) -> APage: + """Execute the async action on the ChromeDriver.""" + action_type = action["action_type"] + match action_type: + case ActionTypes.NONE: + pass + case ActionTypes.SCROLL: + direction = "up" if "up" in action["direction"] else "down" + await aexecute_scroll(direction, page) + case ActionTypes.KEY_PRESS: + keys = action["key_comb"] + await aexecute_key_press(keys, page) + + case ActionTypes.MOUSE_CLICK: + await aexecute_mouse_click( + action["coords"][0], action["coords"][1], page + ) + case ActionTypes.MOUSE_HOVER: + await aexecute_mouse_hover( + action["coords"][0], action["coords"][1], page + ) + case ActionTypes.KEYBOARD_TYPE: + await aexecute_type(action["text"], page) + + case ActionTypes.CLICK: + # check each kind of locator in order + # TODO[shuyanzh]: order is temp now + if action["element_id"]: + raise NotImplementedError + elif action["element_role"] and action["element_name"]: + element_role = int(action["element_role"]) + element_name = action["element_name"] + nth = action["nth"] + await aexecute_focus(element_role, element_name, nth, page) + await aexecute_click_current(page) + elif action["pw_code"]: + parsed_code = parse_playwright_code(action["pw_code"]) + locator_code = parsed_code[:-1] + # [shuyanzh], 
don't support action args and kwargs now + await aexecute_playwright_click( + locator_code=locator_code, page=page + ) + else: + raise ValueError("No proper locator found for click action") + case ActionTypes.HOVER: + if action["element_id"]: + raise NotImplementedError + elif action["element_role"] and action["element_name"]: + element_role = int(action["element_role"]) + element_name = action["element_name"] + nth = action["nth"] + await aexecute_focus(element_role, element_name, nth, page) + elif action["pw_code"]: + parsed_code = parse_playwright_code(action["pw_code"]) + locator_code = parsed_code[:-1] + # [shuyanzh], don't support action args and kwargs now + await aexecute_playwright_hover( + locator_code=locator_code, page=page + ) + else: + raise NotImplementedError( + "No proper locator found for hover action" + ) + case ActionTypes.TYPE: + if action["element_id"]: + raise NotImplementedError + elif action["element_role"] and action["element_name"]: + element_role = int(action["element_role"]) + element_name = action["element_name"] + nth = action["nth"] + await aexecute_focus(element_role, element_name, nth, page) + await aexecute_type(action["text"], page) + elif action["pw_code"]: + parsed_code = parse_playwright_code(action["pw_code"]) + locator_code = parsed_code[:-1] + text = parsed_code[-1]["arguments"][0] + # [shuyanzh], don't support action args and kwargs now + await aexecute_playwright_type( + text=text, locator_code=locator_code, page=page + ) + else: + raise NotImplementedError( + "No proper locator found for type action" + ) + + case ActionTypes.PAGE_FOCUS: + page = browser_ctx.pages[action["page_number"]] + await page.bring_to_front() + case ActionTypes.NEW_TAB: + page = await browser_ctx.new_page() + case ActionTypes.GO_BACK: + await page.go_back() + case ActionTypes.GO_FORWARD: + await page.go_forward() + case ActionTypes.GOTO_URL: + await page.goto(action["url"]) + case ActionTypes.PAGE_CLOSE: + await page.close() + if 
len(browser_ctx.pages) > 0: + page = browser_ctx.pages[-1] + else: + page = await browser_ctx.new_page() + + case ActionTypes.SELECT_OPTION: + if action["pw_code"]: + parsed_code = parse_playwright_code(action["pw_code"]) + locator_code = parsed_code[:-1] + await aexecute_playwright_select_option(locator_code, page) + else: + raise NotImplementedError( + "No proper locator found for select option action" + ) + case ActionTypes.CHECK: + if action["pw_code"]: + parsed_code = parse_playwright_code(action["pw_code"]) + locator_code = parsed_code[:-1] + await aexecute_playwright_check(locator_code, page) + else: + raise NotImplementedError( + "No proper locator found for select option action" + ) + + case _: + raise ValueError(f"Unknown action type: {action_type}") + + return page + + +def parse_playwright_code(code: str) -> list[ParsedPlaywrightCode]: + # extract function calls + if not code.startswith("page."): + raise ValueError( + f'Playwright action must start with "page.", but got {code}' + ) + + regex = r"\.(?![^\(\)]*\))" + chain = re.split(regex, code)[1:] + + parsed_chain = [] + + for item in chain: + tree = ast.parse(item) + funcs = [] + for node in ast.walk(tree): + if isinstance(node, ast.Call): + function_name = node.func.id # type: ignore[attr-defined] + arguments = [ + ast.literal_eval(arg) if isinstance(arg, ast.Str) else arg + for arg in node.args + ] + keywords = { + str(kw.arg): ast.literal_eval(kw.value) + for kw in node.keywords + } + funcs.append( + ParsedPlaywrightCode( + { + "function_name": function_name, + "arguments": arguments, + "keywords": keywords, + } + ) + ) + + if len(funcs) != 1: + raise ValueError(f"Fail to parse {item} in {code}") + + if ( + funcs[0]["function_name"] + not in PLAYWRIGHT_LOCATORS + PLAYWRIGHT_ACTIONS + ): + raise ValueError( + f"Invalid playwright code {item}, ", + f"the function needs to be one of {PLAYWRIGHT_LOCATORS + PLAYWRIGHT_ACTIONS}", + ) + + parsed_chain.append(funcs[0]) + + last_action = parsed_chain[-1] + 
if last_action["function_name"] not in PLAYWRIGHT_ACTIONS: + raise ValueError( + f"Invalid playwright action {last_action},", + f"the action needs to be one of {PLAYWRIGHT_ACTIONS}", + ) + + return parsed_chain + + +class ActionParsingError(Exception): + def __init__(self, message: str) -> None: + self.message = message + super().__init__(self.message) + + +@beartype +def create_playwright_action(playwright_code: str) -> Action: + """Main function to return individual playwright action""" + # get the last action + regex = r"\.(?![^\(\)]*\))" + action = re.split(regex, playwright_code)[-1].split("(")[0] + match action: + case "press": + p = r'press\((?:"|\')(.+?)(?:"|\')\)' + match = re.search(p, playwright_code) + if not match: + raise ActionParsingError( + f"Invalid press action, required to be page.press(KEY_COMB_STR)" + ) + key_comb = match.group(1) + return create_key_press_action(key_comb=key_comb) + case "scroll": + direction = "up" if "up" in playwright_code else "down" + return create_scroll_action(direction=direction) + case "click": + return create_click_action(pw_code=playwright_code) + case "hover": + return create_hover_action(pw_code=playwright_code) + case "type" | "fill": + p = r'type|fill\((?:"|\')(.+?)(?:"|\')\)' + match = re.search(p, playwright_code) + if not match: + raise ActionParsingError( + f"Invalid type/fill action, required to be page.type(TEXT)" + ) + text = match.group(1) + return create_type_action(text=text, pw_code=playwright_code) + case "select_option": + return create_select_option_action(pw_code=playwright_code) + case "check": + return create_check_action(pw_code=playwright_code) + case "goto": + p = r'goto\((?:"|\')(.+?)(?:"|\')\)' + match = re.search(p, playwright_code) + if not match: + raise ActionParsingError( + f"Invalid goto action, required to be page.goto(URL_STR)" + ) + url = match.group(1) + return create_goto_url_action(url) + case "page_focus": + # get the page number + p = r"page_focus\((\d+)\)" + match = 
re.search(p, playwright_code) + if not match: + raise ActionParsingError("page focus requires a page number") + page_num = int(match.group(1)) + return create_page_focus_action(page_num) + case "new_tab": + return create_new_tab_action() + case "go_back": + return create_go_back_action() + case "go_forward": + return create_go_forward_action() + case "page_close": + return create_page_close_action() + case "stop": # page.stop(answer) + p = r'stop\(?"(.+)?"\)' + match = re.search(p, playwright_code) + if not match: + answer = "" + else: + answer = match.group(1) + return create_stop_action(answer) + + raise ActionParsingError(f"Unknown playwright action {action}") + + +@beartype +def create_id_based_action(action_str: str): + """Main function to return individual id based action""" + action_str = action_str.strip() + action = ( + action_str.split("[")[0].strip() + if "[" in action_str + else action_str.split()[0].strip() + ) + match action: + case "click": + match = re.search(r"click ?\[(\d+)\]", action_str) + if not match: + raise ActionParsingError(f"Invalid click action {action_str}") + element_id = match.group(1) + return create_click_action(element_id=element_id) + case "hover": + match = re.search(r"hover ?\[(\d+)\]", action_str) + if not match: + raise ActionParsingError(f"Invalid hover action {action_str}") + element_id = match.group(1) + return create_hover_action(element_id=element_id) + case "type": + # add default enter flag + if not (action_str.endswith("[0]") or action_str.endswith("[1]")): + action_str += " [1]" + + match = re.search( + r"type ?\[(\d+)\] ?\[(.+)\] ?\[(\d+)\]", action_str + ) + if not match: + raise ActionParsingError(f"Invalid type action {action_str}") + element_id, text, enter_flag = ( + match.group(1), + match.group(2), + match.group(3), + ) + enter_flag = True if enter_flag == "1" else False + if enter_flag: + text += "\n" + return create_our_type_action(text=text, element_id=element_id, flag=enter_flag) + case "press": + match = 
re.search(r"press ?\[(.+)\]", action_str) + if not match: + raise ActionParsingError(f"Invalid press action {action_str}") + key_comb = match.group(1) + return create_key_press_action(key_comb=key_comb) + case "scroll": + # up or down + match = re.search(r"scroll ?\[?(up|down)\]?", action_str) + if not match: + raise ActionParsingError(f"Invalid scroll action {action_str}") + direction = match.group(1) + return create_scroll_action(direction=direction) + case "goto": + if not (action_str.endswith("[0]") or action_str.endswith("[1]")): + action_str += " [1]" + match = re.search(r"goto ?\[(.+)\] \[(\d+)\]", action_str) + if not match: + raise ActionParsingError(f"Invalid goto action {action_str}") + url, new_tab_flag = ( + match.group(1), + match.group(2), + ) + new_tab_flag = True if new_tab_flag == "1" else False + return create_our_goto_url_action(url=url, flag=new_tab_flag) + case "new_tab": + return create_new_tab_action() + case "go_back": + return create_go_back_action() + case "go_forward": + return create_go_forward_action() + case "tab_focus": + match = re.search(r"tab_focus ?\[(\d+)\]", action_str) + if not match: + raise ActionParsingError( + f"Invalid tab_focus action {action_str}" + ) + page_number = int(match.group(1)) + return create_page_focus_action(page_number) + case "close_tab": + return create_page_close_action() + case "stop": # stop answer + match = re.search(r"stop ?\[(.+)\]", action_str) + if not match: # some tasks don't require an answer + answer = "" + else: + answer = match.group(1) + return create_stop_action(answer) + case "select": + match = re.search(r"select ?\[(.+)\] ?\[(.+)\]", action_str) + if not match: + raise ActionParsingError(f"Invalid select action {action_str}") + elem = match.group(1) + option = match.group(2) + pw_code = f'page.get_by_test_id("{elem}").select_option("{option}")' + return create_our_select_option_action(pw_code, elem, option) + case "record": + match = re.search(r"record ?\[(.+)\]", action_str) + text = 
match.group(1) + return create_record_action(text) + + return False + +def create_id_based_actions(action_str: str): + valid_actions = ["click", "hover", "type", "press", "scroll", "goto", "new_tab", "go_back", "go_forward", "tab_focus", "close tab", "stop", "select", "record", "branch", "prune", "note"] + def parse_str_to_action_list(text:str, actions: list): + remain_text = text + action_list = [] + while remain_text: + find_action = False + for action in actions: + if remain_text.startswith(action): + match = re.search(rf'({re.escape(action)}\s*(.*?))(?=\n(?:{"|".join(map(re.escape, actions))})|$)', remain_text, re.DOTALL) + action_list.append(match[0]) + remain_text = remain_text[len(match[0]):].strip() + find_action = True + if not find_action: + break + return action_list + action_str_list = parse_str_to_action_list(action_str, valid_actions) + action_cmds = [] + for raw_action_str in action_str_list: + action = ( + raw_action_str.split("[")[0].strip() + if "[" in raw_action_str + else raw_action_str.split()[0].strip() + ) + match action: + case "click": + match = re.search(r"click ?\[(\d+)\]", raw_action_str) + if not match: + raise ActionParsingError(f"Invalid click action {raw_action_str}") + element_id = match.group(1) + action_cmds.append(create_click_action(element_id=element_id)) + case "hover": + match = re.search(r"hover ?\[(\d+)\]", raw_action_str) + if not match: + raise ActionParsingError(f"Invalid hover action {raw_action_str}") + element_id = match.group(1) + action_cmds.append(create_hover_action(element_id=element_id)) + case "type": + # add default enter flag + if not (raw_action_str.endswith("[0]") or raw_action_str.endswith("[1]")): + raw_action_str += " [1]" + + match = re.search( + r"type ?\[(\d+)\] ?\[(.*)\] ?\[(\d+)\]", raw_action_str, re.DOTALL + ) + if not match: + raise ActionParsingError(f"Invalid type action {raw_action_str}") + element_id, text, enter_flag = ( + match.group(1), + match.group(2), + match.group(3), + ) + enter_flag 
= True if enter_flag == "1" else False + if enter_flag: + text += "\n" + action_cmds.append(create_our_type_action(text=text, element_id=element_id, flag=enter_flag)) + case "press": + match = re.search(r"press ?\[(.+)\]", raw_action_str) + if not match: + raise ActionParsingError(f"Invalid press action {raw_action_str}") + key_comb = match.group(1) + action_cmds.append(create_key_press_action(key_comb=key_comb)) + case "scroll": + # up or down + match = re.search(r"scroll ?\[?(up|down)\]?", raw_action_str) + if not match: + raise ActionParsingError(f"Invalid scroll action {raw_action_str}") + direction = match.group(1) + action_cmds.append(create_scroll_action(direction=direction)) + case "goto": + if not (raw_action_str.endswith("[0]") or raw_action_str.endswith("[1]")): + raw_action_str += " [1]" + match = re.search(r"goto ?\[(.+)\] \[(\d+)\]", raw_action_str) + if not match: + raise ActionParsingError(f"Invalid goto action {raw_action_str}") + url, new_tab_flag = ( + match.group(1), + match.group(2), + ) + new_tab_flag = True if new_tab_flag == "1" else False + action_cmds.append(create_our_goto_url_action(url=url, flag=new_tab_flag)) + case "new_tab": + action_cmds.append(create_new_tab_action()) + case "go_back": + action_cmds.append(create_go_back_action()) + case "go_forward": + action_cmds.append(create_go_forward_action()) + case "tab_focus": + match = re.search(r"tab_focus ?\[(\d+)\]", raw_action_str) + if not match: + raise ActionParsingError( + f"Invalid tab_focus action {raw_action_str}" + ) + page_number = int(match.group(1)) + action_cmds.append(create_page_focus_action(page_number)) + case "close_tab": + action_cmds.append(create_page_close_action()) + case "stop": # stop answer + match = re.search(r"stop ?\[?(.+)\]?", raw_action_str, re.DOTALL) + if not match: # some tasks don't require an answer + answer = "" + else: + answer = match.group(1) + if answer.endswith("]"): + answer = answer[:-1] + action_cmds.append(create_stop_action(answer)) + case 
"select": + match = re.search(r"select ?\[(.+)\] ?\[(.+)\]", raw_action_str) + if not match: + raise ActionParsingError(f"Invalid select action {raw_action_str}") + elem = match.group(1) + option = match.group(2) + pw_code = f'page.get_by_test_id("{elem}").select_option("{option}")' + action_cmds.append(create_our_select_option_action(pw_code, elem, option)) + case "record": + match = re.search(r"record ?\[?(.+)\]?", raw_action_str, re.DOTALL) + text = match.group(1) + action_cmds.append(create_record_action(text)) + case "note": + pass + + return action_cmds \ No newline at end of file diff --git a/browser_env/async_envs.py b/browser_env/async_envs.py new file mode 100644 index 0000000..29fb32f --- /dev/null +++ b/browser_env/async_envs.py @@ -0,0 +1,153 @@ +import asyncio +import json +from dataclasses import dataclass +from pathlib import Path + +import numpy as np +import numpy.typing as npt +from gymnasium import Env +from gymnasium.spaces import Box, Text +from playwright.async_api import Page, ViewportSize, async_playwright + +from .actions import Action, aexecute_action, get_action_space +from .utils import DetachedPage, png_bytes_to_numpy + + +class AsyncScriptBrowserEnv(Env[npt.NDArray[np.uint8], Action]): + """ + The goal of this environment is to produce a prototype of a browser environment. + In the end, we want to support a fully configurable browser environment with wide + range of action spaces and observation spaces, both structured and unstructured. + But in this prototype, we just support action space specified by Playwright script, + and observation space is the html content of the page. 
+ """ + + def __init__( + self, + max_page_length: int = 2048, + headless: bool = True, + slow_mo: int = 0, + timeout: int = 30000, + viewport_size: ViewportSize = {"width": 1280, "height": 720}, + ): + self.observation_space = Box( + 0, + 255, + (viewport_size["height"], viewport_size["width"], 4), + np.uint8, + ) + # TODO: make Space[Action] = ActionSpace + self.action_space = get_action_space() # type: ignore[assignment] + self.headless = headless + self.slow_mo = slow_mo + self.reset_finished = False + self.timeout = timeout + self.viewport_size = viewport_size + + async def setup(self, config_file: Path | None = None) -> None: + self.context_manager = async_playwright() + self.playwright = await self.context_manager.__aenter__() + self.browser = await self.playwright.chromium.launch( + headless=self.headless, slow_mo=self.slow_mo + ) + if config_file: + with open(config_file, "r") as f: + instance_config = json.load(f) + else: + instance_config = {} + + storage_state = instance_config.get("storage_state", None) + start_url = instance_config.get("start_url", None) + geolocation = instance_config.get("geolocation", None) + + self.context = await self.browser.new_context( + viewport=self.viewport_size, + storage_state=storage_state, + geolocation=geolocation, + device_scale_factor=1, + ) + self.page = await self.context.new_page() + if start_url: + await self.page.goto(start_url) + + async def areset( + self, + *, + seed: int | None = None, + options: dict[str, str] | None = None, + ) -> tuple[npt.NDArray[np.uint8], dict[str, object]]: + """ + Reset the environment. + :param options: options for the environment. 
The options are: + - storage_state: the path to the storage state file + """ + super().reset(seed=seed, options=options) + if self.reset_finished: + await self.context_manager.__aexit__() + if options is not None and "config_file" in options: + config_file = Path(options["config_file"]) + if config_file.exists(): + await self.setup(config_file=config_file) + else: + raise ValueError(f"Config state {config_file} does not exist.") + else: + await self.setup() + self.reset_finished = True + content = await self.page.content() + screenshot = png_bytes_to_numpy(await self.page.screenshot()) + return ( + screenshot, + {"page": DetachedPage(self.page.url, content)}, + ) + + def reset( + self, + *, + seed: int | None = None, + options: dict[str, str] | None = None, + ) -> tuple[npt.NDArray[np.uint8], dict[str, object]]: + return asyncio.run(self.areset(seed=seed, options=options)) + + async def aclose(self) -> None: + if self.reset_finished: + await self.context_manager.__aexit__() + + def close(self) -> None: + asyncio.run(self.aclose()) + + async def astep( + self, action: Action + ) -> tuple[npt.NDArray[np.uint8], float, bool, bool, dict[str, object]]: + if not self.reset_finished: + raise RuntimeError("Call reset first before calling step.") + success = False + fail_error = "" + try: + self.page = await aexecute_action(action, self.page, self.context) + success = True + except Exception as e: + fail_error = str(e) + + try: + content = await self.page.content() + screenshot = png_bytes_to_numpy(await self.page.screenshot()) + except: + await self.page.wait_for_load_state("load") + content = await self.page.content() + screenshot = png_bytes_to_numpy(await self.page.screenshot()) + + return ( + screenshot, + float(success), + False, + False, + { + "page": DetachedPage(self.page.url, content), + "fail_error": fail_error, + }, + ) + + def step( + self, action: Action + ) -> tuple[npt.NDArray[np.uint8], float, bool, bool, dict[str, object]]: + return 
def is_expired(
    storage_state: Path, url: str, keyword: str, url_exact: bool = True
) -> bool:
    """Test whether the saved login cookie is expired.

    Opens ``url`` in a browser context restored from ``storage_state`` and
    checks whether the page still looks logged in.

    Args:
        storage_state: Path to the saved browser storage-state file.
        url: Page to open for the check.
        keyword: If non-empty, the state counts as valid only when this text
            appears in the page content.
        url_exact: Used only when ``keyword`` is empty.  If true, the final
            page URL must equal ``url``; otherwise it must contain ``url``.

    Returns:
        ``True`` if the file does not exist or the check fails, ``False`` if
        the state is still valid.
    """
    if not storage_state.exists():
        return True

    # The ``with`` block shuts Playwright down even if navigation raises.
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=True, slow_mo=SLOW_MO)
        context = browser.new_context(storage_state=storage_state)
        page = context.new_page()
        page.goto(url)
        time.sleep(1)
        d_url = page.url
        content = page.content()

    if keyword:
        return keyword not in content
    if url_exact:
        return d_url != url
    return url not in d_url
"shopping": + username = ACCOUNTS["shopping"]["username"] + password = ACCOUNTS["shopping"]["password"] + page.goto(f"{SHOPPING}/customer/account/login/") + page.get_by_label("Email", exact=True).fill(username) + page.get_by_label("Password", exact=True).fill(password) + page.get_by_role("button", name="Sign In").click() + + if c == "reddit": + username = ACCOUNTS["reddit"]["username"] + password = ACCOUNTS["reddit"]["password"] + page.goto(f"{REDDIT}/login") + page.get_by_label("Username").fill(username) + page.get_by_label("Password").fill(password) + page.get_by_role("button", name="Log in").click() + + if c == "shopping_admin": + username = ACCOUNTS["shopping_admin"]["username"] + password = ACCOUNTS["shopping_admin"]["password"] + page.goto(f"{SHOPPING_ADMIN}") + page.get_by_placeholder("user name").fill(username) + page.get_by_placeholder("password").fill(password) + page.get_by_role("button", name="Sign in").click() + + if c == "gitlab": + username = ACCOUNTS["gitlab"]["username"] + password = ACCOUNTS["gitlab"]["password"] + page.goto(f"{GITLAB}/users/sign_in") + page.screenshot(path="debug.png") + page.get_by_test_id("username-field").click() + page.get_by_test_id("username-field").fill(username) + page.get_by_test_id("username-field").press("Tab") + page.get_by_test_id("password-field").fill(password) + page.get_by_test_id("sign-in-button").click() + + context.storage_state(path=f"{auth_folder}/{c}_state.json") + + context_manager.__exit__() + context_manager = sync_playwright() + playwright = context_manager.__enter__() + browser = playwright.chromium.launch(headless=HEADLESS) + context = browser.new_context() + page = context.new_page() + + if "shopping" in comb: + username = ACCOUNTS["shopping"]["username"] + password = ACCOUNTS["shopping"]["password"] + page.goto(f"{SHOPPING}/customer/account/login/") + page.get_by_label("Email", exact=True).fill(username) + page.get_by_label("Password", exact=True).fill(password) + page.get_by_role("button", 
def main(auth_folder: str = "./.auth") -> None:
    """Renew the saved login state of every site and site pair, then verify.

    Login states are renewed in a thread pool for each single site and for
    each pair of sites (reddit is not paired with the shopping sites).
    Afterwards every ``*.json`` file in ``auth_folder`` is checked with
    ``is_expired`` for each site named in its file name.

    Args:
        auth_folder: Directory the storage-state files are written to and
            read from.

    Raises:
        AssertionError: If any saved state is reported as expired.
    """
    max_workers = 8
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for pair in combinations(SITES, 2):
            # TODO[shuyanzh] auth don't work on these two sites
            if "reddit" in pair and (
                "shopping" in pair or "shopping_admin" in pair
            ):
                continue
            executor.submit(
                renew_comb, sorted(pair), auth_folder=auth_folder
            )

        for site in SITES:
            executor.submit(renew_comb, [site], auth_folder=auth_folder)

    # Keep each future with the file it checks: a file covering several sites
    # yields several futures, so a future's position does not identify it.
    checks = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for c_file in glob.glob(f"{auth_folder}/*.json"):
            for cur_site in get_site_comb_from_filepath(c_file):
                idx = SITES.index(cur_site)
                future = executor.submit(
                    is_expired,
                    Path(c_file),
                    URLS[idx],
                    KEYWORDS[idx],
                    EXACT_MATCH[idx],
                )
                checks.append((c_file, future))

    for c_file, future in checks:
        # Raised explicitly so the check still runs under ``python -O``.
        if future.result():
            raise AssertionError(f"Cookie {c_file} expired.")
"textbox", + "time", + "timer", + "toolbar", + "tooltip", + "tree", + "treegrid", + "treeitem", +) + +SPECIAL_LOCATORS = ( + "alt_text", + "label", + "placeholder", +) + +ASCII_CHARSET = "".join(chr(x) for x in range(32, 128)) +FREQ_UNICODE_CHARSET = "".join(chr(x) for x in range(129, 110000)) +UTTERANCE_MAX_LENGTH = 8192 +ATTRIBUTE_MAX_LENGTH = 256 +TEXT_MAX_LENGTH = 256 +TYPING_MAX_LENGTH = 64 +URL_MAX_LENGTH = 256 +MAX_ELEMENT_INDEX_IN_VIEWPORT = 10 +MAX_ELEMENT_ID = 1000 +MAX_ANSWER_LENGTH = 512 + +MIN_REF = -1000000 +MAX_REF = 1000000 + +WINDOW_WIDTH = 500 +WINDOW_HEIGHT = 240 +TASK_WIDTH = 160 +TASK_HEIGHT = 210 + +FLIGHT_WINDOW_WIDTH = 600 +FLIGHT_WINDOW_HEIGHT = 700 +FLIGHT_TASK_WIDTH = 375 +FLIGHT_TASK_HEIGHT = 667 +MAX_PAGE_NUMBER = 10 + +SPECIAL_KEYS = ( + "Enter", + "Tab", + "Control", + "Shift", + "Meta", + "Backspace", + "Delete", + "Escape", + "ArrowUp", + "ArrowDown", + "ArrowLeft", + "ArrowRight", + "PageDown", + "PageUp", + "Meta+a", +) + +SPECIAL_KEY_MAPPINGS = { + "backquote": "Backquote", + "minus": "Minus", + "equal": "Equal", + "backslash": "Backslash", + "backspace": "Backspace", + "meta": "Meta", + "tab": "Tab", + "delete": "Delete", + "escape": "Escape", + "arrowdown": "ArrowDown", + "end": "End", + "enter": "Enter", + "home": "Home", + "insert": "Insert", + "pagedown": "PageDown", + "pageup": "PageUp", + "arrowright": "ArrowRight", + "arrowup": "ArrowUp", + "f1": "F1", + "f2": "F2", + "f3": "F3", + "f4": "F4", + "f5": "F5", + "f6": "F6", + "f7": "F7", + "f8": "F8", + "f9": "F9", + "f10": "F10", + "f11": "F11", + "f12": "F12", +} + +RolesType = Literal[ + "alert", + "alertdialog", + "application", + "article", + "banner", + "blockquote", + "button", + "caption", + "cell", + "checkbox", + "code", + "columnheader", + "combobox", + "complementary", + "contentinfo", + "definition", + "deletion", + "dialog", + "directory", + "document", + "emphasis", + "feed", + "figure", + "form", + "generic", + "grid", + "gridcell", + "group", + "heading", + 
"img", + "insertion", + "link", + "list", + "listbox", + "listitem", + "log", + "main", + "marquee", + "math", + "meter", + "menu", + "menubar", + "menuitem", + "menuitemcheckbox", + "menuitemradio", + "navigation", + "none", + "note", + "option", + "paragraph", + "presentation", + "progressbar", + "radio", + "radiogroup", + "region", + "row", + "rowgroup", + "rowheader", + "scrollbar", + "search", + "searchbox", + "separator", + "slider", + "spinbutton", + "status", + "strong", + "subscript", + "superscript", + "switch", + "tab", + "table", + "tablist", + "tabpanel", + "term", + "textbox", + "time", + "timer", + "toolbar", + "tooltip", + "tree", + "treegrid", + "treeitem", + "alt_text", + "label", + "placeholder", +] + +MAX_VANILLA_STR_LENGTH = 1000 + +PLAYWRIGHT_LOCATORS = ( + "get_by_role", + "get_by_text", + "get_by_label", + "get_by_placeholder", + "get_by_alt_text", + "get_by_title", + "get_by_test_id", + "filter", + "frame_locator", + "locator", +) + +PLAYWRIGHT_ACTIONS = ( + "fill", + "check", + "select_option", + "click", + "hover", + "dclick", + "type", + "focus", + "goto", + "press", + "scroll", +) + +IGNORED_ACTREE_PROPERTIES = ( + "focusable", + "editable", + "readonly", + "level", + "settable", + "multiline", + "invalid", +) diff --git a/browser_env/env_config.py b/browser_env/env_config.py new file mode 100644 index 0000000..f57f304 --- /dev/null +++ b/browser_env/env_config.py @@ -0,0 +1,51 @@ +# websites domain +import os + +REDDIT = os.environ.get("REDDIT", "http://localhost:9999") +SHOPPING = os.environ.get("SHOPPING", "http://localhost:7770") +SHOPPING_ADMIN = os.environ.get("SHOPPING_ADMIN", "http://localhost:7780/admin") +GITLAB = os.environ.get("GITLAB", "http://localhost:8023") +WIKIPEDIA = os.environ.get("WIKIPEDIA", "http://localhost:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing") +MAP = os.environ.get("MAP", "http://localhost:3000") +HOMEPAGE = os.environ.get("HOMEPAGE", "http://localhost:4399") + +assert ( + 
REDDIT + and SHOPPING + and SHOPPING_ADMIN + and GITLAB + and WIKIPEDIA + and MAP + and HOMEPAGE +), ( + f"Please setup the URLs to each site. Current: \n" + + f"Reddit: {REDDIT}\n" + + f"Shopping: {SHOPPING}\n" + + f"Shopping Admin: {SHOPPING_ADMIN}\n" + + f"Gitlab: {GITLAB}\n" + + f"Wikipedia: {WIKIPEDIA}\n" + + f"Map: {MAP}\n" + + f"Homepage: {HOMEPAGE}\n" +) + + +ACCOUNTS = { + "reddit": {"username": "MarvelsGrantMan136", "password": "test1234"}, + "gitlab": {"username": "byteblaze", "password": "hello1234"}, + "shopping": { + "username": "emma.lopez@gmail.com", + "password": "Password.123", + }, + "shopping_admin": {"username": "admin", "password": "admin1234"}, + "shopping_site_admin": {"username": "admin", "password": "admin1234"}, +} + +URL_MAPPINGS = { + REDDIT: "http://reddit.com", + SHOPPING: "http://onestopmarket.com", + SHOPPING_ADMIN: "http://luma.com/admin", + GITLAB: "http://gitlab.com", + WIKIPEDIA: "http://wikipedia.org", + MAP: "http://openstreetmap.org", + HOMEPAGE: "http://homepage.com", +} diff --git a/browser_env/envs.py b/browser_env/envs.py new file mode 100644 index 0000000..336ca0f --- /dev/null +++ b/browser_env/envs.py @@ -0,0 +1,334 @@ +import json +import re +import time +from collections import defaultdict +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Union + +import numpy as np +import numpy.typing as npt +from beartype import beartype +from beartype.door import is_bearable +from gymnasium import Env +from gymnasium.spaces import Box, Text +from playwright.sync_api import ( + CDPSession, + Page, + Playwright, + ViewportSize, + expect, + sync_playwright, +) + +from .actions import Action, execute_action, get_action_space +from .processors import ObservationHandler, ObservationMetadata +from .utils import ( + AccessibilityTree, + DetachedPage, + Observation, + png_bytes_to_numpy, +) + +import base64 +from .scripts import * + +@dataclass +class PlaywrightScript: + function: str # goto, 
get_by_role + destination: str # https://www.google.com/, combobox + name: str | None = None # Search, Avatar 2009 + operation: str | None = None # click, fill, press + value: str | None = None # avatar movie, Enter + + +def parse_action(action: str) -> PlaywrightScript: + splitted = action.strip().split(" ") + assert len(splitted) >= 2 + match splitted[:2]: + case ["goto", url]: + assert len(splitted) == 2 + return PlaywrightScript("goto", url) + case ["get_by_role", destination]: + assert len(splitted) >= 4 + match splitted[2:]: + case [name, operation]: + return PlaywrightScript( + "get_by_role", destination, name, operation + ) + case [name, operation, value]: + return PlaywrightScript( + "get_by_role", destination, name, operation, value + ) + case _: + raise ValueError("Invalid action") + case _: + raise ValueError(f"Invalid action {action}") + + +class ScriptBrowserEnv(Env[dict[str, Observation], Action]): + """ + The goal of this environment is to produce a prototype of a browser environment. + In the end, we want to support a fully configurable browser environment with wide + range of action spaces and observation spaces, both structured and unstructured. + But in this prototype, we just support action space specified by Playwright script, + and observation space is the html content of the page. 
+ """ + + @beartype + def __init__( + self, + max_page_length: int = 8192, + headless: bool = True, + slow_mo: int = 0, + observation_type: str = "html", + current_viewport_only: bool = False, + viewport_size: ViewportSize = {"width": 1280, "height": 720}, + save_trace_enabled: bool = False, + sleep_after_execution: float = 5.0, + global_config = None, + ): + # TODO: make Space[Action] = ActionSpace + self.action_space = get_action_space() # type: ignore[assignment] + self.headless = headless + self.slow_mo = slow_mo + self.current_viewport_only = current_viewport_only + self.reset_finished = False + self.viewport_size = viewport_size + self.save_trace_enabled = save_trace_enabled + self.sleep_after_execution = sleep_after_execution + self.global_config = global_config + + match observation_type: + case "html" | "accessibility_tree": + self.text_observation_type = observation_type + self.image_observation_type = "" + self.main_observation_type = "text" + case "image": + self.image_observation_type = observation_type + self.text_observation_type = "" # type: ignore[assignment] + self.main_observation_type = "image" + case _: + raise ValueError( + f"Unsupported observation type: {observation_type}" + ) + + self.observation_handler = ObservationHandler( + self.main_observation_type, + self.text_observation_type, + self.image_observation_type, + self.current_viewport_only, + self.viewport_size, + ) + + self.observation_space = ( + self.observation_handler.get_observation_space() + ) + + @beartype + def setup(self, config_file: Path | None = None) -> None: + def handle_dialog(dialog): + self.page.dialog_message = dialog.message + dialog.dismiss() + self.context_manager = sync_playwright() + self.playwright = self.context_manager.__enter__() + self.browser = self.playwright.chromium.launch( + headless=self.headless, slow_mo=self.slow_mo + ) + + if config_file: + with open(config_file, "r") as f: + instance_config = json.load(f) + else: + instance_config = {} + + 
storage_state = instance_config.get("storage_state", None) + start_url = instance_config.get("start_url", None) + geolocation = instance_config.get("geolocation", None) + + self.context = self.browser.new_context( + viewport=self.viewport_size, + storage_state=storage_state, + geolocation=geolocation, + device_scale_factor=1, + ) + if self.save_trace_enabled: + self.context.tracing.start(screenshots=True, snapshots=True) + if start_url: + start_urls = start_url.split(" |AND| ") + for url in start_urls: + page = self.context.new_page() + page.on("dialog", handle_dialog) + client = page.context.new_cdp_session( + page + ) # talk to chrome devtools + if self.text_observation_type == "accessibility_tree": + client.send("Accessibility.enable") + page.client = client # type: ignore # TODO[shuyanzh], fix this hackey client + page.goto(url) + # set the first page as the current page + self.page = self.context.pages[0] + self.page.bring_to_front() + else: + self.page = self.context.new_page() + page.on("dialog", handle_dialog) + client = self.page.context.new_cdp_session(self.page) + if self.text_observation_type == "accessibility_tree": + client.send("Accessibility.enable") + self.page.client = client # type: ignore + + def get_page_client(self, page: Page) -> CDPSession: + return page.client # type: ignore + + def _get_obs(self) -> dict[str, Observation]: + obs = self.observation_handler.get_observation( + self.page, self.get_page_client(self.page) + ) + return obs + + def _get_obs_metadata(self) -> dict[str, ObservationMetadata]: + metadata = self.observation_handler.get_observation_metadata() + return metadata + + @beartype + def reset( + self, + *, + seed: int | None = None, + options: dict[str, str] | None = None, + ) -> tuple[dict[str, Observation], dict[str, Any]]: + """ + Reset the environment. + :param options: options for the environment. The current supported options are: + - "storage_state": the storage state of the browser. It is a file path to a json file. 
+ """ + super().reset(seed=seed, options=options) + if self.reset_finished: + self.context_manager.__exit__() + + if options is not None and "config_file" in options: + config_file = Path(options["config_file"]) + if config_file.exists(): + self.setup(config_file=config_file) + else: + raise ValueError(f"Config file {config_file} does not exist.") + else: + self.setup() + self.reset_finished = True + + if self.sleep_after_execution > 0: + time.sleep(self.sleep_after_execution) + + images = self.modify_page() + + observation = self._get_obs() + observation_metadata = self._get_obs_metadata() + info = { + "page": DetachedPage(self.page.url, ""), + "fail_error": "", + "observation_metadata": observation_metadata, + "images": images, + } + + return (observation, info) + + def save_trace(self, trace_path: str | Path) -> None: + if self.save_trace_enabled: + self.context.tracing.stop(path=trace_path) + + def close(self) -> None: + if self.reset_finished: + self.context_manager.__exit__() + + def step( + self, action: Action + ) -> tuple[dict[str, Observation], float, bool, bool, dict[str, Any]]: + if not self.reset_finished: + raise RuntimeError("Call reset first before calling step.") + + success = False + fail_error = "" + try: + self.page = execute_action( + action, + self.page, + self.context, + self.observation_handler.action_processor, + ) + success = True + except Exception as e: + fail_error = str(e) + raise e + + # hard sleep TODO[shuyanzh] suboptimal, may need to check network + if self.sleep_after_execution > 0: + time.sleep(self.sleep_after_execution) + + images = self.modify_page() + + observation = self._get_obs() + observation_metadata = self._get_obs_metadata() + + info = { + "page": DetachedPage(self.page.url, self.page.content()), + "fail_error": fail_error, + "observation_metadata": observation_metadata, + "images": images, + } + + msg = ( + observation, + float(success), # reward + False, # terminated + False, # truncated + info, + ) + return msg + + 
def modify_page(self): + self.page.wait_for_timeout(500) + try: + self.page.evaluate(remove_id_script) + except: + pass + + suffix = getattr(self.global_config, "logname", "") + if suffix: + img_bytes = self.page.screenshot(path=f"output/screenshot-{suffix}.png", full_page=True) + else: + img_bytes = self.page.screenshot(path="output/screenshot_raw.png") + raw_image = base64.b64encode(img_bytes).decode() + + self.page.evaluate(mix_marker_script) + self.page.wait_for_timeout(100) + + # get all clickable elements + start_id = 0 + elem_items, start_id = self.page.evaluate(get_rect_script, { + "selector": ".possible-clickable-element", + "startIndex": start_id + }) + + # get ocr items + ocr_items = [] + # ocr_items = page.evaluate(canva_handler_script) + # svg_items, _ = page.evaluate(get_rect_script, {"selector": "svg", "startIndex": -1}) + # ocr_items = ocr_items + svg_items + # ocr_items, start_id = get_canva_images(ocr_items, img_bytes, start_id) + + items = elem_items + ocr_items + + # mark our own labels and get the images + items = self.page.evaluate(label_marker_script, items) + if suffix: + img_bytes = self.page.screenshot(path=f"output/marked-{suffix}.png", full_page=True) + else: + img_bytes = self.page.screenshot(path="output/marked.png") + marked_image = base64.b64encode(img_bytes).decode() + + self.page.evaluate(remove_label_mark_script) + + return { + "raw_image": raw_image, + "marked_image": marked_image, + } \ No newline at end of file diff --git a/browser_env/helper_functions.py b/browser_env/helper_functions.py new file mode 100644 index 0000000..18521ab --- /dev/null +++ b/browser_env/helper_functions.py @@ -0,0 +1,307 @@ +import base64 +import io +import json +import re +from pathlib import Path +from typing import Any + +from PIL import Image + +from agent.prompts import * +from browser_env import ( + Action, + ActionTypes, + ObservationMetadata, + StateInfo, + action2str, +) + +HTML_TEMPLATE = """ + + + + + + + {body} + + +""" + + +def 
get_render_action( + action: Action, + observation_metadata: dict[str, ObservationMetadata], + action_set_tag: str, +) -> str: + """Parse the predicted actions for rendering purpose. More comprehensive information""" + match action_set_tag: + case "id_html_tree": + text_meta_data = observation_metadata["text"] + if action["element_id"] in text_meta_data["obs_nodes_info"]: + node_content = text_meta_data["obs_nodes_info"][ + action["element_id"] + ]["text"] + else: + node_content = "No match found" + + action_str = f"
{action['raw_prediction']}
" + action_str += f"
{repr(action)}
" + action_str += f"
{action2str(action, action_set_tag, node_content)}
" + + case "id_html_nasc_tree": + text_meta_data = observation_metadata["text"] + if action["element_id"] in text_meta_data["obs_nodes_info"]: + node_content = text_meta_data["obs_nodes_info"][ + action["element_id"] + ]["text"] + else: + node_content = "No match found" + + action_str = f"
{action['raw_prediction']}
" + action_str += f"
{repr(action)}
" + action_str += f"
{action2str(action, action_set_tag, node_content)}
" + + case "id_accessibility_tree": + text_meta_data = observation_metadata["text"] + if action["element_id"] in text_meta_data["obs_nodes_info"]: + node_content = text_meta_data["obs_nodes_info"][ + action["element_id"] + ]["text"] + else: + node_content = "No match found" + + action_str = f"
{action['raw_prediction']}
" + action_str += f"
{repr(action)}
" + action_str += f"
{action2str(action, action_set_tag, node_content)}
" + + case "playwright": + action_str = action["pw_code"] + case _: + raise ValueError(f"Unknown action type {action['action_type']}") + return action_str + + +def get_action_description( + action: Action, + observation_metadata: dict[str, ObservationMetadata], + action_set_tag: str, + prompt_constructor: PromptConstructor | None, +) -> str: + """Generate the text version of the predicted actions to store in action history for prompt use. + May contain hint information to recover from the failures""" + + match action_set_tag: + case "id_html_tree": + # old_op_prompt = "Website: %s; Thinking process: %s; Html segment: %s; Operation: %s; Result: %s" + op_prompt = "Html segment: %s; Operation: %s;" + text_meta_data = observation_metadata["text"] + node_info = text_meta_data["obs_nodes_info"] + result = 'Operation Success' + + if action["action_type"] in [ + ActionTypes.CLICK, + ActionTypes.HOVER, + ActionTypes.TYPE, + ]: + action_name = str(action["action_type"]).split(".")[1].lower() + if action["element_id"] in node_info: + node_content = node_info[action["element_id"]]["text"] + node_content = " ".join(node_content.split()[1:]) + action["label"] = node_info[action["element_id"]]["label"] + action_str = action2str( + action, action_set_tag, node_content + ) + else: + action_str = "None" + result = f"Cannot find the corresponding tag. Attempt to perfom \"{action_name}\" on element \"[{action['element_id']}]\" but no matching element found. Please check the observation more carefully." + else: + if ( + action["action_type"] == ActionTypes.NONE + and prompt_constructor is not None + ): + text = action["answer"] + if text is not None and text.count("#Record#") > 0: + action_str = text + else: + action_str = "None" + result = f'Operation invalid. The format was incorrect. Ensure that the action is wrapped inside a pair of # and seperate arguments within spaces as follows: #action# arg1 arg2 ....' 
+ else: + action_str = action2str(action, action_set_tag, "") + + # action_str = op_prompt % ( + # prompt_constructor.state["url"], + # prompt_constructor.state["intention"], + # prompt_constructor.state["segment"], + # action_str, + # result, + # ) + + action_str = op_prompt % ( + prompt_constructor.state["segment"], + action_str, + ) + case "id_html_nasc_tree": + op_prompt = "%s #HTML Segment: %s" + text_meta_data = observation_metadata["text"] + node_info = text_meta_data["obs_nodes_info"] + result = 'Operation Success' + + if action["action_type"] in [ + ActionTypes.CLICK, + ActionTypes.HOVER, + ActionTypes.TYPE, + ]: + action_name = str(action["action_type"]).split(".")[1].lower() + if action["element_id"] in node_info: + node_content = node_info[action["element_id"]]["text"] + node_content = " ".join(node_content.split()[1:]) + action["label"] = node_info[action["element_id"]]["label"] + action_str = action2str( + action, action_set_tag, node_content + ) + else: + action_str = "None" + result = f"Cannot find the corresponding tag. Attempt to perfom \"{action_name}\" on element \"[{action['element_id']}]\" but no matching element found. Please check the observation more carefully." + else: + if ( + action["action_type"] == ActionTypes.NONE + and prompt_constructor is not None + ): + text = action["answer"] + if text is not None and text.count("record") > 0: + action_str = text + else: + action_str = "None" + result = f'Operation invalid. The format was incorrect. Ensure that the action is wrapped inside a pair of # and seperate arguments within spaces as follows: #action# arg1 arg2 ....' 
+ else: + action_str = action2str(action, action_set_tag, "") + + action_str = op_prompt % ( + action_str, + prompt_constructor.state["segment"], + ) + + case "id_accessibility_tree": + text_meta_data = observation_metadata["text"] + if action["action_type"] in [ + ActionTypes.CLICK, + ActionTypes.HOVER, + ActionTypes.TYPE, + ]: + action_name = str(action["action_type"]).split(".")[1].lower() + if action["element_id"] in text_meta_data["obs_nodes_info"]: + node_content = text_meta_data["obs_nodes_info"][ + action["element_id"] + ]["text"] + node_content = " ".join(node_content.split()[1:]) + action_str = action2str( + action, action_set_tag, node_content + ) + else: + action_str = f"Attempt to perfom \"{action_name}\" on element \"[{action['element_id']}]\" but no matching element found. Please check the observation more carefully." + else: + if ( + action["action_type"] == ActionTypes.NONE + and prompt_constructor is not None + ): + action_splitter = prompt_constructor.instruction[ + "meta_data" + ]["action_splitter"] + action_str = f'The previous prediction you issued was "{action["raw_prediction"]}". However, the format was incorrect. Ensure that the action is wrapped inside a pair of {action_splitter} and enclose arguments within [] as follows: {action_splitter}action [arg] ...{action_splitter}.' + else: + action_str = action2str(action, action_set_tag, "") + + case "playwright": + action_str = action["pw_code"] + + case _: + raise ValueError(f"Unknown action type {action['action_type']}") + + return action_str + + +class RenderHelper(object): + """Helper class to render text and image observations and meta data in the trajectory""" + + def __init__( + self, config_file: str, result_dir: str, action_set_tag: str + ) -> None: + with open(config_file, "r") as f: + _config = json.load(f) + _config_str = "" + for k, v in _config.items(): + _config_str += f"{k}: {v}\n" + _config_str = f"
{_config_str}
\n" + task_id = _config["task_id"] + + self.action_set_tag = action_set_tag + + self.render_file = open( + Path(result_dir) / f"render_{task_id}.html", "a+" + ) + self.render_file.truncate(0) + # write init template + self.render_file.write(HTML_TEMPLATE.format(body=f"{_config_str}")) + self.render_file.read() + self.render_file.flush() + + def render( + self, + action: Action, + state_info: StateInfo, + meta_data: dict[str, Any], + render_screenshot: bool = False, + ) -> None: + """Render the trajectory""" + # text observation + observation = state_info["observation"] + text_obs = observation["text"] + info = state_info["info"] + new_content = f"

New Page

\n" + new_content += f"

URL: {state_info['info']['page'].url}

\n" + new_content += f"
{text_obs}
\n" + + if render_screenshot: + # image observation + img_obs = observation["image"] + image = Image.fromarray(img_obs) + byte_io = io.BytesIO() + image.save(byte_io, format="PNG") + byte_io.seek(0) + image_bytes = base64.b64encode(byte_io.read()) + image_str = image_bytes.decode("utf-8") + new_content += f"\n" + + # meta data + new_content += f"
{meta_data['action_history'][-1]}
\n" + + # action + action_str = get_render_action( + action, + info["observation_metadata"], + action_set_tag=self.action_set_tag, + ) + # with yellow background + action_str = f"
{action_str}
" + new_content += f"{action_str}\n" + + # add new content + self.render_file.seek(0) + html = self.render_file.read() + html_body = re.findall(r"(.*?)", html, re.DOTALL)[0] + html_body += new_content + + html = HTML_TEMPLATE.format(body=html_body) + self.render_file.seek(0) + self.render_file.truncate() + self.render_file.write(html) + self.render_file.flush() + + def close(self) -> None: + self.render_file.close() diff --git a/browser_env/html_tools/__init__.py b/browser_env/html_tools/__init__.py new file mode 100644 index 0000000..5e76c66 --- /dev/null +++ b/browser_env/html_tools/__init__.py @@ -0,0 +1,6 @@ +from .identifier import IdentifierTool +from .prompt import HtmlPrompt +from .html_parser import HtmlParser + +from .utils import print_html_object +from .configs import basic_attrs, mind2web_keep_attrs \ No newline at end of file diff --git a/browser_env/html_tools/configs/__init__.py b/browser_env/html_tools/configs/__init__.py new file mode 100644 index 0000000..b900a67 --- /dev/null +++ b/browser_env/html_tools/configs/__init__.py @@ -0,0 +1,3 @@ +from .html_prompt import prompts +from .config import basic_attrs, mind2web_keep_attrs, miniwob_attrs +from .config import config_meta \ No newline at end of file diff --git a/browser_env/html_tools/configs/config.py b/browser_env/html_tools/configs/config.py new file mode 100644 index 0000000..a979364 --- /dev/null +++ b/browser_env/html_tools/configs/config.py @@ -0,0 +1,49 @@ +basic_attrs = [ + 'title', + 'value', + 'placeholder', + 'selected', +] + +mind2web_keep_attrs = [ + 'alt', + 'aria_description', + 'aria_label', + 'aria_role', + 'input_checked', + 'input_value', + 'label', + 'name', + 'option_selected', + 'placeholder', + 'role', + 'text_value', + 'title', + 'type', + 'value', +] + +miniwob_attrs = [ + 'id', + 'type', + 'value', +] + +config_meta = """ +======= Configs ======= +Columns: + - id: {id_attr} + - label: {label_attr} +Position: {use_position} + - window: {window_size} + - rect_dict: 
{rect} +Keep: + - parents: {parent_chain} + - attrs: {keep_attrs} + - elems: {keep_elem} + - obs_elem: {obs_elem} +Generator: + - prompt: {prompt_name} + - label: {identifier_name} +======================== +""" \ No newline at end of file diff --git a/browser_env/html_tools/configs/html_prompt.py b/browser_env/html_tools/configs/html_prompt.py new file mode 100644 index 0000000..904d021 --- /dev/null +++ b/browser_env/html_tools/configs/html_prompt.py @@ -0,0 +1,22 @@ +refine_prompt = { + 'dom': '<{tag}{label}|{attr}{content}{subtree} >', + 'label': '[{label}]', + 'attr': '{attr}', + 'attr_splitter': '; ', + 'subtree_splitter': ' ', +} + +xml_prompt = { + 'dom': '<{tag}{label}{attr}>{content}{subtree} ', + 'label': ' id="{label}"', + 'attr': '{key}="{attr}"', + 'attr_splitter': ' ', + 'subtree_splitter': ' ', +} + +prompts = { + 'refine': refine_prompt, + 'xml': xml_prompt, + 'new_data': refine_prompt, +} + \ No newline at end of file diff --git a/browser_env/html_tools/html_parser.py b/browser_env/html_tools/html_parser.py new file mode 100644 index 0000000..7b173ce --- /dev/null +++ b/browser_env/html_tools/html_parser.py @@ -0,0 +1,426 @@ +from lxml import html +import time, copy, random +import json, re, os + +from .identifier import IdentifierTool +from .prompt import HtmlPrompt +from .configs import config_meta +from .utils import get_xpath_top_down, rect2tuple + +class HtmlParser(): + def __init__(self, ctx: str, args: dict[str]={}) -> None: + stt = time.time() + self.dom_tree = self.ctx2tree(ctx) + # tool related + self.bids2label = {} + self.bids2xpath = {} + self.used_labels = {} + + # parse args + self.parse_args(args) + self.init_time = time.time() - stt + + def parse_args(self, args: dict[str]={}) -> None: + def attr_check(attr, type_model='str'): + if attr is None: + return False + attr_type = type(attr) + if attr_type != type(type_model): + return False + if attr_type == type('str') and len(attr) == 0: + return False + return True + + args = {} if 
args is None else args + + # [Position] use_pos: False -> use full page, otherwise use window_size + dataset = args.get('dataset', '') + use_position = args.get('use_position', False) + window_size = args.get('window_size', None) + rect = args.get('rect_dict', None) + if use_position: + if not attr_check(window_size, ()): + raise ValueError('window_size must be set when use_position is True') + if not attr_check(rect, {}): + raise ValueError('rect_dict must be set when use_position is True') + + if not attr_check(rect, {}): + rect = {} + + # [Label] for vimium is temp_clickable_label, otherwise keep all of it + label_attr = args.get('label_attr', '') + get_new_label = args.get('regenerate_label', False) + label_method = args.get('label_generator', None) + regen_label = not attr_check(label_method) + + # [id] for mind2web is backend_node_id, for normal website use our method + id_attr = args.get('id_attr', '') + regen_id = not attr_check(id_attr) + + if regen_id: + id_attr = 'temp_id' + + # [attributes] + keep_attrs = args.get('attr_list', []) + if not attr_check(keep_attrs, []): + keep_attrs = [] + + # [Tags] for clickable elem, keep: must keep, obs: keep if follow specific rule + parent_chain = args.get('parent_chain', False) + keep_elem = args.get('keep_elem', []) + obs_elem = args.get('obs_elem', []) + + # sanity check + self.set_args(use_position, window_size, rect, label_attr, id_attr, keep_attrs, keep_elem, obs_elem, parent_chain, get_new_label, dataset) + + # [Prompt] + prompt = args.get('prompt', None) + self.prompt = HtmlPrompt(prompt) + + # traverse and get special data + if regen_id or regen_label: + self.mark_id() + + if get_new_label: + self.used_labels = {} + + self.identifier = IdentifierTool(label_method, self.used_labels) + + def set_args(self, use_position: bool=False, window_size: tuple=(), rect_dict: dict[str]={}, label_attr: str='', + id_attr: str='', keep_attrs: list[str]=[], keep_elem: list[str]=[], obs_elem: list[str]=[], + parent_chain: 
bool=False, get_new_label: bool=False, dataset: str='') -> None: + + self.use_position = use_position + self.window_size = window_size + self.rect = rect_dict + self.label_attr = label_attr + self.id_attr = id_attr + self.keep_attrs = keep_attrs + self.keep = keep_elem + self.obs = obs_elem + self.parent_chain = parent_chain + self.get_new_label = get_new_label + self.dataset = dataset + + def get_config(self): + config = { + 'id_attr': self.id_attr, + 'keep_attrs': self.keep_attrs[:5], + 'label_attr': self.label_attr, + 'use_position': self.use_position, + 'window_size': self.window_size, + 'rect': dict(list(self.rect.items())[:3]), + 'keep_elem': self.keep[:5], + 'obs_elem': self.obs[:5], + 'parent_chain': self.parent_chain, + 'prompt_name': self.prompt.name, + 'identifier_name': self.identifier.name + } + + return config, config_meta.format(**config) + + def update_rect_dict(self, rect_dict: dict[str]={}) -> None: + self.rect = rect_dict + + @staticmethod + def ctx2tree(ctx: str) -> html.HtmlElement: + # remove useless tags, eg. 
style and script + ctx = re.sub('', '', ctx) + ctx = re.sub('[\W\w]*?', '', ctx) + ctx = re.sub('[\W\w]*?', '', ctx) + ctx = '' if ctx is None else re.sub(r'\s+', ' ', ctx).strip() + dom_tree = html.fromstring(ctx) + return dom_tree + + @staticmethod + def get_root(tree: html.HtmlElement) -> html.HtmlElement: + node = tree.xpath('//*')[0] + while True: + parent = node.getparent() + if parent is None: + break + node = parent + return node + + def get_node_by_bid(self, tree: html.HtmlElement, bid: str) -> html.HtmlElement: + nodes = tree.xpath(f'//*[@{self.id_attr}="{bid}"]') + if len(nodes) == 0: + return None + return nodes[0] + + def id_label_converter(self, label: str) -> str: + return self.bids2label.get(label, '') + + def id_xpath_converter(self, label: str) -> str: + return self.bids2xpath.get(label, '') + + def mark_id(self) -> None: + root = self.get_root(self.dom_tree) + _, i2xpath, used_labels = get_xpath_top_down(root, self.id_attr, self.label_attr) + self.used_labels = used_labels + self.bids2xpath = i2xpath + + def parse(self, root: html.HtmlElement, keep: list[str], obs: list[str], parent_chain: bool=False, get_new_label: bool=False) -> dict[str]: + def get_text(str: str) -> str: + return '' if str is None else str.strip()[:500] + + def check_attr(attr: str, node: html.HtmlElement) -> bool: + tag = node.tag + if ( + ( attr == 'role' and node.attrib.get(attr, '') in ['presentation', 'none', 'link'] ) + or ( attr == 'type' and node.attrib.get(attr, '') == 'hidden' ) + # or ( attr == 'value' and tag in ['option'] ) + ): + return False + return True + + def is_visible(node: html.HtmlElement, bid: str) -> bool: + if self.dataset == 'mind2web': + bound = node.attrib.get('bounding_box_rect', None) + self.rect[bid] = rect2tuple(bound) + + if not self.use_position: + return True + + rect = self.rect.get(bid, None) + if rect is None: + return False + + if self.window_size is None: + return True + + # get window size + wx, wy, ww, wh = self.window_size + x, y, w, 
h = rect + if x + w < wx or x > wx + ww or y + h < wy or y > wy + wh: + return False + + return True + + def _dfs(node: html.HtmlElement, keep: list[str]=[], obs: list[str]=[], + parent_chain: bool=False, get_new_label: bool=False, par_keep: bool=False) -> (str, dict[str]): + # basic information + bid = node.attrib.get(self.id_attr, '') + tag = node.tag + label = node.attrib.get(self.label_attr, '') + + # element which is keeped equivalent to visible + visible = is_visible(node, bid) + in_keep_list = bid in keep + in_obs_list = (bid in obs or len(label) > 0) and visible + keep_element = in_keep_list or in_obs_list or visible or par_keep + + # mark label + bids2label, labeled_elems = {}, [] + have_label = False + if in_keep_list or in_obs_list: + if label is None or len(label) == 0 or get_new_label: + label = self.identifier.generate() + node.attrib[self.label_attr] = label + bids2label[bid] = label + bids2label[label] = bid + have_label = True + + # get text or alt_text of current element + text = get_text(node.text) + + classes = {} + # keep attributes if needed + keep_all_attrs = len(self.keep_attrs) == 0 + keep_attrs = node.attrib.keys() if keep_all_attrs else self.keep_attrs + + # traverse attributes + for attr in keep_attrs: + if attr not in node.attrib or not check_attr(attr, node): + continue + if attr in [self.id_attr, self.label_attr]: + continue + val = get_text(node.attrib[attr]) + if len(val) > 0 or keep_all_attrs: + classes[attr] = val + + have_text = len(text) > 0 or len(classes) > 0 + + parts = [] + clickable_count = 0 + children = node.getchildren() + for child in children: + cres, cmsg = _dfs(child, keep, obs, parent_chain, get_new_label) + clickable_count += 1 if cmsg.get('have_clickable', False) else 0 + bids2label.update(cmsg.get('bids2label', {})) + labeled_elems.extend(cmsg.get('label_element', [])) + if len(cres) != 0: + parts.append(cres) + + dom = self.prompt.subtree_constructor(parts) + + # remove if all children are text + 
keep_as_all_text = (dom.count('<') == dom.count(' 0 + if keep_as_all_text: + matches = re.findall(r']+) >', dom) + dom = self.prompt.subtree_constructor(matches) + + keep_element = keep_element and (clickable_count > 1 or have_text or have_label or keep_as_all_text) + keep_as_parent = len(dom) > 0 and parent_chain + if in_keep_list or keep_element or keep_as_parent: + dom = self.prompt.prompt_constructor(tag, label, text, dom, classes) + + if have_label: + labeled_elems.append(bid) + + control_msg = { + 'have_clickable': bool(clickable_count or have_text), + 'bids2label': bids2label, + 'label_element': labeled_elems, + } + + return dom, control_msg + + dom, cmsg = _dfs(root, keep, obs, parent_chain, get_new_label) + return dom, cmsg + + def parse_tree(self) -> dict[str]: + # start from here + stt = time.time() + root = self.get_root(self.dom_tree) + dom, cmsg = self.parse(root, self.keep, self.obs, self.parent_chain, self.get_new_label) + self.bids2label = cmsg.get('bids2label', {}) + self.keep = list(set(self.keep + cmsg.get('label_element', []))) + + obj = { + 'html': dom, + 'parse_time': time.time() - stt + } + + return obj + + # From mind2web, https://github.com/OSU-NLP-Group/Mind2Web/blob/main/src/data_utils/dom_utils.py + def get_keep_elements(self, tree: html.HtmlElement, keep: list[str], max_depth: int, max_children: int, + max_sibling: int, dfs_count: int=1, keep_parent: bool=False) -> list[str]: + def get_anscendants(node: html.HtmlElement, max_depth: int, current_depth: int=0) -> list[str]: + if current_depth > max_depth: + return [] + + anscendants = [] + parent = node.getparent() + if parent is not None: + anscendants.append(parent) + anscendants.extend(get_anscendants(parent, max_depth, current_depth + 1)) + + return anscendants + + def get_descendants(node: html.HtmlElement, max_depth: int, current_depth: int=0) -> list[str]: + if current_depth > max_depth: + return [] + + descendants = [] + for child in node: + descendants.append(child) + 
descendants.extend(get_descendants(child, max_depth, current_depth + 1)) + + return descendants + + to_keep = set(copy.deepcopy(keep)) + nodes_to_keep = set() + + for _ in range(max(1, dfs_count)): + for bid in to_keep: + candidate_node = self.get_node_by_bid(tree, bid) + if candidate_node is None: + continue + + nodes_to_keep.add(candidate_node.attrib[self.id_attr]) + # get all ancestors or with max depth + nodes_to_keep.update([x.attrib.get(self.id_attr, '') for x in get_anscendants(candidate_node, max_depth)]) + + # get descendants with max depth + nodes_to_keep.update([x.attrib.get(self.id_attr, '') for x in get_descendants(candidate_node, max_depth)][:max_children]) + # get siblings within range + parent = candidate_node.getparent() + if parent is None: + continue + + siblings = [x for x in parent.getchildren() if x.tag != 'text'] + if candidate_node not in siblings: + continue + + idx_in_sibling = siblings.index(candidate_node) + nodes_to_keep.update([x.attrib.get(self.id_attr, '') + for x in siblings[max(0, idx_in_sibling - max_sibling) : idx_in_sibling + max_sibling + 1]]) + + max_children = int(max_children * 0.5) + max_depth = int(max_depth * 0.5) + max_sibling = int(max_sibling * 0.7) + + to_keep = copy.deepcopy(nodes_to_keep) + + if keep_parent: + for bid in keep: + candidate_node = self.get_node_by_bid(tree, bid) + if candidate_node is None: + continue + nodes_to_keep.update([x.attrib.get(self.id_attr, '') for x in candidate_node.xpath("ancestor::*")]) + + return list(nodes_to_keep) + + def prune(self, tree: html.HtmlElement, nodes_to_keep: list[str]) -> html.HtmlElement: + # remove nodes not in nodes_to_keep + for node in tree.xpath('//*')[::-1]: + if node.tag != 'text': + is_keep = node.attrib.get(self.id_attr, '') in nodes_to_keep + is_candidate = node.attrib.get(self.id_attr, '') in self.keep + else: + is_keep = (node.getparent().attrib.get(self.id_attr, '') in nodes_to_keep) + is_candidate = (node.getparent().attrib.get(self.id_attr, '') in 
self.keep) + + if not is_keep and node.getparent() is not None: + # insert all children into parent + for child in node.getchildren(): + node.addprevious(child) + node.getparent().remove(node) + else: + # if not is_candidate or node.tag == 'text': + # node.attrib.pop(self.id_attr, None) + if ( + len(node.attrib) == 0 + and not any([x.tag == 'text' for x in node.getchildren()]) + and node.getparent() is not None + and node.tag != "text" + and len(node.getchildren()) <= 1 + ): + # insert all children into parent + for child in node.getchildren(): + node.addprevious(child) + node.getparent().remove(node) + + return tree + + def prune_tree(self, dfs_count: int=1, max_depth: int=3, max_children: int=30, + max_sibling: int=3, keep_parent: bool=False) -> None: + # clone the tree + new_tree = copy.deepcopy(self.dom_tree) + nodes_to_keep = self.get_keep_elements(new_tree, self.keep, max_depth, max_children, max_sibling, dfs_count, keep_parent) + new_tree = self.prune(new_tree, nodes_to_keep) + + self.dom_tree = new_tree + + def get_segment(self, bid: str) -> str: + # clone the tree + new_tree = copy.deepcopy(self.dom_tree) + nodes_to_keep = self.get_keep_elements(new_tree, [bid], 0, 2, 1) + new_tree = self.prune(new_tree, nodes_to_keep) + dom, _ = self.parse(new_tree, self.keep, [], False) + return dom + + def get_rect_data(self, bids: list[str]) -> list[dict[str]]: + res = [] + for bid in bids: + label = self.bids2label.get(bid, '') + rect = self.rect.get(bid, None) + res.append({ + 'bid': bid, + 'label': label, + 'rect': rect + }) + return res + \ No newline at end of file diff --git a/browser_env/html_tools/identifier.py b/browser_env/html_tools/identifier.py new file mode 100644 index 0000000..793ebed --- /dev/null +++ b/browser_env/html_tools/identifier.py @@ -0,0 +1,64 @@ +import secrets + +class IdentifierTool: + def __init__(self, method: str='order', existing_labels: dict[str]={}) -> None: + self.methods = { + 'order': self.get_identifier_in_order, + 'random': 
self.get_random_identifier, + } + + if method is None: + method = 'order' + + self.func = self.methods.get(method, None) + self.name = method + if self.func is None: + raise ValueError(f'Invalid method for identifier: {method}') + + self.reset(existing_labels) + + def reset(self, exists: dict[str]={}) -> None: + self.identifier = -1 + self.exists = {} if exists is None else exists + + def get_identifier_in_order(self) -> str: + def id2str(id: int) -> str: + if id < 26: + return chr(id + 65) + id -= 26 + c0 = id // 676 + c1 = (id // 26) % 26 + c2 = id % 26 + label = f'{chr(c1 + 65)}{chr(c2 + 65)}' + return label if c0 == 0 else f'{chr(c0 + 64)}{label}' + + self.identifier += 1 + label = id2str(self.identifier) + + while label in self.exists: + self.identifier += 1 + label = id2str(self.identifier) + + self.exists[label] = True + return label + + def get_random_identifier(self) -> str: + secret_generator = secrets.SystemRandom() + + def get_random_label(n: int=2) -> str: + tmp = '' + for _ in range(n): + tmp += chr(secret_generator.randint(65, 90)) + return tmp + + wc = 3 if len(self.exists) > 280 else 2 + + label = get_random_label(wc) + while label in self.exists: + label = get_random_label(wc) + + self.exists[label] = True + return label + + def generate(self): + return self.func() \ No newline at end of file diff --git a/browser_env/html_tools/prompt.py b/browser_env/html_tools/prompt.py new file mode 100644 index 0000000..38d6b94 --- /dev/null +++ b/browser_env/html_tools/prompt.py @@ -0,0 +1,97 @@ +from .configs import prompts + +class HtmlPrompt: + def __init__(self, prompt: str='') -> None: + prompt = self.extract(prompt, 'xml') + if prompt not in prompts: + raise Exception('Unknown prompt: ' + prompt) + + constructors = { + 'refine': self.normal_prompt_constructor, + 'xml': self.normal_prompt_constructor, + 'new_data': self.new_data_prompt_constructor, + } + + self.name = prompt + self.prompt = prompts[prompt] + self.constructor = constructors[prompt] + + 
def new_data_prompt_constructor(self, tag: str='', label: str='', content: str='', subtree_str: str='', class_dict: dict[str]={}) -> str:
    """Render one element with the 'new_data' layout.

    Attributes whose value is the empty string are not rendered as
    ``key=value`` pairs; their keys are collected and emitted together as a
    single ``message`` attribute. Among the remaining attributes only the
    first one carrying a given value is kept. ``None`` arguments are replaced
    by empty defaults through ``self.extract``.
    """
    tag = self.extract(tag)
    label = self.extract(label)
    content = self.extract(content)
    subtree_str = self.extract(subtree_str, '')
    class_dict = self.extract(class_dict, {})

    attr_template = self.prompt['attr']
    attr_splitter = self.prompt['attr_splitter']

    label_str = self.prompt['label'].format(label=label) if len(label) > 0 else ''

    rendered_attrs = []
    seen_values = set()
    valueless_keys = []
    for attr_name, attr_value in class_dict.items():
        if attr_value == '':
            valueless_keys.append(attr_name)
        elif attr_value not in seen_values:
            seen_values.add(attr_value)
            rendered_attrs.append(attr_template.format(key=attr_name, attr=attr_value))

    if valueless_keys:
        rendered_attrs.append(attr_template.format(key='message', attr=' '.join(valueless_keys)))

    joined_attrs = attr_splitter.join(rendered_attrs)

    # content is separated from the attributes by the attribute splitter,
    # or by a plain space when there are no attributes at all
    if len(joined_attrs) == 0:
        attr_part = ''
        content_lead = ' '
    else:
        attr_part = ' ' + joined_attrs
        content_lead = attr_splitter

    content_part = content_lead + content if len(content) > 0 else ''
    subtree_part = ' ' + subtree_str if len(subtree_str) > 0 else ''

    return self.prompt['dom'].format(tag=tag, label=label_str, attr=attr_part, content=content_part, subtree=subtree_part)
def rect2tuple(rect: str) -> 'tuple[float, float, float, float] | None':
    """Parse a ``"x,y,w,h"``-style string into a 4-tuple of floats.

    Args:
        rect: comma-separated string with exactly four numeric fields.
            Surrounding whitespace is ignored.

    Returns:
        The four values as floats, or ``None`` when ``rect`` is not a string,
        does not have exactly four fields, or a field is not a number.
    """
    # isinstance also accepts str subclasses, which the previous exact
    # type comparison rejected
    if not isinstance(rect, str):
        return None

    fields = rect.strip().split(',')
    if len(fields) != 4:
        return None

    try:
        return tuple(float(field) for field in fields)
    except ValueError:
        # malformed number: report "no rect" like every other bad input
        return None
class TreeNode:
    """Node of the structured accessibility tree.

    Each node stores its id, role, name, depth and an optional ``properties``
    mapping, plus links to its parent and children and a ``visible`` flag
    that other code toggles to hide nodes from the rendered observation.
    """

    def __init__(self, node_id, role, name, depth, **kwargs):
        self.visible = True
        self.node_id = node_id
        self.role = role
        self.name = name
        self.depth = depth
        # only the "properties" keyword is consumed; absent means None
        self.properties = kwargs.get("properties")

        self.children = []
        self.parent = None

    def add_child(self, child):
        """Append ``child`` and make this node its parent."""
        child.parent = self
        self.children.append(child)

    def copy(self):
        """Return a detached deep copy of this node (no parent, no children).

        Only the node's own attributes are copied. Deep-copying ``self``
        directly would follow ``parent``/``children`` and clone the entire
        connected tree just to throw it away.
        """
        from copy import deepcopy
        new_self = type(self).__new__(type(self))
        for attr, value in self.__dict__.items():
            if attr not in ("children", "parent"):
                setattr(new_self, attr, deepcopy(value))
        new_self.children = []
        new_self.parent = None
        return new_self

    def get_visible_node_number(self):
        """Count the visible nodes in this subtree.

        Children of invisible nodes are still visited.
        """
        own = 1 if self.visible else 0
        return own + sum(child.get_visible_node_number() for child in self.children)

    def delete_tree(self):
        """Recursively unlink the whole subtree rooted at this node."""
        for child in self.children:
            child.delete_tree()
        self.children.clear()
        self.parent = None

    def has_properties(self):
        """Return the properties mapping (falsy when absent or empty)."""
        return getattr(self, "properties", {})

    def visible_children(self):
        return [c for c in self.children if c.visible]

    def visible_siblings(self):
        if not self.parent:
            return []
        return [n for n in self.parent.children if n.visible and n.node_id != self.node_id]

    def siblings(self):
        if not self.parent:
            return []
        return [n for n in self.parent.children if n.node_id != self.node_id]

    def search_node_by_id(self, target_id):
        """Depth-first search for a node by id.

        A node matches when its ``node_id`` equals ``target_id`` or its name
        contains the literal ``[target_id]``. Returns ``None`` if not found.
        """
        if self.node_id == target_id or (self.name and f"[{target_id}]" in self.name):
            return self
        for child in self.children:
            result = child.search_node_by_id(target_id)
            if result:
                return result
        return None

    def all_children_invisible(self):
        """True when the node has no children or none of them is visible."""
        return not any(child.visible for child in self.children)

    def has_the_same_properties_as(self, another_node):
        """Compare properties, treating missing/empty properties as equal."""
        node_a_has_properties = getattr(self, "properties", "")
        node_b_has_properties = getattr(another_node, "properties", "")
        if not node_a_has_properties and not node_b_has_properties:
            return True
        if not node_a_has_properties or not node_b_has_properties:
            return False
        return self.properties == another_node.properties

    def is_identical_to(self, another_node):
        """True when ``another_node`` is a leaf with equal role, name and properties."""
        if another_node.children:
            return False
        return self.role == another_node.role and self.name == another_node.name and self.has_the_same_properties_as(another_node=another_node)

    def last_sibling(self, visible_required=False):
        """Return the sibling preceding this node, or ``None``.

        With ``visible_required`` the nearest *visible* preceding sibling is
        returned. The search walks backwards from the node's own position;
        the previous slice ``[:idx:-1]`` walked the *following* siblings
        instead.
        """
        if not self.parent:
            return None
        siblings = self.parent.children
        own_idx = siblings.index(self)
        if own_idx == 0:
            return None
        if not visible_required:
            return siblings[own_idx - 1]
        for sibling in reversed(siblings[:own_idx]):
            if sibling.visible:
                return sibling
        return None

    def next_sibling(self, visible_required=False):
        """Return the sibling following this node, or ``None``.

        With ``visible_required`` the nearest *visible* following sibling is
        returned.
        """
        if not self.parent:
            return None
        siblings = self.parent.children
        next_sibling_idx = siblings.index(self) + 1
        if next_sibling_idx >= len(siblings):
            return None
        if not visible_required:
            return siblings[next_sibling_idx]
        for sibling in siblings[next_sibling_idx:]:
            if sibling.visible:
                return sibling
        return None

    def has_identical_siblings(self):
        """True when a sibling without visible children shares role and name.

        Only evaluated for nodes that themselves have no visible children.
        """
        if not (self.parent and self.all_children_invisible()):
            return False
        return any(
            sibling.role == self.role and sibling.name == self.name
            for sibling in self.parent.children
            if (sibling.node_id != self.node_id and sibling.all_children_invisible())
        )

    def has_identical_surrounding_siblings(self):
        """True when an adjacent sibling (any or nearest visible) is identical."""
        neighbours = (
            self.last_sibling(visible_required=False),
            self.last_sibling(visible_required=True),
            self.next_sibling(visible_required=False),
            self.next_sibling(visible_required=True),
        )
        return any(neighbour and self.is_identical_to(neighbour) for neighbour in neighbours)

    def is_differentiable(self, strict=False):
        """Tell whether this node can be told apart from its siblings.

        Children of a ``row`` are always differentiable. Otherwise the node
        must not have an identical adjacent sibling and, unless ``strict``,
        must not have any identical sibling at all.
        """
        if self.parent and self.parent.role == "row":
            return True
        if not strict and self.has_identical_siblings():
            return False
        if self.has_identical_surrounding_siblings():
            return False
        return True
@staticmethod
def get_element_in_viewport_ratio(
    elem_left_bound: float,
    elem_top_bound: float,
    width: float,
    height: float,
    config: BrowserConfig,
) -> float:
    """Return the fraction of an element's area that lies in the viewport.

    The viewport is the rectangle ``(0, 0) .. (win_width, win_height)`` taken
    from ``config``; element coordinates are expected relative to it.

    Args:
        elem_left_bound: x coordinate of the element's left edge.
        elem_top_bound: y coordinate of the element's top edge.
        width: element width.
        height: element height.
        config: mapping providing ``"win_width"`` and ``"win_height"``.

    Returns:
        ``overlap_area / element_area``. A zero-area element yields ``1.0``
        instead of dividing by zero, which matches what the nested helper in
        ``element_is_visible`` returns for the same case.
    """
    elem_right_bound = elem_left_bound + width
    elem_lower_bound = elem_top_bound + height

    win_left_bound = 0
    win_right_bound = config["win_width"]
    win_top_bound = 0
    win_lower_bound = config["win_height"]

    # Compute the overlap in x and y axes
    overlap_width = max(
        0,
        min(elem_right_bound, win_right_bound)
        - max(elem_left_bound, win_left_bound),
    )
    overlap_height = max(
        0,
        min(elem_lower_bound, win_lower_bound)
        - max(elem_top_bound, win_top_bound),
    )

    element_area = width * height
    if element_area == 0:
        return 1.0

    # The divisor must be parenthesised: `a * b / width * height` evaluates
    # as `(a * b / width) * height`, which is not a ratio at all.
    return (overlap_width * overlap_height) / element_area
self.obs_nodes_info[element_id]["backend_id"] + ) + + x = response["result"]["value"]["x"] + y = response["result"]["value"]["y"] + width = response["result"]["value"]["width"] + height = response["result"]["value"]["height"] + + + in_viewport_ratio = _get_element_in_viewport_ratio( + elem_left_bound=float(x), + elem_top_bound=float(y), + width=float(width), + height=float(height), + config=browser_info["config"], + ) + + if in_viewport_ratio < IN_VIEWPORT_RATIO_THRESHOLD: + return False + + return True + + def fetch_page_html( + self, + info: BrowserInfo, + page: Page, + client: CDPSession, + current_viewport_only: bool, + ) -> DOMTree: + # adopted from [natbot](https://github.com/nat/natbot) + tree = info["DOMTree"] + config = info["config"] + strings = tree["strings"] + document = tree["documents"][0] + nodes = document["nodes"] + layout = document["layout"] + + import time + stt = time.time() + # make a dom tree that is easier to navigate + dom_tree: DOMTree = [] + graph = defaultdict(list) + print(nodes.keys()) + for node_idx in range(len(nodes["nodeName"])): + cur_node: DOMNode = { + "nodeId": "", + "nodeType": "", + "nodeName": "", + "nodeValue": "", + "attributes": "", + "backendNodeId": "", + "parentId": "", + "childIds": [], + "cursor": 0, + "union_bound": None, + } + + node_type_idx = nodes["nodeType"][node_idx] + node_type = "generic" + if node_type_idx >= 0 and node_type_idx < len(strings): + node_type = strings[node_type_idx] + + node_name = strings[nodes["nodeName"][node_idx]] + + node_value_idx = nodes["nodeValue"][node_idx] + node_value = "" + if node_value_idx >= 0 and node_value_idx < len(strings): + node_value = " ".join(strings[node_value_idx].split()) + + node_attributes = [ + strings[i] for i in nodes["attributes"][node_idx] + ] + node_attributes_str = "" + for i in range(0, len(node_attributes), 2): + a = node_attributes[i] + b = node_attributes[i + 1] + # b = " ".join(b.split()) + import re + b = re.sub(r"{\s*opacity:\s*.*;*\s*}", " ", b) + 
b = [b_item for b_item in b.split() if b_item.count('vimium') == 0] + b = " ".join(b) + node_attributes_str += f'{a}="{b}" ' + + node_attributes_str = node_attributes_str.strip() + + cur_node["nodeId"] = str(node_idx) + cur_node["nodeType"] = node_type + cur_node["nodeName"] = node_name + cur_node["nodeValue"] = node_value + cur_node["attributes"] = node_attributes_str + cur_node["backendNodeId"] = str(nodes["backendNodeId"][node_idx]) + cur_node["parentId"] = str(nodes["parentIndex"][node_idx]) + + if cur_node["parentId"] != "-1": + graph[cur_node["parentId"]].append(str(cur_node["nodeId"])) + + # get the bound + if cur_node["parentId"] == "-1": + cur_node["union_bound"] = [0.0, 0.0, 10.0, 10.0] + else: + # method 1 + # response = self.get_bounding_client_rect( + # client, cur_node["backendNodeId"] + # ) + + # if response.get("result", {}).get("subtype", "") == "error": + # cur_node["union_bound"] = None + # else: + # x = response["result"]["value"]["x"] + # y = response["result"]["value"]["y"] + # width = response["result"]["value"]["width"] + # height = response["result"]["value"]["height"] + # cur_node["union_bound"] = [x, y, width, height] + + # method 2 + bound = [0.0, 0.0, 0.0, 0.0] + if node_idx in layout["nodeIndex"]: + bound = layout["bounds"][layout["nodeIndex"].index(node_idx)] + bound[0] -= config["win_left_bound"] + bound[1] -= config["win_top_bound"] + + cur_node["union_bound"] = bound + + dom_tree.append(cur_node) + print('[build]', time.time() - stt) + + stt = time.time() + # add parent children index to the node + for parent_id, child_ids in graph.items(): + dom_tree[int(parent_id)]["childIds"] = child_ids + print('[graph]', time.time() - stt) + + # with open('output/dom_tree.json', 'w') as f: + # f.write(json.dumps(dom_tree, ensure_ascii=False)) + + stt = time.time() + # remove the nodes that are not in the current viewport + if current_viewport_only: + + def remove_node_in_graph(node: DOMNode) -> None: + # update the node information in the 
accessibility tree + node_id = node["nodeId"] + parent_id = node["parentId"] + child_ids = node["childIds"] + + # update the children of the parent node + assert dom_tree[int(parent_id)]["parentId"] != "[REMOVED]" + # remove the nodeid from parent + index = dom_tree[int(parent_id)]["childIds"].index(node_id) + dom_tree[int(parent_id)]["childIds"].pop(index) + + # Insert children_nodeids in the same location + for child_id in child_ids: + dom_tree[int(parent_id)]["childIds"].insert( + index, child_id + ) + index += 1 + + # update children node's parent + for child_id in child_ids: + dom_tree[int(child_id)]["parentId"] = parent_id + # mark as removed + dom_tree[int(node_id)]["parentId"] = "[REMOVED]" + + config = info["config"] + for cursor, node in enumerate(dom_tree): + if not node["union_bound"]: + remove_node_in_graph(node) + continue + + [x, y, width, height] = node["union_bound"] + + # invisible node + if width == 0.0 or height == 0.0: + parent_id = node["parentId"] + if node["nodeName"] not in ['OPTION'] or dom_tree[int(parent_id)]["nodeName"] not in ["SELECT"]: + remove_node_in_graph(node) + continue + + in_viewport_ratio = self.get_element_in_viewport_ratio( + elem_left_bound=float(x), + elem_top_bound=float(y), + width=float(width), + height=float(height), + config=config, + ) + + if in_viewport_ratio < IN_VIEWPORT_RATIO_THRESHOLD: + remove_node_in_graph(node) + + dom_tree = [ + node + for node in dom_tree + if node.get("parentId", "-1") != "[REMOVED]" + ] + + print('[filter]', time.time() - stt) + return dom_tree + + @staticmethod + def parse_my_html(dom_tree: DOMTree) -> tuple[str, str, dict[str, Any], Any]: + """Parse the html tree into a string text""" + + obs_nodes_info = {} + nodeid_to_cursor = { + node["nodeId"]: idx for idx, node in enumerate(dom_tree) + } + + def dfs(node_cursor: int, depth: int) -> tuple[str, list[str]]: + tree_str, labeled_elems = '', [] + node = dom_tree[node_cursor] + valid_node = True + pure_text = False + try: + if 
@staticmethod
def parse_html(dom_tree: DOMTree) -> tuple[str, dict[str, Any]]:
    """Render the DOM tree as tab-indented text, one line per kept node.

    A node is kept when it has attributes or a node value. Skipped nodes
    contribute no line and no extra indentation level, so their children are
    rendered at the skipped node's depth. Returns the text and a mapping from
    each kept node's cursor (as a string) to its backend id, bounds and
    rendered text.
    """
    obs_nodes_info = {}
    cursor_of = {entry["nodeId"]: position for position, entry in enumerate(dom_tree)}
    rendered_lines = []

    def walk(cursor: int, level: int) -> None:
        current = dom_tree[cursor]
        keep = True
        try:
            pieces = [f"[{cursor}] <{current['nodeName']}"]
            if current["attributes"]:
                pieces.append(f" {current['attributes']}")
            pieces.append(f"> {current['nodeValue']}")
            text = "".join(pieces)
            keep = bool(current["attributes"] or current["nodeValue"])

            if keep:
                obs_nodes_info[str(cursor)] = {
                    "backend_id": current["backendNodeId"],
                    "union_bound": current["union_bound"],
                    "text": text,
                }
                rendered_lines.append("\t" * level + f"{text}\n")

        except Exception as e:
            # a node that cannot be rendered is treated like a skipped one
            keep = False

        next_level = level + 1 if keep else level
        for child_id in current["childIds"]:
            walk(cursor_of[child_id], next_level)

    # lines are appended in pre-order, i.e. in the order the text is read
    walk(0, 0)
    html = "".join(rendered_lines)
    return html, obs_nodes_info
None + else: + x = response["result"]["value"]["x"] + y = response["result"]["value"]["y"] + width = response["result"]["value"]["width"] + height = response["result"]["value"]["height"] + node["union_bound"] = [x, y, width, height] + + # filter nodes that are not in the current viewport + if current_viewport_only: + + def remove_node_in_graph(node: AccessibilityTreeNode) -> None: + # update the node information in the accessibility tree + nodeid = node["nodeId"] + node_cursor = nodeid_to_cursor[nodeid] + parent_nodeid = node["parentId"] + children_nodeids = node["childIds"] + parent_cursor = nodeid_to_cursor[parent_nodeid] + # update the children of the parent node + assert ( + accessibility_tree[parent_cursor].get("parentId", "Root") + is not None + ) + # remove the nodeid from parent's childIds + index = accessibility_tree[parent_cursor]["childIds"].index( + nodeid + ) + accessibility_tree[parent_cursor]["childIds"].pop(index) + # Insert children_nodeids in the same location + for child_nodeid in children_nodeids: + accessibility_tree[parent_cursor]["childIds"].insert( + index, child_nodeid + ) + index += 1 + # update children node's parent + for child_nodeid in children_nodeids: + child_cursor = nodeid_to_cursor[child_nodeid] + accessibility_tree[child_cursor][ + "parentId" + ] = parent_nodeid + # mark as removed + accessibility_tree[node_cursor]["parentId"] = "[REMOVED]" + + config = info["config"] + for node in accessibility_tree: + if not node["union_bound"]: + remove_node_in_graph(node) + continue + + [x, y, width, height] = node["union_bound"] + + # invisible node + if width == 0 or height == 0: + remove_node_in_graph(node) + continue + + in_viewport_ratio = self.get_element_in_viewport_ratio( + elem_left_bound=float(x), + elem_top_bound=float(y), + width=float(width), + height=float(height), + config=config, + ) + + if in_viewport_ratio < IN_VIEWPORT_RATIO_THRESHOLD: + remove_node_in_graph(node) + + accessibility_tree = [ + node + for node in 
accessibility_tree + if node.get("parentId", "Root") != "[REMOVED]" + ] + + return accessibility_tree + + @staticmethod + def parse_accessibility_tree( + accessibility_tree: AccessibilityTree, + ) -> tuple[str, dict[str, Any], TreeNode]: + """Parse the accessibility tree into a string text""" + node_id_to_idx = {} + for idx, node in enumerate(accessibility_tree): + node_id_to_idx[node["nodeId"]] = idx + + obs_nodes_info = {} + + def dfs(idx: int, obs_node_id: str, depth: int, active_node_dict: dict) -> str: + tree_str = "" + node = accessibility_tree[idx] + indent = "\t" * depth + valid_node = True + try: + role = node["role"]["value"] + name = node["name"]["value"] + node_str = f"[{obs_node_id}] {role} {repr(name)}" + properties = [] + structured_properties = {} + for property in node.get("properties", []): + try: + if property["name"] in IGNORED_ACTREE_PROPERTIES: + continue + properties.append( + f'{property["name"]}: {property["value"]["value"]}' + ) + structured_properties[property["name"]] = property["value"]["value"] + except KeyError: + pass + + if properties: + node_str += " " + " ".join(properties) + + # check valid + if not node_str.strip(): + valid_node = False + + # empty generic node + if not name.strip(): + if not properties: + if role in [ + "generic", + "img", + "list", + "strong", + "paragraph", + "banner", + "navigation", + "Section", + "LabelText", + "Legend", + "listitem", + ]: + valid_node = False + elif role in ["listitem"]: + valid_node = False + + if valid_node: + tree_str += f"{indent}{node_str}" + obs_nodes_info[obs_node_id] = { + "backend_id": node["backendDOMNodeId"], + "union_bound": node["union_bound"], + "text": node_str, + } + + except Exception as e: + valid_node = False + + structured_node = TreeNode(node_id=int(obs_node_id), role=node["role"]["value"], name=node["name"]["value"], depth=depth, properties=structured_properties) if valid_node else None + active_node_dict[depth] = structured_node if valid_node else 
active_node_dict.get(depth, None) + + for _, child_node_id in enumerate(node["childIds"]): + if child_node_id not in node_id_to_idx: + continue + # mark this to save some tokens + child_depth = depth + 1 if valid_node else depth + child_str, child_node = dfs( + node_id_to_idx[child_node_id], child_node_id, child_depth, active_node_dict=active_node_dict + ) + if child_str.strip(): + if tree_str.strip(): + tree_str += "\n" + tree_str += child_str + if child_depth > 0 and child_node: + active_node_dict[child_depth - 1].add_child(child_node) + + return tree_str, structured_node + + tree_str, structured_node = dfs(0, accessibility_tree[0]["nodeId"], 0, active_node_dict={}) + return tree_str, obs_nodes_info, structured_node + + @staticmethod + def clean_accesibility_tree(tree_str: str) -> str: + """further clean accesibility tree""" + clean_lines: list[str] = [] + for line in tree_str.split("\n"): + if "statictext" in line.lower(): + prev_lines = clean_lines[-3:] + pattern = r"\[\d+\] StaticText '([^']+)'" + + match = re.search(pattern, line) + if match: + static_text = match.group(1) + if all( + static_text not in prev_line + for prev_line in prev_lines + ): + clean_lines.append(line) + else: + clean_lines.append(line) + + return "\n".join(clean_lines) + + def process(self, page: Page, client: CDPSession, context: str) -> str: + # get the tab info + open_tabs = page.context.pages + # try: + # tab_titles = [tab.title() for tab in open_tabs] + # current_tab_idx = open_tabs.index(page) + # for idx in range(len(open_tabs)): + # if idx == current_tab_idx: + # tab_titles[ + # idx + # ] = f"Tab {idx} (current): {open_tabs[idx].title()}" + # else: + # tab_titles[idx] = f"Tab {idx}: {open_tabs[idx].title()}" + # tab_title_str = " | ".join(tab_titles) + # except Exception: + # tab_title_str = " | ".join( + # ["Tab {idx}" for idx in range(len(open_tabs))] + # ) + + try: + tab_titles = [tab.title() for tab in open_tabs] + current_tab_idx = open_tabs.index(page) + for idx in 
range(len(open_tabs)): + if idx == current_tab_idx: + tab_titles[ + idx + ] = f"{idx+1}. {open_tabs[idx].title()} <-- current tab" + else: + tab_titles[idx] = f"{idx+1}. {open_tabs[idx].title()}" + tab_title_str = "\n".join(tab_titles) + except Exception: + tab_title_str = "\n".join( + [f"{idx+1}. Default" for idx in range(len(open_tabs))] + ) + + + try: + browser_info = self.fetch_browser_info(page, client) + except Exception: + page.wait_for_load_state("load", timeout=500) + browser_info = self.fetch_browser_info(page, client) + + if self.observation_type == "html": + import time + stt = time.time() + dom_tree = self.fetch_page_html( + browser_info, + page, + client, + current_viewport_only=self.current_viewport_only, + ) + + print('[fetch]', time.time() - stt) + + stt = time.time() + raw_html, content, obs_nodes_info, hp = self.parse_my_html(dom_tree) + print('[parse]', time.time() - stt) + + window_height = page.evaluate("window.innerHeight") + page_height = page.evaluate('document.documentElement.scrollHeight') / window_height + position = page.evaluate("window.scrollY") / window_height + + self.obs_nodes_info = obs_nodes_info + self.meta_data["obs_nodes_info"] = obs_nodes_info + self.meta_data["position_info"] = { + "page_height": page_height, + "position": position, + } + self.meta_data["dom_info"] = { + "raw_html": raw_html, + "dom_tree": dom_tree, + } + self.meta_data["html_parser"] = hp + self.meta_data["tab_title"] = tab_title_str + + elif self.observation_type == "accessibility_tree": + accessibility_tree = self.fetch_page_accessibility_tree( + browser_info, + client, + current_viewport_only=self.current_viewport_only, + ) + content, obs_nodes_info, node_root = self.parse_accessibility_tree( + accessibility_tree + ) + content = self.clean_accesibility_tree(content) + self.obs_nodes_info = obs_nodes_info + page_dialog_message = getattr(page, "dialog_message", "") + if page_dialog_message: + import copy + node_root.properties["page_dialog_message"] = 
class ImageObservationProcessor(ObservationProcessor):
    """Observation processor that returns a page screenshot as a numpy array."""

    def __init__(self, observation_type: str, current_viewport_only: bool):
        """Store the capture settings and start with empty metadata.

        Args:
            observation_type: Stored as-is; not interpreted by this class.
            current_viewport_only: When true only the viewport is captured,
                otherwise the full page is requested.
        """
        self.observation_type = observation_type
        self.current_viewport_only = current_viewport_only
        self.observation_tag = "image"
        self.meta_data = create_empty_metadata()

    def _capture(self, page: Page) -> npt.NDArray[np.uint8]:
        """Take one screenshot and cap its row count at twice ``shape[1]``."""
        screenshot = png_bytes_to_numpy(
            page.screenshot(full_page=(not self.current_viewport_only))
        )
        # Slicing only the first axis keeps very long full-page captures
        # bounded without assuming the array has a channel axis.
        return screenshot[: 2 * screenshot.shape[1]]

    def process(self, page: Page, client: CDPSession, context: str) -> npt.NDArray[np.uint8]:
        """Return a screenshot of ``page``, retrying once after a load event.

        ``client`` and ``context`` are accepted for interface compatibility
        and are not used here.
        """
        try:
            return self._capture(page)
        except Exception:
            # Best effort: wait for the page to finish loading and try once
            # more. The retry goes through the same capture helper so both
            # paths return an identically cropped image.
            page.wait_for_event("load")
            return self._capture(page)
bool, + viewport_size: ViewportSize, + ) -> None: + self.main_observation_type = main_observation_type + self.text_processor = TextObervationProcessor( + text_observation_type, current_viewport_only, viewport_size + ) + self.image_processor = ImageObservationProcessor( + image_observation_type, current_viewport_only + ) + self.viewport_size = viewport_size + + def get_observation_space(self) -> spaces.Dict: + text_space = spaces.Text( + min_length=0, + max_length=UTTERANCE_MAX_LENGTH, + charset=ASCII_CHARSET + FREQ_UNICODE_CHARSET, + ) + + image_space = spaces.Box( + # Each position stores the RGB values. Note the swapped axes (height first). + np.zeros( + (self.viewport_size["height"], self.viewport_size["width"], 3), + dtype=np.uint8, + ), + np.ones( + (self.viewport_size["height"], self.viewport_size["width"], 3), + dtype=np.uint8, + ) + * 255.0, + dtype=np.uint8, + ) + + return spaces.Dict({"text": text_space, "image": image_space}) + + def get_observation( + self, page: Page, client: CDPSession, context: str = '', + ) -> dict[str, Observation]: + text_obs = self.text_processor.process(page, client, context) + image_obs = self.image_processor.process(page, client, context) + return {"text": text_obs, "image": image_obs} + + def get_observation_metadata(self) -> dict[str, ObservationMetadata]: + return { + "text": self.text_processor.meta_data, + "image": self.image_processor.meta_data, + } + + @property + def action_processor(self) -> ObservationProcessor: + """Return the main processor that is associated with the action space""" + if self.main_observation_type == "text": + return self.text_processor + elif self.main_observation_type == "image": + return self.image_processor + else: + raise ValueError("Invalid main observation type") diff --git a/browser_env/py.typed b/browser_env/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/browser_env/scripts/__init__.py b/browser_env/scripts/__init__.py new file mode 100644 index 0000000..719e3e3 --- 
"""JavaScript snippets that are injected into the browser page.

The ``*_script`` names below hold the text of the ``.js`` files that sit
next to this module, plus two small inline snippets.
"""
import os  # no longer used here; kept in case importers rely on the name
from pathlib import Path

rootdir = Path(__file__).parent


def _read_script(name: str) -> str:
    """Return the text of the JavaScript file ``name`` next to this module."""
    # Decode explicitly as UTF-8 so the result does not depend on the
    # locale's default encoding.
    return (rootdir / name).read_text(encoding="utf-8")


# marker, gpt-4v-act style
local_marker_script = _read_script("local_marker.js")
mix_marker_script = _read_script("mix_marker.js")
get_rect_script = _read_script("get_data.js")

# canva handler
canva_handler_script = _read_script("canva_handler.js")

# draw label on page
label_marker_script = _read_script("label_marker.js")

# get text from page
get_text_script = _read_script("get_text.js")

# remove label draw on page
remove_label_mark_script = """
    () => {
        document.querySelectorAll(".our-dom-marker").forEach(item => {
            document.body.removeChild(item);
        });
    }
"""

remove_id_script = """
    () => {
        Array.from(document.getElementsByClassName('possible-clickable-element')).forEach((element) => {
            element.classList.remove('possible-clickable-element');
            element.removeAttribute('data-testid');
        });
    }
"""
/**
 * Collect the innermost elements matching `packet.selector`, clip their first
 * client rect to the viewport and, when `packet.startIndex >= 0`, tag each
 * kept element with a letter id ("A", "B", ..., "Z", "AA", ...) stored in its
 * `data-testid` attribute.
 *
 * Returns `[items, index]`: the kept elements' descriptors and the next
 * unused index.
 */
(packet) => {
    // Spreadsheet-column style numbering: 0 -> "A", 25 -> "Z", 26 -> "AA".
    function int2str(index) {
        var str = "";
        while (index >= 0) {
            str = String.fromCharCode(65 + index % 26) + str;
            index = Math.floor(index / 26) - 1;
        }
        return str;
    };

    // NOTE(review): `selector` and `index` are assigned without a declaration,
    // so they become page globals. Left as-is because other injected scripts
    // may read them -- confirm before scoping them with `let`.
    selector = packet.selector
    index = packet.startIndex
    var items = Array.prototype.slice.call(
        document.querySelectorAll(selector)
    );

    var vw = Math.max(document.documentElement.clientWidth || 0, window.innerWidth || 0);
    var vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0);

    items = items.filter(
        // Keep only elements that contain no other matched element.
        x => !items.some(y => x.contains(y) && !(x == y))
    ).map(element => {
        var bb = element.getClientRects();
        var rect = {
            left: 0,
            top: 0,
            right: 0,
            bottom: 0,
            width: 0,
            height: 0
        };
        var keep = false;
        var text = "", id = -1;
        if (bb.length > 0) {
            bb = bb[0];
            // Clip the first client rect to the viewport.
            rect = {
                left: Math.max(0, bb.left),
                top: Math.max(0, bb.top),
                right: Math.min(vw, bb.right),
                bottom: Math.min(vh, bb.bottom)
            };
            rect = {
                ...rect,
                width: rect.right - rect.left,
                height: rect.bottom - rect.top
            };
            if (rect.width > 0 || rect.height > 0) {
                keep = true;
                if (index >= 0) {
                    id = int2str(index++);
                    element.setAttribute("data-testid", id);
                }
                var childNodes = element.childNodes;

                // Only the element's own text nodes, not its descendants'.
                for (var i = 0; i < childNodes.length; i++) {
                    if (childNodes[i].nodeType == Node.TEXT_NODE) {
                        text += childNodes[i].textContent;
                    }
                }
            }
        }

        return {
            // Report the computed flag; a hard-coded `true` here made the
            // filter below a no-op and let rect-less elements through.
            keep,
            id,
            rects: rect,
            tag: element.tagName.toLowerCase?.() || "",
            text,
        };
    }).filter(x => x.keep);

    return [items, index];
}
/**
 * Draw a dashed double outline and a floating id label over every item whose
 * `id` is not loosely equal to the empty string. `items` is returned unchanged.
 *
 * Each item is expected to carry `id` and `rects` ({left, top, width, height}).
 * Both outline elements get the "our-dom-marker" class.
 */
(items) => {
    const borderColor = "#FFFF00";

    items.filter(
        item => item.id != ""
    ).forEach((item) => {
        const bbox = item.rects;
        // Declared locally: the label must show this item's own id rather than
        // whatever an undeclared global `index` happened to hold.
        const index = item.id;

        const outerElement = document.createElement("div");
        outerElement.classList.add("our-dom-marker");
        outerElement.style.outline = `2px dashed ${borderColor}`;
        outerElement.style.position = "fixed";
        outerElement.style.left = bbox.left - 2 + "px";
        outerElement.style.top = bbox.top - 2 + "px";
        outerElement.style.width = bbox.width + 4 + "px";
        outerElement.style.height = bbox.height + 4 + "px";
        outerElement.style.pointerEvents = "none";
        outerElement.style.boxSizing = "border-box";
        outerElement.style.zIndex = 2147483647;

        const innerElement = document.createElement("div");
        innerElement.classList.add("our-dom-marker");
        innerElement.style.outline = `2px dashed #222288`;
        innerElement.style.position = "fixed";
        innerElement.style.left = bbox.left + "px";
        innerElement.style.top = bbox.top + "px";
        innerElement.style.width = bbox.width + "px";
        innerElement.style.height = bbox.height + "px";
        innerElement.style.pointerEvents = "none";
        innerElement.style.boxSizing = "border-box";
        innerElement.style.zIndex = 2147483647;

        // Add floating label at the corner. It sits 25px above the box unless
        // the box is closer than that to the top of the viewport.
        const label = document.createElement("span");
        let topPosition = 25;
        if (bbox.top < 25) topPosition = bbox.top;
        label.textContent = index;
        label.style.position = "absolute";
        label.style.top = `-${topPosition}px`;
        label.style.left = "0px";
        label.style.background = borderColor;
        label.style.color = "black";
        label.style.padding = "2px 4px";
        label.style.fontSize = "16px";
        label.style.borderRadius = "2px";
        label.style.fontWeight = "bold";
        outerElement.appendChild(label);

        document.body.appendChild(outerElement);
        document.body.appendChild(innerElement);
    })
    return items;
}
/**
 * Tag every innermost, visible, probably-clickable element on the page with
 * the CSS class "possible-clickable-element".
 *
 * An element counts as clickable when any of the heuristics below matches
 * (click handler / pointer cursor, image map, ARIA role, contentEditable,
 * jsaction rule, interactive tag, label with enabled control, zoom cursor,
 * "button" in the class name, non-negative tabindex). It is visible when the
 * summed area of its viewport-clipped client rects whose centre point hits
 * the element (or a descendant) is at least 1.
 */
() => {
    // Constant for the whole scan, so computed once instead of per element.
    const vw = Math.max(document.documentElement.clientWidth || 0, window.innerWidth || 0);
    const vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0);

    const clickableRoles = [
        "button",
        "tab",
        "link",
        "checkbox",
        "menuitem",
        "menuitemcheckbox",
        "menuitemradio",
        "radio",
    ];
    const clickableTags = [
        "input",
        "textarea",
        "select",
        "button",
        "a",
        "iframe",
        "video",
        "object",
        "embed",
        "details"
    ];

    var items = Array.prototype.slice.call(
        document.querySelectorAll('*')
    ).map(function(element) {
        // Keep only rects whose centre is actually occupied by this element
        // (or one of its descendants), then clip them to the viewport.
        var rects = [...element.getClientRects()].filter(bb => {
            var center_x = bb.left + bb.width / 2;
            var center_y = bb.top + bb.height / 2;
            var elAtCenter = document.elementFromPoint(center_x, center_y);

            if (!elAtCenter) return false;
            return elAtCenter === element || element.contains(elAtCenter)
        }).map(bb => {
            const rect = {
                left: Math.max(0, bb.left),
                top: Math.max(0, bb.top),
                right: Math.min(vw, bb.right),
                bottom: Math.min(vh, bb.bottom)
            };
            return {
                ...rect,
                width: rect.right - rect.left,
                height: rect.bottom - rect.top
            }
        });
        var area = rects.reduce((acc, rect) => acc + rect.width * rect.height, 0);

        const tagName = element.tagName.toLowerCase?.() || "";
        let isClickable = ((element.onclick != null) || window.getComputedStyle(element).cursor == "pointer");

        // Insert area elements that provide click functionality to an img.
        if (tagName === "img") {
            let mapName = element.getAttribute("usemap");
            if (mapName) {
                const imgClientRects = element.getClientRects();
                // Escape every double quote, not just the first, before the
                // name is embedded in the attribute selector.
                mapName = mapName.replace(/^#/, "").replace(/"/g, '\\"');
                const map = document.querySelector(`map[name="${mapName}"]`);
                if (map && (imgClientRects.length > 0)) isClickable = true;
            }
        }

        if (!isClickable) {
            const role = element.getAttribute("role");
            if (role != null && clickableRoles.includes(role.toLowerCase())) {
                isClickable = true;
            } else {
                const contentEditable = element.getAttribute("contentEditable");
                if (
                    contentEditable != null &&
                    ["", "contenteditable", "true"].includes(contentEditable.toLowerCase())
                ) {
                    isClickable = true;
                }
            }
        }

        // Check for jsaction event listeners on the element.
        if (!isClickable && element.hasAttribute("jsaction")) {
            const jsactionRules = element.getAttribute("jsaction").split(";");
            for (let jsactionRule of jsactionRules) {
                const ruleSplit = jsactionRule.trim().split(":");
                if ((ruleSplit.length >= 1) && (ruleSplit.length <= 2)) {
                    // A rule without an explicit event type defaults to "click".
                    // The trailing "_" fills `actionName` when the rule has no
                    // "namespace.action" part.
                    const [eventType, namespace, actionName] = ruleSplit.length === 1
                        ? ["click", ...ruleSplit[0].trim().split("."), "_"]
                        : [ruleSplit[0], ...ruleSplit[1].trim().split("."), "_"];
                    if (!isClickable) {
                        isClickable = (eventType === "click") && (namespace !== "none") && (actionName !== "_");
                    }
                }
            }
        }

        if (!isClickable) {
            isClickable = clickableTags.includes(tagName);
        }

        if (!isClickable) {
            if (tagName === "label")
                isClickable = (element.control != null) && !element.control.disabled;
            else if (tagName === "img")
                isClickable = ["zoom-in", "zoom-out"].includes(element.style.cursor);
        }

        // An element with a class name containing the text "button" might be
        // clickable, so it is accepted here as well.
        const className = element.getAttribute("class");
        if (!isClickable && className && className.toLowerCase().includes("button")) {
            isClickable = true;
        }

        // Elements with a non-negative tabindex are treated as clickable too.
        const tabIndexValue = element.getAttribute("tabindex");
        const tabIndex = tabIndexValue ? parseInt(tabIndexValue) : -1;
        if (!isClickable && !(tabIndex < 0) && !isNaN(tabIndex)) {
            isClickable = true;
        }

        return {
            element: element,
            include: isClickable,
            area,
            rects,
            text: element.textContent.trim().replace(/\s{2,}/g, ' ')
        };
    }).filter(item =>
        item.include && (item.area >= 1)
    );

    // Keep only the innermost candidates: drop any element that contains
    // another candidate.
    items = items.filter(x => !items.some(y => x.element.contains(y.element) && !(x == y)))

    items.forEach(item => {
        item.element.classList.add('possible-clickable-element');
    });
}
DOMNode(TypedDict): + nodeId: str + nodeType: str + nodeName: str + nodeValue: str + attributes: str + backendNodeId: str + parentId: str + childIds: list[str] + cursor: int + union_bound: list[float] | None + + +class BrowserConfig(TypedDict): + win_top_bound: float + win_left_bound: float + win_width: float + win_height: float + win_right_bound: float + win_lower_bound: float + device_pixel_ratio: float + + +class BrowserInfo(TypedDict): + DOMTree: dict[str, Any] + config: BrowserConfig + + +AccessibilityTree = list[AccessibilityTreeNode] +DOMTree = list[DOMNode] + + +Observation = str | npt.NDArray[np.uint8] + + +class StateInfo(TypedDict): + observation: dict[str, Observation] + info: Dict[str, Any] diff --git a/config_files/tasks/stanford_cs_head.json b/config_files/tasks/stanford_cs_head.json new file mode 100644 index 0000000..15f1de2 --- /dev/null +++ b/config_files/tasks/stanford_cs_head.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google" + ], + "task_id": "stanford_cs_head", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/", + "geolocation": null, + "intent": "What is the email address of the Dean of the School of Engineering at Stanford University?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "widom@stanford.edu" + } + } + } \ No newline at end of file diff --git a/config_files/tasks/tsinghua_da_head.json b/config_files/tasks/tsinghua_da_head.json new file mode 100644 index 0000000..19ea048 --- /dev/null +++ b/config_files/tasks/tsinghua_da_head.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google" + ], + "task_id": "tsinghua_da_head", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/", + "geolocation": null, + "intent": "帮我查一下清华大学自动化系系主任的邮箱", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "taozhang@tsinghua.edu.cn" + } + } + } \ No 
newline at end of file diff --git a/config_files/test.raw.json b/config_files/test.raw.json new file mode 100644 index 0000000..fe76d23 --- /dev/null +++ b/config_files/test.raw.json @@ -0,0 +1,29623 @@ +[ + { + "sites": [ + "shopping_admin" + ], + "task_id": 0, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What is the top-{{n}} best-selling product in {{year}}", + "instantiation_dict": { + "n": 1, + "year": 2022 + }, + "intent": "What is the top-1 best-selling product in 2022", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Quest Lumaflex\u2122 Band" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Quest Lumaflex\u2122 Band" + }, + "intent_template_id": 279 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 1, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What is the top-{{n}} best-selling brand in {{period}}", + "instantiation_dict": { + "n": 1, + "period": "Quarter 1 2022" + }, + "intent": "What is the top-1 best-selling brand in Quarter 1 2022", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Sprite" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Sprite" + }, + "intent_template_id": 279 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 2, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What is the top-{{n}} best-selling product type in {{period}}", + "instantiation_dict": { + "n": 1, + "period": "Quarter 1 2022" + }, + "intent": "What is 
the top-1 best-selling product type in Quarter 1 2022", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Yoga ball" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Yoga ball" + }, + "intent_template_id": 279 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 3, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What are the top-{{n}} best-selling product in {{year}}", + "instantiation_dict": { + "n": 2, + "year": 2022 + }, + "intent": "What are the top-2 best-selling product in 2022", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Quest Lumaflex\u2122 Band", + "Sprite Stasis Ball 65 cm" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Quest Lumaflex\u2122 Band, Sprite Stasis Ball 65 cm" + }, + "intent_template_id": 279 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 4, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What are the top-{{n}} best-selling product in {{period}}", + "instantiation_dict": { + "n": 3, + "period": "Jan 2023" + }, + "intent": "What are the top-3 best-selling product in Jan 2023", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Impulse Duffle", + "Overnight Duffle", + "Hawkeye Yoga Short-32-Blue" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Impulse Duffle, Overnight Duffle, Hawkeye Yoga Short-32-Blue" + }, + "intent_template_id": 279 + }, + { + "sites": [ + "shopping_admin" + ], + 
"task_id": 5, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What is the top-{{n}} best-selling product type in {{period}}", + "instantiation_dict": { + "n": 1, + "period": "Jan 2023" + }, + "intent": "What is the top-1 best-selling product type in Jan 2023", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Duffle" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Duffle" + }, + "intent_template_id": 279 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 6, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What are the top-{{n}} best-selling product in {{year}}", + "instantiation_dict": { + "n": 5, + "year": 2023 + }, + "intent": "What are the top-5 best-selling product in 2023", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Sprite Yoga Strap 6 foot", + "Overnight Duffle", + "Ida Workout Parachute Pant-29-Purple", + "Hawkeye Yoga Short-32-Blue", + "Sprite Stasis Ball 65 cm" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Sprite Yoga Strap 6 foot, Overnight Duffle, Ida Workout Parachute Pant-29-Purple, Hawkeye Yoga Short-32-Blue, Sprite Stasis Ball 65 cm" + }, + "intent_template_id": 279 + }, + { + "sites": [ + "map" + ], + "task_id": 7, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the full address of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", + "instantiation_dict": { + "airport_type": "international airports", + "start": "Carnegie Mellon 
University", + "radius": "50 km" + }, + "intent": "Tell me the full address of all international airports that are within a driving distance of 50 km to Carnegie Mellon University", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Pittsburgh International Airport People Movers, Airport Boulevard, Findlay Township, Allegheny County, Pennsylvania, 15231, United States" + }, + "intent_template_id": 79 + }, + { + "sites": [ + "map" + ], + "task_id": 8, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the full address of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", + "instantiation_dict": { + "airport_type": "international airports", + "start": "Carnegie Mellon University", + "radius": "5 km" + }, + "intent": "Tell me the full address of all international airports that are within a driving distance of 5 km to Carnegie Mellon University", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "There is no airport within 5 km of Carnegie Mellon University" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "There is no airport within 5 km of Carnegie Mellon University" + }, + "intent_template_id": 79 + }, + { + "sites": [ + "map" + ], + "task_id": 9, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the full address of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", + "instantiation_dict": { + "airport_type": "international 
airports", + "start": "Carnegie Art Museum", + "radius": "30 km" + }, + "intent": "Tell me the full address of all international airports that are within a driving distance of 30 km to Carnegie Art Museum", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Pittsburgh International Airport People Movers, Airport Boulevard, Findlay Township, Allegheny County, Pennsylvania, 15231, United States" + }, + "intent_template_id": 79 + }, + { + "sites": [ + "map" + ], + "task_id": 10, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the full address of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", + "instantiation_dict": { + "airport_type": "US international airports", + "start": "Niagara Falls", + "radius": "60 km" + }, + "intent": "Tell me the full address of all US international airports that are within a driving distance of 60 km to Niagara Falls", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States", + "Buffalo-Niagara International Airport, Holtz Drive, Town of Cheektowaga, Erie County, New York, 14225, United States" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States Buffalo-Niagara International Airport, South 
Youngs Road, Town of Cheektowaga, Erie County, New York, 14221, United States" + }, + "intent_template_id": 79 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 11, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the the number of reviews that our store received by far that mention term \"{{term}}\"", + "instantiation_dict": { + "term": "disappointed" + }, + "intent": "Tell me the the number of reviews that our store received by far that mention term \"disappointed\"", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "6" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "6" + }, + "intent_template_id": 288 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 12, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the the number of reviews that our store received by far that mention term \"{{term}}\"", + "instantiation_dict": { + "term": "satisfied" + }, + "intent": "Tell me the the number of reviews that our store received by far that mention term \"satisfied\"", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "2" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "2" + }, + "intent_template_id": 288 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 13, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the the number of reviews that our store received by far that mention term \"{{term}}\"", + "instantiation_dict": 
{ + "term": "decent" + }, + "intent": "Tell me the the number of reviews that our store received by far that mention term \"decent\"", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "2" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "2" + }, + "intent_template_id": 288 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 14, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the the number of reviews that our store received by far that mention term \"{{term}}\"", + "instantiation_dict": { + "term": "not useful" + }, + "intent": "Tell me the the number of reviews that our store received by far that mention term \"not useful\"", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 288 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 15, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the the number of reviews that our store received by far that mention term \"{{term}}\"", + "instantiation_dict": { + "term": "best" + }, + "intent": "Tell me the the number of reviews that our store received by far that mention term \"best\"", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "2" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "2" + }, + "intent_template_id": 288 + }, + { + "sites": [ + "map" + ], + "task_id": 16, + 
"require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Compare the time for walking and driving route from {{start}} to {{end}}", + "instantiation_dict": { + "start": "5000 Fifth Avenue, Pittsburgh", + "end": "UPMC family health center" + }, + "intent": "Compare the time for walking and driving route from 5000 Fifth Avenue, Pittsburgh to UPMC family health center", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "driving: 2min; walking: 16min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Driving: 2min. Walking: 16min." + }, + "intent_template_id": 73 + }, + { + "sites": [ + "map" + ], + "task_id": 17, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Compare the time for walking and driving route from {{start}} to {{end}}", + "instantiation_dict": { + "start": "AMC Waterfront", + "end": "Carnegie Mellon University" + }, + "intent": "Compare the time for walking and driving route from AMC Waterfront to Carnegie Mellon University", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "driving: 13min; walking: 1h 35min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "driving: 13min, walking: 1h 35min." 
+ }, + "intent_template_id": 73 + }, + { + "sites": [ + "map" + ], + "task_id": 18, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Compare the time for walking and driving route from {{start}} to {{end}}", + "instantiation_dict": { + "start": "AMC Waterfront", + "end": "Univ of Pittsburgh" + }, + "intent": "Compare the time for walking and driving route from AMC Waterfront to Univ of Pittsburgh", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "driving: 15min; walking: 1h 47min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "driving: 15min, walking: 1h 47min." + }, + "intent_template_id": 73 + }, + { + "sites": [ + "map" + ], + "task_id": 19, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Compare the time for walking and driving route from {{start}} to {{end}}", + "instantiation_dict": { + "start": "Carnegie Science Center", + "end": "Carnegie Mellon University" + }, + "intent": "Compare the time for walking and driving route from Carnegie Science Center to Carnegie Mellon University", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "driving: 12min; walking: 1h 44min." + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "driving: 12min, walking: 1h 44min." 
+ }, + "intent_template_id": 73 + }, + { + "sites": [ + "map" + ], + "task_id": 20, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Compare the difference in time for walking and driving route from {{start}} to {{end}}", + "instantiation_dict": { + "start": "Randyland", + "end": "Carnegie Mellon University" + }, + "intent": "Compare the difference in time for walking and driving route from Randyland to Carnegie Mellon University", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "driving: 13min; walking: 1h 45min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "driving: 13min, walking: 1h 45min." + }, + "intent_template_id": 73 + }, + { + "sites": [ + "shopping" + ], + "task_id": 21, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/6s-wireless-headphones-over-ear-noise-canceling-hi-fi-bass-foldable-stereo-wireless-kid-headsets-earbuds-with-built-in-mic-micro-sd-tf-fm-for-iphone-samsung-ipad-pc-black-gold.html", + "geolocation": null, + "intent_template": "List out reviewers, if exist, who mention about {{description}}", + "instantiation_dict": { + "description": "ear cups being small" + }, + "intent": "List out reviewers, if exist, who mention about ear cups being small", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Joseph Brzezinski", + "Catso", + "Dibbins", + "Anglebert Dinkherhump", + "Michelle Davis" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Joseph Brzezinski, Catso, Dibbins, Anglebert Dinkherhump, Michelle Davis" + }, + "intent_template_id": 222 + }, + { + "sites": [ + "shopping" + ], + "task_id": 22, + "require_login": true, + "storage_state": 
"./.auth/shopping_state.json", + "start_url": "__SHOPPING__/fujifilm-finepix-z200fd-10mp-digital-camera-with-5x-optical-dual-image-stabilized-zoom-black.html", + "geolocation": null, + "intent_template": "List out reviewers, if exist, who mention about {{description}}", + "instantiation_dict": { + "description": "under water photo" + }, + "intent": "List out reviewers, if exist, who mention about under water photo", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no review about under water photo", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 222 + }, + { + "sites": [ + "shopping" + ], + "task_id": 23, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/3-pack-samsung-galaxy-s6-screen-protector-nearpow-tempered-glass-screen-protector-with-9h-hardness-crystal-clear-easy-bubble-free-installation-scratch-resist.html", + "geolocation": null, + "intent_template": "List out reviewers, if exist, who mention about {{description}}", + "instantiation_dict": { + "description": "good fingerprint resistant" + }, + "intent": "List out reviewers, if exist, who mention about good fingerprint resistant", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Rachel", + "T. Gannon" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Rachel, T. 
Gannon, " + }, + "intent_template_id": 222 + }, + { + "sites": [ + "shopping" + ], + "task_id": 24, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/haflinger-men-s-wool-felt-open-back-slippers-beige-550-peat-us-7.html", + "geolocation": null, + "intent_template": "List out reviewers, if exist, who mention about {{description}}", + "instantiation_dict": { + "description": "price being unfair" + }, + "intent": "List out reviewers, if exist, who mention about price being unfair", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no reivew about price being unfair", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 222 + }, + { + "sites": [ + "shopping" + ], + "task_id": 25, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/epson-workforce-wf-3620-wifi-direct-all-in-one-color-inkjet-printer-copier-scanner-amazon-dash-replenishment-ready.html", + "geolocation": null, + "intent_template": "List out reviewers, if exist, who mention about {{description}}", + "instantiation_dict": { + "description": "average print quality" + }, + "intent": "List out reviewers, if exist, who mention about average print quality", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Goldfish", + "Roxanne Brandon Coffey" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "GoldfishGoldfish, Roxanne Brandon Coffey" + }, + "intent_template_id": 222 + }, + { + "sites": [ + "shopping" + ], + "task_id": 26, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": 
"__SHOPPING__/epson-workforce-wf-3620-wifi-direct-all-in-one-color-inkjet-printer-copier-scanner-amazon-dash-replenishment-ready.html", + "geolocation": null, + "intent_template": "List out reviewers, if exist, who mention about {{description}}", + "instantiation_dict": { + "description": "complain of the customer service" + }, + "intent": "List out reviewers, if exist, who mention about complain of the customer service", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Bob in Vegas", + "RemyR" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Bob in Vegas, RemyRRemyR" + }, + "intent_template_id": 222 + }, + { + "sites": [ + "reddit" + ], + "task_id": 27, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "instantiation_dict": { + "forum": "Showerthoughts" + }, + "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 33 + }, + { + "sites": [ + "reddit" + ], + "task_id": 28, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "instantiation_dict": { + "forum": "Worcester" + }, + "intent": 
"Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Worcester forum.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 33 + }, + { + "sites": [ + "reddit" + ], + "task_id": 29, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "instantiation_dict": { + "forum": "DIY" + }, + "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the DIY forum.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "1" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1" + }, + "intent_template_id": 33 + }, + { + "sites": [ + "reddit" + ], + "task_id": 30, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "instantiation_dict": { + "forum": "space" + }, + "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the space forum.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + 
"reference_answer_raw_annotation": "0" + }, + "intent_template_id": 33 + }, + { + "sites": [ + "reddit" + ], + "task_id": 31, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "instantiation_dict": { + "forum": "photoshopbattles" + }, + "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the photoshopbattles forum.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 33 + }, + { + "sites": [ + "map" + ], + "task_id": 32, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I will arrive {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", + "instantiation_dict": { + "place": "Pittsburgh Airport", + "information": "the walking distance", + "target1": "Hilton hotel", + "target2": "the nearest supermarket own by a local company" + }, + "intent": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. 
Then, tell me the the walking distance to the nearest supermarket own by a local company from the hotel.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "DoubleTree by Hilton Hotel Pittsburgh Airport", + "2.0km" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "DoubleTree by Hilton Hotel Pittsburgh Airport Distance: 2.0km" + }, + "intent_template_id": 78 + }, + { + "sites": [ + "map" + ], + "task_id": 33, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I will arrive {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", + "instantiation_dict": { + "place": "Pittsburgh Airport", + "target1": "Hilton hotel", + "information": "the shortest walking distance", + "target2": "a supermarket" + }, + "intent": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the shortest walking distance to a supermarket from the hotel.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "DoubleTree by Hilton Hotel Pittsburgh Airport", + "1.4km" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "DoubleTree by Hilton Hotel Pittsburgh Airport Distance: 1.4km" + }, + "intent_template_id": 78 + }, + { + "sites": [ + "map" + ], + "task_id": 34, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I will arrive {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. 
Then, tell me the {{information}} to {{target2}} from the hotel.", + "instantiation_dict": { + "place": "Pittsburgh Airport", + "target1": "Hyatt hotel", + "information": "the shortest walking time", + "target2": "a supermarket" + }, + "intent": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the shortest walking time to a supermarket from the hotel.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Hyatt Regency Pittsburgh International Airport" + ], + "fuzzy_match": [ + "Time: 3h 30min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Hyatt Regency Pittsburgh International Airport\n3:30" + }, + "intent_template_id": 78 + }, + { + "sites": [ + "map" + ], + "task_id": 35, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I will arrive {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", + "instantiation_dict": { + "place": "Pittsburgh Airport", + "target1": "Hyatt hotel", + "information": "the minimal driving time", + "target2": "a supermarket" + }, + "intent": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. 
Then, tell me the the minimal driving time to a supermarket from the hotel.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Hyatt Regency Pittsburgh International Airport" + ], + "fuzzy_match": [ + "Time: 15min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Hyatt Regency Pittsburgh International Airport Time: 15min" + }, + "intent_template_id": 78 + }, + { + "sites": [ + "map" + ], + "task_id": 36, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", + "instantiation_dict": { + "place": "social security administration", + "location": "Carnegie Mellon University" + }, + "intent": "Check if the social security administration in pittsburgh can be reached in one hour by car from Carnegie Mellon University", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["Yes"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Yes" + }, + "intent_template_id": 77 + }, + { + "sites": [ + "map" + ], + "task_id": 37, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", + "instantiation_dict": { + "place": "police station", + "location": "gates building at CMU" + }, + "intent": "Check if the police station in pittsburgh can be reached in one hour by car from gates building at CMU", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["Yes"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + 
"reference_answer_raw_annotation": "Yes" + }, + "intent_template_id": 77 + }, + { + "sites": [ + "map" + ], + "task_id": 38, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", + "instantiation_dict": { + "place": "duquesne university", + "location": "pittsburgh airport" + }, + "intent": "Check if the duquesne university in pittsburgh can be reached in one hour by car from pittsburgh airport", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["Yes"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Yes" + }, + "intent_template_id": 77 + }, + { + "sites": [ + "map" + ], + "task_id": 39, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", + "instantiation_dict": { + "place": "walmart", + "location": "5600 fifth avenue" + }, + "intent": "Check if the walmart in pittsburgh can be reached in one hour by car from 5600 fifth avenue", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["Yes"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Yes" + }, + "intent_template_id": 77 + }, + { + "sites": [ + "map" + ], + "task_id": 40, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", + "instantiation_dict": { + "place": "amc theatre", + "location": "hobart street" + }, + "intent": "Check if the amc theatre in pittsburgh can be reached in one hour by 
car from hobart street", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["Yes"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Yes" + }, + "intent_template_id": 77 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 41, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "List the top {{n}} search terms in my store", + "instantiation_dict": { + "n": "1" + }, + "intent": "List the top 1 search terms in my store", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "hollister" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "hollister" + }, + "intent_template_id": 285 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 42, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "List the top {{n}} search terms in my store", + "instantiation_dict": { + "n": "2" + }, + "intent": "List the top 2 search terms in my store", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "hollister", + "Joust Bag" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "hollister, Joust Bag" + }, + "intent_template_id": 285 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 43, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "List the top {{n}} search terms in my store", + "instantiation_dict": { + "n": "3" + }, + "intent": "List the top 3 search terms 
in my store", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "hollister", + "Joust Bag", + "Antonia Racer Tank" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "hollister, Joust Bag, Antonia Race Tank" + }, + "intent_template_id": 285 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 44, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Check out my todos", + "instantiation_dict": {}, + "intent": "Check out my todos", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/dashboard/todos", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 303 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 45, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "Check out the most recent open issues", + "instantiation_dict": {}, + "intent": "Check out the most recent open issues", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/?sort=created_asc&state=opened", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 300 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 46, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/primer/design", + "geolocation": null, + "intent_template": "Check out the most recent open issues", + "instantiation_dict": {}, + "intent": "Check out the most recent open issues", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": 
"__GITLAB__/primer/design/-/issues/?sort=created_date&state=opened", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 300 + }, + { + "sites": [ + "shopping" + ], + "task_id": 47, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.", + "instantiation_dict": { + "period": "over the past month" + }, + "intent": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past month, and the total amount of money I spent.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "0 order", + "$0 total spend" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0 order, $0 total spend" + }, + "intent_template_id": 197 + }, + { + "sites": [ + "shopping" + ], + "task_id": 48, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.", + "instantiation_dict": { + "period": "over the past three days" + }, + "intent": "Today is 6/12/2023. 
Tell me how many fulfilled orders I have over the past three days, and the total amount of money I spent.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "0 order", + "$0 total spend" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0 order, $0 total spend" + }, + "intent_template_id": 197 + }, + { + "sites": [ + "shopping" + ], + "task_id": 49, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.", + "instantiation_dict": { + "period": "over the past four month" + }, + "intent": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past four month, and the total amount of money I spent.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "3 orders", + "$845.49 total spend" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "3 orders, $845.49 total spend" + }, + "intent_template_id": 197 + }, + { + "sites": [ + "shopping" + ], + "task_id": 50, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.", + "instantiation_dict": { + "period": "over the past year" + }, + "intent": "Today is 6/12/2023. 
Tell me how many fulfilled orders I have over the past year, and the total amount of money I spent.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "24 orders", + "$6560.69 total spend" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "24 orders, $6560.69 total spend" + }, + "intent_template_id": 197 + }, + { + "sites": [ + "shopping" + ], + "task_id": 51, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.", + "instantiation_dict": { + "period": "over the past six month" + }, + "intent": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past six month, and the total amount of money I spent.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "12 orders", + "$1603.69 total spend" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "12 orders, $1603.69 total spend" + }, + "intent_template_id": 197 + }, + { + "sites": [ + "map" + ], + "task_id": 52, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "How long does it take to walk from {{start}} to {{end}}?", + "instantiation_dict": { + "start": "Carnegie Mellon University", + "end": "starbucks on Craig Street" + }, + "intent": "How long does it take to walk from Carnegie Mellon University to starbucks on Craig Street?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "7 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + 
"reference_answer_raw_annotation": "7 min" + }, + "intent_template_id": 68 + }, + { + "sites": [ + "map" + ], + "task_id": 53, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "How long does it take to walk from {{start}} to {{end}}?", + "instantiation_dict": { + "start": "Univ of Pittsburgh", + "end": "starbucks on Craig Street" + }, + "intent": "How long does it take to walk from Univ of Pittsburgh to starbucks on Craig Street?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "18 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "18 min" + }, + "intent_template_id": 68 + }, + { + "sites": [ + "map" + ], + "task_id": 54, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "How long does it take to walk from {{start}} to {{end}}?", + "instantiation_dict": { + "start": "Carnegie Mellon University", + "end": "Univ of Pittsburgh" + }, + "intent": "How long does it take to walk from Carnegie Mellon University to Univ of Pittsburgh?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "25 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "25 min" + }, + "intent_template_id": 68 + }, + { + "sites": [ + "map" + ], + "task_id": 55, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "How long does it take to walk from {{start}} to {{end}}?", + "instantiation_dict": { + "start": "the starbuck near CMU", + "end": "Chatham university" + }, + "intent": "How long does it take to walk from the starbuck near CMU to Chatham university?", + "require_reset": false, + "eval": { + "eval_types": [ + 
"string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "30 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "30 min" + }, + "intent_template_id": 68 + }, + { + "sites": [ + "map" + ], + "task_id": 56, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "How long does it take to walk from {{start}} to {{end}}?", + "instantiation_dict": { + "start": "Carnegie Museum of Art", + "end": "a library at CMU" + }, + "intent": "How long does it take to walk from Carnegie Museum of Art to a library at CMU?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "11 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "11 min" + }, + "intent_template_id": 68 + }, + { + "sites": [ + "map" + ], + "task_id": 57, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "instantiation_dict": { + "place1": "restaurant", + "place2": "university center at Carnegie Mellon University" + }, + "intent": "Tell me the closest restaurant(s) to university center at Carnegie Mellon University", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "El Gallo de Oro", + "Back Bar Grill", + "Grano", + "Beefsteak", + "Nourish", + "Schatz Dining Room", + "Au Bon Pain" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "El Gallo de Oro, Back Bar Grill, Grano, Beefsteak, Nourish, Schatz Dining Room, Au Bon Pain" + }, + "intent_template_id": 69 + }, + { + "sites": [ + "map" + ], + "task_id": 58, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + 
"geolocation": null, + "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "instantiation_dict": { + "place1": "cafe", + "place2": "CMU Hunt library" + }, + "intent": "Tell me the closest cafe(s) to CMU Hunt library", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "De Fer Coffee & Tea" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "De Fer Coffee & Tea" + }, + "intent_template_id": 69 + }, + { + "sites": [ + "map" + ], + "task_id": 59, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "instantiation_dict": { + "place1": "restaurant", + "place2": "CMU Hunt library" + }, + "intent": "Tell me the closest restaurant(s) to CMU Hunt library", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "The exchange" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "The exchange" + }, + "intent_template_id": 69 + }, + { + "sites": [ + "map" + ], + "task_id": 60, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "instantiation_dict": { + "place1": "restaurant", + "place2": "CMU Posner Hall" + }, + "intent": "Tell me the closest restaurant(s) to CMU Posner Hall", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "The exchange" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "The exchange" + }, + "intent_template_id": 69 + }, + { + "sites": [ + "map" + ], + "task_id": 61, + "require_login": true, + 
"storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "instantiation_dict": { + "place1": "restaurant", + "place2": "CMU Sorrells Library" + }, + "intent": "Tell me the closest restaurant(s) to CMU Sorrells Library", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "La Prima Espresso" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "La Prima Espresso" + }, + "intent_template_id": 69 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 62, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Which customer has completed the {{quantifier}} number of orders in the entire history?", + "instantiation_dict": { + "quantifier": "most" + }, + "intent": "Which customer has completed the most number of orders in the entire history?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Jane Smith" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Jane Smith" + }, + "intent_template_id": 276 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 63, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Which customer(s) has completed the {{quantifier}} number of orders in the entire history?", + "instantiation_dict": { + "quantifier": "second most" + }, + "intent": "Which customer(s) has completed the second most number of orders in the entire history?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Adam Garcia", 
+ "Michael Nguyen", + "Sarah Miller" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Adam Garcia, Michael Nguyen, Sarah Miller" + }, + "intent_template_id": 276 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 64, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Which customer has placed {{number}} orders in the entire history?", + "instantiation_dict": { + "number": "2" + }, + "intent": "Which customer has placed 2 orders in the entire history?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Lisa Kim", + "Lisa Green", + "Julia Williams", + "Brian Smith", + "Alexander Thomas" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Lisa Kim, Lisa Green, Julia Williams, Brian Smith, Alexander Thomas" + }, + "intent_template_id": 276 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 65, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Which customer has completed the {{quantifier}} number of orders in the entire history?", + "instantiation_dict": { + "quantifier": "fifth most" + }, + "intent": "Which customer has completed the fifth most number of orders in the entire history?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Jane Doe" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Jane Doe" + }, + "intent_template_id": 276 + }, + { + "sites": [ + "reddit" + ], + "task_id": 66, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + 
"geolocation": null, + "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}", + "instantiation_dict": { + "number": 10, + "subreddit": "books", + "description": "show me the post URLs that recommand a single book" + }, + "intent": "Among the top 10 post in \"books\" forum, show me the post URLs that recommand a single book", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol", + "http://www.reddit.com/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "__REDDIT__/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol, __REDDIT__/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter" + }, + "intent_template_id": 17 + }, + { + "sites": [ + "reddit" + ], + "task_id": 67, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}", + "instantiation_dict": { + "number": 10, + "subreddit": "books", + "description": "show me the book names from posts that recommand a single book" + }, + "intent": "Among the top 10 post in \"books\" forum, show me the book names from posts that recommand a single book", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "A Christmas Carol", + "The Hobbit" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "A Christmas Carol, The Hobbit" + }, + "intent_template_id": 17 + }, + { + "sites": [ + "reddit" + ], + "task_id": 68, + "require_login": true, + "storage_state": 
"./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}", + "instantiation_dict": { + "number": 10, + "subreddit": "books", + "description": "show me the author name and the book name from posts that recommand a single book" + }, + "intent": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "A Christmas Carol", + "Levar Burton", + "The Hobbit", + "J. R. R. Tolkien" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "A Christmas Carol by Levar Burton: , The Hobbit by J. R. R. Tolkien" + }, + "intent_template_id": 17 + }, + { + "sites": [ + "reddit" + ], + "task_id": 69, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}", + "instantiation_dict": { + "number": 10, + "subreddit": "books", + "description": "is there any post talks about supporting local book stores? If so, tell me the organizations involved" + }, + "intent": "Among the top 10 post in \"books\" forum, is there any post talks about supporting local book stores? 
If so, tell me the organizations involved", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "bookshop.org" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "bookshop.org" + }, + "intent_template_id": 17 + }, + { + "sites": [ + "map" + ], + "task_id": 70, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the zip code of {{place}}?", + "instantiation_dict": { + "place": "Carnegie Mellon University" + }, + "intent": "What is the zip code of Carnegie Mellon University?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "15213" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "15213" + }, + "intent_template_id": 70 + }, + { + "sites": [ + "map" + ], + "task_id": 71, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the zip code of {{place}}?", + "instantiation_dict": { + "place": "Chatham University" + }, + "intent": "What is the zip code of Chatham University?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "15232" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "15232" + }, + "intent_template_id": 70 + }, + { + "sites": [ + "map" + ], + "task_id": 72, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the zip code of {{place}}?", + "instantiation_dict": { + "place": "Yale University" + }, + "intent": "What is the zip code of Yale University?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + 
"reference_answers": { + "exact_match": "06516" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "06516" + }, + "intent_template_id": 70 + }, + { + "sites": [ + "map" + ], + "task_id": 73, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the zip code of {{place}}?", + "instantiation_dict": { + "place": "Columbia University" + }, + "intent": "What is the zip code of Columbia University?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "10027" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "10027" + }, + "intent_template_id": 70 + }, + { + "sites": [ + "map" + ], + "task_id": 74, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Given the following locations, {{place_list}}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", + "instantiation_dict": { + "place_list": [ + "Carnegie Mellon University", + "apple store shadyside", + "starbucks on craig street" + ] + }, + "intent": "Given the following locations, ['Carnegie Mellon University', 'apple store shadyside', 'starbucks on craig street'], what would be the optimal route to travel through them all in order to minimize total travel time? 
Please note the journey begins at the first place listed.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "The order is Carnegie Mellon University, starbucks on forbes ave, apple store shadyside" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Carnegie Mellon University, starbucks on forbes ave, apple store shadyside" + }, + "intent_template_id": 65 + }, + { + "sites": [ + "map" + ], + "task_id": 75, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Given the following locations, {{place_list}}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", + "instantiation_dict": { + "place_list": [ + "Massachusetts Institute of Technology", + "Harvard University", + "Boston Logan International Airport" + ] + }, + "intent": "Given the following locations, ['Massachusetts Institute of Technology', 'Harvard University', 'Boston Logan International Airport'], what would be the optimal route to travel through them all in order to minimize total travel time? 
Please note the journey begins at the first place listed.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "The order is Massachusetts Institute of Technology, Harvard University, Boston Logan International Airport" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Massachusetts Institute of Technology, Harvard University, Boston Logan International Airport" + }, + "intent_template_id": 65 + }, + { + "sites": [ + "map" + ], + "task_id": 76, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Given the following locations, {{place_list}}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", + "instantiation_dict": { + "place_list": [ + "Princeton University", + "Yale University", + "Harvard University" + ] + }, + "intent": "Given the following locations, ['Princeton University', 'Yale University', 'Harvard University'], what would be the optimal route to travel through them all in order to minimize total travel time? 
Please note the journey begins at the first place listed.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "The order is Princeton University, Yale University, Harvard University" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Princeton University, Yale University, Harvard University" + }, + "intent_template_id": 65 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 77, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?", + "instantiation_dict": { + "status": "Pending" + }, + "intent": "What is the total count of Pending reviews amongst all the reviews?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "5" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "5" + }, + "intent_template_id": 277 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 78, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?", + "instantiation_dict": { + "status": "Approved" + }, + "intent": "What is the total count of Approved reviews amongst all the reviews?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "346" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "346" + }, + "intent_template_id": 277 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 79, + "require_login": true, + 
"storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?", + "instantiation_dict": { + "status": "Not Approved" + }, + "intent": "What is the total count of Not Approved reviews amongst all the reviews?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 277 + }, + { + "sites": [ + "map" + ], + "task_id": 80, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?", + "instantiation_dict": { + "place_A": "Carnegie Mellon University", + "place_B": "Starbucks on Craig Street", + "place_C": "Pittsburgh International Airport" + }, + "intent": "What is the duration required to first walk from Carnegie Mellon University to Starbucks on Craig Street, and then drive to Pittsburgh International Airport?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "38 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "38 min" + }, + "intent_template_id": 72 + }, + { + "sites": [ + "map" + ], + "task_id": 81, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?", + "instantiation_dict": { + "place_A": "Univ of Pittsburgh", + "place_B": "starbucks on Craig Street", + "place_C": "Pittsburgh International Airport" + }, + "intent": "What is 
the duration required to first walk from Univ of Pittsburgh to starbucks on Craig Street, and then drive to Pittsburgh International Airport?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "49 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "49 min" + }, + "intent_template_id": 72 + }, + { + "sites": [ + "map" + ], + "task_id": 82, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?", + "instantiation_dict": { + "place_A": "Massachusetts Institute of Technology", + "place_B": "Harvard University", + "place_C": "Boston Logan International Airport" + }, + "intent": "What is the duration required to first walk from Massachusetts Institute of Technology to Harvard University, and then drive to Boston Logan International Airport?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "63 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "63 min" + }, + "intent_template_id": 72 + }, + { + "sites": [ + "map" + ], + "task_id": 83, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?", + "instantiation_dict": { + "place_A": "Carnegie Mellon University", + "place_B": "apple store shadyside", + "place_C": "starbucks on craig street" + }, + "intent": "What is the duration required to first walk from Carnegie Mellon University to apple store shadyside, and then drive to starbucks on craig street?", + "require_reset": false, + "eval": { + "eval_types": 
[ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "22 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "22 min" + }, + "intent_template_id": 72 + }, + { + "sites": [ + "map" + ], + "task_id": 84, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", + "instantiation_dict": { + "hotel": "DoubleTree by Hilton New York Downtown", + "place": "Keens Steakhouse" + }, + "intent": "From my stay at DoubleTree by Hilton New York Downtown, what's the estimated driving time to reach Keens Steakhouse?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "14 minutes" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "14 minutes" + }, + "intent_template_id": 64 + }, + { + "sites": [ + "map" + ], + "task_id": 85, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", + "instantiation_dict": { + "hotel": "La Quinta Inn near the airport", + "place": "Carnegie Mellon University" + }, + "intent": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Carnegie Mellon University?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "30 minutes" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "30 minutes" + }, + "intent_template_id": 64 + }, + { + "sites": [ + "map" + ], + "task_id": 86, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": 
"From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", + "instantiation_dict": { + "hotel": "La Quinta Inn near the airport", + "place": "Upitt" + }, + "intent": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Upitt?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "29 minutes" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "29 minutes" + }, + "intent_template_id": 64 + }, + { + "sites": [ + "map" + ], + "task_id": 87, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", + "instantiation_dict": { + "hotel": "red roof inn", + "place": "Pittsburgh science museum" + }, + "intent": "From my stay at red roof inn, what's the estimated driving time to reach Pittsburgh science museum?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "20 minutes" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "20 minutes" + }, + "intent_template_id": 64 + }, + { + "sites": [ + "map" + ], + "task_id": 88, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", + "instantiation_dict": { + "hotel": "Homewood Suites Southpointe", + "place": "PPG Paints Arena" + }, + "intent": "From my stay at Homewood Suites Southpointe, what's the estimated driving time to reach PPG Paints Arena?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "34 minutes" + ] + }, + "reference_url": "", + 
"program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "34 minutes" + }, + "intent_template_id": 64 + }, + { + "sites": [ + "map" + ], + "task_id": 89, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Which US states border {{state}}?", + "instantiation_dict": { + "state": "Connecticut" + }, + "intent": "Which US states border Connecticut?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Rhode Island", + "Massachusetts", + "New York" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Rhode Island, Massachusetts, New York" + }, + "intent_template_id": 67 + }, + { + "sites": [ + "map" + ], + "task_id": 90, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Which US states border {{state}}?", + "instantiation_dict": { + "state": "Pennsylvania" + }, + "intent": "Which US states border Pennsylvania?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Ohio", + "Maryland", + "New York", + "New Jersey", + "Delaware", + "West Virginia" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Ohio, Maryland, New York, New Jersey, Delaware, West Virginia" + }, + "intent_template_id": 67 + }, + { + "sites": [ + "map" + ], + "task_id": 91, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Which US states border {{state}}?", + "instantiation_dict": { + "state": "Massachusetts" + }, + "intent": "Which US states border Massachusetts?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Rhode Island", + 
"Connecticut", + "New York", + "New Hampshire", + "Vermont" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Rhode Island, Connecticut, New York, New Hampshire, Vermont" + }, + "intent_template_id": 67 + }, + { + "sites": [ + "map" + ], + "task_id": 92, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Which US states border {{state}}?", + "instantiation_dict": { + "state": "Vermont" + }, + "intent": "Which US states border Vermont?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "New York", + "New Hampshire", + "Massachusetts" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "New York, New Hampshire, Massachusetts" + }, + "intent_template_id": 67 + }, + { + "sites": [ + "map" + ], + "task_id": 93, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Which US states border {{state}}?", + "instantiation_dict": { + "state": "New Hampshire" + }, + "intent": "Which US states border New Hampshire?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Massachusetts", + "Vermont", + "Maine" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Massachusetts, Vermont, Maine" + }, + "intent_template_id": 67 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 94, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Telll me the grand total of invoice {{id}}.", + "instantiation_dict": { + "id": "000000001" + }, + "intent": "Telll me the grand total of invoice 000000001.", + "require_reset": 
false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "36.39" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$36.39" + }, + "intent_template_id": 274 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 95, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Telll me the grand total of invoice {{id}}.", + "instantiation_dict": { + "id": "000000002" + }, + "intent": "Telll me the grand total of invoice 000000002.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "39.64" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$39.64" + }, + "intent_template_id": 274 + }, + { + "sites": [ + "shopping" + ], + "task_id": 96, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me the status of my latest order and when will it arrive", + "instantiation_dict": {}, + "intent": "Tell me the status of my latest order and when will it arrive", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "The last order was canceled. It will never arrive." + ] + }, + "reference_url": "", + "program_html": [], + "reference_answer_raw_annotation": "The last order was canceled. 
It will never arrive.", + "string_note": "" + }, + "intent_template_id": 193 + }, + { + "sites": [ + "map", + "wikipedia" + ], + "task_id": 97, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts", + "instantiation_dict": {}, + "intent": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "914km |OR| 914 km" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "914 km" + }, + "intent_template_id": 120 + }, + { + "sites": [ + "map" + ], + "task_id": 98, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", + "instantiation_dict": { + "places": "tea cafe", + "start": "University of Pittsburgh" + }, + "intent": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Fuku Tea", + "3716", + "Forbes Avenue", + "Central Oakland", + "Pittsburgh", + "653m" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Fuku Tea, 3716, Forbes Avenue, Oakland, Central Oakland, Pittsburgh, Allegheny County, Pennsylvania, 15213, United States\n653m" + }, + "intent_template_id": 66 + }, + { + "sites": [ + "map" + ], + "task_id": 99, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Where is the nearest {{places}} to 
{{start}}, and what is the walking distance to it?", + "instantiation_dict": { + "places": "Five Guys", + "start": "5700 Penn Ave" + }, + "intent": "Where is the nearest Five Guys to 5700 Penn Ave, and what is the walking distance to it?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Five Guys", + "117", + "South Bouquet Street", + "North Oakland", + "Pittsburgh", + "4.0km" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Five Guys, 117, South Bouquet Street, Oakland, North Oakland, Pittsburgh, Allegheny County, Pennsylvania, 15213, United States\n4.0km" + }, + "intent_template_id": 66 + }, + { + "sites": [ + "map" + ], + "task_id": 100, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", + "instantiation_dict": { + "places": "Starbucks", + "start": "Carnegie Mellon" + }, + "intent": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Starbucks", + "417", + "South Craig Street", + "Bellefield", + "Pittsburgh", + "557m" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Starbucks, 417, South Craig Street, Bellefield, Pittsburgh, Allegheny County, Pennsylvania, 15213, United States\n557m" + }, + "intent_template_id": 66 + }, + { + "sites": [ + "map" + ], + "task_id": 101, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", + "instantiation_dict": { + "places": "In-N-Out", + "start": "Upitts" 
+ }, + "intent": "Where is the nearest In-N-Out to Upitts, and what is the walking distance to it?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no In-N-Out near University of Pittsburgh", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 66 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 102, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": { + "label": "help needed", + "repo": "a11yproject/a11yproject.com" + }, + "intent": "Display the list of issues in the a11yproject/a11yproject.com repository that have labels related to help needed", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/?label_name%5B%5D=help%20wanted", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 349 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 103, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": { + "label": "questions", + "repo": "kkroening/ffmpeg-python" + }, + "intent": "Display the list of issues in the kkroening/ffmpeg-python repository that have labels related to questions", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/kkroening/ffmpeg-python/-/issues/?label_name%5B%5D=question", + "program_html": [], + "url_note": "GOLD in PRED" 
+ }, + "intent_template_id": 349 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 104, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": { + "label": "flaky-test", + "repo": "keycloak/keycloak" + }, + "intent": "Display the list of issues in the keycloak/keycloak repository that have labels related to flaky-test", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/keycloak/keycloak/-/issues/?label_name%5B%5D=flaky-test", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 349 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 105, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": { + "label": "OpenAPI Generator CLI", + "repo": "OpenAPITools/openapi-generator" + }, + "intent": "Display the list of issues in the OpenAPITools/openapi-generator repository that have labels related to OpenAPI Generator CLI", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/OpenAPITools/openapi-generator/-/issues/?label_name%5B%5D=OpenAPI%20Generator%20CLI", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 349 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 106, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": { + "label": 
"BUG", + "repo": "umano/AndroidSlidingUpPanel" + }, + "intent": "Display the list of issues in the umano/AndroidSlidingUpPanel repository that have labels related to BUG", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/umano/AndroidSlidingUpPanel/-/issues/?label_name%5B%5D=BUG", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 349 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 107, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Presents the monthly count of successful orders {{period}} in MM:COUNT format", + "instantiation_dict": { + "period": "from May to December 2022" + }, + "intent": "Presents the monthly count of successful orders from May to December 2022 in MM:COUNT format", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "May: 8 orders", + "June: 13 orders", + "July: 9 orders", + "August: 8 orders", + "Sepetember: 10 orders", + "October: 4 orders", + "November: 5 orders", + "December: 10 orders" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders December: 10 orders " + }, + "intent_template_id": 270 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 108, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Presents the monthly count of successful orders {{period}} in MM:COUNT format", + "instantiation_dict": { + "period": "01/2023-05/2023" + }, + "intent": "Presents the monthly count of successful orders 01/2023-05/2023 in MM:COUNT format", + 
"require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "January: 12 orders", + "Feburary: 7 orders", + "March: 5 orders", + "April: 9 orders", + "May: 5 orders" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "January: 12 orders Febulary: 7 orders March: 5 orders Apirl: 9 orders May: 5 orders" + }, + "intent_template_id": 270 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 109, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Presents the monthly count of successful orders {{period}} in MM:COUNT format", + "instantiation_dict": { + "period": "from Jan to December 2022" + }, + "intent": "Presents the monthly count of successful orders from Jan to December 2022 in MM:COUNT format", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "January: 11 orders", + "Feburary: 16 orders", + "March: 14 orders", + "April: 7 orders", + "May: 8 orders", + "June: 13 orders", + "July: 9 orders", + "August: 8 orders", + "Sepetember: 10 orders", + "Octorbor: 4 orders", + "November: 5 orders", + "December: 10 orders" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "January: 11 orders Feburary: 16 orders March: 14 orders April: 7 orders May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders December: 10 orders " + }, + "intent_template_id": 270 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 110, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Presents the monthly count of successful orders {{period}} in MM:COUNT 
format", + "instantiation_dict": { + "period": "from Jan to Nov 2022" + }, + "intent": "Presents the monthly count of successful orders from Jan to Nov 2022 in MM:COUNT format", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "January: 11 orders", + "Feburary: 16 orders", + "March: 14 orders", + "April: 7 orders", + "May: 8 orders", + "June: 13 orders", + "July: 9 orders", + "August: 8 orders", + "Sepetember: 10 orders", + "Octorbor: 4 orders", + "November: 5 orders" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "January: 11 orders Feburary: 16 orders March: 14 orders April: 7 orders May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders " + }, + "intent_template_id": 270 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 111, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Presents the monthly count of successful orders {{period}} in MM:COUNT format", + "instantiation_dict": { + "period": "from Feb to Nov 2022" + }, + "intent": "Presents the monthly count of successful orders from Feb to Nov 2022 in MM:COUNT format", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Feburary: 16 orders", + "March: 14 orders", + "April: 7 orders", + "May: 8 orders", + "June: 13 orders", + "July: 9 orders", + "August: 8 orders", + "Sepetember: 10 orders", + "Octorbor: 4 orders", + "November: 5 orders" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Feburary: 16 orders March: 14 orders April: 7 orders May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders 
" + }, + "intent_template_id": 270 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 112, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show me the customers who have expressed dissatisfaction with {{product}}?", + "instantiation_dict": { + "product": "Circe fleece" + }, + "intent": "Show me the customers who have expressed dissatisfaction with Circe fleece?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Hannah Lim" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Hannah Lim" + }, + "intent_template_id": 245 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 113, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show me the customers who have expressed dissatisfaction with {{product}}?", + "instantiation_dict": { + "product": "Olivia zip jacket" + }, + "intent": "Show me the customers who have expressed dissatisfaction with Olivia zip jacket?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Emma Lopez", + "Seam Miller" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Emma Lopez, Seam Miller" + }, + "intent_template_id": 245 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 114, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show me the customers who have expressed dissatisfaction with {{product}}?", + "instantiation_dict": { + "product": "Antonia racer tank" + }, + "intent": "Show me the customers who have expressed 
dissatisfaction with Antonia racer tank?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Shaunte", + "Merrie" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Shaunte, Merrie" + }, + "intent_template_id": 245 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 115, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show me the name of the customers who have expressed dissatisfaction with {{product}}", + "instantiation_dict": { + "product": "Chloe tank" + }, + "intent": "Show me the name of the customers who have expressed dissatisfaction with Chloe tank", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no negative review for Chloe tank", + "reference_answer_raw_annotation": "" + }, + "intent_template_id": 245 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 116, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show me the name of the customers who have expressed dissatisfaction with {{product}}?", + "instantiation_dict": { + "product": "tanks products" + }, + "intent": "Show me the name of the customers who have expressed dissatisfaction with tanks products?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Alexander", + "Carma", + "Dominic", + "Merrie", + "Monroe", + "Scotty", + "Shaunte", + "Teofila", + "Valorie" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Alexander, Carma, Dominic, 
Merrie, Monroe, Scotty, Shaunte, Teofila, Valorie" + }, + "intent_template_id": 245 + }, + { + "sites": [ + "shopping" + ], + "task_id": 117, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the date when I made my first purchase on this site?", + "instantiation_dict": {}, + "intent": "What is the date when I made my first purchase on this site?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "3/2/22" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "3/2/22" + }, + "intent_template_id": 161 + }, + { + "sites": [ + "shopping" + ], + "task_id": 118, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I have jaw bruxism problem, show me something that could alleviate the problem.", + "instantiation_dict": {}, + "intent": "I have jaw bruxism problem, show me something that could alleviate the problem.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": null, + "program_html": [ + { + "url": "last", + "locator": "", + "required_contents": { + "must_include": [ + "jaw bruxism", + "mouth guard" + ] + } + } + ] + }, + "intent_template_id": 151 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 119, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the reasons why customers like {{product}}", + "instantiation_dict": { + "product": "Antonia Racer Tank" + }, + "intent": "Tell me the reasons why customers like Antonia Racer Tank", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { 
+ "fuzzy_match": [ + "Its color and style is good" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Its color and style is good" + }, + "intent_template_id": 250 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 120, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the reasons why customers like {{product}}", + "instantiation_dict": { + "product": "Ana Running Short" + }, + "intent": "Tell me the reasons why customers like Ana Running Short", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "It is comfortable" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "It is comfortable" + }, + "intent_template_id": 250 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 121, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the reasons why customers like {{product}}", + "instantiation_dict": { + "product": "Circe hooded fleece" + }, + "intent": "Tell me the reasons why customers like Circe hooded fleece", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Warm and comfortable. True to size." + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Warm and comfortable. True to size." 
+ }, + "intent_template_id": 250 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 122, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the reasons why customers like {{product}}", + "instantiation_dict": { + "product": "Olivia zip jacket" + }, + "intent": "Tell me the reasons why customers like Olivia zip jacket", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Lightweight, comfortable and stylish. Good design and details." + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Lightweight, comfortable, and stylish. Good design and details." + }, + "intent_template_id": 250 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 123, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the reasons why customers like {{product}}", + "instantiation_dict": { + "product": "Circe's products" + }, + "intent": "Tell me the reasons why customers like Circe's products", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Warm and comfortable. True to size." + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Warm and comfortable. True to size." 
+ }, + "intent_template_id": 250 + }, + { + "sites": [ + "shopping" + ], + "task_id": 124, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the price range of {{product}} in the One Stop Market?", + "instantiation_dict": { + "product": "wireless earphone" + }, + "intent": "What is the price range of wireless earphone in the One Stop Market?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0.14", + "745.00" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$0.14 - $745.00" + }, + "intent_template_id": 159 + }, + { + "sites": [ + "shopping" + ], + "task_id": 125, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the price range of {{product}} in the One Stop Market?", + "instantiation_dict": { + "product": "teeth grinding mouth guard" + }, + "intent": "What is the price range of teeth grinding mouth guard in the One Stop Market?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "1.46", + "85" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$1.46 - $85" + }, + "intent_template_id": 159 + }, + { + "sites": [ + "shopping" + ], + "task_id": 126, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the price range of {{product}} in the One Stop Market?", + "instantiation_dict": { + "product": "Canon photo printer" + }, + "intent": "What is the price range of Canon photo printer in the One Stop Market?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" 
+ ], + "reference_answers": { + "must_include": [ + "2.56", + "649.99" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$2.56 - $649.99" + }, + "intent_template_id": 159 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 127, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What brands appear most frequently among the top search terms?", + "instantiation_dict": {}, + "intent": "What brands appear most frequently among the top search terms?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Hollister", + "Joust", + "Antonia" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Hollister, Joust, Antonia" + }, + "intent_template_id": 1001 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 128, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What's the total number of items sold in the most recent {{k}} orders?", + "instantiation_dict": { + "k": "2" + }, + "intent": "What's the total number of items sold in the most recent 2 orders?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "9" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "9" + }, + "intent_template_id": 1002 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 129, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What's the total number of items sold in the most recent {{k}} orders?", + "instantiation_dict": { + 
"k": "4" + }, + "intent": "What's the total number of items sold in the most recent 4 orders?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "16" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "16" + }, + "intent_template_id": 1002 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 130, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What's the total number of items sold in the most recent {{k}} orders?", + "instantiation_dict": { + "k": "5" + }, + "intent": "What's the total number of items sold in the most recent 5 orders?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "18" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "18" + }, + "intent_template_id": 1002 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 131, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What's the total number of items sold in the most recent {{k}} orders?", + "instantiation_dict": { + "k": "7" + }, + "intent": "What's the total number of items sold in the most recent 7 orders?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "25" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "25" + }, + "intent_template_id": 1002 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 132, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "How many 
commits did {{user}} make to {{repo}} on {{date}}?", + "instantiation_dict": { + "user": "kilian", + "repo": "a11yproject", + "date": "3/5/2023" + }, + "intent": "How many commits did kilian make to a11yproject on 3/5/2023?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "1" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1" + }, + "intent_template_id": 322 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 133, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", + "instantiation_dict": { + "user": "Eric", + "repo": "a11yproject", + "date": "3/2" + }, + "intent": "How many commits did Eric make to a11yproject on 3/2?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "2" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "2" + }, + "intent_template_id": 322 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 134, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", + "instantiation_dict": { + "user": "kilian", + "repo": "a11yproject", + "date": "3/1/2023" + }, + "intent": "How many commits did kilian make to a11yproject on 3/1/2023?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 322 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 135, + "require_login": 
true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", + "instantiation_dict": { + "user": "Eric and Kilian", + "repo": "a11yproject", + "date": "1/3/2023" + }, + "intent": "How many commits did Eric and Kilian make to a11yproject on 1/3/2023?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "1" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1" + }, + "intent_template_id": 322 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 136, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", + "instantiation_dict": { + "user": "Steven Woodson", + "repo": "a11y-webring.club", + "date": "2/6/2023" + }, + "intent": "How many commits did Steven Woodson make to a11y-webring.club on 2/6/2023?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "5" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "5" + }, + "intent_template_id": 322 + }, + { + "sites": [ + "map" + ], + "task_id": 137, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", + "instantiation_dict": { + "city1": "the city where the Liberty Bell is located", + "city2": "the home city of Pirates" + }, + "intent": "What is the estimated driving time between the city where the Liberty Bell is located and the home city of Pirates?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + 
"reference_answers": { + "fuzzy_match": [ + "5h 47min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "5h 47min" + }, + "intent_template_id": 51 + }, + { + "sites": [ + "map" + ], + "task_id": 138, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", + "instantiation_dict": { + "city1": "the big apple", + "city2": "the city with the most authentic Philly cheesesteaks" + }, + "intent": "What is the estimated driving time between the big apple and the city with the most authentic Philly cheesesteaks?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "1h 58min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1h 58min" + }, + "intent_template_id": 51 + }, + { + "sites": [ + "map" + ], + "task_id": 139, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", + "instantiation_dict": { + "city1": "the hometown of Joe Biden", + "city2": "Bridgeport" + }, + "intent": "What is the estimated driving time between the hometown of Joe Biden and Bridgeport?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "3h 20min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "3h 20min" + }, + "intent_template_id": 51 + }, + { + "sites": [ + "map" + ], + "task_id": 140, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", + "instantiation_dict": { + 
"city1": "the city of Niagara Falls", + "city2": "the city of Yale University" + }, + "intent": "What is the estimated driving time between the city of Niagara Falls and the city of Yale University?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "8h 33min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "8h 33min" + }, + "intent_template_id": 51 + }, + { + "sites": [ + "shopping" + ], + "task_id": 141, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much I spent on {{category}} shopping during {{time}}", + "instantiation_dict": { + "category": "food-related", + "time": "March 2023" + }, + "intent": "How much I spent on food-related shopping during March 2023", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "47.41" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$47.41" + }, + "intent_template_id": 162 + }, + { + "sites": [ + "shopping" + ], + "task_id": 142, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much I spent on {{category}} shopping during {{time}}", + "instantiation_dict": { + "category": "hair care and hair style", + "time": "Jan 2023" + }, + "intent": "How much I spent on hair care and hair style shopping during Jan 2023", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "95.23" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$95.23" + }, + "intent_template_id": 162 + }, + { + "sites": [ + "shopping" + ], + "task_id": 143, 
+ "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much I spent on {{category}} shopping during {{time}}", + "instantiation_dict": { + "category": "home decoration", + "time": "1/29/2023" + }, + "intent": "How much I spent on home decoration shopping during 1/29/2023", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "265.69" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$265.69" + }, + "intent_template_id": 162 + }, + { + "sites": [ + "shopping" + ], + "task_id": 144, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much I spent on {{category}} shopping during {{time}}", + "instantiation_dict": { + "category": "food", + "time": "from mid Jan to the end Jan 2023" + }, + "intent": "How much I spent on food shopping during from mid Jan to the end Jan 2023", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 162 + }, + { + "sites": [ + "shopping" + ], + "task_id": 145, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much I spent on {{category}} shopping during {{time}}", + "instantiation_dict": { + "category": "cooking and food", + "time": "March 2022" + }, + "intent": "How much I spent on cooking and food shopping during March 2022", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "52.35" + ] + }, + "reference_url": "", + 
"program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$52.35" + }, + "intent_template_id": 162 + }, + { + "sites": [ + "shopping" + ], + "task_id": 146, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}", + "instantiation_dict": { + "option": "size", + "product": "picture frame", + "time": "Sep 2022" + }, + "intent": "What is the size configuration of the picture frame I bought Sep 2022", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "16x24" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "16x24" + }, + "intent_template_id": 155 + }, + { + "sites": [ + "shopping" + ], + "task_id": 147, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}", + "instantiation_dict": { + "option": "size", + "product": "picture frame", + "time": "2022" + }, + "intent": "What is the size configuration of the picture frame I bought 2022", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "16x24" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "16x24" + }, + "intent_template_id": 155 + }, + { + "sites": [ + "shopping" + ], + "task_id": 148, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}", + "instantiation_dict": { + "option": "color", + "product": "picture frame", + "time": 
"Sep 2022" + }, + "intent": "What is the color configuration of the picture frame I bought Sep 2022", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Mist" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Mist" + }, + "intent_template_id": 155 + }, + { + "sites": [ + "shopping" + ], + "task_id": 149, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}", + "instantiation_dict": { + "option": "color", + "product": "artifical plants", + "time": "Feb 2023" + }, + "intent": "What is the color configuration of the artifical plants I bought Feb 2023", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Green-vines" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Green-vines" + }, + "intent_template_id": 155 + }, + { + "sites": [ + "shopping" + ], + "task_id": 150, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}", + "instantiation_dict": { + "option": "price", + "product": "fake tree", + "time": "Jan 2023" + }, + "intent": "What is the price configuration of the fake tree I bought Jan 2023", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "260.69" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "260.69" + }, + "intent_template_id": 155 + }, + { + "sites": [ + "map" + ], + "task_id": 151, + "require_login": true, + 
"storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", + "instantiation_dict": { + "location1": "CMU", + "location2": "University of Pittsburgh" + }, + "intent": "What is the minimum travel time by car from CMU to University of Pittsburgh?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "4min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "4min" + }, + "intent_template_id": 36 + }, + { + "sites": [ + "map" + ], + "task_id": 152, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", + "instantiation_dict": { + "location1": "Schenley park", + "location2": "Upitt" + }, + "intent": "What is the minimum travel time by car from Schenley park to Upitt?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "4min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "4min" + }, + "intent_template_id": 36 + }, + { + "sites": [ + "map" + ], + "task_id": 153, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", + "instantiation_dict": { + "location1": "REI", + "location2": "CMU" + }, + "intent": "What is the minimum travel time by car from REI to CMU?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "7min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "7min" + }, + 
"intent_template_id": 36 + }, + { + "sites": [ + "map" + ], + "task_id": 154, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", + "instantiation_dict": { + "location1": "CMU gates building", + "location2": "Schenley park" + }, + "intent": "What is the minimum travel time by car from CMU gates building to Schenley park?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "4min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "4min" + }, + "intent_template_id": 36 + }, + { + "sites": [ + "map" + ], + "task_id": 155, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", + "instantiation_dict": { + "location1": "Animal Rescue League of Pittsburgh", + "location2": "Schenley park" + }, + "intent": "What is the minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley park?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "9min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "9min" + }, + "intent_template_id": 36 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 156, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Checkout merge requests assigned to me", + "instantiation_dict": {}, + "intent": "Checkout merge requests assigned to me", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": 
"__GITLAB__/dashboard/merge_requests?assignee_username=byteblaze", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 290 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 157, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show all customers", + "instantiation_dict": {}, + "intent": "Show all customers", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/customer/index/", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 255 + }, + { + "sites": [ + "shopping" + ], + "task_id": 158, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "instantiation_dict": { + "num": 11 + }, + "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 11 cards", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 171 + }, + { + "sites": [ + "shopping" + ], + "task_id": 159, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "instantiation_dict": { + "num": 31 + }, + "intent": "I have a lot of 
Nintendo Switch game cards now, help me find the best storage option to fit all 31 cards", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 171 + }, + { + "sites": [ + "shopping" + ], + "task_id": 160, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "instantiation_dict": { + "num": 6 + }, + "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 6 cards", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 171 + }, + { + "sites": [ + "shopping" + ], + "task_id": 161, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "instantiation_dict": { + "num": 23 + }, + "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 23 cards", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": 
"__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 171 + }, + { + "sites": [ + "shopping" + ], + "task_id": 162, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "instantiation_dict": { + "num": 40 + }, + "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 40 cards", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 171 + }, + { + "sites": [ + "shopping" + ], + "task_id": 163, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/ostent-16gb-memory-card-stick-storage-for-sony-ps-vita-psv1000-2000-pch-z081-z161-z321-z641.html", + "geolocation": null, + "intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.", + "instantiation_dict": {}, + "intent": "What are the main criticisms of this product? 
Please extract the relevant sentences.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "I ordered the 16gb but I only got 14 gigs even though I formatted the card", + "The memory card is kind of slow on games and downloads", + "No original packaging It's used and the previous owners data has not been erased", + "The product is a legit sony hardware that have been owned by someone else before", + "The media could not be loaded", + "I could not format the card so I wasn\u2019t able to use it for my VITA" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "I ordered the 16gb but I only got 14 gigs even though I formatted the card. The memory card is kind of slow on games and downloads. No original packaging It's used and the previous owners data has not been erased. The product is a legit sony hardware that have been owned by someone else before The media could not be loaded. I could not format the card so I wasn\u2019t able to use it for my VITA" + }, + "intent_template_id": 136 + }, + { + "sites": [ + "shopping" + ], + "task_id": 164, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/mineralogie-all-natural-lip-gloss-ruby-rose.html", + "geolocation": null, + "intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.", + "instantiation_dict": {}, + "intent": "What are the main criticisms of this product? 
Please extract the relevant sentences.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Dry", + "Uneven color" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "DryUneven color" + }, + "intent_template_id": 136 + }, + { + "sites": [ + "shopping" + ], + "task_id": 165, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/sandgrens-swedish-handmade-wooden-clog-sandal-copenhagen.html", + "geolocation": null, + "intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.", + "instantiation_dict": {}, + "intent": "What are the main criticisms of this product? Please extract the relevant sentences.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "The 39 was too small. I am afraid the 40 will be too big", + "I was very sad when the shoe rubbed up against my baby toe", + "I had to return them because I knew in time it would tear up my feet", + "The problem is that the strap is made of some really stiff leather and is painful to my heel", + "The front is also uncomfortably tight", + "The Dansko's were similar (not as bad) and loosened up over time" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "The 39 was too small. I am afraid the 40 will be too big. I was very sad when the shoe rubbed up against my baby toe. I had to return them because I knew in time it would tear up my feet. The problem is that the strap is made of some really stiff leather and is painful to my heel. The front is also uncomfortably tight. The Dansko's were similar (not as bad) and loosened up over time." 
+ }, + "intent_template_id": 136 + }, + { + "sites": [ + "shopping" + ], + "task_id": 166, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/sensodyne-repair-protect-whitening-toothpaste-with-fluoride-3-4-oz-pack-of-3.html", + "geolocation": null, + "intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.", + "instantiation_dict": {}, + "intent": "What are the main criticisms of this product? Please extract the relevant sentences.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "there is no existing criticism", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 136 + }, + { + "sites": [ + "shopping" + ], + "task_id": 167, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/photosmart-plus-b209-clr-inkjetfb-p-s-c-usb-wrls-1.html", + "geolocation": null, + "intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.", + "instantiation_dict": {}, + "intent": "What are the main criticisms of this product? Please extract the relevant sentences.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "The wireless connection works on a whim (about 40% of the time I've owned it)", + "It seems to constantly run out of ink", + "Cartridge prices are less than some printers I've had", + "This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason", + "Scanner is as slow as my first scanner I ever owned in the mid-90's", + "For the $176 I paid, there isn't even a fax component on it. 
I guess the \"PLUS\" part of it's name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months." + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "The wireless connection works on a whim (about 40% of the time I've owned it). It seems to constantly run out of ink. Cartridge prices are less than some printers I've had, but now I understand why. This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason. Scanner is as slow as my first scanner I ever owned in the mid-90's. For the $176 I paid, there isn't even a fax component on it. I guess the \"PLUS\" part of it's name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months." 
+ }, + "intent_template_id": 136 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 168, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?", + "instantiation_dict": { + "description": "more than 100" + }, + "intent": "Tell me the full names of the repositories where I made contributions and they got more than 100 stars?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "No repo found", + "reference_answer_raw_annotation": "No repo found" + }, + "intent_template_id": 289 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 169, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?", + "instantiation_dict": { + "description": "the most" + }, + "intent": "Tell me the full names of the repositories where I made contributions and they got the most stars?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "a11yproject.com", + "design" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "a11yproject.com, Primer/design" + }, + "intent_template_id": 289 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 170, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?", + "instantiation_dict": { + "description": "the 
least" + }, + "intent": "Tell me the full names of the repositories where I made contributions and they got the least stars?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "cloud-to-butt", + "dotfiles", + "timeit", + "solarized-prism-theme", + "gimmiethat.space", + "remove-board-movement-events-from-the-github-issue-timeline" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, remove-board-movement-events-from-the-github-issue-timeline" + }, + "intent_template_id": 289 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 171, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?", + "instantiation_dict": { + "description": "less than 5" + }, + "intent": "Tell me the full names of the repositories where I made contributions and they got less than 5 stars?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "a11y-syntax-highlighting", + "a11y-webring.club", + "accessible-html-content-patterns", + "ericwbailey.website", + "cloud-to-butt", + "dotfiles", + "timeit", + "solarized-prism-theme", + "gimmiethat.space", + "remove-board-movement-events-from-the-github-issue-timeline" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "a11y-syntax-highlighting, a11y-webring.club, accessible-html-content-patterns, ericwbailey.website, cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, remove-board-movement-events-from-the-github-issue-timeline" + }, + "intent_template_id": 289 + }, + { + "sites": [ + "gitlab" + ], + 
"task_id": 172, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?", + "instantiation_dict": { + "description": "no" + }, + "intent": "Tell me the full names of the repositories where I made contributions and they got no stars?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "cloud-to-butt", + "dotfiles", + "timeit", + "solarized-prism-theme", + "gimmiethat.space", + "remove-board-movement-events-from-the-github-issue-timeline" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, remove-board-movement-events-from-the-github-issue-timeline" + }, + "intent_template_id": 289 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 173, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", + "instantiation_dict": { + "keyword": "better" + }, + "intent": "Open my latest updated issue that has keyword \"better\" in its title to check if it is closed", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "fuzzy_match": ["No, it is open"] + }, + "reference_url": "__GITLAB__/byteblaze/empathy-prompts/-/issues/8", + "program_html": [], + "reference_answer_raw_annotation": "Not closed", + "string_note": "", + "url_note": "GOLD in PRED" + }, + "intent_template_id": 310 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 174, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": 
"__GITLAB__", + "geolocation": null, + "intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", + "instantiation_dict": { + "keyword": "feature" + }, + "intent": "Open my latest updated issue that has keyword \"feature\" in its title to check if it is closed", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "fuzzy_match": ["No, it is open"] + }, + "reference_url": "__GITLAB__/byteblaze/a11y-webring.club/-/issues/71", + "program_html": [], + "reference_answer_raw_annotation": "Not closed", + "string_note": "" + }, + "intent_template_id": 310 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 175, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", + "instantiation_dict": { + "keyword": "dependency" + }, + "intent": "Open my latest updated issue that has keyword \"dependency\" in its title to check if it is closed", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "fuzzy_match": ["No, it is open"] + }, + "reference_url": "__GITLAB__/byteblaze/empathy-prompts/-/issues/18", + "program_html": [], + "reference_answer_raw_annotation": "Not closed", + "string_note": "" + }, + "intent_template_id": 310 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 176, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", + "instantiation_dict": { + "keyword": "theme editor" + }, + "intent": "Open my latest updated issue that has keyword \"theme editor\" in its title to check if it is closed", + 
"require_reset": false, + "eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "fuzzy_match": ["No, it is open"] + }, + "reference_url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/1", + "program_html": [], + "reference_answer_raw_annotation": "Not closed", + "string_note": "" + }, + "intent_template_id": 310 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 177, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", + "instantiation_dict": { + "keyword": "homepage content" + }, + "intent": "Open my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "fuzzy_match": ["Yes, it is closed"] + }, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/719", + "program_html": [], + "reference_answer_raw_annotation": "closed", + "string_note": "" + }, + "intent_template_id": 310 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 178, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed", + "instantiation_dict": { + "keyword": "better" + }, + "intent": "Open my latest created issue that has better in its title to check if it is closed", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "fuzzy_match": ["Yes, it is closed"] + }, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/566", + "program_html": [], + "reference_answer_raw_annotation": "Closed", + "string_note": "" + }, + 
"intent_template_id": 500 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 179, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed", + "instantiation_dict": { + "keyword": "feature" + }, + "intent": "Open my latest created issue that has feature in its title to check if it is closed", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "fuzzy_match": ["Yes, it is closed"] + }, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/1517", + "program_html": [], + "reference_answer_raw_annotation": "Closed", + "string_note": "" + }, + "intent_template_id": 500 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 180, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed", + "instantiation_dict": { + "keyword": "dependency" + }, + "intent": "Open my latest created issue that has dependency in its title to check if it is closed", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "fuzzy_match": ["No, it is open"] + }, + "reference_url": "__GITLAB__/byteblaze/empathy-prompts/-/issues/18", + "program_html": [], + "reference_answer_raw_annotation": "Not closed", + "string_note": "" + }, + "intent_template_id": 500 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 181, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed", + "instantiation_dict": { + "keyword": "theme editor" + }, + 
"intent": "Open my latest created issue that has theme editor in its title to check if it is closed", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "fuzzy_match": ["No, it is open"] + }, + "reference_url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/1", + "program_html": [], + "reference_answer_raw_annotation": "Not closed", + "string_note": "" + }, + "intent_template_id": 500 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 182, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed", + "instantiation_dict": { + "keyword": "homepage content" + }, + "intent": "Open my latest created issue that has homepage content in its title to check if it is closed", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "fuzzy_match": ["Yes, it is closed"] + }, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/719", + "program_html": [], + "reference_answer_raw_annotation": "closed", + "string_note": "" + }, + "intent_template_id": 500 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 183, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", + "instantiation_dict": { + "Attribute": "SKU", + "N": "10" + }, + "intent": "Give me the SKU of the products that have 10 units left", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no product that has 10 quantities left.", + 
"reference_answer_raw_annotation": "There is no product that has 10 quantities left." + }, + "intent_template_id": 368 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 184, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", + "instantiation_dict": { + "Attribute": "name", + "N": "0" + }, + "intent": "Give me the name of the products that have 0 units left", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Sinbad Fitness Tank" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Sinbad Fitness Tank" + }, + "intent_template_id": 368 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 185, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", + "instantiation_dict": { + "Attribute": "brand", + "N": "3" + }, + "intent": "Give me the brand of the products that have 3 units left", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Eos", + "Minerva" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Eos, Minerva" + }, + "intent_template_id": 368 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 186, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", + "instantiation_dict": { + "Attribute": "product names and the sizes", + "N": "2-3" + }, + "intent": 
"Give me the product names and the sizes of the products that have 2-3 units left", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Eos V-Neck Hoodie: S", + "Minera Luma Tech V-Tee: XS" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Eos V-Neck Hoodie: S Minera Luma Tech V-Tee: XS" + }, + "intent_template_id": 368 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 187, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", + "instantiation_dict": { + "Attribute": "SKU", + "N": "1-3" + }, + "intent": "Give me the SKU of the products that have 1-3 units left", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "WH11-S-Blue", + "WS08-XS-Blue" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "WH11-S-Blue, WS08-XS-Blue" + }, + "intent_template_id": 368 + }, + { + "sites": [ + "shopping" + ], + "task_id": 188, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me the total cost of my latest {{status}} order?", + "instantiation_dict": { + "status": "cancelled" + }, + "intent": "Tell me the total cost of my latest cancelled order?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "365.42" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "365.42" + }, + "intent_template_id": 214 + }, + { + "sites": [ + "shopping" + ], + "task_id": 189, + "require_login": true, + 
"storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me the total cost of my latest {{status}} order?", + "instantiation_dict": { + "status": "pending" + }, + "intent": "Tell me the total cost of my latest pending order?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "754.99" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "754.99" + }, + "intent_template_id": 214 + }, + { + "sites": [ + "shopping" + ], + "task_id": 190, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me the total cost of my latest {{status}} order?", + "instantiation_dict": { + "status": "complete" + }, + "intent": "Tell me the total cost of my latest complete order?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "65.32" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "65.32" + }, + "intent_template_id": 214 + }, + { + "sites": [ + "shopping" + ], + "task_id": 191, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me the total cost of my latest {{status}} order?", + "instantiation_dict": { + "status": "processing" + }, + "intent": "Tell me the total cost of my latest processing order?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no order of \"processing\" status", + "reference_answer_raw_annotation": "There is no order of \"processing\" status" + }, + "intent_template_id": 
214 + }, + { + "sites": [ + "shopping" + ], + "task_id": 192, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me the total cost of my latest {{status}} order?", + "instantiation_dict": { + "status": "non-cancelled" + }, + "intent": "Tell me the total cost of my latest non-cancelled order?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "754.99" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "754.99" + }, + "intent_template_id": 214 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 193, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", + "instantiation_dict": { + "status": "completed", + "N": "2" + }, + "intent": "Get the total payment amount of the last 2 completed orders", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "182.4" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "182.4" + }, + "intent_template_id": 367 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 194, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", + "instantiation_dict": { + "status": "completed", + "N": "5" + }, + "intent": "Get the total payment amount of the last 5 completed orders", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "555.2" + ] + }, + 
"reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "555.2" + }, + "intent_template_id": 367 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 195, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", + "instantiation_dict": { + "status": "pending", + "N": "5" + }, + "intent": "Get the total payment amount of the last 5 pending orders", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "885.4" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "885.4" + }, + "intent_template_id": 367 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 196, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Compare the payment difference of the last {{N}} {{status_1}} orders and {{status_2}} orders", + "instantiation_dict": { + "status_1": "cancelled", + "status_2": "completed", + "N": "4" + }, + "intent": "Compare the payment difference of the last 4 cancelled orders and completed orders", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "194.25" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "194.25" + }, + "intent_template_id": 367 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 197, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", + "instantiation_dict": { + "status": 
"non-cancelled", + "N": "5" + }, + "intent": "Get the total payment amount of the last 5 non-cancelled orders", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "778.2" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "annotation_note": "219.4+210+166.4+93.4+89", + "reference_answer_raw_annotation": "778.2" + }, + "intent_template_id": 367 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 198, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": { + "attribute": "customer name", + "status": "most recent cancelled" + }, + "intent": "Get the customer name of the most recent cancelled order", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Lily Potter" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Lily Potter" + }, + "intent_template_id": 366 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 199, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": { + "attribute": "order ID", + "status": "newest pending" + }, + "intent": "Get the order ID of the newest pending order", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "299" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "299" + }, + "intent_template_id": 366 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 200, + "require_login": true, + "storage_state": 
"./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": { + "attribute": "billing name", + "status": "oldest complete" + }, + "intent": "Get the billing name of the oldest complete order", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "John Lee" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "John Lee" + }, + "intent_template_id": 366 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 201, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": { + "attribute": "customer name", + "status": "earliest fraud suspect" + }, + "intent": "Get the customer name of the earliest fraud suspect order", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no order of \"fraud suspect\" status", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 366 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 202, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": { + "attribute": "date", + "status": "most recent canlled" + }, + "intent": "Get the date of the most recent canlled order", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "May 23 2023" + ] + }, + "reference_url": "", + "program_html": [], + 
"string_note": "", + "reference_answer_raw_annotation": "May 23, 2023" + }, + "intent_template_id": 366 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 203, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": { + "attribute": "purchase date and order id", + "status": "most recent pending" + }, + "intent": "Get the purchase date and order id of the most recent pending order", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "order id: 000000299", + "purchase date: May 31, 2023" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "000000299, May 31, 2023, 2:55:09 AM" + }, + "intent_template_id": 366 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 204, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": { + "attribute": "product name and discounted price (low to high)", + "status": "most recent completed" + }, + "intent": "Get the product name and discounted price (low to high) of the most recent completed order", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Rapha Sports Short: $35", + "Thorpe Track Pant: $54.4", + "Mach Street Sweatshirt: $62" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Rapha Sports Short: $35 Thorpe Track Pant: $54.4 Mach Street Sweatshirt: $62" + }, + "intent_template_id": 366 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 205, + "require_login": true, + "storage_state": 
"./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "How many commits did {{user}} make on {{date}}?", + "instantiation_dict": { + "user": "kilian", + "date": "3/5/2023" + }, + "intent": "How many commits did kilian make on 3/5/2023?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "1" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1" + }, + "intent_template_id": 320 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 206, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "How many commits did {{user}} make on {{date}}?", + "instantiation_dict": { + "user": "Eric", + "date": "3/2" + }, + "intent": "How many commits did Eric make on 3/2?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "2" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "2" + }, + "intent_template_id": 320 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 207, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "How many commits did {{user}} make on {{date}} in total?", + "instantiation_dict": { + "user": "Eric and Kilian", + "date": "1/3/2023" + }, + "intent": "How many commits did Eric and Kilian make on 1/3/2023 in total?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "1" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1" + }, + "intent_template_id": 320 + 
}, + { + "sites": [ + "shopping_admin" + ], + "task_id": 208, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "instantiation_dict": { + "PhoneNum": "+1 2058812302" + }, + "intent": "Find the customer name and email with phone number +1 2058812302", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "John Smith", + "john.smith.xyz@gmail.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "John Smith, john.smith.xyz@gmail.com" + }, + "intent_template_id": 364 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 209, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "instantiation_dict": { + "PhoneNum": "2137418080" + }, + "intent": "Find the customer name and email with phone number 2137418080", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Jennifer White", + "jennifer.white@yahoo.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Jennifer White, jennifer.white@yahoo.com" + }, + "intent_template_id": 364 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 210, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "instantiation_dict": { + "PhoneNum": "2065555555" + }, + "intent": "Find the customer name and email with phone number 2065555555", 
+ "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Adam Garcia", + "gamingpro456@gmail.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Adam Garcia, gamingpro456@gmail.com" + }, + "intent_template_id": 364 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 211, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "instantiation_dict": { + "PhoneNum": "8015551212" + }, + "intent": "Find the customer name and email with phone number 8015551212", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Sean Miller", + "sean.miller@gmail.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Sean Miller, sean.miller@gmail.com" + }, + "intent_template_id": 364 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 212, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "instantiation_dict": { + "PhoneNum": "555-229-3326" + }, + "intent": "Find the customer name and email with phone number 555-229-3326", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Veronica Costello", + "roni_cost@example.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Veronica Costello, roni_cost@example.com" + }, + "intent_template_id": 364 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 213, + 
"require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What are the key aspects that the customers don't like about {{product}}", + "instantiation_dict": { + "product": "Antonia Racer Tank" + }, + "intent": "What are the key aspects that the customers don't like about Antonia Racer Tank", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Not suitable for high-impact workouts" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Not suitable for high-impact workouts" + }, + "intent_template_id": 249 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 214, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What are the key aspects that the customers don't like about {{product}}", + "instantiation_dict": { + "product": "Zing Jump Rope" + }, + "intent": "What are the key aspects that the customers don't like about Zing Jump Rope", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "It is hard to find the right size. Won't last long" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "It is hard to find the right size. 
Won't last long" + }, + "intent_template_id": 249 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 215, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What are the key aspects that the customers don't like about {{product}}", + "instantiation_dict": { + "product": "Circe ice fleece" + }, + "intent": "What are the key aspects that the customers don't like about Circe ice fleece", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Material quality, fit, insufficient warmth, color" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Material quality, fit, insufficient warmth, color" + }, + "intent_template_id": 249 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 216, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What are the key aspects that the customers don't like about {{product}}", + "instantiation_dict": { + "product": "Electra Bra Top" + }, + "intent": "What are the key aspects that the customers don't like about Electra Bra Top", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Not true to size" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Not true to size" + }, + "intent_template_id": 249 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 217, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What are the key aspects that the customers don't like about {{product}}", + "instantiation_dict": { + "product": "Pursuit Tone 
Band" + }, + "intent": "What are the key aspects that the customers don't like about Pursuit Tone Band", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Insufficient resistance for their workouts." + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Insufficient resistance for their workouts." + }, + "intent_template_id": 249 + }, + { + "sites": [ + "map" + ], + "task_id": 218, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the walking distance from nearby hotels to {{location}} that take at most {{n}} minutes?", + "instantiation_dict": { + "location": "CMU, Pittsburgh", + "n": "5" + }, + "intent": "Show me the walking distance from nearby hotels to CMU, Pittsburgh that take at most 5 minutes?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no hotel near CMU that is within 5 minutes walking distance", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 41 + }, + { + "sites": [ + "map" + ], + "task_id": 219, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the walking distance from nearby hotels to {{location}} that take at most {{n}} minutes?", + "instantiation_dict": { + "location": "Pittsburgh airport", + "n": "3" + }, + "intent": "Show me the walking distance from nearby hotels to Pittsburgh airport that take at most 3 minutes?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no hotel near CMU that is within 5 minutes walking distance", + 
"reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 41 + }, + { + "sites": [ + "map" + ], + "task_id": 220, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the walking distance from nearby hotels to {{location}} that take at most {{n}} minutes?", + "instantiation_dict": { + "location": "Gardner Steel Conference Center,", + "n": 5 + }, + "intent": "Show me the walking distance from nearby hotels to Gardner Steel Conference Center, that take at most 5 minutes?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Wyndham Pittsburgh University Cente: 375m", + "The Oaklander Hotel: 338m" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Wyndham Pittsburgh University Cente: 375 m\nThe Oaklander Hotel: 338 m" + }, + "intent_template_id": 41 + }, + { + "sites": [ + "map" + ], + "task_id": 221, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I am at CMU Pittsburgh, how long it takes to the nearest {{location}} with different transportation methods?", + "instantiation_dict": { + "location": "USPS postal office" + }, + "intent": "I am at CMU Pittsburgh, how long it takes to the nearest USPS postal office with different transportation methods?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Walk: 1 minute", + "Drive: less than 1 minute", + "Bike: less than 1 minute" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Walk: 1 minute to walk and\nDrive: less than 1 minute\nBike: less than 1 minute" + }, + "intent_template_id": 35 + }, + { + "sites": [ + "map" + ], + "task_id": 222, + "require_login": true, + "storage_state": null, + 
"start_url": "__MAP__", + "geolocation": null, + "intent_template": "I am at CMU Pittsburgh, how long it takes to drive to the nearest {{location}}", + "instantiation_dict": { + "location": "cold stone ice cream" + }, + "intent": "I am at CMU Pittsburgh, how long it takes to drive to the nearest cold stone ice cream", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "3min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "3min" + }, + "intent_template_id": 35 + }, + { + "sites": [ + "map" + ], + "task_id": 223, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I am at CMU Pittsburgh, how long it takes to drive to the nearest {{location}}", + "instantiation_dict": { + "location": "Mcdonald's" + }, + "intent": "I am at CMU Pittsburgh, how long it takes to drive to the nearest Mcdonald's", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "4min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "4min" + }, + "intent_template_id": 35 + }, + { + "sites": [ + "map" + ], + "task_id": 224, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I am at CMU Pittsburgh, how long it takes to drive to the nearest {{location}}", + "instantiation_dict": { + "location": "wendys" + }, + "intent": "I am at CMU Pittsburgh, how long it takes to drive to the nearest wendys", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "3min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "3min" + }, + "intent_template_id": 35 + }, + { + "sites": [ + 
"shopping" + ], + "task_id": 225, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What do customers say about {{product_type}} from {{manufature}}", + "instantiation_dict": { + "product_type": "brush", + "manufature": "sephora" + }, + "intent": "What do customers say about brush from sephora", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "The sephora brushes don't have reviews", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 135 + }, + { + "sites": [ + "shopping" + ], + "task_id": 226, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the price range for products from {{brand}}?", + "instantiation_dict": { + "brand": "Amazon basic" + }, + "intent": "What is the price range for products from Amazon basic?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "5.49", + "375.19" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$5.49 - $375.19" + }, + "intent_template_id": 370 + }, + { + "sites": [ + "shopping" + ], + "task_id": 227, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the price range for products from {{brand}}?", + "instantiation_dict": { + "brand": "EYZUTAK" + }, + "intent": "What is the price range for products from EYZUTAK?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "9.99" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + 
"reference_answer_raw_annotation": "$9.99" + }, + "intent_template_id": 370 + }, + { + "sites": [ + "shopping" + ], + "task_id": 228, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the price range for products from {{brand}}?", + "instantiation_dict": { + "brand": "sephora" + }, + "intent": "What is the price range for products from sephora?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "18.18", + "94.99" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$18.18 - $94.99" + }, + "intent_template_id": 370 + }, + { + "sites": [ + "shopping" + ], + "task_id": 229, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the price range for products from {{brand}}?", + "instantiation_dict": { + "brand": "ugreen" + }, + "intent": "What is the price range for products from ugreen?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "6.99", + "38.99" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$6.99 - $38.99" + }, + "intent_template_id": 370 + }, + { + "sites": [ + "shopping" + ], + "task_id": 230, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the price range for products from {{brand}}?", + "instantiation_dict": { + "brand": "Perricone MD" + }, + "intent": "What is the price range for products from Perricone MD?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "35", + "149" + ] + }, + 
"reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$35 - $149" + }, + "intent_template_id": 370 + }, + { + "sites": [ + "shopping" + ], + "task_id": 231, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the order number of my most recent {{status}} order ", + "instantiation_dict": { + "status": "cancelled" + }, + "intent": "Get the order number of my most recent cancelled order ", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "170" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "000000170" + }, + "intent_template_id": 213 + }, + { + "sites": [ + "shopping" + ], + "task_id": 232, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the order number of my most recent {{status}} order ", + "instantiation_dict": { + "status": "pending" + }, + "intent": "Get the order number of my most recent pending order ", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "189" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "000000189" + }, + "intent_template_id": 213 + }, + { + "sites": [ + "shopping" + ], + "task_id": 233, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the order number of my most recent {{status}} order ", + "instantiation_dict": { + "status": "complete" + }, + "intent": "Get the order number of my most recent complete order ", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": 
{ + "must_include": [ + "180" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "000000180" + }, + "intent_template_id": 213 + }, + { + "sites": [ + "shopping" + ], + "task_id": 234, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the order number of my most recent {{status}} order ", + "instantiation_dict": { + "status": "on hold" + }, + "intent": "Get the order number of my most recent on hold order ", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "there is no on hold order", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 213 + }, + { + "sites": [ + "shopping" + ], + "task_id": 235, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the order number of my most recent {{status}} order ", + "instantiation_dict": { + "status": "under delivery" + }, + "intent": "Get the order number of my most recent under delivery order ", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no under delivery order", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 213 + }, + { + "sites": [ + "map" + ], + "task_id": 236, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Where is the nearest {{location}} from {{location2}} {{condition}}", + "instantiation_dict": { + "location": "pharmacy", + "location2": "Carnegie Mellon", + "condition": "I can walk within 20mins" + }, + "intent": "Where is the nearest 
pharmacy from Carnegie Mellon I can walk within 20mins", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Schiller's Pharmacy", + "811", + "South Aiken Avenue", + "Shadyside", + "Pittsburgh" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Schiller's Pharmacy, 811, South Aiken Avenue, Shadyside, Pittsburgh, Allegheny County, 15232, United States" + }, + "intent_template_id": 39 + }, + { + "sites": [ + "map" + ], + "task_id": 237, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Where is the nearest {{location}} from {{location2}} {{condition}}", + "instantiation_dict": { + "location": "gas station", + "location2": "CMU", + "condition": "" + }, + "intent": "Where is the nearest gas station from CMU ", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Sunoco", + "North Craig Street", + "North Oakland", + "Pittsburgh" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Sunoco, North Craig Street, North Oakland, Pittsburgh, Allegheny County, 15213, United States" + }, + "intent_template_id": 39 + }, + { + "sites": [ + "shopping" + ], + "task_id": 238, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", + "instantiation_dict": { + "product_category": "PS4 accessories" + }, + "intent": "I am doing a market survey for one stop market, show me the most expensive product from PS4 accessories category", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + 
"reference_url": "__SHOPPING__/astro-gaming-a50-wireless-headset-base-station-gen-4-compatible-with-ps5-ps4-pc-mac-black-silver.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 138 + }, + { + "sites": [ + "shopping" + ], + "task_id": 239, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", + "instantiation_dict": { + "product_category": "nutrition bars and drinks" + }, + "intent": "I am doing a market survey for one stop market, show me the most expensive product from nutrition bars and drinks category", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/kellogg-s-special-k-protein-meal-bars-chocolate-caramel-12-7oz-6-count.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 138 + }, + { + "sites": [ + "shopping" + ], + "task_id": 240, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", + "instantiation_dict": { + "product_category": "competitive swimwear" + }, + "intent": "I am doing a market survey for one stop market, show me the most expensive product from competitive swimwear category", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/women-cross-flower-beachwear-tankini-bandeau-bandage-bikini-set-push-up-swimwear-bathing-suit-two-pieces-swimsuits.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 138 + }, + { + "sites": [ + "shopping" + ], + "task_id": 
241, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", + "instantiation_dict": { + "product_category": "skin care tool" + }, + "intent": "I am doing a market survey for one stop market, show me the most expensive product from skin care tool category", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/professional-medi-spa-scar-stretch-mark-reduction-system.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 138 + }, + { + "sites": [ + "shopping" + ], + "task_id": 242, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", + "instantiation_dict": { + "product_category": "Household Supplies" + }, + "intent": "I am doing a market survey for one stop market, show me the most expensive product from Household Supplies category", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/lynx-battery-12v-200ah-lithium-iron-phosphate-lifepo4-prismatic-deep-cell-battery-set-of-4-3-2v-cells-with-3-bus-bars-and-8-lug-nuts-for-rv-solar-marine-off-grid-applications.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 138 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 243, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show me the {{information}} of the customer who is the most unhappy with 
{{product}}", + "instantiation_dict": { + "information": "email address", + "product": "Circe fleece" + }, + "intent": "Show me the email address of the customer who is the most unhappy with Circe fleece", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "hannah.lim@gmail.com" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "hannah.lim@gmail.com" + }, + "intent_template_id": 244 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 244, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "instantiation_dict": { + "information": "email address", + "product": "Olivia zip jacket" + }, + "intent": "Show me the email address of the customer who is the most unhappy with Olivia zip jacket", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "emma.lopez@gmail.com" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "emma.lopez@gmail.com" + }, + "intent_template_id": 244 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 245, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "instantiation_dict": { + "information": "name", + "product": "Antonia racer tank" + }, + "intent": "Show me the name of the customer who is the most unhappy with Antonia racer tank", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Shaunte" + }, + "reference_url": "", + 
"program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Shaunte" + }, + "intent_template_id": 244 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 246, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "instantiation_dict": { + "information": "name", + "product": "Chloe tank" + }, + "intent": "Show me the name of the customer who is the most unhappy with Chloe tank", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Teofila" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Teofila" + }, + "intent_template_id": 244 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 247, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "instantiation_dict": { + "information": "email address", + "product": "the style of Zoe products" + }, + "intent": "Show me the email address of the customer who is the most unhappy with the style of Zoe products", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "Valorie doesn't have a email in the system", + "program_html": [], + "string_note": "There is no negative review for Zoe products", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 244 + }, + { + "sites": [ + "map" + ], + "task_id": 248, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the coordinates of {{location}} in DD 
format", + "instantiation_dict": { + "location": "Carnegie Mellon Caf\u00e9" + }, + "intent": "Tell me the coordinates of Carnegie Mellon Caf\u00e9 in DD format", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "40.442", + "-79.939" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "40.4424191, -79.9397388" + }, + "intent_template_id": 46 + }, + { + "sites": [ + "map" + ], + "task_id": 249, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the coordinates of {{location}} in DD format", + "instantiation_dict": { + "location": "Western Pennsylvania Hospital Heliport" + }, + "intent": "Tell me the coordinates of Western Pennsylvania Hospital Heliport in DD format", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "40.460", + "-79.946" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "40.46076, -79.94666" + }, + "intent_template_id": 46 + }, + { + "sites": [ + "map" + ], + "task_id": 250, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the coordinates of {{location}} in DD format", + "instantiation_dict": { + "location": "Apple Store near Pitt" + }, + "intent": "Tell me the coordinates of Apple Store near Pitt in DD format", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "40.451", + "-79.933" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "40.4511693, -79.9334241" + }, + "intent_template_id": 46 + }, + { + "sites": [ + "map" + ], + "task_id": 251, + "require_login": true, + "storage_state": null, + 
"start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the coordinates of {{location}} in DD format", + "instantiation_dict": { + "location": "bus stop on the Carnegie art museum side of the street near CMU" + }, + "intent": "Tell me the coordinates of bus stop on the Carnegie art museum side of the street near CMU in DD format", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "40.444", + "-79.948" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "40.4443, -79.94889" + }, + "intent_template_id": 46 + }, + { + "sites": [ + "map" + ], + "task_id": 252, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the coordinates of {{location}} in DD format", + "instantiation_dict": { + "location": "Tokyo Japanese Food Store in Pittsburgh" + }, + "intent": "Tell me the coordinates of Tokyo Japanese Food Store in Pittsburgh in DD format", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "40.457", + "-79.929" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "40.45761, -79.92934" + }, + "intent_template_id": 46 + }, + { + "sites": [ + "map" + ], + "task_id": 253, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the {{information}} of {{location}}", + "instantiation_dict": { + "location": "Carnegie Mellon Caf\u00e9", + "information": "phone number" + }, + "intent": "What is the phone number of Carnegie Mellon Caf\u00e9", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no such 
information in the map", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 501 + }, + { + "sites": [ + "map" + ], + "task_id": 254, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the {{information}} of {{location}}", + "instantiation_dict": { + "location": "Western Pennsylvania Hospital", + "information": "phone number" + }, + "intent": "What is the phone number of Western Pennsylvania Hospital", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": ["4125785000"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "4125785000" + }, + "intent_template_id": 501 + }, + { + "sites": [ + "map" + ], + "task_id": 255, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Who is the {{information}} of {{location}}", + "instantiation_dict": { + "location": "PIT airport", + "information": "operator" + }, + "intent": "Who is the operator of PIT airport", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Allegheny County Airport Authority" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Allegheny County Airport Authority" + }, + "intent_template_id": 501 + }, + { + "sites": [ + "map" + ], + "task_id": 256, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the {{information}} of {{location}}", + "instantiation_dict": { + "location": "Carnegie art museum in pittsburgh", + "information": "website" + }, + "intent": "What is the website of Carnegie art museum in pittsburgh", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + 
"exact_match": "http://web.cmoa.org/" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "http://web.cmoa.org/" + }, + "intent_template_id": 501 + }, + { + "sites": [ + "map" + ], + "task_id": 257, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the {{information}} of {{location}}", + "instantiation_dict": { + "location": "Tokyo Japanese Food Store in Pittsburgh", + "information": "hours of operation" + }, + "intent": "What is the hours of operation of Tokyo Japanese Food Store in Pittsburgh", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "We-Su 10:00-17:00" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "We-Su 10:00-17:00" + }, + "intent_template_id": 501 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 258, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "See all public projects", + "instantiation_dict": {}, + "intent": "See all public projects", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/explore", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 325 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 259, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Get me my RSS feed token", + "instantiation_dict": {}, + "intent": "Get me my RSS feed token", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "TMN_bBn9Z48qVbUFZV45" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + 
"reference_answer_raw_annotation": "TMN_bBn9Z48qVbUFZV45" + }, + "intent_template_id": 312 + }, + { + "sites": [ + "shopping" + ], + "task_id": 260, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I want to browse the products in the {{category}} category", + "instantiation_dict": { + "category": "Video Game" + }, + "intent": "I want to browse the products in the Video Game category", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/video-games.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 211 + }, + { + "sites": [ + "shopping" + ], + "task_id": 261, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I want to browse the products in the {{category}} category", + "instantiation_dict": { + "category": "Headphones" + }, + "intent": "I want to browse the products in the Headphones category", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/electronics/headphones.html", + "program_html": [], + "url_note": "GOLD in PRED", + "or": [{"reference_url": "__SHOPPING__/electronics.html?cat=60"}] + }, + "intent_template_id": 211 + }, + { + "sites": [ + "shopping" + ], + "task_id": 262, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I want to browse the products in the {{category}} category", + "instantiation_dict": { + "category": "Men shoes" + }, + "intent": "I want to browse the products in the Men shoes category", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": 
"__SHOPPING__/clothing-shoes-jewelry/men/shoes.html", + "program_html": [], + "url_note": "GOLD in PRED", + "or": [{"reference_url": "__SHOPPING__/clothing-shoes-jewelry.html?cat=145"}] + }, + "intent_template_id": 211 + }, + { + "sites": [ + "shopping" + ], + "task_id": 263, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I want to browse the products in the {{category}} category", + "instantiation_dict": { + "category": "Woman clothing" + }, + "intent": "I want to browse the products in the Woman clothing category", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/clothing-shoes-jewelry/women/clothing.html", + "program_html": [], + "url_note": "GOLD in PRED", + "or": [{"reference_url": "__SHOPPING__/clothing-shoes-jewelry.html?cat=143"}] + }, + "intent_template_id": 211 + }, + { + "sites": [ + "shopping" + ], + "task_id": 264, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I want to browse the products in the {{category}} category", + "instantiation_dict": { + "category": "Cabinets, Racks & Shelves" + }, + "intent": "I want to browse the products in the Cabinets, Racks & Shelves category", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/office-products/office-furniture-lighting/cabinets-racks-shelves.html", + "program_html": [], + "url_note": "GOLD in PRED", + "or": [{"reference_url": "__SHOPPING__/office-products.html?cat=187"}] + }, + "intent_template_id": 211 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 265, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What's the closest national park to 
{{city}}? How far is it to drive there?", + "instantiation_dict": { + "city": "Boston" + }, + "intent": "What's the closest national park to Boston? How far is it to drive there?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Acadia National Park", + "457km" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Acadia National Park\n457km" + }, + "intent_template_id": 85 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 266, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What's the closest national park to {{city}}?", + "instantiation_dict": { + "city": "the largest city in Maine" + }, + "intent": "What's the closest national park to the largest city in Maine?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Acadia National Park" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Acadia National Park" + }, + "intent_template_id": 85 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 267, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What's the closest national park to {{city}}? How long it takes to drive there?", + "instantiation_dict": { + "city": "the hometown of Stephen King" + }, + "intent": "What's the closest national park to the hometown of Stephen King? 
How long it takes to drive there?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Acadia National Park" + ], + "fuzzy_match": [ + "1h 23min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Acadia National Park\n1h 23min" + }, + "intent_template_id": 85 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 268, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What's the closest national park to {{city}}? How long does it take to bike there?", + "instantiation_dict": { + "city": "Vinalhaven, ME" + }, + "intent": "What's the closest national park to Vinalhaven, ME? How long does it take to bike there?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Acadia National Park" + ], + "fuzzy_match": [ + "10h 33min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Acadia National Park\n10h 33min" + }, + "intent_template_id": 85 + }, + { + "sites": [ + "shopping" + ], + "task_id": 269, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", + "instantiation_dict": { + "price": "25", + "product_category": "women shoes" + }, + "intent": "Show me products under $25 in \"women shoes\" category", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/clothing-shoes-jewelry/women/shoes.html?price=0-25", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 139 + }, + { + "sites": [ + "shopping" + ], + "task_id": 270, + "require_login": true, + 
"storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", + "instantiation_dict": { + "price": "30", + "product_category": "men shoes" + }, + "intent": "Show me products under $30 in \"men shoes\" category", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html?price=0-30", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 139 + }, + { + "sites": [ + "shopping" + ], + "task_id": 271, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", + "instantiation_dict": { + "price": "46.99", + "product_category": "makeup remover" + }, + "intent": "Show me products under $46.99 in \"makeup remover\" category", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/beauty-personal-care/makeup/makeup-remover.html?price=0-46.99", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 139 + }, + { + "sites": [ + "shopping" + ], + "task_id": 272, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", + "instantiation_dict": { + "price": "78", + "product_category": "children dental care" + }, + "intent": "Show me products under $78 in \"children dental care\" category", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": 
"__SHOPPING__/beauty-personal-care/oral-care/children-s-dental-care.html?price=0-78", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 139 + }, + { + "sites": [ + "shopping" + ], + "task_id": 273, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", + "instantiation_dict": { + "price": "199", + "product_category": "furtiture with accent" + }, + "intent": "Show me products under $199 in \"furtiture with accent\" category", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/home-kitchen/furniture/accent-furniture.html?price=0-199", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 139 + }, + { + "sites": [ + "shopping" + ], + "task_id": 274, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Search for \"{{keyword}}\"", + "instantiation_dict": { + "keyword": "usb wifi" + }, + "intent": "Search for \"usb wifi\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/?q=usb+wifi", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 212 + }, + { + "sites": [ + "shopping" + ], + "task_id": 275, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Search for \"{{keyword}}\"", + "instantiation_dict": { + "keyword": "xbox" + }, + "intent": "Search for \"xbox\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/?q=xbox", + 
"program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 212 + }, + { + "sites": [ + "shopping" + ], + "task_id": 276, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Search for \"{{keyword}}\"", + "instantiation_dict": { + "keyword": "switch accessories" + }, + "intent": "Search for \"switch accessories\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/?q=switch+accessories", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 212 + }, + { + "sites": [ + "shopping" + ], + "task_id": 277, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Search for \"{{keyword}}\"", + "instantiation_dict": { + "keyword": "batteries for iphone 13" + }, + "intent": "Search for \"batteries for iphone 13\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/?q=iphone+13", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 212 + }, + { + "sites": [ + "shopping" + ], + "task_id": 278, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Search for \"{{keyword}}\"", + "instantiation_dict": { + "keyword": "green tea bag for weight loss" + }, + "intent": "Search for \"green tea bag for weight loss\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/?q=green+tea+bag+for+weight+loss", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 212 + }, + { + "sites": [ + 
"shopping" + ], + "task_id": 279, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models", + "instantiation_dict": {}, + "intent": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "SONY WH1000XM3 Bluetooth Wireless Noise Canceling Headphones Silver WH-1000XM3/S (Renewed)", + "Sony WH-CH710N/H Wireless Bluetooth Noise Cancelling Headphones", + "Sony WH-1000XM3B Wireless Bluetooth Noise-Canceling Over-Ear Headphones (Black) Basic Headphone Bundle Kit with Stylus", + "Sony Wireless Headphones WH-CH510: Wireless Bluetooth On-Ear Headset with Mic for Phone-Call, Black", + "Sony WHCH710N Wireless Bluetooth Noise Canceling Over-The-Ear Headphones (Black) with Kratos 18W PD Two-Port Power Adapter and Kratos 6-Feet Nylon Braided USB-C Cable Bundle (3 Items)", + "Sony WI-SP500 Wireless in-Ear Sports Headphones, White (WISP500/W)", + "Sony WI-SP510 Extra BASS Wireless in-Ear Headset/Headphones with mic for Phone Call Sports IPX5 Bluetooth, Black (WISP510/B)", + "Sony MDRAS600BT Active Sports Bluetooth Headset (Black)", + "Sony WH-1000XM4 Wireless Noise Canceling Over-Ear Headphones (Black) with Sony WLA-NS7 Wireless TV Adapter Bundle (2 Items)", + "Sony WI-C300 Wireless In-Ear Headphones, Red (WIC300/R)", + "Sony XB950N1 Extra Bass Wireless Noise Canceling Headphones, Black", + "SONY - H900N Hi-Res Noise Cancelling Wireless Headphone Grayish Black Renewed", + "18.99", + "406" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "These models are avaiable: SONY WH1000XM3 Bluetooth Wireless Noise 
Canceling Headphones Silver WH-1000XM3/S (Renewed) Sony WH-CH710N/H Wireless Bluetooth Noise Cancelling Headphones Sony WH-1000XM3B Wireless Bluetooth Noise-Canceling Over-Ear Headphones (Black) Basic Headphone Bundle Kit with Stylus Sony Wireless Headphones WH-CH510: Wireless Bluetooth On-Ear Headset with Mic for Phone-Call, Black Sony WHCH710N Wireless Bluetooth Noise Canceling Over-The-Ear Headphones (Black) with Kratos 18W PD Two-Port Power Adapter and Kratos 6-Feet Nylon Braided USB-C Cable Bundle (3 Items) Sony WI-SP500 Wireless in-Ear Sports Headphones, White (WISP500/W) Sony WI-SP510 Extra BASS Wireless in-Ear Headset/Headphones with mic for Phone Call Sports IPX5 Bluetooth, Black (WISP510/B) Sony MDRAS600BT Active Sports Bluetooth Headset (Black) Sony WH-1000XM4 Wireless Noise Canceling Over-Ear Headphones (Black) with Sony WLA-NS7 Wireless TV Adapter Bundle (2 Items) Sony WI-C300 Wireless In-Ear Headphones, Red (WIC300/R) Sony XB950N1 Extra Bass Wireless Noise Canceling Headphones, Black SONY - H900N Hi-Res Noise Cancelling Wireless Headphone Grayish Black Renewed The price ranges from $18.99 to $406 " + }, + "intent_template_id": 204 + }, + { + "sites": [ + "shopping" + ], + "task_id": 280, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Provide me with the full names of chargers from Anker, and also share the price range for the available models", + "instantiation_dict": {}, + "intent": "Provide me with the full names of chargers from Anker, and also share the price range for the available models", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Anker USB C Charger 30W, 711 Charger, Compact Fast Charger (Not Foldable) for MacBook Air/iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy S21, Note 20, iPad Pro, Pixel, and More", + "Anker USB C Charger 40W, 521 Charger (Nano Pro), PIQ 
3.0 Durable Compact Fast Charger (Not Foldable) for iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy, Pixel 4/3, iPad/iPad Mini (Cable Not Included)", + "Anker PowerCore Speed 20000, 20000mAh Qualcomm Quick Charge 3.0 & PowerIQ Portable Charger, with Quick Charge Recharging, Power Bank for Samsung, iPhone, iPad and More, Black (A1278)", + "5Ft Micro-USB Charger Cord Cable Fit for Anker-PowerCore 5000 10000 20100 13000 26800 Mini 3350 Fusion II 15000 Redux 20000 Slim 10000 Astro E1 AC Replacement Power Adapter Supply", + "Anker 10W Max Wireless Charger, 313 Wireless Charger (Pad), Qi-Certified Wireless Charging 7.5W for iPhone 12/12 Pro/12 mini/12 Pro Max, 10W for Galaxy S10 S9 S8, S9 Plus, Note 9 (No AC Adapter)", + "Anker Wireless Charger, 313 Wireless Charger (Stand), Qi-Certified for iPhone 12, 12 Pro Max, SE, 11, 11 Pro, 11 Pro Max, XR, XS Max, 10W Fast-Charging Galaxy S20, S10 (No AC Adapter)", + "USB Charger, Anker Elite Dual Port 24W Wall Charger, PowerPort 2 with PowerIQ and Foldable Plug, for iPhone 11/Xs/XS Max/XR/X/8/7/6/Plus, iPad Pro/Air 2/Mini 3/Mini 4, Samsung S4/S5, and More", + "iPhone 12 Charger [GaN Tech], Anker 30W Compact USB-C Wall Charger with Power Delivery, PowerPort Atom for iPhone 12 / Mini/Pro/Pro Max / 11 / X/XS/XR, iPad Pro, MacBook 12'', Pixel, Galaxy", + "USB C Charger, Anker 30W 2 Port Fast Charger with 18W USB C Power Adapter, Foldable PowerPort PD 2 Charger for iPad Pro, iPhone 11/11 Pro / 11 Pro Max/XS/Max/XR/X, Pixel, Galaxy, and More", + "Anker 40W 5-Port USB Wall Charger, PowerPort 5 for iPhone XS / XS Max / XR / X / 8 / 7 / 6 / Plus, iPad Pro / Air 2 / mini, Galaxy S9 / S8 / Edge / Plus, Note 8 / 7, LG, Nexus, HTC and More, Black (AK-A2124111)", + "Anker Quick Charge 3.0 39W Dual USB Wall Charger, PowerPort Speed 2 for Galaxy S10/S9/S8/Edge/Plus, Note 8/7 and PowerIQ for iPhone Xs/XS Max/XR/X/8/Plus, iPad Pro/Air 2/Mini, LG, Nexus, HTC and More", + "USB C Charger, Anker 20W PIQ 3.0 Fast Charger with Foldable Plug, PowerPort III 
Charger for iPhone 13/13 Mini/13 Pro/13 Pro Max/12/11, iPad/iPad Mini, MagSafe, and More (Cable Not Included)", + "8.99", + "59.99" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "These models are availiable: Anker USB C Charger 30W, 711 Charger, Compact Fast Charger (Not Foldable) for MacBook Air/iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy S21, Note 20, iPad Pro, Pixel, and More Anker USB C Charger 40W, 521 Charger (Nano Pro), PIQ 3.0 Durable Compact Fast Charger (Not Foldable) for iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy, Pixel 4/3, iPad/iPad Mini (Cable Not Included) Anker PowerCore Speed 20000, 20000mAh Qualcomm Quick Charge 3.0 & PowerIQ Portable Charger, with Quick Charge Recharging, Power Bank for Samsung, iPhone, iPad and More, Black (A1278) 5Ft Micro-USB Charger Cord Cable Fit for Anker-PowerCore 5000 10000 20100 13000 26800 Mini 3350 Fusion II 15000 Redux 20000 Slim 10000 Astro E1 AC Replacement Power Adapter Supply Anker 10W Max Wireless Charger, 313 Wireless Charger (Pad), Qi-Certified Wireless Charging 7.5W for iPhone 12/12 Pro/12 mini/12 Pro Max, 10W for Galaxy S10 S9 S8, S9 Plus, Note 9 (No AC Adapter) Anker Wireless Charger, 313 Wireless Charger (Stand), Qi-Certified for iPhone 12, 12 Pro Max, SE, 11, 11 Pro, 11 Pro Max, XR, XS Max, 10W Fast-Charging Galaxy S20, S10 (No AC Adapter) USB Charger, Anker Elite Dual Port 24W Wall Charger, PowerPort 2 with PowerIQ and Foldable Plug, for iPhone 11/Xs/XS Max/XR/X/8/7/6/Plus, iPad Pro/Air 2/Mini 3/Mini 4, Samsung S4/S5, and More iPhone 12 Charger [GaN Tech], Anker 30W Compact USB-C Wall Charger with Power Delivery, PowerPort Atom for iPhone 12 / Mini/Pro/Pro Max / 11 / X/XS/XR, iPad Pro, MacBook 12'', Pixel, Galaxy USB C Charger, Anker 30W 2 Port Fast Charger with 18W USB C Power Adapter, Foldable PowerPort PD 2 Charger for iPad Pro, iPhone 11/11 Pro / 11 Pro Max/XS/Max/XR/X, Pixel, Galaxy, and More Anker 40W 5-Port USB Wall Charger, 
PowerPort 5 for iPhone XS / XS Max / XR / X / 8 / 7 / 6 / Plus, iPad Pro / Air 2 / mini, Galaxy S9 / S8 / Edge / Plus, Note 8 / 7, LG, Nexus, HTC and More, Black (AK-A2124111) Anker Quick Charge 3.0 39W Dual USB Wall Charger, PowerPort Speed 2 for Galaxy S10/S9/S8/Edge/Plus, Note 8/7 and PowerIQ for iPhone Xs/XS Max/XR/X/8/Plus, iPad Pro/Air 2/Mini, LG, Nexus, HTC and More USB C Charger, Anker 20W PIQ 3.0 Fast Charger with Foldable Plug, PowerPort III Charger for iPhone 13/13 Mini/13 Pro/13 Pro Max/12/11, iPad/iPad Mini, MagSafe, and More (Cable Not Included) Magnetic Wireless Charger, Anker Wireless Charger with 5ft Built-in USB-C Cable, PowerWave Magnetic Pad, 7.5W Charging for iPhone 13 / 13 Pro / 13 Pro Max / 13 mini / 12 / 12 Pro (No AC Adapter) USB C Super Fast Charger, Anker 25W PD Wall Charger Fast Charging for Samsung Galaxy S21/S21+/S21 Ultra/S20/Z Flip/Note20/20 Ultra/Note10/10+/S9/S8/S10e, iPad Pro 12.9, and More (Cable not Included) The price ranges from $8.99 to $59.99" + }, + "intent_template_id": 204 + }, + { + "sites": [ + "shopping" + ], + "task_id": 281, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush", + "instantiation_dict": {}, + "intent": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Oral-B Kids Extra Soft Replacement Brush Heads featuring STAR WARS, 2 count", + "Kids By Oral-b Stages Power Star Wars Replacement Heads 4 Pack", + "3.745", + "6.495" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "These 
models are availiable: Oral-B Kids Extra Soft Replacement Brush Heads featuring STAR WARS, 2 count Kids By Oral-b Stages Power Star Wars Replacement Heads 4 Pack The price ranges from $3.745 to $6.495 " + }, + "intent_template_id": 204 + }, + { + "sites": [ + "shopping" + ], + "task_id": 282, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "List the full product names of slide slippers from Nike and tell me the price range of the available products", + "instantiation_dict": {}, + "intent": "List the full product names of slide slippers from Nike and tell me the price range of the available products", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Nike Men's Air Max Camden Slide Sandal", + "Nike Men's Benassi JDI Fanny Pack Slides", + "Nike Victori One Mens Comfort Slide Cn9675-003 (Midnight Navy/Midnight Navy/White, Numeric_10)", + "Nike Offcourt Slide Mens Bq4639-002 Size 12", + "Nike Jordan Men's Break Slide Red AR6374-602", + "Nike Victori One Slide Mens Style : Dd9559-300", + "Nike Men's Benassi Solarsoft Slide Athletic Sandal (Black/White, numeric_14)", + "Nike Men's Benassi Solarsoft Slide Athletic Sandal (Midnight Navy/Blue, numeric_8)", + "Nike womens Benassi Just Do It", + "27.6", + "90.65" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "These models are availiable: Nike Men's Air Max Camden Slide Sandal Nike Men's Benassi JDI Fanny Pack Slides Nike Victori One Mens Comfort Slide Cn9675-003 (Midnight Navy/Midnight Navy/White, Numeric_10) Nike Offcourt Slide Mens Bq4639-002 Size 12 Nike Jordan Men's Break Slide Red AR6374-602 Nike Victori One Slide Mens Style : Dd9559-300 Nike Men's Benassi Solarsoft Slide Athletic Sandal (Black/White, numeric_14) Nike Men's Benassi Solarsoft Slide Athletic Sandal (Midnight 
Navy/Blue, numeric_8) Nike womens Benassi Just Do It The price ranges from $27.6 to $90.65" + }, + "intent_template_id": 204 + }, + { + "sites": [ + "shopping" + ], + "task_id": 283, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Look up the most recent models of XBox controllers released between 2020-2021?", + "instantiation_dict": {}, + "intent": "Look up the most recent models of XBox controllers released between 2020-2021?", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/microsoft-xbox-controller-carbon-black-for-series-x-series-s-xbox-one-windows-10-android-ios-bundled-with-dual-port-charging-dock-xbox-controller-skin-voucher-premgear-cloth.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 210 + }, + { + "sites": [ + "shopping" + ], + "task_id": 284, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", + "instantiation_dict": { + "product": "shoe storage", + "min_storage": "12 pairs" + }, + "intent": "Show the least expensive shoe storage with a minimum storage capacity of 12 pairs.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/onlyeasy-over-the-door-shoe-storage-organizer-hanging-shoe-rack-holder-with-24-large-fabric-pockets-22-1-x-61-4-herringbone-grey-mxrodsb1p.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 207 + }, + { + "sites": [ + "shopping" + ], + "task_id": 285, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": 
"Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", + "instantiation_dict": { + "product": "switch card holder", + "min_storage": "15 cards" + }, + "intent": "Show the least expensive switch card holder with a minimum storage capacity of 15 cards.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 207 + }, + { + "sites": [ + "shopping" + ], + "task_id": 286, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", + "instantiation_dict": { + "product": "ssd hard drive", + "min_storage": "1TB" + }, + "intent": "Show the least expensive ssd hard drive with a minimum storage capacity of 1TB.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/external-hard-drive-2tb-ultra-thin-external-hard-drive-2000gb-ultra-high-speed-portable-3-1-type-c-storage-drive-compatible-with-pc-laptop-and-mac-2tb-a1.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 207 + }, + { + "sites": [ + "map" + ], + "task_id": 287, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "How much time does it take from Pittsburgh to Philadelphia by car?", + "instantiation_dict": {}, + "intent": "How much time does it take from Pittsburgh to Philadelphia by car?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "5h 47min" + ] + }, + "reference_url": 
"", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "5h 47min" + }, + "intent_template_id": 47 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 288, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", + "instantiation_dict": { + "attribute": "name" + }, + "intent": "Tell me the name of the customer who has the most cancellations in the history", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Samantha Jones" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Samantha Jones" + }, + "intent_template_id": 234 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 289, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", + "instantiation_dict": { + "attribute": "email address, name, phone number" + }, + "intent": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "email: coolcat321@hotmail.com", + "name: Samantha Jones", + "phone number: 3055551212" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "email: coolcat321@hotmail.com name: Samantha Jones phone number: 3055551212" + }, + "intent_template_id": 234 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 290, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + 
"start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", + "instantiation_dict": { + "attribute": "product SKUs in the most recent cancelled orders" + }, + "intent": "Tell me the product SKUs in the most recent cancelled orders of the customer who has the most cancellations in the history", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "WSH09-29-White", + "WSH09-28-Green", + "MSH11-34-Blue", + "WP09-29-Purple" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "WSH09-29-White,WSH09-28-Green,MSH11-34-Blue,WP09-29-Purple" + }, + "intent_template_id": 234 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 291, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", + "instantiation_dict": { + "attribute": "total spend on products in the most recent cancelled orders" + }, + "intent": "Tell me the total spend on products in the most recent cancelled orders of the customer who has the most cancellations in the history", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "148" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$148" + }, + "intent_template_id": 234 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 292, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", + 
"instantiation_dict": { + "attribute": "total number of cancellations" + }, + "intent": "Tell me the total number of cancellations of the customer who has the most cancellations in the history", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "9" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "9" + }, + "intent_template_id": 234 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 293, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Show me the command to clone {{repo}} with SSH.", + "instantiation_dict": { + "repo": "Super_Awesome_Robot" + }, + "intent": "Show me the command to clone Super_Awesome_Robot with SSH.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/super_awesome_robot.git"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/super_awesome_robot.git" + }, + "intent_template_id": 329 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 294, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Show me the command to clone {{repo}} with SSH.", + "instantiation_dict": { + "repo": "ChatGPT" + }, + "intent": "Show me the command to clone ChatGPT with SSH.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/chatgpt.git"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "git clone 
ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/chatgpt.git" + }, + "intent_template_id": 329 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 295, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Show me the command to clone {{repo}} with SSH.", + "instantiation_dict": { + "repo": "metaseq" + }, + "intent": "Show me the command to clone metaseq with SSH.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["git clone ssh://git@metis.lti.cs.cmu.edu:2222/root/metaseq.git"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/root/metaseq.git" + }, + "intent_template_id": 329 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 296, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Show me the command to clone {{repo}} with SSH.", + "instantiation_dict": { + "repo": "the best GAN python implementation" + }, + "intent": "Show me the command to clone the best GAN python implementation with SSH.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["git clone ssh://git@metis.lti.cs.cmu.edu:2222/eriklindernoren/PyTorch-GAN.git"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "ssh://git@metis.lti.cs.cmu.edu:2222/eriklindernoren/PyTorch-GAN.git" + }, + "intent_template_id": 329 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 297, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Show me the command to clone {{repo}} with SSH.", + "instantiation_dict": { + "repo": "the most stared Covid 
location tracker" + }, + "intent": "Show me the command to clone the most stared Covid location tracker with SSH.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["git clone ssh://git@metis.lti.cs.cmu.edu:2222/yjlou/2019-nCov.git"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "ssh://git@metis.lti.cs.cmu.edu:2222/yjlou/2019-nCov.git" + }, + "intent_template_id": 329 + }, + { + "sites": [ + "shopping" + ], + "task_id": 298, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show the most recent {{status}} order page", + "instantiation_dict": { + "status": "completed" + }, + "intent": "Show the most recent completed order page", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/sales/order/view/order_id/180/", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 180 + }, + { + "sites": [ + "shopping" + ], + "task_id": 299, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show the most recent {{status}} order page", + "instantiation_dict": { + "status": "cancelled" + }, + "intent": "Show the most recent cancelled order page", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/sales/order/view/order_id/170/", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 180 + }, + { + "sites": [ + "shopping" + ], + "task_id": 300, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show the most recent {{status}} order page", 
+ "instantiation_dict": { + "status": "pending" + }, + "intent": "Show the most recent pending order page", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/sales/order/view/order_id/189/", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 180 + }, + { + "sites": [ + "shopping" + ], + "task_id": 301, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show the most recent {{status}} order page", + "instantiation_dict": { + "status": "processing" + }, + "intent": "Show the most recent processing order page", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": {"fuzzy_match": "N/A"}, + "reference_url": "", + "program_html": [], + "string_note": "there is no order in processing" + }, + "intent_template_id": 180 + }, + { + "sites": [ + "shopping" + ], + "task_id": 302, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show the most recent {{status}} order page", + "instantiation_dict": { + "status": "out of delivery" + }, + "intent": "Show the most recent out of delivery order page", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": {"fuzzy_match": "N/A"}, + "reference_url": "", + "program_html": [], + "string_note": "there is no order in processing" + }, + "intent_template_id": 180 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 303, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "How many commits did {{user}} make {{period}}?", + "instantiation_dict": { + "user": "Kilian", + "period": "durning 2023" + }, + "intent": "How many 
commits did Kilian make durning 2023?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "1" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1" + }, + "intent_template_id": 321 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 304, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "How many commits did {{user}} make {{period}}?", + "instantiation_dict": { + "user": "Eric", + "period": "between Feb 2023 and May 2023" + }, + "intent": "How many commits did Eric make between Feb 2023 and May 2023?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "14" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "14" + }, + "intent_template_id": 321 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 305, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "How many commits did {{user}} make {{period}}?", + "instantiation_dict": { + "user": "Philip", + "period": "in 2023/1" + }, + "intent": "How many commits did Philip make in 2023/1?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 321 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 306, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "How many commits did 
{{user}} make {{period}}?", + "instantiation_dict": { + "user": "Anthony", + "period": "between 08/2022-09/2022" + }, + "intent": "How many commits did Anthony make between 08/2022-09/2022?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 321 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 307, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "How many commits did {{user}} make {{period}}?", + "instantiation_dict": { + "user": "Nic", + "period": "in April 2021" + }, + "intent": "How many commits did Nic make in April 2021?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "16" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "16" + }, + "intent_template_id": 321 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 308, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me who has made the most contributions, in terms of number of commits, to the {{repo}} project", + "instantiation_dict": { + "repo": "primer/design" + }, + "intent": "Tell me who has made the most contributions, in terms of number of commits, to the primer/design project", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["Shawn Allen"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Shawn Allen" + }, + "intent_template_id": 323 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 309, + 
"require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me who has made the most contributions, in terms of number of commits, to the {{repo}} project", + "instantiation_dict": { + "repo": "thoughtbot/administrate" + }, + "intent": "Tell me who has made the most contributions, in terms of number of commits, to the thoughtbot/administrate project", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["Grayson Wright"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Grayson Wright" + }, + "intent_template_id": 323 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 310, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me who has made the most contributions, in terms of number of commits, to the {{repo}} project", + "instantiation_dict": { + "repo": "AndroidSlidingUpPanel" + }, + "intent": "Tell me who has made the most contributions, in terms of number of commits, to the AndroidSlidingUpPanel project", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["tokudu"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "tokudu" + }, + "intent_template_id": 323 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 311, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me who has made the most contributions, in terms of number of commits, to the {{repo}} project", + "instantiation_dict": { + "repo": "Pytorch GAN" + }, + "intent": "Tell me who has made the most contributions, in terms of number of commits, to the Pytorch GAN project", 
+ "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["Erik Linder-Nor\u00e9n"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Erik Linder-Nor\u00e9n" + }, + "intent_template_id": 323 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 312, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me who has made the most contributions, in terms of number of commits, to the {{repo}} project", + "instantiation_dict": { + "repo": "csvkit" + }, + "intent": "Tell me who has made the most contributions, in terms of number of commits, to the csvkit project", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["Christopher Groskopf"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Christopher Groskopf" + }, + "intent_template_id": 323 + }, + { + "sites": [ + "shopping" + ], + "task_id": 313, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Which number to call for the customer service?", + "instantiation_dict": {}, + "intent": "Which number to call for the customer service?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no phone number in the website", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 134 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 314, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "List the {{attribute}} of the top 3 
contributors to {{repo}} repo, ranked by the number of commits?", + "instantiation_dict": { + "repo": "prime/design", + "attribute": "name" + }, + "intent": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Shawn Allen", + "Inayaili Le\u00f3n", + "Aurora Pleguezuelo" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Shawn Allen, Inayaili Le\u00f3n, Aurora Pleguezuelo" + }, + "intent_template_id": 324 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 315, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", + "instantiation_dict": { + "repo": "Pytorch GAN", + "attribute": "email address" + }, + "intent": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "eriklindernoren@live.se", + "eriklindernoren@gmail.com", + "pinnacle.chen@qq.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "eriklindernoren@live.se, eriklindernoren@gmail.com, pinnacle.chen@qq.com" + }, + "intent_template_id": 324 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 316, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", + "instantiation_dict": { + "repo": "facebook's guide on building react apps", + "attribute": "name" + }, + 
"intent": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Ian Sutherland", + "Joe Hadda", + "Dan Abramov" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Ian Sutherland, Joe Hadda, Dan Abramov" + }, + "intent_template_id": 324 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 317, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", + "instantiation_dict": { + "repo": "metaseq", + "attribute": "name and number of commits" + }, + "intent": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Susan Zhang: 70", + "Stephen Roller: 51", + "Peter Albert: 12" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Susan Zhang: 70, Stephen Roller: 51, Peter Albert: 12" + }, + "intent_template_id": 324 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 318, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", + "instantiation_dict": { + "repo": "2019-nCov", + "attribute": "last names" + }, + "intent": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + 
"reference_answers": { + "must_include": [ + "Lo", + "Chen", + "Chu" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Lo, Chen, Chu" + }, + "intent_template_id": 324 + }, + { + "sites": [ + "shopping" + ], + "task_id": 319, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much refund I should expect from my order canlled in {{time}}, including shipping fee", + "instantiation_dict": { + "time": "April 2022" + }, + "intent": "How much refund I should expect from my order canlled in April 2022, including shipping fee", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 160 + }, + { + "sites": [ + "shopping" + ], + "task_id": 320, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much refund I should expect from my order canlled in {{time}}, including shipping fee", + "instantiation_dict": { + "time": "Feb 2023" + }, + "intent": "How much refund I should expect from my order canlled in Feb 2023, including shipping fee", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "406.53" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "406.53" + }, + "intent_template_id": 160 + }, + { + "sites": [ + "shopping" + ], + "task_id": 321, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much refund I should expect from my order canlled in {{time}}, 
including shipping fee", + "instantiation_dict": { + "time": "2022" + }, + "intent": "How much refund I should expect from my order canlled in 2022, including shipping fee", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "3053.97" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "3053.97" + }, + "intent_template_id": 160 + }, + { + "sites": [ + "shopping" + ], + "task_id": 322, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much refund I should expect from my order canlled in {{time}} if I cannot get the shipping fee refunded?", + "instantiation_dict": { + "time": "May 2023" + }, + "intent": "How much refund I should expect from my order canlled in May 2023 if I cannot get the shipping fee refunded?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "350.42" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "350.42" + }, + "intent_template_id": 160 + }, + { + "sites": [ + "shopping" + ], + "task_id": 323, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much refund I should expect from my order canlled in {{time}}? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back", + "instantiation_dict": { + "time": "2022/03" + }, + "intent": "How much refund I should expect from my order canlled in 2022/03? 
I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "264.49" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "264.49" + }, + "intent_template_id": 160 + }, + { + "sites": [ + "shopping" + ], + "task_id": 324, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "instantiation_dict": { + "product": "chairs", + "sorting_order": "ascending price" + }, + "intent": "Show me the \"chairs\" listings by ascending price.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=chairs&product_list_dir=asc", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 208 + }, + { + "sites": [ + "shopping" + ], + "task_id": 325, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "instantiation_dict": { + "product": "mouth night guard", + "sorting_order": "descending price" + }, + "intent": "Show me the \"mouth night guard\" listings by descending price.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/index/?q=mouth%20night%20guard%20&product_list_order=price", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 208 + }, + { + "sites": [ + "shopping" + ], + "task_id": 326, + "require_login": true, + "storage_state": 
"./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "instantiation_dict": { + "product": "Canon photo printer", + "sorting_order": "search relevance, from most to least" + }, + "intent": "Show me the \"Canon photo printer\" listings by search relevance, from most to least.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/?q=Canon+photo+printer", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 208 + }, + { + "sites": [ + "shopping" + ], + "task_id": 327, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "instantiation_dict": { + "product": "iphone 12 phone case", + "sorting_order": "name alphabetically" + }, + "intent": "Show me the \"iphone 12 phone case\" listings by name alphabetically.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/index/?q=%20iphone%2012%20phone%20case&product_list_order=name", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 208 + }, + { + "sites": [ + "shopping" + ], + "task_id": 328, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "instantiation_dict": { + "product": "iphone 12 phone case", + "sorting_order": "price" + }, + "intent": "Show me the \"iphone 12 phone case\" listings by price.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": 
"__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=%20iphone%2012%20phone%20case", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 208 + }, + { + "sites": [ + "shopping" + ], + "task_id": 329, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much I spend {{time}} on shopping at One Stop Market?", + "instantiation_dict": { + "time": "on 4/19/2023" + }, + "intent": "How much I spend on 4/19/2023 on shopping at One Stop Market?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 147 + }, + { + "sites": [ + "shopping" + ], + "task_id": 330, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much I spend {{time}} on shopping at One Stop Market?", + "instantiation_dict": { + "time": "in March 2023" + }, + "intent": "How much I spend in March 2023 on shopping at One Stop Market?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "83.31" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "81.31" + }, + "intent_template_id": 147 + }, + { + "sites": [ + "shopping" + ], + "task_id": 331, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much I spend {{time}} on shopping at One Stop Market?", + "instantiation_dict": { + "time": "in July 2022" + }, + "intent": "How much I spend in July 2022 on shopping at One Stop Market?", + "require_reset": false, + "eval": { + 
"eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "40.16" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "40.16" + }, + "intent_template_id": 147 + }, + { + "sites": [ + "shopping" + ], + "task_id": 332, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much I spend {{time}} on shopping at One Stop Market?", + "instantiation_dict": { + "time": "each month from Jan to the end of March 2023" + }, + "intent": "How much I spend each month from Jan to the end of March 2023 on shopping at One Stop Market?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Jan: 572.8", + "Feb: 762.18", + "Mar: 83.31" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Jan: 572.8\nFeb: 762.18\nMar: 83.31" + }, + "intent_template_id": 147 + }, + { + "sites": [ + "shopping" + ], + "task_id": 333, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much did I spend on shopping at One Stop Market {{time}}? They gave me a 20% discount on the total amount for orders exceeding $200 in cash", + "instantiation_dict": { + "time": "on November 2022" + }, + "intent": "How much did I spend on shopping at One Stop Market on November 2022? 
They gave me a 20% discount on the total amount for orders exceeding $200 in cash", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "359.546" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "359.546" + }, + "intent_template_id": 147 + }, + { + "sites": [ + "shopping" + ], + "task_id": 334, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me when I last ordered my {{description}}?", + "instantiation_dict": { + "description": "muffin cornbread mix" + }, + "intent": "Tell me when I last ordered my muffin cornbread mix?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "March 11th 2023" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "March 11th 2023" + }, + "intent_template_id": 169 + }, + { + "sites": [ + "shopping" + ], + "task_id": 335, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me when I last ordered my {{description}}?", + "instantiation_dict": { + "description": "body butter" + }, + "intent": "Tell me when I last ordered my body butter?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "January 16th 2023" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "January 16th 2023" + }, + "intent_template_id": 169 + }, + { + "sites": [ + "shopping" + ], + "task_id": 336, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me when I last 
ordered my {{description}}?", + "instantiation_dict": { + "description": "conditioner" + }, + "intent": "Tell me when I last ordered my conditioner?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "January 16th 2023" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "January 16th 2023" + }, + "intent_template_id": 169 + }, + { + "sites": [ + "shopping" + ], + "task_id": 337, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me when I last ordered my {{description}}?", + "instantiation_dict": { + "description": "bread olive" + }, + "intent": "Tell me when I last ordered my bread olive?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "December 12th 2022" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "December 12th 2022" + }, + "intent_template_id": 169 + }, + { + "sites": [ + "shopping" + ], + "task_id": 338, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me when I last ordered my {{description}}?", + "instantiation_dict": { + "description": "toothpaste" + }, + "intent": "Tell me when I last ordered my toothpaste?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "December 4th 2022" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "December 4th 2022" + }, + "intent_template_id": 169 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 339, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": 
"__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "List all opened issues {{description}}", + "instantiation_dict": { + "description": "that report bugs" + }, + "intent": "List all opened issues that report bugs", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/?label_name%5B%5D=bug", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 299 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 340, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/primer/design", + "geolocation": null, + "intent_template": "List all opened issues {{description}}", + "instantiation_dict": { + "description": "that report bugs" + }, + "intent": "List all opened issues that report bugs", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/primer/design/-/issues/?label_name%5B%5D=type%3A%20bug%20%F0%9F%90%9E", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 299 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 341, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/root/metaseq", + "geolocation": null, + "intent_template": "List all opened issues {{description}}", + "instantiation_dict": { + "description": "requesting new features" + }, + "intent": "List all opened issues requesting new features", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/root/metaseq/-/issues/?label_name%5B%5D=enhancement", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 299 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 342, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + 
"start_url": "__GITLAB__/root/metaseq", + "geolocation": null, + "intent_template": "List all opened issues {{description}}", + "instantiation_dict": { + "description": "that ask about OPT model related questions" + }, + "intent": "List all opened issues that ask about OPT model related questions", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/root/metaseq/-/issues/?search=OPT&label_name%5B%5D=question", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 299 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 343, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/root/metaseq", + "geolocation": null, + "intent_template": "List all opened issues {{description}}", + "instantiation_dict": { + "description": "that don't have any labels" + }, + "intent": "List all opened issues that don't have any labels", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/root/metaseq/-/issues/?label_name%5B%5D=None", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 299 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 344, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "How many reviews our shop received {{time}}?", + "instantiation_dict": { + "time": "by far" + }, + "intent": "How many reviews our shop received by far?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "351" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "351" + }, + "intent_template_id": 248 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 345, + "require_login": true, 
+ "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "How many reviews our shop received {{time}}?", + "instantiation_dict": { + "time": "in Apr 2023" + }, + "intent": "How many reviews our shop received in Apr 2023?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "351" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "351" + }, + "intent_template_id": 248 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 346, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "How many reviews our shop received {{time}}?", + "instantiation_dict": { + "time": "during 2022" + }, + "intent": "How many reviews our shop received during 2022?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 248 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 347, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "How many reviews our shop received {{time}}?", + "instantiation_dict": { + "time": "from the beginning of the shop" + }, + "intent": "How many reviews our shop received from the beginning of the shop?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "351" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "351" + }, + "intent_template_id": 248 + }, + { + "sites": [ + 
"shopping_admin" + ], + "task_id": 348, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "How many reviews our shop received {{time}}?", + "instantiation_dict": { + "time": "in May 2023" + }, + "intent": "How many reviews our shop received in May 2023?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 248 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 349, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Who else have access to my repo {{repo}}, show me their usernames", + "instantiation_dict": { + "repo": "gimmiethat.space" + }, + "intent": "Who else have access to my repo gimmiethat.space, show me their usernames", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "yjlou" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "yjlou" + }, + "intent_template_id": 298 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 350, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Who else have access to my repo {{repo}}, show me their usernames", + "instantiation_dict": { + "repo": "prism-theme" + }, + "intent": "Who else have access to my repo prism-theme, show me their usernames", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "abisubramanya27" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + 
"reference_answer_raw_annotation": "Abishek S, abisubramanya27" + }, + "intent_template_id": 298 + }, + { + "sites": [ + "shopping" + ], + "task_id": 351, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "List products from {{product_category}} category by {{order}} price", + "instantiation_dict": { + "product_category": "PS4 accessories", + "order": "ascending" + }, + "intent": "List products from PS4 accessories category by ascending price", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/video-games/playstation-4/accessories.html?product_list_order=price", + "program_html": [], + "url_note": "GOLD in PRED", + "or": [{"reference_url": "__SHOPPING__/video-games.html?cat=67&product_list_order=price"}] + }, + "intent_template_id": 137 + }, + { + "sites": [ + "shopping" + ], + "task_id": 352, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "List products from {{product_category}} category by {{order}} price", + "instantiation_dict": { + "product_category": "nutrition bars and drinks", + "order": "ascending" + }, + "intent": "List products from nutrition bars and drinks category by ascending price", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/health-household/diet-sports-nutrition/nutrition-bars-drinks.html?product_list_order=price", + "program_html": [], + "url_note": "GOLD in PRED", + "or": [{"reference_url": "__SHOPPING__/health-household.html?cat=192&product_list_order=price"}] + }, + "intent_template_id": 137 + }, + { + "sites": [ + "shopping" + ], + "task_id": 353, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + 
"geolocation": null, + "intent_template": "List products from {{product_category}} category by {{order}} price", + "instantiation_dict": { + "product_category": "competitive swimwear", + "order": "ascending" + }, + "intent": "List products from competitive swimwear category by ascending price", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/clothing-shoes-jewelry/sport-specific-clothing/competitive-swimwear.html?product_list_order=price", + "program_html": [], + "url_note": "GOLD in PRED", + "or": [{"reference_url": "__SHOPPING__/clothing-shoes-jewelry.html?cat=149&product_list_order=price"}] + }, + "intent_template_id": 137 + }, + { + "sites": [ + "shopping" + ], + "task_id": 354, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "List products from {{product_category}} category by {{order}} price", + "instantiation_dict": { + "product_category": "living room furtniture", + "order": "descending" + }, + "intent": "List products from living room furtniture category by descending price", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/home-kitchen/furniture/living-room-furniture.html?product_list_order=price&product_list_dir=desc", + "program_html": [], + "url_note": "GOLD in PRED", + "or": [{"reference_url": "__SHOPPING__/home-kitchen.html?cat=154&product_list_order=price&product_list_dir=desc"}] + }, + "intent_template_id": 137 + }, + { + "sites": [ + "shopping" + ], + "task_id": 355, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "List products from {{product_category}} category by {{order}} price", + "instantiation_dict": { + "product_category": "kids' bedding", + "order": "descending" + 
}, + "intent": "List products from kids' bedding category by descending price", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/home-kitchen/bedding/kids-bedding.html?product_list_dir=desc", + "program_html": [], + "url_note": "GOLD in PRED", + "or": [{"reference_url": "__SHOPPING__/home-kitchen.html?cat=155&product_list_dir=desc"}] + }, + "intent_template_id": 137 + }, + { + "sites": [ + "map" + ], + "task_id": 356, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show the route from SCS CMU in Pittsburgh to the location where the Declaration of Independence and Constitution were signed", + "instantiation_dict": {}, + "intent": "Show the route from SCS CMU in Pittsburgh to the location where the Declaration of Independence and Constitution were signed", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": null, + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Gates and Hillman Centers", + "Pittsburgh" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Independence Hall", + "Philadelphia" + ] + } + } + ] + }, + "intent_template_id": 49 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 357, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Checkout merge requests requiring my review", + "instantiation_dict": {}, + "intent": "Checkout merge requests requiring my review", + 
"require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/dashboard/merge_requests?reviewer_username=byteblaze", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 291 + }, + { + "sites": [ + "shopping" + ], + "task_id": 358, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me the {{info}} for order number {{order_number}}.", + "instantiation_dict": { + "info": "shipping method", + "order_number": 187 + }, + "intent": "Show me the shipping method for order number 187.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Flat Rate - Fixed" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Flat Rate - Fixed" + }, + "intent_template_id": 206 + }, + { + "sites": [ + "shopping" + ], + "task_id": 359, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me the {{info}} for order number {{order_number}}.", + "instantiation_dict": { + "info": "order date", + "order_number": "148" + }, + "intent": "Show me the order date for order number 148.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "1/29/2023" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1/29/2023" + }, + "intent_template_id": 206 + }, + { + "sites": [ + "shopping" + ], + "task_id": 360, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me the {{info}} for order number {{order_number}}.", + "instantiation_dict": { + "info": 
"product names", + "order_number": "148" + }, + "intent": "Show me the product names for order number 148.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress)", + "Russound 5B45W 4\" Indoor Outdoor Speakers White" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress), Russound 5B45W 4\" Indoor Outdoor Speakers White" + }, + "intent_template_id": 206 + }, + { + "sites": [ + "shopping" + ], + "task_id": 361, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me the {{info}} for order number {{order_number}}.", + "instantiation_dict": { + "info": "order statuses", + "order_number": "170 and 189" + }, + "intent": "Show me the order statuses for order number 170 and 189.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "170: cancelled", + "189: pending" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "170: cancelled, 189: pending" + }, + "intent_template_id": 206 + }, + { + "sites": [ + "shopping" + ], + "task_id": 362, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me the {{info}} for order number {{order_number}}.", + "instantiation_dict": { + "info": "billing address", + "order_number": "00178" + }, + "intent": "Show me the billing address for order number 00178.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + 
"reference_answers": { + "must_include": [ + "101 S San Mateo Dr", + "San Mateo", + "California", + "94010", + "United States" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Emma Lopez, 101 S San Mateo Dr, San Mateo, California, 94010, United States" + }, + "intent_template_id": 206 + }, + { + "sites": [ + "map" + ], + "task_id": 363, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", + "instantiation_dict": { + "location/address_1": "Carnegie Mellon University", + "location/address_2": "Carnegie Music Hall" + }, + "intent": "Measure distance between Carnegie Mellon University and Carnegie Music Hall by walking", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["778m |OR| 778 m"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "748m" + }, + "intent_template_id": 58 + }, + { + "sites": [ + "map" + ], + "task_id": 364, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", + "instantiation_dict": { + "location/address_1": "Carnegie Mellon University", + "location/address_2": "UPMC Shadyside" + }, + "intent": "Measure distance between Carnegie Mellon University and UPMC Shadyside by walking", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["1.7km |OR| 1.7 km"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1.7km" + }, + "intent_template_id": 58 + }, + { + "sites": [ + "map" + ], + "task_id": 365, + "require_login": true, + 
"storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", + "instantiation_dict": { + "location/address_1": "Carnegie Music Hall", + "location/address_2": "UPMC Shadyside" + }, + "intent": "Measure distance between Carnegie Music Hall and UPMC Shadyside by walking", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["2.2km |OR| 2.2 km"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "2.2km" + }, + "intent_template_id": 58 + }, + { + "sites": [ + "map" + ], + "task_id": 366, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", + "instantiation_dict": { + "location/address_1": "CVS (closet one)", + "location/address_2": "UPMC Shadyside" + }, + "intent": "Measure distance between CVS (closet one) and UPMC Shadyside by walking", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": ["1.2km |OR| 1.2 km"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1.2km" + }, + "intent_template_id": 58 + }, + { + "sites": [ + "map" + ], + "task_id": 367, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", + "instantiation_dict": { + "location/address_1": "Carnegie Mellon University", + "location/address_2": "CVS (closet one)" + }, + "intent": "Measure distance between Carnegie Mellon University and CVS (closet one) by walking", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" 
+ ], + "reference_answers": { + "must_include": ["1.4km |OR| 1.4 km"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1.4km" + }, + "intent_template_id": 58 + }, + { + "sites": [ + "shopping" + ], + "task_id": 368, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "find discounted items.", + "instantiation_dict": {}, + "intent": "find discounted items.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no function to show only discount items", + "reference_answer_raw_annotation": "There is no function to show only discount items." + }, + "intent_template_id": 188 + }, + { + "sites": [ + "map" + ], + "task_id": 369, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Pull up the description page of {{location}} on Map", + "instantiation_dict": { + "location": "Carnegie Music Hall" + }, + "intent": "Pull up the description page of Carnegie Music Hall on Map", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": null, + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Carnegie Music Hall" + ] + } + } + ] + }, + "intent_template_id": 52 + }, + { + "sites": [ + "map" + ], + "task_id": 370, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Pull up the description page of {{location}} on Map", + "instantiation_dict": { + "location": "Carnegie Mellon University" + }, + "intent": "Pull up the description page of Carnegie Mellon University on 
Map", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": null, + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Carnegie Mellon University" + ] + } + } + ] + }, + "intent_template_id": 52 + }, + { + "sites": [ + "map" + ], + "task_id": 371, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Pull up the description page of {{location}} on Map", + "instantiation_dict": { + "location": "Piada restaurant near Pitt" + }, + "intent": "Pull up the description page of Piada restaurant near Pitt on Map", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": null, + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Piada Italian Street Food", + "Forbes Avenue" + ] + } + } + ] + }, + "intent_template_id": 52 + }, + { + "sites": [ + "map" + ], + "task_id": 372, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Pull up the description page of {{location}} on Map", + "instantiation_dict": { + "location": "the Costco in Pittsburhg near a river" + }, + "intent": "Pull up the description page of the Costco in Pittsburhg near a river on Map", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": null, + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Costco", + "Waterfront Drive West" + ] + } + } + ] + }, + "intent_template_id": 52 + }, + { + "sites": [ + "map" + ], + "task_id": 373, + 
"require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Pull up the description page of {{location}} on Map", + "instantiation_dict": { + "location": "Whole Foods near Carnegie Mellon" + }, + "intent": "Pull up the description page of Whole Foods near Carnegie Mellon on Map", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": null, + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Whole Foods", + "East Liberty" + ] + } + } + ] + }, + "intent_template_id": 52 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 374, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Preview the {{name}} theme for my shop", + "instantiation_dict": { + "name": "Magento Blank" + }, + "intent": "Preview the Magento Blank theme for my shop", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/1", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 266 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 375, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Preview the {{name}} theme for my shop", + "instantiation_dict": { + "name": "Magento Luma" + }, + "intent": "Preview the Magento Luma theme for my shop", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/3/key/", + "program_html": [], + "url_note": "GOLD in PRED" + }, + 
"intent_template_id": 266 + }, + { + "sites": [ + "shopping" + ], + "task_id": 376, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Summarize customer reviews for {{product}}.", + "instantiation_dict": { + "product": "Amazon Echo Dot 3rd generation" + }, + "intent": "Summarize customer reviews for Amazon Echo Dot 3rd generation.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no review for this product", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 182 + }, + { + "sites": [ + "map" + ], + "task_id": 377, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the {{space}} around {{location}}", + "instantiation_dict": { + "location": "CMU ArtPark Lab", + "space": "resturants" + }, + "intent": "Find the resturants around CMU ArtPark Lab", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__MAP__/search?query=restaurants%20near%20CMU%20ArtPark%20Lab", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 59 + }, + { + "sites": [ + "map" + ], + "task_id": 378, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the {{space}} around {{location}}", + "instantiation_dict": { + "location": "CMU main campus", + "space": "parking" + }, + "intent": "Find the parking around CMU main campus", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__MAP__/search?query=parking%20near%20carnegie%20mellon%20university", + "program_html": [], + "url_note": "GOLD in PRED" + }, + 
"intent_template_id": 59 + }, + { + "sites": [ + "map" + ], + "task_id": 379, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the {{space}} around {{location}}", + "instantiation_dict": { + "location": "CMU main campus", + "space": "hotel" + }, + "intent": "Find the hotel around CMU main campus", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__MAP__/search?query=hotels%20near%20carnegie%20mellon%20university", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 59 + }, + { + "sites": [ + "map" + ], + "task_id": 380, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the {{space}} around {{location}}", + "instantiation_dict": { + "location": "Carnegie Music Hall", + "space": "bar" + }, + "intent": "Find the bar around Carnegie Music Hall", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__MAP__/search?query=bars%20near%20Carnegie%20Music%20Hall", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 59 + }, + { + "sites": [ + "map" + ], + "task_id": 381, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the {{space}} around {{location}}", + "instantiation_dict": { + "location": "Carnegie Music Hall", + "space": "hotel" + }, + "intent": "Find the hotel around Carnegie Music Hall", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__MAP__/search?query=hotels%20near%20Carnegie%20Music%20Hall", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 59 + }, + { + "sites": [ + "map" + ], + "task_id": 382, + "require_login": true, + 
"storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services", + "instantiation_dict": {}, + "intent": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no USCIS nearby", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 781 + }, + { + "sites": [ + "map" + ], + "task_id": 383, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. Tell me the names of supermarkets that are within 15mins driving from the hotel", + "instantiation_dict": {}, + "intent": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. 
Tell me the names of supermarkets that are within 15mins driving from the hotel", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Hyatt Regency Pittsburgh International Airport", + "Giant Eagle", + "ALDI" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Hyatt Regency Pittsburgh International Airport Giant Eagle, ALDI" + }, + "intent_template_id": 782 + }, + { + "sites": [ + "shopping" + ], + "task_id": 384, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "List the customer names who complain about the quality of EYZUTAK phone cases", + "instantiation_dict": {}, + "intent": "List the customer names who complain about the quality of EYZUTAK phone cases", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Lisa Lee", + "Evelyn Kurver", + "Amanda", + "N Randall" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Lisa Lee, Evelyn Kurver, Amanda, N Randall" + }, + "intent_template_id": 666 + }, + { + "sites": [ + "shopping" + ], + "task_id": 385, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "List the customer names who thinks EYZUTAK phone cases are of good looking", + "instantiation_dict": {}, + "intent": "List the customer names who thinks EYZUTAK phone cases are of good looking", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Lisa Lee", + "MH", + "Misba009", + "Amanda", + "N Randall", + "Amazon Customer", + "Cally", + "Bethany Robertson" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + 
"reference_answer_raw_annotation": "Lisa Lee, MH, Misba009, Amanda, N Randall, Amazon Customer, Cally, Bethany Robertson" + }, + "intent_template_id": 666 + }, + { + "sites": [ + "shopping" + ], + "task_id": 386, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the rating of {{product}}", + "instantiation_dict": { + "product": "Ugreen lightning to 3.5mm cable" + }, + "intent": "What is the rating of Ugreen lightning to 3.5mm cable. Please round to the nearest whole number", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "65 |OR| 3" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Around 3.1 stars" + }, + "intent_template_id": 1355 + }, + { + "sites": [ + "shopping" + ], + "task_id": 387, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Who gave {{stars}} for phone cases from EYZUTAK", + "instantiation_dict": { + "stars": "4 or 5 stars" + }, + "intent": "Who gave 4 or 5 stars for phone cases from EYZUTAK", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "MH", + "Misba009", + "Amanda", + "Amazon Customer", + "Cally", + "Bethany Robertson" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "MH, Misba009, Amanda, Amazon Customer, Cally, Bethany Robertson, Bethany Robertson" + }, + "intent_template_id": 1356 + }, + { + "sites": [ + "shopping" + ], + "task_id": 388, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Who gave {{stars}} for phone cases from EYZUTAK", + "instantiation_dict": { 
+ "stars": "1 or 2 stars" + }, + "intent": "Who gave 1 or 2 stars for phone cases from EYZUTAK", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Evelyn Kurver", + "N Randall" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Evelyn Kurver, N Randall" + }, + "intent_template_id": 1356 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 389, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", + "instantiation_dict": { + "mr": "octovisuals page", + "repo": "primer/design", + "content": "Thanks, working on reviews" + }, + "intent": "Post \"Thanks, working on reviews\" for the merge request related to octovisuals page in primer/design project", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/primer/design/-/merge_requests/450", + "locator": "", + "required_contents": { + "must_include": [ + "Thanks, working on reviews" + ] + } + } + ] + }, + "intent_template_id": 348 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 390, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", + "instantiation_dict": { + "mr": "semantic HTML post", + "repo": "a11yproject/a11yproject.com", + "content": "lgtm" + }, + "intent": "Post \"lgtm\" for the merge request related to semantic HTML post in a11yproject/a11yproject.com project", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + 
{ + "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1531", + "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", + "required_contents": { + "exact_match": "lgtm" + } + } + ] + }, + "intent_template_id": 348 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 391, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", + "instantiation_dict": { + "mr": "focus edge cases", + "repo": "a11yproject/a11yproject.com", + "content": "close because non reproducible" + }, + "intent": "Post \"close because non reproducible\" for the merge request related to focus edge cases in a11yproject/a11yproject.com project", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1265", + "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", + "required_contents": { + "exact_match": "close because non reproducible" + } + } + ] + }, + "intent_template_id": 348 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 392, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", + "instantiation_dict": { + "mr": "color ulitity", + "repo": "a11yproject.com", + "content": "Good idea" + }, + "intent": "Post \"Good idea\" for the merge request related to color ulitity in a11yproject.com project", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + 
"program_html": [ + { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1071", + "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", + "required_contents": { + "exact_match": "Good idea" + } + } + ] + }, + "intent_template_id": 348 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 393, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", + "instantiation_dict": { + "mr": "fixing the broken links", + "repo": "byteblaze/empathy-prompts", + "content": "lgtm" + }, + "intent": "Post \"lgtm\" for the merge request related to fixing the broken links in byteblaze/empathy-prompts project", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/empathy-prompts/-/merge_requests/19", + "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", + "required_contents": { + "exact_match": "lgtm" + } + } + ] + }, + "intent_template_id": 348 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 394, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Fork {{repo}}.", + "instantiation_dict": { + "repo": "2019-nCov" + }, + "intent": "Fork 2019-nCov.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/2019-nCov", + "locator": "", + "required_contents": { + "must_include": [ + "2019-nCov" + ] + } + } + ] + }, + "intent_template_id": 352 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 395, + 
"require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Fork {{repo}}.", + "instantiation_dict": { + "repo": "the Pytorch GAN repo with most stars" + }, + "intent": "Fork the Pytorch GAN repo with most stars.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/PyTorch-GAN", + "locator": "", + "required_contents": { + "must_include": [ + "Pytorch-GAN" + ] + } + } + ] + }, + "intent_template_id": 352 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 396, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Fork {{repo}}.", + "instantiation_dict": { + "repo": "ChatGPT" + }, + "intent": "Fork ChatGPT.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/ChatGPT", + "locator": "", + "required_contents": { + "must_include": [ + "ChatGPT" + ] + } + } + ] + }, + "intent_template_id": 352 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 397, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Fork {{repo}}.", + "instantiation_dict": { + "repo": "MetaSeq" + }, + "intent": "Fork MetaSeq.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/metaseq", + "locator": "", + "required_contents": { + "must_include": [ + "metaseq" + ] + } + } + ] + }, + "intent_template_id": 352 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 398, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + 
"start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Fork {{repo}}.", + "instantiation_dict": { + "repo": "all source repos from Akilesh Kannan" + }, + "intent": "Fork all source repos from Akilesh Kannan", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/SimCache", + "locator": "", + "required_contents": { + "must_include": [ + "SimCache" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/dots", + "locator": "", + "required_contents": { + "must_include": [ + "dots" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/CacheEval", + "locator": "", + "required_contents": { + "must_include": [ + "CacheEval" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/nvidia-patch", + "locator": "", + "required_contents": { + "must_include": [ + "404" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/viewgrades-scraper", + "locator": "", + "required_contents": { + "must_include": [ + "404" + ] + } + } + ] + }, + "intent_template_id": 352 + }, + { + "sites": [ + "reddit" + ], + "task_id": 399, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Change my reddit bio to \"{{content}}\"", + "instantiation_dict": { + "content": "I am a robot" + }, + "intent": "Change my reddit bio to \"I am a robot\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/user/MarvelsGrantMan136", + "locator": "document.querySelector(\".user-bio__biography\").outerText", + "required_contents": { + "exact_match": "I am a robot" + } + } + ] + }, + "intent_template_id": 6 + }, + { + "sites": [ + "reddit" + ], + "task_id": 400, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + 
"intent_template": "Change my reddit bio to \"{{content}}\"", + "instantiation_dict": { + "content": "Pro Python Developer with 20 years of Experience" + }, + "intent": "Change my reddit bio to \"Pro Python Developer with 20 years of Experience\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/user/MarvelsGrantMan136", + "locator": "document.querySelector(\".user-bio__biography\").outerText", + "required_contents": { + "exact_match": "Pro Python Developer with 20 years of Experience" + } + } + ] + }, + "intent_template_id": 6 + }, + { + "sites": [ + "reddit" + ], + "task_id": 401, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Change my reddit bio to \"{{content}}\"", + "instantiation_dict": { + "content": "Seeking SDE positions" + }, + "intent": "Change my reddit bio to \"Seeking SDE positions\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/user/MarvelsGrantMan136", + "locator": "document.querySelector(\".user-bio__biography\").outerText", + "required_contents": { + "exact_match": "Seeking SDE positions" + } + } + ] + }, + "intent_template_id": 6 + }, + { + "sites": [ + "reddit" + ], + "task_id": 402, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Change my reddit bio to \"{{content}}\"", + "instantiation_dict": { + "content": "Freelance Web Developer" + }, + "intent": "Change my reddit bio to \"Freelance Web Developer\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": 
"__REDDIT__/user/MarvelsGrantMan136", + "locator": "document.querySelector(\".user-bio__biography\").outerText", + "required_contents": { + "exact_match": "Freelance Web Developer" + } + } + ] + }, + "intent_template_id": 6 + }, + { + "sites": [ + "reddit" + ], + "task_id": 403, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Change my reddit bio to \"{{content}}\"", + "instantiation_dict": { + "content": "Awesome Prompt Artist" + }, + "intent": "Change my reddit bio to \"Awesome Prompt Artist\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/user/MarvelsGrantMan136", + "locator": "document.querySelector(\".user-bio__biography\").outerText", + "required_contents": { + "exact_match": "Awesome Prompt Artist" + } + } + ] + }, + "intent_template_id": 6 + }, + { + "sites": [ + "reddit" + ], + "task_id": 404, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Upvote the newest post in {{subreddit}} subreddit", + "instantiation_dict": { + "subreddit": "books" + }, + "intent": "Upvote the newest post in books subreddit", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/books/124260/adults-reading-to-each-other-out-loud", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 22 + }, + { + "sites": [ + "reddit" + ], + "task_id": 405, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + 
"intent_template": "Upvote the newest post in {{subreddit}} subreddit", + "instantiation_dict": { + "subreddit": "DIY" + }, + "intent": "Upvote the newest post in DIY subreddit", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/DIY/119019/how-can-i-bring-an-hdmi-cable-from-my-pc-downstairs-to-my-tv", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 22 + }, + { + "sites": [ + "reddit" + ], + "task_id": 406, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Upvote the newest post in {{subreddit}} subreddit", + "instantiation_dict": { + "subreddit": "future technology" + }, + "intent": "Upvote the newest post in future technology subreddit", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/Futurology/119517/openai-ceo-it-s-not-funny-that-i-m-afraid-of-the-ai-we-re", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 22 + }, + { + "sites": [ + "reddit" + ], + "task_id": 407, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Upvote the newest post in {{subreddit}} subreddit", + "instantiation_dict": { + "subreddit": "deeplearning" + }, + "intent": "Upvote the newest post in deeplearning subreddit", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + 
"reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/deeplearning/124993/meta-s-llama-weights-leaked-on-torrent-and-the-best-thing", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 22 + }, + { + "sites": [ + "reddit" + ], + "task_id": 408, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Upvote the newest post in {{subreddit}} subreddit", + "instantiation_dict": { + "subreddit": "explain like im 5" + }, + "intent": "Upvote the newest post in explain like im 5 subreddit", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/explainlikeimfive/39244/eli5-how-does-pinching-a-ribbon-and-sliding-your-finger", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 22 + }, + { + "sites": [ + "reddit" + ], + "task_id": 409, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists", + "geolocation": null, + "intent_template": "Reply to {{position_description}} in this post with \"{{content_description}}\"", + "instantiation_dict": { + "position_description": "the manager of the website", + "content_description": "thanks! I am a big fan of your website." + }, + "intent": "Reply to the manager of the website in this post with \"thanks! 
I am a big fan of your website.\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/books/59421/-/comment/1235250", + "locator": "", + "required_contents": { + "must_include": [ + "thanks! I am a big fan of your website." + ] + } + } + ] + }, + "intent_template_id": 23 + }, + { + "sites": [ + "reddit" + ], + "task_id": 410, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/singularity/69404/this-is-how-chatgpt-sees-itself", + "geolocation": null, + "intent_template": "Reply to {{position_description}} in this post with \"{{content_description}}\"", + "instantiation_dict": { + "position_description": "the first reply", + "content_description": "don't panic" + }, + "intent": "Reply to the first reply in this post with \"don't panic\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/singularity/69404/-/comment/1042264", + "locator": "", + "required_contents": { + "must_include": [ + "don't panic" + ] + } + } + ] + }, + "intent_template_id": 23 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 411, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Make the LICENSE of {{repo}} to MIT license.", + "instantiation_dict": { + "repo": "byteblaze/cloud-to-butt" + }, + "intent": "Make the LICENSE of byteblaze/cloud-to-butt to MIT license.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/cloud-to-butt/-/blob/master/LICENSE.txt", + "locator": "", + "required_contents": { + "must_include": [ + "MIT license" + ] + } + } + ] + }, + 
"intent_template_id": 355 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 412, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Make the LICENSE of {{repo}} to Apache License", + "instantiation_dict": { + "repo": "byteblaze/accessible-html-content-patterns" + }, + "intent": "Make the LICENSE of byteblaze/accessible-html-content-patterns to Apache License", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/accessible-html-content-patterns/-/blob/main/LICENSE", + "locator": "", + "required_contents": { + "must_include": [ + "Apache License", + "http://www.apache.org/licenses/LICENSE-2.0" + ] + } + } + ] + }, + "intent_template_id": 355 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 413, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Make the LICENSE of {{repo}} to one that mandates all copies and derivative works to be under the same license", + "instantiation_dict": { + "repo": "byteblaze/a11y-syntax-highlighting" + }, + "intent": "Make the LICENSE of byteblaze/a11y-syntax-highlighting to one that mandates all copies and derivative works to be under the same license", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/blob/main/LICENSE", + "locator": "", + "required_contents": { + "must_include": [ + "GENERAL PUBLIC LICENSE" + ] + } + } + ] + }, + "intent_template_id": 355 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 414, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": 
"Make the LICENSE of {{repo}} to MIT license.", + "instantiation_dict": { + "repo": "byteblaze/dotfiles" + }, + "intent": "Make the LICENSE of byteblaze/dotfiles to MIT license.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/dotfiles/-/blob/main/LICENSE", + "locator": "", + "required_contents": { + "must_include": [ + "MIT license", + "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software." + ] + } + } + ] + }, + "intent_template_id": 355 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 415, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", + "instantiation_dict": { + "topic": "verification functions" + }, + "intent": "Go to the merge request on verification functions I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. 
Otherwise remind him with a simple @.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/a11y-webring.club/-/merge_requests/40", + "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", + "required_contents": { + "must_include": ["@davepgreene"] + } + } + ] + }, + "intent_template_id": 360 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 416, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", + "instantiation_dict": { + "topic": "wcag" + }, + "intent": "Go to the merge request on wcag I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1270", + "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", + "required_contents": { + "must_include": ["Thank you"] + } + } + ] + }, + "intent_template_id": 360 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 417, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. 
Otherwise remind him with a simple @.", + "instantiation_dict": { + "topic": "404 link" + }, + "intent": "Go to the merge request on 404 link I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1485", + "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", + "required_contents": { + "must_include": ["@Roshanjossey"] + } + } + ] + }, + "intent_template_id": 360 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 418, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set my gitlab status as {{status}}.", + "instantiation_dict": { + "status": "Busy" + }, + "intent": "Set my gitlab status as Busy.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.cover-title').textContent", + "required_contents": { + "must_include": ["Busy"] + } + } + ] + }, + "intent_template_id": 361 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 419, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set my gitlab status as {{status}}.", + "instantiation_dict": { + "status": "Enjoying life" + }, + "intent": "Set my gitlab status as Enjoying life.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + 
"locator": "document.querySelector('.cover-status').lastChild.textContent", + "required_contents": { + "exact_match": "Enjoying life" + } + } + ] + }, + "intent_template_id": 361 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 420, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set my gitlab status as {{status}}.", + "instantiation_dict": { + "status": "Playing Badminton" + }, + "intent": "Set my gitlab status as Playing Badminton.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.cover-status').lastChild.textContent", + "required_contents": { + "exact_match": "Playing Badminton" + } + } + ] + }, + "intent_template_id": 361 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 421, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set my gitlab status as {{status}}.", + "instantiation_dict": { + "status": "Resting due to leg injury" + }, + "intent": "Set my gitlab status as Resting due to leg injury.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.cover-status').lastChild.textContent", + "required_contents": { + "exact_match": "Resting due to leg injury" + } + } + ] + }, + "intent_template_id": 361 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 422, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set my gitlab status as {{status}}.", + "instantiation_dict": { + "status": "Out of Office" + }, + "intent": "Set my gitlab 
status as Out of Office.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.cover-status').lastChild.textContent", + "required_contents": { + "exact_match": "Out of Office" + } + } + ] + }, + "intent_template_id": 361 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 423, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Mark all {{brand}} shirts on sale", + "instantiation_dict": { + "brand": "Hollister" + }, + "intent": "Mark all Hollister shirts on sale", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/126/", + "locator": "document.querySelector('input[name=\"product[sale]\"]').value", + "required_contents": { + "exact_match": "1" + } + } + ] + }, + "intent_template_id": 237 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 424, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": { + "description": "the place where Mr. Rogers was filmed" + }, + "intent": "Find the page of the place where Mr. 
Rogers was filmed on the map.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Pittsburgh" + ] + } + } + ] + }, + "intent_template_id": 371 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 425, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": { + "description": "the longest bridge in the Western hemisphere" + }, + "intent": "Find the page of the longest bridge in the Western hemisphere on the map.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Mackinac Bridge" + ] + } + } + ] + }, + "intent_template_id": 371 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 426, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": { + "description": "the place in Pennsylvania where a plane crashed during the September 11th attacks" + }, + "intent": "Find the page of the place in Pennsylvania where a plane crashed during the September 11th attacks on the map.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Somerset County" + ] + } + } + ] + }, + 
"intent_template_id": 371 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 427, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": { + "description": "the university that has most Turning Award winners" + }, + "intent": "Find the page of the university that has most Turning Award winners on the map.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Massachusetts Institute of Technology" + ] + } + } + ] + }, + "intent_template_id": 371 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 428, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": { + "description": "the undergrad college of the person who developed the Nash equilibrium" + }, + "intent": "Find the page of the undergrad college of the person who developed the Nash equilibrium on the map.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Carnegie Mellon University" + ] + } + } + ] + }, + "intent_template_id": 371 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 429, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": { + "description": "the colleges where The Chair was 
filmed in Pittsburgh" + }, + "intent": "Find the page of the colleges where The Chair was filmed in Pittsburgh on the map.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Chatham University" + ] + } + } + ] + }, + "intent_template_id": 371 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 430, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": { + "description": "the college(s) where The Chair was filmed in Pennsylvania other than the ones in Pittsburgh" + }, + "intent": "Find the page of the college(s) where The Chair was filmed in Pennsylvania other than the ones in Pittsburgh on the map.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Washington & Jefferson College" + ] + } + } + ] + }, + "intent_template_id": 371 + }, + { + "sites": [ + "shopping" + ], + "task_id": 431, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/tall-pink-taper-candles-4-piece-orange-colored-tapered-candles-gradient-candles-10-6-inches-tall-tie-dye-candle-set-large-dripless-long-burning-candlesticks-two-color-taper-candles-candlesticks.html |AND| __SHOPPING__/spaas-white-taper-candles-4-pack-10-inch-tall-candles-scent-free-premium-wax-candle-sticks-8-hour-long-burning-white-candlesticks-for-home-decoration-wedding-holiday-and-parties.html |AND| 
__SHOPPING__/white-starfish-wall-candle-sconces-set-of-2-beach-decor-ocean-themed-wall-mount-candleholders-nautical-style-beach-bathroom-decor-coastal-farmhouse-seashell-candle-holders.html", + "geolocation": null, + "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "instantiation_dict": {}, + "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/checkout/cart", + "locator": "", + "required_contents": { + "must_include": [ + "SPAAS White Taper Candles - 4 Pack |OR| 10 Inch Tall Candles, Scent-Free Premium Wax Candle Sticks |OR| 8 Hour Long Burning White Candlesticks for Home Decoration, Wedding, Holiday and Parties" + ] + } + } + ] + }, + "intent_template_id": 145 + }, + { + "sites": [ + "shopping" + ], + "task_id": 432, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/ciclon-energy-drink-regular-24-cans-8-3oz.html |AND| __SHOPPING__/v8-energy-healthy-energy-drink-steady-energy-from-black-and-green-tea-pomegranate-blueberry-8-ounce-can-pack-of-24.html", + "geolocation": null, + "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "instantiation_dict": {}, + "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/checkout/cart", + "locator": "", + "required_contents": { + "must_include": [ + "V8 +Energy, Healthy Energy Drink, Steady Energy from Black and Green Tea, Pomegranate Blueberry, 8 Ounce Can ,Pack of 24" + ] + } + } + ] + }, + "intent_template_id": 145 + 
}, + { + "sites": [ + "shopping" + ], + "task_id": 433, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/tazrigo-5pcs-white-dental-resin-brush-pens-dental-shaping-silicone-tooth-tool.html |AND| __SHOPPING__/stylus-pens-for-touch-screens-2-pcs-universal-stylus-2-in-1-2022-updated-touch-screen-pens-for-all-touch-screens-cell-phones-tablets-laptops-with-6-replacement-tips-4-discstips-2-fiber-tips.html", + "geolocation": null, + "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "instantiation_dict": {}, + "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/checkout/cart", + "locator": "", + "required_contents": { + "must_include": [ + "Tazrigo 5pcs White Dental Resin Brush Pens Dental Shaping Silicone Tooth Tool" + ] + } + } + ] + }, + "intent_template_id": 145 + }, + { + "sites": [ + "shopping" + ], + "task_id": 434, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/3-pairs-ruffle-socks-lace-ankle-socks-for-girls-frilly-socks-women-decorative.html |AND| __SHOPPING__/viviki-women-glitter-socks-ultrathin-transparent-tulle-lace-socks-no-show-ankle-crew-socks-3-pack.html", + "geolocation": null, + "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "instantiation_dict": {}, + "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/checkout/cart", + "locator": "", + "required_contents": { + "must_include": [ + 
"VIVIKI Women Glitter Socks Ultrathin Transparent Tulle Lace Socks - No Show Ankle Crew Socks 3 Pack" + ] + } + } + ] + }, + "intent_template_id": 145 + }, + { + "sites": [ + "shopping" + ], + "task_id": 435, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/35-ft-hdmi-cable-gearit-pro-series-hdmi-cable-35-feet-high-speed-ethernet-4k-resolution-3d-video-and-arc-audio-return-channel-hdmi-cable-white.html |AND| __SHOPPING__/dp-to-hdmi-cable-6ft-2-pack-fosmon-gold-plated-displayport-to-hdmi-cable-1080p-full-hd-for-pcs-to-hdtv-monitor-projector-with-hdmi-port.html", + "geolocation": null, + "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "instantiation_dict": {}, + "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/checkout/cart", + "locator": "", + "required_contents": { + "must_include": [ + "DP to HDMI Cable 6FT (2 Pack), Fosmon Gold Plated Displayport to HDMI Cable 1080p Full HD for PCs to HDTV, Monitor, Projector with HDMI Port" + ] + } + } + ] + }, + "intent_template_id": 145 + }, + { + "sites": [ + "shopping" + ], + "task_id": 436, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I previously ordered some {{product}} {{time}} and later cancelled. Can you reorder it for me?", + "instantiation_dict": { + "product": "a mattress foundation", + "time": "around Feb or March 2023" + }, + "intent": "I previously ordered some a mattress foundation around Feb or March 2023 and later cancelled. 
Can you reorder it for me?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B07DFJ5XKH" + ] + } + } + ] + }, + "intent_template_id": 156 + }, + { + "sites": [ + "shopping" + ], + "task_id": 437, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I previously ordered some {{product}} {{time}} and later cancelled. Can you reorder it for me?", + "instantiation_dict": { + "product": "a table lamp", + "time": "in May 2023" + }, + "intent": "I previously ordered some a table lamp in May 2023 and later cancelled. Can you reorder it for me?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B072XS3F6W" + ] + } + } + ] + }, + "intent_template_id": 156 + }, + { + "sites": [ + "shopping" + ], + "task_id": 438, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I previously ordered some {{product}} {{time}} and later cancelled. Can you reorder it for me?", + "instantiation_dict": { + "product": "a TV stand", + "time": "sometime around sep 2022" + }, + "intent": "I previously ordered some a TV stand sometime around sep 2022 and later cancelled. 
Can you reorder it for me?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B08PVHRRB7" + ] + } + } + ] + }, + "intent_template_id": 156 + }, + { + "sites": [ + "shopping" + ], + "task_id": 439, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I previously ordered some {{product}} {{time}} and later cancelled. Can you reorder it for me?", + "instantiation_dict": { + "product": "a cat t-shirt", + "time": "during 2022" + }, + "intent": "I previously ordered some a cat t-shirt during 2022 and later cancelled. Can you reorder it for me?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B0844BWS76" + ] + } + } + ] + }, + "intent_template_id": 156 + }, + { + "sites": [ + "shopping" + ], + "task_id": 440, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I previously ordered some {{product}} {{time}} and later cancelled. Can you reorder it for me?", + "instantiation_dict": { + "product": "a make up removal kit", + "time": "during summer 2022" + }, + "intent": "I previously ordered some a make up removal kit during summer 2022 and later cancelled. 
Can you reorder it for me?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B0738JQG6Q" + ] + } + } + ] + }, + "intent_template_id": 156 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 441, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/byteblaze/gimmiethat.space", + "geolocation": null, + "intent_template": "Update the project site's title to \"{{title}}\"", + "instantiation_dict": { + "title": "GIVE ME SPACE" + }, + "intent": "Update the project site's title to \"GIVE ME SPACE\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html", + "locator": "", + "required_contents": { + "must_include": [ + "GIVE ME SPACE" + ] + } + } + ] + }, + "intent_template_id": 308 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 442, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/byteblaze/gimmiethat.space", + "geolocation": null, + "intent_template": "Update the project site's title to \"{{title}}\"", + "instantiation_dict": { + "title": "Welcome to my site" + }, + "intent": "Update the project site's title to \"Welcome to my site\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html", + "locator": "", + "required_contents": { + "must_include": [ + "Welcome to my site" + ] + } + } + ] + }, + "intent_template_id": 308 + }, + { + "sites": [ + "gitlab" + 
], + "task_id": 443, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/byteblaze/gimmiethat.space", + "geolocation": null, + "intent_template": "Update the project site's title to \"{{title}}\"", + "instantiation_dict": { + "title": "Not an interesting site" + }, + "intent": "Update the project site's title to \"Not an interesting site\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html", + "locator": "", + "required_contents": { + "must_include": [ + "Not an interesting site" + ] + } + } + ] + }, + "intent_template_id": 308 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 444, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/byteblaze/gimmiethat.space", + "geolocation": null, + "intent_template": "Update the project site's title to \"{{title}}\"", + "instantiation_dict": { + "title": "Title Wanted" + }, + "intent": "Update the project site's title to \"Title Wanted\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html", + "locator": "", + "required_contents": { + "must_include": [ + "Title Wanted" + ] + } + } + ] + }, + "intent_template_id": 308 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 445, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/byteblaze/gimmiethat.space", + "geolocation": null, + "intent_template": "Update the project site's title to \"{{title}}\"", + "instantiation_dict": { + "title": "Hello" + }, + "intent": "Update the project site's title to \"Hello\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + 
"reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html", + "locator": "", + "required_contents": { + "must_include": [ + "Hello" + ] + } + } + ] + }, + "intent_template_id": 308 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 446, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Assign the issue regarding {{issue}} in {{repo}} to {{account}}.", + "instantiation_dict": { + "repo": "a11yproject", + "issue": 404, + "account": "Roshanjossey" + }, + "intent": "Assign the issue regarding 404 in a11yproject to Roshanjossey.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=Roshanjossey", + "locator": "", + "required_contents": { + "must_include": [ + "404s, bad host, timeouts, bad urls for URLs linked from website" + ] + } + } + ] + }, + "intent_template_id": 999 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 447, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Assign the issue regarding {{issue}} in {{repo}} to {{account}}.", + "instantiation_dict": { + "repo": "a11y-webring.club", + "issue": "linking to an accessibility statement", + "account": "Rohan" + }, + "intent": "Assign the issue regarding linking to an accessibility statement in a11y-webring.club to Rohan.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=Seirdy", + "locator": "", + "required_contents": { + "must_include": [ + "linking to an accessibility 
statement" + ] + } + } + ] + }, + "intent_template_id": 999 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 448, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "set the homepage URL on my GitLab profile to {{url}}", + "instantiation_dict": { + "url": "https://egg.tart.com" + }, + "intent": "set the homepage URL on my GitLab profile to https://egg.tart.com", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText", + "required_contents": { + "exact_match": "egg.tart.com" + } + } + ] + }, + "intent_template_id": 331 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 449, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "set the homepage URL on my GitLab profile to {{url}}", + "instantiation_dict": { + "url": "https://helloworld.xyz" + }, + "intent": "set the homepage URL on my GitLab profile to https://helloworld.xyz", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText", + "required_contents": { + "exact_match": "helloworld.xyz" + } + } + ] + }, + "intent_template_id": 331 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 450, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "set the homepage URL on my GitLab profile to {{url}}", + "instantiation_dict": { + "url": "a11yproject.contributor.me" + }, + "intent": "set the homepage URL on my GitLab 
profile to a11yproject.contributor.me", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText", + "required_contents": { + "exact_match": "a11yproject.contributor.me" + } + } + ] + }, + "intent_template_id": 331 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 451, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "set the homepage URL on my GitLab profile to {{url}}", + "instantiation_dict": { + "url": "www.byteblaze.com" + }, + "intent": "set the homepage URL on my GitLab profile to www.byteblaze.com", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText", + "required_contents": { + "exact_match": "www.byteblaze.com" + } + } + ] + }, + "intent_template_id": 331 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 452, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "set the homepage URL on my GitLab profile to {{url}}", + "instantiation_dict": { + "url": "byteblaze.github.io" + }, + "intent": "set the homepage URL on my GitLab profile to byteblaze.github.io", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText", + "required_contents": { + "exact_match": "byteblaze.github.io" + } + } + ] + }, + "intent_template_id": 331 + }, + { 
+ "sites": [ + "shopping_admin" + ], + "task_id": 453, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", + "instantiation_dict": { + "product": "Teton pullover hoodie" + }, + "intent": "Disable Teton pullover hoodie from the site, they are facing some quality issues.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/78/", + "locator": "document.querySelector('[name=\"product[status]\"').value", + "required_contents": { + "exact_match": "2" + } + } + ] + }, + "intent_template_id": 242 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 454, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", + "instantiation_dict": { + "product": "Ryker Tee Crew Neck" + }, + "intent": "Disable Ryker Tee Crew Neck from the site, they are facing some quality issues.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/478/", + "locator": "document.querySelector('[name=\"product[status]\"').value", + "required_contents": { + "exact_match": "2" + } + } + ] + }, + "intent_template_id": 242 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 455, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", + 
"instantiation_dict": { + "product": "lHelios Endurance Tank" + }, + "intent": "Disable lHelios Endurance Tank from the site, they are facing some quality issues.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/676/", + "locator": "document.querySelector('[name=\"product[status]\"').value", + "required_contents": { + "exact_match": "2" + } + } + ] + }, + "intent_template_id": 242 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 456, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", + "instantiation_dict": { + "product": "Cora Pant" + }, + "intent": "Disable Cora Pant from the site, they are facing some quality issues.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1840/", + "locator": "document.querySelector('[name=\"product[status]\"').value", + "required_contents": { + "exact_match": "2" + } + } + ] + }, + "intent_template_id": 242 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 457, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", + "instantiation_dict": { + "product": "Karmen yoga pants" + }, + "intent": "Disable Karmen yoga pants from the site, they are facing some quality issues.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/1819/", + "locator": "document.querySelector('[name=\"product[status]\"').value", + "required_contents": { + "exact_match": "2" + } + } + ] + }, + "intent_template_id": 242 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 458, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1481/", + "geolocation": null, + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": { + "amount": "$5", + "action": "Reduce" + }, + "intent": "Reduce the price of this product by $5", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1481/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "27.00" + } + } + ] + }, + "intent_template_id": 247 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 459, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/237/", + "geolocation": null, + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": { + "amount": "10%", + "action": "Reduce" + }, + "intent": "Reduce the price of this product by 10%", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/237/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "62.10" + } + } + ] + }, + "intent_template_id": 247 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 460, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + 
"start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/418/", + "geolocation": null, + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": { + "amount": "15%", + "action": "Reduce" + }, + "intent": "Reduce the price of this product by 15%", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/418/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "38.25" + } + } + ] + }, + "intent_template_id": 247 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 461, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/721/", + "geolocation": null, + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": { + "amount": "$11.5", + "action": "Increase" + }, + "intent": "Increase the price of this product by $11.5", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/721/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "29.50" + } + } + ] + }, + "intent_template_id": 247 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 462, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/710/", + "geolocation": null, + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": { + "amount": "10%", + "action": "Increase" + }, + "intent": "Increase the price of this product by 10%", + "require_reset": false, + "eval": { + "eval_types": [ + 
"program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/710/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "19.80" + } + } + ] + }, + "intent_template_id": 247 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 463, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/996/", + "geolocation": null, + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": { + "amount": "15%", + "action": "Increase" + }, + "intent": "Increase the price of this product by 15%", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/996/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "36.80" + } + } + ] + }, + "intent_template_id": 247 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 464, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update the product description of {{product}} to highlight the real user positive reviews by quoting the comments", + "instantiation_dict": { + "product": "Antonia Racer Tank" + }, + "intent": "Update the product description of Antonia Racer Tank to highlight the real user positive reviews by quoting the comments", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/../antonia-racer-tank.html", + "locator": "document.querySelector('.data.item.content').outerText + 
(document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", + "required_contents": { + "must_include": [ + "This is in regular rotation at the gym", + "Its colorful and looks kinda cute under my exercise tanks", + "it's very stylish for yoga or something else low impact" + ] + } + } + ] + }, + "intent_template_id": 251 + }, + { + "sites": [ + "shopping" + ], + "task_id": 465, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add {{product}} to my wish list", + "instantiation_dict": { + "product": "Tide PODS Spring Meadow Scent HE Turbo Laundry Detergent Pacs, 81 Count" + }, + "intent": "Add Tide PODS Spring Meadow Scent HE Turbo Laundry Detergent Pacs, 81 Count to my wish list", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "Tide PODS Spring Meadow Scent HE Turbo Laundry Detergent Pacs, 81 Count" + ] + } + } + ] + }, + "intent_template_id": 186 + }, + { + "sites": [ + "shopping" + ], + "task_id": 466, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add {{product}} to my wish list", + "instantiation_dict": { + "product": "2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts" + }, + "intent": "Add 2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts to my wish list", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + 
"required_contents": { + "must_include": [ + "2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts" + ] + } + } + ] + }, + "intent_template_id": 186 + }, + { + "sites": [ + "shopping" + ], + "task_id": 467, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add {{product}} to my wish list", + "instantiation_dict": { + "product": "HONGJ Hawaiian Beach Outfits Set for Mens, Summer Tropical Tree Printed Relaxed-fit Hawaii Shirts Shorts 2 Piece Suits" + }, + "intent": "Add HONGJ Hawaiian Beach Outfits Set for Mens, Summer Tropical Tree Printed Relaxed-fit Hawaii Shirts Shorts 2 Piece Suits to my wish list", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "HONGJ Hawaiian Beach Outfits Set for Mens, Summer Tropical Tree Printed Relaxed-fit Hawaii Shirts Shorts 2 Piece Suits" + ] + } + } + ] + }, + "intent_template_id": 186 + }, + { + "sites": [ + "shopping" + ], + "task_id": 468, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add {{product}} to my wish list", + "instantiation_dict": { + "product": "DkRgVNY Lace Spcling Lingerie Womens Sexy Hollow Out Underwear Bodysuit One Piece Snap Crotch Clubwear Teddy Bodysuit" + }, + "intent": "Add DkRgVNY Lace Spcling Lingerie Womens Sexy Hollow Out Underwear Bodysuit One Piece Snap Crotch Clubwear Teddy Bodysuit to my wish list", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": 
"document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "DkRgVNY Lace Spcling Lingerie Womens Sexy Hollow Out Underwear Bodysuit One Piece Snap Crotch Clubwear Teddy Bodysuit" + ] + } + } + ] + }, + "intent_template_id": 186 + }, + { + "sites": [ + "shopping" + ], + "task_id": 469, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add {{product}} to my wish list", + "instantiation_dict": { + "product": "Light Blue Simple Summer New Low Heels Slippers for Women Fashion Chunky Heels Pointed Toe Wine Glasses Sandals Comfortable Walking Shoes Ladies All-Match Sexy Party Shoes" + }, + "intent": "Add Light Blue Simple Summer New Low Heels Slippers for Women Fashion Chunky Heels Pointed Toe Wine Glasses Sandals Comfortable Walking Shoes Ladies All-Match Sexy Party Shoes to my wish list", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "Light Blue Simple Summer New Low Heels Slippers for Women Fashion Chunky Heels Pointed Toe Wine Glasses Sandals Comfortable Walking Shoes Ladies All-Match Sexy Party Shoes" + ] + } + } + ] + }, + "intent_template_id": 186 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 470, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Cancel order {{id}}", + "instantiation_dict": { + "id": "302" + }, + "intent": "Cancel order 302", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": 
"__SHOPPING_ADMIN__/sales/order/view/order_id/302/", + "locator": "document.querySelector(\"#order_status\").outerText", + "required_contents": { + "exact_match": "Canceled" + } + } + ] + }, + "intent_template_id": 257 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 471, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Cancel order {{id}}", + "instantiation_dict": { + "id": "307" + }, + "intent": "Cancel order 307", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/307/", + "locator": "document.querySelector(\"#order_status\").outerText", + "required_contents": { + "exact_match": "Canceled" + } + } + ] + }, + "intent_template_id": 257 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 472, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Cancel order {{id}}", + "instantiation_dict": { + "id": "299" + }, + "intent": "Cancel order 299", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/299/", + "locator": "document.querySelector(\"#order_status\").outerText", + "required_contents": { + "exact_match": "Canceled" + } + } + ] + }, + "intent_template_id": 257 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 473, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Cancel order {{id}}", + "instantiation_dict": { + "id": "301" + }, + "intent": "Cancel order 301", + "require_reset": false, + "eval": { 
+ "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/301/", + "locator": "document.querySelector(\"#order_status\").outerText", + "required_contents": { + "exact_match": "Canceled" + } + } + ] + }, + "intent_template_id": 257 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 474, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Cancel order {{id}}", + "instantiation_dict": { + "id": "305" + }, + "intent": "Cancel order 305", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/305/", + "locator": "document.querySelector(\"#order_status\").outerText", + "required_contents": { + "exact_match": "Canceled" + } + } + ] + }, + "intent_template_id": 257 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 475, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "instantiation_dict": { + "project_name": "chatgpt_plugin" + }, + "intent": "Set up a new, empty repository with the name chatgpt_plugin?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/chatgpt_plugin", + "locator": "", + "required_contents": { + "must_include": [ + "chatgpt_plugin" + ] + } + } + ] + }, + "intent_template_id": 292 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 476, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": 
"Set up a new, empty repository with the name {{project_name}}?", + "instantiation_dict": { + "project_name": "awesome_llm_reading" + }, + "intent": "Set up a new, empty repository with the name awesome_llm_reading?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/awesome_llm_reading", + "locator": "", + "required_contents": { + "must_include": [ + "awesome_llm_reading" + ] + } + } + ] + }, + "intent_template_id": 292 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 477, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "instantiation_dict": { + "project_name": "awesome_program_aided_reasoning" + }, + "intent": "Set up a new, empty repository with the name awesome_program_aided_reasoning?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/awesome_program_aided_reasoning", + "locator": "", + "required_contents": { + "must_include": [ + "awesome_program_aided_reasoning" + ] + } + } + ] + }, + "intent_template_id": 292 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 478, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "instantiation_dict": { + "project_name": "webagent" + }, + "intent": "Set up a new, empty repository with the name webagent?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/webagent", + "locator": "", + 
"required_contents": { + "must_include": [ + "webagent" + ] + } + } + ] + }, + "intent_template_id": 292 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 479, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "instantiation_dict": { + "project_name": "awesome_webagent" + }, + "intent": "Set up a new, empty repository with the name awesome_webagent?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/awesome_webagent", + "locator": "", + "required_contents": { + "must_include": [ + "awesome_webagent" + ] + } + } + ] + }, + "intent_template_id": 292 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 480, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}}", + "instantiation_dict": { + "collaborator_account_list": "yjlou", + "repo": "solarized-prism-theme" + }, + "intent": "Invite yjlou as collaborator to solarized-prism-theme", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/solarized-prism-theme/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "yjlou" + ] + } + } + ] + }, + "intent_template_id": 293 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 481, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "{{name}} wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", + "instantiation_dict": { + "name": "Abishek" + }, + "intent": "Abishek wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/dotfiles/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'abisubramanya27')", + "required_contents": { + "must_include": [ + "Guest" + ] + } + } + ] + }, + "intent_template_id": 294 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 482, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "instantiation_dict": { + "name": "yjlou" + }, + "intent": "yjlou wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/dotfiles/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'yjlou')", + "required_contents": { + "must_include": [ + "Guest" + ] + } + } + ] + }, + "intent_template_id": 294 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 483, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "instantiation_dict": { + "name": "Koushik" + }, + "intent": "Koushik wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/dotfiles/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'koush')", + "required_contents": { + "must_include": [ + "Guest" + ] + } + } + ] + }, + "intent_template_id": 294 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 484, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "instantiation_dict": { + "name": "Jakub Klinkovsk\u00fd" + }, + "intent": "Jakub Klinkovsk\u00fd wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/dotfiles/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'lahwaacz')", + "required_contents": { + "must_include": [ + "Guest" + ] + } + } + ] + }, + "intent_template_id": 294 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 485, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "instantiation_dict": { + "name": "Vinta" + }, + "intent": "Vinta wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/dotfiles/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'vinta')", + "required_contents": { + "must_include": [ + "Guest" + ] + } + } + ] + }, + "intent_template_id": 294 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 486, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", + "instantiation_dict": { + "old-heading": "404 Not Found", + "heading": "Bruh bro you clicked the wrong page" + }, + "intent": "Change the page title of \"404 Not Found\" page on my site to \"Bruh bro you clicked the wrong page\".", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/1/", + "locator": "document.querySelector('input[name=\"title\"').value", + "required_contents": { + "exact_match": "Bruh bro you clicked the wrong page" + } + } + ] + }, + "intent_template_id": 275 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 487, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", + "instantiation_dict": { + "old-heading": "Enable Cookies", + "heading": "Cookie monster coming to your place" + }, + "intent": "Change the page title of \"Enable Cookies\" page on my site to \"Cookie monster coming to your place\".", + "require_reset": false, + "eval": { + "eval_types": [ + 
"program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/3/", + "locator": "document.querySelector('input[name=\"title\"').value", + "required_contents": { + "exact_match": "Cookie monster coming to your place" + } + } + ] + }, + "intent_template_id": 275 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 488, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", + "instantiation_dict": { + "old-heading": "Home Page", + "heading": "This is the home page!! Leave here!!" + }, + "intent": "Change the page title of \"Home Page\" page on my site to \"This is the home page!! Leave here!!\".", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/2/", + "locator": "document.querySelector('input[name=\"title\"').value", + "required_contents": { + "exact_match": "This is the home page!! Leave here!!" 
+ } + } + ] + }, + "intent_template_id": 275 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 489, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", + "instantiation_dict": { + "old-heading": "Privacy Policy", + "heading": "No privacy policy is needed in this dystopian world" + }, + "intent": "Change the page title of \"Privacy Policy\" page on my site to \"No privacy policy is needed in this dystopian world\".", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/4/", + "locator": "document.querySelector('input[name=\"title\"').value", + "required_contents": { + "exact_match": "No privacy policy is needed in this dystopian world" + } + } + ] + }, + "intent_template_id": 275 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 490, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", + "instantiation_dict": { + "old-heading": "About us", + "heading": "Secret" + }, + "intent": "Change the page title of \"About us\" page on my site to \"Secret\".", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/5/", + "locator": "document.querySelector('input[name=\"title\"').value", + "required_contents": { + "exact_match": "Secret" + } + } + ] + }, + "intent_template_id": 275 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 491, + "require_login": true, + 
"storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", + "instantiation_dict": { + "name": "Sarah Miller", + "message": "the order is ready to be shipped soon!" + }, + "intent": "Notify Sarah Miller in their most recent pending order with message \"the order is ready to be shipped soon!\"", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "System message: We cannot add order history." + }, + "intent_template_id": 280 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 492, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", + "instantiation_dict": { + "name": "Jane Doe", + "message": "sorry we are out of stock, please reorder" + }, + "intent": "Notify Jane Doe in their most recent pending order with message \"sorry we are out of stock, please reorder\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/302/", + "locator": "document.querySelector(\"#order_history_block\").querySelector(\".note-list\").firstElementChild.querySelector(\".note-list-comment\").outerText", + "required_contents": { + "exact_match": "sorry we are out of stock, please reorder" + } + } + ] + }, + "intent_template_id": 280 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 493, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + 
"intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", + "instantiation_dict": { + "name": "Grace Nguyen", + "message": "sorry we are bankrupt, please contact our customer service for refund" + }, + "intent": "Notify Grace Nguyen in their most recent pending order with message \"sorry we are bankrupt, please contact our customer service for refund\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/307/", + "locator": "document.querySelector(\"#order_history_block\").querySelector(\".note-list\").firstElementChild.querySelector(\".note-list-comment\").outerText", + "required_contents": { + "exact_match": "sorry we are bankrupt, please contact our customer service for refund" + } + } + ] + }, + "intent_template_id": 280 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 494, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", + "instantiation_dict": { + "name": "Alex Thomas", + "message": "Yo, your order will be shipped soon!" + }, + "intent": "Notify Alex Thomas in their most recent pending order with message \"Yo, your order will be shipped soon!\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/304/", + "locator": "document.querySelector(\"#order_history_block\").querySelector(\".note-list\").firstElementChild.querySelector(\".note-list-comment\").outerText", + "required_contents": { + "exact_match": "Yo, your order will be shipped soon!" 
+ } + } + ] + }, + "intent_template_id": 280 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 495, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", + "instantiation_dict": { + "name": "Lily Potter", + "message": "Thanks, your order is ready to be shipped!" + }, + "intent": "Notify Lily Potter in their most recent pending order with message \"Thanks, your order is ready to be shipped!\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/303/", + "locator": "document.querySelector(\"#order_history_block\").querySelector(\".note-list\").firstElementChild.querySelector(\".note-list-comment\").outerText", + "required_contents": { + "exact_match": "Thanks, your order is ready to be shipped!" 
+ } + } + ] + }, + "intent_template_id": 280 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 496, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", + "instantiation_dict": { + "tracking": "8974568499", + "order": "299", + "service": "Federal Express" + }, + "intent": "Update order #299 with the Federal Express tracking number 8974568499", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/299/active_tab/order_shipments/", + "locator": "", + "required_contents": { + "must_include": [ + "Tracking number 8974568499 for Federal Express assigned" + ] + } + } + ] + }, + "intent_template_id": 284 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 497, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", + "instantiation_dict": { + "tracking": "24353446464", + "order": "307", + "service": "DHL" + }, + "intent": "Update order #307 with the DHL tracking number 24353446464", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/307/active_tab/order_shipments/", + "locator": "", + "required_contents": { + "must_include": [ + "Tracking number 24353446464 for DHL assigned" + ] + } + } + ] + }, + "intent_template_id": 284 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 498, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", 
+ "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", + "instantiation_dict": { + "tracking": "55591023930", + "order": "306", + "service": "UPS" + }, + "intent": "Update order #306 with the UPS tracking number 55591023930", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/306/active_tab/order_shipments/", + "locator": "", + "required_contents": { + "must_include": [ + "Tracking number 55591023930 for United Parcel Service assigned" + ] + } + } + ] + }, + "intent_template_id": 284 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 499, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", + "instantiation_dict": { + "tracking": "13849373987", + "order": "304", + "service": "USPS" + }, + "intent": "Update order #304 with the USPS tracking number 13849373987", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/304/active_tab/order_shipments/", + "locator": "", + "required_contents": { + "must_include": [ + "Tracking number 13849373987 for United States Postal Service assigned" + ] + } + } + ] + }, + "intent_template_id": 284 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 500, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", + 
"instantiation_dict": { + "tracking": "239028439840", + "order": "301", + "service": "DHL" + }, + "intent": "Update order #301 with the DHL tracking number 239028439840", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/301/active_tab/order_shipments/", + "locator": "", + "required_contents": { + "must_include": [ + "Tracking number 239028439840 for DHL assigned" + ] + } + } + ] + }, + "intent_template_id": 284 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 501, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Make all {{product}} as out of stock", + "instantiation_dict": { + "product": "Taurus Elements Shell" + }, + "intent": "Make all Taurus Elements Shell as out of stock", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/350/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", + "required_contents": { + "exact_match": "0" + } + } + ] + }, + "intent_template_id": 287 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 502, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Make all {{product}} as out of stock", + "instantiation_dict": { + "product": "Gobi HeatTec Tee" + }, + "intent": "Make all Gobi HeatTec Tee as out of stock", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/446/", + 
"locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", + "required_contents": { + "exact_match": "0" + } + } + ] + }, + "intent_template_id": 287 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 503, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Make all {{product}} as out of stock", + "instantiation_dict": { + "product": "rocco gym tank" + }, + "intent": "Make all rocco gym tank as out of stock", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/682/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", + "required_contents": { + "exact_match": "0" + } + } + ] + }, + "intent_template_id": 287 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 504, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Make all {{product}} as out of stock", + "instantiation_dict": { + "product": "Selene yoga hoodie" + }, + "intent": "Make all Selene yoga hoodie as out of stock", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1108/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", + "required_contents": { + "exact_match": "0" + } + } + ] + }, + "intent_template_id": 287 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 505, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + 
"intent_template": "Make all {{product}} as out of stock", + "instantiation_dict": { + "product": "Aeno capri" + }, + "intent": "Make all Aeno capri as out of stock", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1861/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", + "required_contents": { + "exact_match": "0" + } + } + ] + }, + "intent_template_id": 287 + }, + { + "sites": [ + "shopping" + ], + "task_id": 506, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", + "instantiation_dict": { + "product_category": "meat substitute", + "dollar_value": "between 100 and 200" + }, + "intent": "Buy the highest rated product from the meat substitute category within a budget between 100 and 200.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B01CTR3DLE" + ] + } + } + ] + }, + "intent_template_id": 172 + }, + { + "sites": [ + "shopping" + ], + "task_id": 507, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", + "instantiation_dict": { + "product_category": "Ceiling light", + "dollar_value": "above 1000" + }, + "intent": "Buy the highest rated product from the Ceiling light 
category within a budget above 1000.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B07BVL3P1V" + ] + } + } + ] + }, + "intent_template_id": 172 + }, + { + "sites": [ + "shopping" + ], + "task_id": 508, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", + "instantiation_dict": { + "product_category": "NS switch pouch", + "dollar_value": "under 60" + }, + "intent": "Buy the highest rated product from the NS switch pouch category within a budget under 60.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B07116LGP6" + ] + } + } + ] + }, + "intent_template_id": 172 + }, + { + "sites": [ + "shopping" + ], + "task_id": 509, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Buy the best rating product from \"{{category}}\" category with at least 5 reviews and the product is least expensive", + "instantiation_dict": { + "category": "Men's shoe" + }, + "intent": "Buy the best rating product from \"Men's shoe\" category with at least 5 reviews and the product is least expensive", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + 
"program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B01J4MM3KO" + ] + } + } + ] + }, + "intent_template_id": 216 + }, + { + "sites": [ + "shopping" + ], + "task_id": 510, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Buy the best rating product from \"{{category}}\" category with at least 5 reviews and the product is least expensive", + "instantiation_dict": { + "category": "Home Audio Speaker" + }, + "intent": "Buy the best rating product from \"Home Audio Speaker\" category with at least 5 reviews and the product is least expensive", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B002R5ABIW" + ] + } + } + ] + }, + "intent_template_id": 216 + }, + { + "sites": [ + "shopping" + ], + "task_id": 511, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add a {{product}} to my wish list.", + "instantiation_dict": { + "product": "laundry detergent" + }, + "intent": "Add a laundry detergent to my wish list.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "laundry", + "detergent" + ] + } + } + ] + }, + "intent_template_id": 189 + }, + { + "sites": [ + "shopping" + ], + "task_id": 
512, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add a {{product}} to my wish list.", + "instantiation_dict": { + "product": "toothpaste" + }, + "intent": "Add a toothpaste to my wish list.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "toothpaste" + ] + } + } + ] + }, + "intent_template_id": 189 + }, + { + "sites": [ + "shopping" + ], + "task_id": 513, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add a {{product}} to my wish list.", + "instantiation_dict": { + "product": "chair" + }, + "intent": "Add a chair to my wish list.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "chair" + ] + } + } + ] + }, + "intent_template_id": 189 + }, + { + "sites": [ + "shopping" + ], + "task_id": 514, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add a {{product}} to my wish list.", + "instantiation_dict": { + "product": "white desk" + }, + "intent": "Add a white desk to my wish list.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": 
"document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "white", + "desk" + ] + } + } + ] + }, + "intent_template_id": 189 + }, + { + "sites": [ + "shopping" + ], + "task_id": 515, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add a {{product}} to my wish list.", + "instantiation_dict": { + "product": "white computer desk" + }, + "intent": "Add a white computer desk to my wish list.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "white", + "computer", + "desk" + ] + } + } + ] + }, + "intent_template_id": 189 + }, + { + "sites": [ + "shopping" + ], + "task_id": 516, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/elmwood-inn-fine-teas-orange-vanilla-caffeine-free-fruit-infusion-16-ounce-pouch.html", + "geolocation": null, + "intent_template": "Add this product to my wishlist", + "instantiation_dict": {}, + "intent": "Add this product to my wishlist", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "Elmwood Inn Fine Teas, Orange Vanilla Caffeine-free Fruit Infusion, 16-Ounce Pouch" + ] + } + } + ] + }, + "intent_template_id": 196 + }, + { + "sites": [ + "shopping" + ], + "task_id": 517, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": 
"__SHOPPING__/skinit-decal-gaming-skin-compatible-with-xbox-one-s-console-and-controller-bundle-officially-licensed-nfl-baltimore-ravens-design.html", + "geolocation": null, + "intent_template": "Add this product to my wishlist", + "instantiation_dict": {}, + "intent": "Add this product to my wishlist", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "Skinit Decal Gaming Skin Compatible with Xbox One S Console and Controller Bundle - Officially Licensed NFL Baltimore Ravens Design" + ] + } + } + ] + }, + "intent_template_id": 196 + }, + { + "sites": [ + "shopping" + ], + "task_id": 518, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/sceptre-e195bd-srr-19-inch-720p-led-tv-true-black-2017.html", + "geolocation": null, + "intent_template": "Add this product to my wishlist", + "instantiation_dict": {}, + "intent": "Add this product to my wishlist", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "Sceptre E195BD-SRR 19-Inch 720P LED TV, True Black (2017)" + ] + } + } + ] + }, + "intent_template_id": 196 + }, + { + "sites": [ + "shopping" + ], + "task_id": 519, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/iphone-13-pro-max-case-neon-turtle-iphone-13-pro-max-cases-tempered-glass-back-soft-silicone-tpu-shock-protective-case-for-apple-iphone-13-pro-max.html", + "geolocation": null, + "intent_template": "Add this product to my wishlist", + 
"instantiation_dict": {}, + "intent": "Add this product to my wishlist", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "iPhone 13 Pro Max Case, Neon Turtle iPhone 13 Pro Max Cases, Tempered Glass Back+Soft Silicone TPU Shock Protective Case for Apple iPhone 13 Pro Max" + ] + } + } + ] + }, + "intent_template_id": 196 + }, + { + "sites": [ + "shopping" + ], + "task_id": 520, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/magnetic-metal-stainless-steel-d-pads-kits-directional-pad-replacement-parts-for-xbox-one-elite-controller-elite-series-2-xbox-one-xbox-one-s-x-controller.html", + "geolocation": null, + "intent_template": "Add this product to my wishlist", + "instantiation_dict": {}, + "intent": "Add this product to my wishlist", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "Magnetic Metal Stainless Steel D-pads Kits Directional Pad Replacement Parts for Xbox One Elite Controller, Elite Series 2, Xbox One, Xbox One S/X Controller" + ] + } + } + ] + }, + "intent_template_id": 196 + }, + { + "sites": [ + "shopping" + ], + "task_id": 521, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Subscribe to the newsletter of OneStopMarket", + "instantiation_dict": {}, + "intent": "Subscribe to the newsletter of OneStopMarket", + "require_reset": false, + "eval": { + "eval_types": [ + 
"program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/newsletter/manage/", + "locator": "document.querySelector('[title=\"General Subscription\"').checked.toString()", + "required_contents": { + "exact_match": "true" + } + } + ] + }, + "intent_template_id": 199 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 522, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Fork {{repo}}.", + "instantiation_dict": { + "repo": "all repos from facebook" + }, + "intent": "Fork all repos from facebook.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/dashboard/projects", + "locator": "document.querySelector('[data-qa-selector=\"projects_list\"').outerText", + "required_contents": { + "must_include": [ + "create-react-app", + "buck" + ] + } + } + ] + }, + "intent_template_id": 352 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 523, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Star the top {{number}} most stared repos in Gitlab", + "instantiation_dict": { + "number": "five" + }, + "intent": "Star the top five most stared repos in Gitlab", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/starred", + "locator": "", + "required_contents": { + "must_include": [ + "AndroidSlidingUpPanel", + "create-react-app", + "ffmpeg-python", + "PHP_XLSXWriter", + "AndroidAsync" + ] + } + } + ] + }, + "intent_template_id": 354 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 524, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": 
"__GITLAB__", + "geolocation": null, + "intent_template": "Star the top {{number}} most stared repos in Gitlab", + "instantiation_dict": { + "number": "eight" + }, + "intent": "Star the top eight most stared repos in Gitlab", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/starred", + "locator": "", + "required_contents": { + "must_include": [ + "AndroidSlidingUpPanel", + "create-react-app", + "ffmpeg-python", + "PHP_XLSXWriter", + "AndroidAsync", + "Pytorch-GAN", + "administrate", + "keycloak" + ] + } + } + ] + }, + "intent_template_id": 354 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 525, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Star the top {{number}} most stared repos in Gitlab", + "instantiation_dict": { + "number": "four" + }, + "intent": "Star the top four most stared repos in Gitlab", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/starred", + "locator": "", + "required_contents": { + "must_include": [ + "AndroidSlidingUpPanel", + "create-react-app", + "ffmpeg-python", + "PHP_XLSXWriter" + ] + } + } + ] + }, + "intent_template_id": 354 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 526, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Star the top {{number}} most stared repos in Gitlab", + "instantiation_dict": { + "number": "three" + }, + "intent": "Star the top three most stared repos in Gitlab", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": 
"__GITLAB__/users/byteblaze/starred", + "locator": "", + "required_contents": { + "must_include": [ + "AndroidSlidingUpPanel", + "create-react-app", + "ffmpeg-python" + ] + } + } + ] + }, + "intent_template_id": 354 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 527, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Star the top {{number}} most stared repos in Gitlab", + "instantiation_dict": { + "number": "one" + }, + "intent": "Star the top one most stared repos in Gitlab", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/starred", + "locator": "", + "required_contents": { + "must_include": [ + "AndroidSlidingUpPanel" + ] + } + } + ] + }, + "intent_template_id": 354 + }, + { + "sites": [ + "shopping" + ], + "task_id": 528, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "instantiation_dict": { + "product": "phone screen protector", + "time": "March 2023" + }, + "intent": "Draft a refund message via their \"contact us\" form for the phone screen protector I bought March 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. 
Don't submit yet", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "broke", + "three days of use", + "000000180", + "12.99" + ] + } + } + ] + }, + "intent_template_id": 154 + }, + { + "sites": [ + "shopping" + ], + "task_id": 529, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "instantiation_dict": { + "product": "bluetooth speaker", + "time": "Feb 2023" + }, + "intent": "Draft a refund message via their \"contact us\" form for the bluetooth speaker I bought Feb 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "broke", + "three days of use", + "000000148", + "169.95" + ] + } + } + ] + }, + "intent_template_id": 154 + }, + { + "sites": [ + "shopping" + ], + "task_id": 530, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. 
The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "instantiation_dict": { + "product": "kitchen organizer", + "time": "around Feb 2023" + }, + "intent": "Draft a refund message via their \"contact us\" form for the kitchen organizer I bought around Feb 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "broke", + "three days of use", + "000000161", + "68.88" + ] + } + } + ] + }, + "intent_template_id": 154 + }, + { + "sites": [ + "shopping" + ], + "task_id": 531, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "instantiation_dict": { + "product": "phone case", + "time": "March 2023" + }, + "intent": "Draft a refund message via their \"contact us\" form for the phone case I bought March 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. 
Don't submit yet", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "broke", + "three days of use", + "000000180", + "$12.99" + ] + } + } + ] + }, + "intent_template_id": 154 + }, + { + "sites": [ + "shopping" + ], + "task_id": 532, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "instantiation_dict": { + "product": "PS3 remote controller", + "time": "early 2023" + }, + "intent": "Draft a refund message via their \"contact us\" form for the PS3 remote controller I bought early 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. 
Don't submit yet", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "broke", + "three days of use", + "000000180", + "1.63" + ] + } + } + ] + }, + "intent_template_id": 154 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 533, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Follow {{account_list}} on Gitlab", + "instantiation_dict": { + "account_list": [ + "convexegg", + "yjlou" + ] + }, + "intent": "Follow ['convexegg', 'yjlou'] on Gitlab", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/following", + "locator": "document.querySelector('.user-profile').outerText", + "required_contents": { + "must_include": [ + "@convexegg", + "@yjlou" + ] + } + } + ] + }, + "intent_template_id": 330 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 534, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Follow {{account_list}} on Gitlab", + "instantiation_dict": { + "account_list": [ + "Jakub Klinkovsk\u00fd", + "Koushik", + "Vinta Chen" + ] + }, + "intent": "Follow ['Jakub Klinkovsk\u00fd', 'Koushik', 'Vinta Chen'] on Gitlab", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/following", + "locator": "document.querySelector('.user-profile').outerText", + "required_contents": { + "must_include": [ + "@lahwaacz", + "@koush", + "@vinta" + ] + } + } 
+ ] + }, + "intent_template_id": 330 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 535, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Follow {{account_list}} on Gitlab", + "instantiation_dict": { + "account_list": [ + "Jakub K", + "ghost", + "Beno\u00eet Blanchon" + ] + }, + "intent": "Follow ['Jakub K', 'ghost', 'Beno\u00eet Blanchon'] on Gitlab", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/following", + "locator": "document.querySelector('.user-profile').outerText", + "required_contents": { + "must_include": [ + "@lahwaacz", + "@ghost", + "@bblanchon" + ] + } + } + ] + }, + "intent_template_id": 330 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 536, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Follow {{account_list}} on Gitlab", + "instantiation_dict": { + "account_list": [ + "ghost", + "R1kk3r", + "Abishek" + ] + }, + "intent": "Follow ['ghost', 'R1kk3r', 'Abishek'] on Gitlab", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/following", + "locator": "document.querySelector('.user-profile').outerText", + "required_contents": { + "must_include": [ + "@lahwaacz", + "@R1kk3r", + "@abisubramanya27" + ] + } + } + ] + }, + "intent_template_id": 330 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 537, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Follow {{account_list}} on Gitlab", + "instantiation_dict": { + "account_list": [ + "Jakub Klinkovsk", + "convexegg", + "Vinta 
Chen", + "yjlou", + "Abishek S" + ] + }, + "intent": "Follow ['Jakub Klinkovsk', 'convexegg', 'Vinta Chen', 'yjlou', 'Abishek S'] on Gitlab", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/following", + "locator": "document.querySelector('.user-profile').outerText", + "required_contents": { + "must_include": [ + "@lahwaacz", + "@convexegg", + "@vinta", + "@yjlou", + "@abisubramanya27" + ] + } + } + ] + }, + "intent_template_id": 330 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 538, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Modify the address of order #{{order_id}} to {{address}}", + "instantiation_dict": { + "order_id": "299", + "address": "456 Oak Avenue, Apartment 5B, New York, NY, 10001" + }, + "intent": "Modify the address of order #299 to 456 Oak Avenue, Apartment 5B, New York, NY, 10001", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/299", + "locator": "", + "required_contents": { + "must_include": [ + "456 Oak Avenue", + "Apartment 5B", + "New York", + "10001" + ] + } + } + ] + }, + "intent_template_id": 240 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 539, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Modify the address of order #{{order_id}} to {{address}}", + "instantiation_dict": { + "order_id": "65", + "address": "789 Pine Lane, San Francisco, CA, 94102" + }, + "intent": "Modify the address of order #65 to 789 Pine Lane, San Francisco, CA, 94102", + "require_reset": false, + "eval": { + 
"eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/65", + "locator": "", + "required_contents": { + "must_include": [ + "789 Pine Lane", + "San Francisco", + "California", + "94102" + ] + } + } + ] + }, + "intent_template_id": 240 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 540, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Modify the address of order #{{order_id}} to {{address}}", + "instantiation_dict": { + "order_id": "301", + "address": "321 Birch Boulevard, Suite 200, Dallas, TX, 75201" + }, + "intent": "Modify the address of order #301 to 321 Birch Boulevard, Suite 200, Dallas, TX, 75201", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/301", + "locator": "", + "required_contents": { + "must_include": [ + "321 Birch Boulevard", + "Suite 200", + "Dallas", + "Texas", + "75201" + ] + } + } + ] + }, + "intent_template_id": 240 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 541, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Modify the address of order #{{order_id}} to {{address}}", + "instantiation_dict": { + "order_id": "125", + "address": "654 Elm Drive, Apartment 12, Miami, FL, 33101" + }, + "intent": "Modify the address of order #125 to 654 Elm Drive, Apartment 12, Miami, FL, 33101", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/125", + "locator": "", + 
"required_contents": { + "must_include": [ + "654 Elm Drive", + "Apartment 12", + "Miami", + "Florida", + "33101" + ] + } + } + ] + }, + "intent_template_id": 240 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 542, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Modify the address of order #{{order_id}} to {{address}}", + "instantiation_dict": { + "order_id": "300", + "address": "987 Cedar Court, Los Angeles, CA, 90012" + }, + "intent": "Modify the address of order #300 to 987 Cedar Court, Los Angeles, CA, 90012", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/300", + "locator": "", + "required_contents": { + "must_include": [ + "987 Cedar Court", + "Los Angeles", + "California", + "90012" + ] + } + } + ] + }, + "intent_template_id": 240 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 543, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update the product description of {{product}} to highlight the real user positive reviews by quoting the comments", + "instantiation_dict": { + "product": "Bella Tank" + }, + "intent": "Update the product description of Bella Tank to highlight the real user positive reviews by quoting the comments", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/../bella-tank.html", + "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", + "required_contents": { + "must_include": [ + "Good 
choice for working out and stylin' enough to wear when I'm hanging with friends on hot days", + "Also washes really well", + "Always a sweet n sporty look for the gym", + "Keeps me cool and the seams don't rub up against me like some of my other tanks" + ] + } + } + ] + }, + "intent_template_id": 251 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 544, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update the description of {{product}} to highlight the real user positive reviews by quoting the comments", + "instantiation_dict": { + "product": "Selena Yoga Hoodie" + }, + "intent": "Update the description of Selena Yoga Hoodie to highlight the real user positive reviews by quoting the comments", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/../selene-yoga-hoodie.html", + "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", + "required_contents": { + "must_include": [ + "I was super cold and it did the job.", + "The sleeves are definitely thicker than you realize, which is a good thing", + "really quite substantial", + "planning on buying another one of these in another color", + "the best hoodie ive ever owned" + ] + } + } + ] + }, + "intent_template_id": 251 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 545, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update the description of {{product}} to highlight the real user positive reviews by quoting the comments", + "instantiation_dict": { + "product": "Radiant Tee" + }, + "intent": "Update the description of Radiant 
Tee to highlight the real user positive reviews by quoting the comments", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/../radiant-tee.html", + "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", + "required_contents": { + "must_include": [ + "What I rally love here is that it does the job of keeping me cool and dry", + "I'm a big guy and sweat A LOT", + "Even after a day of gulf, I'm still dry and comfortable", + "What a versatile shirt", + "Not only does it feel very soft compared to my old worn out polos, but it also does the job promised", + "I like going out after my game for drinks so I look good then too and don't need to change into something fresh" + ] + } + } + ] + }, + "intent_template_id": 251 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 546, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update the description of {{product}} to highlight the real user positive reviews by quoting the comments", + "instantiation_dict": { + "product": "Lucia Cross-Fit Bra" + }, + "intent": "Update the description of Lucia Cross-Fit Bra to highlight the real user positive reviews by quoting the comments", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/../affirm-water-bottle.html", + "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", + "required_contents": { + "must_include": [ + "Wide mouth opening makes it easy to clean" + ] + } + } + ] + }, 
+ "intent_template_id": 251 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 547, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a new {{option}} option {{value}} to the {{base_setting}} of {{product}}", + "instantiation_dict": { + "option": "color", + "value": "brown", + "base_setting": "size S", + "product": "Phoebe Zipper Sweatshirt" + }, + "intent": "Add a new color option brown to the size S of Phoebe Zipper Sweatshirt", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1130/", + "locator": "document.querySelector('[data-index=\"configurable\"').outerText", + "required_contents": { + "must_include": [ + "Phoebe Zipper Sweatshirt-S-Brown" + ] + } + } + ] + }, + "intent_template_id": 252 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 548, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a new {{option}} {{value}} to {{base_setting}} of {{product}}", + "instantiation_dict": { + "option": "color", + "value": "blue", + "base_setting": "size S and M", + "product": "Frankie Sweatshirt" + }, + "intent": "Add a new color blue to size S and M of Frankie Sweatshirt", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/110/", + "locator": "document.querySelector('[data-index=\"configurable\"').outerText", + "required_contents": { + "must_include": [ + "Sweatshirt-M-Blue", + "Sweatshirt-S-Blue" + ] + } + } + ] + }, + "intent_template_id": 252 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 549, + 
"require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a new {{option}} {{value}} to {{base_setting}} {{product}}", + "instantiation_dict": { + "option": "size", + "value": "XXXL", + "base_setting": "green", + "product": "Minerva LumaTech V-Tee" + }, + "intent": "Add a new size XXXL to green Minerva LumaTech V-Tee", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1492/", + "locator": "document.querySelector('[data-index=\"configurable\"').outerText", + "required_contents": { + "must_include": [ + "V-Tee-XXXL-Green" + ] + } + } + ] + }, + "intent_template_id": 252 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 550, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a new {{option}} {{value}} to {{base_setting}} {{product}}", + "instantiation_dict": { + "option": "size", + "value": "XXS", + "base_setting": "blue and purple", + "product": "Nona Fitness Tank" + }, + "intent": "Add a new size XXS to blue and purple Nona Fitness Tank", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1732/", + "locator": "document.querySelector('[data-index=\"configurable\"').outerText", + "required_contents": { + "must_include": [ + "Tank-XXS-Blue", + "Tank-XXS-Purple" + ] + } + } + ] + }, + "intent_template_id": 252 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 551, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": 
"Add new {{option}} {{value}} to {{base_setting}} of {{product}}", + "instantiation_dict": { + "option": "size", + "value": "30 and 31", + "base_setting": "all color variants", + "product": "Diana Tights" + }, + "intent": "Add new size 30 and 31 to all color variants of Diana Tights", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1854/", + "locator": "document.querySelector('[data-index=\"configurable\"').outerText", + "required_contents": { + "must_include": [ + "Tights-30-Blue", + "Tights-30-Black", + "Tights-30-Orange", + "Tights-31-Blue", + "Tights-31-Black", + "Tights-31-Orange" + ] + } + } + ] + }, + "intent_template_id": 252 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 552, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}}?", + "instantiation_dict": { + "directory": "real_space", + "subreddit": "space", + "gitlab_repo": "gimmiethat.space" + }, + "intent": "Create a folder named real_space in gimmiethat.space repo. 
Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the space?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/real_space/urls.txt", + "locator": "", + "required_contents": { + "must_include": [ + "__REDDIT__/f/space/134164/scientists-erupt-at-nasa-gutting-funding-for-crucial-venus", + "__REDDIT__/f/space/134163/virgin-orbit-fails-to-secure-funding-will-cease-operations", + "__REDDIT__/f/space/134162/nasa-to-name-artemis-2-crew-next-week-the-first-moon", + "__REDDIT__/f/space/134161/bent-light-in-deep-space-reveals-one-of-the-biggest-black", + "__REDDIT__/f/space/134160/seti-s-new-machine-learning-algorithm-works-like-google-s" + ] + } + } + ] + }, + "intent_template_id": 84 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 553, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}}?", + "instantiation_dict": { + "directory": "news", + "gitlab_repo": "gimmiethat.space", + "subreddit": "news related subreddits" + }, + "intent": "Create a folder named news in gimmiethat.space repo. 
Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the news related subreddits?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/news/urls.txt", + "locator": "", + "required_contents": { + "must_include": [ + "__REDDIT__/f/news/129905/ohio-man-charged-for-using-molotov-cocktails-to-attack", + "__REDDIT__/f/news/129904/in-a-loss-for-fox-news-judge-allows-dominion-s-defamation", + "__REDDIT__/f/news/129903/theater-group-sues-to-block-tennessee-s-new-anti-drag-law", + "__REDDIT__/f/news/129902/andrew-tate-released-from-jail-in-romania-and-placed-under", + "__REDDIT__/f/news/129901/rare-high-risk-storm-alert-issued-for-parts-of-midwest-and" + ] + } + } + ] + }, + "intent_template_id": 84 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 554, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}}?", + "instantiation_dict": { + "directory": "moive_space", + "gitlab_repo": "gimmiethat.space", + "subreddit": "movies" + }, + "intent": "Create a folder named moive_space in gimmiethat.space repo. 
Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the movies?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/moive_space/urls.txt", + "locator": "", + "required_contents": { + "must_include": [ + "__REDDIT__/f/movies/128825/scenes-in-film-that-feel-off-or-wrong-in-some-way-and-make", + "__REDDIT__/f/movies/128824/disney-s-live-action-lilo-amp-stitch-movie-finds-its-lilo-in", + "__REDDIT__/f/movies/128823/fantastic-four-movie-gets-new-writer-with-avatar-the-way-of", + "__REDDIT__/f/movies/128822/can-someone-explain-what-made-steven-seagal-so-appealing-for", + "__REDDIT__/f/movies/128821/ban-on-fetish-sex-depictions-in-film-should-end-australia" + ] + } + } + ] + }, + "intent_template_id": 84 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 555, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}}?", + "instantiation_dict": { + "directory": "funny_pic", + "gitlab_repo": "gimmiethat.space", + "subreddit": "memes" + }, + "intent": "Create a folder named funny_pic in gimmiethat.space repo. 
Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the memes?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/funny_pic/urls.txt", + "locator": "", + "required_contents": { + "must_include": [ + "__REDDIT__/f/memes/127991/it-do-be-like-that-tho", + "__REDDIT__/f/memes/127990/thank-you-memers-this-wouldn-t-be-possible-without-you", + "__REDDIT__/f/memes/127989/if-you-have-no-other-choice", + "__REDDIT__/f/memes/127988/yes-yes-yes", + "__REDDIT__/f/memes/127987/shagadelic-baby" + ] + } + } + ] + }, + "intent_template_id": 84 + }, + { + "sites": [ + "gitlab", + "wikipedia" + ], + "task_id": 556, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", + "instantiation_dict": { + "name": "nolan_honest_fans", + "topics": "movies directed by Christopher Nolan" + }, + "intent": "Create a repo named nolan_honest_fans with movies directed by Christopher Nolan in a README file", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/nolan_honest_fans/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "Following", + "Memento", + "Insomnia", + "Batman Begins", + "The Prestige", + "The Dark Knight", + "Inception", + "The Dark Knight Rises", + "Interstellar", + "Dunkirk", + "Tenet", + "Oppenheimer" + ] + } + } + ] + }, + "intent_template_id": 87 + }, + { + "sites": [ + "gitlab", + "wikipedia" + ], + "task_id": 557, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": 
"Create a repo named {{name}} with {{topics}} in a README file", + "instantiation_dict": { + "name": "nolan_old_fans", + "topics": "movies directed by Christopher Nolan before 2010" + }, + "intent": "Create a repo named nolan_old_fans with movies directed by Christopher Nolan before 2010 in a README file", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/nolan_old_fans/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "Following", + "Memento", + "Insomnia", + "Batman Begins", + "The Prestige", + "The Dark Knight" + ] + } + } + ] + }, + "intent_template_id": 87 + }, + { + "sites": [ + "gitlab", + "wikipedia" + ], + "task_id": 558, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", + "instantiation_dict": { + "name": "nolan_young_fans", + "topics": "movies directed by Christopher Nolan after 2010" + }, + "intent": "Create a repo named nolan_young_fans with movies directed by Christopher Nolan after 2010 in a README file", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/nolan_young_fans/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "Inception", + "The Dark Knight Rises", + "Interstellar", + "Dunkirk", + "Tenet", + "Oppenheimer" + ] + } + } + ] + }, + "intent_template_id": 87 + }, + { + "sites": [ + "gitlab", + "wikipedia" + ], + "task_id": 559, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", + 
"instantiation_dict": { + "name": "nolan_followers", + "topics": "career timeline of Christopher Nolan" + }, + "intent": "Create a repo named nolan_followers with career timeline of Christopher Nolan in a README file", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/nolan_followers/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "1993\u20132003: Early career and breakthrough", + "2003\u20132013: Widespread recognition", + "2014\u20132019: Established Hollywood auteur", + "2020\u2013present" + ] + } + } + ] + }, + "intent_template_id": 87 + }, + { + "sites": [ + "gitlab", + "wikipedia" + ], + "task_id": 560, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", + "instantiation_dict": { + "name": "nolan_academy_awards", + "topics": "movies that won Academy Awards by Christopher Nolan" + }, + "intent": "Create a repo named nolan_academy_awards with movies that won Academy Awards by Christopher Nolan in a README file", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/nolan_academy_awards/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "The Dark Knight", + "Inception", + "Interstellar", + "Dunkirk", + "Tenet" + ] + } + } + ] + }, + "intent_template_id": 87 + }, + { + "sites": [ + "gitlab", + "wikipedia" + ], + "task_id": 561, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", + "instantiation_dict": { + "name": 
"bafta_awards_nolan", + "topics": "movies that are nominated BAFTA Awards by Christopher Nolan" + }, + "intent": "Create a repo named bafta_awards_nolan with movies that are nominated BAFTA Awards by Christopher Nolan in a README file", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/bafta_awards_nolan/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "Batman Begins", + "The Dark Knight", + "Inception", + "The Dark Knight Rises", + "Interstellar", + "Dunkirk", + "Tenet" + ] + } + } + ] + }, + "intent_template_id": 87 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 562, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?", + "instantiation_dict": { + "name": "Awesome_DIY_ideas", + "num": 6 + }, + "intent": "create a repository named Awesome_DIY_ideas that includes a README file with the links to the most active 6 DIY ideas on DIY subreddit?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/Awesome_DIY_ideas/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts", + "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", + "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing", + "__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", + "__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches", + "__REDDIT__/f/DIY/118931/afci-outlet-question" + ] + } + } + ] + }, + 
"intent_template_id": 88 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 563, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?", + "instantiation_dict": { + "name": "fun_thing_to_do", + "num": 5 + }, + "intent": "create a repository named fun_thing_to_do that includes a README file with the links to the most active 5 DIY ideas on DIY subreddit?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/fun_thing_to_do/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts", + "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", + "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing", + "__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", + "__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches" + ] + } + } + ] + }, + "intent_template_id": 88 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 564, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?", + "instantiation_dict": { + "name": "live_a_life", + "num": 3 + }, + "intent": "create a repository named live_a_life that includes a README file with the links to the most active 3 DIY ideas on DIY subreddit?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + 
"program_html": [ + { + "url": "__GITLAB__/byteblaze/live_a_life/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts", + "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", + "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing" + ] + } + } + ] + }, + "intent_template_id": 88 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 565, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?", + "instantiation_dict": { + "name": "TODO", + "num": 10 + }, + "intent": "create a repository named TODO that includes a README file with the links to the most active 10 DIY ideas on DIY subreddit?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/TODO/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts", + "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", + "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing", + "__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", + "__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches", + "__REDDIT__/f/DIY/118931/afci-outlet-question", + "__REDDIT__/f/DIY/118824/teflon-tape-to-attach-washing-machine-drain-hose-to-pipe", + "__REDDIT__/f/DIY/118866/paver-base-for-shed", + "__REDDIT__/f/DIY/118820/ways-to-locate-our-buried-electrical-service", + "__REDDIT__/f/DIY/118836/how-to-eliminate-transitions-for-disability-mobility-reasons" + ] + } + } + ] + }, + "intent_template_id": 88 + }, + { + "sites": [ + 
"gitlab", + "reddit" + ], + "task_id": 566, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?", + "instantiation_dict": { + "name": "Do it myself", + "num": 8 + }, + "intent": "create a repository named Do it myself that includes a README file with the links to the most active 8 DIY ideas on DIY subreddit?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/Do-it-myself/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts", + "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", + "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing", + "__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", + "__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches", + "__REDDIT__/f/DIY/118931/afci-outlet-question", + "__REDDIT__/f/DIY/118824/teflon-tape-to-attach-washing-machine-drain-hose-to-pipe", + "__REDDIT__/f/DIY/118866/paver-base-for-shed" + ] + } + } + ] + }, + "intent_template_id": 88 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 567, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}} repo", + "instantiation_dict": { + "collaborator_account_list": "Jakub Klinkovsk\u00fd and Beno\u00eet Blanchon", + "repo": "gimmiethat.space" + }, + "intent": "Invite Jakub Klinkovsk\u00fd and Beno\u00eet Blanchon as collaborator to gimmiethat.space repo", + "require_reset": false, + "eval": { + "eval_types": [ 
+ "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@lahwaacz", + "@bblanchon" + ] + } + } + ] + }, + "intent_template_id": 293 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 568, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}} repo", + "instantiation_dict": { + "collaborator_account_list": "Abishek and Vinta", + "repo": "a11yproject.com" + }, + "intent": "Invite Abishek and Vinta as collaborator to a11yproject.com repo", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@abisubramanya27", + "@vinta" + ] + } + } + ] + }, + "intent_template_id": 293 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 569, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}} repo", + "instantiation_dict": { + "collaborator_account_list": "Beno\u00eet and Abishek", + "repo": "my HTML5 markup extention" + }, + "intent": "Invite Beno\u00eet and Abishek as collaborator to my HTML5 markup extention repo", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/accessible-html-content-patterns/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@bblanchon", + "@abisubramanya27" + ] + } + } + ] + }, + 
"intent_template_id": 293 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 570, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}} repo", + "instantiation_dict": { + "collaborator_account_list": "Jakub K, Alex Dills, Alex Hutnik and Beno\u00eet Blanchon", + "repo": "my time tracking tool project" + }, + "intent": "Invite Jakub K, Alex Dills, Alex Hutnik and Beno\u00eet Blanchon as collaborator to my time tracking tool project repo", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/timeit/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@lahwaacz", + "@V13Axel", + "@alexhutnik", + "@bblanchon" + ] + } + } + ] + }, + "intent_template_id": 293 + }, + { + "sites": [ + "shopping" + ], + "task_id": 571, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", + "instantiation_dict": { + "address": "231 Willow Way, Suite 100, Chicago, IL, 60601" + }, + "intent": "I recently moved, my address is 231 Willow Way, Suite 100, Chicago, IL, 60601, update my information on OneStopShopping accordingly", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText", + "required_contents": { + "must_include": [ + "231 Willow Way", + "Suite 100", + "Chicago, Illinois, 60601" + ] + } + }, + { + "url": "__SHOPPING__/customer/address", + 
"locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText", + "required_contents": { + "must_include": [ + "231 Willow Way", + "Suite 100", + "Chicago, Illinois, 60601" + ] + } + } + ] + }, + "intent_template_id": 165 + }, + { + "sites": [ + "shopping" + ], + "task_id": 572, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", + "instantiation_dict": { + "address": "654 Aspen Road, House #3, Boston, MA, 02110" + }, + "intent": "I recently moved, my address is 654 Aspen Road, House #3, Boston, MA, 02110, update my information on OneStopShopping accordingly", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText", + "required_contents": { + "must_include": [ + "654 Aspen Road", + "House #3", + "Boston, Massachusetts, 02110" + ] + } + }, + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText", + "required_contents": { + "must_include": [ + "654 Aspen Road", + "House #3", + "Boston, Massachusetts, 02110" + ] + } + } + ] + }, + "intent_template_id": 165 + }, + { + "sites": [ + "shopping" + ], + "task_id": 573, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", + "instantiation_dict": { + "address": "987 Sycamore Circle, Philadelphia, PA, 19102" + }, + "intent": "I recently moved, my address is 987 Sycamore Circle, Philadelphia, PA, 
19102, update my information on OneStopShopping accordingly", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText", + "required_contents": { + "must_include": [ + "987 Sycamore Circle", + "Philadelphia, Pennsylvania, 19102" + ] + } + }, + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText", + "required_contents": { + "must_include": [ + "987 Sycamore Circle", + "Philadelphia, Pennsylvania, 19102" + ] + } + } + ] + }, + "intent_template_id": 165 + }, + { + "sites": [ + "shopping" + ], + "task_id": 574, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", + "instantiation_dict": { + "address": "111 Magnolia Path, Atlanta, GA, 30303" + }, + "intent": "I recently moved, my address is 111 Magnolia Path, Atlanta, GA, 30303, update my information on OneStopShopping accordingly", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText", + "required_contents": { + "must_include": [ + "111 Magnolia Path", + "Atlanta, Georgia, 30303" + ] + } + }, + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText", + "required_contents": { + "must_include": [ + "111 Magnolia Path", + "Atlanta, Georgia, 30303" + ] + } + } + ] + }, + "intent_template_id": 165 + }, + { + 
"sites": [ + "shopping" + ], + "task_id": 575, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", + "instantiation_dict": { + "address": "222 Redwood Rise, Suite 300, Seattle, WA, 98101" + }, + "intent": "I recently moved, my address is 222 Redwood Rise, Suite 300, Seattle, WA, 98101, update my information on OneStopShopping accordingly", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText", + "required_contents": { + "must_include": [ + "222 Redwood Rise", + "Suite 300", + "Seattle, Washington, 98101" + ] + } + }, + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText", + "required_contents": { + "must_include": [ + "222 Redwood Rise", + "Suite 300", + "Seattle, Washington, 98101" + ] + } + } + ] + }, + "intent_template_id": 165 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 576, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Add the following users to repo {{repo}} as {{role}}: {{user_list}}", + "instantiation_dict": { + "repo": "a11y-webring.club", + "role": "developer", + "user_list": [ + "abisubramanya27", + "lahwaacz" + ] + }, + "intent": "Add the following users to repo a11y-webring.club as developer: ['abisubramanya27', 'lahwaacz']", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": 
"__GITLAB__/byteblaze/a11y-webring.club/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'abisubramanya27')", + "required_contents": { + "must_include": [ + "Developer" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/a11y-webring.club/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'lahwaacz')", + "required_contents": { + "must_include": [ + "Developer" + ] + } + } + ] + }, + "intent_template_id": 351 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 577, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Add the following users to my {{repo}} as {{role}}: {{user_list}}", + "instantiation_dict": { + "repo": "GitHub timeline item management extension", + "role": "maintainer", + "user_list": [ + "abisubramanya27", + "lahwaacz" + ] + }, + "intent": "Add the following users to my GitHub timeline item management extension as maintainer: ['abisubramanya27', 'lahwaacz']", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/remove-board-movement-events-from-the-github-issue-timeline/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'abisubramanya27')", + "required_contents": { + "must_include": [ + "Maintainer" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/remove-board-movement-events-from-the-github-issue-timeline/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'lahwaacz')", + "required_contents": { + "must_include": [ + "Maintainer" + ] + } + } + ] + }, + "intent_template_id": 351 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 578, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Add the following users to repo 
{{repo}} as {{role}}: {{user_list}}", + "instantiation_dict": { + "repo": "millennials-to-snake-people", + "role": "reporter", + "user_list": [ + "yjlou", + "a11yproject" + ] + }, + "intent": "Add the following users to repo millennials-to-snake-people as reporter: ['yjlou', 'a11yproject']", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/millennials-to-snake-people/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'yjlou')", + "required_contents": { + "must_include": [ + "Reporter" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/millennials-to-snake-people/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'a11yproject')", + "required_contents": { + "must_include": [ + "Reporter" + ] + } + } + ] + }, + "intent_template_id": 351 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 579, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Add the following users to my {{repo}} as {{role}}: {{user_list}}", + "instantiation_dict": { + "repo": "time tracking tool", + "role": "guest", + "user_list": [ + "yjlou" + ] + }, + "intent": "Add the following users to my time tracking tool as guest: ['yjlou']", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/timeit/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'yjlou')", + "required_contents": { + "must_include": [ + "Guest" + ] + } + } + ] + }, + "intent_template_id": 351 + }, + { + "sites": [ + "reddit" + ], + "task_id": 580, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": 
"Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", + "instantiation_dict": { + "name": "sci_fi", + "description": "A wild place for sci-fi enthusiasts", + "sidebar_list": [ + "New", + "Classic", + "Movies", + "Post my novel", + "Random" + ] + }, + "intent": "Create a new forum named sci_fi, with a description of A wild place for sci-fi enthusiasts, and include ['New', 'Classic', 'Movies', 'Post my novel', 'Random'] in the sidebar?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/sci_fi/edit", + "locator": "document.querySelector(\"#forum_description\").value", + "required_contents": { + "must_include": [ + "A wild place for sci-fi enthusiasts" + ] + } + }, + { + "url": "__REDDIT__/f/sci_fi/edit", + "locator": "document.querySelector(\"#forum_sidebar\").value", + "required_contents": { + "must_include": [ + "New", + "Classic", + "Movies", + "Post my novel", + "Random" + ] + } + } + ] + }, + "intent_template_id": 7 + }, + { + "sites": [ + "reddit" + ], + "task_id": 581, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", + "instantiation_dict": { + "name": "cmu_lti", + "description": "Language Technologies Institute at Carnegie Mellon University", + "sidebar_list": [ + "announcement", + "paper", + "alumni" + ] + }, + "intent": "Create a new forum named cmu_lti, with a description of Language Technologies Institute at Carnegie Mellon University, and include ['announcement', 'paper', 'alumni'] in the sidebar?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + 
{ + "url": "__REDDIT__/f/cmu_lti/edit", + "locator": "document.querySelector(\"#forum_description\").value", + "required_contents": { + "must_include": [ + "Language Technologies Institute at Carnegie Mellon University" + ] + } + }, + { + "url": "__REDDIT__/f/cmu_lti/edit", + "locator": "document.querySelector(\"#forum_sidebar\").value", + "required_contents": { + "must_include": [ + "announcement", + "paper", + "alumni" + ] + } + } + ] + }, + "intent_template_id": 7 + }, + { + "sites": [ + "reddit" + ], + "task_id": 582, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", + "instantiation_dict": { + "name": "Cyberpunk", + "description": "Welcome to the future", + "sidebar_list": [ + "Games", + "Books", + "Movies", + "Future" + ] + }, + "intent": "Create a new forum named Cyberpunk, with a description of Welcome to the future, and include ['Games', 'Books', 'Movies', 'Future'] in the sidebar?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/Cyberpunk/edit", + "locator": "document.querySelector(\"#forum_description\").value", + "required_contents": { + "must_include": [ + "Welcome to the future" + ] + } + }, + { + "url": "__REDDIT__/f/Cyberpunk/edit", + "locator": "document.querySelector(\"#forum_sidebar\").value", + "required_contents": { + "must_include": [ + "Games", + "Books", + "Movies", + "Future" + ] + } + } + ] + }, + "intent_template_id": 7 + }, + { + "sites": [ + "reddit" + ], + "task_id": 583, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and 
include {{sidebar_list}} in the sidebar?", + "instantiation_dict": { + "name": "PlantsForCatParents", + "description": "Cat parents & plant lovers", + "sidebar_list": [ + "Cat friendly", + "Local vendors", + "Promotion", + "Toxic plants!" + ] + }, + "intent": "Create a new forum named PlantsForCatParents, with a description of Cat parents & plant lovers, and include ['Cat friendly', 'Local vendors', 'Promotion', 'Toxic plants!'] in the sidebar?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/PlantsForCatParents/edit", + "locator": "document.querySelector(\"#forum_description\").value", + "required_contents": { + "must_include": [ + "Cat parents & plant lovers" + ] + } + }, + { + "url": "__REDDIT__/f/PlantsForCatParents/edit", + "locator": "document.querySelector(\"#forum_sidebar\").value", + "required_contents": { + "must_include": [ + "Cat friendly", + "Local vendors", + "Promotion", + "Toxic plants!" 
+ ] + } + } + ] + }, + "intent_template_id": 7 + }, + { + "sites": [ + "reddit" + ], + "task_id": 584, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", + "instantiation_dict": { + "name": "Karaoke", + "description": "Place for Karaoke lovers", + "sidebar_list": [ + "devices", + "setup" + ] + }, + "intent": "Create a new forum named Karaoke, with a description of Place for Karaoke lovers, and include ['devices', 'setup'] in the sidebar?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/Karaoke/edit", + "locator": "document.querySelector(\"#forum_description\").value", + "required_contents": { + "must_include": [ + "Place for Karaoke lovers" + ] + } + }, + { + "url": "__REDDIT__/f/Karaoke/edit", + "locator": "document.querySelector(\"#forum_sidebar\").value", + "required_contents": { + "must_include": [ + "devices", + "setup" + ] + } + } + ] + }, + "intent_template_id": 7 + }, + { + "sites": [ + "shopping" + ], + "task_id": 585, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", + "instantiation_dict": { + "product": "floor lamp", + "num_star": 5, + "nickname": "Emma Lopez" + }, + "intent": "Rate my recent purchase of floor lamp with 5 stars, using my nickname Emma Lopez?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_rating('B00J8RZL7I')", + 
"required_contents": { + "must_include": [ + "100" + ] + } + }, + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_author('B00J8RZL7I')", + "required_contents": { + "must_include": [ + "Emma Lopez" + ] + } + } + ] + }, + "intent_template_id": 194 + }, + { + "sites": [ + "shopping" + ], + "task_id": 586, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", + "instantiation_dict": { + "product": "Jiffy Corn Muffin Cornbread Mix", + "num_star": 4, + "nickname": "ShoppingEmma" + }, + "intent": "Rate my recent purchase of Jiffy Corn Muffin Cornbread Mix with 4 stars, using my nickname ShoppingEmma?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_rating('B07HZB38XH')", + "required_contents": { + "must_include": [ + "80" + ] + } + }, + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_author('B07HZB38XH')", + "required_contents": { + "must_include": [ + "ShoppingEmma" + ] + } + } + ] + }, + "intent_template_id": 194 + }, + { + "sites": [ + "shopping" + ], + "task_id": 587, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", + "instantiation_dict": { + "product": "PS3 Remote Controllers", + "num_star": 3, + "nickname": "GamingEmma" + }, + "intent": "Rate my recent purchase of PS3 Remote Controllers with 3 stars, using my nickname GamingEmma?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + 
"program_html": [ + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_rating('B0041MSF2S')", + "required_contents": { + "must_include": [ + "60" + ] + } + }, + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_author('B0041MSF2S')", + "required_contents": { + "must_include": [ + "GamingEmma" + ] + } + } + ] + }, + "intent_template_id": 194 + }, + { + "sites": [ + "shopping" + ], + "task_id": 588, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", + "instantiation_dict": { + "product": "Foundation For Mattress With Frame Set", + "num_star": 1, + "nickname": "ShoppingEmma" + }, + "intent": "Rate my recent purchase of Foundation For Mattress With Frame Set with 1 stars, using my nickname ShoppingEmma?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_rating('B07DFJ5XKH')", + "required_contents": { + "must_include": [ + "20" + ] + } + }, + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_author('B07DFJ5XKH')", + "required_contents": { + "must_include": [ + "ShoppingEmma" + ] + } + } + ] + }, + "intent_template_id": 194 + }, + { + "sites": [ + "shopping" + ], + "task_id": 589, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", + "instantiation_dict": { + "product": "Mini Wireless Bluetooth Speaker", + "num_star": 2, + "nickname": "SimpleEmma" + }, + "intent": "Rate my recent purchase of Mini Wireless Bluetooth Speaker with 2 stars, using my nickname 
SimpleEmma?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_rating('B09P7BFL4H')", + "required_contents": { + "must_include": [ + "40" + ] + } + }, + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_author('B09P7BFL4H')", + "required_contents": { + "must_include": [ + "SimpleEmma" + ] + } + } + ] + }, + "intent_template_id": 194 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 590, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/primer/design", + "geolocation": null, + "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", + "instantiation_dict": { + "event": "event of product launch", + "start_date": "1/16/2023", + "end_date": "1/30/2023" + }, + "intent": "Create a milestone for the upcoming event of product launch starting on 1/16/2023 and ending on 1/30/2023", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/primer/design/-/milestones", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"#content-body\").outerText", + "required_contents": { + "must_include": [ + "product launch" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.start_date').outerText", + "required_contents": { + "must_include": [ + "Jan 16, 2030" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.due_date').outerText", + "required_contents": { + "must_include": [ + "Jan 30, 2030" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 339 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 591, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": 
"__GITLAB__/primer/design", + "geolocation": null, + "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", + "instantiation_dict": { + "event": "practice of collective code review", + "start_date": "1/16/2023", + "end_date": "in 20 days" + }, + "intent": "Create a milestone for the upcoming practice of collective code review starting on 1/16/2023 and ending on in 20 days", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/primer/design/-/milestones", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"#content-body\").outerText", + "required_contents": { + "must_include": [ + "code review" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.start_date').outerText", + "required_contents": { + "must_include": [ + "Jan 16, 2030" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.due_date').outerText", + "required_contents": { + "must_include": [ + "Feb 5, 2030" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 339 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 592, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/primer/design", + "geolocation": null, + "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", + "instantiation_dict": { + "event": "task of cleaning sensitive information", + "start_date": "2/16/2023", + "end_date": "in 20 days" + }, + "intent": "Create a milestone for the upcoming task of cleaning sensitive information starting on 2/16/2023 and ending on in 20 days", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/primer/design/-/milestones", + "program_html": 
[ + { + "url": "last", + "locator": "document.querySelector(\"#content-body\").outerText", + "required_contents": { + "must_include": [ + "sensitive information" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.start_date').outerText", + "required_contents": { + "must_include": [ + "Feb 16, 2030" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.due_date').outerText", + "required_contents": { + "must_include": [ + "Mar 8, 2030" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 339 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 593, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/byteblaze/dotfiles", + "geolocation": null, + "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", + "instantiation_dict": { + "event": "task of merging all branches to main", + "start_date": "March 15, 2044", + "end_date": "March 30, 2044" + }, + "intent": "Create a milestone for the upcoming task of merging all branches to main starting on March 15, 2044 and ending on March 30, 2044", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/byteblaze/dotfiles/-/milestones", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"#content-body\").outerText", + "required_contents": { + "must_include": [ + "all branches to main" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.start_date').outerText", + "required_contents": { + "must_include": [ + "Mar 15, 2044" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.due_date').outerText", + "required_contents": { + "must_include": [ + "Mar 30, 2044" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 339 + }, + { + "sites": [ + "gitlab" + ], + 
"task_id": 594, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/byteblaze/dotfiles", + "geolocation": null, + "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", + "instantiation_dict": { + "event": "task of adding a new branch for zsh comprehensive support", + "start_date": "5/1/2044", + "end_date": "in 20 days" + }, + "intent": "Create a milestone for the upcoming task of adding a new branch for zsh comprehensive support starting on 5/1/2044 and ending on in 20 days", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/byteblaze/dotfiles/-/milestones", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"#content-body\").outerText", + "required_contents": { + "must_include": [ + "zsh comprehensive support" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.start_date').outerText", + "required_contents": { + "must_include": [ + "May 1, 2044" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.due_date').outerText", + "required_contents": { + "must_include": [ + "May 21, 2044" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 339 + }, + { + "sites": [ + "reddit" + ], + "task_id": 595, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", + "instantiation_dict": { + "subreddit": "space" + }, + "intent": "Open the thread of a trending post on the forum \"space\" and subscribe.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/space", + "program_html": [ + { + "url": 
"__REDDIT__/", + "locator": "document.querySelector(\"#sidebar > section\").outerText", + "required_contents": { + "must_include": [ + "space" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 4 + }, + { + "sites": [ + "reddit" + ], + "task_id": 596, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", + "instantiation_dict": { + "subreddit": "books" + }, + "intent": "Open the thread of a trending post on the forum \"books\" and subscribe.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "__REDDIT__/", + "locator": "document.querySelector(\"#sidebar > section\").outerText", + "required_contents": { + "must_include": [ + "books" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 4 + }, + { + "sites": [ + "reddit" + ], + "task_id": 597, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", + "instantiation_dict": { + "subreddit": "consoles" + }, + "intent": "Open the thread of a trending post on the forum \"consoles\" and subscribe.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/consoles", + "program_html": [ + { + "url": "__REDDIT__/", + "locator": "document.querySelector(\"#sidebar > section\").outerText", + "required_contents": { + "must_include": [ + "consoles" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 4 + }, + { + "sites": [ + "reddit" + ], + "task_id": 598, + "require_login": 
true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", + "instantiation_dict": { + "subreddit": "pittsburgh" + }, + "intent": "Open the thread of a trending post on the forum \"pittsburgh\" and subscribe.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/pittsburgh", + "program_html": [ + { + "url": "__REDDIT__/", + "locator": "document.querySelector(\"#sidebar > section\").outerText", + "required_contents": { + "must_include": [ + "pittsburgh" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 4 + }, + { + "sites": [ + "reddit" + ], + "task_id": 599, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", + "instantiation_dict": { + "subreddit": "machine learning" + }, + "intent": "Open the thread of a trending post on the forum \"machine learning\" and subscribe.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/MachineLearning", + "program_html": [ + { + "url": "__REDDIT__/", + "locator": "document.querySelector(\"#sidebar > section\").outerText", + "required_contents": { + "must_include": [ + "MachineLearning" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 4 + }, + { + "sites": [ + "reddit" + ], + "task_id": 600, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer", + "instantiation_dict": { + 
"question": "what is the recommended console to buy these days" + }, + "intent": "Post my question, \"what is the recommended console to buy these days\", in a subreddit where I'm likely to get an answer", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/consoles", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "what is the recommended console to buy these days" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 3765 + }, + { + "sites": [ + "reddit" + ], + "task_id": 601, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer", + "instantiation_dict": { + "question": "is car necessary in NYC" + }, + "intent": "Post my question, \"is car necessary in NYC\", in a subreddit where I'm likely to get an answer", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/nyc", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "is car necessary in NYC |OR| is a car necessary in NYC" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 3765 + }, + { + "sites": [ + "reddit" + ], + "task_id": 602, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer", + "instantiation_dict": { + "question": 
"places for new drivers to learn driving in pittsburgh" + }, + "intent": "Post my question, \"places for new drivers to learn driving in pittsburgh\", in a subreddit where I'm likely to get an answer", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/pittsburgh", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "places for new drivers to learn driving in pittsburgh" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 3765 + }, + { + "sites": [ + "reddit" + ], + "task_id": 603, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer", + "instantiation_dict": { + "question": "safe and budget apartment to live in nyc" + }, + "intent": "Post my question, \"safe and budget apartment to live in nyc\", in a subreddit where I'm likely to get an answer", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/nyc", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "safe and budget apartment to live in nyc" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 3765 + }, + { + "sites": [ + "reddit" + ], + "task_id": 604, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer", + "instantiation_dict": 
{ + "question": "what is the SOTA web navigation agent repo" + }, + "intent": "Post my question, \"what is the SOTA web navigation agent repo\", in a subreddit where I'm likely to get an answer", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/deeplearning |OR| __REDDIT__/f/MachineLearning |OR| __REDDIT__/f/singularity", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "what is the SOTA web navigation agent repo" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 3765 + }, + { + "sites": [ + "reddit" + ], + "task_id": 605, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", + "instantiation_dict": { + "topic": "gaming consoles", + "question": "what is the recommended console to buy these days" + }, + "intent": "Find a subreddit focused on topics related to gaming consoles, and post my question, \"what is the recommended console to buy these days\" there", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/consoles", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "what is the recommended console to buy these days" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 5 + }, + { + "sites": [ + "reddit" + ], + "task_id": 606, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + 
"geolocation": null, + "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", + "instantiation_dict": { + "topic": "NYC", + "question": "is car necessary" + }, + "intent": "Find a subreddit focused on topics related to NYC, and post my question, \"is car necessary\" there", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/nyc", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "is car necessary |OR| is a car necessary" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 5 + }, + { + "sites": [ + "reddit" + ], + "task_id": 607, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", + "instantiation_dict": { + "topic": "city Pittsburgh", + "question": "places for new drivers to learn driving" + }, + "intent": "Find a subreddit focused on topics related to city Pittsburgh, and post my question, \"places for new drivers to learn driving\" there", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/pittsburgh", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "places for new drivers to learn driving" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 5 + }, + { + "sites": [ + "reddit" + ], + "task_id": 608, + "require_login": true, + "storage_state": 
"./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", + "instantiation_dict": { + "topic": "city lives in DMV area", + "question": "safe and budget apartment to live" + }, + "intent": "Find a subreddit focused on topics related to city lives in DMV area, and post my question, \"safe and budget apartment to live\" there", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/washington |OR| __REDDIT__/f/washingtondc", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "safe and budget apartment to live" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 5 + }, + { + "sites": [ + "reddit" + ], + "task_id": 609, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", + "instantiation_dict": { + "topic": "ML, DL, NLP", + "question": "what is the SOTA web navigation agent repo" + }, + "intent": "Find a subreddit focused on topics related to ML, DL, NLP, and post my question, \"what is the SOTA web navigation agent repo\" there", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/deeplearning |OR| __REDDIT__/f/MachineLearning |OR| __REDDIT__/f/singularity", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "what is the 
SOTA web navigation agent repo" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 5 + }, + { + "sites": [ + "reddit" + ], + "task_id": 610, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".", + "instantiation_dict": { + "book": "To Kill a Mockingbird by Harper Lee", + "content": "good book!" + }, + "intent": "Post a review of my recent reading \"To Kill a Mockingbird by Harper Lee\" in the r/books with my comment \"good book!\".", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "To Kill a Mockingbird by Harper Lee", + "good book!" 
+ ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 9 + }, + { + "sites": [ + "reddit" + ], + "task_id": 611, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".", + "instantiation_dict": { + "book": "Harry Potter", + "content": "Wonderful journey" + }, + "intent": "Post a review of my recent reading \"Harry Potter\" in the r/books with my comment \"Wonderful journey\".", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "Harry Potter", + "Wonderful journey" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 9 + }, + { + "sites": [ + "reddit" + ], + "task_id": 612, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".", + "instantiation_dict": { + "book": "big little lies", + "content": "can't stop it" + }, + "intent": "Post a review of my recent reading \"big little lies\" in the r/books with my comment \"can't stop it\".", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "big little lies", + "can't stop it" + ] + } + } + ], + "url_note": 
"GOLD in PRED" + }, + "intent_template_id": 9 + }, + { + "sites": [ + "reddit" + ], + "task_id": 613, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".", + "instantiation_dict": { + "book": "Love story", + "content": "I cried" + }, + "intent": "Post a review of my recent reading \"Love story\" in the r/books with my comment \"I cried\".", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "Love story", + "I cried" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 9 + }, + { + "sites": [ + "reddit" + ], + "task_id": 614, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".", + "instantiation_dict": { + "book": "Gone with the wind", + "content": "It's a book with history" + }, + "intent": "Post a review of my recent reading \"Gone with the wind\" in the r/books with my comment \"It's a book with history\".", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "Gone with the wind", + "It's a book with history" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + 
"intent_template_id": 9 + }, + { + "sites": [ + "reddit" + ], + "task_id": 615, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/pics", + "geolocation": null, + "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"", + "instantiation_dict": { + "content": "Bald Eagle", + "subreddit": "earthporn" + }, + "intent": "Re-post the image of Bald Eagle in this page to earthporn subreddit and note \"from /f/pics\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/earthporn", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "from /f/pics" + ] + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "b02113033af32feae9ff147dbbe3764039368d67d193885bd04e65c2e6beea9c.jpg" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 11 + }, + { + "sites": [ + "reddit" + ], + "task_id": 616, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/pics", + "geolocation": null, + "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"", + "instantiation_dict": { + "content": "Thanksgiving turkey", + "subreddit": "funny" + }, + "intent": "Re-post the image of Thanksgiving turkey in this page to funny subreddit and note \"from /f/pics\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + 
"reference_url": "__REDDIT__/f/funny", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "from /f/pics" + ] + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "2e4fa0a328e653a97a7d07046291c298ef5b4e0d0c73a287f317ca86a8e8685f.jpg" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 11 + }, + { + "sites": [ + "reddit" + ], + "task_id": 617, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/pics", + "geolocation": null, + "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"", + "instantiation_dict": { + "content": "Firework", + "subreddit": "earthporn" + }, + "intent": "Re-post the image of Firework in this page to earthporn subreddit and note \"from /f/pics\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/earthporn", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "from /f/pics" + ] + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "92411be6af4e9ad5ccd3ccbaa01c10457bb00e704e99c58dd430de1a958307fd.jpg" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + 
"intent_template_id": 11 + }, + { + "sites": [ + "reddit" + ], + "task_id": 618, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/pics", + "geolocation": null, + "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"", + "instantiation_dict": { + "content": "Wife's costume", + "subreddit": "funny" + }, + "intent": "Re-post the image of Wife's costume in this page to funny subreddit and note \"from /f/pics\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/funny", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "from /f/pics" + ] + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "6bfbf1280d28d84a9261695f0cac5a90addaaff1174807a2b381fdc159f3ed00.jpg" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 11 + }, + { + "sites": [ + "reddit" + ], + "task_id": 619, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/pics", + "geolocation": null, + "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"", + "instantiation_dict": { + "content": "costume contest", + "subreddit": "funny" + }, + "intent": "Re-post the image of costume contest in this page to funny subreddit and note \"from /f/pics\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": 
"__REDDIT__/f/funny", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "from /f/pics" + ] + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "bd8bc5f4c846aac4df08626faa3a34a7d47c8f3bdd92bf615a54afd939f063a7.jpg" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 11 + }, + { + "sites": [ + "reddit" + ], + "task_id": 620, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for advice about {{issue}} in a subreddit for relations", + "instantiation_dict": { + "issue": "deal with long-distance relationships" + }, + "intent": "Ask for advice about deal with long-distance relationships in a subreddit for relations", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/relationship_advice", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "long", + "distance", + "relation" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 12 + }, + { + "sites": [ + "reddit" + ], + "task_id": 621, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for advice about {{issue}} in a subreddit for relations", + "instantiation_dict": { + "issue": "cheat" + }, + "intent": "Ask for advice about cheat in a subreddit for relations", + 
"require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/relationship_advice", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "cheat" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 12 + }, + { + "sites": [ + "reddit" + ], + "task_id": 622, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for advice about {{issue}} in a subreddit for relations", + "instantiation_dict": { + "issue": "sexual harassment" + }, + "intent": "Ask for advice about sexual harassment in a subreddit for relations", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/relationship_advice", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "sexual", + "harassment" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 12 + }, + { + "sites": [ + "reddit" + ], + "task_id": 623, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for advice about {{issue}} in a subreddit for relations", + "instantiation_dict": { + "issue": "gift for birthday" + }, + "intent": "Ask for advice about gift for birthday in a subreddit for relations", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/relationship_advice", + "program_html": [ + { + "url": 
"func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "gift", + "birthday" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 12 + }, + { + "sites": [ + "reddit" + ], + "task_id": 624, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for advice about {{issue}} in a subreddit for relations", + "instantiation_dict": { + "issue": "break-up remedy" + }, + "intent": "Ask for advice about break-up remedy in a subreddit for relations", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/relationship_advice", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "break", + "remedy" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 12 + }, + { + "sites": [ + "reddit" + ], + "task_id": 625, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "instantiation_dict": { + "topic": "the effectiveness of online learning" + }, + "intent": "Create a discussion post about \"the effectiveness of online learning\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/deeplearning", + "program_html": [ + { + "url": 
"func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "your opinion", + "the effectiveness of online learning" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 13 + }, + { + "sites": [ + "reddit" + ], + "task_id": 626, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "instantiation_dict": { + "topic": "Iphone 14" + }, + "intent": "Create a discussion post about \"Iphone 14\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/iphone", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "your opinion", + "Iphone 14" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 13 + }, + { + "sites": [ + "reddit" + ], + "task_id": 627, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "instantiation_dict": { + "topic": "Harry Potter movie series" + }, + "intent": "Create a discussion post about \"Harry Potter movie series\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + 
"program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/movies", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "your opinion", + "Harry Potter movie series" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 13 + }, + { + "sites": [ + "reddit" + ], + "task_id": 628, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "instantiation_dict": { + "topic": "long distance relationship" + }, + "intent": "Create a discussion post about \"long distance relationship\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/relationship_advice", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "your opinion", + "long distance relationship" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 13 + }, + { + "sites": [ + "reddit" + ], + "task_id": 629, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "instantiation_dict": { + "topic": "Fun thing to do in Pittsburgh" + }, + "intent": "Create a discussion post about \"Fun thing to do in 
Pittsburgh\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/pittsburgh", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "your opinion", + "Fun thing to do in Pittsburgh |OR| Fun things to do in Pittsburgh" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 13 + }, + { + "sites": [ + "reddit" + ], + "task_id": 630, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}", + "instantiation_dict": { + "category": "noise-cancelling headphones", + "price": "$200", + "subreddit": "r/headphones" + }, + "intent": "Ask for product recommendations for noise-cancelling headphones within a budget of $200 in r/headphones", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/headphones", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "200", + "noise-cancelling", + "headphone" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 15 + }, + { + "sites": [ + "reddit" + ], + "task_id": 631, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}", + "instantiation_dict": { + 
"category": "running shoes", + "price": "$100", + "subreddit": "r/sports" + }, + "intent": "Ask for product recommendations for running shoes within a budget of $100 in r/sports", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/sports", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "100", + "running", + "shoes" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 15 + }, + { + "sites": [ + "reddit" + ], + "task_id": 632, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}", + "instantiation_dict": { + "category": "running shoes", + "price": "$500", + "subreddit": "r/sports" + }, + "intent": "Ask for product recommendations for running shoes within a budget of $500 in r/sports", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/sports", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "500", + "running", + "shoes" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 15 + }, + { + "sites": [ + "reddit" + ], + "task_id": 633, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}", + "instantiation_dict": { + "category": "running pants", + "price": "$500", + 
"subreddit": "r/sports" + }, + "intent": "Ask for product recommendations for running pants within a budget of $500 in r/sports", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/sports", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "500", + "running", + "pants" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 15 + }, + { + "sites": [ + "reddit" + ], + "task_id": 634, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}", + "instantiation_dict": { + "category": "used iphone", + "price": "$1000", + "subreddit": "r/iphone" + }, + "intent": "Ask for product recommendations for used iphone within a budget of $1000 in r/iphone", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/iphone", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "1000", + "used iphone" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 15 + }, + { + "sites": [ + "reddit" + ], + "task_id": 635, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}", + "instantiation_dict": { + "category": "noise-cancelling headphones", + "price": "$200" + }, + "intent": 
"Post in the most appropriate subreddit and ask for recommendations for noise-cancelling headphones products within a budget of $200", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/headphones", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "200", + "noise-cancelling", + "headphone" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 6100 + }, + { + "sites": [ + "reddit" + ], + "task_id": 636, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}", + "instantiation_dict": { + "category": "DIY toolkit", + "price": "$100" + }, + "intent": "Post in the most appropriate subreddit and ask for recommendations for DIY toolkit products within a budget of $100", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/DIY", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "100", + "DIY", + "toolkit" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 6100 + }, + { + "sites": [ + "reddit" + ], + "task_id": 637, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}", + "instantiation_dict": { + "category": "sony 
headphones", + "price": "$500" + }, + "intent": "Post in the most appropriate subreddit and ask for recommendations for sony headphones products within a budget of $500", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/headphones", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "500", + "sony headphone" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 6100 + }, + { + "sites": [ + "reddit" + ], + "task_id": 638, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}", + "instantiation_dict": { + "category": "must-have product in my life", + "price": "$30" + }, + "intent": "Post in the most appropriate subreddit and ask for recommendations for must-have product in my life products within a budget of $30", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/BuyItForLife", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "30", + "must-have", + "product", + "life" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 6100 + }, + { + "sites": [ + "reddit" + ], + "task_id": 639, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within 
a budget of {{price}}", + "instantiation_dict": { + "category": "used iphone", + "price": "$1000" + }, + "intent": "Post in the most appropriate subreddit and ask for recommendations for used iphone products within a budget of $1000", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/iphone", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "1000", + "used iphone" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 6100 + }, + { + "sites": [ + "reddit" + ], + "task_id": 640, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit", + "instantiation_dict": { + "interest": "book reading", + "date": "March 15th", + "subreddit": "r/books" + }, + "intent": "Post a notice on a virtual meetup for book reading enthusiasts on March 15th in the r/books subreddit", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "book reading", + "Mar 15th |OR| March 15th", + "virtual", + "meetup" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 16 + }, + { + "sites": [ + "reddit" + ], + "task_id": 641, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a notice on a virtual meetup for {{interest}} 
enthusiasts on {{date}} in the {{subreddit}} subreddit", + "instantiation_dict": { + "interest": "Harry Potter", + "date": "July 8th", + "subreddit": "books" + }, + "intent": "Post a notice on a virtual meetup for Harry Potter enthusiasts on July 8th in the books subreddit", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "Harry Potter", + "Jul 8th |OR| July 8th", + "virtual", + "meetup" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 16 + }, + { + "sites": [ + "reddit" + ], + "task_id": 642, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit", + "instantiation_dict": { + "interest": "Big little lies", + "date": "Sep 10th", + "subreddit": "books" + }, + "intent": "Post a notice on a virtual meetup for Big little lies enthusiasts on Sep 10th in the books subreddit", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "Big little lies", + "Sep 10th |OR| September 10th", + "virtual", + "meetup" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 16 + }, + { + "sites": [ + "reddit" + ], + "task_id": 643, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + 
"geolocation": null, + "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit", + "instantiation_dict": { + "interest": "racing cars", + "date": "Oct 21st", + "subreddit": "nyc" + }, + "intent": "Post a notice on a virtual meetup for racing cars enthusiasts on Oct 21st in the nyc subreddit", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/nyc", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "racing cars", + "Oct 21st |OR| October 21st", + "virtual", + "meetup" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 16 + }, + { + "sites": [ + "reddit" + ], + "task_id": 644, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit", + "instantiation_dict": { + "interest": "Tears of Kingdom", + "date": "Dec 15th", + "subreddit": "gaming" + }, + "intent": "Post a notice on a virtual meetup for Tears of Kingdom enthusiasts on Dec 15th in the gaming subreddit", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/gaming", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "Tears of Kingdom", + "Dec 15th |OR| December 15th", + "virtual", + "meetup" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 16 + }, + { + "sites": [ + "reddit" + ], + "task_id": 645, + "require_login": 
true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in {{subreddit}} subreddit about what could machine learning help the corresponding field.", + "instantiation_dict": { + "subreddit": "books" + }, + "intent": "Post in books subreddit about what could machine learning help the corresponding field.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "machine learning", + "help |OR| assist |OR| benefit |OR| improve |OR| enhance |OR| transform |OR| revolutionize" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 19 + }, + { + "sites": [ + "reddit" + ], + "task_id": 646, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in {{subreddit}} subreddit about what could midjourney help the corresponding field.", + "instantiation_dict": { + "subreddit": "DIY" + }, + "intent": "Post in DIY subreddit about what could midjourney help the corresponding field.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/diy", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "midjourney", + "help |OR| assist |OR| benefit |OR| improve |OR| enhance |OR| transform |OR| revolutionize" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 19 + }, + { + "sites": [ + "reddit" + ], + "task_id": 647, + "require_login": true, + 
"storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in {{subreddit}} forum about what could open-source LLMs help the corresponding field.", + "instantiation_dict": { + "subreddit": "technology" + }, + "intent": "Post in technology forum about what could open-source LLMs help the corresponding field.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/technology", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "open-source LLMs", + "help |OR| assist |OR| benefit |OR| improve |OR| enhance |OR| transform |OR| revolutionize" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 19 + }, + { + "sites": [ + "reddit" + ], + "task_id": 648, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in {{subreddit}} forum about what could large language models help the corresponding field.", + "instantiation_dict": { + "subreddit": "dataisbeautiful" + }, + "intent": "Post in dataisbeautiful forum about what could large language models help the corresponding field.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/dataisbeautiful", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "large language models", + "help |OR| assist |OR| benefit |OR| improve |OR| enhance |OR| transform |OR| revolutionize" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 19 + }, + { + "sites": [ 
+ "reddit" + ], + "task_id": 649, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in {{subreddit}} subreddit about what could diffusion model help the corresponding field.", + "instantiation_dict": { + "subreddit": "history" + }, + "intent": "Post in history subreddit about what could diffusion model help the corresponding field.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/history", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "diffusion model", + "help |OR| assist |OR| benefit |OR| improve |OR| enhance |OR| transform |OR| revolutionize" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 19 + }, + { + "sites": [ + "reddit" + ], + "task_id": 650, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists", + "geolocation": null, + "intent_template": "Reply to {{position_description}} with my comment \"{{content_description}}\"", + "instantiation_dict": { + "position_description": "the post", + "content_description": "I am a big fan of the bookorg" + }, + "intent": "Reply to the post with my comment \"I am a big fan of the bookorg\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.comment__body').outerText", + "required_contents": { + "exact_match": "I am a big fan of the bookorg" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 23 + 
}, + { + "sites": [ + "reddit" + ], + "task_id": 651, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/pittsburgh/45899/driving-in-pittsburgh-summed-up-by-one-traffic-sign", + "geolocation": null, + "intent_template": "Reply to {{position_description}} with my comment \"{{content_description}}\"", + "instantiation_dict": { + "position_description": "the post", + "content_description": "Yeah, pittsburgh traffic, you know..." + }, + "intent": "Reply to the post with my comment \"Yeah, pittsburgh traffic, you know...\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/pittsburgh/45899/driving-in-pittsburgh-summed-up-by-one-traffic-sign", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.comment__body').outerText", + "required_contents": { + "exact_match": "Yeah, pittsburgh traffic, you know..." + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 23 + }, + { + "sites": [ + "reddit" + ], + "task_id": 652, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/technology/134852/ai-experts-disown-musk-backed-campaign-citing-their-research", + "geolocation": null, + "intent_template": "Reply to {{position_description}} with my comment \"{{content_description}}\"", + "instantiation_dict": { + "position_description": "the post", + "content_description": "???" + }, + "intent": "Reply to the post with my comment \"???\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/technology/134852/ai-experts-disown-musk-backed-campaign-citing-their-research", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.comment__body').outerText", + "required_contents": { + "exact_match": "???" 
+ } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 23 + }, + { + "sites": [ + "shopping" + ], + "task_id": 653, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "instantiation_dict": { + "product": "phone screen protector", + "order_id": "000000180" + }, + "intent": "Fill the \"contact us\" form in the site for a refund on the phone screen protector I bought, stating that it broke after just three days of use. Also, ensure to include the order number #000000180 and the product SKU. Don't submit yet, I will check.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "broke", + "three days of use", + "000000180", + "B087QJN9W1" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 153 + }, + { + "sites": [ + "shopping" + ], + "task_id": 654, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. 
Don't submit yet, I will check.", + "instantiation_dict": { + "product": "bluetooth speaker", + "order_id": "161" + }, + "intent": "Fill the \"contact us\" form in the site for a refund on the bluetooth speaker I bought, stating that it broke after just three days of use. Also, ensure to include the order number #161 and the product SKU. Don't submit yet, I will check.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "broke", + "three days of use", + "161", + "B09P7BFL4H" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 153 + }, + { + "sites": [ + "shopping" + ], + "task_id": 655, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "instantiation_dict": { + "product": "iphone case", + "order_id": "180" + }, + "intent": "Fill the \"contact us\" form in the site for a refund on the iphone case I bought, stating that it broke after just three days of use. Also, ensure to include the order number #180 and the product SKU. 
Don't submit yet, I will check.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "broke", + "three days of use", + "180", + "B087QJN9W1" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 153 + }, + { + "sites": [ + "shopping" + ], + "task_id": 656, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "instantiation_dict": { + "product": "remote controller", + "order_id": "180" + }, + "intent": "Fill the \"contact us\" form in the site for a refund on the remote controller I bought, stating that it broke after just three days of use. Also, ensure to include the order number #180 and the product SKU. 
Don't submit yet, I will check.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "broke", + "three days of use", + "180", + "B0041MSF2S" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 153 + }, + { + "sites": [ + "shopping" + ], + "task_id": 657, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "instantiation_dict": { + "product": "speaker", + "order_id": "148" + }, + "intent": "Fill the \"contact us\" form in the site for a refund on the speaker I bought, stating that it broke after just three days of use. Also, ensure to include the order number #148 and the product SKU. 
Don't submit yet, I will check.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "broke", + "three days of use", + "148", + "B003FVW3VA" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 153 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 658, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", + "instantiation_dict": { + "repo": "a11yproject", + "issue": "401 bad gateway", + "account": "Roshanjossey", + "due": "the end of 2030" + }, + "intent": "Create an issue in a11yproject repo with title \"401 bad gateway\". Assign the issue to Roshanjossey. 
Set due date to be the end of 2030", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText", + "required_contents": { + "exact_match": "401 bad gateway" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"').outerText", + "required_contents": { + "must_include": [ + "Dec 31, 2030" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.assignee').outerText", + "required_contents": { + "must_include": [ + "Roshan Jossey" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 327 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 659, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", + "instantiation_dict": { + "repo": "empathy-prompts", + "issue": "Integrating LLMs for better prompts", + "account": "Roshanjossey", + "due": "the beginning of Q2 2033" + }, + "intent": "Create an issue in empathy-prompts repo with title \"Integrating LLMs for better prompts\". Assign the issue to Roshanjossey. 
Set due date to be the beginning of Q2 2033", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/byteblaze/empathy-prompts/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText", + "required_contents": { + "exact_match": "Integrating LLMs for better prompts" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"').outerText", + "required_contents": { + "must_include": [ + "Apr 1, 2033" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.assignee').outerText", + "required_contents": { + "must_include": [ + "Roshan Jossey" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 327 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 660, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", + "instantiation_dict": { + "repo": "dotfiles", + "issue": "add support for oh-my-zsh", + "account": "Abishek", + "due": "July 18 2033" + }, + "intent": "Create an issue in dotfiles repo with title \"add support for oh-my-zsh\". Assign the issue to Abishek. 
Set due date to be July 18 2033", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/byteblaze/dotfiles/-/tree/main", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText", + "required_contents": { + "exact_match": "add support for oh-my-zsh" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"').outerText", + "required_contents": { + "must_include": [ + "Jul 18, 2033" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.assignee').outerText", + "required_contents": { + "must_include": [ + "Abishek S" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 327 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 661, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open an issue to {{issue}} in {{repo}}.", + "instantiation_dict": { + "repo": "ChatGPT", + "issue": "report the issue of connection refused" + }, + "intent": "Open an issue to report the issue of connection refused in ChatGPT.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/convexegg/chatgpt/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.detail-page-description').outerText", + "required_contents": { + "must_include": [ + "connection refused" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 328 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 662, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open an issue to {{issue}} in {{repo}}.", + "instantiation_dict": { + "repo": 
"aem-hacker", + "issue": "report experiencing \"OSError: [Errno 98] Address already in use\" during executions" + }, + "intent": "Open an issue to report experiencing \"OSError: [Errno 98] Address already in use\" during executions in aem-hacker.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/0ang3el/aem-hacker/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.detail-page-description').outerText", + "required_contents": { + "must_include": [ + "OSError: [Errno 98] Address already in use" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 328 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 663, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open an issue to {{issue}} in {{repo}}.", + "instantiation_dict": { + "repo": "metaseq", + "issue": "ask their plan on supporting Llama and other llama family models" + }, + "intent": "Open an issue to ask their plan on supporting Llama and other llama family models in metaseq.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/root/metaseq/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.detail-page-description').outerText", + "required_contents": { + "must_include": [ + "llama" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 328 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 664, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open an issue to {{issue}} in {{repo}}.", + "instantiation_dict": { + "repo": "awesome-python", + "issue": "ask their plans on adding Python 3.11 related 
resources" + }, + "intent": "Open an issue to ask their plans on adding Python 3.11 related resources in awesome-python.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/vinta/awesome-python/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.detail-page-description').outerText", + "required_contents": { + "must_include": [ + "Python 3.11" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 328 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 665, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open an issue to {{issue}} in {{repo}}.", + "instantiation_dict": { + "repo": "a11y-syntax-highlighting", + "issue": "request adding support for MT theme editor" + }, + "intent": "Open an issue to request adding support for MT theme editor in a11y-syntax-highlighting.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.detail-page-description').outerText", + "required_contents": { + "must_include": [ + "MT theme editor" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 328 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 666, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/primer/design", + "geolocation": null, + "intent_template": "Submit a request to merge {{source_branch}} branch into {{target_branch}} branch, assign {{reviewer}} as the reviewer", + "instantiation_dict": { + "source_branch": "dialog-component", + "target_branch": "dialog", + "reviewer": "Carol" + }, + "intent": "Submit a request to merge 
dialog-component branch into dialog branch, assign Carol as the reviewer", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/primer/design/-/merge_requests", + "program_html": [ + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", + "required_contents": { + "exact_match": "dialog" + } + }, + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", + "required_contents": { + "exact_match": "dialog-component" + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.reviewer').outerText", + "required_contents": { + "must_include": [ + "Caroline Stewart" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 335 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 667, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/primer/design", + "geolocation": null, + "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", + "instantiation_dict": { + "source_branch": "dialog-component", + "target_branch": "bump-doctocat", + "reviewer": "primer" + }, + "intent": "Submit a merge request for dialog-component branch to be merged into bump-doctocat branch, assign primer as the reviewer", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/primer/design/-/merge_requests", + "program_html": [ + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", + "required_contents": { + "exact_match": "bump-doctocat" + } + }, + { + "url": "last", + "locator": 
"document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", + "required_contents": { + "exact_match": "dialog-component" + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.reviewer').outerText", + "required_contents": { + "must_include": [ + "Primer" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 335 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 668, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", + "instantiation_dict": { + "source_branch": "a11yproject.com/redesign", + "target_branch": "master", + "reviewer": "Justin Armstrong" + }, + "intent": "Submit a merge request for a11yproject.com/redesign branch to be merged into master branch, assign Justin Armstrong as the reviewer", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests", + "program_html": [ + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", + "required_contents": { + "exact_match": "redesign" + } + }, + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", + "required_contents": { + "exact_match": "main" + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.reviewer').outerText", + "required_contents": { + "must_include": [ + "Justin Armstrong" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 335 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 669, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": 
"__GITLAB__/byteblaze/solarized-prism-theme", + "geolocation": null, + "intent_template": "Open a new issue to discuss the implementation of {{feature}}", + "instantiation_dict": { + "feature": "dark mode" + }, + "intent": "Open a new issue to discuss the implementation of dark mode", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/byteblaze/solarized-prism-theme/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.detail-page-description').outerText", + "required_contents": { + "must_include": [ + "implementation", + "dark mode" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 337 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 670, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/byteblaze/dotfiles", + "geolocation": null, + "intent_template": "Open a new issue to discuss the implementation of {{feature}}", + "instantiation_dict": { + "feature": "default plugins for .zsh" + }, + "intent": "Open a new issue to discuss the implementation of default plugins for .zsh", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/byteblaze/dotfiles/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.detail-page-description').outerText", + "required_contents": { + "must_include": [ + "implementation", + "default plugins", + "zsh" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 337 + }, + { + "sites": [ + "shopping", + "reddit" + ], + "task_id": 671, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in 
the games subreddit under the title \"real user feedback on {{product}}\"", + "instantiation_dict": { + "product": "Sony Computer Entertainment VR", + "rating": "2 stars and less" + }, + "intent": "Gather the titles of Sony Computer Entertainment VR reviews with 2 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Sony Computer Entertainment VR\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/gaming", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__title').outerText", + "required_contents": { + "exact_match": "real user feedback on Sony Computer Entertainment VR" + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "must_include": [ + "didn't last a year without issues", + "Disappointing. Didn't last long before it stopped powering on and needed to be sent in for repair.", + "Received used items!!" 
+ ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 101 + }, + { + "sites": [ + "shopping", + "reddit" + ], + "task_id": 672, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", + "instantiation_dict": { + "product": "Nintendo Switch Fortnite Wildcat Console EU", + "rating": "3 stars and less" + }, + "intent": "Gather the titles of Nintendo Switch Fortnite Wildcat Console EU reviews with 3 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Nintendo Switch Fortnite Wildcat Console EU\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/gaming", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__title').outerText", + "required_contents": { + "exact_match": "real user feedback on Nintendo Switch Fortnite Wildcat Console EU" + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "must_include": [ + "EU charger and wild cat card doesn\u2019t even work!", + "REFUND REJECTED", + "Charging port not compatible", + "not compatible in the US", + "Wildcard Bonus Credits Not Redeemable!", + "Code not available!!" 
+ ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 101 + }, + { + "sites": [ + "shopping", + "reddit" + ], + "task_id": 673, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", + "instantiation_dict": { + "product": "Racing Wheel Overdrive for Xbox X", + "rating": "1 star" + }, + "intent": "Gather the titles of Racing Wheel Overdrive for Xbox X reviews with 1 star rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Racing Wheel Overdrive for Xbox X\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/gaming", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__title').outerText", + "required_contents": { + "exact_match": "real user feedback on Racing Wheel Overdrive for Xbox X" + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "must_include": [ + "Unable to set neutral steering", + "Doesn\u2019t work with PC", + "Crazy problems in automatic mode", + "pedals stopped working", + "Only works with certain games" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 101 + }, + { + "sites": [ + "shopping", + "reddit" + ], + "task_id": 674, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title 
\"real user feedback on {{product}}\"", + "instantiation_dict": { + "product": "Doc and Pies Arcade Factory Cocktail Arcade Machine", + "rating": "3 stars and less" + }, + "intent": "Gather the titles of Doc and Pies Arcade Factory Cocktail Arcade Machine reviews with 3 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Doc and Pies Arcade Factory Cocktail Arcade Machine\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/gaming", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__title').outerText", + "required_contents": { + "exact_match": "real user feedback on Doc and Pies Arcade Factory Cocktail Arcade Machine" + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "must_include": [ + "Poorly Made Exterior. 
Consider a different Company.", + "piece of junk ,..can't believe I spent money on this !!!!", + "Based arrived broken but game itself works" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 101 + }, + { + "sites": [ + "shopping", + "reddit" + ], + "task_id": 675, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", + "instantiation_dict": { + "product": "HORI 3D Surround Gaming Neckset", + "rating": "2 stars and less" + }, + "intent": "Gather the titles of HORI 3D Surround Gaming Neckset reviews with 2 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on HORI 3D Surround Gaming Neckset\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/gaming", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__title').outerText", + "required_contents": { + "exact_match": "real user feedback on HORI 3D Surround Gaming Neckset" + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "must_include": [ + "Not worth it for PC users", + "I really wanted to like this.", + "I wish this was better..." 
+ ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 101 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 676, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Lookup orders that are {{status}}", + "instantiation_dict": { + "status": "suspected of being fraudulent" + }, + "intent": "Lookup orders that are suspected of being fraudulent", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales/order/", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText", + "required_contents": { + "must_include": [ + "Suspected Fraud" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 253 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 677, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Lookup orders that are {{status}}", + "instantiation_dict": { + "status": "processing" + }, + "intent": "Lookup orders that are processing", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales/order/", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText", + "required_contents": { + "must_include": [ + "Processing" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 253 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 678, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Lookup orders that 
are {{status}}", + "instantiation_dict": { + "status": "canceled" + }, + "intent": "Lookup orders that are canceled", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales/order/", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText", + "required_contents": { + "must_include": [ + "Canceled" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 253 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 679, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Lookup orders that are {{status}}", + "instantiation_dict": { + "status": "completed" + }, + "intent": "Lookup orders that are completed", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales/order/", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText", + "required_contents": { + "must_include": [ + "Complete" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 253 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 680, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Lookup orders that are {{status}}", + "instantiation_dict": { + "status": "on hold" + }, + "intent": "Lookup orders that are on hold", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales/order/", + "program_html": [ + { + "url": "last", + "locator": 
"document.querySelector(\"div.admin__data-grid-filters-current\").outerText", + "required_contents": { + "must_include": [ + "On Hold" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 253 + }, + { + "sites": [ + "reddit", + "gitlab" + ], + "task_id": 681, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit", + "instantiation_dict": { + "topic": "gan implementation" + }, + "intent": "Find a GitLab repository related to gan implementation and make a Reddit post linking to it in a relevant subreddit", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/deeplearning |OR| __REDDIT__/f/technology", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "gan implementation", + "__GITLAB__/eriklindernoren/PyTorch-GAN" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 116 + }, + { + "sites": [ + "reddit", + "gitlab" + ], + "task_id": 682, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit", + "instantiation_dict": { + "topic": "chatGPT" + }, + "intent": "Find a GitLab repository related to chatGPT and make a Reddit post linking to it in a relevant subreddit", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/singularity |OR| 
__REDDIT__/f/deeplearning |OR| __REDDIT__/f/technology", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "chatgpt", + "__GITLAB__/convexegg/chatgpt" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 116 + }, + { + "sites": [ + "reddit", + "gitlab" + ], + "task_id": 683, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit", + "instantiation_dict": { + "topic": "metaseq" + }, + "intent": "Find a GitLab repository related to metaseq and make a Reddit post linking to it in a relevant subreddit", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/deeplearning", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "metaseq", + "__GITLAB__/root/metaseq" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 116 + }, + { + "sites": [ + "reddit", + "gitlab" + ], + "task_id": 684, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", + "instantiation_dict": { + "repo": "byteblaze/cloud-to-butt", + "subreddit": "LifeProTips" + }, + "intent": "Promote byteblaze/cloud-to-butt to subreddit LifeProTips with the description from the repo itself.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": 
"__REDDIT__/f/LifeProTips", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "Chrome extension that replaces occurrences of 'the cloud' with 'my butt'", + "__GITLAB__/byteblaze/cloud-to-butt" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 117 + }, + { + "sites": [ + "reddit", + "gitlab" + ], + "task_id": 685, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", + "instantiation_dict": { + "repo": "byteblaze/dotfiles", + "subreddit": "aww" + }, + "intent": "Promote byteblaze/dotfiles to subreddit aww with the description from the repo itself.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/aww", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "Computer setup", + "__GITLAB__/byteblaze/dotfiles" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 117 + }, + { + "sites": [ + "reddit", + "gitlab" + ], + "task_id": 686, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", + "instantiation_dict": { + "repo": 
"auth0/angular-storage", + "subreddit": "technology" + }, + "intent": "Promote auth0/angular-storage to subreddit technology with the description from the repo itself.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/technology", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "A storage library for AngularJS done right", + "__GITLAB__/auth0/angular-storage" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 117 + }, + { + "sites": [ + "reddit", + "gitlab" + ], + "task_id": 687, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", + "instantiation_dict": { + "repo": "koush/AndroidAsync", + "subreddit": "funny" + }, + "intent": "Promote koush/AndroidAsync to subreddit funny with the description from the repo itself.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/funny", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "Asynchronous socket, http(s) (client+server) and websocket library for android. 
Based on nio, not threads.", + "__GITLAB__/koush/AndroidAsync" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 117 + }, + { + "sites": [ + "reddit", + "gitlab" + ], + "task_id": 688, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", + "instantiation_dict": { + "repo": "lahwaacz/arch-wiki-docs", + "subreddit": "science" + }, + "intent": "Promote lahwaacz/arch-wiki-docs to subreddit science with the description from the repo itself.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/science", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "A script to download pages from Arch Wiki for offline browsing", + "__GITLAB__/lahwaacz/arch-wiki-docs" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 117 + }, + { + "sites": [ + "shopping" + ], + "task_id": 689, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}", + "instantiation_dict": { + "reason": "I am a loyal customer" + }, + "intent": "Draft an email to the shop owner via their contact us function for a coupon as I am a loyal customer", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { 
+ "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "coupon", + "I am a loyal customer" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 163 + }, + { + "sites": [ + "shopping" + ], + "task_id": 690, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}", + "instantiation_dict": { + "reason": "they promised me a coupon last time" + }, + "intent": "Draft an email to the shop owner via their contact us function for a coupon as they promised me a coupon last time", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "coupon", + "promised" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 163 + }, + { + "sites": [ + "shopping" + ], + "task_id": 691, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}", + "instantiation_dict": { + "reason": "I plan to make a bulk purchase" + }, + "intent": "Draft an email to the shop owner via their contact us function for a coupon as I plan to make a bulk purchase", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your 
mind?\"').value", + "required_contents": { + "must_include": [ + "coupon", + "bulk purchase" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 163 + }, + { + "sites": [ + "shopping" + ], + "task_id": 692, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}", + "instantiation_dict": { + "reason": "I am a student" + }, + "intent": "Draft an email to the shop owner via their contact us function for a coupon as I am a student", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "coupon", + "student" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 163 + }, + { + "sites": [ + "shopping" + ], + "task_id": 693, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}", + "instantiation_dict": { + "reason": "my refund is suppoed to be replaced by a coupon" + }, + "intent": "Draft an email to the shop owner via their contact us function for a coupon as my refund is suppoed to be replaced by a coupon", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "coupon", + "refund" + ] + } + } + ], + 
"url_note": "GOLD in PRED" + }, + "intent_template_id": 163 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 694, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", + "instantiation_dict": { + "product": "Energy-Bulk Women Shirt", + "stock": "50", + "size": "S", + "color": "blue", + "price": "60" + }, + "intent": "Add a simple product named Energy-Bulk Women Shirt with 50 in stock, available in size S and color blue, priced at $60", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/catalog/product", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "60.00" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[name]\"').value", + "required_contents": { + "must_include": [ + "Energy-Bulk Women Shirt" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "50" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-role=\"selected-option\"').outerText", + "required_contents": { + "must_include": [ + "top" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[size]\"').value", + "required_contents": { + "exact_match": "167" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[color]\"').value", + "required_contents": { + "exact_match": "50" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-index=\"category_ids\"').outerText", + "required_contents": { + 
"must_include": [ + "tops" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 256 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 695, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", + "instantiation_dict": { + "product": "Energy-Bulk Man Yoga Pant", + "stock": "50", + "size": "38", + "color": "yellow", + "price": "69.99" + }, + "intent": "Add a simple product named Energy-Bulk Man Yoga Pant with 50 in stock, available in size 38 and color yellow, priced at $69.99", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/catalog/product", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "69.99" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[name]\"').value", + "required_contents": { + "must_include": [ + "Energy-Bulk Man Yoga Pant" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "50" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-role=\"selected-option\"').outerText", + "required_contents": { + "must_include": [ + "bottom" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[size]\"').value", + "required_contents": { + "exact_match": "179" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[color]\"').value", + "required_contents": { + "exact_match": "60" + } + }, + { + "url": "last", + "locator": 
"document.querySelector('[data-index=\"category_ids\"').outerText", + "required_contents": { + "must_include": [ + "bottoms" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 256 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 696, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", + "instantiation_dict": { + "product": "FancyBoy Man Causal Jeans", + "stock": "42", + "size": "34", + "color": "Blue", + "price": "169.99" + }, + "intent": "Add a simple product named FancyBoy Man Causal Jeans with 42 in stock, available in size 34 and color Blue, priced at $169.99", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/catalog/product", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"product[name]\"').value", + "required_contents": { + "must_include": [ + "FancyBoy Man Causal Jeans" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "42" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "169.99" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-role=\"selected-option\"').outerText", + "required_contents": { + "must_include": [ + "bottom" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[size]\"').value", + "required_contents": { + "exact_match": "177" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[color]\"').value", + 
"required_contents": { + "exact_match": "50" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-index=\"category_ids\"').outerText", + "required_contents": { + "must_include": [ + "bottoms" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 256 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 697, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", + "instantiation_dict": { + "product": "Swaatch Smart Watch", + "stock": "42", + "size": "uni-size", + "color": "Blue", + "price": "769.99" + }, + "intent": "Add a simple product named Swaatch Smart Watch with 42 in stock, available in size uni-size and color Blue, priced at $769.99", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/catalog/product", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"product[name]\"').value", + "required_contents": { + "must_include": [ + "Swaatch Smart Watch" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "42" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "769.99" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-role=\"selected-option\"').outerText", + "required_contents": { + "must_include": [ + "gear" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[color]\"').value", + "required_contents": { + "exact_match": "50" + } + }, + { + "url": "last", + "locator": 
"document.querySelector('[data-index=\"category_ids\"').outerText", + "required_contents": { + "must_include": [ + "watches" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 256 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 698, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", + "instantiation_dict": { + "product": "Lelelumon Yoga Mat", + "stock": "42", + "size": "uni-size", + "color": "black", + "price": "769.99" + }, + "intent": "Add a simple product named Lelelumon Yoga Mat with 42 in stock, available in size uni-size and color black, priced at $769.99", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/catalog/product", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"product[name]\"').value", + "required_contents": { + "must_include": [ + "Lelelumon Yoga Mat" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "42" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "769.99" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-role=\"selected-option\"').outerText", + "required_contents": { + "must_include": [ + "gear" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[color]\"').value", + "required_contents": { + "exact_match": "49" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-index=\"category_ids\"').outerText", + "required_contents": 
{ + "must_include": [ + "fitness equipment" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 256 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 699, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", + "instantiation_dict": { + "topic": "spring sale", + "rule": "a 20 percent discount site-wide" + }, + "intent": "Draft a new marketing price rule for spring sale that offers a 20 percent discount site-wide for all customers", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"[name='name'\").value", + "required_contents": { + "must_include": [ + "spring sale" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex", + "required_contents": { + "exact_match": "0" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"simple_action\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], + "required_contents": { + "exact_match": "by_percent" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"discount_amount\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], + "required_contents": { + "exact_match": "20" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 258 + }, + { + "sites": [ + 
"shopping_admin" + ], + "task_id": 700, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", + "instantiation_dict": { + "topic": "fall discount", + "rule": "$10 discount on checkout" + }, + "intent": "Draft a new marketing price rule for fall discount that offers $10 discount on checkout for all customers", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"[name='name'\").value", + "required_contents": { + "must_include": [ + "fall discount" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex", + "required_contents": { + "exact_match": "0" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"simple_action\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], + "required_contents": { + "exact_match": "cart_fixed" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"discount_amount\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], + "required_contents": { + "exact_match": "10" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 258 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 701, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + 
"geolocation": null, + "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", + "instantiation_dict": { + "topic": "Mother's day sale", + "rule": "$15 discount on checkout" + }, + "intent": "Draft a new marketing price rule for Mother's day sale that offers $15 discount on checkout for all customers", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"[name='name'\").value", + "required_contents": { + "must_include": [ + "Mother's day sale" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex", + "required_contents": { + "exact_match": "0" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"simple_action\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], + "required_contents": { + "exact_match": "cart_fixed" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"discount_amount\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], + "required_contents": { + "exact_match": "15" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 258 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 702, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", + 
"instantiation_dict": { + "topic": "Pride Month", + "rule": "45% off on all products" + }, + "intent": "Draft a new marketing price rule for Pride Month that offers 45% off on all products for all customers", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"[name='name'\").value", + "required_contents": { + "must_include": [ + "Pride Month" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex", + "required_contents": { + "exact_match": "0" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"simple_action\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], + "required_contents": { + "exact_match": "by_percent" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"discount_amount\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], + "required_contents": { + "exact_match": "45" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 258 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 703, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", + "instantiation_dict": { + "topic": "Thanks giving sale", + "rule": "$40 discount on checkout" + }, + "intent": "Draft a new marketing price rule for Thanks giving sale 
that offers $40 discount on checkout for all customers", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"[name='name'\").value", + "required_contents": { + "must_include": [ + "Thanks giving sale" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex", + "required_contents": { + "exact_match": "0" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"simple_action\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], + "required_contents": { + "exact_match": "cart_fixed" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"discount_amount\"').value", + "prep_actions": ["document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"], + "required_contents": { + "exact_match": "40" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 258 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 704, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + "instantiation_dict": { + "report": "sales order report", + "time_span": "for last month" + }, + "intent": "Today is 3/15/2023, generate a sales order report for last month", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": 
"__SHOPPING_ADMIN__/reports/report_sales/sales", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "2/1/23" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "2/28/23" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 268 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 705, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + "instantiation_dict": { + "report": "sales order report", + "time_span": "over the last 45 days" + }, + "intent": "Today is 3/15/2023, generate a sales order report over the last 45 days", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/sales", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "1/29/23" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "3/15/23" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 268 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 706, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + "instantiation_dict": { + "report": "refund report", + "time_span": "for Q1" + }, + "intent": "Today is 3/15/2023, generate a refund report for Q1", + "require_reset": false, + "eval": { + "eval_types": [ + 
"url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/refunded", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "1/1/23" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "3/31/23" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 268 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 707, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + "instantiation_dict": { + "report": "sales order report", + "time_span": "for last year" + }, + "intent": "Today is 3/15/2023, generate a sales order report for last year", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/sales", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "1/1/22" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "12/31/22" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 268 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 708, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + "instantiation_dict": { + "report": "tax report", + "time_span": "for this year" + }, + "intent": "Today is 3/15/2023, generate a tax 
report for this year", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/tax/", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "1/1/23" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "12/31/23" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 268 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 709, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Create an {{type}} report from {{start_date}} to {{end_date}}", + "instantiation_dict": { + "type": "orders", + "start_date": "beginning of May 2021", + "end_date": "end of March 2022" + }, + "intent": "Create an orders report from beginning of May 2021 to end of March 2022", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/sales", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "5/1/21" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "3/31/22" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 271 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 710, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", + 
"instantiation_dict": { + "type": "shipping", + "start_date": "08/05/2022", + "end_date": "03/01/2023" + }, + "intent": "Create a shipping report from 08/05/2022 to 03/01/2023", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/shipping", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "8/5/22" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "3/1/23" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 271 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 711, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", + "instantiation_dict": { + "type": "product view", + "start_date": "07/05/2021", + "end_date": "05/31/2023" + }, + "intent": "Create a product view report from 07/05/2021 to 05/31/2023", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_product/viewed/", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "7/5/21" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "5/31/23" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 271 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 712, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": 
"__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", + "instantiation_dict": { + "type": "coupons", + "start_date": "05/01/2021", + "end_date": "05/15/2023" + }, + "intent": "Create a coupons report from 05/01/2021 to 05/15/2023", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/coupons/", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "5/1/21" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "5/15/23" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 271 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 713, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", + "instantiation_dict": { + "type": "best sellers", + "start_date": "05/01/2022", + "end_date": "05/31/2023" + }, + "intent": "Create a best sellers report from 05/01/2022 to 05/31/2023", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/bestsellers/", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "5/1/22" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "5/31/23" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 271 + }, + { + "sites": [ + "reddit" 
+ ], + "task_id": 714, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", + "instantiation_dict": { + "subreddit": "gadgets", + "k": "1" + }, + "intent": "Thumbs down the top 1 post ever in gadgets.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/gadgets/19459/a-custom-gaming-pc-built-inside-a-vintage-1940s-motorola", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 24 + }, + { + "sites": [ + "reddit" + ], + "task_id": 715, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", + "instantiation_dict": { + "subreddit": "history", + "k": "2" + }, + "intent": "Thumbs down the top 2 post ever in history.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/history/84338/the-scientist-who-discovered-sperm-was-so-grossed-out-he", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/history/105990/4-500-year-old-sumerian-temple-dedicated-to-mighty-thunder", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 24 + 
}, + { + "sites": [ + "reddit" + ], + "task_id": 716, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", + "instantiation_dict": { + "k": "3", + "subreddit": "books" + }, + "intent": "Thumbs down the top 3 post ever in books.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/books/81371/the-letters-of-t-s-eliot-to-emily-hale-that-were-kept-sealed", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/books/59447/appalachian-prison-book-project-seeks-notebook-donations-the", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 24 + }, + { + "sites": [ + "reddit" + ], + "task_id": 717, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", + "instantiation_dict": { + "k": "4", + "subreddit": "movies" + }, + "intent": "Thumbs down the top 4 post ever in movies.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": 
"__REDDIT__/f/movies/86174/who-will-win-the-oscar-for-actress-in-a-supporting-role", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/movies/86029/who-will-win-the-oscar-for-film-editing", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/movies/86055/cindy-williams-dies-laverne-amp-shirley-star-who-appeared-in", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/movies/42682/michelle-yeoh-to-receive-palm-springs-film-festival-s", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 24 + }, + { + "sites": [ + "reddit" + ], + "task_id": 718, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", + "instantiation_dict": { + "k": "5", + "subreddit": "technology" + }, + "intent": "Thumbs down the top 5 post ever in technology.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/technology/48670/brain-cancer-vaccine-succeeds-at-prolonging-survival-in", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote 
vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/technology/134696/india-cuts-internet-for-27-million-people-amid-search-for", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/technology/48785/us-judge-orders-amazon-to-cease-and-desist-anti-union", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/technology/70354/activision-s-boston-studio-workers-announce-unionization", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/technology/70233/social-media-influencers-are-charged-with-feeding-followers", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 24 + }, + { + "sites": [ + "reddit" + ], + "task_id": 719, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "subreddit": "wallstreetbets", + "user": "ThetaGang_wsb" + }, + "intent": "Like all submissions created by ThetaGang_wsb in subreddit wallstreetbets", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/wallstreetbets/29478/how-will-airbnb-close-following-their-earnings-report-on", + "locator": 
"document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/wallstreetbets/29458/how-much-will-the-federal-reserve-raise-interest-rates-in", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 25 + }, + { + "sites": [ + "reddit" + ], + "task_id": 720, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "subreddit": "earthporn", + "user": "CameronKelsey" + }, + "intent": "Like all submissions created by CameronKelsey in subreddit earthporn", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/EarthPorn/98332/my-favorite-place-on-the-planet-henry-s-fork-of-the-snake", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/EarthPorn/98297/2-years-later-this-is-still-one-of-the-most-incredible", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/EarthPorn/98256/i-can-t-wait-for-all-this-green-to-start-coming-back-little", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + 
}, + "intent_template_id": 25 + }, + { + "sites": [ + "reddit" + ], + "task_id": 721, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "user": "UniversityofBath", + "subreddit": "IAmA" + }, + "intent": "Like all submissions created by UniversityofBath in subreddit IAmA", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/IAmA/119742/hi-i-m-vienne-a-doctoral-student-at-the-university-of-bath-i", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/IAmA/119719/hello-reddit-i-m-nazia-mehrban-a-lecturer-in-biotechnology", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/IAmA/119714/i-m-ellie-jarvis-she-her-a-2nd-year-phd-student-in-the", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/IAmA/55155/hi-i-m-dr-lucy-maddox-from-bath-university-uk-i-m-a-clinical", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/IAmA/55142/we-re-sadeka-nujhat-hannah-leese-and-sandhya-moise-from-the", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + 
"required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/IAmA/34032/we-re-sandhya-moise-david-phillips-and-chan-lee-from-the", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/IAmA/13175/hi-i-m-kit-yates-i-m-a-mathematical-biologist-at-the", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/IAmA/13170/hello-i-m-dr-sara-fontani-from-the-university-of", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 25 + }, + { + "sites": [ + "reddit" + ], + "task_id": 722, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "user": "Don_Gato1", + "subreddit": "new york" + }, + "intent": "Like all submissions created by Don_Gato1 in subreddit new york", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/nyc/44650/fox-news-hosts-cast-new-york-as-crime-ridden-and-chaotic", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 25 + }, + { + "sites": [ + "reddit" + ], + "task_id": 723, + "require_login": true, + "storage_state": 
"./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "user": "FTorrez81", + "subreddit": "iphone13" + }, + "intent": "Like all submissions created by FTorrez81 in subreddit iphone13", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "reference_answer_raw_annotation": "N/A", + "string_note": "FTorrez81 does not have any submissions in iphone13" + }, + "intent_template_id": 25, + "string_note": "FTorrez81 has no submissions in subreddit iphone13" + }, + { + "sites": [ + "reddit" + ], + "task_id": 724, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "user": "Hrekires", + "subreddit": "news" + }, + "intent": "Like all submissions created by Hrekires in subreddit news", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/news/129816/gov-whitmer-signs-bills-to-repeal-right-to-work-restore", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/129808/disney-world-deal-with-union-will-raise-minimum-wage-to-18", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/129794/judge-halts-wyoming-abortion-ban-days-after-it-took-effect", + "locator": 
"document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/129783/don-t-say-gay-lawmaker-pleads-guilty-to-covid-relief-fraud", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/129594/arizona-gov-katie-hobbs-refuses-to-proceed-with-execution", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/129508/tennessee-governor-oks-bill-to-cut-nashville-council-in-half", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/43839/philadelphia-da-larry-krasner-impeached-by-pa-house", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/43781/crypto-giant-ftx-to-file-for-bankruptcy-ceo-sam-bankman", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/43572/sec-doj-investigating-crypto-platform-ftx", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": 
"__REDDIT__/f/news/43558/kansas-gov-laura-kelly-wins-re-election-defeating-gop", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 25 + }, + { + "sites": [ + "reddit" + ], + "task_id": 725, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "subreddit": "massachusetts", + "user": "RickyDontLoseThat" + }, + "intent": "DisLike all submissions created by RickyDontLoseThat in subreddit massachusetts", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/massachusetts/84954/the-last-of-lincoln", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 1510 + }, + { + "sites": [ + "reddit" + ], + "task_id": 726, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "subreddit": "earthporn", + "user": "jacyanthis" + }, + "intent": "DisLike all submissions created by jacyanthis in subreddit earthporn", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "jacyanthis does not have any submissions in earthporn", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 1510 + }, + 
{ + "sites": [ + "reddit" + ], + "task_id": 727, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "user": "PatientBuilder499", + "subreddit": "videos" + }, + "intent": "DisLike all submissions created by PatientBuilder499 in subreddit videos", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/videos/115139/hundreds-of-civilian-turkish-volunteers-waiting-to-be-sent", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 1510 + }, + { + "sites": [ + "reddit" + ], + "task_id": 728, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "user": "sirbarani", + "subreddit": "sports" + }, + "intent": "DisLike all submissions created by sirbarani in subreddit sports", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/sports/48303/iran-football-legend-daei-will-not-attend-world-cup-amid", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 1510 + }, + { + "sites": [ + "reddit" + ], + "task_id": 729, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", 
+ "geolocation": null, + "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "user": "AdamCannon", + "subreddit": "UpliftingNews" + }, + "intent": "DisLike all submissions created by AdamCannon in subreddit UpliftingNews", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/UpliftingNews/16087/same-sex-marriage-is-now-legal-in-all-of-mexico-s-states", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 1510 + }, + { + "sites": [ + "reddit" + ], + "task_id": 730, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "user": "Hrekires", + "subreddit": "news" + }, + "intent": "DisLike all submissions created by Hrekires in subreddit news", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/news/129816/gov-whitmer-signs-bills-to-repeal-right-to-work-restore", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/129808/disney-world-deal-with-union-will-raise-minimum-wage-to-18", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": 
"__REDDIT__/f/news/129794/judge-halts-wyoming-abortion-ban-days-after-it-took-effect", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/129783/don-t-say-gay-lawmaker-pleads-guilty-to-covid-relief-fraud", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/129594/arizona-gov-katie-hobbs-refuses-to-proceed-with-execution", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/129508/tennessee-governor-oks-bill-to-cut-nashville-council-in-half", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/43839/philadelphia-da-larry-krasner-impeached-by-pa-house", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/43781/crypto-giant-ftx-to-file-for-bankruptcy-ceo-sam-bankman", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/43572/sec-doj-investigating-crypto-platform-ftx", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote 
vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/43558/kansas-gov-laura-kelly-wins-re-election-defeating-gop", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 1510 + }, + { + "sites": [ + "reddit" + ], + "task_id": 731, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", + "instantiation_dict": { + "post": "Nvidia RTX 4090", + "content": "EDIT: This news aged well" + }, + "intent": "Edit my post on Nvidia RTX 4090 by adding a line to the body that says \"EDIT: This news aged well\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/MachineLearning/1/nvidia-rtx-4090", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "must_include": [ + "EDIT: This news aged well", + "Crazy device for ML!" + ] + } + } + ] + }, + "intent_template_id": 27 + }, + { + "sites": [ + "reddit" + ], + "task_id": 732, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", + "instantiation_dict": { + "post": "The Night Agent", + "content": "Done watching, pretty cool!" 
+ }, + "intent": "Edit my post on The Night Agent by adding a line to the body that says \"Done watching, pretty cool!\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/television/134868/the-night-agent-renewed-for-season-2-at-netflix", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "exact_match": "Done watching, pretty cool!" + } + } + ] + }, + "intent_template_id": 27 + }, + { + "sites": [ + "reddit" + ], + "task_id": 733, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", + "instantiation_dict": { + "post": "Star Trek Starfleet Academy series", + "content": "Every watch makes me feel like a kid again" + }, + "intent": "Edit my post on Star Trek Starfleet Academy series by adding a line to the body that says \"Every watch makes me feel like a kid again\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/television/135201/star-trek-starfleet-academy-series-from-alex-kurtzman-and", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "exact_match": "Every watch makes me feel like a kid again" + } + } + ] + }, + "intent_template_id": 27 + }, + { + "sites": [ + "reddit" + ], + "task_id": 734, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", + "instantiation_dict": { + "post": "Ted Lasso", + "content": "Done watching. I love the renew!" 
+ }, + "intent": "Edit my post on Ted Lasso by adding a line to the body that says \"Done watching. I love the renew!\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/television/135156/ted-lasso-season-3-premiere-scores-870k-u-s-households-up-59", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "exact_match": "Done watching. I love the renew!" + } + } + ] + }, + "intent_template_id": 27 + }, + { + "sites": [ + "reddit" + ], + "task_id": 735, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", + "instantiation_dict": { + "post": "Lord of the Rings", + "content": "The cast is amazing!" + }, + "intent": "Edit my post on Lord of the Rings by adding a line to the body that says \"The cast is amazing!\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/television/135152/lord-of-the-rings-the-rings-of-power-season-2-cast-adds", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "exact_match": "The cast is amazing!" 
+ } + } + ] + }, + "intent_template_id": 27 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 736, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Make the LICENSE of {{repo}} to MIT license.", + "instantiation_dict": { + "repo": "gimmiethat.space and dotfiles" + }, + "intent": "Make the LICENSE of gimmiethat.space and dotfiles to MIT license.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/blob/main/LICENSE", + "locator": "", + "required_contents": { + "must_include": [ + "MIT license", + "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software." + ] + } + }, + { + "url": "__GITLAB__/byteblaze/dotfiles/-/blob/main/LICENSE", + "locator": "", + "required_contents": { + "must_include": [ + "MIT license", + "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software." 
+ ] + } + } + ] + }, + "intent_template_id": 355 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 737, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", + "instantiation_dict": { + "location": "Carnegie Mellon University", + "sport_team": "Philadelphia 76ers", + "time": "" + }, + "intent": "Show me the way from Carnegie Mellon University to the home stadium of Philadelphia 76ers ", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Carnegie Mellon University", + "Pittsburgh" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Wells Fargo Center", + "South Philadelphia Sports Complex" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + } + ] + }, + "intent_template_id": 94 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 738, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", + "instantiation_dict": { + "location": "Carnegie Mellon University", + "sport_team": "Philadelphia 76ers", + "time": "in the 70th" + }, + "intent": "Show me the way from Carnegie Mellon University to the home stadium of Philadelphia 76ers in the 70th", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": 
"last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Carnegie Mellon University", + "Pittsburgh" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "3601 South Broad Street", + "South Philadelphia" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + } + ] + }, + "intent_template_id": 94 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 739, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", + "instantiation_dict": { + "location": "Carnegie Mellon University", + "sport_team": "Yankees", + "time": "in the 80th" + }, + "intent": "Show me the way from Carnegie Mellon University to the home stadium of Yankees in the 80th", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Carnegie Mellon University", + "Pittsburgh" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Yankee Stadium", + "East 161st Street" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + } + ] + }, + "intent_template_id": 94 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 740, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + 
"intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", + "instantiation_dict": { + "location": "Carnegie Mellon University", + "sport_team": "NYC NBA team", + "time": "" + }, + "intent": "Show me the way from Carnegie Mellon University to the home stadium of NYC NBA team ", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Carnegie Mellon University", + "Pittsburgh" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Madison Square Garden", + "Pennsylvania Plaza", + "Manhattan", + "New York" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + } + ] + }, + "intent_template_id": 94 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 741, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", + "instantiation_dict": { + "location": "Carnegie Mellon University", + "sport_team": "Boston home NBA team", + "time": "" + }, + "intent": "Show me the way from Carnegie Mellon University to the home stadium of Boston home NBA team ", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Carnegie Mellon University", + "Pittsburgh" + ] + } + }, + { + "url": "last", + "locator": 
"document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "TD Garden", + "Boston", + "Massachusetts" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + } + ] + }, + "intent_template_id": 94 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 742, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a new {{scope}} project \"planner\" and add {{account_list}} as members", + "instantiation_dict": { + "scope": "private", + "account_list": "Abishek, Vinta" + }, + "intent": "Create a new private project \"planner\" and add Abishek, Vinta as members", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/planner", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/planner/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@abisubramanya27", + "@vinta" + ] + } + } + ] + }, + "intent_template_id": 332 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 743, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a new {{scope}} project \"web_arena\" and add {{account_list}} as members", + "instantiation_dict": { + "scope": "public", + "account_list": "Abishek, Vinta" + }, + "intent": "Create a new public project \"web_arena\" and add Abishek, Vinta as members", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + 
"url": "__GITLAB__/byteblaze/web_arena", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "public" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/web_arena/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@abisubramanya27", + "@vinta" + ] + } + } + ] + }, + "intent_template_id": 332 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 744, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a new {{scope}} project \"AutoAGI\" and add {{account_list}} as members", + "instantiation_dict": { + "scope": "public", + "account_list": "primer" + }, + "intent": "Create a new public project \"AutoAGI\" and add primer as members", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/AutoAGI", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "public" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/AutoAGI/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@primer" + ] + } + } + ] + }, + "intent_template_id": 332 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 745, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a new {{scope}} project \"awesome-llms\" and add {{account_list}} as members", + "instantiation_dict": { + "scope": "public", + "account_list": "primer, convexegg, abishek" + }, + "intent": "Create a new public project \"awesome-llms\" and add primer, convexegg, abishek as members", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", 
+ "program_html": [ + { + "url": "__GITLAB__/byteblaze/awesome-llms", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "public" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/awesome-llms/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@primer", + "@convexegg", + "@abisubramanya27" + ] + } + } + ] + }, + "intent_template_id": 332 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 746, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a new {{scope}} project \"llm_bulk_inference\" and add {{account_list}} as members", + "instantiation_dict": { + "scope": "private", + "account_list": "primer, convexegg, abishek" + }, + "intent": "Create a new private project \"llm_bulk_inference\" and add primer, convexegg, abishek as members", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/llm_bulk_inference", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/llm_bulk_inference/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@primer", + "@convexegg", + "@abisubramanya27" + ] + } + } + ] + }, + "intent_template_id": 332 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 747, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", + "instantiation_dict": { + "project_name": "awesome_web_agents", + "template": "blank", + "account_list": "Abishek, Vinta" + }, + "intent": 
"Start a private project awesome_web_agents with blank template and add Abishek, Vinta as members", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/awesome_web_agents", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/awesome_web_agents/-/commits", + "locator": "", + "required_contents": { + "must_include": [ + "Initial commit" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/awesome_web_agents/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@abisubramanya27", + "@vinta" + ] + } + } + ] + }, + "intent_template_id": 2100 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 748, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", + "instantiation_dict": { + "project_name": "web_agent_android_xl", + "template": "Android", + "account_list": "primer, convexegg, abishek" + }, + "intent": "Start a private project web_agent_android_xl with Android template and add primer, convexegg, abishek as members", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/web_agent_android_xl", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/web_agent_android_xl/-/commits", + "locator": "", + "required_contents": { + "must_include": [ + "Initialized from 'Android' project template" + ] + } + }, + { + "url": 
"__GITLAB__/byteblaze/web_agent_android_xl/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@primer", + "@convexegg", + "@abisubramanya27" + ] + } + } + ] + }, + "intent_template_id": 2100 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 749, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", + "instantiation_dict": { + "project_name": "project_site", + "template": "NodeJS", + "account_list": "primer, convexegg, vinta" + }, + "intent": "Start a private project project_site with NodeJS template and add primer, convexegg, vinta as members", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/project_site", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/project_site/-/commits", + "locator": "", + "required_contents": { + "must_include": [ + "Initialized from 'NodeJS Express' project template" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/project_site/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@primer", + "@convexegg", + "@vinta" + ] + } + } + ] + }, + "intent_template_id": 2100 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 750, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", + "instantiation_dict": { + "project_name": "agi_index", + "template": "HTML", + "account_list": "Vinta Chen" + }, + "intent": "Start a private 
project agi_index with HTML template and add Vinta Chen as members", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/agi_index", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/agi_index", + "locator": "document.querySelector('.home-panel-description-markdown').outerText", + "required_contents": { + "must_include": [ + "Example plain HTML site using GitLab Pages: https://pages.gitlab.io/plain-html |OR| A plain HTML site that uses Netlify for CI/CD instead of GitLab, but still with all the other great GitLab features." + ] + } + }, + { + "url": "__GITLAB__/byteblaze/agi_index/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "Vinta Chen" + ] + } + } + ] + }, + "intent_template_id": 2100 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 751, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", + "instantiation_dict": { + "project_name": "AGISite", + "template": "JEKYLL", + "account_list": "Rohan and Vinta" + }, + "intent": "Start a private project AGISite with JEKYLL template and add Rohan and Vinta as members", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/AGISite", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/AGISite", + "locator": 
"document.querySelector('.home-panel-description-markdown').outerText", + "required_contents": { + "must_include": [ + "Example Jekyll site using GitLab Pages: https://pages.gitlab.io/jekyll |OR| A Jekyll site that uses Netlify for CI/CD instead of GitLab, but still with all the other great GitLab features." + ] + } + }, + { + "url": "__GITLAB__/byteblaze/AGISite/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@Seirdy", + "@vinta" + ] + } + } + ] + }, + "intent_template_id": 2100 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 752, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", + "instantiation_dict": { + "project_name": "web_agent", + "template": "blank" + }, + "intent": "Create a private blank repository called \"web_agent\" using the right template to speed up development.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/web_agent", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/web_agent/-/commits", + "locator": "", + "required_contents": { + "must_include": [ + "Initial commit" + ] + } + } + ] + }, + "intent_template_id": 332 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 753, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", + "instantiation_dict": { + "project_name": "web_agent_android_xs", + "template": 
"Android" + }, + "intent": "Create a private Android repository called \"web_agent_android_xs\" using the right template to speed up development.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/web_agent_android_xs", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/web_agent_android_xs/-/commits", + "locator": "", + "required_contents": { + "must_include": [ + "Initialized from 'Android' project template" + ] + } + } + ] + }, + "intent_template_id": 332 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 754, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", + "instantiation_dict": { + "project_name": "web_agent_nodejs", + "template": "NodeJS" + }, + "intent": "Create a private NodeJS repository called \"web_agent_nodejs\" using the right template to speed up development.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/web_agent_nodejs", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/web_agent_nodejs/-/commits", + "locator": "", + "required_contents": { + "must_include": [ + "Initialized from 'NodeJS Express' project template" + ] + } + } + ] + }, + "intent_template_id": 332 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 755, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + 
"start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", + "instantiation_dict": { + "project_name": "web_agent_index", + "template": "HTML" + }, + "intent": "Create a private HTML repository called \"web_agent_index\" using the right template to speed up development.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/web_agent_index", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/web_agent_index", + "locator": "document.querySelector('.home-panel-description-markdown').outerText", + "required_contents": { + "must_include": [ + "Example plain HTML site using GitLab Pages: https://pages.gitlab.io/plain-html |OR| A plain HTML site that uses Netlify for CI/CD instead of GitLab, but still with all the other great GitLab features." 
+ ] + } + } + ] + }, + "intent_template_id": 332 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 756, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", + "instantiation_dict": { + "project_name": "11711_gitlab", + "template": "JEKYLL" + }, + "intent": "Create a private JEKYLL repository called \"11711_gitlab\" using the right template to speed up development.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/11711_gitlab", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/11711_gitlab", + "locator": "document.querySelector('.home-panel-description-markdown').outerText", + "required_contents": { + "must_include": [ + "Example Jekyll site using GitLab Pages: https://pages.gitlab.io/jekyll |OR| A Jekyll site that uses Netlify for CI/CD instead of GitLab, but still with all the other great GitLab features." 
+ ] + } + } + ] + }, + "intent_template_id": 332 + }, + { + "sites": [ + "map" + ], + "task_id": 757, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the path and travel time from {{city1}} to {{city2}}.", + "instantiation_dict": { + "city1": "home of the 1980 Super Bowl champions", + "city2": "home of the 1991 Super Bowl champions" + }, + "intent": "Show me the path and travel time from home of the 1980 Super Bowl champions to home of the 1991 Super Bowl champions.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Pittsburgh" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "New York" + ] + } + } + ] + }, + "intent_template_id": 42 + }, + { + "sites": [ + "map" + ], + "task_id": 758, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the path and travel time from {{city1}} to {{city2}}.", + "instantiation_dict": { + "city1": "the big apple", + "city2": "biggest city in Maine" + }, + "intent": "Show me the path and travel time from the big apple to biggest city in Maine.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + 
"locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "New York" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Portland", + "Maine" + ] + } + } + ] + }, + "intent_template_id": 42 + }, + { + "sites": [ + "map", + "shopping_admin" + ], + "task_id": 759, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the route and driving time from {{city1}} to {{city2}}", + "instantiation_dict": { + "city1": "the city where my E-commerce customer Sophia Young lives", + "city2": "New York City" + }, + "intent": "Show me the route and driving time from the city where my E-commerce customer Sophia Young lives to New York City", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Boston" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "New York" + ] + } + } + ] + }, + "intent_template_id": 42 + }, + { + "sites": [ + "map", + "shopping_admin" + ], + "task_id": 760, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the route and driving time from {{city1}} to {{city2}}", + "instantiation_dict": { + "city1": "Allentown, PA", + "city2": "the city where my E-commerce customer Amanda Kim lives" + }, + "intent": "Show me the route and driving time from Allentown, PA to the city 
where my E-commerce customer Amanda Kim lives", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Allentown" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Hoboken", + "New Jersey" + ] + } + } + ] + }, + "intent_template_id": 42 + }, + { + "sites": [ + "map" + ], + "task_id": 761, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Get directions from {{location/address_1}} to {{location/address_2}} using {{transportation}} options.", + "instantiation_dict": { + "location/address_1": "Carnegie Science Museum", + "location/address_2": "Hunt library CMU", + "transportation": "walk" + }, + "intent": "Get directions from Carnegie Science Museum to Hunt library CMU using walk options.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "2" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Carnegie Science Center", + "Allegheny County", + "Pittsburgh" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Hunt Library", + "Pittsburgh" + ] + } + } + ] + }, + 
"intent_template_id": 54 + }, + { + "sites": [ + "map" + ], + "task_id": 762, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Get directions from {{location/address_1}} to {{location/address_2}} using {{transportation}} options.", + "instantiation_dict": { + "location/address_1": "Carnegie Music Hall in NYC", + "location/address_2": "Carnegie Mellon University", + "transportation": "driving" + }, + "intent": "Get directions from Carnegie Music Hall in NYC to Carnegie Mellon University using driving options.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Carnegie Hall", + "West 57th Street", + "Manhattan", + "New York" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Carnegie Mellon University", + "Pittsburgh" + ] + } + } + ] + }, + "intent_template_id": 54 + }, + { + "sites": [ + "map" + ], + "task_id": 763, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", + "instantiation_dict": { + "store": "Trader Joe's", + "location": "401 Shady Ave, Pittsburgh" + }, + "intent": "Find the walkway to the closest Trader Joe's from 401 Shady Ave, Pittsburgh.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": 
"document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "2" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "401, Shady Avenue, Shadyside" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Trader Joe's, 6343, Penn Avenue, East Liberty" + ] + } + } + ] + }, + "intent_template_id": 75 + }, + { + "sites": [ + "map" + ], + "task_id": 764, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", + "instantiation_dict": { + "store": "Target", + "location": "401 Shady Ave, Pittsburgh" + }, + "intent": "Find the walkway to the closest Target from 401 Shady Ave, Pittsburgh.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "2" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "401, Shady Avenue, Shadyside" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Target, 6231, Penn Avenue, East Liberty" + ] + } + } + ] + }, + "intent_template_id": 75 + }, + { + "sites": [ + "map" + ], + "task_id": 765, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", + "instantiation_dict": { + "store": "Japanese food market", + 
"location": "401 Shady Ave, Pittsburgh" + }, + "intent": "Find the walkway to the closest Japanese food market from 401 Shady Ave, Pittsburgh.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "2" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "401, Shady Avenue, Shadyside" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Tokyo Japanese Food Store, 5855, Ellsworth Avenue, Shadyside" + ] + } + } + ] + }, + "intent_template_id": 75 + }, + { + "sites": [ + "map" + ], + "task_id": 766, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", + "instantiation_dict": { + "store": "grocessory owned by Amazon", + "location": "401 Shady Ave, Pittsburgh" + }, + "intent": "Find the walkway to the closest grocessory owned by Amazon from 401 Shady Ave, Pittsburgh.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "2" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "401, Shady Avenue, Shadyside" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Whole Foods 
Market, 5700, Penn Avenue, East Liberty" + ] + } + } + ] + }, + "intent_template_id": 75 + }, + { + "sites": [ + "map" + ], + "task_id": 767, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", + "instantiation_dict": { + "store": "chain grocessory owned by a local business", + "location": "401 Shady Ave, Pittsburgh" + }, + "intent": "Find the walkway to the closest chain grocessory owned by a local business from 401 Shady Ave, Pittsburgh.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "2" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "401, Shady Avenue, Shadyside" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Giant Eagle, 5550, Centre Avenue, Shadyside" + ] + } + } + ] + }, + "intent_template_id": 75 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 768, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "{{quantity}} {{product}} arrived, update the stock", + "instantiation_dict": { + "quantity": "5", + "product": "blue Cronus yoga pants with size 33" + }, + "intent": "5 blue Cronus yoga pants with size 33 arrived, update the stock", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/872/", + "locator": 
"document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "5" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/872/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", + "required_contents": { + "exact_match": "1" + } + } + ] + }, + "intent_template_id": 241 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 769, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "We've received {{quantity}} {{product}}, please update the inventory.", + "instantiation_dict": { + "quantity": "378", + "product": "brown Aero daily fitness tee in every size" + }, + "intent": "We've received 378 brown Aero daily fitness tee in every size, please update the inventory.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/544/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "478" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/547/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "478" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/550/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "478" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/553/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "478" + } + }, + { + "url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/556/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "478" + } + } + ] + }, + "intent_template_id": 241 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 770, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "We've received {{quantity}}, update the inventory.", + "instantiation_dict": { + "quantity": "12 white Cora parachute pant of size 28 and 56 blue of size 29" + }, + "intent": "We've received 12 white Cora parachute pant of size 28 and 56 blue of size 29, update the inventory.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1836/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "112" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1838/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "156" + } + } + ] + }, + "intent_template_id": 241 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 771, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Approve the positive reviews to display in our store.", + "instantiation_dict": {}, + "intent": "Approve the positive reviews to display in our store.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/352", + "locator": 
"document.querySelector('[name=\"status_id\"').value", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/349", + "locator": "document.querySelector('[name=\"status_id\"').value", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/347", + "locator": "document.querySelector('[name=\"status_id\"').value", + "required_contents": { + "exact_match": "1" + } + } + ] + }, + "intent_template_id": 243 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 772, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Delete all {{review_type}}", + "instantiation_dict": { + "review_type": "pending negative reviews for Circe fleece" + }, + "intent": "Delete all pending negative reviews for Circe fleece", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/999", + "locator": "", + "required_contents": { + "must_include": [ + "Rating isn't Available" + ] + } + } + ] + }, + "intent_template_id": 246 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 773, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Delete all {{review_type}}", + "instantiation_dict": { + "review_type": "pending negative reviews" + }, + "intent": "Delete all pending negative reviews", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/351", + "locator": "", + "required_contents": { + "must_include": [ + "Rating isn't Available" + ] + } + }, + { + "url": 
"__SHOPPING_ADMIN__/review/product/edit/id/353", + "locator": "", + "required_contents": { + "must_include": [ + "Rating isn't Available" + ] + } + } + ] + }, + "intent_template_id": 246 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 774, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Delete all {{review_type}}", + "instantiation_dict": { + "review_type": "pending reviews with less than 4 stars" + }, + "intent": "Delete all pending reviews with less than 4 stars", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/351", + "locator": "", + "required_contents": { + "must_include": [ + "Rating isn't Available" + ] + } + }, + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/353", + "locator": "", + "required_contents": { + "must_include": [ + "Rating isn't Available" + ] + } + }, + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/349", + "locator": "", + "required_contents": { + "must_include": [ + "Rating isn't Available" + ] + } + } + ] + }, + "intent_template_id": 246 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 775, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Delete all {{review_type}}", + "instantiation_dict": { + "review_type": "reviews from the scammer Arden" + }, + "intent": "Delete all reviews from the scammer Arden", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/51", + "locator": "", + "required_contents": { + "must_include": [ + "Rating isn't Available" + ] + } + } + ] + }, + 
"intent_template_id": 246 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 776, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Delete all {{review_type}}", + "instantiation_dict": { + "review_type": "reviews from the scammer Carlo" + }, + "intent": "Delete all reviews from the scammer Carlo", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/93", + "locator": "", + "required_contents": { + "must_include": [ + "Rating isn't Available" + ] + } + }, + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/109", + "locator": "", + "required_contents": { + "must_include": [ + "Rating isn't Available" + ] + } + } + ] + }, + "intent_template_id": 246 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 777, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "{{action}} the price of {{config}} by {{amount}}", + "instantiation_dict": { + "amount": "$5", + "action": "Reduce", + "config": "green Hollister backyard sweater in all size" + }, + "intent": "Reduce the price of green Hollister backyard sweater in all size by $5", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/120/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "47.00" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/117/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "47.00" + } + }, + { + "url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/114/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "47.00" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/111/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "47.00" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/123/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "47.00" + } + } + ] + }, + "intent_template_id": 742 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 778, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "{{action}} the price of {{config}} by {{amount}}", + "instantiation_dict": { + "amount": "13.5%", + "action": "Reduce", + "config": "size 28 Sahara leggings" + }, + "intent": "Reduce the price of size 28 Sahara leggings by 13.5%", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1841/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "64.88" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1842/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "64.88" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1843/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "64.88" + } + } + ] + }, + "intent_template_id": 742 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 779, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + 
"start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "{{action}} the price of {{config}} by {{amount}}", + "instantiation_dict": { + "amount": "15%", + "action": "Reduce", + "config": "yellow shirts from Gwyn Endurance in all size below L" + }, + "intent": "Reduce the price of yellow shirts from Gwyn Endurance in all size below L by 15%", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1559/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "20.40" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1562/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "20.40" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1565/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "20.40" + } + } + ] + }, + "intent_template_id": 742 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 780, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1481/", + "geolocation": null, + "intent_template": "{{action}} the price of {{config}} by {{amount}}", + "instantiation_dict": { + "amount": "$17", + "action": "Increase", + "config": "white Ingrid Running with size L and above" + }, + "intent": "Increase the price of white Ingrid Running with size L and above by $17", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1264/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": 
"64.00" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1267/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "64.00" + } + } + ] + }, + "intent_template_id": 742 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 781, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "{{action}} the price of {{config}} by {{amount}}", + "instantiation_dict": { + "amount": "37%", + "action": "Increase", + "config": "black fitness tshirts from Desiree with size XS" + }, + "intent": "Increase the price of black fitness tshirts from Desiree with size XS by 37%", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1573/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "32.88" + } + } + ] + }, + "intent_template_id": 742 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 782, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "{{action}} the price of {{config}} by {{amount}}", + "instantiation_dict": { + "amount": "23%", + "action": "Increase", + "config": "all blue running tshirts in extra small and small sizes" + }, + "intent": "Increase the price of all blue running tshirts in extra small and small sizes by 23%", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/496/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "22.33" + 
} + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/499/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "22.33" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/479/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "21.56" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/482/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "21.56" + } + } + ] + }, + "intent_template_id": 742 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 783, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Add the following users to repo {{repo}} as {{role}}: {{user_list}}", + "instantiation_dict": { + "repo": "kkroening/ffmpeg-python", + "role": "maintainer", + "user_list": [ + "yjlou", + "a11yproject" + ] + }, + "intent": "Add the following users to repo kkroening/ffmpeg-python as maintainer: ['yjlou', 'a11yproject']", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "not the owner of the repo so cannot add maintainer", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 351 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 784, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/CellularPrivacy/Android-IMSI-Catcher-Detector", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "instantiation_dict": { + "branch_name": "main", + "attribute": "email address" + }, + "intent": "Tell me the email address of the contributor who has the most commits to 
branch main", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "secupwn@users.noreply.github.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "secupwn@users.noreply.github.com" + }, + "intent_template_id": 316 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 785, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/CellularPrivacy/Android-IMSI-Catcher-Detector", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "instantiation_dict": { + "branch_name": "gh-page", + "attribute": "email address" + }, + "intent": "Tell me the email address of the contributor who has the most commits to branch gh-page", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "secupwn@users.noreply.github.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "secupwn@users.noreply.github.com" + }, + "intent_template_id": 316 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 786, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/vinta/awesome-python", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "instantiation_dict": { + "branch_name": "main", + "attribute": "number of commits" + }, + "intent": "Tell me the number of commits of the contributor who has the most commits to branch main", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "412" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "412" 
+ }, + "intent_template_id": 316 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 787, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/dehenne/awesome-visibility/-/graphs/master", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "instantiation_dict": { + "branch_name": "main", + "attribute": "number of followers" + }, + "intent": "Tell me the number of followers of the contributor who has the most commits to branch main", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 316 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 788, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/amwhalen/archive-my-tweets/-/tree/github/fork/chtitux/addRssFeed", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "instantiation_dict": { + "branch_name": "php52", + "attribute": "full name, gitlab account name, location and email address" + }, + "intent": "Tell me the full name, gitlab account name, location and email address of the contributor who has the most commits to branch php52", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Andrew M. Whalen", + "amwhalen", + "Massachusetts", + "github@amwhalen.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Andrew M. 
Whalen, amwhalen, Massachusetts, github@amwhalen.com" + }, + "intent_template_id": 316 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 789, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create an issue {{issue}} in {{repo}}.", + "instantiation_dict": { + "repo": "huggingface dataset", + "issue": "asking about do they have any plan on supporting Webagent in the next quater" + }, + "intent": "Create an issue asking about do they have any plan on supporting Webagent in the next quater in huggingface dataset.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "huggingface repo does not exist", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 328 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 790, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Delete all {{review_type}}", + "instantiation_dict": { + "review_type": "negative reviews for Sybil running short" + }, + "intent": "Delete all negative reviews for Sybil running short", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "No negative reviews for sybil running short", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 246 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 791, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Make a folder named {{directory}} on the {{gitlab_repo}} repo and include a file called urls.txt that consists of the links to 
the 5 most recent posts from {{subreddit}}.", + "instantiation_dict": { + "directory": "car", + "subreddit": "cars", + "gitlab_repo": "gimmiethat.space" + }, + "intent": "Make a folder named car on the gimmiethat.space repo and include a file called urls.txt that consists of the links to the 5 most recent posts from cars.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "the subreddit does not exist", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 84 + }, + { + "sites": [ + "shopping" + ], + "task_id": 792, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", + "instantiation_dict": { + "product_category": "Beauty & Personal Care", + "dollar_value": "under 20" + }, + "intent": "Buy the highest rated product from the Beauty & Personal Care category within a budget under 20.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "The website does not support sorting by rating and there are too many products in this category", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 172 + }, + { + "sites": [ + "shopping" + ], + "task_id": 793, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", + "instantiation_dict": { + "product_category": "Men clothing", + "dollar_value": "above 50 but under 129.99" + }, + "intent": "Buy the highest rated product 
from the Men clothing category within a budget above 50 but under 129.99.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "The website does not support sorting by rating and there are too many products in this category", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 172 + }, + { + "sites": [ + "shopping" + ], + "task_id": 794, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Change the delivery address for my most recent order to {{address}}.", + "instantiation_dict": { + "address": "4000 Forbes Ave, Pittsburgh, PA" + }, + "intent": "Change the delivery address for my most recent order to 4000 Forbes Ave, Pittsburgh, PA.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "The shipping address cannot be changed after an order is placed in the website.", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 191 + }, + { + "sites": [ + "shopping" + ], + "task_id": 795, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Change the delivery address for my most recent order to {{address}}.", + "instantiation_dict": { + "address": "6726 McPherson Blvd, Pittsburgh, PA" + }, + "intent": "Change the delivery address for my most recent order to 6726 McPherson Blvd, Pittsburgh, PA.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "The shipping address cannot be changed after an order is placed in the 
website.", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 191 + }, + { + "sites": [ + "shopping" + ], + "task_id": 796, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Change the delivery address for my most recent order to {{address}}.", + "instantiation_dict": { + "address": "155 5th Street, San Francisco, CA" + }, + "intent": "Change the delivery address for my most recent order to 155 5th Street, San Francisco, CA.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "The shipping address cannot be changed after an order is placed in the website.", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 191 + }, + { + "sites": [ + "shopping" + ], + "task_id": 797, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Change the delivery address for my most recent order to {{address}}.", + "instantiation_dict": { + "address": "3 Oxford St, Cambridge, MA" + }, + "intent": "Change the delivery address for my most recent order to 3 Oxford St, Cambridge, MA.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "The shipping address cannot be changed after an order is placed in the website.", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 191 + }, + { + "sites": [ + "shopping" + ], + "task_id": 798, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Change the delivery address for my most recent order to {{address}}.", + 
"instantiation_dict": { + "address": "77 Massachusetts Ave, Cambridge, MA" + }, + "intent": "Change the delivery address for my most recent order to 77 Massachusetts Ave, Cambridge, MA.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "The shipping address cannot be changed after an order is placed in the website.", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 191 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 799, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a new group \"{{name}}\" with members {{members}}", + "instantiation_dict": { + "name": "n-lab", + "members": "patou, egpast, westurner, jontutcher" + }, + "intent": "create a new group \"n-lab\" with members patou, egpast, westurner, jontutcher", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/groups/n-lab/-/group_members", + "locator": "", + "required_contents": { + "must_include": [ + "@patou", + "@egpast", + "@westurner", + "@jontutcher" + ] + } + } + ] + }, + "intent_template_id": 600 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 800, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a new group \"{{name}}\" with members {{members}}", + "instantiation_dict": { + "name": "x-lab", + "members": "JonasVautherin, dilipchandima, dawiss1337, bmyun, DCMJY" + }, + "intent": "create a new group \"x-lab\" with members JonasVautherin, dilipchandima, dawiss1337, bmyun, DCMJY", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + 
"program_html": [ + { + "url": "__GITLAB__/groups/x-lab/-/group_members", + "locator": "", + "required_contents": { + "must_include": [ + "@JonasVautherin", + "@dilipchandima", + "@dawiss1337", + "@bmyun", + "@DCMJY" + ] + } + } + ] + }, + "intent_template_id": 600 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 801, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a new group \"{{name}}\" with members {{members}}", + "instantiation_dict": { + "name": "crew", + "members": "ASWATFZLLC, patrickhlauke, westurner, linkmatrix" + }, + "intent": "create a new group \"crew\" with members ASWATFZLLC, patrickhlauke, westurner, linkmatrix", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/groups/crew/-/group_members", + "locator": "", + "required_contents": { + "must_include": [ + "@ASWATFZLLC", + "@patrickhlauke", + "@westurner", + "@linkmatrix" + ] + } + } + ] + }, + "intent_template_id": 600 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 802, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a new group \"{{name}}\" with members {{members}}", + "instantiation_dict": { + "name": "coding_friends", + "members": "qhduan, Agnes-U" + }, + "intent": "create a new group \"coding_friends\" with members qhduan, Agnes-U", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/groups/coding_friends/-/group_members", + "locator": "", + "required_contents": { + "must_include": [ + "@qhduan", + "@Agnes-U" + ] + } + } + ] + }, + "intent_template_id": 600 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 803, + "require_login": true, + 
"storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a new group \"{{name}}\" with members {{members}}", + "instantiation_dict": { + "name": "webagent", + "members": "pandey2000, sayakpaul, sayakpaul" + }, + "intent": "create a new group \"webagent\" with members pandey2000, sayakpaul, sayakpaul", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/groups/webagent/-/group_members", + "locator": "", + "required_contents": { + "must_include": [ + "@pandey2000", + "@sayakpaul" + ] + } + } + ] + }, + "intent_template_id": 600 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 804, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Assign the issue regarding {{issue}} to {{account}}.", + "instantiation_dict": { + "issue": "flash alerts", + "account": "myself and primer" + }, + "intent": "Assign the issue regarding flash alerts to myself and primer.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=byteblaze", + "locator": "", + "required_contents": { + "must_include": [ + "Add documentation on using Flash alerts in dialog components", + "Clarify usage of flash alert" + ] + } + }, + { + "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=primer", + "locator": "", + "required_contents": { + "must_include": [ + "Add documentation on using Flash alerts in dialog components", + "Clarify usage of flash alert" + ] + } + } + ] + }, + "intent_template_id": 999 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 805, + "require_login": true, + "storage_state": 
"./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "Submit a merge request for {{source_branch}} to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", + "instantiation_dict": { + "source_branch": "the branch that implements the support of template strings", + "target_branch": "master", + "reviewer": "myself and Roshan" + }, + "intent": "Submit a merge request for the branch that implements the support of template strings to be merged into master branch, assign myself and Roshan as the reviewer", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests", + "program_html": [ + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", + "required_contents": { + "exact_match": "feature/replace-gulp" + } + }, + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", + "required_contents": { + "exact_match": "main" + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.reviewer').outerText", + "required_contents": { + "must_include": [ + "Byte Blaze", + "Roshan Jossy" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 335 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 806, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", + "instantiation_dict": { + "source_branch": "a11yproject.com/redesign", + "target_branch": "markdown-figure-block", + "reviewer": "myself" + }, + "intent": "Submit a merge request for 
a11yproject.com/redesign branch to be merged into markdown-figure-block branch, assign myself as the reviewer", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests", + "program_html": [ + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", + "required_contents": { + "exact_match": "redesign" + } + }, + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", + "required_contents": { + "exact_match": "feature/markdown-figure-block" + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.reviewer').outerText", + "required_contents": { + "must_include": [ + "Byte Blaze" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 335 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 807, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/primer/design", + "geolocation": null, + "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", + "instantiation_dict": { + "source_branch": "build time debug", + "target_branch": "main", + "reviewer": "myself" + }, + "intent": "Submit a merge request for build time debug branch to be merged into main branch, assign myself as the reviewer", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/primer/design/-/merge_requests", + "program_html": [ + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", + "required_contents": { + "exact_match": "main" + } + }, + { + "url": "last", + "locator": 
"document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", + "required_contents": { + "exact_match": "debug-build-time" + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.reviewer').outerText", + "required_contents": { + "must_include": [ + "Byte Blaze" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 335 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 808, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", + "instantiation_dict": { + "repo": "cloud-to-butt", + "issue": "Let's keep the project alive", + "account": "myself", + "due": "the end of Q1 2033" + }, + "intent": "Create an issue in cloud-to-butt repo with title \"Let's keep the project alive\". Assign the issue to myself. Set due date to be the end of Q1 2033", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/byteblaze/cloud-to-butt/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText", + "required_contents": { + "exact_match": "Let's keep the project alive" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"').outerText", + "required_contents": { + "must_include": [ + "Mar 31, 2033" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.assignee').outerText", + "required_contents": { + "must_include": [ + "Byte Blaze" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 327 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 809, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": 
"__GITLAB__", + "geolocation": null, + "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", + "instantiation_dict": { + "repo": "a11yproject", + "issue": "404 for many URLs", + "account": "myself", + "due": "2030-1-3" + }, + "intent": "Create an issue in a11yproject repo with title \"404 for many URLs\". Assign the issue to myself. Set due date to be 2030-1-3", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText", + "required_contents": { + "exact_match": "404 for many URLs" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"').outerText", + "required_contents": { + "must_include": [ + "Jan 3, 2030" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.assignee').outerText", + "required_contents": { + "must_include": [ + "Byte Blaze" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 327 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 810, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Assign the issue regarding {{issue}} in {{repo}} repo to {{account}}.", + "instantiation_dict": { + "repo": "primer design guide", + "issue": "flash alert", + "account": "myself" + }, + "intent": "Assign the issue regarding flash alert in primer design guide repo to myself.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=byteblaze", + "locator": "", + 
"required_contents": { + "must_include": [ + "Add documentation on using Flash alerts in dialog components", + "Clarify usage of flash alert" + ] + } + } + ] + }, + "intent_template_id": 999 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 811, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Assign the issue regarding {{issue}} in {{repo}} to {{account}}.", + "instantiation_dict": { + "repo": "a11yproject", + "issue": 404, + "account": "myself" + }, + "intent": "Assign the issue regarding 404 in a11yproject to myself.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=byteblaze", + "locator": "", + "required_contents": { + "must_include": [ + "404s, bad host, timeouts, bad urls for URLs linked from website" + ] + } + } + ] + }, + "intent_template_id": 999 + } +] diff --git a/config_files/webvoyager/Allrecipes--10.json b/config_files/webvoyager/Allrecipes--10.json new file mode 100644 index 0000000..5a45e80 --- /dev/null +++ b/config_files/webvoyager/Allrecipes--10.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "allrecipes" + ], + "task_id": "Allrecipes--10", + "require_login": false, + "storage_state": null, + "start_url": "https://www.allrecipes.com/", + "geolocation": null, + "intent": "Find The Most Popular Recipes of the 1960s, noting the recipe name, preparation time and total time of the second recipe in this collection.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "'Swedish Meatballs I', prep time 25 mins, total time 1 hour 25 mins" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Allrecipes--17.json b/config_files/webvoyager/Allrecipes--17.json new file mode 100644 index 
0000000..b285823 --- /dev/null +++ b/config_files/webvoyager/Allrecipes--17.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "allrecipes" + ], + "task_id": "Allrecipes--17", + "require_login": false, + "storage_state": null, + "start_url": "https://www.allrecipes.com/", + "geolocation": null, + "intent": "Find the Easy Vegetarian Spinach Lasagna recipe on Allrecipes and tell me what the latest review says.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Easy to make and very delicious" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Allrecipes--3.json b/config_files/webvoyager/Allrecipes--3.json new file mode 100644 index 0000000..239f858 --- /dev/null +++ b/config_files/webvoyager/Allrecipes--3.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "allrecipes" + ], + "task_id": "Allrecipes--3", + "require_login": false, + "storage_state": null, + "start_url": "https://www.allrecipes.com/", + "geolocation": null, + "intent": "Locate a recipe for vegan chocolate chip cookies with over 60 reviews and a rating of at least 4.5 stars on Allrecipes.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "'Vegan Chocolate Chip, Oatmeal, and Nut Cookies', 4.9 star, 67 viewers (> 60)" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Allrecipes--40.json b/config_files/webvoyager/Allrecipes--40.json new file mode 100644 index 0000000..0da43c0 --- /dev/null +++ b/config_files/webvoyager/Allrecipes--40.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "allrecipes" + ], + "task_id": "Allrecipes--40", + "require_login": false, + "storage_state": null, + "start_url": "https://www.allrecipes.com/", + "geolocation": null, + "intent": "Browse the about us section of Allrecipes for a brief introduction to The Allrecipes Allstars.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + 
"reference_answers": { + "fuzzy_match": "The Allrecipes Allstars: Social media influencers, registered dietitians, grillmasters, and more seasoned home cooks make up our enthusiastic squad of 100+ brand ambassadors. This diverse, food-loving crew spans the U.S. geographically and represents many different cultures, ethnicities, and family makeups. Since 2011, the Allstars have created tens of thousands of original recipes, photos, and reviews plus shared their cooking expertise via flat and video content on our website, social media, plus more marketing channels." + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Amazon--16.json b/config_files/webvoyager/Amazon--16.json new file mode 100644 index 0000000..f5af174 --- /dev/null +++ b/config_files/webvoyager/Amazon--16.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "amazon" + ], + "task_id": "Amazon--16", + "require_login": false, + "storage_state": null, + "start_url": "https://www.amazon.com/", + "geolocation": null, + "intent": "Find the Return Policy for Mens Rhinestone Skull Graphic Shirt on Amazon. Color: Black, Size: XX-Large. If Free return is avaliable, tell me how to return this item.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "FREE Returns, 1. Go to Your Orders to start the return; 2. Print the return shipping label; 3. Ship it!" 
+ } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Apple--12.json b/config_files/webvoyager/Apple--12.json new file mode 100644 index 0000000..c264f58 --- /dev/null +++ b/config_files/webvoyager/Apple--12.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "apple" + ], + "task_id": "Apple--12", + "require_login": false, + "storage_state": null, + "start_url": "https://www.apple.com/", + "geolocation": null, + "intent": "What Apple Repair ways are mentioned on apple website, answer 2 of them.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Any 2 of 'Send your product to Apple', 'Find an Apple Authorized Service Provider', 'Visit a Genius at an Apple Store', 'Independent Repair Providers', 'Self Service Repair'" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Apple--17.json b/config_files/webvoyager/Apple--17.json new file mode 100644 index 0000000..fe547b7 --- /dev/null +++ b/config_files/webvoyager/Apple--17.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "apple" + ], + "task_id": "Apple--17", + "require_login": false, + "storage_state": null, + "start_url": "https://www.apple.com/", + "geolocation": null, + "intent": "Search Apple for the accessory Smart Folio for iPad and check the closest pickup availability next to zip code 90038.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Apple Tower Theatre" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Apple--18.json b/config_files/webvoyager/Apple--18.json new file mode 100644 index 0000000..aec6032 --- /dev/null +++ b/config_files/webvoyager/Apple--18.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "apple" + ], + "task_id": "Apple--18", + "require_login": false, + "storage_state": null, + "start_url": "https://www.apple.com/", + "geolocation": null, + "intent": "Check if there are trade-in offers for the latest model 
of iPhone.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "There are trade-in offers." + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Apple--19.json b/config_files/webvoyager/Apple--19.json new file mode 100644 index 0000000..4260ba7 --- /dev/null +++ b/config_files/webvoyager/Apple--19.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "apple" + ], + "task_id": "Apple--19", + "require_login": false, + "storage_state": null, + "start_url": "https://www.apple.com/", + "geolocation": null, + "intent": "On Apple's website, what is the slogan for the Mac and what is the slogan for the Macbook pro.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "If you can dream it, Mac can do it; Mind-blowing. Head-turning" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Apple--28.json b/config_files/webvoyager/Apple--28.json new file mode 100644 index 0000000..21bfb6e --- /dev/null +++ b/config_files/webvoyager/Apple--28.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "apple" + ], + "task_id": "Apple--28", + "require_login": false, + "storage_state": null, + "start_url": "https://www.apple.com/", + "geolocation": null, + "intent": "On the Apple website, find out if the Mac Mini can be configured with a GPU larger than 16-core.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Yes. 
Mac mini Apple M2 Pro chip, Configurable to: 19-core GPU" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Apple--31.json b/config_files/webvoyager/Apple--31.json new file mode 100644 index 0000000..e000a93 --- /dev/null +++ b/config_files/webvoyager/Apple--31.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "apple" + ], + "task_id": "Apple--31", + "require_login": false, + "storage_state": null, + "start_url": "https://www.apple.com/", + "geolocation": null, + "intent": "On Apple's website, what is the slogan for the latest Apple Watch Series.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Smarter. Brighter. Mightier." + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Apple--37.json b/config_files/webvoyager/Apple--37.json new file mode 100644 index 0000000..6451dd6 --- /dev/null +++ b/config_files/webvoyager/Apple--37.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "apple" + ], + "task_id": "Apple--37", + "require_login": false, + "storage_state": null, + "start_url": "https://www.apple.com/", + "geolocation": null, + "intent": "Compare the color options of iPhone 13 Pro, iPhone 14 Pro and iPhone 15 Pro.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "iPhone 13 pro: Alpine Green, Silver, Gold, Graphite, Sierra Blue; iPhone 14 pro: Deep Purple, Gold, Silver, Space Black; iPhone 15 pro: Natural Titanium, Blue Titanium, White Titanium, Black Titanium" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ArXiv--10.json b/config_files/webvoyager/ArXiv--10.json new file mode 100644 index 0000000..33f69f8 --- /dev/null +++ b/config_files/webvoyager/ArXiv--10.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "arxiv" + ], + "task_id": "ArXiv--10", + "require_login": false, + "storage_state": null, + "start_url": "https://arxiv.org/", + "geolocation": null, + "intent": "Visit 
ArXiv Help on how to withdraw an article if the submission is not yet announced.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "If your submission has not yet become publicly available you may delete or delay it. To do either of these things go to your user page and select either the Delete or Unsubmit icon." + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ArXiv--11.json b/config_files/webvoyager/ArXiv--11.json new file mode 100644 index 0000000..a254ca4 --- /dev/null +++ b/config_files/webvoyager/ArXiv--11.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "arxiv" + ], + "task_id": "ArXiv--11", + "require_login": false, + "storage_state": null, + "start_url": "https://arxiv.org/", + "geolocation": null, + "intent": "For Non-English submissions, do I need to provide a multi-language abstract, if need, answer the separator between the multiple abstracts.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "-----" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ArXiv--12.json b/config_files/webvoyager/ArXiv--12.json new file mode 100644 index 0000000..d8eec34 --- /dev/null +++ b/config_files/webvoyager/ArXiv--12.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "arxiv" + ], + "task_id": "ArXiv--12", + "require_login": false, + "storage_state": null, + "start_url": "https://arxiv.org/", + "geolocation": null, + "intent": "Find store in arXiv Help, tell me how many styles of arXiv Logo Shirt are available?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "3" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ArXiv--14.json b/config_files/webvoyager/ArXiv--14.json new file mode 100644 index 0000000..feb95db --- /dev/null +++ b/config_files/webvoyager/ArXiv--14.json @@ -0,0 +1,20 @@ +{ + "sites": 
[ + "arxiv" + ], + "task_id": "ArXiv--14", + "require_login": false, + "storage_state": null, + "start_url": "https://arxiv.org/", + "geolocation": null, + "intent": "On ArXiv, how many articles have 'SimCSE' in the article and are originally announced in October 2023?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "3" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ArXiv--17.json b/config_files/webvoyager/ArXiv--17.json new file mode 100644 index 0000000..35dff74 --- /dev/null +++ b/config_files/webvoyager/ArXiv--17.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "arxiv" + ], + "task_id": "ArXiv--17", + "require_login": false, + "storage_state": null, + "start_url": "https://arxiv.org/", + "geolocation": null, + "intent": "Find the paper 'GPT-4 Technical Report', when was v3 submitted?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Mon, 27 Mar 2023 17:46:54 UTC" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ArXiv--18.json b/config_files/webvoyager/ArXiv--18.json new file mode 100644 index 0000000..141a945 --- /dev/null +++ b/config_files/webvoyager/ArXiv--18.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "arxiv" + ], + "task_id": "ArXiv--18", + "require_login": false, + "storage_state": null, + "start_url": "https://arxiv.org/", + "geolocation": null, + "intent": "Download the paper 'Dense Passage Retrieval for Open-Domain Question Answering'. 
How many formulas are in the article and which one is the loss function?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "2 formulas, the second one is loss function" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ArXiv--25.json b/config_files/webvoyager/ArXiv--25.json new file mode 100644 index 0000000..f22b72f --- /dev/null +++ b/config_files/webvoyager/ArXiv--25.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "arxiv" + ], + "task_id": "ArXiv--25", + "require_login": false, + "storage_state": null, + "start_url": "https://arxiv.org/", + "geolocation": null, + "intent": "Browse the ArXiv store and let me know how many different types of merchandise are available.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "6, arXiv Logo Shirt, arXiv Logo Mug, arXiv is Open Science, Gift cards, arXiv Morning Mug, arXiv Forever" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ArXiv--27.json b/config_files/webvoyager/ArXiv--27.json new file mode 100644 index 0000000..536422d --- /dev/null +++ b/config_files/webvoyager/ArXiv--27.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "arxiv" + ], + "task_id": "ArXiv--27", + "require_login": false, + "storage_state": null, + "start_url": "https://arxiv.org/", + "geolocation": null, + "intent": "On ArXiv, what categories does Economics include, and what are their abbreviations?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Econometrics (econ.EM), General Economics (econ.GN), and Theoretical Economics (econ.TH)" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ArXiv--30.json b/config_files/webvoyager/ArXiv--30.json new file mode 100644 index 0000000..77366cf --- /dev/null +++ b/config_files/webvoyager/ArXiv--30.json @@ -0,0 +1,20 @@ +{ + "sites": [ + 
"arxiv" + ], + "task_id": "ArXiv--30", + "require_login": false, + "storage_state": null, + "start_url": "https://arxiv.org/", + "geolocation": null, + "intent": "Look up the submission guidelines on ArXiv for submitting a paper and tell me the formats for figures.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Accepted figure formats: PostScript (PS, EPS) \u2014 requires LaTeX processing; JPEG, GIF, PNG or PDF figures \u2014 requires PDFLaTeX processing" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ArXiv--31.json b/config_files/webvoyager/ArXiv--31.json new file mode 100644 index 0000000..401d9b6 --- /dev/null +++ b/config_files/webvoyager/ArXiv--31.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "arxiv" + ], + "task_id": "ArXiv--31", + "require_login": false, + "storage_state": null, + "start_url": "https://arxiv.org/", + "geolocation": null, + "intent": "Search ArXiv for papers with 'Graph Neural Networks' in the abstract that were submitted between Jan 1, 2024, and Jan 3, 2024, and determine how many of these papers have more than five authors.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "7 papers" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ArXiv--36.json b/config_files/webvoyager/ArXiv--36.json new file mode 100644 index 0000000..d62b32f --- /dev/null +++ b/config_files/webvoyager/ArXiv--36.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "arxiv" + ], + "task_id": "ArXiv--36", + "require_login": false, + "storage_state": null, + "start_url": "https://arxiv.org/", + "geolocation": null, + "intent": "Search 'CVPR 2023' and 'CVPR2023' through journal ref on ArXiv to see how many results there are respectively.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "'CVPR 2023': 48 results; 
'CVPR2023': 9 results" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ArXiv--37.json b/config_files/webvoyager/ArXiv--37.json new file mode 100644 index 0000000..687cc97 --- /dev/null +++ b/config_files/webvoyager/ArXiv--37.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "arxiv" + ], + "task_id": "ArXiv--37", + "require_login": false, + "storage_state": null, + "start_url": "https://arxiv.org/", + "geolocation": null, + "intent": "Find the names of people in ArXiv's Leadership Team.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Ramin Zabih, Yoav Artzi, Stephanie Orphan, Steinn Sigurdsson, and Charles Frankston." + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ArXiv--39.json b/config_files/webvoyager/ArXiv--39.json new file mode 100644 index 0000000..af7de6b --- /dev/null +++ b/config_files/webvoyager/ArXiv--39.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "arxiv" + ], + "task_id": "ArXiv--39", + "require_login": false, + "storage_state": null, + "start_url": "https://arxiv.org/", + "geolocation": null, + "intent": "Search the title 'GPT-4 Technical Report' and access this paper through HTML format. Read the paper on this page and tell me what is 'one of the main goals of developing such models' mentioned in the Introduction.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "One of the main goals of developing such models is to improve their ability to understand and generate natural language text, particularly in more complex and nuanced scenarios." 
+ } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ArXiv--41.json b/config_files/webvoyager/ArXiv--41.json new file mode 100644 index 0000000..0e86f05 --- /dev/null +++ b/config_files/webvoyager/ArXiv--41.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "arxiv" + ], + "task_id": "ArXiv--41", + "require_login": false, + "storage_state": null, + "start_url": "https://arxiv.org/", + "geolocation": null, + "intent": "Find the button to share arxiv non-profit store and follow the QR code to share the shop. Then add arXiv Forever short sleeve (XL) to your cart.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "QR code image, Action: add to chart" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ArXiv--42.json b/config_files/webvoyager/ArXiv--42.json new file mode 100644 index 0000000..2d82cd0 --- /dev/null +++ b/config_files/webvoyager/ArXiv--42.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "arxiv" + ], + "task_id": "ArXiv--42", + "require_login": false, + "storage_state": null, + "start_url": "https://arxiv.org/", + "geolocation": null, + "intent": "Find an article published between 1 January 2000 and 1 January 2005 that requires Support Vector Machines in the title and its Journey ref is ACL Workshop.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "'Using a Support-Vector Machine for Japanese-to-English Translation of Tense, Aspect, and Modality'" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ArXiv--6.json b/config_files/webvoyager/ArXiv--6.json new file mode 100644 index 0000000..74ae28e --- /dev/null +++ b/config_files/webvoyager/ArXiv--6.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "arxiv" + ], + "task_id": "ArXiv--6", + "require_login": false, + "storage_state": null, + "start_url": "https://arxiv.org/", + "geolocation": null, + "intent": "How many figures 
and tables are in the paper \"On the Sentence Embeddings from Pre-trained Language Models\"?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "2 Figures, 8 Tables." + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/BBC News--28.json b/config_files/webvoyager/BBC News--28.json new file mode 100644 index 0000000..2943dc3 --- /dev/null +++ b/config_files/webvoyager/BBC News--28.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "bbc news" + ], + "task_id": "BBC News--28", + "require_login": false, + "storage_state": null, + "start_url": "https://www.bbc.com/news/", + "geolocation": null, + "intent": "Find the Market Data section on BBC News and tell me which company the data comes from.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Business - Market Data, Source: Morningstar" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/BBC News--5.json b/config_files/webvoyager/BBC News--5.json new file mode 100644 index 0000000..c45f372 --- /dev/null +++ b/config_files/webvoyager/BBC News--5.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "bbc news" + ], + "task_id": "BBC News--5", + "require_login": false, + "storage_state": null, + "start_url": "https://www.bbc.com/news/", + "geolocation": null, + "intent": "Find the article \"What is climate change? A really simple guide\" and use it to answer what human activities are causing climate change.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "This recent climate change has been caused by human activity, mainly the widespread use of fossil fuels - coal, oil and gas - in homes, factories and transport." 
+ } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Booking--33.json b/config_files/webvoyager/Booking--33.json new file mode 100644 index 0000000..1cfad51 --- /dev/null +++ b/config_files/webvoyager/Booking--33.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "booking" + ], + "task_id": "Booking--33", + "require_login": false, + "storage_state": null, + "start_url": "https://www.booking.com/", + "geolocation": null, + "intent": "Find the Customer Service on the Booking website, browse the questions about cancellation, and tell me 'how do I know whether my booking has been cancelled'.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "After you cancel a booking with us, you should get an email confirming the cancellation. Make sure to check your inbox and spam/junk mail folders. If you don\u2019t receive an email within 24 hours, contact the property to confirm they got your cancellation." + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Booking--41.json b/config_files/webvoyager/Booking--41.json new file mode 100644 index 0000000..da9c2c9 --- /dev/null +++ b/config_files/webvoyager/Booking--41.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "booking" + ], + "task_id": "Booking--41", + "require_login": false, + "storage_state": null, + "start_url": "https://www.booking.com/", + "geolocation": null, + "intent": "Browse Booking's homepage to find out which company it belongs to.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Booking Holdings Inc." 
+ } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Cambridge Dictionary--0.json b/config_files/webvoyager/Cambridge Dictionary--0.json new file mode 100644 index 0000000..b47f223 --- /dev/null +++ b/config_files/webvoyager/Cambridge Dictionary--0.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "cambridge dictionary" + ], + "task_id": "Cambridge Dictionary--0", + "require_login": false, + "storage_state": null, + "start_url": "https://dictionary.cambridge.org/", + "geolocation": null, + "intent": "Look up the pronunciation and definition of the word \"sustainability\" on the Cambridge Dictionary.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "UK: /s\u0259\u02ccste\u026a.n\u0259\u02c8b\u026al.\u0259.ti/, US: /s\u0259\u02ccste\u026a.n\u0259\u02c8b\u026al.\u0259.t\u032ci/; the quality of being able to continue over a period of time" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Cambridge Dictionary--17.json b/config_files/webvoyager/Cambridge Dictionary--17.json new file mode 100644 index 0000000..efb956c --- /dev/null +++ b/config_files/webvoyager/Cambridge Dictionary--17.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "cambridge dictionary" + ], + "task_id": "Cambridge Dictionary--17", + "require_login": false, + "storage_state": null, + "start_url": "https://dictionary.cambridge.org/", + "geolocation": null, + "intent": "How many meanings of \"unblemished\" are given in Cambridge Dictionary? 
Please browse the page and give the number directly.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "2" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Cambridge Dictionary--18.json b/config_files/webvoyager/Cambridge Dictionary--18.json new file mode 100644 index 0000000..5517418 --- /dev/null +++ b/config_files/webvoyager/Cambridge Dictionary--18.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "cambridge dictionary" + ], + "task_id": "Cambridge Dictionary--18", + "require_login": false, + "storage_state": null, + "start_url": "https://dictionary.cambridge.org/", + "geolocation": null, + "intent": "Search for \"to behave well\" in Cambridge Dictionary's Thesaurus and see which synonyms the dictionary gives.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "behaves themselves; be on their best behaviour" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Cambridge Dictionary--19.json b/config_files/webvoyager/Cambridge Dictionary--19.json new file mode 100644 index 0000000..4957f78 --- /dev/null +++ b/config_files/webvoyager/Cambridge Dictionary--19.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "cambridge dictionary" + ], + "task_id": "Cambridge Dictionary--19", + "require_login": false, + "storage_state": null, + "start_url": "https://dictionary.cambridge.org/", + "geolocation": null, + "intent": "Try a Cambridge Dictionary translation and tell me which company provided the translation.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Microsoft" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Cambridge Dictionary--21.json b/config_files/webvoyager/Cambridge Dictionary--21.json new file mode 100644 index 0000000..c0fa182 --- /dev/null +++ b/config_files/webvoyager/Cambridge 
Dictionary--21.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "cambridge dictionary" + ], + "task_id": "Cambridge Dictionary--21", + "require_login": false, + "storage_state": null, + "start_url": "https://dictionary.cambridge.org/", + "geolocation": null, + "intent": "Search for the word \"ephemeral\" on Cambridge Dictionary and find its translation into Spanish.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "ef\u00edmero" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Cambridge Dictionary--26.json b/config_files/webvoyager/Cambridge Dictionary--26.json new file mode 100644 index 0000000..c4c535e --- /dev/null +++ b/config_files/webvoyager/Cambridge Dictionary--26.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "cambridge dictionary" + ], + "task_id": "Cambridge Dictionary--26", + "require_login": false, + "storage_state": null, + "start_url": "https://dictionary.cambridge.org/", + "geolocation": null, + "intent": "Search for the word \"nostalgia\" in the Cambridge Dictionary and report the translation of this word into Chinese.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "\u6000\u65e7" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Cambridge Dictionary--28.json b/config_files/webvoyager/Cambridge Dictionary--28.json new file mode 100644 index 0000000..420fcb4 --- /dev/null +++ b/config_files/webvoyager/Cambridge Dictionary--28.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "cambridge dictionary" + ], + "task_id": "Cambridge Dictionary--28", + "require_login": false, + "storage_state": null, + "start_url": "https://dictionary.cambridge.org/", + "geolocation": null, + "intent": "Search for \"feel giddy\" in Cambridge Dictionary's Thesaurus and list the synonyms the dictionary provides.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + 
"reference_answers": { + "fuzzy_match": "Synonyms: feel dizzy; whirl; spin; reel" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Cambridge Dictionary--42.json b/config_files/webvoyager/Cambridge Dictionary--42.json new file mode 100644 index 0000000..ea8e246 --- /dev/null +++ b/config_files/webvoyager/Cambridge Dictionary--42.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "cambridge dictionary" + ], + "task_id": "Cambridge Dictionary--42", + "require_login": false, + "storage_state": null, + "start_url": "https://dictionary.cambridge.org/", + "geolocation": null, + "intent": "Convert the Cambridge Dictionary homepage from English (UK) to Deutsch.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Action: Click English (UK), change language to: Deutsch" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Cambridge Dictionary--6.json b/config_files/webvoyager/Cambridge Dictionary--6.json new file mode 100644 index 0000000..acda793 --- /dev/null +++ b/config_files/webvoyager/Cambridge Dictionary--6.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "cambridge dictionary" + ], + "task_id": "Cambridge Dictionary--6", + "require_login": false, + "storage_state": null, + "start_url": "https://dictionary.cambridge.org/", + "geolocation": null, + "intent": "Search for the word \"sustainability\" on the Cambridge Dictionary, what is the translation of sustainability into Chinese and French in the dictionary.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "\u53ef\u6301\u7eed\u6027; durabilit\u00e9 , viabilit\u00e9" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Coursera--38.json b/config_files/webvoyager/Coursera--38.json new file mode 100644 index 0000000..011fe6d --- /dev/null +++ b/config_files/webvoyager/Coursera--38.json @@ -0,0 +1,20 @@ +{ + "sites": [ + 
"coursera" + ], + "task_id": "Coursera--38", + "require_login": false, + "storage_state": null, + "start_url": "https://www.coursera.org/", + "geolocation": null, + "intent": "Browse Coursera, which universities and companies from Australia are partners of Coursera? List all of them.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Macquarie University; The University of Melbourne; The University of Sydney; University of Western Australia; UNSW Sydney (The University of New South Wales)" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Coursera--39.json b/config_files/webvoyager/Coursera--39.json new file mode 100644 index 0000000..258a25c --- /dev/null +++ b/config_files/webvoyager/Coursera--39.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "coursera" + ], + "task_id": "Coursera--39", + "require_login": false, + "storage_state": null, + "start_url": "https://www.coursera.org/", + "geolocation": null, + "intent": "Find the Space Safety course offered by TUM on Coursera. How many videos are there in module 2? 
What is the name of each video?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "6 videos; Introduction; Space Debris; Mitigation; Measurements; Protection; Atmospheric Re-entry" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ESPN--11.json b/config_files/webvoyager/ESPN--11.json new file mode 100644 index 0000000..8829029 --- /dev/null +++ b/config_files/webvoyager/ESPN--11.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "espn" + ], + "task_id": "ESPN--11", + "require_login": false, + "storage_state": null, + "start_url": "https://www.espn.com/", + "geolocation": null, + "intent": "How many NBA teams are there and list all the teams with 'New' in their name.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "30; New York Knicks; New Orleans Pelicans" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ESPN--15.json b/config_files/webvoyager/ESPN--15.json new file mode 100644 index 0000000..2078672 --- /dev/null +++ b/config_files/webvoyager/ESPN--15.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "espn" + ], + "task_id": "ESPN--15", + "require_login": false, + "storage_state": null, + "start_url": "https://www.espn.com/", + "geolocation": null, + "intent": "Check the scores of the NBA games played on December 25, 2023.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "(US Time) Bucks vs Knicks, 122 - 129; Warriors vs Nuggets, 114 - 120; Celtics vs Lakers, 126 - 115; 76ers vs Heat, 113 - 119; Mavericks vs Suns, 128 - 114" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ESPN--17.json b/config_files/webvoyager/ESPN--17.json new file mode 100644 index 0000000..05a7d1d --- /dev/null +++ b/config_files/webvoyager/ESPN--17.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "espn" + ], + "task_id": 
"ESPN--17", + "require_login": false, + "storage_state": null, + "start_url": "https://www.espn.com/", + "geolocation": null, + "intent": "Check out the NBA Basketball Power Index 2023-24 to see which teams are in first place and which are in last place.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Boston Celtics; San Antonio Spurs" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ESPN--18.json b/config_files/webvoyager/ESPN--18.json new file mode 100644 index 0000000..4170f69 --- /dev/null +++ b/config_files/webvoyager/ESPN--18.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "espn" + ], + "task_id": "ESPN--18", + "require_login": false, + "storage_state": null, + "start_url": "https://www.espn.com/", + "geolocation": null, + "intent": "How many sports leagues can you choose from on the ESPN home page?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "31 (in ESPN America)" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ESPN--19.json b/config_files/webvoyager/ESPN--19.json new file mode 100644 index 0000000..d9f961f --- /dev/null +++ b/config_files/webvoyager/ESPN--19.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "espn" + ], + "task_id": "ESPN--19", + "require_login": false, + "storage_state": null, + "start_url": "https://www.espn.com/", + "geolocation": null, + "intent": "Who has the highest salary in Boston Celtics Roster 2023-24?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Jrue Holiday" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ESPN--27.json b/config_files/webvoyager/ESPN--27.json new file mode 100644 index 0000000..70e7ee9 --- /dev/null +++ b/config_files/webvoyager/ESPN--27.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "espn" + ], + "task_id": "ESPN--27", + 
"require_login": false, + "storage_state": null, + "start_url": "https://www.espn.com/", + "geolocation": null, + "intent": "Search on ESPN for how many teams have 'Golden' in their name and how many of them are in the NHL.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "30 teams in search results, 1 team Vegas Golden Knights (NHL)" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ESPN--28.json b/config_files/webvoyager/ESPN--28.json new file mode 100644 index 0000000..717cc42 --- /dev/null +++ b/config_files/webvoyager/ESPN--28.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "espn" + ], + "task_id": "ESPN--28", + "require_login": false, + "storage_state": null, + "start_url": "https://www.espn.com/", + "geolocation": null, + "intent": "How many MLB teams are there and list all the teams with 'City' in their name.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "30 teams in search results, Kansas City Royals" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ESPN--31.json b/config_files/webvoyager/ESPN--31.json new file mode 100644 index 0000000..a32b282 --- /dev/null +++ b/config_files/webvoyager/ESPN--31.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "espn" + ], + "task_id": "ESPN--31", + "require_login": false, + "storage_state": null, + "start_url": "https://www.espn.com/", + "geolocation": null, + "intent": "Who has the heaviest weight among infielders in the New York Yankees Roster 2023-24?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Carlos Rodon, 255 lbs" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ESPN--41.json b/config_files/webvoyager/ESPN--41.json new file mode 100644 index 0000000..6f5686e --- /dev/null +++ b/config_files/webvoyager/ESPN--41.json @@ -0,0 +1,20 
@@ +{ + "sites": [ + "espn" + ], + "task_id": "ESPN--41", + "require_login": false, + "storage_state": null, + "start_url": "https://www.espn.com/", + "geolocation": null, + "intent": "Find out which four teams the NFC North contains in the NFL on ESPN.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Chicago Bears, Detroit Lions, Green Bay Packers, and Minnesota Vikings" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/ESPN--9.json b/config_files/webvoyager/ESPN--9.json new file mode 100644 index 0000000..6139efd --- /dev/null +++ b/config_files/webvoyager/ESPN--9.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "espn" + ], + "task_id": "ESPN--9", + "require_login": false, + "storage_state": null, + "start_url": "https://www.espn.com/", + "geolocation": null, + "intent": "Search on ESPN for how many teams have Los Angeles in their name and how many of them are NBA.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "10 teams have Los Angeles in their name; 2 teams are NBA" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Map--1.json b/config_files/webvoyager/Google Map--1.json new file mode 100644 index 0000000..c0f1849 --- /dev/null +++ b/config_files/webvoyager/Google Map--1.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google map" + ], + "task_id": "Google Map--1", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/maps/", + "geolocation": null, + "intent": "Tell me one bus stop that is nearest to the intersection of main street and Amherst street in Altavista.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "'Amherst and 7th' or 'Main Street Middle'" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Map--21.json 
b/config_files/webvoyager/Google Map--21.json new file mode 100644 index 0000000..690d8cd --- /dev/null +++ b/config_files/webvoyager/Google Map--21.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google map" + ], + "task_id": "Google Map--21", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/maps/", + "geolocation": null, + "intent": "Identify the nearest bus stop to the corner of Elm Street and Oak Street in Massachusetts.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Elm Street & Oak Street, 18 Bay St, Amesbury, MA 01913" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Map--26.json b/config_files/webvoyager/Google Map--26.json new file mode 100644 index 0000000..ab2a15c --- /dev/null +++ b/config_files/webvoyager/Google Map--26.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google map" + ], + "task_id": "Google Map--26", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/maps/", + "geolocation": null, + "intent": "Search for Los Angeles on Google Map, try to print the map as PDF and summarize the information on the map.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": ", print PDF" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Map--28.json b/config_files/webvoyager/Google Map--28.json new file mode 100644 index 0000000..3a8546c --- /dev/null +++ b/config_files/webvoyager/Google Map--28.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google map" + ], + "task_id": "Google Map--28", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/maps/", + "geolocation": null, + "intent": "Find the search settings for Google Map, what options are shown on that page?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + 
"reference_answers": { + "fuzzy_match": "Privacy & Safety: Activity, Content, More options; Other settings" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Map--31.json b/config_files/webvoyager/Google Map--31.json new file mode 100644 index 0000000..077bb55 --- /dev/null +++ b/config_files/webvoyager/Google Map--31.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google map" + ], + "task_id": "Google Map--31", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/maps/", + "geolocation": null, + "intent": "First search New York's Central Park Zoo on Google Map, and then find the way to share the map. What is the generated sharing link?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "share link, https://maps.app.goo.gl/Bnp4Q67dTHoFZ4Lx8" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Map--33.json b/config_files/webvoyager/Google Map--33.json new file mode 100644 index 0000000..b2a6f43 --- /dev/null +++ b/config_files/webvoyager/Google Map--33.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google map" + ], + "task_id": "Google Map--33", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/maps/", + "geolocation": null, + "intent": "Check out Denver International Airport's information and tell me: 1) which level has the least proportion in reviews; 2) what are its Accessibility and Amenities.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "star 2 has the least proportion; Accessibility: Assistive hearing loop; Wheelchair accessible entrance; Wheelchair accessible parking lot; Wheelchair accessible restroom; Wheelchair accessible seating; Amenities: Baggage storage; Wi-Fi; Free Wi-Fi" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Map--7.json 
b/config_files/webvoyager/Google Map--7.json new file mode 100644 index 0000000..8f97c65 --- /dev/null +++ b/config_files/webvoyager/Google Map--7.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google map" + ], + "task_id": "Google Map--7", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/maps/", + "geolocation": null, + "intent": "Find bus stops in Alanson, MI", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Alanson, MI (EZ-Mart) Bus Stop" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Map--8.json b/config_files/webvoyager/Google Map--8.json new file mode 100644 index 0000000..0a9e746 --- /dev/null +++ b/config_files/webvoyager/Google Map--8.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google map" + ], + "task_id": "Google Map--8", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/maps/", + "geolocation": null, + "intent": "Find a place to climb within 2 miles of zip code 90028.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Hollywood Boulders" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Map--9.json b/config_files/webvoyager/Google Map--9.json new file mode 100644 index 0000000..89ffec7 --- /dev/null +++ b/config_files/webvoyager/Google Map--9.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google map" + ], + "task_id": "Google Map--9", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/maps/", + "geolocation": null, + "intent": "Find the art gallery that is nearest to Los Angeles Hindu Temple.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "'Honor Fraser Gallery' or 'Walter Maciel Gallery'." 
+ } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Search--0.json b/config_files/webvoyager/Google Search--0.json new file mode 100644 index 0000000..dbe8dbc --- /dev/null +++ b/config_files/webvoyager/Google Search--0.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google search" + ], + "task_id": "Google Search--0", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/", + "geolocation": null, + "intent": "Find the initial release date for Guardians of the Galaxy Vol. 3 the movie.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "May 5, 2023" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Search--12.json b/config_files/webvoyager/Google Search--12.json new file mode 100644 index 0000000..7a90515 --- /dev/null +++ b/config_files/webvoyager/Google Search--12.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google search" + ], + "task_id": "Google Search--12", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/", + "geolocation": null, + "intent": "Find the year that Tom Brady had the most touchdowns in a single seasson.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "2007" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Search--14.json b/config_files/webvoyager/Google Search--14.json new file mode 100644 index 0000000..5c0da8c --- /dev/null +++ b/config_files/webvoyager/Google Search--14.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google search" + ], + "task_id": "Google Search--14", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/", + "geolocation": null, + "intent": "Find the retired players the year before last named James Smith and tell me which club he has been a member of from 2020\u20132021.", + "require_reset": 
false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Yeovil Town" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Search--15.json b/config_files/webvoyager/Google Search--15.json new file mode 100644 index 0000000..40917e6 --- /dev/null +++ b/config_files/webvoyager/Google Search--15.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google search" + ], + "task_id": "Google Search--15", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/", + "geolocation": null, + "intent": "Please try to log in to twitter with email: webagenttest@testmail.com and password: test123456. Let me know if the login was successful.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Not successful" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Search--17.json b/config_files/webvoyager/Google Search--17.json new file mode 100644 index 0000000..513cfc2 --- /dev/null +++ b/config_files/webvoyager/Google Search--17.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google search" + ], + "task_id": "Google Search--17", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/", + "geolocation": null, + "intent": "Tell me the names of Trump's kids", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Ivanka Trump, Barron Trump, Donald Trump Jr., Tiffany Trump, Eric Trump" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Search--18.json b/config_files/webvoyager/Google Search--18.json new file mode 100644 index 0000000..ad95b8c --- /dev/null +++ b/config_files/webvoyager/Google Search--18.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google search" + ], + "task_id": "Google Search--18", + "require_login": false, + "storage_state": null, + "start_url": 
"https://www.google.com/", + "geolocation": null, + "intent": "When and where the most recent World Cup was held, and which team was the winner?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Qatar; November 20 to December 18, 2022; Argentina" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Search--19.json b/config_files/webvoyager/Google Search--19.json new file mode 100644 index 0000000..9b7f051 --- /dev/null +++ b/config_files/webvoyager/Google Search--19.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google search" + ], + "task_id": "Google Search--19", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/", + "geolocation": null, + "intent": "What are the first 7 bits of the SHA of the Bert's latest commit on GitHub, and what exactly was changed in that commit.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "eedf571, Smaller BERT Models" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Search--20.json b/config_files/webvoyager/Google Search--20.json new file mode 100644 index 0000000..3140dc1 --- /dev/null +++ b/config_files/webvoyager/Google Search--20.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google search" + ], + "task_id": "Google Search--20", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/", + "geolocation": null, + "intent": "Find the release date for the latest \"Fast & Furious\" movie.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "April 4, 2025" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Search--21.json b/config_files/webvoyager/Google Search--21.json new file mode 100644 index 0000000..fcc7bca --- /dev/null +++ b/config_files/webvoyager/Google 
Search--21.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google search" + ], + "task_id": "Google Search--21", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/", + "geolocation": null, + "intent": "Show a list of the top 5 highest-grossing animated movies, sorted by box office earnings.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "The Lion King (2019); Frozen II (2019); The Super Mario Bros. Movie (2023); Frozen (2013); Incredibles 2 (2018)" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Search--24.json b/config_files/webvoyager/Google Search--24.json new file mode 100644 index 0000000..30fd53e --- /dev/null +++ b/config_files/webvoyager/Google Search--24.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google search" + ], + "task_id": "Google Search--24", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/", + "geolocation": null, + "intent": "What is the name of the star system closest to the Solar System, and what are the discovered planets in it?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Alpha Centauri star system; Proxima Centauri b, Proxima Centauri c, and Proxima Centauri d" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Search--28.json b/config_files/webvoyager/Google Search--28.json new file mode 100644 index 0000000..dc0f1c4 --- /dev/null +++ b/config_files/webvoyager/Google Search--28.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google search" + ], + "task_id": "Google Search--28", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/", + "geolocation": null, + "intent": "Check the IMDb and Metacritic scores of the movie \"Inception.\"", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + 
"reference_answers": { + "fuzzy_match": "IMDb score 8.8, Metacritic score 74%." + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Search--29.json b/config_files/webvoyager/Google Search--29.json new file mode 100644 index 0000000..7420b32 --- /dev/null +++ b/config_files/webvoyager/Google Search--29.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google search" + ], + "task_id": "Google Search--29", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/", + "geolocation": null, + "intent": "Find out the current world record for the men's 100m sprint.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "9.58s held by Usain Bolt of Jamaica" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Search--3.json b/config_files/webvoyager/Google Search--3.json new file mode 100644 index 0000000..bec9c97 --- /dev/null +++ b/config_files/webvoyager/Google Search--3.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google search" + ], + "task_id": "Google Search--3", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/", + "geolocation": null, + "intent": "Show me a list of comedy movies, sorted by user ratings. 
Show me the Top 5 movies.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Life Is Beautiful, Back to the Future, The Intouchables, City Lights, Modern Times" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Search--31.json b/config_files/webvoyager/Google Search--31.json new file mode 100644 index 0000000..1888ae4 --- /dev/null +++ b/config_files/webvoyager/Google Search--31.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google search" + ], + "task_id": "Google Search--31", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/", + "geolocation": null, + "intent": "Discover which year Cristiano Ronaldo scored the most goals in a single season.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "2014-15 season" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Search--40.json b/config_files/webvoyager/Google Search--40.json new file mode 100644 index 0000000..13eed28 --- /dev/null +++ b/config_files/webvoyager/Google Search--40.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google search" + ], + "task_id": "Google Search--40", + "require_login": false, + "storage_state": null, + "start_url": "https://www.google.com/", + "geolocation": null, + "intent": "Look up the elevation of Mount Kilimanjaro on Google Search.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "19,341 feet (5,895 meters)" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Google Search--7.json b/config_files/webvoyager/Google Search--7.json new file mode 100644 index 0000000..c12e8c3 --- /dev/null +++ b/config_files/webvoyager/Google Search--7.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "google search" + ], + "task_id": "Google Search--7", + "require_login": false, + 
"storage_state": null, + "start_url": "https://www.google.com/", + "geolocation": null, + "intent": "Find the software requirements for iPhones that support AirDrop's ability to continue transmitting over the web when out of range.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "IOS 17.1" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Huggingface--10.json b/config_files/webvoyager/Huggingface--10.json new file mode 100644 index 0000000..b87f307 --- /dev/null +++ b/config_files/webvoyager/Huggingface--10.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "huggingface" + ], + "task_id": "Huggingface--10", + "require_login": false, + "storage_state": null, + "start_url": "https://huggingface.co/", + "geolocation": null, + "intent": "Open space: argilla/notux-chat-ui and interact with it by asking it 'which team trained you'. What is its answer.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Mistral AI team" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Huggingface--13.json b/config_files/webvoyager/Huggingface--13.json new file mode 100644 index 0000000..cd9e46e --- /dev/null +++ b/config_files/webvoyager/Huggingface--13.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "huggingface" + ], + "task_id": "Huggingface--13", + "require_login": false, + "storage_state": null, + "start_url": "https://huggingface.co/", + "geolocation": null, + "intent": "Search for LLaMA in the huggingface doc, what type is the spaces_between_special_tokens parameter in LlamaTokenizer and what is its default value.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "bool, defaults to False" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Huggingface--14.json b/config_files/webvoyager/Huggingface--14.json new 
file mode 100644 index 0000000..1e04cd0 --- /dev/null +++ b/config_files/webvoyager/Huggingface--14.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "huggingface" + ], + "task_id": "Huggingface--14", + "require_login": false, + "storage_state": null, + "start_url": "https://huggingface.co/", + "geolocation": null, + "intent": "How much is the Pro account of Hugging face for a month and what are the features?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "$9/month; Pro Account: Get a PRO badge on your profile, Early access to new features, Unlock Inference for PROs, Higher tier for AutoTrain" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Huggingface--18.json b/config_files/webvoyager/Huggingface--18.json new file mode 100644 index 0000000..41ff700 --- /dev/null +++ b/config_files/webvoyager/Huggingface--18.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "huggingface" + ], + "task_id": "Huggingface--18", + "require_login": false, + "storage_state": null, + "start_url": "https://huggingface.co/", + "geolocation": null, + "intent": "Look up TRL's forward modelling in the hugging face documentation on how to add a margin to a loss.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "As in the Llama 2 paper, you can add a margin to the loss by adding a margin column to the dataset. The reward collator will automatically pass it through and the loss will be computed accordingly." 
+ } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Huggingface--20.json b/config_files/webvoyager/Huggingface--20.json new file mode 100644 index 0000000..b5746af --- /dev/null +++ b/config_files/webvoyager/Huggingface--20.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "huggingface" + ], + "task_id": "Huggingface--20", + "require_login": false, + "storage_state": null, + "start_url": "https://huggingface.co/", + "geolocation": null, + "intent": "Locate a pre-trained natural language processing model on Hugging Face that specializes in named entity recognition (NER), confirm that the model was last updated in 2022 and has 1M+ downloads.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "ckiplab/bert-base-chinese-ner" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Huggingface--21.json b/config_files/webvoyager/Huggingface--21.json new file mode 100644 index 0000000..76a184f --- /dev/null +++ b/config_files/webvoyager/Huggingface--21.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "huggingface" + ], + "task_id": "Huggingface--21", + "require_login": false, + "storage_state": null, + "start_url": "https://huggingface.co/", + "geolocation": null, + "intent": "Look up the tour about how to use the 'pipeline' feature in the Hugging Face Transformers library for sentiment analysis, and identify the default model it uses.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "from transformers import pipeline \\n classifier = pipeline(\"sentiment-analysis\") \\n classifier(\"We are very happy to show you the \ud83e\udd17 Transformers library.\") ... 
distilbert/distilbert-base-uncased-finetuned-sst-2-english" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Huggingface--24.json b/config_files/webvoyager/Huggingface--24.json new file mode 100644 index 0000000..2725db2 --- /dev/null +++ b/config_files/webvoyager/Huggingface--24.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "huggingface" + ], + "task_id": "Huggingface--24", + "require_login": false, + "storage_state": null, + "start_url": "https://huggingface.co/", + "geolocation": null, + "intent": "Search for a model on Hugging Face with an Apache-2.0 license that has received the highest number of likes.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "mistralai/Mixtral-8x7B-Instruct-v0.1" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Huggingface--25.json b/config_files/webvoyager/Huggingface--25.json new file mode 100644 index 0000000..3cfc13c --- /dev/null +++ b/config_files/webvoyager/Huggingface--25.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "huggingface" + ], + "task_id": "Huggingface--25", + "require_login": false, + "storage_state": null, + "start_url": "https://huggingface.co/", + "geolocation": null, + "intent": "In the Hugging Face documentation, find the tutorial on loading adapters with PEFT, tell me how to load in 8bit or 4bit.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Add the load_in_8bit or load_in_4bit parameters to from_pretrained() and set device_map=\"auto\" to effectively distribute the model to your hardware. 
(Or use code)" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Huggingface--27.json b/config_files/webvoyager/Huggingface--27.json new file mode 100644 index 0000000..4f49e22 --- /dev/null +++ b/config_files/webvoyager/Huggingface--27.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "huggingface" + ], + "task_id": "Huggingface--27", + "require_login": false, + "storage_state": null, + "start_url": "https://huggingface.co/", + "geolocation": null, + "intent": "Determine the most downloaded dataset related to Text Retrieval in NLP on Hugging Face.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "nlphuji/mscoco_2014_5k_test_image_text_retrieval" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Huggingface--30.json b/config_files/webvoyager/Huggingface--30.json new file mode 100644 index 0000000..936314f --- /dev/null +++ b/config_files/webvoyager/Huggingface--30.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "huggingface" + ], + "task_id": "Huggingface--30", + "require_login": false, + "storage_state": null, + "start_url": "https://huggingface.co/", + "geolocation": null, + "intent": "Identify the most downloaded English-Chinese (en-zh) machine translation model on Huggingface and report its latest performance metrics and usage guidelines.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Helsinki-NLP/opus-mt-en-zh; testset, BLEU, chr-F: Tatoeba-test.eng.zho, 31.4, 0.268" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Huggingface--32.json b/config_files/webvoyager/Huggingface--32.json new file mode 100644 index 0000000..decbe9c --- /dev/null +++ b/config_files/webvoyager/Huggingface--32.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "huggingface" + ], + "task_id": "Huggingface--32", + "require_login": false, + "storage_state": null, + "start_url": 
"https://huggingface.co/", + "geolocation": null, + "intent": "On the Hugging Face website, search for the model 'GPT-J-6B' and find the 'temperature' parameter in its settings. What is the default value of this parameter?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "\"temperature\": 1.0" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Huggingface--38.json b/config_files/webvoyager/Huggingface--38.json new file mode 100644 index 0000000..c977849 --- /dev/null +++ b/config_files/webvoyager/Huggingface--38.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "huggingface" + ], + "task_id": "Huggingface--38", + "require_login": false, + "storage_state": null, + "start_url": "https://huggingface.co/", + "geolocation": null, + "intent": "Investigate the 'transformers' library in the Hugging Face documentation, focusing on how to add new tokens to a tokenizer.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "use add_tokens method" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Huggingface--41.json b/config_files/webvoyager/Huggingface--41.json new file mode 100644 index 0000000..b70eddc --- /dev/null +++ b/config_files/webvoyager/Huggingface--41.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "huggingface" + ], + "task_id": "Huggingface--41", + "require_login": false, + "storage_state": null, + "start_url": "https://huggingface.co/", + "geolocation": null, + "intent": "What is the current Text-to-3D model with the highest number of downloads and tell me are there Spaces that use the model.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "openai/shap-e; there are Spaces like hysts/Shap-E ..." 
+ } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Huggingface--42.json b/config_files/webvoyager/Huggingface--42.json new file mode 100644 index 0000000..26d93e9 --- /dev/null +++ b/config_files/webvoyager/Huggingface--42.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "huggingface" + ], + "task_id": "Huggingface--42", + "require_login": false, + "storage_state": null, + "start_url": "https://huggingface.co/", + "geolocation": null, + "intent": "Check the Dataset Viewer for ai2lumos/lumos_complex_qa_plan_onetime on Hugging face. what is the content corresponding to user in the first message?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "content: Please provide a reasonable subgoal-based plan to solve the given task.\\nTask: What was the opening date of the museum dedicated to the war that, after it occurred, Boston became one of the wealthiest international ports?; Initial Environment Description: None." 
+ } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Huggingface--6.json b/config_files/webvoyager/Huggingface--6.json new file mode 100644 index 0000000..7f0cde9 --- /dev/null +++ b/config_files/webvoyager/Huggingface--6.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "huggingface" + ], + "task_id": "Huggingface--6", + "require_login": false, + "storage_state": null, + "start_url": "https://huggingface.co/", + "geolocation": null, + "intent": "Find the model sentence-transformers/all-MiniLM-L6-v2 and use the Inference API on the webpage to get the similarity of the following two sentences: 'Tomorrow is Sunday', 'Eat a burger on Sunday'.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "0.550" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Huggingface--7.json b/config_files/webvoyager/Huggingface--7.json new file mode 100644 index 0000000..2bcb721 --- /dev/null +++ b/config_files/webvoyager/Huggingface--7.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "huggingface" + ], + "task_id": "Huggingface--7", + "require_login": false, + "storage_state": null, + "start_url": "https://huggingface.co/", + "geolocation": null, + "intent": "Which is the most downloaded audio related dataset on Hugging face currently.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "autumnjohnson/ceti_audio" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Huggingface--9.json b/config_files/webvoyager/Huggingface--9.json new file mode 100644 index 0000000..124fea2 --- /dev/null +++ b/config_files/webvoyager/Huggingface--9.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "huggingface" + ], + "task_id": "Huggingface--9", + "require_login": false, + "storage_state": null, + "start_url": "https://huggingface.co/", + "geolocation": null, + "intent": "Find the most download machine translation model on 
Huggingface which focuses on English and Japanese (en-ja) and report the evaluation metrics stated for it.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Helsinki-NLP/opus-mt-ja-en; BLEU 41.7\t; chr-F 0.589" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--0.json b/config_files/webvoyager/Wolfram Alpha--0.json new file mode 100644 index 0000000..cb686ed --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--0.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--0", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "derivative of x^2 when x=5.6", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "11.2" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--10.json b/config_files/webvoyager/Wolfram Alpha--10.json new file mode 100644 index 0000000..cf66c84 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--10.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--10", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Give the geomagnetic field on June 20, 2023 in Oslo.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "geomagnetic field, total 51.5 uT;" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--11.json b/config_files/webvoyager/Wolfram Alpha--11.json new file mode 100644 index 0000000..8d15b24 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--11.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--11", + "require_login": false, + 
"storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Show the electrical resistivity of UNS A92024 and UNS G10800 at 20 degrees Celsius.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "UNS A92024: 4.9\u00d710^-6 \u03a9 cm (ohm centimeters) (at 20 \u00b0C); UNS G10800: 1.8\u00d710^-5 \u03a9 cm (ohm centimeters)" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--12.json b/config_files/webvoyager/Wolfram Alpha--12.json new file mode 100644 index 0000000..4a5d642 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--12.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--12", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Which character in unicode 8900 to 8920 looks like a snowflake", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "8902 (U+22C6)" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--15.json b/config_files/webvoyager/Wolfram Alpha--15.json new file mode 100644 index 0000000..ddd3a6f --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--15.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--15", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Show the blood relationship fraction between you and your father's mother's sister's son.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "3.125%" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--17.json b/config_files/webvoyager/Wolfram Alpha--17.json new file mode 
100644 index 0000000..c984595 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--17.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--17", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Show the average price of movie ticket in Providence, Nashville, Boise in 2023.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Providence $13.81; Nashville $12.65; Boise $12.65" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--2.json b/config_files/webvoyager/Wolfram Alpha--2.json new file mode 100644 index 0000000..fb94bf3 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--2.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--2", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Calculate 3^71 and retain 5 significant figures in scientific notation.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "7.5095 * 10^33" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--20.json b/config_files/webvoyager/Wolfram Alpha--20.json new file mode 100644 index 0000000..8992334 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--20.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--20", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Compute the integral of 3e^(2x) from x=0 to x=5.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "approximately 33038" + } + } +} \ No newline at end of file diff --git 
a/config_files/webvoyager/Wolfram Alpha--21.json b/config_files/webvoyager/Wolfram Alpha--21.json new file mode 100644 index 0000000..29d6028 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--21.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--21", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Calculate (1+0.1*i)^8 + (1\u22120.2*i)^8 where i is a complex number.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "approximately 0.717183 - 0.425258 i" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--22.json b/config_files/webvoyager/Wolfram Alpha--22.json new file mode 100644 index 0000000..f480aa6 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--22.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--22", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Determine the area of a regular hexagon with a side length of 7 cm.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "127.306 cm^2 or 147 \\sqrt(3) / 2" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--23.json b/config_files/webvoyager/Wolfram Alpha--23.json new file mode 100644 index 0000000..ddd97f0 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--23.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--23", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Calculate the population growth rate of Canada from 2020 to 2023 using Wolfram Alpha.", + "require_reset": false, + "eval": { + "eval_types": 
[ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "mean population growth rate of Canada from 2020 to 2023 is 0.9998% per year" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--24.json b/config_files/webvoyager/Wolfram Alpha--24.json new file mode 100644 index 0000000..68e67e0 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--24.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--24", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Solve the differential equation y''(t) - 2y'(t) + 10y(t) = 0 and display its general solution.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "y(t) = c1 e^t sin(3t) + c2 e^t cos(3t)" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--25.json b/config_files/webvoyager/Wolfram Alpha--25.json new file mode 100644 index 0000000..e3c5e9d --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--25.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--25", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Calculate the final position and velocity of a projectile launched at 45 degrees with an initial speed of 30 m/s after 3 seconds.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "if g=9.81; x = 63.64m, y = 19.49m; Vx = 21.21 m/s, Vy = -8.22 m/s" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--27.json b/config_files/webvoyager/Wolfram Alpha--27.json new file mode 100644 index 0000000..d87f201 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--27.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" 
+ ], + "task_id": "Wolfram Alpha--27", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Display the thermal conductivity of Copper (Cu) and Aluminum (Al) at 25 degrees Celsius.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "401.2 W/(m K); 236.9 W/(m K)" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--28.json b/config_files/webvoyager/Wolfram Alpha--28.json new file mode 100644 index 0000000..1527760 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--28.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--28", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Identify the character in Unicode range 9632 to 9650 that represents a hollow parallelogram.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "9649 or U+25B1" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--3.json b/config_files/webvoyager/Wolfram Alpha--3.json new file mode 100644 index 0000000..5eb276e --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--3.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--3", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Let g(x) be the integral of x^2 cos(2x). 
Write the expression of g(x).", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "1/4 (2 x cos(2 x) + (-1 + 2 x^2) sin(2 x)) + Constant" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--32.json b/config_files/webvoyager/Wolfram Alpha--32.json new file mode 100644 index 0000000..f43362b --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--32.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--32", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Print all prime numbers between 1000 and 1200 using Wolfram alpha.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, 1171, 1181, 1187, 1193." 
+ } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--33.json b/config_files/webvoyager/Wolfram Alpha--33.json new file mode 100644 index 0000000..fa9b90d --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--33.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--33", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Identify the electrical energy output of a hydroelectric power plant named Itaipu Dam in 2023 using Wolfram Alpha.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "89.5 TWh (terawatt hours)" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--34.json b/config_files/webvoyager/Wolfram Alpha--34.json new file mode 100644 index 0000000..9c428b9 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--34.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--34", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Calculate the mass of Jupiter compared to Earth using Wolfram Alpha. 
Also, find the length of one day on Jupiter.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "317.8 times that of Earth, and the length of one day on Jupiter is approximately 9.925 hours" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--35.json b/config_files/webvoyager/Wolfram Alpha--35.json new file mode 100644 index 0000000..f515fcb --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--35.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--35", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Calculate the determinant of a 6x6 Hilbert matrix.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "1/186313420339200000" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--36.json b/config_files/webvoyager/Wolfram Alpha--36.json new file mode 100644 index 0000000..1b1c4b7 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--36.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--36", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Determine the convergence or divergence of the series \u03a3 (n=1 to \u221e) of 1/(n^3 + 1).", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "converges" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--37.json b/config_files/webvoyager/Wolfram Alpha--37.json new file mode 100644 index 0000000..936a6fd --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--37.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram 
Alpha--37", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "How many days are there between February 12, 2024 and August 9, 2050?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "9675" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--38.json b/config_files/webvoyager/Wolfram Alpha--38.json new file mode 100644 index 0000000..f8adeda --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--38.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--38", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Compute the length of a curve defined by y = 2x^3 - 3x^2 + 4x - 5 from x = 0 to x = 3.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "around 39.2" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--39.json b/config_files/webvoyager/Wolfram Alpha--39.json new file mode 100644 index 0000000..5876cb3 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--39.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--39", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Use Wolfram alpha to write the expression of the ellipse x^2 + 3 y^2 = 4 rotated 33 degrees counterclockwise.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "x^2(\\sin(\\frac{2\u03c0}{15}) - 2) + 2xy \\cos(\\frac{2\u03c0}{15}) + 4 = y^2(2 + \\sin(\\frac{2\u03c0}{15}))" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--4.json 
b/config_files/webvoyager/Wolfram Alpha--4.json new file mode 100644 index 0000000..ff86e05 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--4.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--4", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Pack 24 circles in a circle radius r. Compare Densest known packing and Square packing. Then tell me the radius of the inner circles.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "Densest known packing: 0.176939r; Square packing: 0.163961r" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--40.json b/config_files/webvoyager/Wolfram Alpha--40.json new file mode 100644 index 0000000..50bcec9 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--40.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--40", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Approximate amount of fat burned by a 28yo, 172cm tall, 70kg woman running for 30min at a pace of 6min/mile.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "around 0.078 kg" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--41.json b/config_files/webvoyager/Wolfram Alpha--41.json new file mode 100644 index 0000000..2928443 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--41.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--41", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "What is the approximate Heart Rate Reserve of a 50 year old man who has a heart 
rate of 60bpm at rest.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "110 bpm" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--42.json b/config_files/webvoyager/Wolfram Alpha--42.json new file mode 100644 index 0000000..6b6e653 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--42.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--42", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "What is the raw memory of a 100.2\" * 123.5\" true colour picture at 72 ppi?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "192 MB" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--43.json b/config_files/webvoyager/Wolfram Alpha--43.json new file mode 100644 index 0000000..3442437 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--43.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--43", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "A polyominoes of order 6 means you have 6 identical squares to combine different shapes (2-sided). How many combinations are there? 
Looking at all the shapes in the result, how many of them have only 2 rows in total?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "35; 12" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--45.json b/config_files/webvoyager/Wolfram Alpha--45.json new file mode 100644 index 0000000..afafdc5 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--45.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--45", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "A 175cm tall, 85kg, 40yo man climbs 2500 steps at about 18cm per step and 40 steps per minute. summarise the Metabolic properties.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "energy expenditure | 2720 kJ (kilojoules); average energy expenditure per step | 1.1 kJ/step (kilojoules per step); fat burned | 0.0842 kg (kilograms); oxygen consumption | 129.9 L (liters); metabolic equivalent | 7 metabolic equivalents" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--5.json b/config_files/webvoyager/Wolfram Alpha--5.json new file mode 100644 index 0000000..88c92c7 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--5.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--5", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Show the solution of y\"(z) + sin(y(z)) = 0 from wolframalpha.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "y(z) = \u00b1 2 am(1/2 sqrt((c_1 + 2) (z + c_2)^2), 4/(c_1 + 2)), am(x, m) is the Jacobi amplitude function" + } + } +} \ No newline at end 
of file diff --git a/config_files/webvoyager/Wolfram Alpha--6.json b/config_files/webvoyager/Wolfram Alpha--6.json new file mode 100644 index 0000000..d34aa4a --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--6.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--6", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Simplify x^5-20x^4+163x^3-676x^2+1424x-1209 so that it has fewer items.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "7 + 3 (-4 + x)^3 + (-4 + x)^5" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--7.json b/config_files/webvoyager/Wolfram Alpha--7.json new file mode 100644 index 0000000..b2ccc40 --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--7.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--7", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Give the final angle and final length after 6s of a Spring pendulum with spring equilibrium length=0.12m, initial length=0.24m, initial angle=80deg, mass=1kg, spring constant=120 N/m .", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "-73.26\u00b0 from vertical; 0.252 m" + } + } +} \ No newline at end of file diff --git a/config_files/webvoyager/Wolfram Alpha--9.json b/config_files/webvoyager/Wolfram Alpha--9.json new file mode 100644 index 0000000..c32cb9e --- /dev/null +++ b/config_files/webvoyager/Wolfram Alpha--9.json @@ -0,0 +1,20 @@ +{ + "sites": [ + "wolfram alpha" + ], + "task_id": "Wolfram Alpha--9", + "require_login": false, + "storage_state": null, + "start_url": "https://www.wolframalpha.com/", + "geolocation": null, + "intent": "Annual energy 
def run():
    """Run the configured agent over every selected task and log the results.

    Reads the YAML file passed via ``--config``, resolves the task config
    files under ``config_files/<relative_task_dir>``, builds an agent factory
    for ``config.agent.type`` and then, for each task, creates a fresh
    environment and agent and lets the agent act.  When ``config.logging`` is
    truthy a per-task JSON log and a ``summary.csv`` row are written through
    ``log_run``, and tasks whose log file already exists are skipped.

    Raises:
        NotImplementedError: if ``config.agent.type`` is not one of
            ``AgentOccam``, ``AgentOccam-SteP`` or ``SteP-replication``.
    """
    # NOTE(review): ``yaml``, ``json``, ``random``, ``DotDict`` and ``log_run``
    # are not imported explicitly in this file; presumably they arrive through
    # the ``webagents_step.utils.data_prep`` star import -- confirm.
    parser = argparse.ArgumentParser(
        description="Only the config file argument should be passed"
    )
    parser.add_argument(
        "--config", type=str, required=True, help="yaml config file location"
    )
    args = parser.parse_args()
    with open(args.config, "r") as file:
        config = DotDict(yaml.safe_load(file))

    # ``dstdir`` stays None when logging is disabled so the resume check below
    # can be skipped instead of failing on an unbound name.
    dstdir = None
    if config.logging:
        run_name = config.logname if config.logname else time.strftime("%Y%m%d-%H%M%S")
        dstdir = f"{config.logdir}/{run_name}"
        os.makedirs(dstdir, exist_ok=True)
        # Keep a copy of the exact config next to the logs it produced.
        shutil.copyfile(args.config, os.path.join(dstdir, os.path.basename(args.config)))
    random.seed(42)

    task_ids = config.env.task_ids
    relative_task_dir = getattr(config.env, "relative_task_dir", "tasks")
    task_dir = f"config_files/{relative_task_dir}"
    if task_ids == "all" or task_ids == ["all"]:
        task_ids = [
            filename[:-len(".json")]
            for filename in os.listdir(task_dir)
            if filename.endswith(".json")
        ]
    config_file_list = [f"{task_dir}/{task_id}.json" for task_id in task_ids]

    fullpage = getattr(config.env, "fullpage", True)
    current_viewport_only = not fullpage

    def make_step_agent(template_module):
        """Build a StepAgent whose prompt dicts are taken from *template_module*."""
        return StepAgent(
            root_action=config.agent.root_action,
            action_to_prompt_dict={
                k: v for k, v in template_module.__dict__.items() if isinstance(v, dict)
            },
            low_level_action_list=config.agent.low_level_action_list,
            max_actions=config.env.max_env_steps,
            verbose=config.verbose,
            logging=config.logging,
            debug=config.debug,
            model=config.agent.model_name,
            prompt_mode=config.agent.prompt_mode,
        )

    # ``agent_init`` is a zero-argument factory so every task gets a new agent.
    agent_type = config.agent.type
    if agent_type == "AgentOccam":
        agent_init = lambda: AgentOccam(
            prompt_dict={
                k: v for k, v in AgentOccam_prompt.__dict__.items() if isinstance(v, dict)
            },
            config=config.agent,
        )
    elif agent_type == "AgentOccam-SteP":
        agent_init = lambda: make_step_agent(step_fewshot_template_adapted)
    elif agent_type == "SteP-replication":
        agent_init = lambda: make_step_agent(step_fewshot_template)
    else:
        raise NotImplementedError(f"{config.agent.type} not implemented")

    for config_file in config_file_list:
        with open(config_file, "r") as f:
            task_config = json.load(f)
        task_id = task_config['task_id']
        print(f"Task {task_id}.")

        # Resume support: a task that already has a log file is not re-run.
        if dstdir is not None and os.path.exists(os.path.join(dstdir, f"{task_id}.json")):
            print(f"Skip {task_id}.")
            continue
        if task_id in range(600, 650) or task_id in range(681, 689):
            print("Reddit post task. Sleep 30 mins.")
            time.sleep(1800)

        # NOTE(review): ``max_steps`` is read from the top-level config while the
        # StepAgent limit above uses ``config.env.max_env_steps`` -- confirm that
        # both keys are intended.
        env = WebArenaEnvironmentWrapper(
            config_file=config_file,
            max_browser_rows=config.env.max_browser_rows,
            max_steps=config.max_steps,
            slow_mo=1,
            observation_type="accessibility_tree",
            current_viewport_only=current_viewport_only,
            viewport_size={"width": 1920, "height": 1080},
            headless=config.env.headless,
            global_config=config,
        )
        try:
            agent = agent_init()
            objective = env.get_objective()
            status = agent.act(objective=objective, env=env)
        finally:
            # Release the environment even when the agent raises.
            env.close()

        if config.logging:
            log_file = os.path.join(dstdir, f"{task_id}.json")
            model_name = (
                config.agent.actor.model
                if hasattr(config.agent, "actor")
                else config.agent.model_name
            )
            log_data = {
                "task": config_file,
                "id": task_id,
                "model": model_name,
                "type": config.agent.type,
                "trajectory": agent.get_trajectory(),
            }
            summary_file = os.path.join(dstdir, "summary.csv")
            # Record "<run dir>/<task>.json"; fall back to the full path when the
            # pattern does not match (e.g. non-POSIX separators) rather than
            # crashing on ``None.group``.
            logfile_match = re.search(r"/([^/]+/[^/]+\.json)$", log_file)
            summary_data = {
                "task": config_file,
                "task_id": task_id,
                "model": model_name,
                "type": config.agent.type,
                "logfile": logfile_match.group(1) if logfile_match else log_file,
            }
            if status:
                summary_data.update(status)
            log_run(
                log_file=log_file,
                log_data=log_data,
                summary_file=summary_file,
                summary_data=summary_data,
            )
+"""base class for evaluation""" +# answer string match +import collections +import html +import importlib +import json +import time +import urllib +from pathlib import Path +from typing import Any, Tuple, Union, Optional + +from beartype import beartype +import nltk +nltk.download('punkt') +from nltk.tokenize import word_tokenize # type: ignore + +from playwright.sync_api import CDPSession, Page + +from browser_env.actions import Action +from browser_env.utils import StateInfo +from evaluation_harness.helper_functions import ( + PseudoPage, + gitlab_get_project_memeber_role, + llm_fuzzy_match, + llm_ua_match, + reddit_get_post_url, + shopping_get_latest_order_url, + shopping_get_sku_latest_review_author, + shopping_get_sku_latest_review_rating, +) + +Trajectory = list[Union[Action, StateInfo]] + + +class Evaluator(object): + def __init__(self, eval_tag: str = "") -> None: + self.eval_tag = eval_tag + + @beartype + def __call__( + self, + trajectory: Trajectory, + config_file: Path | str, + page: Page | PseudoPage, + client: CDPSession, + ) -> float: + raise NotImplementedError + + @staticmethod + def get_last_action(trajectory: Trajectory) -> Action: + try: + # is_bearable(trajectory[-1], Action) + last_action = trajectory[-1] + except Exception: + raise ValueError( + "The last element of trajectory should be an action, add a fake stop action if needed" + ) + + return last_action # type: ignore[return-value] + + @staticmethod + def get_last_state(trajectory: Trajectory) -> StateInfo: + try: + # is_bearable(trajectory[-2], StateInfo) + last_state = trajectory[-2] + except Exception: + raise ValueError( + "The second last element of trajectory should be a state, add a fake stop action if needed" + ) + + return last_state # type: ignore[return-value] + + +class StringEvaluator(Evaluator): + """Check whether the answer is correct with: + exact match: the answer is exactly the same as the reference answer + must include: each phrase in the reference answer must be 
included in the answer + fuzzy match: the answer is similar to the reference answer, using LLM judge + """ + + @staticmethod + @beartype + def clean_answer(answer: str) -> str: + answer = answer.strip() + if answer.startswith("'") and answer.endswith("'"): + answer = answer[1:-1] + elif answer.startswith('"') and answer.endswith('"'): + answer = answer[1:-1] + return answer.lower() + + @staticmethod + @beartype + def exact_match(ref: str, pred: str) -> float: + return float( + StringEvaluator.clean_answer(pred) + == StringEvaluator.clean_answer(ref) + ) + + @staticmethod + @beartype + def must_include(ref: str, pred: str, tokenize: bool = False) -> float: + clean_ref = StringEvaluator.clean_answer(ref) + clean_pred = StringEvaluator.clean_answer(pred) + # tokenize the answer if the ref is a single word + # prevent false positive (e.g, 0) + if " |or| " in clean_ref or " |OR| " in clean_ref: + refs = clean_ref.split(" |or| ") if " |or| " in clean_ref else clean_ref.split(" |OR| ") + refs = [r.strip() for r in refs] + for r in refs: + if ( + tokenize + and len(r) == 1 + and len(word_tokenize(r)) == 1 + ): + tok_pred = word_tokenize(r) + if r in tok_pred: + return float(r in tok_pred) + else: + if r in clean_pred: + return float(r in clean_pred) + return 0.0 + if ( + tokenize + and len(clean_ref) == 1 + and len(word_tokenize(clean_ref)) == 1 + ): + tok_pred = word_tokenize(clean_pred) + return float(clean_ref in tok_pred) + else: + return float(clean_ref in clean_pred) + + @staticmethod + @beartype + def fuzzy_match(ref: str, pred: str, intent: str) -> float: + return llm_fuzzy_match(pred, ref, intent) + + @staticmethod + @beartype + def ua_match(ref: str, pred: str, intent: str) -> float: + return llm_ua_match(pred, ref, intent) + + def __call__( + self, + trajectory: Trajectory, + config_file: Path | str, + page: Page | PseudoPage | None = None, + client: CDPSession | None = None, + ) -> float: + with open(config_file, "r") as f: + configs = json.load(f) + + 
last_action = self.get_last_action(trajectory) + pred = self.clean_answer(last_action["answer"]) + + score = 1.0 + for approach, value in configs["eval"]["reference_answers"].items(): + match approach: + case "exact_match": + score *= self.exact_match(ref=value, pred=pred) + + case "must_include": + assert isinstance(value, list) + must_include_score = 0. + for must_value in value: + must_include_score += self.must_include( + ref=must_value, + pred=pred, + tokenize=(len(value) == 1), + ) + must_include_score /= len(value) + score *= must_include_score + case "fuzzy_match": + intent = configs["intent"] + if value == "N/A": + # if the instruction only asks the model to generate N/A when encountering an unachievable task + # without more concrete reasons + score *= self.exact_match(ref=value, pred=pred) + # if the instruction also asks the model to generate the reason why the task is unachievable + # this should be the default as it will prevent false positive N/A` + if score != 1: + score = 1.0 * self.ua_match( + intent=configs["intent"], + ref=configs["eval"]["string_note"], + pred=pred, + ) + else: + if isinstance(value, list): + fuzzy_match_value = "; ".join(value) + else: + fuzzy_match_value = value + fuzzy_match_score = self.fuzzy_match( + ref=fuzzy_match_value, pred=pred, intent=intent + ) + score *= fuzzy_match_score + return score + + +class URLEvaluator(Evaluator): + """Check URL matching""" + + @beartype + def __call__( + self, + trajectory: Trajectory, + config_file: Path | str, + page: Page | PseudoPage, + client: CDPSession | None = None, + ) -> float: + with open(config_file, "r") as f: + configs = json.load(f) + + def clean_url(url: str) -> str: + url = str(url) + url = url.rstrip("/") + url = url.lower() + return url + + def parse_url(url: str) -> tuple[str, dict[str, list[str]]]: + """Parse a URL into its base, path, and query components.""" + parsed_url = urllib.parse.urlparse(url) + base_path = parsed_url.netloc + parsed_url.path + query = 
urllib.parse.parse_qs(parsed_url.query) + return base_path, query + + def parse_urls( + urls: list[str], + ) -> tuple[list[str], dict[str, set[str]]]: + """Parse a list of URLs.""" + base_paths = [] + queries = collections.defaultdict(set) + for url in urls: + base_path, query = parse_url(url) + base_paths.append(base_path) + for k, v in query.items(): + queries[k].update(v) + return base_paths, queries + + pred = clean_url(page.url) + matching_rule = configs["eval"].get("url_note", "GOLD in PRED") + if matching_rule == "GOLD in PRED": + if "or" in configs["eval"].keys(): + or_ref_urls_list = [configs["eval"]["reference_url"]] + [item["reference_url"] for item in configs["eval"]["or"]] + else: + or_ref_urls_list = [configs["eval"]["reference_url"]] + or_score_list = [] + for or_ref_urls in or_ref_urls_list: + ref_urls = or_ref_urls.split(" |OR| ") + ref_urls = [clean_url(url) for url in ref_urls] + ref_base_paths, ref_queries = parse_urls(ref_urls) + pred_base_paths, pred_query = parse_url(pred) + + base_score = float( + any( + [ + ref_base_path in pred_base_paths + for ref_base_path in ref_base_paths + ] + ) + ) + query_score = 1.0 + for k, possible_values in ref_queries.items(): + query_score *= float( + any( + possible_ref_value in pred_query.get(k, []) + for possible_ref_value in possible_values + ) + ) + or_score_list.append(base_score * query_score) + score = max(or_score_list) + + else: + raise ValueError(f"Unknown matching rule: {matching_rule}") + + return score + + +class HTMLContentEvaluator(Evaluator): + """Check whether the contents appear in the page""" + + @beartype + def __call__( + self, + trajectory: Trajectory, + config_file: Path | str, + page: Page | PseudoPage, + client: CDPSession | None = None, + ) -> float: + with open(config_file, "r") as f: + configs = json.load(f) + + targets = configs["eval"]["program_html"] + + score = 1.0 + for target in targets: + if "or" in target.keys(): + or_target_list = [target] + [t for t in target["or"]] + 
else: + or_target_list = [target] + or_score_list = [] + for or_target in or_target_list: + target_url: str = or_target["url"] # which url to check + if target_url.startswith("func"): + func = target_url.split("func:")[1] + func = func.replace("__last_url__", page.url) + target_url = eval(func) + + locator: str = or_target["locator"] # js element locator + + # navigate to that url + if target_url != "last": + page.goto(target_url) + time.sleep(3) # TODO [shuyanzh]: fix this hard-coded sleep + + # empty, use the full page + if not locator.strip(): + selected_element = page.content() + # use JS to select the element + elif locator.startswith("document.") or locator.startswith( + "[...document." + ): + if "prep_actions" in or_target: + try: + for prep_action in or_target["prep_actions"]: + page.evaluate(f"() => {prep_action}") + except Exception: + pass + try: + selected_element = str(page.evaluate(f"() => {locator}")) + if not selected_element: + selected_element = "" + except Exception: + # the page is wrong, return empty + selected_element = "" + # run program to call API + elif locator.startswith("func:"): # a helper function + func = locator.split("func:")[1] + func = func.replace("__page__", "page") + selected_element = eval(func) + else: + raise ValueError(f"Unknown locator: {locator}") + + selected_element = html.unescape(selected_element) + + if "exact_match" in or_target["required_contents"]: + required_contents = or_target["required_contents"]["exact_match"] + cur_score = StringEvaluator.exact_match( + ref=required_contents, pred=selected_element + ) + or_score_list.append(cur_score) + print(f"[exact match] {cur_score}, selected element: {selected_element}, required contents: {required_contents}") + elif "must_include" in or_target["required_contents"]: + required_contents = or_target["required_contents"]["must_include"] + assert isinstance(required_contents, list) + content_score_list = [] + for content in required_contents: + content_or = content.split(" 
@beartype
def evaluator_router(config_file: Path | str) -> EvaluatorComb:
    """Build the combined evaluator requested by a task config file.

    The file is read as JSON and ``configs["eval"]["eval_types"]`` is walked
    in order; one evaluator instance is created per listed type.

    Args:
        config_file: Path to the JSON task configuration.

    Returns:
        An ``EvaluatorComb`` wrapping the instantiated evaluators.

    Raises:
        ValueError: If a listed type is not ``string_match``, ``url_match``
            or ``program_html``.
    """
    with open(config_file, "r") as config_fp:
        task_config = json.load(config_fp)

    selected: list[Evaluator] = []
    for eval_type in task_config["eval"]["eval_types"]:
        # Plain equality checks, mirroring literal pattern matching.
        if eval_type == "string_match":
            selected.append(StringEvaluator())
        elif eval_type == "url_match":
            selected.append(URLEvaluator())
        elif eval_type == "program_html":
            selected.append(HTMLContentEvaluator())
        else:
            raise ValueError(f"eval_type {eval_type} is not supported")

    return EvaluatorComb(selected)
def shopping_get_auth_token() -> str:
    """Request an admin REST token from the shopping site.

    Posts the ``shopping_site_admin`` credentials from ``ACCOUNTS`` to the
    integration token endpoint under ``SHOPPING`` and returns the decoded
    JSON body of the response.

    Returns:
        The decoded JSON response body, used by callers as a bearer token.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
    """
    response = requests.post(
        url=f"{SHOPPING}/rest/default/V1/integration/admin/token",
        headers={"content-type": "application/json"},
        data=json.dumps(
            {
                "username": ACCOUNTS["shopping_site_admin"]["username"],
                "password": ACCOUNTS["shopping_site_admin"]["password"],
            }
        ),
    )
    # Fail here rather than hand an error payload back as the "token": the
    # callers interpolate the return value straight into an Authorization
    # header, which would only surface later as an unrelated failure.
    response.raise_for_status()
    token: str = response.json()
    return token
def reddit_get_post_url(url: str) -> str:
    """Reduce a post or comment URL to the canonical URL of its post.

    The expected layout is ``<scheme>://<domain>/f/<subreddit>/<post_id>/...``.
    Anything after the post id (title slug, comment path) is dropped.

    Args:
        url: The URL to normalise.

    Returns:
        ``<scheme>://<domain>/f/<subreddit>/<post_id>/`` when ``url`` matches
        the expected layout; otherwise ``url`` unchanged.
    """
    parsed = urlparse(url)
    # A leading "/" yields an empty first element:
    # "/f/sub/42/x" -> ["", "f", "sub", "42", "x"]
    segments = parsed.path.split("/")

    # not a valid post/comment url, return the url as is
    if len(segments) < 4 or segments[1] != "f":
        return url

    subreddit, post_id = segments[2], segments[3]
    # "/f/sub/" splits into four segments with an empty post id; building a
    # URL from it would give ".../f/sub//", so treat it as not-a-post.
    if not subreddit or not post_id:
        return url

    return f"{parsed.scheme}://{parsed.netloc}/f/{subreddit}/{post_id}/"
of a student given a question. Keep in mind that the student has performed the action to get the answer. They are allowed to use different phrasing or wording to answer the question. The goal is to evaluate whether the key points in the reference answer are included in the student's answer. We allow answers with additional information that doesn't contradict the reference answer and review them as fully (not partially) correct.\n" + message += f"question: {question}\n" + message += f"reference answer: {reference}\n" + message += "all the string 'N/A' that you see is a special sequence that means 'not achievable'\n" + message += f"student answer: {pred}\n" + message += "Conclude the judgement by correct/incorrect/partially correct and explain why." + messages = [ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": message}, + ] + response = generate_from_openai_chat_completion( + model="gpt-4-1106-preview", + messages=messages, + temperature=0, + max_tokens=768, + top_p=1.0, + context_length=0, + ).lower() + print(response) + if "partially correct" in response or "incorrect" in response: + return 0.0 + else: + assert "correct" in response + return 1.0 + +def llm_ua_match(pred: str, reference: str, question: str) -> float: + """Check whether the prediction matches the reference with GPT-turbo""" + messages: list[dict[str, Any]] = [] + # construct the question to ask + message = "" + message += f"task: {question}\n" + message += f"actual unachievable reason: {reference}\n" + message += f"reported unachievable reason: {pred}\n" + message += ( + "The task described above is inherently unachievable due to the reason specified under 'actual unachievable reason'. " + "An individual previously attempted this task and was unable to complete it. They provided a reason for their failure, " + "which is listed under 'reported unachievable reason'. Your role is to review both the actual and reported reasons. 
" + "Determine if the reported reason aligns with the actual reason, even if implicitly. " + "If the stated reason is in line with the actual reason, respond with 'same'. Otherwise, respond with 'different'." + ) + messages = [ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": message}, + ] + + response = generate_from_openai_chat_completion( + model="gpt-4-1106-preview", + messages=messages, + temperature=0, + max_tokens=768, + top_p=1.0, + context_length=0, + ).lower() + if "different" in response: + return 0.0 + else: + assert "same" in response + return 1.0 + + +class PseudoPage: + def __init__(self, original_page: Page, url: str): + self.url = url + self.original_page = original_page + + def __getattr__(self, attr: str) -> Any: + # Delegate attribute access to the original page object + if attr not in ["url"]: + return getattr(self.original_page, attr) + else: + return getattr(self, attr) diff --git a/files/overview.png b/files/overview.png new file mode 100644 index 0000000..15f09da Binary files /dev/null and b/files/overview.png differ diff --git a/llms/__init__.py b/llms/__init__.py new file mode 100644 index 0000000..aac6b24 --- /dev/null +++ b/llms/__init__.py @@ -0,0 +1,16 @@ +"""This module is adapt from https://github.com/zeno-ml/zeno-build""" +from .providers.hf_utils import generate_from_huggingface_completion +from .providers.openai_utils import ( + generate_from_openai_chat_completion, + generate_from_openai_completion, +) +from .providers.anthropic_utils import generate_from_anthropic_completion +from .utils import call_llm + +__all__ = [ + "generate_from_openai_completion", + "generate_from_openai_chat_completion", + "generate_from_huggingface_completion", + "call_llm", + "generate_from_anthropic_completion" +] diff --git a/llms/lm_config.py b/llms/lm_config.py new file mode 100644 index 0000000..13ec79f --- /dev/null +++ b/llms/lm_config.py @@ -0,0 +1,61 @@ +"""Config for language models.""" + +from 
@dataclass(frozen=True)
class LMConfig:
    """A config for a language model.

    The dataclass is frozen, so its fields cannot be rebound after
    construction. The ``gen_config`` dict itself remains mutable and is
    filled in place by ``construct_llm_config``.

    Attributes:
        provider: The name of the API provider.
        model: The name of the model.
        model_cls: The Python class corresponding to the model, mostly for
            Hugging Face transformers.
        tokenizer_cls: The Python class corresponding to the tokenizer, mostly
            for Hugging Face transformers.
        mode: The mode of the API calls, e.g., "chat" or "generation".
        gen_config: Provider-specific generation settings; starts as an
            empty dict unique to each instance.
        cuda: Defaults to ``'0'``; presumably a CUDA device selector --
            TODO confirm against the code that reads it.
    """

    provider: str
    model: str
    model_cls: type | None = None
    tokenizer_cls: type | None = None
    mode: str | None = None
    # default_factory gives every config its own dict instead of a shared one.
    gen_config: dict[str, Any] = dataclasses.field(default_factory=dict)
    cuda: str = '0'
def generate_from_anthropic_completion(
    prompt: str,
    engine: str,
    max_tokens: int = 1024,
    temperature: float = 0.5,
    region_name: str = "us-east-1",
) -> str:
    """Send a single user message to an Anthropic model through Bedrock.

    Uses the Bedrock Runtime native inference API (``invoke_model``).

    Args:
        prompt: Text of the user message.
        engine: Bedrock model id to invoke, e.g.
            ``"anthropic.claude-3-sonnet-20240229-v1:0"``.
        max_tokens: Upper bound on generated tokens (previously fixed at 1024).
        temperature: Sampling temperature (previously fixed at 0.5).
        region_name: AWS region of the Bedrock Runtime client (previously
            fixed at ``"us-east-1"``).

    Returns:
        The text of the first content item in the model response.

    Raises:
        KeyError: If the model invocation fails for any reason. The type is
            kept for existing callers; the underlying error is chained as
            ``__cause__``.
    """
    # Create a Bedrock Runtime client in the requested AWS Region.
    client = boto3.client("bedrock-runtime", region_name=region_name)

    # Format the request payload using the model's native structure.
    native_request = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": [
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}],
            }
        ],
    }

    # Convert the native request to JSON.
    request = json.dumps(native_request)

    try:
        # The model is selected by ``engine`` alone.
        response = client.invoke_model(modelId=engine, body=request)
    except Exception as e:
        raise KeyError(f"ERROR: Can't invoke '{engine}'. Reason: {e}") from e

    # Decode the response body and extract the response text.
    model_response = json.loads(response["body"].read())
    response_text = model_response["content"][0]["text"]

    return response_text
def retry_with_exponential_backoff(  # type: ignore
    func,
    initial_delay: float = 1,
    exponential_base: float = 2,
    jitter: bool = True,
    max_retries: int = 3,
    errors: tuple[Any] = (openai.RateLimitError,),
):
    """Retry a function with exponential backoff.

    Args:
        func: The callable to wrap.
        initial_delay: Starting delay in seconds. It is multiplied before the
            first sleep, so the first wait is already longer than this value.
        exponential_base: Factor applied to the delay on every retry.
        jitter: When true, the factor is additionally scaled by a random
            value in ``[1, 2)``.
        max_retries: Number of retries allowed before giving up.
        errors: Exception types that trigger a retry. Any other exception
            propagates immediately.

    Returns:
        A wrapper with the same call signature and metadata as ``func``.

    Raises:
        Exception: From the wrapper, once ``max_retries`` is exceeded. The
            last retried error is attached as ``__cause__``.
    """
    # Imported locally so the block does not depend on a new file-level import.
    import functools

    @functools.wraps(func)  # keep __name__/__doc__ of the decorated function
    def wrapper(*args, **kwargs):  # type: ignore
        num_retries = 0
        delay = initial_delay

        # Loop until a successful response or max_retries is hit. Exceptions
        # not listed in ``errors`` are not caught and propagate unchanged.
        while True:
            try:
                return func(*args, **kwargs)
            except errors as e:
                num_retries += 1

                if num_retries > max_retries:
                    raise Exception(
                        f"Maximum number of retries ({max_retries}) exceeded."
                    ) from e

                # bool * float: jitter=False leaves the factor at exactly 1.
                delay *= exponential_base * (1 + jitter * random.random())
                print(f"Retrying in {delay} seconds.")
                time.sleep(delay)

    return wrapper
@retry_with_exponential_backoff
def generate_from_openai_completion(
    prompt: str,
    engine: str,
    temperature: float,
    max_tokens: int,
    top_p: float,
    stop_token: str | None = None,
) -> str:
    """Generate one completion from the OpenAI Completions endpoint.

    Args:
        prompt: Prompt text.
        engine: Model name. The parameter keeps its legacy name for callers,
            but it is sent to the client as ``model``.
        temperature: Sampling temperature.
        max_tokens: Maximum number of tokens to generate.
        top_p: Nucleus sampling parameter.
        stop_token: Optional stop sequence; omitted from the request when
            empty or ``None``.

    Returns:
        The text of the first returned choice.

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is not set in the environment.
    """
    if "OPENAI_API_KEY" not in os.environ:
        raise ValueError(
            "OPENAI_API_KEY environment variable must be set when using OpenAI API."
        )
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organization = os.environ.get("OPENAI_ORGANIZATION", "")
    # Same call shape as generate_from_openai_chat_completion: the client
    # takes ``model`` (not ``engine``), ``stop`` must not be ``[None]``, and
    # the response is an object read by attribute, not a dict.
    response = OpenAI().completions.create(  # type: ignore
        prompt=prompt,
        model=engine,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        stop=[stop_token] if stop_token else None,
    )
    answer: str = response.choices[0].text
    return answer
@retry_with_exponential_backoff
def generate_from_openai_chat_completion(
    messages: list[dict[str, str]],
    model: str,
    temperature: float,
    max_tokens: int,
    top_p: float,
    context_length: int,
    stop_token: str | None = None,
) -> str:
    """Generate one reply from the OpenAI Chat Completions endpoint.

    Args:
        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
        model: Model name.
        temperature: Sampling temperature.
        max_tokens: Maximum number of tokens to generate.
        top_p: Nucleus sampling parameter.
        context_length: Accepted for interface compatibility; not used here.
        stop_token: Optional stop sequence; omitted from the request when
            empty or ``None``.

    Returns:
        The content of the first choice, or ``""`` when the API returns no
        text content for it.

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is not set in the environment.
    """
    if "OPENAI_API_KEY" not in os.environ:
        raise ValueError(
            "OPENAI_API_KEY environment variable must be set when using OpenAI API."
        )
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organization = os.environ.get("OPENAI_ORGANIZATION", "")

    response = OpenAI().chat.completions.create(  # type: ignore
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        stop=[stop_token] if stop_token else None,
    )
    # ``content`` is optional in the SDK response. Callers treat the result
    # as a string (e.g. call ``.lower()`` on it), so never hand back None.
    answer: str = response.choices[0].message.content or ""
    return answer
class Tokenizer(object):
    """Provider-specific tokenizer wrapper.

    ``openai`` uses ``tiktoken.encoding_for_model`` and ``huggingface`` uses
    ``LlamaTokenizer.from_pretrained``. ``anthropic`` has no tokenizer here:
    the instance can be constructed, but encoding or decoding raises.
    """

    def __init__(self, provider: str, model_name: str) -> None:
        """Create the tokenizer for ``provider``.

        Raises:
            NotImplementedError: If ``provider`` is not ``openai``,
                ``huggingface`` or ``anthropic``.
        """
        self.provider = provider
        if provider == "openai":
            self.tokenizer = tiktoken.encoding_for_model(model_name)
        elif provider == "huggingface":
            self.tokenizer = LlamaTokenizer.from_pretrained(model_name)
            # turn off adding special tokens automatically
            self.tokenizer.add_special_tokens = False  # type: ignore[attr-defined]
            self.tokenizer.add_bos_token = False  # type: ignore[attr-defined]
            self.tokenizer.add_eos_token = False  # type: ignore[attr-defined]
        elif provider == "anthropic":
            # Set the attribute explicitly so later use fails with a clear
            # message instead of an AttributeError on a missing attribute.
            self.tokenizer = None
        else:
            raise NotImplementedError

    def _require_tokenizer(self) -> Any:
        """Return the underlying tokenizer or raise if the provider has none."""
        if self.tokenizer is None:
            raise NotImplementedError(
                f"No tokenizer is available for provider {self.provider!r}"
            )
        return self.tokenizer

    def encode(self, text: str) -> list[int]:
        """Encode ``text`` into token ids."""
        return self._require_tokenizer().encode(text)

    def decode(self, ids: list[int]) -> str:
        """Decode token ids back into text."""
        return self._require_tokenizer().decode(ids)

    def __call__(self, text: str) -> list[int]:
        """Alias for ``encode``."""
        return self.encode(text)
model=lm_config.model, + temperature=lm_config.gen_config["temperature"], + top_p=lm_config.gen_config["top_p"], + context_length=lm_config.gen_config["context_length"], + max_tokens=lm_config.gen_config["max_tokens"], + stop_token=None, + ) + elif lm_config.mode == "completion": + assert isinstance(prompt, str) + response = generate_from_openai_completion( + prompt=prompt, + engine=lm_config.model, + temperature=lm_config.gen_config["temperature"], + max_tokens=lm_config.gen_config["max_tokens"], + top_p=lm_config.gen_config["top_p"], + stop_token=lm_config.gen_config["stop_token"], + ) + else: + raise ValueError( + f"OpenAI models do not support mode {lm_config.mode}" + ) + elif lm_config.provider == "huggingface": + assert isinstance(prompt, str) + response = generate_from_huggingface_completion( + prompt=prompt, + model_endpoint=lm_config.gen_config["model_endpoint"], + temperature=lm_config.gen_config["temperature"], + top_p=lm_config.gen_config["top_p"], + stop_sequences=lm_config.gen_config["stop_sequences"], + max_new_tokens=lm_config.gen_config["max_new_tokens"], + ) + else: + raise NotImplementedError( + f"Provider {lm_config.provider} not implemented" + ) + + return response diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f85e701 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +lxml +torch +ctranslate2 +boto3 +accelerate +bitsandbytes +peft +matplotlib +google +google-generativeai +google-ai-generativelanguage \ No newline at end of file diff --git a/scripts/run_config.sh b/scripts/run_config.sh new file mode 100644 index 0000000..44d095e --- /dev/null +++ b/scripts/run_config.sh @@ -0,0 +1,11 @@ +export SHOPPING="http://:7770" +export SHOPPING_ADMIN="http://:7780/admin" +export REDDIT="http://:9999" +export GITLAB="http://:8023" +export MAP="http://:3000" +export WIKIPEDIA="http://:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing" +export HOMEPAGE="http://:4399" +export OPENAI_API_KEY= 
class Agent:
    """Base class for agents that act step by step in an environment.

    Subclasses implement ``predict_action``. This class keeps the action,
    reason and response histories, runs the interaction loop, and optionally
    records one trajectory entry per step.
    """

    def __init__(
        self,
        max_actions,
        verbose=0,
        logging=False,
        previous_actions: List = None,
        previous_reasons: List = None,
        previous_responses: List = None,
    ):
        """Store settings and histories.

        History lists passed in are used as-is (not copied); ``None`` starts
        an empty history.
        """
        self.previous_actions = [] if previous_actions is None else previous_actions
        self.previous_reasons = [] if previous_reasons is None else previous_reasons
        self.previous_responses = [] if previous_responses is None else previous_responses
        self.max_actions = max_actions
        self.verbose = verbose
        self.logging = logging
        self.trajectory = []
        self.data_to_log = {}

    def reset(self):
        """Clear all histories, the trajectory and the pending log entry."""
        self.previous_actions = []
        self.previous_reasons = []
        self.previous_responses = []
        self.trajectory = []
        self.data_to_log = {}

    def get_trajectory(self):
        """Return the list of logged steps."""
        return self.trajectory

    def update_history(self, action, reason):
        """Append ``action`` and ``reason`` to the histories when truthy."""
        if action:
            self.previous_actions += [action]
        if reason:
            self.previous_reasons += [reason]

    def predict_action(self, objective, observation, url=None):
        """Return an ``(action, reason)`` pair; implemented by subclasses."""
        pass

    def receive_response(self, response):
        """Append ``response`` to the response history."""
        self.previous_responses += [response]

    def act(self, objective, env):
        """Run the agent until ``env`` is done or the action budget is used.

        Returns:
            The status returned by the last ``env.step`` call, or ``None``
            if the environment was already done and no step was taken.
        """
        # Bind up front: the loop body may never run.
        status = None
        while not env.done():
            observation = env.observation()
            action, reason = self.predict_action(
                objective=objective, observation=observation, url=env.get_url()
            )
            status = env.step(action)

            if self.logging:
                self.log_step(
                    objective=objective,
                    url=env.get_url(),
                    observation=observation,
                    action=action,
                    reason=reason,
                    status=status,
                )

            if len(self.previous_actions) >= self.max_actions:
                print(f"Agent exceeded max actions: {self.max_actions}")
                break

        return status

    async def async_act(self, objective, env):
        """Asynchronous variant of ``act``.

        ``env.observation()`` and ``env.step()`` are awaited.

        Returns:
            The status returned by the last ``env.step`` call, or ``None``
            if the environment was already done and no step was taken.
        """
        # Bind up front: the loop body may never run.
        status = None
        while not env.done():
            observation = await env.observation()
            action, reason = self.predict_action(
                objective=objective, observation=observation, url=env.get_url()
            )
            status = await env.step(action)

            if self.logging:
                self.log_step(
                    objective=objective,
                    url=env.get_url(),
                    observation=observation,
                    action=action,
                    reason=reason,
                    status=status,
                )

            if len(self.previous_actions) >= self.max_actions:
                print(f"Agent exceeded max actions: {self.max_actions}")
                break

        return status

    def log_step(self, objective, url, observation, action, reason, status):
        """Append one step to the trajectory and start a fresh log entry.

        The entry is built on top of whatever is already in
        ``self.data_to_log``. Histories are logged without their last
        element. Every key of a truthy ``status`` mapping is merged in.
        """
        self.data_to_log['objective'] = objective
        self.data_to_log['url'] = url
        self.data_to_log['observation'] = observation if isinstance(observation, str) else observation["text"]
        self.data_to_log['previous_actions'] = self.previous_actions[:-1]
        self.data_to_log['previous_responses'] = self.previous_responses[:-1]
        self.data_to_log['previous_reasons'] = self.previous_reasons[:-1]
        self.data_to_log['action'] = action
        self.data_to_log['reason'] = reason
        if status:
            for (k, v) in status.items():
                self.data_to_log[k] = v
        self.trajectory.append(self.data_to_log)
        self.data_to_log = {}
class PromptAgent(Agent):
    """Agent that obtains each action from one LLM call on a filled prompt template."""

    def __init__(self, max_actions: int = 10, verbose: bool = False, logging: bool = False,
                 debug: bool = False, prompt_template: str = None, model: str = "gpt-3.5-turbo",
                 prompt_mode: str = "chat", previous_actions: List = None, previous_reasons: List = None, previous_responses: List = None):
        """Forward the shared settings to ``Agent`` and keep the prompt/LLM settings."""
        super().__init__(
            max_actions=max_actions,
            verbose=verbose,
            logging=logging,
            previous_actions=previous_actions,
            previous_reasons=previous_reasons,
            previous_responses=previous_responses,
        )
        self.prompt_template = prompt_template
        self.prompt_mode = prompt_mode
        self.model = model
        self.debug = debug

    def previous_history(self):
        """Render the action history as newline-separated text.

        When there is exactly one response per action, an action with a
        truthy response is rendered as ``"<response> = <action>"``; otherwise
        only the non-``None`` actions are listed.
        """
        actions = self.previous_actions
        responses = self.previous_responses

        if len(actions) != len(responses):
            # Histories are out of step: fall back to the bare actions.
            if actions is None:
                return ""
            return "\n".join(entry for entry in actions if entry is not None)

        rendered = [
            f"{reply} = {entry}" if reply else entry
            for entry, reply in zip(actions, responses)
        ]
        return "\n".join(rendered)

    def predict_action(self, objective, observation, url=None):
        """Query the LLM for the next action and record it in the history.

        Returns:
            The ``(action, reason)`` pair parsed from the model response, or
            the human-typed action with reason ``"None"`` when ``debug`` is
            on and the operator enters anything other than ``"c"``.
        """
        filled_prompt = fill_prompt_template(
            prompt_template=self.prompt_template,
            objective=objective,
            observation=observation,
            url=url,
            previous_history=self.previous_history(),
        )
        messages = construct_llm_message_openai(prompt=filled_prompt, prompt_mode=self.prompt_mode)
        model_response = call_openai_llm(messages=messages, model=self.model)
        action, reason = parse_action_reason(model_response)

        if self.logging:
            self.data_to_log['prompt'] = messages

        show_summary = self.verbose > 0
        if show_summary:
            print(f"\n OBJECTIVE: {objective}")
            print(f"\n URL: {url}")
            print(f"\n PREVIOUS HISTORY: {self.previous_history()}")
            print(f"\n REASON: {reason}")
            print(f"\n ACTION: {action}")
        show_details = self.verbose > 1
        if show_details:
            print(f"\n OBSERVATION: {observation}")
            print(f"\n RESPONSE: {model_response}")

        if self.debug:
            # Interactive override: "c" keeps the model's action, anything
            # else replaces it with the typed text.
            typed = input()
            if typed != "c":
                action, reason = typed, "None"

        self.update_history(action=action, reason=reason)
        return action, reason
def predict_action(self, objective, observation, url=None):
    """Return the next low-level action by driving the stack of sub-agents.

    The agent on top of the stack is asked for an action:

    * a low-level action is returned to the caller right away;
    * a high-level action pushes a new sub-agent for that subroutine;
    * a stop action pops the agent and hands the text inside ``[...]`` to
      the agent below it as the subroutine's response.

    Args:
        objective: Task objective, used to create the root agent when the
            stack is empty.
        observation: Observation text, or a dict carrying it under "text".
        url: Current page URL, forwarded to the sub-agents.

    Returns:
        ``(action, reason)`` of the low-level action to execute, or of the
        last stop action once the stack has been emptied.
    """
    observation = observation["text"] if isinstance(observation, dict) else observation
    if self.stack.is_empty():
        self.stack.push(self.init_root_agent(objective=objective))

    action, reason = None, None
    while not self.stack.is_empty():
        element = self.stack.peek()
        action, reason = element['agent'].predict_action(
            objective=element['objective'], observation=observation, url=url
        )
        done = self.is_done(action)

        if not done and self.is_low_level_action(action):
            # Low-level actions produce no subroutine response.
            element['agent'].receive_response("")
            return action, reason

        if not done and self.is_high_level_action(action):
            self.stack.push(self.init_agent(action))
            if self.logging:
                self.log_step(objective=element['objective'], url=url, observation=observation, action=action, reason=reason, status={})
            continue

        if done:
            self.stack.pop()
            if not self.stack.is_empty():
                # is_done() only checks for the substring "stop", so the
                # action may carry no "[...]" payload; report an empty
                # response instead of crashing on a failed match.
                payload = re.search(r"\[(.*?)\]", action)
                response = payload.group(1) if payload else ""
                self.stack.peek()['agent'].receive_response(response)
            if self.logging:
                self.log_step(objective=element['objective'], url=url, observation=observation, action=action, reason=reason, status={})
            continue

        # NOTE(review): an action that is neither done, low-level nor
        # high-level falls through and the same agent is queried again with
        # the same observation; confirm whether a retry limit is wanted.
    return action, reason
async def execute_action(self, action):
    """
    Execute a given action based on the action type,
    - click [id]: Clicks an element based on the provided id.
    - type [id] [content]: Types the provided content into the element with the specified id.
    - goto [url]: Navigates to an existing tab at that URL
    - open [url]: Opens a new tab with provided URL
    - copy [content]: Copies content, but no-op action
    - stop [response]: Stops execution and optionally provides a response.

    Matching is case-insensitive and anchored at the start of ``action``.
    An action matching none of the patterns is only reported on stdout.

    Raises:
        Exception: if the id of a click/type action is not made of digits.
    """
    click_match = re.match(r"click \[(\S+)\]", action, re.IGNORECASE)
    # NOTE(review): the greedy content group keeps everything up to the last
    # "]", so a trailing "[1]" flag would become part of the typed text;
    # confirm against the prompt format used with this environment.
    type_match = re.match(r"type \[(\S+)\] \[(.+)\]", action, re.IGNORECASE)
    goto_match = re.match(r"goto \[(\S+)\]", action, re.IGNORECASE)
    open_match = re.match(r"open \[(\S+)\]", action, re.IGNORECASE)
    # The copied content may contain spaces; with the former \S+ pattern such
    # an action was reported as undefined instead of being treated as a no-op.
    copy_match = re.match(r"copy \[(.*)\]", action, re.IGNORECASE)
    stop_match = re.match(r"stop \[([^\]]*)\]", action, re.IGNORECASE)

    if click_match:
        element_id = click_match.group(1)
        if not element_id.isdigit():
            raise Exception("Id not a valid integer")
        await self.parser.click(int(element_id))

    elif type_match:
        element_id = type_match.group(1)
        content = type_match.group(2)
        if not element_id.isdigit():
            raise Exception("Id not a valid integer")
        await self.parser.type(int(element_id), content)

    elif goto_match:
        url = goto_match.group(1)
        # The tab lookup result is not used; the call is kept because the
        # parser's lookup may have effects that are not visible here.
        _tab_id, _tab_url = await self.parser.get_tab_from_url(url)
        await self.parser.go_to_page(url)

    elif open_match:
        url = open_match.group(1)
        await self.parser.go_to_page(url)

    elif copy_match:
        pass

    elif stop_match:
        self.response = stop_match.group(1)
        self.is_done = True

    else:
        print(f"[execute_action] Error {action} not defined")
class PlaywrightParserWebArena:
    """Synchronous Playwright page wrapper that produces page observations.

    The constructor starts Playwright, launches Chromium, opens one page and
    creates a text and an image observation processor. All methods are
    plain (non-async) calls on the sync Playwright API.
    """

    def __init__(
        self,
        headless=True,
        observation_type="text",
        text_observation_type="accessibility_tree",
        viewport_size={"width": 1280, "height": 1080},
        current_viewport_only=True,
    ):
        """Launch the browser and prepare the observation processors.

        Args:
            headless: Passed to ``chromium.launch``.
            observation_type: ``"text"`` or ``"image"``; selects the
                processor returned by ``observation_processor``.
            text_observation_type: Observation type given to the text
                processor; accessibility is enabled on the CDP session when
                it is ``"accessibility_tree"`` and ``observation_type`` is
                ``"text"``.
            viewport_size: Mapping with ``width`` and ``height``.
            current_viewport_only: Passed to the text processor.
        """
        self.headless = headless
        # Copy the mapping so the shared default dict is never aliased (and
        # cannot be mutated through an instance).
        self.viewport_size = dict(viewport_size)
        self.current_viewport_only = current_viewport_only
        self.observation_type = observation_type
        self.text_observation_type = text_observation_type

        self.playwright = sync_playwright().start()
        self.browser = self.playwright.chromium.launch(headless=self.headless)
        self.context = self.browser.new_context(
            viewport=self.viewport_size,
            device_scale_factor=1,
        )

        self.page = self.context.new_page()
        client = self.page.context.new_cdp_session(self.page)
        if (self.observation_type == "text") and (
            self.text_observation_type == "accessibility_tree"
        ):
            client.send("Accessibility.enable")
        # The CDP session is stored on the page; parse_page reads it back.
        self.page.client = client

        self.text_processor = TextObervationProcessor(
            observation_type=self.text_observation_type,
            current_viewport_only=self.current_viewport_only,
            viewport_size=self.viewport_size,
        )
        self.image_processor = ImageObservationProcessor(observation_type="image")

    def clear_page_presets(self):
        """Placeholder hook; does nothing.

        The original definition lacked ``self`` and so could not be called
        on an instance.
        """
        pass

    def observation_processor(self):
        """Return the processor matching ``observation_type``.

        Raises:
            ValueError: if ``observation_type`` is neither "text" nor "image".
        """
        if self.observation_type == "text":
            return self.text_processor
        elif self.observation_type == "image":
            return self.image_processor
        else:
            raise ValueError("Invalid observation type")

    def get_url(self):
        """Return the URL of the current page."""
        return self.page.url

    def go_to_page(self, url: str):
        """Navigate to ``url``, prefixing ``http://`` when no scheme is given."""
        self.page.goto(url if "://" in url else "http://" + url)

    def close(self):
        """Close the browser and stop the Playwright driver."""
        self.browser.close()
        # The driver handle created in __init__ is ``self.playwright``; the
        # previously referenced ``playwright_context`` attribute never existed.
        self.playwright.stop()

    def click_xy(self, x: float, y: float) -> None:
        """Click at ``(x, y)`` given as fractions of the viewport size."""
        viewport_size = self.page.viewport_size
        self.page.mouse.click(x * viewport_size["width"], y * viewport_size["height"])

    def click(self, id: int) -> None:
        """Click the centre of the element with the given observation id."""
        element_center = self.observation_processor().get_element_center(id)
        self.click_xy(element_center[0], element_center[1])

    def type(self, id: int, text: str, clear: bool = True):
        """Click the element and type ``text``, clearing it first by default."""
        if clear:
            self.clear(id)
        self.click(id)
        self.page.keyboard.type(text)

    def clear(self, id: int) -> None:
        """Select the element's whole content and delete it."""
        self.click(id)
        # Select-all uses Meta on macOS and Control elsewhere.
        select_key = "Meta" if platform.startswith("darwin") else "Control"
        self.page.keyboard.down(select_key)
        self.page.keyboard.press("a")
        self.page.keyboard.up(select_key)
        self.page.keyboard.press("Backspace")

    def parse_page(self):
        """Return the current observation from the active processor."""
        observation = self.observation_processor().process(
            page=self.page, client=self.page.client
        )

        return observation
+ +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Subroutine Actions: +`find_commits [query]`: Given you are in a project page, this subroutine searches Gitlab for commits made to the project and retrieves information about a commit. This function returns the answer to the query. +`search_issues [query]`: Use this subroutine to find an issue on Gitlab. Any objective that requires finding an issue as an intermediate step, e.g. open latest issue, open issue with and check for X, should call this subroutine +`create_project [query]`: Given you are in the create new project page, this subroutine completes the act of creating a project, adding members etc. +`create_group [query]`: Given you are in the create new group page, this subroutine completes the act of creating a group, adding members etc. + + +Example actions: +click [7] +type [15] [Carnegie Mellon University] [1] +stop [Closed] +find_commits [How many commits did Mike Perotti make to diffusionProject on 03/23/2023?] +search_issues [Open my latest updated issue that has keyword "better" in its title to check if it is closed] +create_project [Create a new public project "awesome-llms" and add primer, convexegg, abishek as members] +create_group [Create a new group "coding_friends" with members qhduan, Agnes-U] + +You will be provided with the following, + OBJECTIVE: + The goal you need to achieve. 
+ OBSERVATION: + A simplified text description of the current browser content, without formatting elements. + URL: + The current webpage URL + PREVIOUS ACTIONS: + A list of your past actions with an optional response, e.g. 1 = find_commits [query] + +You need to generate a response in the following format. Please issue only a single action at a time. + REASON: + Your reason for selecting the action below + ACTION: + Your action + +Please follow these GENERAL INSTRUCTIONS: +* PREVIOUS ACTIONS contains previous actions and subroutine calls with corresponding responses, e.g. 1 = find_commits [query] implies that find_commits subroutine returned a response of 1 commit +* USE the responses from your subroutine. Do NOT try to solve the subroutine objective again by yourself +* DO NOT count commits yourself. Return the response from find_commits in PREVIOUS ACTIONS, e.g. 1 = find_commits [query] implies you should return stop [1] +* If the subroutine returns a response, e.g. Open = search_issues [query], and you have to issue a stop, then issue the same format as that of the response, e.g. stop [Open] +* If the objective is to check if an issue, pull request, etc is open or closed, respond as though you are answering the question, e.g. "No, it is open", "Yes, it is closed" +* To access all public projects, you need to navigate to Explore +* In a repository page, every repository has 4 metrics listed in order Stars, Forks, Merge Requests, and Issues. +* If a project does not exist, then you cannot do any operations on it like cloning it or creating issues etc. +* The command to clone a project is git clone [project URL] +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} + +In your REASON, you MUST specify if any of the subroutine actions or GENERAL INSTRUCTIONS apply and how that affects the action you choose. 
+""", + +"response": "", + +"examples": [ +{ +"input": """ +OBJECTIVE: +How many commits did Mike Perotti make to diffusionProject on 03/23/2023? +OBSERVATION: +[4] RootWebArea 'Projects · Dashboard · GitLab' focused: True + [1664] heading 'The diffusion / diffusionProject.com' + [1522] link 'The diffusion / diffusionProject.com +URL: +https://localhost:8023/ +PREVIOUS ACTIONS: + +""", +"response": """ +REASON: +Let's think step-by-step. We are in the dashboard page. The task is to find how many commits Mike Perotti made to the diffusionProject. We have to first click on the project page. Once we are in the project page, we can use the find_commits subroutine. The id corresponding to the project page is 1664. In summary, the next action I will perform is ```click [1664]``` +ACTION: +click [1664] +"""}, +{ +"input": """ +OBJECTIVE: +How many commits did Mike Perotti make to diffusionProject on 03/23/2023? +OBSERVATION: +[2234] RootWebArea 'The diffusionProject · GitLab' focused: True + [3014] link '0' + [3373] link '2,320 Commits' +URL: +https://localhost:8023/ +PREVIOUS ACTIONS: + +""", +"response": """ +REASON: +Let's think step-by-step. We are in the diffusionProject page. The task is to find how many commits Mike Perotti made to the diffusionProject. Since we are in the project page, we can use the find_commits subroutine. In summary, the next action I will perform is ```find_commits [How many commits did Mike Perotti make to diffusionProject on 03/23/2023?]``` +ACTION: +find_commits [How many commits did Mike Perotti make to diffusionProject on 03/23/2023?] +""" +}, +{ + "input": """ +OBJECTIVE: +How many stars does a11y-webring.club have? +OBSERVATION: + [1258] link 'A' + [1248] heading 'Byte Blaze / a11y-webring.club' + [1831] link 'Byte Blaze / a11y-webring.club' + [1683] generic 'Public - The project can be accessed without any authentication.' 
+ [1673] StaticText 'Owner' + [1241] generic 'globe with meridians' + [1684] StaticText '🌐' + [1771] StaticText ' A webring for digital accessibility practitioners.' + [1726] link '2' + [1463] link '0' + [1325] link '1' + [1784] link '4' +URL: +https://localhost:8023/ +PREVIOUS ACTIONS: + +""", +"response": """ +REASON: +Every repository has 4 metrics listed in order Stars, Forks, Merge Requests, and Issues. Hence, [1726] link '2' suggests 2 stars. +ACTION: +stop [2] +""" +} +] +} + +find_commits = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`scroll [direction=down|up]`: Scroll the page up or down. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +scroll [down] +stop [Mark made 2 commits on 07/08/2023] + +You will be provided with the following, + OBJECTIVE: + The goal you need to achieve. + OBSERVATION: + A simplified text description of the current browser content, without formatting elements. + URL: + The current webpage URL + PREVIOUS ACTIONS: + A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. 
+ REASON: + Your reason for selecting the action below + ACTION: + Your action + +Please follow these general instructions: +* To find a list of all commits, you must navigate to the commits section of the repository +* Look at the first and last date in your observation to know if the desired date is in the range +* If it's in the range but not visible, that means no commits were made on that date +* If the date is outside of the range, you need to scroll up/down to get to the desired date range. Scrolling down takes you to a date earlier in time (e.g. Feb 2023 is earlier in time than Mar 2023) +* To count commits from a specific author, count the number of times their avatar (e.g. img " avatar") appears in the observation. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} + +In your REASON, you MUST specify if any of the general instructions or examples apply and how that affects the action you choose. +""", + +"response": "", + +"examples": [ +{ +"input": """ +OBJECTIVE: +find_commits [How many commits did Mike Perotti make to diffusionProject on 02/02/2023?] 
+OBSERVATION: +[8420] StaticText '02 Feb, 2023' + [8423] StaticText '3 commits' + [8426] img "Mike Perrotti's avatar" + [8428] link 'Coverage improvements (#449)' + [8433] link 'Mike Perrotti' + [8434] StaticText ' authored ' + [8435] time 'Feb 1, 2023 10:43pm EST' + [8437] StaticText '1 year ago' + [10354] button 'Unverified' + [8444] StaticText 'a323cbb6' + [8442] button 'Copy commit SHA' live: polite atomic: False relevant: additions text + [8443] link 'Browse Files' + [8455] img "Mike Perrotti's avatar" + [8457] link 'updates guidance about numeric table values to be more specific (#451) + [8462] link 'Mike Perrotti' + [8463] StaticText ' authored ' + [8464] time 'Feb 1, 2023 10:26pm EST' + [8466] StaticText '1 year ago' + [10356] button 'Unverified' + [8473] StaticText '9bf4ae35' + [8471] button 'Copy commit SHA' live: polite atomic: False relevant: additions text + [8472] link 'Browse Files' + [8484] img "Emily Brick's avatar" + [8486] link 'Correct typos on Contributing Guidelines and Follow (#1518)' + [8491] link 'Emily Brick' + [8492] StaticText ' authored ' + [8493] time 'Feb 1, 2023 7:40pm EST' + [8495] StaticText '1 year ago' +URL: +https://localhost:8023/ +PREVIOUS ACTIONS: +click [3373] +scroll [down] +scroll [down] +""", +"response": """ +REASON: +The objective is to find how many commits Mike Perotti made on Feb 2, 2023. I see that there has been 3 commits on 02 Feb, 2023. +However, I must count number of times img "Mike Perrotti's avatar" appears. +I see [8426] img "Mike Perrotti's avatar", [8455] img "Mike Perrotti's avatar". +Counting this leads to 2 commits made by Mike Perotti. In summary, the next action I will perform is ```stop [2]``` +ACTION: +stop [2] +""" +}, +] +} + +search_issues = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. 
+ +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [issue] [1] +stop [Closed] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these general instructions: +* First navigate the Issues page +* Once you are in the Issues page, you MUST first navigate to all issues so that you see both open and closed issues for solving the objective +* You may not see all issues listed at once, use the search bar to search for appropriate keywords and filter down to relevant set of issues +* If the objective says to "Open ... issue, check if it is X", you must first open the specific issue page by clicking it. Do not stop [] until you have navigated to the specific issue page. +* Once you are on the issue page, return the appropriate status +* In your status, if the objective is to check if an issue is open or clossed, respond as though you are answering a question, e.g. 
"No, it is open", "Yes, it is closed" +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} + +In your REASON, you MUST specify if any of the general instructions or examples apply and how that affects the action you choose. +""", + +"response": "", + +"examples": [ +{ +"input": """ +OBJECTIVE: +search_issues [Open my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed] +OBSERVATION: +Tab 0 (current): Issues \u00b7 Dashboard \u00b7 GitLab\n\n[2234] RootWebArea 'Issues \u00b7 Dashboard \u00b7 GitLab' focused: True\n\t[2241] link 'Skip to content'\n\t[2295] link 'Dashboard'\n\t[4183] button '' hasPopup: menu expanded: False\n\t[3082] textbox 'Search GitLab' required: False\n\t[3025] generic 'Use the shortcut key / to start a search'\n\t[2367] link 'Create new...'\n\t[2369] link 'Issues'\n\t\t[2638] generic '13 assigned issues'\n\t[2370] link 'Merge requests'\n\t\t[2640] generic '8 merge requests'\n\t[2372] link 'To-Do List'\n\t\t[2656] generic 'Todos count'\n\t[2373] link 'Help'\n\t[2375] link 'Byte Blaze'\n\t\t[2690] img 'Byte Blaze'\n\t[2247] main ''\n\t\t[2259] heading 'Issues'\n\t\t[2332] link 'Select project to create issue'\n\t\t[2334] button 'Toggle project select'\n\t\t[2335] link 'Open 13'\n\t\t[2336] link 'Closed 53'\n\t\t[2337] link 'All 66'\n\t\t[2268] link 'Subscribe to RSS feed'\n\t\t[2269] link 'Subscribe to calendar'\n\t\t[2390] button 'Recent searches'\n\t\t[3466] button 'Assignee = Byte Blaze'\n\t\t\t[4251] button ''\n\t\t[3084] textbox '' required: False\n\t\t[3462] button ''\n\t\t[2282] group ''\n\t\t\t[3115] button 'Updated date' hasPopup: listbox\n\t\t\t[2349] link 'Sort direction'\n\t\t[4035] link 'Tm Theme Editor' +URL: +https://localhost:8023/ +PREVIOUS ACTIONS: + +""", +"response": """ +REASON: +Let's think step-by-step. 
Since the objective is to search over all issues, first I need to navigate to the page containing All issues. Looking at the observation, I am not certain if I am seeing all issues. Hence, I must click the link with text link 'All 66' that suggests that it leads to the page with all issues. In summary, the next action I will perform is ```click [2337]``` +ACTION: +click [2337] +"""}, +{ +"input": """ +OBJECTIVE: +search_issues [Open my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed] +OBSERVATION: +Tab 0 (current): Issues \u00b7 Dashboard \u00b7 GitLab\n\n[4371] RootWebArea 'Issues \u00b7 Dashboard \u00b7 GitLab' focused: True\n\t[4622] link 'Skip to content'\n\t[4631] link 'Dashboard'\n\t[6161] button '' hasPopup: menu expanded: False\n\t[4383] textbox 'Search GitLab' required: False\n\t[4660] generic 'Use the shortcut key / to start a search'\n\t[4680] link 'Create new...'\n\t[4700] link 'Issues'\n\t\t[4702] generic '13 assigned issues'\n\t[4707] link 'Merge requests'\n\t\t[4710] generic '8 merge requests'\n\t[4732] link 'To-Do List'\n\t\t[4734] generic 'Todos count'\n\t\t\t[4738] StaticText '5'\n\t[4739] link 'Help'\n\t[4780] link 'Byte Blaze'\n\t\t[4782] img 'Byte Blaze'\n\t[4387] main ''\n\t\t[4841] heading 'Issues'\n\t\t[4843] link 'Select project to create issue'\n\t\t[4844] button 'Toggle project select'\n\t\t[4862] link 'Open 13'\n\t\t[4867] link 'Closed 53'\n\t\t[4872] link 'All 66'\n\t\t[4877] link 'Subscribe to RSS feed'\n\t\t[4878] link 'Subscribe to calendar'\n\t\t[4894] button 'Recent searches'\n\t\t[4400] button 'Assignee = Byte Blaze'\n\t\t\t[4946] button ''\n\t\t[4398] textbox '' required: False\n\t\t[4893] button ''\n\t\t[5199] group ''\n\t\t\t[5202] button 'Updated date' hasPopup: listbox\n\t\t\t[5201] link 'Sort direction'\n\t\t[5299] link 'Tm Theme Editor'\n\t\t[5302] StaticText 'byteblaze/a11y-syntax-highlighting#1 '\n\t\t[5303] StaticText '\u00b7 created '\n\t\t[4423] time 'Apr 2, 2020 12:13am 
EDT'\n\t\t\t[5306] StaticText '3 years ago'\n\t\t[5304] StaticText ' by '\n\t\t[5305] link 'earle'\n\t\t[5312] link 'Assigned to Byte Blaze'\n\t\t[5314] link ' 14'\n\t\t[5320] StaticText 'updated '\n\t\t[4427] time 'Mar 27, 2023 7:15pm EDT'\n\t\t\t[5321] +URL: +https://localhost:8023/ +PREVIOUS ACTIONS: +click [2337] +""", +"response": """ +REASON: +Let's think step-by-step. I see all issues on the current page. Given the objective, I must search for issues that contain the keyword "homepage content". To do this, I will type the keyword into the search input textbox that appears just before the list of issues. In summary, the next action I will perform is ```type [4398] [homepage content] [1]``` +ACTION: +type [4398] [homepage content] [1] +""" +}, +{ +"input": """ +OBJECTIVE: +search_issues [Open my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed] +OBSERVATION: +'\n\t\t[7142] StaticText '\u00b7 created '\n\t\t[6357] time 'Jul 15, 2020 2:48pm EDT'\n\t\t\t[7154] StaticText '3 years ago'\n\t\t[7143] StaticText ' by '\n\t\t[7144] link 'Byte Blaze'\n\t\t[7145] StaticText ' \\xa0 '\n\t\t[7147] link 'bug'\n\t\t[7149] link 'claimed'\n\t\t[7151] link 'redesign'\n\t\t[7153] link 'styling'\n\t\t[7169] StaticText 'CLOSED'\n\t\t[7170] link 'Assigned to Byte Blaze'\n\t\t[7172] link ' 4'\n\t\t[7178] StaticText 'updated '\n\t\t[6361] time 'Mar 23, 2023 4:40am EDT'\n\t\t\t[7179] StaticText '10 months ago'\n\t\t[7182] link \"Rethink the homepage's content\"\n\t\t[7185] StaticText 'a11yproject/a11yproject.com#719 '\n\t\t[7186] StaticText '\u00b7 created '\n\t\t[6367] time 'Dec 30, 2018 2:42pm EST'\n\t\t\t[7197] StaticText '5 years ago'\n\t\t[7187] StaticText ' by '\n\t\t[7188] link 'Byte Blaze'\n\t\t[7190] StaticText '\\xa0 '\n\t\t[7191] link ' Content Updates for 2019'\n\t\t[7192] StaticText ' \\xa0 '\n\t\t[7194] link 'enhancement'\n\t\t[7196] link 'redesign'\n\t\t[7213] StaticText 'CLOSED'\n\t\t[7214] link 'Assigned to Byte 
Blaze'\n\t\t[7216] link ' 5'\n\t\t[7222] StaticText 'updated '\n\t\t[6371] time 'Mar 23, 2023 4:39am EDT'\n\t\t\t[7223] StaticText +URL: +https://localhost:8023/ +PREVIOUS ACTIONS: +click [2337] +type [4398] [homepage content] [1] +""", +"response": """ +REASON: +Let's think step-by-step. I see one issue called "Rethink the homepage's content" that contains the keywords "homepage content". Since it is only one issue, it must be the latest updated issue. According to the general instructions, since the objective asks me to open the issue, I must click the issue. In summary, the next action I will perform is ```click [7182]``` +ACTION: +click [7182] +""" +}, +{ +"input": """ +OBJECTIVE: +search_issues [Open my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed] +OBSERVATION: +button 'Collapse sidebar'\n\t[8080] navigation 'Breadcrumbs'\n\t\t[8095] link 'The A11Y Project'\n\t\t[8101] link 'a11yproject.com'\n\t\t[8108] link 'Issues'\n\t\t[8114] link '#719'\n\t[7383] main ''\n\t\t[8138] StaticText 'Closed'\n\t\t[8149] StaticText 'Issue created '\n\t\t[8150] time 'Dec 30, 2018 7:42pm'\n\t\t[8151] StaticText ' by' +URL: +https://localhost:8023/ +PREVIOUS ACTIONS: +click [2337] +type [4398] [homepage content] [1] +click [7182] +""", +"response": """ +REASON: +The observation indicates that the issue titled \"Rethink the homepage's content (#719)\" is already open in the current tab. There is also a StaticText element with the text 'Closed' (id 8138), which suggests that the status of the issue is visible on the page. Therefore, no further navigation or clicks are required to complete the task. Hence, I must stop and return the desired response. In summary, the next action I will perform is ```stop [Closed]``` +ACTION: +"stop [Closed]" +""" +}, +] +} + +create_project = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. 
+ +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [issue] [1] +stop [N/A] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these general instructions: +1. To add new members, once you have created the project, click on Project Information in the sidebar to be guided to a link with members. +2. When adding members, first type their name, then click on their name from the drop-down. Consult PREVIOUS ACTIONS to see if you have typed and selected the names. 
+""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +{ +"input": """ +OBJECTIVE: +create_project [Create a new public project \"awesome-llms\" and add primer, convexegg, abishek as members] +OBSERVATION: +Tab 0 (current): Byte Blaze / awesome-llms \u00b7 GitLab\n\n[34973] RootWebArea 'Byte Blaze / awesome-llms \u00b7 GitLab' focused: True\n\t[35051] link 'Skip to content'\n\t[35060] link 'Dashboard'\n\t[36630] button '' hasPopup: menu expanded: False\n\t[34985] textbox 'Search GitLab' required: False\n\t[35092] generic 'Use the shortcut key / to start a search'\n\t[35112] link 'Create new...'\n\t[35151] link 'Issues'\n\t\t[35153] generic '13 assigned issues'\n\t[35158] link 'Merge requests'\n\t\t[35161] generic '8 merge requests'\n\t[35183] link 'To-Do List'\n\t\t[35185] generic 'Todos count'\n\t[35190] link 'Help'\n\t[35231] link 'Byte Blaze'\n\t\t[35233] img 'Byte Blaze'\n\t[35287] complementary 'Project navigation'\n\t\t[35292] listitem 'awesome-llms'\n\t\t\t[35313] link 'awesome-llms'\n\t\t\t\t[35317] StaticText 'A'\n\t\t[35319] link 'Project information'\n\t\t[35345] link 'Repository'\n\t\t[35387] link 'Issues'\n\t\t\t[35397] StaticText '0'\n\t\t[35421] link 'Merge requests'\n\t\t[35438] link 'CI/CD'\n\t\t[35468] link 'Security & Compliance'\n\t\t[35486] link 'Deployments'\n\t\t[35512] link 'Packages and registries'\n\t\t[35534] link 'Infrastructure'\n\t\t[35560] link 'Monitor'\n\t\t[35590] link 'Analytics'\n\t\t[35616] link 'Wiki'\n\t\t[35629] link 'Snippets'\n\t\t[35642] link 'Settings'\n\t\t[35290] button 'Collapse sidebar'\n\t[4817] generic 'Public - The project can be accessed without any authentication.'\n\t\t[4823] StaticText 'Project ID: 195 '\n\t\t[4824] button 'Copy project ID' live: polite atomic: False relevant: additions text\n\t\t[5115] button '' hasPopup: menu expanded: False\n\t\t[4833] button 'Star'\n\t\t[4834] link 
'0'\n\t\t[4844] link 'Fork'\n\t\t[4843] link '0'\n\t\t[4857] link '1 Commit'\n\t\t[4865] link '1 Branch'\n\t\t[4873] link '0 Tags'\n\t\t[4881] link '72 KB Project Storage'\n\t\t[6112] link ''\n\t\t[6119] link 'Initial commit'\n\t\t[6122] +URL: +https://localhost:8023/ +PREVIOUS ACTIONS: +click [2782] +type [3910] [awesome-llms] [0] +click [3903] +""", +"response": """ +REASON: +Let's think step-by-step. The task is to create a new public project named "awesome-llms" and add members to it. However, based on the observation, it appears that the project "awesome-llms" has already been created, as indicated by the presence of elements such as 'Project ID: 195' and 'Initial commit'. The next step is to add members to the project. To do this, I need to navigate to the project information which contains a link to add members. The link to 'Project information' is present in the observation with id 35319. In summary, the next action I will perform is ```click [35319]``` +ACTION: +click [35319] +"""}, +] +} + +create_group = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [issue] [1] +stop [N/A] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. 
+OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these general instructions: +1. To add new members, click on the Members tab in the side pane. If you don't see it, click on Group Information in the sidebar to be guided to a link with members. +2. When adding members, first type their name, then click on their name from the drop-down. Consult PREVIOUS ACTIONS to see if you have typed and selected the names. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +reddit_agent = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Subroutine Actions: +`find_subreddit [query]`: Find a subreddit corresponding to the query. The query can either be the name of the subreddit or an informative description of what the subreddit may contain. 
The subroutine hands back control once it navigates to the subreddit by returning "N/A" to denote success. +`find_user [user_name]`: Navigate to the page of a user with user_name. The page contains all the posts made by the user. + +Example actions: +click [7] +type [15] [Carnegie Mellon University] [1] +stop [Closed] +find_subreddit [books] +find_subreddit [something related to driving in Pittsburgh] +find_subreddit [most appropriate subreddit for X] +find_user [AdamCannon] + +You will be provided with the following, + OBJECTIVE: + The goal you need to achieve. + OBSERVATION: + A simplified text description of the current browser content, without formatting elements. + URL: + The current webpage URL + PREVIOUS ACTIONS: + A list of your past actions with an optional response + +You need to generate a response in the following format. Please issue only a single action at a time. + REASON: + Your reason for selecting the action below + ACTION: + Your action + +Please follow these general instructions: +1. If you have to do a task related to a particular user, first find the user using find_user subroutine +2. Otherwise, if you have to post or edit a post in a subreddit, first find the subreddit using the find_subreddit subroutine. Pass in as much information in the argument. While find_subreddit will return the most relevant subreddit to your query, it is okay if it does not exactly match your query. +3. When making a post or a comment to a reply, look at your OBSERVATION or PREVIOUS ACTIONS to make sure you are not repeating the same action. +4. When typing the "Title" of a submission, make sure to match the phrasing in objective exactly. If the objective said Post "what could X", type that in exactly as the title. In your REASON, you MUST specify the formatting guidelines you are following. +5. When creating a Forum, be sure to fill in the title, description and sidebar as specified in the objective exactly. 
+""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +find_subreddit = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [issue] [1] +stop [N/A] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +* The objective find_subreddit [query] asks you to navigate to the subreddit that best matches the query. The query can be specific or vague. +* The first step is to navigate to Forums to see the list of subreddits. However, if you have done this already (indicated as non empty PREVIOUS ACTIONS), do not repeat this step. +* Under forums, you will see only a subset of subreddits. 
To get the full list of subreddits, you need to navigate to the Alphabetical option. +* To know you can see the full list of subreddits, you will see 'All Forums' in the observation +* Often you will not find a focused subreddit that exactly matches your query. In that case, go ahead with the closest relevant subreddit. +* To know that you have reached a subreddit successfully, you will see '/f/subreddit_name' in the observation. +* Once you have navigated to any specific subreddit, return stop [N/A]. Even if the subreddit is generally related and not specific to your query, stop here and do not try to search again for another subreddit. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +find_user = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +URL Navigation Actions: +`goto [url]`: Navigate to a specific URL. +`go_back`: Navigate to the previously viewed page. +`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed). + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. 
+ +Example actions: +click [7] +type [7] [issue] [1] +stop [N/A] +goto [https://localhost:9999/user/] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +1. The objective find_user [user_name] asks you to navigate to the page of a user with user_name +2. To do so, look at the current base URL (e.g. https://localhost:9999) and add a suffix /user/user_name, i.e. +goto [https://localhost:9999/user/user_name] +3. Once you have navigated to the user page (as seen in your past actions), return stop [N/A] +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +shopping_admin_agent = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`scroll [direction=down|up]`: Scroll the page up or down. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. 
+ +Subroutine Actions: +`find_customer_review [query]`: Find customer reviews for a particular product using the query to specify the kind of review. +`find_order [query]`: Find an order corresponding to a particular customer or order number. +`search_customer [query]`: Find a customer given some details about them such as their phone number. + +Example actions: +click [7] +type [15] [Carnegie Mellon University] [1] +stop [Closed] +scroll [down] +find_customer_review [Show me customer reviews for Zoe products] +find_order [Most recent pending order by Sarah Miller] +find_order [Order 305] +search_customer [Search customer with phone number 8015551212] + +You will be provided with the following, + OBJECTIVE: + The goal you need to achieve. + OBSERVATION: + A simplified text description of the current browser content, without formatting elements. + URL: + The current webpage URL + PREVIOUS ACTIONS: + A list of your past actions with an optional response + +You need to generate a response in the following format. Please issue only a single action at a time. + REASON: + Your reason for selecting the action below + ACTION: + Your action + +Please follow these general instructions: +1. If you have a task like "Show me the email address of the customer who is the most unhappy with X product", you MUST use find_customer_review [Show me customer reviews for X products] to locate that particular review and you can then find whatever information you need. Do not try to solve the task without using the subroutine as it contains specific instructions on how to solve it. +2. If you have a task like "Show me the customers who have expressed dissatisfaction with X product", you MUST use find_customer_review [Show me customer reviews for X product]. +3. If you have a task about a particular order, e.g. "Notify X in their most recent pending order with message Y", you MUST use find_order [Most recent pending order for X] to locate the order, and then do operations on that page. 
Do this even if the order is visible in the current page. +4. To write a comment on the order page, you MUST scroll[down] till you find the Comment section. You MUST NOT click on "Comments History" tab, it does not lead you to the right place. Stay on the current page and scroll down to see the comment section. +5. If you have a task about a particular order, e.g. "Cancel order X", you MUST use find_order [Find order X] to locate the order, and then do operations on that page. +6. If you have a task like "Find the customer name and email with phone number X", you MUST use search_customer [Search customer with phone number X] to locate the customer, and then answer the query. Do NOT click on CUSTOMERS side panel. +7. You MUST use Subroutine Actions whenever possible. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} + +In your REASON, you MUST specify if any of the general instructions above apply that would affect the action you choose. +""", + +"response": "", + +"examples": [ +] +} + +find_customer_review = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. 
+ +Example actions: +click [7] +type [7] [Zoe] [1] +stop [N/A] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +1. The objective find_customer_review [query] asks you to navigate to the product page containing customer reviews. +2. To navigate to a review, first click on REPORTS in the side panel +3. Once you have clicked on REPORTS, and you see the Reports panel with Marketing, Sales, Reviews, Customers etc, click on By Products under Customers. +4. Once you are in the Product Reviews Report, you need to locate the product by searching for it. Use the gridcell below Product to search for a product. Do not use other search boxes. Look at the example below where I show you how to search for Zoe in the correct gridcell. +5. When searching for a product, search the first word only like Zoe, or Antonia or Chloe. +6. Once the product shows up, click on 'Show Reviews'. +7. Once all the reviews show up, return stop [N/A] to hand back control to the agent that queried you. 
+""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ + { +"input": """ +OBJECTIVE: +find_product_review [Show me the review of the customer who is the most unhappy with the style of Zoe products] +OBSERVATION: +Tab 0 (current): Product Reviews Report / Reviews / Reports / Magento Admin\n\t\t[1992] table ''\n\t\t\t[2723] row ''\n\t\t\t\t[2724] columnheader 'ID' required: False\n\t\t\t\t[2725] columnheader 'Product' required: False\n\t\t\t\t[2726] columnheader '\u2191 Reviews' required: False\n\t\t\t\t[2727] columnheader 'Average' required: False\n\t\t\t\t[2728] columnheader 'Average (Approved)' required: False\n\t\t\t\t[2729] columnheader 'Last Review' required: False\n\t\t\t\t[2730] columnheader 'Action' required: False\n\t\t\t[1994] row ''\n\t\t\t\t[1995] gridcell '' required: False\n\t\t\t\t\t[1996] textbox '' required: False\n\t\t\t\t[1997] gridcell '' required: False\n\t\t\t\t\t[1998] textbox '' required: False\n\t\t\t\t[1999] gridcell '' required: False\n\t\t\t\t\t[2000] textbox '' required: False\n\t\t\t\t[2001] gridcell '' required: False\n\t\t\t\t\t[2002] textbox '' required: False\n\t\t\t\t[2003] gridcell '' required: False\n\t\t\t\t\t[2004] textbox '' required: False\n\t\t\t\t[2005] gridcell 'undefined \\ue627 undefined \\ue627' required: False\n\t\t\t\t\t[2008] textbox 'From' required: False\n\t\t\t\t\t[3196] button 'undefined \\ue627'\n\t\t\t\t\t[2010] textbox 'To' required: False\n\t\t\t\t\t[3201] button 'undefined \\ue627'\n\t\t\t\t[2747] gridcell '\\xa0' required: False\n\t\t\t[2759] row 'https://localhost:7780/admin/review/product/index/productId/1620/'\n\t\t\t\t[2779] gridcell '1620' required: False\n\t\t\t\t[2780] gridcell 'Erica Evercool Sports Bra' required: False\n\t\t\t\t[2781] gridcell '4' required: False\n\t\t\t\t[2782] gridcell '60.0000' required: False\n\t\t\t\t[2783] gridcell '60.0000' required: False\n\t\t\t\t[2784] 
gridcell 'Apr 19, 2023, 12:15:19 PM' required: False\n\t\t\t\t[2785] gridcell 'Show Reviews' required: False\n\t\t\t\t\t[2792] link 'Show Reviews'\n\t\t\t[2760] row 'https://localhost:7780/admin/review/product/index/productId/39/'\n\t\t\t\t[2794] gridcell '39' required: False\n\t\t\t\t[2795] gridcell +URL: +https://localhost:7780/admin/reports/report_review/product/ +PREVIOUS ACTIONS: +click [1195] +click [1531] +""", +"response": """ +REASON: +Let's think step-by-step. We are in the Product Review Report page. We have to search for a Zoe product. To do so, I have to type in Zoe in a gridcell corresponding to the Product column. The Product column is the second column after ID. Therefore I have to find the second gridcell just below it. The first one is 1995, and the second one is 1997. In summary, the next action I will perform is type [1997] [Zoe] [1] +ACTION: +type [1997] [Zoe] [1] +"""}, +] +} + +find_order = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`scroll [direction=down|up]`: Scroll the page up or down. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [Zoe] [1] +stop [N/A] +scroll [down] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. 
+OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +* The objective find_order [query] asks you to navigate to the order page corresponding to the query +* To navigate to orders, first go to SALES in the side panel +* Once you have clicked on SALES, go to Orders +* Once you are in the orders page, you have to use the 'Filter' button to filter down to desired criteria +* Desired criteria include filtering down to a specific order ID field or Name field. ONLY use fields that are in the objective +* You MUST use Filter to find orders instead of using the search bar +* If there are any active filters, be sure to clear them before entering your filter criteria +* In your filtered list of orders, if you don't find the desired order, make sure to scroll down till you find the order or reach end of page (typically indicated by 'Copyright © ...' in the observation) +* Once you have found the order, go to View to open the order +* Once you are in the desired order page (as noted by "Order & Account Information") you MUST return stop [N/A] to hand back control to the agent that queried you. Do not go back to another page. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +search_customer = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. 
+ +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [Zoe] [1] +stop [N/A] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +1. The objective search_customer [query] asks you to search for customer details corresponding to the query +2. To navigate to customers, first click on CUSTOMERS in the side panel +3. Once you have clicked on CUSTOMERS, click on All Customers. +4. Once you are in the customers page, you have to use the 'Search by keyword' text box to search for your customer. Always be sure to search first. For example, for search_customer [Search customer with phone number 8015551212], search 8015551212. +5. If the page shows a number has already been searched, click on Clear All first. Then proceed with the search. +6. Once you are done with the search, and the customer with matching query shows up, you MUST return stop [N/A] to hand back control to the agent that queried you. 
Do not go back to another page. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +shopping_agent = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`scroll [direction=down|up]`: Scroll the page up or down. +`hover [id]`: Hover over an element with id. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Subroutine Actions: +`search_order [question]`: Search orders to answer a question about my orders +`list_products [query]`: Find products that match a query +`search_reviews [query]`: Search reviews to answer a question about reviews + +Example actions: +click [7] +type [15] [Carnegie Mellon University] [1] +stop [Closed] +scroll [down] +hover [11] +search_order [How much I spend on 4/19/2023 on shopping at One Stop Market?] +list_products [List products from PS4 accessories category by ascending price] +search_reviews [List out reviewers, if exist, who mention about ear cups being small] + +You will be provided with the following, + OBJECTIVE: + The goal you need to achieve. + OBSERVATION: + A simplified text description of the current browser content, without formatting elements. 
+ URL: + The current webpage URL + PREVIOUS ACTIONS: + A list of your past actions with an optional response + +You need to generate a response in the following format. Please issue only a single action at a time. + REASON: + Your reason for selecting the action below + ACTION: + Your action + +Please follow these GENERAL INSTRUCTIONS: +* If the OBJECTIVE is a question about my orders, you MUST use search_order [question] to answer the question e.g. How much did I spend on X, or What is the size of X that I bought, or Change the delivery address for X. +Do not try to solve the task without using search_order as it contains specific instructions on how to solve it. Do not click on MyAccount directly. +* The response from subroutines is stored in PREVIOUS ACTIONS. For example, $0 = search_order [How much I spend on X?] means that the response was $0. In that case, return the answer directly, e.g. stop [$0]. If the response was N/A, reply stop [N/A]. Trust the answer returned by search_order. +* If the OBJECTIVE is a question about listing / showing products, you MUST use list_products. For example, +list_products [List products from X] +list_products [Show me the most expensive product from X] +* If the OBJECTIVE requires you to retrieve details about a particular order you placed liked SKU, you MUST first use search_order [] to retrieve the SKU. +For example, if the OBJECTIVE is "Fill the form for a refund on X .... Also, ensure to include the order number #161 and the product SKU.", you must first issue search_order [Give me the SKU of X from order number #161]. +* If the OBJECTIVE requires order id and amount, you must first issue search_order [Give me the order id and the amount for X] +* If the OBJECTIVE is about reviews for the product, you MUST use search_reviews. For example, search_reviews [List out reviewers ..] or search_reviews [What are the main criticisms of X] +* Return the response from search_reviews VERBATIM. 
Trust that it has solved the OBJECTIVE correctly. +* When filling out a form for refund, you must mention the word refund. Also, you MUST NOT use the word "just" or "which". This is against formatting guidelines. E.g. say "It broke after three days" rather than "which broke after just three days" or "The product broke after three days". +* The Contact Us link is usually at the bottom of a page, scroll down to find it. +* If the OBJECTIVE asks you to "Draft" something, perform all necessary actions except submitting at the end. Do NOT submit as this is a draft. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} + +In your REASON, you MUST specify if any of the subroutine actions or GENERAL INSTRUCTIONS apply and how that affects the action you choose. +""", + +"response": "", + +"examples": [ +] +} + +search_order = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`note [content]`: Use this to make a personal note of some content you would like to remember. This shows up in your history of previous actions so you can refer to it. +`go_back`: Navigate to the previously viewed page. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. 
+ +Example actions: +click [7] +type [7] [Zoe] [1] +note [Spent $10 on 4/1/2024] +stop [N/A] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these GENERAL INSTRUCTIONS: +* Navigate to My Account, then My Orders to access all orders. +* The orders are sorted by descending date. Click on Page Next to go to an earlier date. Click on Page Previous to go to a later date. +If you don't see an order for a date, and the first order on the page is after the date, the last order on the page is before the date, then it means there is no order for the date. +* In your REASON, state what is the current range, what range you are looking for, and whether you should search for an earlier or a later date. +* If you have to find the total amount you spent on orders that span multiple pages, use note [Spent $10 on 4/1/2024] to make a personal note before moving on to the next page. When you are done, you can look at PREVIOUS ACTIONS to find all notes. +* When you are adding numbers, work out each addition step by step in REASON. +* Use go_back to go back to a previous page from an order. But before you do, use note [] to make a note that you checked the page, e.g. note [Checked order on 11/29/2023, no picture frame.] +* If you are in an order page and need to go back, issue go_back. Don't click on My Orders else you have to start from all over again. +* Do not keep visiting the same order page over and over again! +To prevent this, whenever you visit a page, always make a note. 
For example note [Nothing relevant purchased on September 29, 2022] +See note [] to see what dates you have visit, and be sure to not visit that page again. +* Once you are done visiting all the pages, return stop [answer] with the answer to the query. +* If the question is how much did I spend on a date, and I didn't spend anything, return stop [$0] +* If the status of an order shows cancelled, that means I did not spend that money +* If you are asked to change the delivery address on an order, you can't. Reply stop [N/A] +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} + +In your REASON, you MUST specify if any of the GENERAL INSTRUCTIONS apply and how that affects the action you choose. +""", + +"response": "", + +"examples": [ +] +} + +list_products = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`hover [id]`: Hover over an element with id. Use this whenever any element has the field hasPopup: menu +`goto [url]`: Navigate to a specific URL. Use this when needing to sort by price. Refer to instructions below. +`note [content]`: Use this to make a personal note of some content you would like to remember. This shows up in your history of previous actions so you can refer to it. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [Zoe] [1] +stop [N/A] +hover [77] +goto [https://localhost:7770/video-games/playstation-4/accessories.html?product_list_order=price] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +1. To find a product category, you MUST use hover [id] to expand the popup Menu till you find the leaf element that has no popup. Then click [id] on the leaf element. +For example, to find PS 4 accessories you must hover over Video Games, then hover over Playstation 4, then click on Accessories. +Use note [] every time you hover on an item, and don't find the category. This is to ensure you don't keep trying the same category repeatedly. +2. To sort current list of products by price and in ascending order, you MUST use the goto [url] action by appending ?product_list_order=price to the current URL. For example: +If URL is https://localhost:7770/video-games/playstation-4/accessories.html +then issue goto [https://localhost:7770/video-games/playstation-4/accessories.html?product_list_order=price] +3. To sort in descending order, you MUST use the goto [url] action by appending ?product_list_order=price&product_list_dir=desc, e.g. +If URL is https://localhost:7770/video-games/playstation-4/accessories.html +goto [https://localhost:7770/video-games/playstation-4/accessories.html?product_list_order=price&product_list_dir=desc] +4. To list all items less than a particular price, e.g. 
$25, you MUST use the goto [url] action by appending ?price=0-25 +If URL is https://localhost:7770/video-games/playstation-4/accessories.html +goto [https://localhost:7770/video-games/playstation-4/accessories.html?price=0-25] +5. Once you are done in stop [N/A] +6. If the OBJECTIVE asks you to show the most expensive product, you must click on the product. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +search_reviews = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`scroll [direction=down|up]`: Scroll the page up or down. +`note [content]`: Use this to make a personal note of some content you would like to remember. This shows up in your history of previous actions so you can refer to it. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [Zoe] [1] +scroll [down] +note [Reviewer X made comment Y] +stop [N/A] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. 
+URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +* If you are not in the product page, you can search for the product using the search bar. +* To find the list of reviews, search for a link with Reviewers. If you can't find it, scroll down to search for it. +* Iterate over all reviews. For every relevant review, make a note [reviewer_name: review_info]. Record the relevant reviewer_name and review VERBATIM. Once you are done with all the reviews in a page, scroll down to access more reviews. +* Refer to PREVIOUS ACTIONS to know which reviews you have noted already. If you have noted a review already, look for the next review in your current OBSERVATION or scroll down. +* Do NOT repeat the note [] action for the same review. +* Not all reviews will be visible on the reviews page. You MUST scroll down till you reach the end of the page. You will know that you have reached the end of the page if you see “Contact Us” in the OBSERVATION. +* Once you have scrolled through all reviews, combine all your noted reviews that you can find under PREVIOUS ACTIONS. To combine, create a list of dicts where every dict has a name and review key. Be sure to capture ALL the reviews in your note. Return that as stop [{name: reviewer_name_1, review: review_1}, {name: reviewer_name_2, review: review_2}, ..] +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} + +In your REASON, you MUST specify if any of the general instructions apply and how that affects the action you choose. +""", + +"response": "", + +"examples": [ +] +} + +maps_agent = { +"instruction": """You are an AI assistant performing tasks on a web browser. 
To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`scroll [direction=down|up]`: Scroll the page up or down. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Subroutine Actions: +`find_directions [query]`: Find directions between two locations to answer the query +`search_nearest_place [query]`: Find places near a given location + +Example actions: +click [7] +type [15] [Carnegie Mellon University] [1] +scroll [down] +find_directions [Check if the social security administration in pittsburgh can be reached in one hour by car from Carnegie Mellon University] +search_nearest_place [Tell me the closest cafe(s) to CMU Hunt library] + +You will be provided with the following, + OBJECTIVE: + The goal you need to achieve. + OBSERVATION: + A simplified text description of the current browser content, without formatting elements. + URL: + The current webpage URL + PREVIOUS ACTIONS: + A list of your past actions with an optional response + +You need to generate a response in the following format. Please issue only a single action at a time. + REASON: + Your reason for selecting the action below + ACTION: + Your action + +Please follow these general instructions: +1. If the OBJECTIVE is about finding directions from A to B, you MUST use find_directions [] subroutine. +e.g. 
find_directions [Check if the social security administration in pittsburgh can be reached in one hour by car from Carnegie Mellon University] +2. If the OBJECTIVE is about searching nearest place to a location, you MUST use search_nearest_place [] subroutine. +e.g. search_nearest_place [Tell me the closest restaurant(s) to Cohon University Center at Carnegie Mellon University] +3. If the OBJECTIVE is to pull up a description, once that place appears in the sidepane, return stop [N/A] +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +find_directions = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [Zoe] [1] +stop [5h 47min] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. 
+REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +1. First click on "Find directions between two points", then enter From and To Fields, and click search. +2. If you have to find directions to social security administration in Pittsburgh, search for it in a structured format like Social Security Administration, Pittsburgh. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +search_nearest_place = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [Zoe] [1] +stop [De Fer Coffee & Tea, La Prima Espresso, Rothberg's Roasters II, Cafe Phipps, La Prima Espresso, Starbucks] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. 
+REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +1. For searches that refer to CMU, e.g. "find cafes near CMU Hunt Library" +a. You have to first center your map around a location. If you have to find cafes near CMU Hunt Library, the first step is to make sure the map is centered around Carnegie Mellon University. To do that, first search for Carnegie Mellon University and then click [] on a list of location that appears. You MUST click on the Carnegie Mellon University location to center the map. Else the map will not be centered. E.g. click [646] +b. Now that your map is centered around Carnegie Mellon University, directly search for "cafes near Hunt Library". Do not include the word CMU in the search item. +The word CMU cannot be parsed by maps and will result in an invalid search. +c. When your search returns a list of elements, return them in a structured format like stop [A, B, C] +2. For searches that don't refer to CMU +a. No need to center the map. Directly search what is specified in OBJECTIVE, e.g. "bars near Carnegie Music Hall" +b. When your search returns a list of elements, return them in a structured format like stop [A, B, C] +3. Be sure to double check whether the OBJECTIVE has CMU or not and then choose between instruction 1 and 2. +4. Remember that the word CMU cannot be typed in the search bar as it cannot be parsed by maps. +5. Remember that if you want to center your map around Carnegie Mellon University, you have to click on it after you search for it. Check your PREVIOUS ACTIONS to confirm you have done so, e.g. click [646] should be in the previous actions. 
+""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} \ No newline at end of file diff --git a/webagents_step/prompts/webarena/step_fewshot_template_adapted.py b/webagents_step/prompts/webarena/step_fewshot_template_adapted.py new file mode 100644 index 0000000..df40fcb --- /dev/null +++ b/webagents_step/prompts/webarena/step_fewshot_template_adapted.py @@ -0,0 +1,1396 @@ +github_agent = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Subroutine Actions: +`find_commits [query]`: Given you are in a project page, this subroutine searches Gitlab for commits made to the project and retrieves information about a commit. This function returns the answer to the query. +`search_issues [query]`: Given you are in my issue page, this subroutine searches Gitlab to find issue that matches the query. Any objective that says "open my latest issue" or "open issue with in the title" must be passed through this subroutine. +`create_project [query]`: Given you are in the create new project page, this subroutine completes the act of creating a project, adding members etc. 
+`create_group [query]`: Given you are in the create new group page, this subroutine completes the act of creating a group, adding members etc. + + +Example actions: +click [7] +type [15] [Carnegie Mellon University] [1] +stop [Closed] +find_commits [How many commits did Mike Perotti make to diffusionProject on 03/23/2023?] +search_issues [Open my latest updated issue that has keyword "better" in its title to check if it is closed] +create_project [Create a new public project "awesome-llms" and add primer, convexegg, abishek as members] +create_group [Create a new group "coding_friends" with members qhduan, Agnes-U] + +You will be provided with the following, + OBJECTIVE: + The goal you need to achieve. + OBSERVATION: + A simplified text description of the current browser content, without formatting elements. + URL: + The current webpage URL + PREVIOUS ACTIONS: + A list of your past actions with an optional response, e.g. 1 = find_commits [query] + +You need to generate a response in the following format. Please issue only a single action at a time. + REASON: + Your reason for selecting the action below + ACTION: + Your action + +Please follow these general instructions: + +* Use responses from your subroutine calls to answer the objective. The PREVIOUS ACTIONS section lists all previous actions and their corresponding responses. For instance, an entry of 1 = find_commits [query] implies that the find_commits subroutine you invoked returned a response of 1. Use that response to solve the objective. +* If the subroutine returns a response, e.g. Open = search_issues [query], and you have to issue a stop, then issue the same format as that of the response, e.g. stop [Open] +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +{ +"input": """ +OBJECTIVE: +How many commits did Mike Perotti make to diffusionProject on 03/23/2023? 
+OBSERVATION: +RootWebArea 'Projects · Dashboard · GitLab' focused: True + heading 'The diffusion / diffusionProject.com' + link [1522] 'The diffusion / diffusionProject.com +URL: +https://localhost:8023/ +PREVIOUS ACTIONS: + +""", +"response": """ +REASON: +Let's think step-by-step. We are in the dashboard page. The task is to find how many commits Mike Perotti made to the diffusionProject. We have to first click on the project page. Once we are in the project page, we can use the find_commits subroutine. The id corresponding to the project page is 1522. In summary, the next action I will perform is ```click [1522]``` +ACTION: +click [1522] +"""}, +{ +"input": """ +OBJECTIVE: +How many commits did Mike Perotti make to diffusionProject on 03/23/2023? +OBSERVATION: +RootWebArea 'The diffusionProject · GitLab' focused: True + link [3014] '0' + link [3373] '2,320 Commits' +URL: +https://localhost:8023/ +PREVIOUS ACTIONS: + +""", +"response": """ +REASON: +Let's think step-by-step. We are in the diffusionProject page. The task is to find how many commits Mike Perotti made to the diffusionProject. Since we are in the project page, we can use the find_commits subroutine. In summary, the next action I will perform is ```find_commits [How many commits did Mike Perotti make to diffusionProject on 03/23/2023?]``` +ACTION: +find_commits [How many commits did Mike Perotti make to diffusionProject on 03/23/2023?] +""" +}, +] +} + +find_commits = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. 
+ +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +stop [Mark made 2 commits on 07/08/2023] + +You will be provided with the following, + OBJECTIVE: + The goal you need to achieve. + OBSERVATION: + A simplified text description of the current browser content, without formatting elements. + URL: + The current webpage URL + PREVIOUS ACTIONS: + A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. + REASON: + Your reason for selecting the action below + ACTION: + Your action +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +{ +"input": """ +OBJECTIVE: +find_commits [How many commits did Mike Perotti make to diffusionProject on 03/23/2023?] +OBSERVATION: +RootWebArea 'The diffusionProject · GitLab' focused: True + link [3014] '0' + link [3373] '2,320 Commits' +URL: +https://localhost:8023/ +PREVIOUS ACTIONS: + +""", +"response": """ +REASON: +Let's think step-by-step. To find the number of commits made by Mike Perotti on a specific date, I need to navigate to the commits section of the repository. The link with the text '2,320 Commits' suggests that it leads to the commit history. Therefore, I will click on this link to proceed to the commits section. In summary, the next action I will perform is ```click [3373]``` +ACTION: +click [3373] +"""}, +{ +"input": """ +OBJECTIVE: +find_commits [How many commits did Mike Perotti make to diffusionProject on 03/23/2023?] 
+OBSERVATION: +text '23 Mar, 2023' +text '3 commits' +link [11366] "Emily Brick's avatar" +link [11369] 'Coverage improvements (#449)' +button [11371] 'Toggle commit description' +link [11380] 'Emily Brick' +time 'Mar 23, 2023 7:58pm EDT' +link [11440] 'Browse Files' +link [11451] "Mike Perrotti's avatar" +link [11454] 'updates guidance about numeric table values to be more specific (#451)' +link [11459] 'Mike Perrotti' +text ' authored ' +time 'Mar 23, 2023 2:58pm EDT' +button [13266] 'Unverified' +text 'da9575e5' +link [11469] 'Browse Files' +link [11480] "Cole Bemis's avatar" +link [11483] 'Add SSR compatibility to component lifecycle criteria (#440)' +link [11494] 'Cole Bemis' +time 'Mar 22, 2023 2:40pm EDT' +URL: +https://localhost:8023/ +PREVIOUS ACTIONS: +click [3373] +""", +"response": """ +REASON: +Let's think step-by-step. I have already clicked on the link for commits. +From the browser content, I can see this is a list of commits for the diffusionProject. +I see that there has been 3 commits on 03/23/2023. I see that Mike Perrotti made 1 commit on Mar 23, 2023 2:58pm EDT. In summary, the next action I will perform is ```stop [1]``` +ACTION: +stop [1] +""" +}, +] +} + +search_issues = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. 
+ +Example actions: +click [7] +type [7] [issue] [1] +stop [Closed] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} + +Please follow these general instructions: +1. By default you begin with the page containing all open issues. If the objective requires you to search over all issues, e.g. "Open my latest updated issue ... check if closed", make sure that you navigate to the page containing "all issues". +2. If the objective says "Open ... issue to check if it is closed", this means: +a. First open the issue being referred to by clicking on it +b. Then return the status, i.e. stop [open], stop [closed]. +Do not return stop [] until you are sure that you have clicked on the issue. 
+""", + +"response": "", + +"examples": [ +{ +"input": """ +OBJECTIVE: +search_issues [Open my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed] +OBSERVATION: +Tab 0 (current): Issues \u00b7 Dashboard \u00b7 GitLab\n\nRootWebArea 'Issues \u00b7 Dashboard \u00b7 GitLab' focused: True\n\tlink [2241] 'Skip to content'\n\tlink [2295] 'Dashboard'\n\tbutton [4183] '' hasPopup: menu expanded: False\n\ttextbox [3082] 'Search GitLab' required: False\n\tgeneric [3025] 'Use the shortcut key / to start a search'\n\tlink [2367] 'Create new...'\n\tlink [2369] 'Issues'\n\t\tgeneric [2638] '13 assigned issues'\n\tlink [2370] 'Merge requests'\n\t\tgeneric [2640] '8 merge requests'\n\tlink [2372] 'To-Do List'\n\t\tgeneric [2656] 'Todos count'\n\tlink [2373] 'Help'\n\tlink [2375] 'Byte Blaze'\n\t\timg 'Byte Blaze'\n\tmain [2247] ''\n\t\theading 'Issues'\n\t\tlink [2332] 'Select project to create issue'\n\t\tbutton [2334] 'Toggle project select'\n\t\tlink [2335] 'Open 13'\n\t\tlink [2336] 'Closed 53'\n\t\tlink [2337] 'All 66'\n\t\tlink [2268] 'Subscribe to RSS feed'\n\t\tlink [2269] 'Subscribe to calendar'\n\t\tbutton [2390] 'Recent searches'\n\t\tbutton [3466] 'Assignee = Byte Blaze'\n\t\t\tbutton [4251] ''\n\t\ttextbox [3084] '' required: False\n\t\tbutton [3462] ''\n\t\tgroup [2282] ''\n\t\t\tbutton [3115] 'Updated date' hasPopup: listbox\n\t\t\tlink [2349] 'Sort direction'\n\t\tlink [4035] 'Tm Theme Editor' +URL: +https://localhost:8023/ +PREVIOUS ACTIONS: + +""", +"response": """ +REASON: +Let's think step-by-step. Since the objective is to search over all issues, first I need to navigate to the page containing All issues. Looking at the observation, I am not certain if I am seeing all issues. Hence, I must click the link with text link 'All 66' that suggests that it leads to the page with all issues. 
In summary, the next action I will perform is ```click [2337]``` +ACTION: +click [2337] +"""}, +{ +"input": """ +OBJECTIVE: +search_issues [Open my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed] +OBSERVATION: +Tab 0 (current): Issues \u00b7 Dashboard \u00b7 GitLab\n\nRootWebArea 'Issues \u00b7 Dashboard \u00b7 GitLab' focused: True\n\tlink [4622] 'Skip to content'\n\tlink [4631] 'Dashboard'\n\tbutton [6161] '' hasPopup: menu expanded: False\n\ttextbox [4383] 'Search GitLab' required: False\n\tgeneric [4660] 'Use the shortcut key / to start a search'\n\tlink [4680] 'Create new...'\n\tlink [4700] 'Issues'\n\t\tgeneric [4702] '13 assigned issues'\n\tlink [4707] 'Merge requests'\n\t\tgeneric [4710] '8 merge requests'\n\tlink [4732] 'To-Do List'\n\t\tgeneric [4734] 'Todos count'\n\t\t\ttext '5'\n\tlink [4739] 'Help'\n\tlink [4780] 'Byte Blaze'\n\t\timg 'Byte Blaze'\n\tmain [4387] ''\n\t\theading 'Issues'\n\t\tlink [4843] 'Select project to create issue'\n\t\tbutton [4844] 'Toggle project select'\n\t\tlink [4862] 'Open 13'\n\t\tlink [4867] 'Closed 53'\n\t\tlink [4872] 'All 66'\n\t\tlink [4877] 'Subscribe to RSS feed'\n\t\tlink [4878] 'Subscribe to calendar'\n\t\tbutton [4894] 'Recent searches'\n\t\tbutton [4400] 'Assignee = Byte Blaze'\n\t\t\tbutton [4946] ''\n\t\ttextbox [4398] '' required: False\n\t\tbutton [4893] ''\n\t\tgroup [5199] ''\n\t\t\tbutton [5202] 'Updated date' hasPopup: listbox\n\t\t\tlink [5201] 'Sort direction'\n\t\tlink [5299] 'Tm Theme Editor'\n\t\ttext 'byteblaze/a11y-syntax-highlighting#1 '\n\t\ttext '\u00b7 created '\n\t\ttime 'Apr 2, 2020 12:13am EDT'\n\t\t\ttext '3 years ago'\n\t\ttext ' by '\n\t\tlink [5305] 'earle'\n\t\tlink [5312] 'Assigned to Byte Blaze'\n\t\tlink [5314] ' 14'\n\t\ttext 'updated '\n\t\ttime 'Mar 27, 2023 7:15pm EDT'\n\t\t\t[5321] +URL: +https://localhost:8023/ +PREVIOUS ACTIONS: +click [2337] +""", +"response": """ +REASON: +Let's think step-by-step. 
I see all issues on the current page. Given the objective, I must search for issues that contain the keyword "homepage content". To do this, I will type the keyword into the search input textbox that appears just before the list of issues. In summary, the next action I will perform is ```type [4398] [homepage content] [1]``` +ACTION: +type [4398] [homepage content] [1] +""" +}, +{ +"input": """ +OBJECTIVE: +search_issues [Open my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed] +OBSERVATION: +'\n\t\ttext '\u00b7 created '\n\t\ttime 'Jul 15, 2020 2:48pm EDT'\n\t\t\ttext '3 years ago'\n\t\ttext ' by '\n\t\tlink [7144] 'Byte Blaze'\n\t\ttext ' \\xa0 '\n\t\tlink [7147] 'bug'\n\t\tlink [7149] 'claimed'\n\t\tlink [7151] 'redesign'\n\t\tlink [7153] 'styling'\n\t\ttext 'CLOSED'\n\t\tlink [7170] 'Assigned to Byte Blaze'\n\t\tlink [7172] ' 4'\n\t\ttext 'updated '\n\t\ttime 'Mar 23, 2023 4:40am EDT'\n\t\t\ttext '10 months ago'\n\t\tlink [7182] \"Rethink the homepage's content\"\n\t\ttext 'a11yproject/a11yproject.com#719 '\n\t\ttext '\u00b7 created '\n\t\ttime 'Dec 30, 2018 2:42pm EST'\n\t\t\ttext '5 years ago'\n\t\ttext ' by '\n\t\tlink [7188] 'Byte Blaze'\n\t\ttext '\\xa0 '\n\t\tlink [7191] ' Content Updates for 2019'\n\t\ttext ' \\xa0 '\n\t\tlink [7194] 'enhancement'\n\t\tlink [7196] 'redesign'\n\t\ttext 'CLOSED'\n\t\tlink [7214] 'Assigned to Byte Blaze'\n\t\tlink [7216] ' 5'\n\t\ttext 'updated '\n\t\ttime 'Mar 23, 2023 4:39am EDT'\n\t\t\ttext +URL: +https://localhost:8023/ +PREVIOUS ACTIONS: +click [2337] +type [4398] [homepage content] [1] +""", +"response": """ +REASON: +Let's think step-by-step. I see one issue called "Rethink the homepage's content" that contains the keywords "homepage content". Since it is only one issue, it must be the latest updated issue. According to the general instructions, since the objective asks me to open the issue, I must click the issue. 
In summary, the next action I will perform is ```click [7182]``` +ACTION: +click [7182] +""" +}, +{ +"input": """ +OBJECTIVE: +search_issues [Open my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed] +OBSERVATION: +button 'Collapse sidebar'\n\t[8080] navigation 'Breadcrumbs'\n\t\tlink [8095] 'The A11Y Project'\n\t\tlink [8101] 'a11yproject.com'\n\t\tlink [8108] 'Issues'\n\t\tlink [8114] '#719'\n\tmain [7383] ''\n\t\ttext 'Closed'\n\t\ttext 'Issue created '\n\t\ttime 'Dec 30, 2018 7:42pm'\n\t\ttext ' by' +URL: +https://localhost:8023/ +PREVIOUS ACTIONS: +click [2337] +type [4398] [homepage content] [1] +click [7182] +""", +"response": """ +REASON: +The observation indicates that the issue titled \"Rethink the homepage's content (#719)\" is already open in the current tab. There is also a text element with the text 'Closed' (id 8138), which suggests that the status of the issue is visible on the page. Therefore, no further navigation or clicks are required to complete the task. Hence, I must stop and return the desired response. In summary, the next action I will perform is ```stop [Closed]``` +ACTION: +"stop [Closed]" +""" +}, +] +} + +create_project = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. 
+ +Example actions: +click [7] +type [7] [issue] [1] +stop [N/A] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these general instructions: +1. To add new members, once you have created the project, click on Project Information in the sidebar to be guided to a link with members. +2. When adding members, first type their name, then click on their name from the drop-down. Consult PREVIOUS ACTIONS to see if you have typed and selected the names. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +{ +"input": """ +OBJECTIVE: +create_project [Create a new public project \"awesome-llms\" and add primer, convexegg, abishek as members] +OBSERVATION: +Tab 0 (current): Byte Blaze / awesome-llms \u00b7 GitLab\n\nRootWebArea 'Byte Blaze / awesome-llms \u00b7 GitLab' focused: True\n\tlink [35051] 'Skip to content'\n\tlink [35060] 'Dashboard'\n\tbutton [36630] '' hasPopup: menu expanded: False\n\ttextbox [34985] 'Search GitLab' required: False\n\tgeneric [35092] 'Use the shortcut key / to start a search'\n\tlink [35112] 'Create new...'\n\tlink [35151] 'Issues'\n\t\tgeneric [35153] '13 assigned issues'\n\tlink [35158] 'Merge requests'\n\t\tgeneric [35161] '8 merge requests'\n\tlink [35183] 'To-Do List'\n\t\tgeneric [35185] 'Todos count'\n\tlink [35190] 'Help'\n\tlink [35231] 'Byte Blaze'\n\t\timg 'Byte Blaze'\n\tcomplementary [35287] 'Project navigation'\n\t\tlistitem 'awesome-llms'\n\t\t\tlink [35313] 'awesome-llms'\n\t\t\t\ttext 'A'\n\t\tlink 
[35319] 'Project information'\n\t\tlink [35345] 'Repository'\n\t\tlink [35387] 'Issues'\n\t\t\ttext '0'\n\t\tlink [35421] 'Merge requests'\n\t\tlink [35438] 'CI/CD'\n\t\tlink [35468] 'Security & Compliance'\n\t\tlink [35486] 'Deployments'\n\t\tlink [35512] 'Packages and registries'\n\t\tlink [35534] 'Infrastructure'\n\t\tlink [35560] 'Monitor'\n\t\tlink [35590] 'Analytics'\n\t\tlink [35616] 'Wiki'\n\t\tlink [35629] 'Snippets'\n\t\tlink [35642] 'Settings'\n\t\tbutton [35290] 'Collapse sidebar'\n\tgeneric [4817] 'Public - The project can be accessed without any authentication.'\n\t\ttext 'Project ID: 195 '\n\t\tbutton [4824] 'Copy project ID' live: polite atomic: False relevant: additions text\n\t\tbutton [5115] '' hasPopup: menu expanded: False\n\t\tbutton [4833] 'Star'\n\t\tlink [4834] '0'\n\t\tlink [4844] 'Fork'\n\t\tlink [4843] '0'\n\t\tlink [4857] '1 Commit'\n\t\tlink [4865] '1 Branch'\n\t\tlink [4873] '0 Tags'\n\t\tlink [4881] '72 KB Project Storage'\n\t\tlink [6112] ''\n\t\tlink [6119] 'Initial commit'\n\t\t[6122] +URL: +https://localhost:8023/ +PREVIOUS ACTIONS: +click [2782] +type [3910] [awesome-llms] [0] +click [3903] +""", +"response": """ +REASON: +Let's think step-by-step. The task is to create a new public project named "awesome-llms" and add members to it. However, based on the observation, it appears that the project "awesome-llms" has already been created, as indicated by the presence of elements such as 'Project ID: 195' and 'Initial commit'. The next step is to add members to the project. To do this, I need to navigate to the project information which contains a link to add members. The link to 'Project information' is present in the observation with id 35319. In summary, the next action I will perform is ```click [35319]``` +ACTION: +click [35319] +"""}, +] +} + +create_group = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. 
+ +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [issue] [1] +stop [N/A] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these general instructions: +1. To add new members, click on the Members tab in the side pane. If you don't see it, click on Group Information in the sidebar to be guided to a link with members. +2. When adding members, first type their name, then click on their name from the drop-down. Consult PREVIOUS ACTIONS to see if you have typed and selected the names. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +reddit_agent = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. 
+ +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Subroutine Actions: +`find_subreddit [query]`: Find a subreddit corresponding to the query. The query can either be the name of the subreddit or a vague description of what the subreddit may contain. The subroutine hands back control once it navigates to the subreddit. +`find_user [user_name]`: Navigate to the page of a user with user_name. The page contains all the posts made by the user. + +Example actions: +click [7] +type [15] [Carnegie Mellon University] [1] +stop [Closed] +find_subreddit [books] +find_subreddit [something related to driving in Pittsburgh] +find_user [AdamCannon] + +You will be provided with the following, + OBJECTIVE: + The goal you need to achieve. + OBSERVATION: + A simplified text description of the current browser content, without formatting elements. + URL: + The current webpage URL + PREVIOUS ACTIONS: + A list of your past actions with an optional response + +You need to generate a response in the following format. Please issue only a single action at a time. + REASON: + Your reason for selecting the action below + ACTION: + Your action + +Please follow these general instructions: +1. If you have to do a task related to a particular user, first find the user using find_user subroutine +2. 
Otherwise, if you have to post or edit a post in a subreddit, first find the subreddit using the find_subreddit subroutine +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +find_subreddit = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [issue] [1] +stop [N/A] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +1. The objective find_subreddit [query] asks you to navigate to the subreddit that best matches the query. The query can be specific or vague. +2. To navigate to a subreddit, first click on Forums from the top menu. +3. 
Once you are in the Forums page, and you see the Alphabetical option, click on it to see a list of all subreddits alphabetically. +4. Once you are in the page with all the subreddits listed alphabetically, click on the subreddit that matches the query +5. Once you have navigated to the subreddit, return stop [N/A]. You can check that you are in the subreddit by looking at the current observation and seeing "heading '/f/subreddit_name'". You will also see a number of posts. If the subreddit_name vaguely matches the query, it means you are already in the subreddit and should stop, e.g. gaming and games are the same subreddit. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +find_user = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +URL Navigation Actions: +`goto [url]`: Navigate to a specific URL. +`go_back`: Navigate to the previously viewed page. +`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed). + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [issue] [1] +stop [N/A] +goto [http://localhost:9999/user/] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. 
+OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +1. The objective find_user [user_name] asks you to navigate the page of a user with user_name +2. To do so, look at the current base URL (e.g. http://localhost:9999) and add a suffix /user/user_name, i.e. +goto [http://localhost:9999/user/user_name] +3. Once you have navigated to the user page (as seen in your past actions), return stop [N/A] +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +shopping_admin_agent = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Subroutine Actions: +`find_customer_review [query]`: Find customer reviews for a particular product using the query to specify the kind of review. +`find_order [query]`: Find an order corresponding to a particular customer or order number. 
+`search_customer [query]`: Find a customer given some details about them such as their phone number. + +Example actions: +click [7] +type [15] [Carnegie Mellon University] [1] +stop [Closed] +find_customer_review [Show me customer reviews for Zoe products] +find_order [Most recent pending order by Sarah Miller] +find_order [Order 305] +search_customer [Search customer with phone number 8015551212] + +You will be provided with the following, + OBJECTIVE: + The goal you need to achieve. + OBSERVATION: + A simplified text description of the current browser content, without formatting elements. + URL: + The current webpage URL + PREVIOUS ACTIONS: + A list of your past actions with an optional response + +You need to generate a response in the following format. Please issue only a single action at a time. + REASON: + Your reason for selecting the action below + ACTION: + Your action + +Please follow these general instructions: +1. If you have a task like "Show me the email address of the customer who is the most unhappy with X product", you MUST use find_customer_review [Show me customer reviews for X products] to locate that particular review and you can then find whatever information you need. Do not try to solve the task without using the subroutine as it contains specific instructions on how to solve it. +2. If you have a task like "Show me the customers who have expressed dissatisfaction with X product", you MUST use find_customer_review [Show me customer reviews for X product]. +3. If you have a task about a particular order, e.g. "Notify X in their most recent pending order with message Y", you MUST use find_order [Most recent pending order for X] to locate the order, and then do operations on that page. +4. To write a comment on the order page, you MUST NOT click on "Comments History" tab, it does not lead you to the right place. Stay on the current page and check the comment section. +5. If you have a task about a particular order, e.g. 
"Cancel order 305", you MUST use find_order [Find order 305] to locate the order, and then do operations on that page. +6. If you have a task like "Find the customer name and email with phone number X", you MUST use search_customer [Search customer with phone number X] to locate the customer, and then answer the query. Do NOT click on CUSTOMERS side panel. +7. You MUST use Subroutine Actions whenever possible. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +find_customer_review = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [Zoe] [1] +stop [N/A] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +1. 
The objective find_customer_review [query] asks you to navigate to the product page containing customer reviews. +2. To navigate to a review, first click on REPORTS in the side panel +3. Once you have clicked on REPORTS, and you see the Reports panel with Marketing, Sales, Reviews, Customers etc, click on By Products under Customers. +4. Once you are in the Product Reviews Report, you need to locate the product by searching for it. Use the gridcell below Product to search for a product. Do not use other search boxes. Look at the example below where I show you how to search for Zoe in the correct gridcell. +5. When searching for a product, search the first word only like Zoe, or Antonia or Chloe. +6. Once the product shows up, click on 'Show Reviews'. +7. Once all the reviews show up, return stop [N/A] to hand back control to the agent that queried you. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ + { +"input": """ +OBJECTIVE: +find_product_review [Show me the review of the customer who is the most unhappy with the style of Zoe products] +OBSERVATION: +Tab 0 (current): Product Reviews Report / Reviews / Reports / Magento Admin + table '' + row '| ID | Product | Reviews | Average | Average (Approved) | Last Review | Action |' + row '| --- | --- | --- | --- | --- | --- | --- |' + row "| textbox [1996] [required: False] | textbox [1998] [required: False] | textbox [2000] [required: False] | textbox [2002] [required: False] | textbox [2004] [required: False] | undefined undefined\ttextbox [2008] 'From' [required: False]\tbutton [3196] 'undefined'\ttextbox [2010] 'To' [required: False]\tbutton [3201] 'undefined' | |" + row 'https://localhost:7780/admin/review/product/index/productId/1620/' + row "| 1620 | Erica Evercool Sports Bra | 4 | 60.0000 | 60.0000 | Apr 19, 2023, 12:15:17 PM | Show Reviews\tlink [2792] 'Show + row 
'https://localhost:7780/admin/review/product/index/productId/39/' + row "| 39 | +URL: +https://localhost:7780/admin/reports/report_review/product/ +PREVIOUS ACTIONS: +click [1195] +click [1531] +""", +"response": """ +REASON: +Let's think step-by-step. We are in the Product Review Report page. We have to search for a Zoe product. To do so, I have to type in Zoe in a gridcell corresponding to the Product column. The Product column is the second column after ID. Therefore I have to find the second gridcell just below it. The first one is 1995, and the second one is 1997. In summary, the next action I will perform is type [1997] [Zoe] [1] +ACTION: +type [1997] [Zoe] [1] +"""}, +] +} + +find_order = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [Zoe] [1] +stop [N/A] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. 
+REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +1. The objective find_order [query] asks you to navigate to the order page corresponding to the query +2. To navigate to orders, first click on SALES in the side panel +3. Once you have clicked on SALES, click on Orders. +4. Once you are in the orders page, you have to use the 'Search by keyword' text box to search for your order. Always be sure to search first. For example, for find_order [Most recent pending order by Sarah Miller], search Sarah Miller. +5. Click on View to open the right order. +6. Once you are in the order page, as noted by "Order & Account Information", you MUST return stop [N/A] to hand back control to the agent that queried you. Do not go back to another page. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +search_customer = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [Zoe] [1] +stop [N/A] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. 
+OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +1. The objective search_customer [query] asks you to search for customer details corresponding to the query +2. To navigate to customers, first click on CUSTOMERS in the side panel +3. Once you have clicked on CUSTOMERS, click on All Customers. +4. Once you are in the customers page, you have to use the 'Search by keyword' text box to search for your customer. Always be sure to search first. For example, for find_order [Search customer with phone number 8015551212], search 8015551212. +5. If the page shows a number has already been searched, click on Clear All first. Then proceed with the search. +6. Once you are done with the search, and the customer with matching query shows up, you MUST return stop [N/A] to hand back control to the agent that queried you. Do not go back to another page. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +shopping_agent = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`hover [id]`: Hover over an element with id. 
+ +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Subroutine Actions: +`search_order [question]`: Search orders to answer a question about my orders +`find_products [query]`: Find products that match a query +`search_reviews [query]`: Search reviews to answer a question about reviews + +Example actions: +click [7] +type [15] [Carnegie Mellon University] [1] +stop [Closed] +hover [11] +search_order [How much I spend on 4/19/2023 on shopping at One Stop Market?] +list_products [List products from PS4 accessories category by ascending price] +search_reviews [List out reviewers, if exist, who mention about ear cups being small] + +You will be provided with the following, + OBJECTIVE: + The goal you need to achieve. + OBSERVATION: + A simplified text description of the current browser content, without formatting elements. + URL: + The current webpage URL + PREVIOUS ACTIONS: + A list of your past actions with an optional response + +You need to generate a response in the following format. Please issue only a single action at a time. + REASON: + Your reason for selecting the action below + ACTION: + Your action + +Please follow these general instructions: +1. First check thhe OBJECTIVE. If the OBJECTIVE is a question about my orders, you MUST use search_order [question] to answer the question. For example, +a. search_order [How much I spend on ...?] +b. search_order [What is the size of the picture frame I bought Sep 2022?] +c. search_order [Change the delivery address for my most recent order] +Do not click on MyAccount directly! +Do not try to solve the task without using search_order as it contains specific instructions on how to solve it. +2. Once you call the search_order [] subroutine, the response is stored in PREVIOUS ACTIONS. 
For example, +$0 = search_order [How much I spend on 4/19/2023 on shopping at One Stop Market?] +means that the response was $0. In that case, return the answer directly, e.g. stop [$0] +If the response was N/A, reply stop [N/A] +3. If the OBJECTIVE is a question about listing / showing products, you MUST use list_products. For example, +a. list_products [List products from PS4 accessories category by ascending price] +4. If the OBJECTIVE requires you to retrieve details about a particular order you placed liked SKU, you MUST first use search_order [] to retrieve the SKU +a. If the OBJECTIVE is "Fill the "contact us" form in the site for a refund on the bluetooth speaker I bought ... Also, ensure to include the order number #161 and the product SKU." +you must first issue search_order [Give me the SKU of bluetooth speaker from order number #161] +b. If the OBJECTIVE is "Draft a refund message via their "contact us" form for the phone screen protector I bought March 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message." +you must first issue search_order [Give me the order id and the amount for the phone screen protector I bought March 2023.] +5. If the OBJECTIVE is about reviews for the product, you MUST use search_reviews. For example, +a. search_reviews [List out reviewers, if exist, who mention about ear cups being small] +b. search_reviews [What are the main criticisms of this product? Please extract the relevant sentences] +6. In your REASON, you MUST specify if any of the general instructions above apply that would affect the action you choose. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +search_order = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. 
+ +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`note [content]`: Use this to make a personal note of some content you would like to remember. This shows up in your history of previous actions so you can refer to it. +`go_back`: Navigate to the previously viewed page. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [Zoe] [1] +note [Spent $10 on 4/1/2024] +stop [N/A] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +1. Navigate to My Account, then My Orders to access all orders +2. The orders are sorted by descending date. Use Page Next to navigate to orders placed at an earlier date than displayed. Use Page Previous to navigate to orders at a later date than displayed. +click [id] corresponding to Page Previous to go to a later date +click [id] corresponding to Page Next to go to a earlier date +3. 
If you don't see an order for a date, and the first order on the page is after the date, the last order on the page is before the date, then it means there is no order for the date. No point navigating to previous or next pages. +4. If the question is how much did I spend on a date, and I didn't spend anything, return stop [$0] +5. If the status of an order shows cancelled, that means I did not spend that money +6. If you have to find the total amount you spent on orders that span multiple pages, use note [Spent $10 on 4/1/2024] to make a personal note before moving on to the next page. When you are done, you can look at PREVIOUS ACTIONS to find all notes. +7. When you are adding numbers, work out each addition step by step in REASON. +8. Use go_back to go back to a previous page from an order. +But before you do, use note [] to make a note that you checked the page, e.g. +note [Checked order on 11/29/2023, no picture frame.] +9. If you are asked to change the delivery address on an order, you can't. Reply stop [N/A] +10. If you are in an order page and need to go back, issue go_back. +Don't click on My Orders else you have to start from all over again. +11. Do not keep visiting the same order page over and over again! +To prevent this, whenever you visit a page, always make a note. For example note [Nothing relevant purchased on September 29, 2022] +See note [] to see what dates you have visit, and be sure to not visit that page again. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +list_products = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. 
+`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`hover [id]`: Hover over an element with id. Use this whenever any element has the field hasPopup: menu +`goto [url]`: Navigate to a specific URL. Use this when needing to sort by price. Refer to instructions below. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [Zoe] [1] +stop [N/A] +hover [77] +goto [https://localhost:7770/video-games/playstation-4/accessories.html?product_list_order=price] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +1. To find a product category, you MUST use hover [id] to expand the popup Menu till you find the leaf element that has no popup. Then click [id] on the leaf element. +For exmaple, to find PS 4 accessories you must hover over Video Games, then hover over Playstation 4, then click on Accessories. +2. To sort current list of products by price and in ascending order, you MUST use the goto [url] action by appending ?product_list_order=price to the current URL. 
For example: +If URL is https://localhost:7770/video-games/playstation-4/accessories.html +then issue goto [https://localhost:7770/video-games/playstation-4/accessories.html?product_list_order=price] +3. To sort in descending order, you MUST use the goto [url] action by appending ?product_list_order=price&product_list_dir=desc, e.g. +If URL is https://localhost:7770/video-games/playstation-4/accessories.html +goto [https://localhost:7770/video-games/playstation-4/accessories.html?product_list_order=price&product_list_dir=desc] +4. To list all items less than a particular price, e.g. $25, you MUST use the goto [url] action by appending ?price=0-25 +If URL is https://localhost:7770/video-games/playstation-4/accessories.html +goto [https://localhost:7770/video-games/playstation-4/accessories.html?price=0-25] +5. Once you are done in stop [N/A] +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +search_reviews = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`note [content]`: Use this to make a personal note of some content you would like to remember. This shows up in your history of previous actions so you can refer to it. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. 
+ +Example actions: +click [7] +type [7] [Zoe] [1] +note [Reviewer X made comment Y] +stop [N/A] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +1. To find the list of reviews, search for a link with Reviewers. +2. If you have to list multiple reviewers, use `note [Reviewer X made comment Y; Reviewer A made comment B; ...]` to make a personal note. When you are done, you can look at PREVIOUS ACTIONS to find all notes. In stop [], make sure you answer the question in the OBJECTIVE. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +maps_agent = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. 
+ +Subroutine Actions: +`find_directions [query]`: Find directions between two locations to answer the query +`search_nearest_place [query]`: Find places near a given location + +Example actions: +click [7] +type [15] [Carnegie Mellon University] [1] +find_directions [Check if the social security administration in pittsburgh can be reached in one hour by car from Carnegie Mellon University] +search_nearest_place [Tell me the closest cafe(s) to CMU Hunt library] + +You will be provided with the following, + OBJECTIVE: + The goal you need to achieve. + OBSERVATION: + A simplified text description of the current browser content, without formatting elements. + URL: + The current webpage URL + PREVIOUS ACTIONS: + A list of your past actions with an optional response + +You need to generate a response in the following format. Please issue only a single action at a time. + REASON: + Your reason for selecting the action below + ACTION: + Your action + +Please follow these general instructions: +1. If the OBJECTIVE is about finding directions from A to B, you MUST use find_directions [] subroutine. +e.g. find_directions [Check if the social security administration in pittsburgh can be reached in one hour by car from Carnegie Mellon University] +2. If the OBJECTIVE is about searching nearest place to a location, you MUST use search_nearest_place [] subroutine. +e.g. search_nearest_place [Tell me the closest restaurant(s) to Cohon University Center at Carnegie Mellon University] +3. If the OBJECTIVE is to pull up a description, once that place appears in the sidepane, return stop [N/A] +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +find_directions = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. 
+ +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [Zoe] [1] +stop [5h 47min] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +1. First click on "Find directions between two points", then enter From and To Fields, and click search. +2. If you have to find directions to social security administration in Pittsburgh, search for it in a structured format like Social Security Administration, Pittsburgh. +""", + +"input": """ +OBJECTIVE: +{objective} +OBSERVATION: +{observation} +URL: +{url} +PREVIOUS ACTIONS: +{previous_actions} +""", + +"response": "", + +"examples": [ +] +} + +search_nearest_place = { +"instruction": """You are an AI assistant performing tasks on a web browser. To solve these tasks, you will issue specific actions. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. 
+`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Example actions: +click [7] +type [7] [Zoe] [1] +stop [De Fer Coffee & Tea, La Prima Espresso, Rothberg's Roasters II, Cafe Phipps, La Prima Espresso, Starbucks] + +You will be provided with the following, +OBJECTIVE: +The goal you need to achieve. +OBSERVATION: +A simplified text description of the current browser content, without formatting elements. +URL: +The current webpage URL +PREVIOUS ACTIONS: +A list of your past actions + +You need to generate a response in the following format. Please issue only a single action at a time. +REASON: +Your reason for selecting the action below +ACTION: +Your action + +Please follow these instructions to solve the subtask: +1. For searches that refer to CMU, e.g. "find cafes near CMU Hunt Library" +a. You have to first center your map around a location. If you have to find cafes near CMU Hunt Library, the first step is to make sure the map is centered around Carnegie Mellon University. To do that, first search for Carnegie Mellon University and then click [] on a list of location that appears. You MUST click on the Carnegie Mellon University location to center the map. Else the map will not centered. E.g click [646] +b. Now that your map is centered around Carnegie Mellon University, directly search for "cafes near Hunt Library". Do not include the word CMU in the search item. +The word CMU cannot be parsed by maps and will result in an invalid search. +c. When your search returns a list of elements, return them in a structured format like stop [A, B, C] +2. 
class yamlConfig:
    """Expose the entries of a (possibly nested) dictionary as attributes.

    Every key of ``dictionary`` becomes an attribute of the instance. Values
    that are themselves ``dict`` objects are wrapped in ``yamlConfig`` so that
    dotted access works at every nesting level; all other values are stored
    unchanged.
    """

    def __init__(self, dictionary):
        for name, entry in dictionary.items():
            wrapped = yamlConfig(entry) if isinstance(entry, dict) else entry
            setattr(self, name, wrapped)


class DotDict:
    """access dictionary attributes with dot notation"""

    def __init__(self, dictionary):
        for name, entry in dictionary.items():
            # Only plain dicts are wrapped; lists (even lists of dicts) are kept as-is.
            wrapped = DotDict(entry) if isinstance(entry, dict) else entry
            setattr(self, name, wrapped)

    def to_dict(self):
        """Return a plain ``dict`` copy, unwrapping nested ``DotDict`` values recursively."""
        return {
            name: entry.to_dict() if isinstance(entry, DotDict) else entry
            for name, entry in self.__dict__.items()
        }

    def has_key(self, key):
        """Check if the DotDict has a specific key"""
        # hasattr also reports methods such as ``to_dict`` as present.
        return hasattr(self, key)
def examples_to_prompts(data_config, json_df, prompt_template, inference):
    """Build prompt strings for every trajectory log referenced by ``json_df``.

    For each row, the JSON file at ``data_config.basedir / row.logfile`` is
    loaded and passed to ``convert_example_to_prompts``; the resulting prompt
    strings are concatenated in row order.

    Args:
        data_config: Object providing ``basedir`` (and whatever
            ``convert_example_to_prompts`` reads from it).
        json_df: Object whose ``iterrows()`` yields ``(index, row)`` pairs with
            a ``logfile`` attribute on each row.
        prompt_template: Prompt template forwarded unchanged.
        inference: Forwarded unchanged to ``convert_example_to_prompts``.

    Returns:
        A flat list of prompt strings.
    """
    collected_prompts = []
    for _, record in json_df.iterrows():
        try:
            # read samples
            log_path = os.path.join(data_config.basedir, str(record.logfile))
            with open(log_path, 'r') as log_handle:
                trajectory_log = json.load(log_handle)
            collected_prompts.extend(
                convert_example_to_prompts(trajectory_log, data_config, prompt_template, inference)
            )
        except FileNotFoundError:
            # A missing log is reported and skipped; the remaining rows are still processed.
            print(f"File {data_config.basedir}/{record.logfile} not found. Skipping ...")
    return collected_prompts


def convert_example_to_prompts(example_json, data_config, prompt_template, inference):
    """Turn one logged trajectory into a list of filled prompt strings.

    One prompt is produced per entry of ``example_json["trajectory"]``. The
    template's ``"input"`` text is formatted with the step's objective,
    observation, URL and a newline-joined action history; the ``"response"``
    is built from the step's reason and action.

    Args:
        example_json: Mapping with a ``"trajectory"`` list of step mappings
            (keys read: ``objective``, ``observation``, ``url``, ``action``,
            ``reason`` and optionally ``previous_actions``).
        data_config: Object providing ``action_lookback``.
        prompt_template: Mapping with at least an ``"input"`` format string; it
            is deep-copied per step and never modified.
        inference: Forwarded to ``create_prompt``.

    Returns:
        A list with one prompt string per trajectory step.
    """
    prompts = []
    running_actions = []
    for step in example_json["trajectory"]:
        # Fill in prompt components
        components = copy.deepcopy(prompt_template)
        input_template = components["input"]
        fields = {
            "objective": step["objective"],
            "observation": step["observation"],
            "url": step["url"],
        }
        # Prefer the history recorded in the step; otherwise use the actions
        # accumulated from earlier steps of this trajectory.
        history = step["previous_actions"] if "previous_actions" in step else running_actions
        # Note: a lookback of 0 slices as [-0:], which is the full history.
        recent = history[-data_config.action_lookback:]
        fields["previous_actions"] = "\n".join(
            [str(entry) for entry in recent if entry is not None]
        )
        components["input"] = input_template.format(**fields)
        running_actions.append(step["action"])

        components["response"] = convert_actions_and_reasons_to_response(step["reason"], step["action"])
        prompts.append(create_prompt(components, inference=inference))
    return prompts
def create_prompt(prompt_components, inference=False):
    """Render prompt components into a single ``[INST] ... [/INST]`` string.

    Args:
        prompt_components: Mapping with ``"instruction"`` and ``"input"``; the
            ``"response"`` entry is read only when ``inference`` is false.
        inference: When true, the string ends right after ``[/INST]`` so the
            response is left for the model to generate.

    Returns:
        The formatted prompt string.
    """
    # NOTE(review): Llama-2 style prompts usually delimit the system text with
    # explicit SYS tags; confirm that the bare `<>` markers are intended.
    prompt_template = """
[INST] <>
{instruction}
<>

{input} [/INST]"""
    # If inference mode, do not fill in the response
    if inference:
        return prompt_template.format(instruction=prompt_components["instruction"],
                                      input=prompt_components["input"])
    prompt_template += " {response}\n"
    return prompt_template.format(instruction=prompt_components["instruction"],
                                  input=prompt_components["input"],
                                  response=prompt_components["response"])


def log_run(log_file, log_data, summary_file=None, summary_data=None, json_indent=4, verbose=1):
    """
    Logs demo data to a JSON file and optionally updates a summary CSV file.

    Args:
        log_file: Path of the JSON file to (over)write with ``log_data``.
        log_data: JSON-serialisable object to dump.
        summary_file: Optional CSV path; created if it does not exist.
        summary_data: Optional mapping appended as one row to the summary.
        json_indent: Indentation passed to ``json.dump``.
        verbose: When truthy, progress messages are printed.

    Returns:
        The updated summary ``DataFrame`` when both ``summary_file`` and
        ``summary_data`` are given, otherwise ``None``.
    """
    # Write log data to JSON file
    with open(log_file, 'w') as json_file:
        json.dump(log_data, json_file, indent=json_indent)
    if verbose:
        print(f"Saved log to {log_file}")

    # Defined up front so the final return is valid when no summary is requested
    # (previously this path raised UnboundLocalError).
    df_summary = None

    # If summary data and file path are provided, update the summary
    if summary_data and summary_file:
        if os.path.exists(summary_file):
            df_summary = pd.read_csv(summary_file)
        else:
            df_summary = pd.DataFrame()
        df_summary = pd.concat([df_summary, pd.DataFrame([summary_data])], ignore_index=True)
        df_summary.to_csv(summary_file, index=False)
        if verbose:
            print(f"Updated summary: {df_summary}")

    return df_summary
"gpt-4-1106-preview": 0.01 / 1000, + "gpt-4": 0.03 / 1000, + "gpt-4-32k": 0.06 / 1000, + "gpt-3.5-turbo": 0.001 / 1000, + "gpt-3.5-turbo-instruct": 0.0015 / 1000, + "gpt-3.5-turbo-16k": 0.003 / 1000, + "babbage-002": 0.0016 / 1000, + "davinci-002": 0.012 / 1000, + "ada-v2": 0.0001 / 1000, +} + +output_token_cost_usd_by_model = { + "gpt-4-1106-preview": 0.03 / 1000, + "gpt-4": 0.06 / 1000, + "gpt-4-32k": 0.12 / 1000, + "gpt-3.5-turbo": 0.002 / 1000, + "gpt-3.5-turbo-instruct": 0.002 / 1000, + "gpt-3.5-turbo-16k": 0.004 / 1000, + "babbage-002": 0.0016 / 1000, + "davinci-002": 0.012 / 1000, + "ada-v2": 0.0001 / 1000, +} + +def fill_prompt_template(prompt_template, objective, observation, url, previous_history): + prompt = copy.deepcopy(prompt_template) + prompt["input"] = prompt["input"].replace("{objective}", objective) + if isinstance(observation, dict): + prompt["input"] = prompt["input"].replace("{observation}", observation["text"]) + else: + prompt["input"] = prompt["input"].replace("{observation}", observation) + prompt["input"] = prompt["input"].replace("{url}", url) + prompt["input"] = prompt["input"].replace("{previous_actions}", previous_history) + return prompt + +def filter_quotes_if_matches_template(action): + if action is None: + return None + + # Regex pattern to match the entire 'type [X] ["Y"]' template, allowing for Y to be digits as well + pattern = r'^type \[\d+\] \["([^"\[\]]+)"\]$' + # Check if the action matches the specific template + match = re.match(pattern, action) + if match: + # Extract the matched part that needs to be unquoted + y_part = match.group(1) + # Reconstruct the action string without quotes around Y + filtered_action = f'type [{match.group(0).split("[")[1].split("]")[0]}] [{y_part}]' + return filtered_action.strip() # filtered_action.split("\n")[0].strip() + else: + # Return the original action if it doesn't match the template + return action.strip() # action.split("\n")[0].strip() + +def parse_action_reason(model_response): + 
reason_match = re.search(r'REASON:\s*(.*?)\s*(?=\n[A-Z]|$)', model_response, re.DOTALL) + reason = reason_match.group(1) if reason_match else None + + # action_match = re.search(r'ACTION:\s*(.*?)\s*(?=\n[A-Z]|$)', model_response, re.DOTALL) + action_match = re.search(r'(?:ACTION|ACTIONS):\s*(.*?)\s*(?=\n[A-Z]|$)', model_response, re.DOTALL) + action = action_match.group(1) if action_match else None + + action = filter_quotes_if_matches_template(action) + + return action, reason + +def construct_llm_message_hf(prompt, prompt_mode, model_type="llama2"): + if model_type == "llama2": + instruction = "[INST] " + prompt["instruction"] + else: + instruction = prompt["instruction"] + + messages = [{"role": "system", "content": instruction}] + + if prompt["examples"]: + messages.append({"role": "system", "content": "Here are a few examples:"}) + for example in prompt["examples"]: + messages.append({"role": "system", "content": f"\n### Input:\n{example['input']}\n\n### Response:\n{example['response']}"}) + + if model_type == "llama2": + query = f"\nHere is the current Input. Please respond with REASON and ACTION.\n### Input:\n{prompt['input']}\n[/INST]\n" + else: + query = f"\nHere is the current Input. 
def _read_instruction_files(directory, names):
    """Return the contents of ``<directory>/<name>.txt`` for each name, joined by newlines."""
    import os

    sections = []
    for name in names:
        # Context manager so every instruction file is closed after reading.
        with open(os.path.join(directory, f"{name}.txt"), "r") as handle:
            sections.append(handle.read())
    return "\n".join(sections)


def construct_llm_message_anthropic(prompt, plan_list=None, action_list=None):
    """Build the system message and user messages for an Anthropic-style call.

    Args:
        prompt: Mapping with ``"instruction"``, ``"examples"`` and ``"input"``.
            Optional keys: ``"example_format"`` (template using ``{i}``,
            ``{example_input}``, ``{example_response}``), ``"input_format"``
            (template using ``{input}`` and, for images,
            ``{visual_observation}``) and ``"{visual_observation}"`` (the image
            payload). When both ``plan_list`` and ``action_list`` are non-empty,
            ``prompt["instruction"]`` is updated in place.
        plan_list: Optional names of files under ``prompts/plan_instructions``.
        action_list: Optional names of files under ``prompts/action_instructions``.

    Returns:
        A ``(system_message, messages)`` tuple.

    Raises:
        NotImplementedError: If an image is supplied without ``"input_format"``;
            that combination has no message layout defined.
    """
    if plan_list and action_list:
        import os
        from global_utils import CURRENT_DIR

        prompts_dir = os.path.join(CURRENT_DIR, "prompts")
        plan_instructions = _read_instruction_files(
            os.path.join(prompts_dir, "plan_instructions"), plan_list
        )
        action_instructions = _read_instruction_files(
            os.path.join(prompts_dir, "action_instructions"), action_list
        )
        prompt["instruction"] = prompt["instruction"].replace("{plan_instructions}", plan_instructions)
        prompt["instruction"] = prompt["instruction"].replace("{action_instructions}", action_instructions)

    system_message = prompt["instruction"]

    if prompt["examples"]:
        system_message += "\n\n## Here are a few examples:"
        for i, example in enumerate(prompt["examples"]):
            example_input = example["input"]
            example_response = example["response"]
            if "example_format" in prompt:
                system_message += "\n\n"
                # str(i): str.replace only accepts strings, the bare index raised TypeError.
                system_message += (
                    prompt["example_format"]
                    .replace("{i}", str(i))
                    .replace("{example_input}", example_input)
                    .replace("{example_response}", example_response)
                )
            else:
                system_message += f"\n\n| Example {i}\n\n### Input:\n{example_input}\n\n### Response: Let's think step by step.\n{example_response}"

    has_visual_observation = "{visual_observation}" in prompt
    if "input_format" in prompt:
        if has_visual_observation:
            from claude import arrange_message_for_claude
            text = prompt["input_format"].replace("{input}", prompt['input'])
            # The image is placed where the placeholder sits in the formatted text.
            text_prior, text_subsequent = text.split("{visual_observation}")
            messages = arrange_message_for_claude([("text", text_prior), ("image", prompt["{visual_observation}"]), ("text", text_subsequent)])
        else:
            messages = [{"role": "user", "content": [{"type": "text", "text": prompt["input_format"].replace("{input}", prompt['input'])}]}]
    elif has_visual_observation:
        # Previously a bare `pass`, which surfaced as UnboundLocalError at the return.
        raise NotImplementedError(
            "A visual observation requires prompt['input_format'] containing "
            "the '{visual_observation}' placeholder."
        )
    else:
        messages = [{"role": "user", "content": [{"type": "text", "text": f"## Here is the current Input. Please respond with REASON and ACTION.\n### Input:\n{prompt['input']}\n\n### Response:"}]}]

    return system_message, messages
Please respond with REASON and ACTION.\n### Input:\n{prompt['input']}\n\n### Response:"}) + if prompt_mode == "chat": + return messages + elif prompt_mode == "completion": + all_content = ''.join(message['content'] for message in messages) + messages_completion = [{"role": "user", "content": all_content}] + return messages_completion + import os + from global_utils import CURRENT_DIR + + assert len(plan_list) > 0 and len(action_list) > 0 + plan_instructions = "\n".join(["".join(open(os.path.join(CURRENT_DIR, "prompts", "plan_instructions", f"{p}.txt"), "r").readlines()) for p in plan_list]) + action_instructions = "\n".join(["".join(open(os.path.join(CURRENT_DIR, "prompts", "action_instructions", f"{a}.txt"), "r").readlines()) for a in action_list]) + prompt["instruction"] = prompt["instruction"].replace("{plan_instructions}", plan_instructions) + prompt["instruction"] = prompt["instruction"].replace("{action_instructions}", action_instructions) + + messages = [{"role": "system", "content": prompt["instruction"]}] + + if prompt["examples"]: + messages.append({"role": "system", "content": "## Here are a few examples:"}) + for i, example in enumerate(prompt["examples"]): + example_input = example["input"] + example_response = example["response"] + messages.append({"role": "system", "content": f"| Example {i}\n\n### Input:\n{example_input}\n\n### Response: Let's think step by step.\n{example_response}"}) + + if "input_format" in prompt.keys(): + messages.append({"role": "user", "content": prompt["input_format"].replace("{input}", prompt['input'])}) + else: + messages.append({"role": "user", "content": f"## Here is the current Input. 
def call_anthropic_llm(system_message, messages, model="anthropic.claude-3-haiku-20240307-v1:0", **model_kwargs):
    """Send one request to an Anthropic model through Amazon Bedrock.

    Args:
        system_message: Text placed in the request's ``system`` field.
        messages: Anthropic-style message list; the first content part of
            every message must carry a ``"text"`` entry because it is written
            to the debug log.
        model: Bedrock model id.
        **model_kwargs: Accepted for signature parity with the other callers;
            not read here (``max_tokens`` and ``temperature`` are fixed).

    Returns:
        The text of the first content part of the model's reply.

    Raises:
        KeyError: If the Bedrock invocation fails; the original exception is
            attached as ``__cause__``.
    """
    # Use the native inference API to send a text message to Anthropic Claude.
    import boto3
    import json

    # Create a Bedrock Runtime client in the AWS Region of your choice.
    client = boto3.client("bedrock-runtime", region_name="us-east-1")

    # Debug dump of the outgoing prompt. A single managed handle replaces the
    # two anonymous open() calls that were never closed.
    with open("trash.txt", "a") as debug_log:
        print(system_message, file=debug_log)
        print("\n".join(item["content"][0]["text"] for item in messages), end="\n"+"#"*100+"\n", file=debug_log)

    native_request = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 512,
        "temperature": 0.5,
        "system": system_message,
        "messages": messages,
    }

    # Convert the native request to JSON.
    request = json.dumps(native_request)

    try:
        # Invoke the model with the request.
        response = client.invoke_model(modelId=model, body=request)
    except Exception as e:
        # KeyError is kept for callers that already catch it; chaining keeps
        # the underlying failure visible in the traceback.
        raise KeyError(f"ERROR: Can't invoke '{model}'. Reason: {e}") from e

    # Decode the response body.
    model_response = json.loads(response["body"].read())

    # Extract the response text.
    response_text = model_response["content"][0]["text"]
    return response_text
def get_num_tokens(text: str, model_name: str) -> int:
    """Return the number of tokens ``text`` encodes to for ``model_name``.

    ``tiktoken.encoding_for_model`` raises ``KeyError`` for model names it has
    no mapping for. In that case the count falls back to the ``cl100k_base``
    encoding, so the result is an estimate rather than an exception.
    """
    try:
        tokenizer = tiktoken.encoding_for_model(model_name=model_name)
    except KeyError:
        # Unknown model name: approximate with a general-purpose encoding.
        tokenizer = tiktoken.get_encoding("cl100k_base")
    return len(tokenizer.encode_ordinary(text))
def load_tokenizer(mpath, context_size):
    """Load the tokenizer stored at ``mpath``.

    ``context_size`` is kept in the signature for existing callers but is not
    used: the tokenizer is returned with its default padding and truncation
    settings.
    """
    tokenizer = AutoTokenizer.from_pretrained(mpath, return_token_type_ids=False)
    return tokenizer


def load_model(mpath, dtype, device="cuda", context_len=4096, is_seq2seq=False, ct2_mpath=None):
    """Load a model in the requested precision / backend.

    Args:
        mpath: Model path or identifier passed to ``from_pretrained``.
        dtype: One of ``"bf16"``, ``"4bit"``, ``"4bit-optimized"``, ``"8bit"``
            or ``"ct2"``.
        device: Device handed to ``ctranslate2.Generator`` (``"ct2"`` only).
        context_len: Value passed as ``max_position_embeddings``.
        is_seq2seq: Use ``AutoModelForSeq2SeqLM`` instead of
            ``AutoModelForCausalLM``.
        ct2_mpath: Path of the converted model; required for ``"ct2"``.

    Returns:
        The loaded model (or a ``ctranslate2.Generator`` for ``"ct2"``).

    Raises:
        ValueError: If ``dtype`` is not supported, or ``dtype == "ct2"`` and
            ``ct2_mpath`` is missing.
    """
    supported_dtypes = ("bf16", "4bit", "4bit-optimized", "8bit", "ct2")
    if dtype not in supported_dtypes:
        # Previously an unknown dtype fell through every branch and failed
        # with UnboundLocalError at ``return model``.
        raise ValueError(
            f"Unsupported dtype {dtype!r}; expected one of {', '.join(supported_dtypes)}"
        )

    if dtype == "ct2":
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if ct2_mpath is None:
            raise ValueError("ct2_mpath is required when dtype is 'ct2'")
        return ctranslate2.Generator(ct2_mpath, device=device)

    model_loader = AutoModelForSeq2SeqLM if is_seq2seq else AutoModelForCausalLM

    if dtype == "bf16":
        model = model_loader.from_pretrained(
            mpath,
            max_position_embeddings=context_len,
            low_cpu_mem_usage=True,
            torch_dtype=torch.bfloat16,
            device_map="balanced_low_0",
        )
    elif dtype == "4bit":
        model = model_loader.from_pretrained(
            mpath,
            max_position_embeddings=context_len,
            low_cpu_mem_usage=True,
            load_in_4bit=True,
            device_map="auto",
        )
    elif dtype == "4bit-optimized":
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        model = model_loader.from_pretrained(
            mpath,
            use_cache=True,
            device_map="auto",
            quantization_config=bnb_config,
            max_position_embeddings=context_len,
        )
    else:  # dtype == "8bit"
        model = model_loader.from_pretrained(
            mpath,
            max_position_embeddings=context_len,
            low_cpu_mem_usage=True,
            load_in_8bit=True,
            device_map="auto",
        )

    return model
def generate_prediction(
    inputs,
    model,
    tokenizer,
    **kwargs,
):
    """Generate a continuation for ``inputs`` and return it as text.

    The prompt is tokenized as a batch of one (no special tokens added), moved
    to ``model.device`` and passed to ``model.generate`` together with any
    extra ``kwargs``. The prompt tokens are sliced off the output before
    decoding, so only the newly generated text is returned.
    """
    encoded = tokenizer(
        [inputs],
        return_tensors='pt',
        truncation=True,
        add_special_tokens=False,
    ).to(model.device)

    generated = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        **kwargs,
    )

    # Drop the echoed prompt: keep only the columns after the prompt length.
    prompt_length = encoded.input_ids.shape[1]
    continuation = generated[:, prompt_length:]
    return tokenizer.decode(continuation[0], skip_special_tokens=True)


class Stack:
    """Minimal last-in, first-out container backed by a Python list."""

    def __init__(self):
        self.items = []

    def is_empty(self):
        """Return True when the stack holds no items."""
        return len(self.items) == 0

    def push(self, item):
        """Place ``item`` on top of the stack."""
        self.items.append(item)

    def pop(self):
        """Remove and return the top item; IndexError if the stack is empty."""
        if not self.is_empty():
            return self.items.pop()
        raise IndexError("pop from empty stack")

    def peek(self):
        """Return the top item without removing it; IndexError if empty."""
        if not self.is_empty():
            return self.items[-1]
        raise IndexError("peek from empty stack")

    def size(self):
        """Return the number of stored items."""
        return len(self.items)
+) + +__all__ = ["Agent", "TeacherForcingAgent", "PromptAgent", "construct_agent"] diff --git a/webarena/agent/agent.py b/webarena/agent/agent.py new file mode 100644 index 0000000..490d08e --- /dev/null +++ b/webarena/agent/agent.py @@ -0,0 +1,227 @@ +import argparse +import json +from typing import Any + +import tiktoken +from beartype import beartype + +from agent.prompts import * +from browser_env import Trajectory +from browser_env.actions import ( + Action, + ActionParsingError, + create_id_based_action, + create_none_action, + create_playwright_action, +) +from browser_env.utils import Observation, StateInfo +from llms import ( + call_llm, + generate_from_huggingface_completion, + generate_from_openai_chat_completion, + generate_from_openai_completion, + lm_config, +) +from llms.tokenizers import Tokenizer + + +class Agent: + """Base class for the agent""" + + def __init__(self, *args: Any) -> None: + pass + + def next_action( + self, trajectory: Trajectory, intent: str, meta_data: Any + ) -> Action: + """Predict the next action given the observation""" + raise NotImplementedError + + def check_action( + self, trajectory: Trajectory, intent: str, meta_data: dict[str, Any], target_action: str + ) -> Action: + """Predict the next action given the observation""" + raise NotImplementedError + + def reset( + self, + test_config_file: str, + ) -> None: + raise NotImplementedError + + +class TeacherForcingAgent(Agent): + """Agent that follows a pre-defined action sequence""" + + def __init__(self) -> None: + super().__init__() + + def set_action_set_tag(self, tag: str) -> None: + self.action_set_tag = tag + + def set_actions(self, action_seq: str | list[str]) -> None: + if isinstance(action_seq, str): + action_strs = action_seq.strip().split("\n") + else: + action_strs = action_seq + action_strs = [a.strip() for a in action_strs] + + actions = [] + for a_str in action_strs: + try: + if self.action_set_tag == "playwright": + cur_action = 
class PromptAgent(Agent):
    """prompt-based agent that emits action given the history"""

    # Action-set tags whose actions are built with ``create_id_based_action``.
    _ID_BASED_ACTION_SET_TAGS = (
        "id_html_tree",
        "id_html_nasc_tree",
        "id_accessibility_tree",
    )

    @beartype
    def __init__(
        self,
        action_set_tag: str,
        lm_config: lm_config.LMConfig,
        prompt_constructor: PromptConstructor,
    ) -> None:
        super().__init__()
        self.lm_config = lm_config
        self.prompt_constructor = prompt_constructor
        self.action_set_tag = action_set_tag

    def set_action_set_tag(self, tag: str) -> None:
        self.action_set_tag = tag

    def _with_force_prefix(self, response: str) -> str:
        """Prepend the prompt's optional ``force_prefix`` to ``response``."""
        force_prefix = self.prompt_constructor.instruction[
            "meta_data"
        ].get("force_prefix", "")
        return f"{force_prefix}{response}"

    def _create_action(self, response: str) -> Action:
        """Extract the action string from ``response`` and build the action.

        Raises:
            ActionParsingError: Propagated from extraction or action creation.
            ValueError: If ``self.action_set_tag`` is not a known tag.
        """
        parsed_response = self.prompt_constructor.extract_action(response)
        if self.action_set_tag in self._ID_BASED_ACTION_SET_TAGS:
            return create_id_based_action(parsed_response)
        if self.action_set_tag == "playwright":
            return create_playwright_action(parsed_response)
        raise ValueError(f"Unknown action type {self.action_set_tag}")

    @beartype
    def next_action(
        self, trajectory: Trajectory, intent: str, meta_data: dict[str, Any]
    ) -> Action:
        """Query the LLM for the next action.

        The model is re-queried while its reply cannot be parsed, up to
        ``gen_config["max_retry"]`` attempts; after that a none-action is
        returned. The raw (prefixed) reply is stored under
        ``"raw_prediction"`` in either case.
        """
        prompt = self.prompt_constructor.construct(
            trajectory, intent, meta_data
        )
        lm_config = self.lm_config
        attempts = 0
        while True:
            response = self._with_force_prefix(call_llm(lm_config, prompt))
            attempts += 1
            try:
                action = self._create_action(response)
            except ActionParsingError:
                if attempts < lm_config.gen_config["max_retry"]:
                    continue
                action = create_none_action()
            action["raw_prediction"] = response
            return action

    def check_action(
        self, trajectory: Trajectory, intent: str, meta_data: dict[str, Any], target_action: str
    ) -> tuple[Any, Action]:
        """Parse a given ``target_action`` instead of querying the LLM.

        Returns:
            ``(prompt, action)``: the prompt that would have been sent, and
            the parsed action (a none-action when parsing fails). The action
            carries the prefixed ``target_action`` under ``"raw_prediction"``.
        """
        prompt = self.prompt_constructor.construct(
            trajectory, intent, meta_data
        )
        response = self._with_force_prefix(target_action)
        # The input is fixed, so re-parsing it cannot change the outcome;
        # a single attempt yields what the former retry loop converged to.
        try:
            action = self._create_action(response)
        except ActionParsingError:
            action = create_none_action()
        action["raw_prediction"] = response
        return prompt, action

    def reset(self, test_config_file: str) -> None:
        pass
prompt_constructor = eval(constructor_type)( + args.instruction_path, lm_config=llm_config, tokenizer=tokenizer + ) + agent = PromptAgent( + action_set_tag=args.action_set_tag, + lm_config=llm_config, + prompt_constructor=prompt_constructor, + ) + else: + raise NotImplementedError( + f"agent type {args.agent_type} not implemented" + ) + return agent diff --git a/webarena/agent/prompts/__init__.py b/webarena/agent/prompts/__init__.py new file mode 100644 index 0000000..3f3caba --- /dev/null +++ b/webarena/agent/prompts/__init__.py @@ -0,0 +1 @@ +from .prompt_constructor import * diff --git a/webarena/agent/prompts/jsons/new_action_prompt.json b/webarena/agent/prompts/jsons/new_action_prompt.json new file mode 100644 index 0000000..d477d0c --- /dev/null +++ b/webarena/agent/prompts/jsons/new_action_prompt.json @@ -0,0 +1,21 @@ +{ + "intro": "", + "examples": [], + "template": " {html} \n\nYou are a helpful assistant that can assist with web navigation tasks.\nYou are given a simplified html webpage and a task description.\nYour goal is to complete the task. 
You can use the provided functions below to interact with the current webpage.\n\n#Provided functions:\ndef click(element_id: str) -> None:\n \"\"\"\n Click on the element with the specified id.\n\n Args:\n element_id: The id of the element.\n \"\"\"\n\ndef hover(element_id: str) -> None:\n \"\"\"\n Hover on the element with the specified id.\n\n Args:\n element_id: The id of the element.\n \"\"\"\n\ndef select(element_id: str, option: str) -> None:\n \"\"\"\n Select an option from a dropdown.\n\n Args:\n element_id: The id of the element.\n option: Value of the option to select.\n \"\"\"\n\ndef type_string(element_id: str, content: str, press_enter: bool) -> None:\n \"\"\"\n Type a string into the element with the specified id.\n\n Args:\n element_id: The id of the element.\n content: The string to type.\n press_enter: Whether to press enter after typing the string.\n \"\"\"\n\ndef scroll_page(direction: Literal['up', 'down']) -> None:\n \"\"\"\n Scroll down/up one page.\n\n Args:\n direction: The direction to scroll.\n \"\"\"\n\ndef go(direction: Literal['forward', 'backward']) -> None:\n \"\"\"\n Go forward/backward\n\n Args:\n direction: The direction to go to.\n \"\"\"\n\ndef jump_to(url: str, new_tab: bool) -> None:\n \"\"\"\n Jump to the specified url.\n\n Args:\n url: The url to jump to.\n new_tab: Whether to open the url in a new tab.\n \"\"\"\n\ndef switch_tab(tab_index: int) -> None:\n \"\"\"\n Switch to the specified tab.\n\n Args:\n tab_index: The index of the tab to switch to.\n \"\"\"\n\ndef user_input(message: str) -> str:\n \"\"\"\n Wait for user input.\n\n Args:\n message: The message to display to the user.\n\n Returns: The user input.\n \"\"\"\n\ndef finish(answer: Optional[str]) -> None:\n \"\"\"\n Finish the task (optionally with an answer).\n\n Args:\n answer: The answer to the task.\n \"\"\"\n\n#Previous commands: {previous_action}\n\n#Window tabs: {tabs}\n\n#Current viewport (pages): {position}\n\n#Task: {objective}\n\nYou should output one 
command to interact to the currrent webpage.\nYou should add a brief comment to your command to explain your reasoning and thinking process.\n", + "finale": "", + "meta_data": { + "observation": "html", + "action_type": "id_html_nasc_tree", + "keywords": [ + "url", + "html", + "objective", + "position", + "previous_action", + "tabs" + ], + "prompt_constructor": "NewASPromptConstructor", + "answer_phrase": "", + "action_splitter": "#" + } +} \ No newline at end of file diff --git a/webarena/agent/prompts/jsons/p_cot_id_actree_2s.json b/webarena/agent/prompts/jsons/p_cot_id_actree_2s.json new file mode 100644 index 0000000..9d2eae4 --- /dev/null +++ b/webarena/agent/prompts/jsons/p_cot_id_actree_2s.json @@ -0,0 +1,27 @@ +{ + "intro": "You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe current web page's URL: This is the page you're currently navigating.\nThe open tabs: These are the tabs you have open.\nThe previous action: This is the action you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. 
By default, the \"Enter\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [direction=down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as \"N/A\" in the bracket.\n\nHomepage:\nIf you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit.\nhttp://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. Generate the action in the correct format. Start with a \"In summary, the next action I will perform is\" phrase, followed by action inside ``````. For example, \"In summary, the next action I will perform is ```click [1234]```\".\n5. Issue stop action when you think you have achieved the objective. 
Don't generate anything after stop.", + "examples": [ + [ + "OBSERVATION:\n[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)'\n\t\t[1749] StaticText '$279.49'\n\t\t[1757] button 'Add to Cart'\n\t\t[1760] button 'Add to Wish List'\n\t\t[1761] button 'Add to Compare'\nURL: http://onestopmarket.com/office-products/office-electronics.html\nOBJECTIVE: What is the price of HP Inkjet Fax Machine\nPREVIOUS ACTION: None", + "Let's think step-by-step. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I think I have achieved the objective. I will issue the stop action with the answer. In summary, the next action I will perform is ```stop [$279.49]```" + ], + [ + "OBSERVATION:\n[164] textbox 'Search' focused: True required: False\n[171] button 'Go'\n[174] link 'Find directions between two points'\n[212] heading 'Search Results'\n[216] button 'Close'\nURL: http://openstreetmap.org\nOBJECTIVE: Show me the restaurants near CMU\nPREVIOUS ACTION: None", + "Let's think step-by-step. This page has a search box whose ID is [164]. According to the nominatim rule of openstreetmap, I can search for the restaurants near a location by \"restaurants near\". I can submit my typing by pressing the Enter afterwards. 
In summary, the next action I will perform is ```type [164] [restaurants near CMU] [1]```" + ] + ], + "template": "OBSERVATION:\n{observation}\nURL: {url}\nOBJECTIVE: {objective}\nPREVIOUS ACTION: {previous_action}", + "meta_data": { + "observation": "accessibility_tree", + "action_type": "id_accessibility_tree", + "keywords": [ + "url", + "objective", + "observation", + "previous_action" + ], + "prompt_constructor": "CoTPromptConstructor", + "answer_phrase": "In summary, the next action I will perform is", + "action_splitter": "```" + } +} \ No newline at end of file diff --git a/webarena/agent/prompts/jsons/p_cot_id_actree_2s_no_na.json b/webarena/agent/prompts/jsons/p_cot_id_actree_2s_no_na.json new file mode 100644 index 0000000..6b0f23f --- /dev/null +++ b/webarena/agent/prompts/jsons/p_cot_id_actree_2s_no_na.json @@ -0,0 +1,27 @@ +{ + "intro": "You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe current web page's URL: This is the page you're currently navigating.\nThe open tabs: These are the tabs you have open.\nThe previous action: This is the action you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. 
By default, the \"Enter\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [direction=down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket.\n\nHomepage:\nIf you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit.\nhttp://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. Generate the action in the correct format. Start with a \"In summary, the next action I will perform is\" phrase, followed by action inside ``````. For example, \"In summary, the next action I will perform is ```click [1234]```\".\n5. Issue stop action when you think you have achieved the objective. 
Don't generate anything after stop.", + "examples": [ + [ + "OBSERVATION:\n[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)'\n\t\t[1749] StaticText '$279.49'\n\t\t[1757] button 'Add to Cart'\n\t\t[1760] button 'Add to Wish List'\n\t\t[1761] button 'Add to Compare'\nURL: http://onestopmarket.com/office-products/office-electronics.html\nOBJECTIVE: What is the price of HP Inkjet Fax Machine\nPREVIOUS ACTION: None", + "Let's think step-by-step. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I think I have achieved the objective. I will issue the stop action with the answer. In summary, the next action I will perform is ```stop [$279.49]```" + ], + [ + "OBSERVATION:\n[164] textbox 'Search' focused: True required: False\n[171] button 'Go'\n[174] link 'Find directions between two points'\n[212] heading 'Search Results'\n[216] button 'Close'\nURL: http://openstreetmap.org\nOBJECTIVE: Show me the restaurants near CMU\nPREVIOUS ACTION: None", + "Let's think step-by-step. This page has a search box whose ID is [164]. According to the nominatim rule of openstreetmap, I can search for the restaurants near a location by \"restaurants near\". I can submit my typing by pressing the Enter afterwards. 
In summary, the next action I will perform is ```type [164] [restaurants near CMU] [1]```" + ] + ], + "template": "OBSERVATION:\n{observation}\nURL: {url}\nOBJECTIVE: {objective}\nPREVIOUS ACTION: {previous_action}", + "meta_data": { + "observation": "accessibility_tree", + "action_type": "id_accessibility_tree", + "keywords": [ + "url", + "objective", + "observation", + "previous_action" + ], + "prompt_constructor": "CoTPromptConstructor", + "answer_phrase": "In summary, the next action I will perform is", + "action_splitter": "```" + } +} \ No newline at end of file diff --git a/webarena/agent/prompts/jsons/p_direct_id_actree_2s.json b/webarena/agent/prompts/jsons/p_direct_id_actree_2s.json new file mode 100644 index 0000000..d336a03 --- /dev/null +++ b/webarena/agent/prompts/jsons/p_direct_id_actree_2s.json @@ -0,0 +1,26 @@ +{ + "intro": "You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe current web page's URL: This is the page you're currently navigating.\nThe open tabs: These are the tabs you have open.\nThe previous action: This is the action you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. 
By default, the \"Enter\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [direction=down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as \"N/A\" in the bracket.\n\nHomepage:\nIf you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit.\nhttp://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation\n2. You should only issue one action at a time.\n3. Generate the action in the correct format. Always put the action inside a pair of ```. For example, ```click [1234]```.\n5. Issue stop action when you think you have achieved the objective. 
Don't generate anything after stop.", + "examples": [ + [ + "OBSERVATION:\n[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)'\n\t\t[1749] StaticText '$279.49'\n\t\t[1757] button 'Add to Cart'\n\t\t[1760] button 'Add to Wish List'\n\t\t[1761] button 'Add to Compare'\nURL: http://onestopmarket.com/office-products/office-electronics.html\nOBJECTIVE: What is the price of HP Inkjet Fax Machine\nPREVIOUS ACTION: None", + "```stop [$279.49]```" + ], + [ + "OBSERVATION:\n[164] textbox 'Search' focused: True required: False\n[171] button 'Go'\n[174] link 'Find directions between two points'\n[212] heading 'Search Results'\n[216] button 'Close'\nURL: http://openstreetmap.org\nOBJECTIVE: Show me the restaurants near CMU\nPREVIOUS ACTION: None", + "```type [164] [restaurants near CMU] [1]```" + ] + ], + "template": "OBSERVATION:\n{observation}\nURL: {url}\nOBJECTIVE: {objective}\nPREVIOUS ACTION: {previous_action}", + "meta_data": { + "observation": "accessibility_tree", + "action_type": "id_accessibility_tree", + "keywords": [ + "url", + "objective", + "observation", + "previous_action" + ], + "prompt_constructor": "DirectPromptConstructor", + "action_splitter": "```" + } +} \ No newline at end of file diff --git a/webarena/agent/prompts/jsons/p_direct_id_actree_2s_no_na.json b/webarena/agent/prompts/jsons/p_direct_id_actree_2s_no_na.json new file mode 100644 index 0000000..ac3306f --- /dev/null +++ b/webarena/agent/prompts/jsons/p_direct_id_actree_2s_no_na.json @@ -0,0 +1,27 @@ +{ + "intro": "You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. 
These tasks will be accomplished through the use of specific actions you can issue.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe current web page's URL: This is the page you're currently navigating.\nThe open tabs: These are the tabs you have open.\nThe previous action: This is the action you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"Enter\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [direction=down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket.\n\nHomepage:\nIf you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit.\nhttp://homepage.com/password.html lists all the account name and password for the websites. 
You can use them to log in to the websites.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation\n2. You should only issue one action at a time.\n4. Generate the action in the correct format, wrap the action inside ``````. For example, ```click [1234]```\".\n5. Issue stop action when you think you have achieved the objective.", + "examples": [ + [ + "OBSERVATION:\n[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)'\n\t\t[1749] StaticText '$279.49'\n\t\t[1757] button 'Add to Cart'\n\t\t[1760] button 'Add to Wish List'\n\t\t[1761] button 'Add to Compare'\nURL: http://onestopmarket.com/office-products/office-electronics.html\nOBJECTIVE: What is the price of HP Inkjet Fax Machine\nPREVIOUS ACTION: None", + "```stop [$279.49]```" + ], + [ + "OBSERVATION:\n[164] textbox 'Search' focused: True required: False\n[171] button 'Go'\n[174] link 'Find directions between two points'\n[212] heading 'Search Results'\n[216] button 'Close'\nURL: http://openstreetmap.org\nOBJECTIVE: Show me the restaurants near CMU\nPREVIOUS ACTION: None", + "```type [164] [restaurants near CMU] [1]```" + ] + ], + "template": "OBSERVATION:\n{observation}\nURL: {url}\nOBJECTIVE: {objective}\nPREVIOUS ACTION: {previous_action}", + "meta_data": { + "observation": "accessibility_tree", + "action_type": "id_accessibility_tree", + "keywords": [ + "url", + "objective", + "observation", + "previous_action" + ], + "prompt_constructor": "CoTPromptConstructor", + "answer_phrase": "In summary, the next action I will perform is", + "action_splitter": "```" + } +} \ No newline at end of file diff --git a/webarena/agent/prompts/jsons/p_direct_id_actree_3s_llama.json b/webarena/agent/prompts/jsons/p_direct_id_actree_3s_llama.json new file mode 100644 index 0000000..f87f09f --- /dev/null +++ b/webarena/agent/prompts/jsons/p_direct_id_actree_3s_llama.json @@ -0,0 +1,32 @@ +{ + "intro": "You are an autonomous 
intelligent agent tasked with navigating a web browser. The actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"Enter\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [direction=down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket.\n\nHomepage:\nIf you want to visit other websites, check out the homepage at http://homepage.com. 
It has a list of websites you can visit.\n\nYou can only issue one action at a time", + "examples": [ + [ + "Observation:\n[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)'\n\t[1749] StaticText '$279.49'\n\t[1757] button 'Add to Cart'\n\t[1760] button 'Add to Wish List'\n\t[1761] button 'Add to Compare'\nURL: http://onestopmarket.com/office-products/office-electronics.html\nObjective: What is the price of HP Inkjet Fax Machine\nPrevious action: None", + "```stop [$279.49]```" + ], + [ + "Observation:\n[164] textbox 'Search' focused: True required: False\n[171] button 'Go'\n[174] link 'Find directions between two points'\n[212] heading 'Search Results'\n[216] button 'Close'\nURL: http://openstreetmap.org\nObjective: Show me the restaurants near CMU\nPrevious action: None", + "```type [164] [restaurants near CMU] [1]```" + ], + [ + "Observation:\n[2036] button 'Sort by: New' hasPopup: menu expanded: False\n\t[587] link 'US Marine\u2019s adoption of Afghan war orphan voided'\n\t\t[989] time 'March 30, 2023 at 15:03:48 AM UTC'\n\t[602] link 'York student uses AI chatbot to get parking fine revoked'\n\t\t[1025] time 'March 15, 2023 at 7:48:34 AM UTC'\n\t[617] link 'Loveland parents furious after teachers leave, communication lagged during school threat investigation'\n\t\t[1025] time 'March 2, 2023 at 3:46:01 AM UTC'\nURL: http://reddit.com/f/news/new\nObjective: Open the most recent post that was published prior to March 1st.\nPrevious action: None", + "```scroll [down]```" + ] + ], + "template": "Observation:\n{observation}\nURL: {url}\nObjective: {objective}\nPrevious action: {previous_action}", + "meta_data": { + "observation": "accessibility_tree", + "action_type": "id_accessibility_tree", + "keywords": [ + "url", + "objective", + "observation", + "previous_action" + ], + "prompt_constructor": "DirectPromptConstructor", + "answer_phrase": "In summary, the next action I will perform is", + "action_splitter": "```", + "force_prefix": "```" + } +} \ No 
newline at end of file diff --git a/webarena/agent/prompts/jsons/test_prompt.json b/webarena/agent/prompts/jsons/test_prompt.json new file mode 100644 index 0000000..e227938 --- /dev/null +++ b/webarena/agent/prompts/jsons/test_prompt.json @@ -0,0 +1,20 @@ +{ + "intro": "", + "examples": [], + "template": " {html} \n\nYou are a helpful assistant that can assist with web navigation tasks.\nYou are given a simplified html webpage and a task description. \nYour goal is to complete the task. You can perform the specified operations below to interact with the webpage.\n\n#Valid operations: - #Click# id: Click on the element with the specified id\n- #Scroll_up#: Scroll up 1 page.\n- #Scroll_down#: Scroll down 1 page.\n- #Go_backward#: Go back to the previous page.\n- #Go_forward#: Go forward to the next page.\n- #Hover# id: Hover over the element with the specified id.\n- #Type# id \"text\": Type in the text at the element with the specified id.\n- #Select# id \"option\": Select the option at the element with the specified id.\n- #Record# \"content\": Mark content that is useful in answering the question.\n- #Answer# \"text\": output the text as the answer to the user.\n- #Exit#: Complete the task and exit the program.\n\n#Current viewport position: {position}\n\n#Previous Operation: {previous_action}\n\n#Task: {objective}\n", + "finale": "\nYour output SHOULD be in the following format:\n#Operation: {Next operation to perform}\n", + "meta_data": { + "observation": "html", + "action_type": "id_html_tree", + "keywords": [ + "url", + "html", + "objective", + "position", + "previous_action" + ], + "prompt_constructor": "MyPromptConstructor", + "answer_phrase": "", + "action_splitter": "#" + } +} \ No newline at end of file diff --git a/webarena/agent/prompts/prompt_constructor.py b/webarena/agent/prompts/prompt_constructor.py new file mode 100644 index 0000000..039b87c --- /dev/null +++ b/webarena/agent/prompts/prompt_constructor.py @@ -0,0 +1,683 @@ +import json +import re 
+from pathlib import Path +from typing import Any, TypedDict + +from browser_env import Action, ActionParsingError, Trajectory +from browser_env.env_config import URL_MAPPINGS +from browser_env.utils import StateInfo +from llms import lm_config +from llms.tokenizers import Tokenizer +from llms.utils import APIInput + + +class Instruction(TypedDict): + """Instruction for constructing prompt""" + + intro: str + examples: list[tuple[str, str]] + template: str + meta_data: dict[str, Any] + + +class PromptConstructor(object): + def __init__( + self, + instruction_path: str | Path, + lm_config: lm_config.LMConfig, + tokenizer: Tokenizer, + ): + self.instruction_path = Path(instruction_path) + self.obs_modality = "text" + self.lm_config = lm_config + instruction = json.loads(self.instruction_path.read_text()) # read_text() closes the file; a bare open() left the handle dangling + instruction["examples"] = [tuple(e) for e in instruction["examples"]] + self.instruction: Instruction = instruction + self.tokenizer = tokenizer + + def get_lm_api_input( + self, intro: str, examples: list[tuple[str, str]], current: str + ) -> APIInput: + + """Return the required format for an API""" + message: list[dict[str, str]] | str + if "openai" in self.lm_config.provider: + if self.lm_config.mode == "chat": + message = [{"role": "system", "content": intro}] + for (x, y) in examples: + message.append( + { + "role": "system", + "name": "example_user", + "content": x, + } + ) + message.append( + { + "role": "system", + "name": "example_assistant", + "content": y, + } + ) + message.append({"role": "user", "content": current}) + return message + elif self.lm_config.mode == "completion": + message = f"{intro}\n\n" + message += "Here are a few examples:\n" + for example in examples: + message += f"Observation\n:{example[0]}\n\n" + message += f"Action: {example[1]}\n\n" + message += "Now make prediction given the observation\n\n" + message += f"Observation\n:{current}\n\n" + message += "Action:" + return message + else: + raise ValueError( + f"OpenAI models do not support
mode {self.lm_config.mode}" + ) + elif "huggingface" in self.lm_config.provider: + # https://huggingface.co/blog/llama2#how-to-prompt-llama-2 + # https://github.com/facebookresearch/llama/blob/main/llama/generation.py#L320 + if "Llama-2" in self.lm_config.model: + if self.lm_config.mode == "chat": + B_INST, E_INST = "[INST]", "[/INST]" + B_SYS, E_SYS = "<>\n", "\n<>\n\n" + BOS, EOS = "", "" + # adding the system message to be the starting of the first example + examples = [ + ( + B_SYS + intro + E_SYS + examples[0][0], + examples[0][1], + ) + ] + examples[1:] + message = "".join( + [ + f"{BOS}{B_INST} {x.strip()} {E_INST} {y.strip()} {EOS}" + for (x, y) in examples + ] + ) + # add the current observation + message += f"{BOS}{B_INST} {current.strip()} {E_INST} {self.instruction['meta_data'].get('force_prefix', '')}" + + return message + else: + raise ValueError("Only chat mode is supported for Llama-2") + else: + raise ValueError( + f"Huggingface models do not support model_tag {self.lm_config.gen_config['model_tag']}" + ) + elif "ours" in self.lm_config.provider: + message = f"{intro}\n\n" + message += "Now make prediction given the observation\n\n" + message += f"Observation\n:{current}\n\n" + message += "Action:" + return message + else: + raise NotImplementedError( + f"Provider {self.lm_config.provider} not implemented" + ) + + def construct( + self, + trajectory: Trajectory, + intent: str, + meta_data: dict[str, Any] = {}, + ) -> APIInput: + raise NotImplementedError + + def map_url_to_real(self, url: str) -> str: + """Map the urls to their real world counterparts""" + for i, j in URL_MAPPINGS.items(): + if i in url: + url = url.replace(i, j) + return url + + def map_url_to_local(self, url: str) -> str: + """Map the urls to their local counterparts""" + for i, j in URL_MAPPINGS.items(): + if j in url: + url = url.replace(j, i) + # https + if j.replace("http", "https") in url: + url = url.replace(j.replace("http", "https"), i) + return url + + def 
_extract_action(self, response: str) -> str: + raise NotImplementedError + + def extract_action(self, response: str) -> str: + response = self._extract_action(response) + response = self.map_url_to_local(response) + return response + + +class DirectPromptConstructor(PromptConstructor): + """The agent will direct predict the action""" + + def __init__( + self, + instruction_path: str | Path, + lm_config: lm_config.LMConfig, + tokenizer: Tokenizer, + ): + super().__init__(instruction_path, lm_config, tokenizer) + + def construct( + self, + trajectory: Trajectory, + intent: str, + meta_data: dict[str, Any] = {}, + ) -> APIInput: + """Construct prompt given the trajectory""" + intro = self.instruction["intro"] + examples = self.instruction["examples"] + template = self.instruction["template"] + keywords = self.instruction["meta_data"]["keywords"] + state_info: StateInfo = trajectory[-1] # type: ignore[assignment] + + obs = state_info["observation"][self.obs_modality] + max_obs_length = self.lm_config.gen_config["max_obs_length"] + if max_obs_length: + obs = self.tokenizer.decode(self.tokenizer.encode(obs)[:max_obs_length]) # type: ignore[arg-type] + + page = state_info["info"]["page"] + url = page.url + previous_action_str = meta_data["action_history"][-1] + + # input x + current = template.format( + objective=intent, + url=self.map_url_to_real(url), + observation=obs, + previous_action=previous_action_str, + ) + + # make sure all keywords are replaced + assert all([f"{{k}}" not in current for k in keywords]) + prompt = self.get_lm_api_input(intro, examples, current) + return prompt + + def _extract_action(self, response: str) -> str: + action_splitter = self.instruction["meta_data"]["action_splitter"] + pattern = rf"{action_splitter}((.|\n)*?){action_splitter}" + match = re.search(pattern, response) + if match: + return match.group(1).strip() + else: + raise ActionParsingError( + f"Cannot parse action from response {response}" + ) + + +class 
CoTPromptConstructor(PromptConstructor): + """The agent will perform step-by-step reasoning before the answer""" + + def __init__( + self, + instruction_path: str | Path, + lm_config: lm_config.LMConfig, + tokenizer: Tokenizer, + ): + super().__init__(instruction_path, lm_config, tokenizer) + self.answer_phrase = self.instruction["meta_data"]["answer_phrase"] + + def construct( + self, + trajectory: Trajectory, + intent: str, + meta_data: dict[str, Any] = {}, + ) -> APIInput: + intro = self.instruction["intro"] + examples = self.instruction["examples"] + template = self.instruction["template"] + keywords = self.instruction["meta_data"]["keywords"] + state_info: StateInfo = trajectory[-1] # type: ignore[assignment] + + obs = state_info["observation"][self.obs_modality] + max_obs_length = self.lm_config.gen_config["max_obs_length"] + if max_obs_length: + obs = self.tokenizer.decode(self.tokenizer.encode(obs)[:max_obs_length]) # type: ignore[arg-type] + + page = state_info["info"]["page"] + url = page.url + previous_action_str = meta_data["action_history"][-1] + current = template.format( + objective=intent, + url=self.map_url_to_real(url), + observation=obs, + previous_action=previous_action_str, + ) + + assert all([f"{{k}}" not in current for k in keywords]) + + prompt = self.get_lm_api_input(intro, examples, current) + return prompt + + def _extract_action(self, response: str) -> str: + # find the first occurence of action + action_splitter = self.instruction["meta_data"]["action_splitter"] + pattern = rf"{action_splitter}((.|\n)*?){action_splitter}" + match = re.search(pattern, response) + if match: + return match.group(1).strip() + else: + raise ActionParsingError( + f'Cannot find the answer phrase "{self.answer_phrase}" in "{response}"' + ) + +class MyPromptConstructor(PromptConstructor): + """The agent will perform step-by-step reasoning before the answer""" + operation = [ + r"#?(Click)#?\s*([0-9]+)", + 
r"#?(Type)#?\s*([0-9]+)\s+[\'\"]{0,1}([\s\S]+)[\'\"]{0,1}", + r"#?(Select)#?\s*([0-9]+)\s+[\'\"]{0,1}(.+)[\'\"]{0,1}", + r"#?(Scroll_up)#?", + r"#?(Scroll_down)#?", + r"#?(Goto)#?\s*(https?:\/\/[-a-z0-9]+(?:\.[-a-z0-9]+)*\.(?:com|cn|edu|uk)(?:\/[-a-z0-9_:@&?=+,.!/~*'%$]*)?)", + r"#?(Go_backward)#?", + r"#?(Go_forward)#?", + r"#?(Hover)#?\s*([0-9]+)", + r"#?(Answer)#?\s+(.+)", + r"#?(Login)#?", + r"#?(Verify)#?", + r"#?(Exit)#?", + r"#?(Record)#?\s+[\'\"]{0,1}(.+)[\'\"]{0,1}", + ] + + translate = [ + "click", + "type", + "select", + "scroll [up]", + "scroll [down]", + "goto", + "go_back", + "go_forward", + "hover", + "stop", + "stop", + "stop", + "stop", + "record", + ] + + def __init__( + self, + instruction_path: str | Path, + lm_config: lm_config.LMConfig, + tokenizer: Tokenizer, + ): + super().__init__(instruction_path, lm_config, tokenizer) + self.answer_phrase = self.instruction["meta_data"]["answer_phrase"] + self.state = {} + + def construct( + self, + trajectory: Trajectory, + intent: str, + meta_data: dict[str, Any] = {}, + ) -> APIInput: + intro = self.instruction["intro"] + examples = self.instruction["examples"] + template = self.instruction["template"] + keywords = self.instruction["meta_data"]["keywords"] + finale = self.instruction["finale"] + state_info: StateInfo = trajectory[-1] # type: ignore[assignment] + + obs = state_info["observation"][self.obs_modality] + max_obs_length = self.lm_config.gen_config["max_obs_length"] + if max_obs_length: + obs = self.tokenizer.decode(self.tokenizer.encode(obs)[:max_obs_length]) # type: ignore[arg-type] + + info = state_info["info"] + obs_metadata = info["observation_metadata"]["text"] + nodes = obs_metadata["obs_nodes_info"] + position_info = obs_metadata["position_info"] + html_parser = obs_metadata["html_parser"] + self.nodes = nodes + + page = info["page"] + url = self.map_url_to_real(page.url) + position_bar = self._get_position_bar(position_info) + + history = [f"{ix}. 
{his}" for ix, his in enumerate(meta_data["action_history"])] # bind ix (was undefined -> NameError); entry 0 is skipped by history[1:] below + if len(history) == 1: + previous_action_str = "None" + else: + previous_action_str = '\n'.join(history[1:]) + + self.state.update({ + "url": url, + "html": obs, + "html_parser": html_parser, + "segment": "None", + "operation": "None", + }) + + current = template.format( + objective=intent, + url=url, + html=obs, + position=position_bar, + previous_action=previous_action_str, + ) + + assert all([f"{{k}}" not in current for k in keywords]) + + # prompt = self.get_lm_api_input(intro, examples, current) + prompt = current + finale + + return prompt + + def _extract_action(self, response: str) -> str: + # find the first occurrence of action + self.state["intention"] = self._extract_intention(response) + + for regex, act in zip(self.operation, self.translate): + match = re.search(regex, response) + + if match: + m = match.groups() + if isinstance(m, tuple): + exact_act = m[0] + param = m[1:] + else: + exact_act = m + param = [] + + param = list(param) + if act in ['click', 'hover', 'type', 'select']: + if len(param) == 0: + continue + + for node_id, node in self.nodes.items(): + if node['label'] == param[0]: + label = param[0] + hp = self.state["html_parser"] + bid = hp.id_label_converter(label) + segment = hp.get_segment(bid) + + print('[Label]', label, bid, segment) + self.state["segment"] = segment + #self._extract_segment(self.state["html"], label) + if act not in ['select']: + param[0] = node_id + break + + + if act in ['stop', 'select', 'record']: + if len(param) > 0: + param[-1] = param[-1].strip("\'\"") + + if act in ['type']: + print('In prompt constructer', param[-1]) + if len(param) > 0: + param[-1] = param[-1].strip("\'\"") + print(param[-1]) + if param[-1].endswith('\n'): + param[-1] = param[-1][:-1] + param.append('1') + else: + param.append('0') + + command = act + for p in param: + command += f" [{p}]" + + print(command) + return command + + raise ActionParsingError( + f'Cannot find the answer phrase in
"{response}"' + ) + + @staticmethod + def _get_position_bar(data): + position = data.get("position", 0.0) + page_height = data.get("page_height", 1.0) + left_bar = '-' * int(position) + right_bar = '-' * int(max(1, page_height - position)) + return f'[0{left_bar}|{round(position, 1)}{right_bar}{round(page_height, 1)}]' + + @staticmethod + def _extract_intention(response, lang='en'): + if lang == 'en': + matches = re.findall(r"#Thinking Process:\s*(.+)\s*#Operation:", response) + print('[Try to match]', matches) + else: + matches = re.findall(r"#思考过程: (.+)", response) + + if matches: + return matches[-1] + else: + return None + + @staticmethod + def _extract_segment(html: str, tag: str): + tag = f'[{tag}]' + has_content = False + + def _left(html, start): + nonlocal has_content + left_cnt, right_cnt = 0, 0 + for i in range(start, -1, -1): + if html[i] == '<': + left_cnt += 1 + elif html[i] == '>': + if html[i - 2] != '|' and html[i - 2] != '>': + has_content = True + right_cnt += 1 + elif html[i] == '|': + if html[i + 2] != '<' and html[i + 2] != '>': + has_content = True + if left_cnt == right_cnt + 1: + return i + return -1 + + def _right(html, start): + nonlocal has_content + left_cnt, right_cnt = 0, 0 + for i in range(start, len(html), 1): + if html[i] == '<': + left_cnt += 1 + elif html[i] == '>': + if html[i - 2] != '|' and html[i - 2] != '>': + has_content = True + right_cnt += 1 + elif html[i] == '|': + if html[i + 2] != '<' and html[i + 2] != '>': + has_content = True + if left_cnt + 1 == right_cnt: + return i + 1 + return -1 + + tag_start = html.find(tag) + + if tag_start == -1: + return None + + left_bound, right_bound = _left(html, tag_start), _right(html, tag_start) + while True: + if left_bound == -1 or right_bound == -1: + return None + + if has_content: + break + + else: + lb, rb = _left(html, left_bound - 1), _right(html, right_bound + 1) + if lb == -1 or rb == -1: + break + if rb - lb > 150: + break + else: + left_bound, right_bound = lb, rb + + 
segment = html[left_bound:right_bound] + + if len(segment) > 150: + return segment[:150] + '...>' + + return segment + +class NewASPromptConstructor(PromptConstructor): + """The agent will perform step-by-step reasoning before the answer""" + operation = [ + r"(click)\(\s*[\'\"]([A-Z]{1,3})[\'\"]\s*\)", + r"(type_string)\(\s*[\'\"]([A-Z]{1,3})[\'\"]\s*,\s*[\'\"]([\s\S]+)[\'\"]\s*,\s*(True|False)\s*\)", + r"(select)\(\s*[\'\"]([A-Z]{1,3})[\'\"]\s*,\s*[\'\"]([\s\S]+)[\'\"]\s*\)", + r"(scroll_page)\(\s*[\'\"]up[\'\"]\s*\)", + r"(scroll_page)\(\s*[\'\"]down[\'\"]\s*\)", + r"(jump_to)\(\s*[\'\"](.+)[\'\"]\s*,\s*(True|False)\s*\)", + r"(go)\(\s*[\'\"]backward[\'\"]\s*\)", + r"(go)\(\s*[\'\"]forward[\'\"]\s*\)", + r"(hover)\(\s*[\'\"]([A-Z]{1,3})[\'\"]\s*\)", + r"(finish)\(\s*\)", + r"(finish)\(\s*(.+)\s*\)", + r"(record)\(\s*[\'\"](.+)[\'\"]\s*\)", + r"(switch_tab)\([\d]+\)" + ] + + translate = [ + "click", + "type", + "select", + "scroll [up]", + "scroll [down]", + "goto", + "go_back", + "go_forward", + "hover", + "stop", + "stop", + "record", + "page_focus", + ] + + def __init__( + self, + instruction_path: str | Path, + lm_config: lm_config.LMConfig, + tokenizer: Tokenizer, + ): + super().__init__(instruction_path, lm_config, tokenizer) + self.answer_phrase = self.instruction["meta_data"]["answer_phrase"] + self.state = {} + + def construct( + self, + trajectory: Trajectory, + intent: str, + meta_data: dict[str, Any] = {}, + ) -> APIInput: + intro = self.instruction["intro"] + examples = self.instruction["examples"] + template = self.instruction["template"] + keywords = self.instruction["meta_data"]["keywords"] + finale = self.instruction["finale"] + state_info: StateInfo = trajectory[-1] # type: ignore[assignment] + + obs = state_info["observation"][self.obs_modality] + max_obs_length = self.lm_config.gen_config["max_obs_length"] + if max_obs_length: + obs = self.tokenizer.decode(self.tokenizer.encode(obs)[:max_obs_length]) # type: ignore[arg-type] + + info = 
state_info["info"] + obs_metadata = info["observation_metadata"]["text"] + nodes = obs_metadata["obs_nodes_info"] + position_info = obs_metadata["position_info"] + html_parser = obs_metadata["html_parser"] + tabs_str = obs_metadata["tab_title"] + self.nodes = nodes + + page = info["page"] + url = self.map_url_to_real(page.url) + position_bar = self._get_position_bar(position_info) + + history = meta_data["action_history"] + if len(history) == 1: + previous_action_str = "None" + else: + previous_action_str = '\n'.join(history[1:]) + + self.state.update({ + "url": url, + "html": obs, + "html_parser": html_parser, + "segment": "None", + "operation": "None", + }) + + current = template.format( + objective=intent, + url=url, + html=obs, + position=position_bar, + previous_action=previous_action_str, + tabs=tabs_str, + ) + + assert all([f"{{k}}" not in current for k in keywords]) + + # prompt = self.get_lm_api_input(intro, examples, current) + prompt = current + finale + + return prompt + + def _extract_action(self, response: str) -> str: + # find the first occurence of action + # self.state["intention"] = self._extract_intention(response) + + for regex, act in zip(self.operation, self.translate): + match = re.search(regex, response) + if match: + m = match.groups() + if isinstance(m, tuple): + exact_act = m[0] + param = m[1:] + else: + exact_act = m + param = [] + + print(exact_act, param) + param = list(param) + if act in ['click', 'hover', 'type', 'select']: + if len(param) == 0: + continue + + for node_id, node in self.nodes.items(): + if node['label'] == param[0]: + label = param[0] + hp = self.state["html_parser"] + bid = hp.id_label_converter(label) + segment = hp.get_segment(bid) + + print('[Label]', label, bid, segment) + self.state["segment"] = segment + #self._extract_segment(self.state["html"], label) + if act not in ['select']: + param[0] = node_id + break + + if len(param) > 0: + if act in ['stop', 'select', 'record']: + param[-1] = param[-1].strip("\'\"") 
+ if act in ['type', 'goto']: + param[-1] = '1' if param[-1] == 'True' else '0' + + command = act + for p in param: + command += f" [{p}]" + + print(command) + return command + + raise ActionParsingError( + f'Cannot find the answer phrase in "{response}"' + ) + + @staticmethod + def _get_position_bar(data): + position = data.get("position", 0.0) + page_height = data.get("page_height", 1.0) + return f"{round(position, 1)} / {round(page_height, 1)}" \ No newline at end of file diff --git a/webarena/agent/prompts/raw/new_action_prompt.py b/webarena/agent/prompts/raw/new_action_prompt.py new file mode 100644 index 0000000..5382b03 --- /dev/null +++ b/webarena/agent/prompts/raw/new_action_prompt.py @@ -0,0 +1,118 @@ +prompt = { + "intro": "", + "examples": [], + "template": """ {html} + +You are a helpful assistant that can assist with web navigation tasks. +You are given a simplified html webpage and a task description. +Your goal is to complete the task. You can use the provided functions below to interact with the current webpage. + +#Provided functions: +def click(element_id: str) -> None: + \"\"\" + Click on the element with the specified id. + + Args: + element_id: The id of the element. + \"\"\" + +def hover(element_id: str) -> None: + \"\"\" + Hover on the element with the specified id. + + Args: + element_id: The id of the element. + \"\"\" + +def select(element_id: str, option: str) -> None: + \"\"\" + Select an option from a dropdown. + + Args: + element_id: The id of the element. + option: Value of the option to select. + \"\"\" + +def type_string(element_id: str, content: str, press_enter: bool) -> None: + \"\"\" + Type a string into the element with the specified id. + + Args: + element_id: The id of the element. + content: The string to type. + press_enter: Whether to press enter after typing the string. + \"\"\" + +def scroll_page(direction: Literal['up', 'down']) -> None: + \"\"\" + Scroll down/up one page. 
+ + Args: + direction: The direction to scroll. + \"\"\" + +def go(direction: Literal['forward', 'backward']) -> None: + \"\"\" + Go forward/backward + + Args: + direction: The direction to go to. + \"\"\" + +def jump_to(url: str, new_tab: bool) -> None: + \"\"\" + Jump to the specified url. + + Args: + url: The url to jump to. + new_tab: Whether to open the url in a new tab. + \"\"\" + +def switch_tab(tab_index: int) -> None: + \"\"\" + Switch to the specified tab. + + Args: + tab_index: The index of the tab to switch to. + \"\"\" + +def user_input(message: str) -> str: + \"\"\" + Wait for user input. + + Args: + message: The message to display to the user. + + Returns: The user input. + \"\"\" + +def finish(answer: Optional[str]) -> None: + \"\"\" + Finish the task (optionally with an answer). + + Args: + answer: The answer to the task. + \"\"\" + +#Previous commands: {previous_action} + +#Window tabs: {tabs} + +#Current viewport (pages): {position} + +#Task: {objective} + +You should output one command to interact to the currrent webpage. +You should add a brief comment to your command to explain your reasoning and thinking process. +""", + "finale": "", + "meta_data": { + "observation": "html", + "action_type": "id_html_nasc_tree", + "keywords": ["url", "html", "objective", "position", "previous_action", "tabs"], + "prompt_constructor": "NewASPromptConstructor", + "answer_phrase": "", + "action_splitter": "#" + }, +} + diff --git a/webarena/agent/prompts/raw/p_cot_id_actree_2s.py b/webarena/agent/prompts/raw/p_cot_id_actree_2s.py new file mode 100644 index 0000000..b85e54c --- /dev/null +++ b/webarena/agent/prompts/raw/p_cot_id_actree_2s.py @@ -0,0 +1,82 @@ +prompt = { + "intro": """You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue. 
+ +Here's the information you'll have: +The user's objective: This is the task you're trying to complete. +The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information. +The current web page's URL: This is the page you're currently navigating. +The open tabs: These are the tabs you have open. +The previous action: This is the action you just performed. It may be helpful to track your progress. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`hover [id]`: Hover over an element with id. +`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v). +`scroll [direction=down|up]`: Scroll the page up or down. + +Tab Management Actions: +`new_tab`: Open a new, empty browser tab. +`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index. +`close_tab`: Close the currently active tab. + +URL Navigation Actions: +`goto [url]`: Navigate to a specific URL. +`go_back`: Navigate to the previously viewed page. +`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed). + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Homepage: +If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit. +http://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites. 
+ +To be successful, it is very important to follow the following rules: +1. You should only issue an action that is valid given the current observation +2. You should only issue one action at a time. +3. You should follow the examples to reason step by step and then issue the next action. +4. Generate the action in the correct format. Start with a "In summary, the next action I will perform is" phrase, followed by action inside ``````. For example, "In summary, the next action I will perform is ```click [1234]```". +5. Issue stop action when you think you have achieved the objective. Don't generate anything after stop.""", + "examples": [ + ( + """OBSERVATION: +[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)' + [1749] StaticText '$279.49' + [1757] button 'Add to Cart' + [1760] button 'Add to Wish List' + [1761] button 'Add to Compare' +URL: http://onestopmarket.com/office-products/office-electronics.html +OBJECTIVE: What is the price of HP Inkjet Fax Machine +PREVIOUS ACTION: None""", + "Let's think step-by-step. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I think I have achieved the objective. I will issue the stop action with the answer. In summary, the next action I will perform is ```stop [$279.49]```", + ), + ( + """OBSERVATION: +[164] textbox 'Search' focused: True required: False +[171] button 'Go' +[174] link 'Find directions between two points' +[212] heading 'Search Results' +[216] button 'Close' +URL: http://openstreetmap.org +OBJECTIVE: Show me the restaurants near CMU +PREVIOUS ACTION: None""", + "Let's think step-by-step. This page has a search box whose ID is [164]. According to the nominatim rule of openstreetmap, I can search for the restaurants near a location by \"restaurants near\". I can submit my typing by pressing the Enter afterwards. 
In summary, the next action I will perform is ```type [164] [restaurants near CMU] [1]```", + ), + ], + "template": """OBSERVATION: +{observation} +URL: {url} +OBJECTIVE: {objective} +PREVIOUS ACTION: {previous_action}""", + "meta_data": { + "observation": "accessibility_tree", + "action_type": "id_accessibility_tree", + "keywords": ["url", "objective", "observation", "previous_action"], + "prompt_constructor": "CoTPromptConstructor", + "answer_phrase": "In summary, the next action I will perform is", + "action_splitter": "```" + }, +} diff --git a/webarena/agent/prompts/raw/p_cot_id_actree_2s_no_na.py b/webarena/agent/prompts/raw/p_cot_id_actree_2s_no_na.py new file mode 100644 index 0000000..945cd95 --- /dev/null +++ b/webarena/agent/prompts/raw/p_cot_id_actree_2s_no_na.py @@ -0,0 +1,82 @@ +prompt = { + "intro": """You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue. + +Here's the information you'll have: +The user's objective: This is the task you're trying to complete. +The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information. +The current web page's URL: This is the page you're currently navigating. +The open tabs: These are the tabs you have open. +The previous action: This is the action you just performed. It may be helpful to track your progress. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`hover [id]`: Hover over an element with id. +`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v). 
+`scroll [direction=down|up]`: Scroll the page up or down. + +Tab Management Actions: +`new_tab`: Open a new, empty browser tab. +`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index. +`close_tab`: Close the currently active tab. + +URL Navigation Actions: +`goto [url]`: Navigate to a specific URL. +`go_back`: Navigate to the previously viewed page. +`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed). + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. + +Homepage: +If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit. +http://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites. + +To be successful, it is very important to follow the following rules: +1. You should only issue an action that is valid given the current observation +2. You should only issue one action at a time. +3. You should follow the examples to reason step by step and then issue the next action. +4. Generate the action in the correct format. Start with a "In summary, the next action I will perform is" phrase, followed by action inside ``````. For example, "In summary, the next action I will perform is ```click [1234]```". +5. Issue stop action when you think you have achieved the objective. Don't generate anything after stop.""", + "examples": [ + ( + """OBSERVATION: +[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)' + [1749] StaticText '$279.49' + [1757] button 'Add to Cart' + [1760] button 'Add to Wish List' + [1761] button 'Add to Compare' +URL: http://onestopmarket.com/office-products/office-electronics.html +OBJECTIVE: What is the price of HP Inkjet Fax Machine +PREVIOUS ACTION: None""", + "Let's think step-by-step. 
This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I think I have achieved the objective. I will issue the stop action with the answer. In summary, the next action I will perform is ```stop [$279.49]```", + ), + ( + """OBSERVATION: +[164] textbox 'Search' focused: True required: False +[171] button 'Go' +[174] link 'Find directions between two points' +[212] heading 'Search Results' +[216] button 'Close' +URL: http://openstreetmap.org +OBJECTIVE: Show me the restaurants near CMU +PREVIOUS ACTION: None""", + "Let's think step-by-step. This page has a search box whose ID is [164]. According to the nominatim rule of openstreetmap, I can search for the restaurants near a location by \"restaurants near\". I can submit my typing by pressing the Enter afterwards. In summary, the next action I will perform is ```type [164] [restaurants near CMU] [1]```", + ), + ], + "template": """OBSERVATION: +{observation} +URL: {url} +OBJECTIVE: {objective} +PREVIOUS ACTION: {previous_action}""", + "meta_data": { + "observation": "accessibility_tree", + "action_type": "id_accessibility_tree", + "keywords": ["url", "objective", "observation", "previous_action"], + "prompt_constructor": "CoTPromptConstructor", + "answer_phrase": "In summary, the next action I will perform is", + "action_splitter": "```" + }, +} diff --git a/webarena/agent/prompts/raw/p_direct_id_actree_2s.py b/webarena/agent/prompts/raw/p_direct_id_actree_2s.py new file mode 100644 index 0000000..8d4e4f6 --- /dev/null +++ b/webarena/agent/prompts/raw/p_direct_id_actree_2s.py @@ -0,0 +1,80 @@ +prompt = { + "intro": """You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue. + +Here's the information you'll have: +The user's objective: This is the task you're trying to complete. 
+The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information. +The current web page's URL: This is the page you're currently navigating. +The open tabs: These are the tabs you have open. +The previous action: This is the action you just performed. It may be helpful to track your progress. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`hover [id]`: Hover over an element with id. +`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v). +`scroll [direction=down|up]`: Scroll the page up or down. + +Tab Management Actions: +`new_tab`: Open a new, empty browser tab. +`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index. +`close_tab`: Close the currently active tab. + +URL Navigation Actions: +`goto [url]`: Navigate to a specific URL. +`go_back`: Navigate to the previously viewed page. +`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed). + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. + +Homepage: +If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit. +http://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites. + +To be successful, it is very important to follow the following rules: +1. 
You should only issue an action that is valid given the current observation +2. You should only issue one action at a time. +3. Generate the action in the correct format. Always put the action inside a pair of ```. For example, ```click [1234]```. +5. Issue stop action when you think you have achieved the objective. Don't generate anything after stop.""", + "examples": [ + ( + """OBSERVATION: +[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)' + [1749] StaticText '$279.49' + [1757] button 'Add to Cart' + [1760] button 'Add to Wish List' + [1761] button 'Add to Compare' +URL: http://onestopmarket.com/office-products/office-electronics.html +OBJECTIVE: What is the price of HP Inkjet Fax Machine +PREVIOUS ACTION: None""", + "```stop [$279.49]```", + ), + ( + """OBSERVATION: +[164] textbox 'Search' focused: True required: False +[171] button 'Go' +[174] link 'Find directions between two points' +[212] heading 'Search Results' +[216] button 'Close' +URL: http://openstreetmap.org +OBJECTIVE: Show me the restaurants near CMU +PREVIOUS ACTION: None""", + "```type [164] [restaurants near CMU] [1]```", + ), + ], + "template": """OBSERVATION: +{observation} +URL: {url} +OBJECTIVE: {objective} +PREVIOUS ACTION: {previous_action}""", + "meta_data": { + "observation": "accessibility_tree", + "action_type": "id_accessibility_tree", + "keywords": ["url", "objective", "observation", "previous_action"], + "prompt_constructor": "DirectPromptConstructor", + "action_splitter": "```" + }, +} diff --git a/webarena/agent/prompts/raw/p_direct_id_actree_2s_no_na.py b/webarena/agent/prompts/raw/p_direct_id_actree_2s_no_na.py new file mode 100644 index 0000000..c399454 --- /dev/null +++ b/webarena/agent/prompts/raw/p_direct_id_actree_2s_no_na.py @@ -0,0 +1,81 @@ +prompt = { + "intro": """You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue. 
+ +Here's the information you'll have: +The user's objective: This is the task you're trying to complete. +The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information. +The current web page's URL: This is the page you're currently navigating. +The open tabs: These are the tabs you have open. +The previous action: This is the action you just performed. It may be helpful to track your progress. + +The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`hover [id]`: Hover over an element with id. +`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v). +`scroll [direction=down|up]`: Scroll the page up or down. + +Tab Management Actions: +`new_tab`: Open a new, empty browser tab. +`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index. +`close_tab`: Close the currently active tab. + +URL Navigation Actions: +`goto [url]`: Navigate to a specific URL. +`go_back`: Navigate to the previously viewed page. +`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed). + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. + +Homepage: +If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit. +http://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites. + +To be successful, it is very important to follow the following rules: +1. 
You should only issue an action that is valid given the current observation +2. You should only issue one action at a time. +4. Generate the action in the correct format, wrap the action inside ``````. For example, ```click [1234]```". +5. Issue stop action when you think you have achieved the objective.""", + "examples": [ + ( + """OBSERVATION: +[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)' + [1749] StaticText '$279.49' + [1757] button 'Add to Cart' + [1760] button 'Add to Wish List' + [1761] button 'Add to Compare' +URL: http://onestopmarket.com/office-products/office-electronics.html +OBJECTIVE: What is the price of HP Inkjet Fax Machine +PREVIOUS ACTION: None""", + "```stop [$279.49]```", + ), + ( + """OBSERVATION: +[164] textbox 'Search' focused: True required: False +[171] button 'Go' +[174] link 'Find directions between two points' +[212] heading 'Search Results' +[216] button 'Close' +URL: http://openstreetmap.org +OBJECTIVE: Show me the restaurants near CMU +PREVIOUS ACTION: None""", + "```type [164] [restaurants near CMU] [1]```", + ), + ], + "template": """OBSERVATION: +{observation} +URL: {url} +OBJECTIVE: {objective} +PREVIOUS ACTION: {previous_action}""", + "meta_data": { + "observation": "accessibility_tree", + "action_type": "id_accessibility_tree", + "keywords": ["url", "objective", "observation", "previous_action"], + "prompt_constructor": "CoTPromptConstructor", + "answer_phrase": "In summary, the next action I will perform is", + "action_splitter": "```" + }, +} diff --git a/webarena/agent/prompts/raw/p_direct_id_actree_3s_llama.py b/webarena/agent/prompts/raw/p_direct_id_actree_3s_llama.py new file mode 100644 index 0000000..6278d2b --- /dev/null +++ b/webarena/agent/prompts/raw/p_direct_id_actree_3s_llama.py @@ -0,0 +1,83 @@ +prompt = { + "intro": """You are an autonomous intelligent agent tasked with navigating a web browser. 
The actions you can perform fall into several categories: + +Page Operation Actions: +`click [id]`: This action clicks on an element with a specific id on the webpage. +`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. +`hover [id]`: Hover over an element with id. +`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v). +`scroll [direction=down|up]`: Scroll the page up or down. + +Tab Management Actions: +`new_tab`: Open a new, empty browser tab. +`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index. +`close_tab`: Close the currently active tab. + +URL Navigation Actions: +`goto [url]`: Navigate to a specific URL. +`go_back`: Navigate to the previously viewed page. +`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed). + +Completion Action: +`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. + +Homepage: +If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit. 
+ +You can only issue one action at a time""", + + "examples": [ + ( + """Observation: +[1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)' + [1749] StaticText '$279.49' + [1757] button 'Add to Cart' + [1760] button 'Add to Wish List' + [1761] button 'Add to Compare' +URL: http://onestopmarket.com/office-products/office-electronics.html +Objective: What is the price of HP Inkjet Fax Machine +Previous action: None""", + "```stop [$279.49]```", + ), + ( + """Observation: +[164] textbox 'Search' focused: True required: False +[171] button 'Go' +[174] link 'Find directions between two points' +[212] heading 'Search Results' +[216] button 'Close' +URL: http://openstreetmap.org +Objective: Show me the restaurants near CMU +Previous action: None""", + "```type [164] [restaurants near CMU] [1]```", + ), + ( + """Observation: +[2036] button 'Sort by: New' hasPopup: menu expanded: False + [587] link 'US Marine’s adoption of Afghan war orphan voided' + [989] time 'March 30, 2023 at 15:03:48 AM UTC' + [602] link 'York student uses AI chatbot to get parking fine revoked' + [1025] time 'March 15, 2023 at 7:48:34 AM UTC' + [617] link 'Loveland parents furious after teachers leave, communication lagged during school threat investigation' + [1025] time 'March 2, 2023 at 3:46:01 AM UTC' +URL: http://reddit.com/f/news/new +Objective: Open the most recent post that was published prior to March 1st. 
+Previous action: None""", + "```scroll [down]```", + ) + ], + "template": """Observation: +{observation} +URL: {url} +Objective: {objective} +Previous action: {previous_action}""", + "meta_data": { + "observation": "accessibility_tree", + "action_type": "id_accessibility_tree", + "keywords": ["url", "objective", "observation", "previous_action"], + "prompt_constructor": "DirectPromptConstructor", + "answer_phrase": "In summary, the next action I will perform is", + "action_splitter": "```", + "force_prefix": "```" + }, +} diff --git a/webarena/agent/prompts/raw/test_prompt.py b/webarena/agent/prompts/raw/test_prompt.py new file mode 100644 index 0000000..5979e58 --- /dev/null +++ b/webarena/agent/prompts/raw/test_prompt.py @@ -0,0 +1,41 @@ +prompt = { + "intro": "", + "examples": [], + "template": """ {html} + +You are a helpful assistant that can assist with web navigation tasks. +You are given a simplified html webpage and a task description. +Your goal is to complete the task. You can perform the specified operations below to interact with the webpage. + +#Valid operations: - #Click# id: Click on the element with the specified id +- #Scroll_up#: Scroll up 1 page. +- #Scroll_down#: Scroll down 1 page. +- #Go_backward#: Go back to the previous page. +- #Go_forward#: Go forward to the next page. +- #Hover# id: Hover over the element with the specified id. +- #Type# id "text": Type in the text at the element with the specified id. +- #Select# id "option": Select the option at the element with the specified id. +- #Record# "content": Mark content that is useful in answering the question. +- #Answer# "text": output the text as the answer to the user. +- #Exit#: Complete the task and exit the program. 
def run() -> None:
    """Export every raw prompt module as a JSON file.

    Each ``agent/prompts/raw/<name>.py`` (resolved against the current
    working directory) is imported as ``agent.prompts.raw.<name>`` and its
    module-level ``prompt`` object is written to
    ``agent/prompts/jsons/<name>.json``. The Python sources are the form
    meant for editing; the JSON files are generated from them.
    """
    raw_pattern = "agent/prompts/raw/*.py"
    json_dir = "agent/prompts/jsons"

    for source_path in glob.glob(raw_pattern):
        # Derive the importable module name from the file name.
        module_name = os.path.basename(source_path).replace(".py", "")
        prompt = importlib.import_module(
            f"agent.prompts.raw.{module_name}"
        ).prompt

        # The output directory is only created once there is something to write.
        os.makedirs(json_dir, exist_ok=True)
        with open(f"{json_dir}/{module_name}.json", "w+") as out_file:
            json.dump(prompt, out_file, indent=2)

    print("Done convert python files to json")
# Every run writes to its own log file under LOG_FOLDER; the name combines a
# local timestamp with a random suffix.
LOG_FOLDER = "log_files"
Path(LOG_FOLDER).mkdir(parents=True, exist_ok=True)
LOG_FILE_NAME = "{}/log_{}_{}.log".format(
    LOG_FOLDER,
    time.strftime("%Y%m%d%H%M%S", time.localtime()),
    random.randint(0, 10000),
)

# Shared record layout for both handlers.
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)

# Console output.
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

# Per-run log file.
file_handler = logging.FileHandler(LOG_FILE_NAME)
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
parser.add_argument("--save_trace_enabled", action="store_true") + parser.add_argument("--sleep_after_execution", type=float, default=0.0) + + parser.add_argument("--max_steps", type=int, default=30) + + # agent config + parser.add_argument("--agent_type", type=str, default="prompt") + parser.add_argument( + "--instruction_path", + type=str, + default="agents/prompts/state_action_agent.json", + ) + parser.add_argument( + "--parsing_failure_th", + help="When concesecutive parsing failure exceeds this threshold, the agent will stop", + type=int, + default=3, + ) + parser.add_argument( + "--repeating_action_failure_th", + help="When concesecutive repeating action exceeds this threshold, the agent will stop", + type=int, + default=3, + ) + + # lm config + parser.add_argument("--provider", type=str, default="openai") + parser.add_argument("--model", type=str, default="gpt-3.5-turbo-0613") + parser.add_argument("--mode", type=str, default="chat") + parser.add_argument("--temperature", type=float, default=1.0) + parser.add_argument("--top_p", type=float, default=0.9) + parser.add_argument("--context_length", type=int, default=0) + parser.add_argument("--max_tokens", type=int, default=384) + parser.add_argument("--stop_token", type=str, default=None) + parser.add_argument("--cuda", type=str, default='0') + parser.add_argument( + "--max_retry", + type=int, + help="max retry times to perform generations when parsing fails", + default=1, + ) + parser.add_argument( + "--max_obs_length", + type=int, + help="when not zero, will truncate the observation to this length before feeding to the model", + default=1920, + ) + parser.add_argument( + "--model_endpoint", + help="huggingface model endpoint", + type=str, + default="", + ) + + # example config + parser.add_argument("--test_start_idx", type=int, default=0) + parser.add_argument("--test_end_idx", type=int, default=1000) + parser.add_argument("--sample", type=int, default=1) + + # logging related + 
def early_stop(
    trajectory: Trajectory, max_steps: int, thresholds: dict[str, int]
) -> tuple[bool, str]:
    """Decide whether the current episode should be cut short.

    The entries at odd indices of ``trajectory`` are treated as the actions
    taken so far. Three conditions are checked in order:

    1. the step count ``(len(trajectory) - 1) / 2`` reached ``max_steps``;
    2. the last ``thresholds["parsing_failure"]`` actions all have type
       ``ActionTypes.NONE``;
    3. the latest action was repeated ``thresholds["repeating_action"]``
       times -- consecutively for non-typing actions, anywhere in the
       history for typing actions.

    Args:
        trajectory: Interaction history.
        max_steps: Step budget for the episode.
        thresholds: Limits keyed by ``"parsing_failure"`` and
            ``"repeating_action"``.

    Returns:
        ``(True, reason)`` when a condition fires, otherwise ``(False, "")``.
    """
    # Step budget exhausted.
    if (len(trajectory) - 1) / 2 >= max_steps:
        return True, f"Reach max steps {max_steps}"

    actions: list[Action] = trajectory[1::2]  # type: ignore[assignment]

    # Too many consecutive unparsable actions.
    parse_limit = thresholds["parsing_failure"]
    recent = actions[-parse_limit:]
    if len(recent) >= parse_limit and all(
        [act["action_type"] == ActionTypes.NONE for act in recent]
    ):
        return True, f"Failed to parse actions for {parse_limit} times"

    repeat_limit = thresholds["repeating_action"]
    if not actions:
        return False, ""

    latest: Action = actions[-1]

    if latest["action_type"] == ActionTypes.TYPE:
        # Typing actions are counted over the whole history, not only the
        # most recent window.
        repeats = sum([is_equivalent(act, latest) for act in actions])
        if repeats >= repeat_limit:
            return True, f"Same typing action for {repeat_limit} times"
        return False, ""

    # Any other action must fill the entire trailing window.
    recent = actions[-repeat_limit:]
    if len(recent) >= repeat_limit and all(
        [is_equivalent(act, latest) for act in recent]
    ):
        return True, f"Same action for {repeat_limit} times"

    return False, ""
def test(
    args: argparse.Namespace,
    agent: Agent | PromptAgent | TeacherForcingAgent,
    config_file_list: list[str],
) -> None:
    """Run ``agent`` on each task config, score it, and save a trace per task.

    For every config file the browser environment is reset and the agent
    acts until it emits a stop action, the environment reports termination,
    or ``early_stop`` fires. The trajectory is then scored by the evaluator
    selected for that config, and ``trace_<task_id>.json`` (id, score and
    the prompt/action pairs) is written under ``<result_dir>/traces``.

    Args:
        args: Parsed run options (environment settings, early-stop
            thresholds, result directory, ...).
        agent: Agent that is reset per task and asked for the next action.
        config_file_list: Paths of the task config JSON files to run.

    Errors raised while handling a single task are logged (unhandled ones
    are also appended to ``<result_dir>/error.txt``) and the loop continues
    with the next task. The environment is closed even if the loop is
    interrupted.
    """
    scores = []
    max_steps = args.max_steps

    early_stop_thresholds = {
        "parsing_failure": args.parsing_failure_th,
        "repeating_action": args.repeating_action_failure_th,
    }

    env = ScriptBrowserEnv(
        headless=not args.render,
        slow_mo=args.slow_mo,
        observation_type=args.observation_type,
        current_viewport_only=args.current_viewport_only,
        viewport_size={
            "width": args.viewport_width,
            "height": args.viewport_height,
        },
        save_trace_enabled=args.save_trace_enabled,
        sleep_after_execution=args.sleep_after_execution,
    )

    try:
        for config_file in tqdm(config_file_list):
            try:
                render_helper = RenderHelper(
                    config_file, args.result_dir, args.action_set_tag
                )

                # get intent
                with open(config_file) as f:
                    _c = json.load(f)
                intent = _c["intent"]
                task_id = _c["task_id"]
                # These task ids are paused for 30 minutes before running.
                if task_id in range(600, 650) or task_id in range(681, 689):
                    print("Reddit post task. Sleep 30 mins.")
                    time.sleep(1800)

                # automatically login
                if _c["storage_state"]:
                    cookie_file_name = os.path.basename(_c["storage_state"])
                    comb = get_site_comb_from_filepath(cookie_file_name)
                    temp_dir = tempfile.mkdtemp()
                    # subprocess to renew the cookie
                    subprocess.run(
                        [
                            "python",
                            "browser_env/auto_login.py",
                            "--auth_folder",
                            temp_dir,
                            "--site_list",
                            *comb,
                        ]
                    )
                    _c["storage_state"] = f"{temp_dir}/{cookie_file_name}"
                    assert os.path.exists(_c["storage_state"])
                    # From here on use a copy of the config that points at
                    # the renewed cookie file.
                    config_file = f"{temp_dir}/{os.path.basename(config_file)}"
                    with open(config_file, "w") as f:
                        json.dump(_c, f)

                logger.info(f"[Config file]: {config_file}")
                logger.info(f"[Intent]: {intent}")

                agent.reset(config_file)
                trajectory: Trajectory = []
                obs, info = env.reset(options={"config_file": config_file})
                obs["text"] = obs["text"][0]
                state_info: StateInfo = {"observation": obs, "info": info}
                trajectory.append(state_info)

                meta_data = {"action_history": ["None"]}

                trace = []
                while True:
                    early_stop_flag, stop_info = early_stop(
                        trajectory, max_steps, early_stop_thresholds
                    )

                    # No prompt is built on an early stop. Reset it every
                    # iteration so the trace never reuses the previous
                    # step's prompt and never reads an unbound name.
                    prompt = None
                    if early_stop_flag:
                        action = create_stop_action(
                            f"Early stop: {stop_info}"
                        )
                    else:
                        prompt = agent.prompt_constructor.construct(
                            trajectory, intent, meta_data
                        )
                        try:
                            action = agent.next_action(
                                trajectory, intent, meta_data=meta_data
                            )
                        except ValueError as e:
                            # Turn the error into a stop action carrying
                            # the message.
                            action = create_stop_action(f"ERROR: {str(e)}")

                    trajectory.append(action)

                    action_str = get_action_description(
                        action,
                        state_info["info"]["observation_metadata"],
                        action_set_tag=args.action_set_tag,
                        prompt_constructor=agent.prompt_constructor
                        if isinstance(agent, PromptAgent)
                        else None,
                    )
                    render_helper.render(
                        action, state_info, meta_data, args.render_screenshot
                    )
                    meta_data["action_history"].append(action_str)

                    trace.append({
                        "source": prompt,
                        "target": action_str.split(' #HTML Segment')[0],
                    })

                    if action["action_type"] == ActionTypes.STOP:
                        break

                    obs, _, terminated, _, info = env.step(action)
                    obs["text"] = obs["text"][0]
                    state_info = {"observation": obs, "info": info}
                    trajectory.append(state_info)

                    if terminated:
                        # Keep the state/action alternation intact by
                        # adding a placeholder stop action.
                        trajectory.append(create_stop_action(""))
                        break

                evaluator = evaluator_router(config_file)
                score = evaluator(
                    trajectory=trajectory,
                    config_file=config_file,
                    page=env.page,
                    client=env.get_page_client(env.page),
                )

                scores.append(score)

                if score == 1:
                    logger.info(f"[Result] (PASS) {config_file}")
                else:
                    logger.info(f"[Result] (FAIL) {config_file}")

                if args.save_trace_enabled:
                    env.save_trace(
                        Path(args.result_dir) / "traces" / f"{task_id}.zip"
                    )
                result = {
                    "id": task_id,
                    "score": score,
                    "trace": trace,
                }
                trace_path = (
                    Path(args.result_dir) / "traces" / f"trace_{task_id}.json"
                )
                with open(trace_path, "w") as f:
                    json.dump(result, f, indent=4)

            except openai.OpenAIError as e:
                logger.info(f"[OpenAI Error] {repr(e)}")
            except Exception as e:
                logger.info(f"[Unhandled Error] {repr(e)}")
                import traceback

                # write to error file
                with open(Path(args.result_dir) / "error.txt", "a") as f:
                    f.write(f"[Config file]: {config_file}\n")
                    f.write(f"[Unhandled Error] {repr(e)}\n")
                    f.write(traceback.format_exc())  # stack trace
    finally:
        # Release the environment even when the run is interrupted.
        env.close()

    if len(scores) > 0:
        logger.info(f"Average score: {sum(scores) / len(scores)}")
def prepare(args: argparse.Namespace) -> None:
    """Convert the prompt files to JSON and set up the result directory.

    When ``args.result_dir`` is empty a timestamped directory under
    ``cache/`` is used. The resolved path is written back to
    ``args.result_dir``, a ``traces`` sub-directory is created, and the
    current log file name is appended to ``log_files.txt`` in that directory.
    """
    # convert prompt python files to json
    from agent.prompts import to_json

    to_json.run()

    # prepare result dir
    result_dir = args.result_dir
    if not result_dir:
        result_dir = (
            f"cache/results_{time.strftime('%Y%m%d%H%M%S', time.localtime())}"
        )
    result_path = Path(result_dir)
    if not result_path.exists():
        result_path.mkdir(parents=True, exist_ok=True)
        logger.info(f"Create result dir: {result_dir}")
    # Always publish the resolved directory so later steps never see "".
    args.result_dir = result_dir

    (result_path / "traces").mkdir(parents=True, exist_ok=True)

    # log the log file
    with open(os.path.join(result_dir, "log_files.txt"), "a+") as f:
        f.write(f"{LOG_FILE_NAME}\n")


def get_unfinished(config_files: list[str], result_dir: str) -> list[str]:
    """Return the config files that have no trace JSON in ``result_dir``.

    A task counts as finished when ``<result_dir>/traces`` contains a JSON
    file whose stem has the form ``<prefix>_<task_id>`` (for example
    ``trace_12.json``) and ``<task_id>`` equals the config file's stem.
    JSON files that do not follow that naming are ignored.

    Args:
        config_files: Candidate task config paths, named ``<task_id>.json``.
        result_dir: Directory that holds the ``traces`` sub-directory.

    Returns:
        The unfinished config files, in their original order.
    """
    # Escape the directory so characters such as "[" are matched literally.
    pattern = f"{glob.escape(str(result_dir))}/traces/*.json"
    finished_ids = set()
    for result_file in glob.glob(pattern):
        stem_parts = os.path.basename(result_file).split(".")[0].split("_")
        if len(stem_parts) < 2:
            # Not a "<prefix>_<task_id>.json" file; nothing to record.
            continue
        finished_ids.add(stem_parts[1])

    return [
        config_file
        for config_file in config_files
        if os.path.basename(config_file).split(".")[0] not in finished_ids
    ]


def dump_config(args: argparse.Namespace) -> None:
    """Write the run arguments to ``config.json`` in the result directory.

    An existing ``config.json`` is left untouched.
    """
    config_file = Path(args.result_dir) / "config.json"
    if config_file.exists():
        return
    with open(config_file, "w") as f:
        json.dump(vars(args), f, indent=4)
    logger.info(f"Dump config to {config_file}")
if __name__ == "__main__":
    # Script entry point: parse options, prepare the result directory, then
    # run every task config in [test_start_idx, test_end_idx) that exists
    # in the "config_files" directory next to this file.
    args = config()
    args.sleep_after_execution = 2.0
    prepare(args)

    config_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "config_files"
    )
    candidate_files = (
        os.path.join(config_dir, f"{i}.json")
        for i in range(args.test_start_idx, args.test_end_idx)
    )
    test_file_list = [
        path for path in candidate_files if os.path.exists(path)
    ]

    if len(test_file_list) == 0:
        logger.info("No task left to run")
    else:
        print(f"Total {len(test_file_list)} tasks left")
        # Fixed run settings applied regardless of the command line.
        args.render = False
        args.render_screenshot = True
        args.save_trace_enabled = True

        args.current_viewport_only = True
        dump_config(args)

        agent = construct_agent(args)
        test(args, agent, test_file_list)