# Updated evaluation script with full Universal Strategies section in SYSTEM_PROMPT
import json
import re

from openai import OpenAI

# Set your OpenAI API key
# openai.api_key = "YOUR_API_KEY_HERE"

# System prompt with full Universal Strategies explanations and examples
SYSTEM_PROMPT = """
# Instruction
You are a GUI Agent responsible for completing tasks on a web interface. When given a task, your first step is to plan by breaking the task down into steps. The result of this decomposition should be a sequence of strategy steps.
Below is a set of **Universal Strategy Templates** for your reference. You must select the appropriate templates and, based on them, generate the specific strategy content for the given task.

# **Universal Strategies with Explanations and Examples**

| Universal Strategy ID | Universal Strategy | Explanation (for GUI Agent) | Example |
| :--- | :--- | :--- | :--- |
| US1 | Navigate To Page/Section | Move to a specific page, view, or section within the application interface. This typically involves clicking links, menu items, or buttons that lead to a new screen or area. | Navigate to the Orders Report page. |
| US2 | Search/Filter/Sort Data | Use search boxes, filters (like dropdowns or checkboxes), or sorting controls to locate, narrow down, or reorder data displayed in a list, table, or results view. | Filter issues by the label "Bug." |
| US3 | Configure Parameters/Settings | Set or adjust specific options, parameters, or settings—often within a form or configuration panel—before executing an action (such as generating a report or creating an item). | Configure the date range for the report from 2023-01-01 to 2023-01-31. |
| US4 | Execute Action/Process | Initiate a specific process or action by interacting with a primary action element, such as clicking a "Submit," "Generate," "Calculate," "Add," "Follow," or "Save" button. | Generate and submit the sales report. |
| US5 | View/Inspect Item Details | Open or focus on a specific item (e.g., an order, product, user profile, issue) to examine its detailed information—often by clicking on the item in a list or a dedicated "View"/"Details" button. | View the details of Order #12345. |
| US6 | Extract/Retrieve Information | Read and capture specific pieces of information (such as text, numbers, or status) displayed on the current page or within an item's details. | Retrieve the purchase date from the order details section. |
| US7 | Analyze/Evaluate/Verify Data | Perform cognitive tasks on the displayed data, such as comparing values, checking conditions, interpreting content, identifying patterns, or making judgments based on the information. | Analyze review content to identify mentions of "poor quality." |
| US8 | Navigate Within Data/Results | Move through multiple pages or sections of data or search results—typically using pagination controls (e.g., "Next," "Previous," page numbers) or scrolling. | Navigate through search result pages using the "Next" button. |
| US9 | Create Item | Complete the process of adding a new entity (e.g., post, project, issue, user, rule) to the system—typically by filling out a form and submitting it. | Create a new project named "My Research Project." |
| US10 | Update/Modify Item | Change or edit attributes or content of an existing item (e.g., update profile information, modify an order, edit a post, change product status). | Update the user profile homepage URL to "https://new-website.com." |
| US11 | Delete Item | Remove an existing item (e.g., a review, product, or post) from the system—often involving a selection step and a confirmation. | Delete the selected pending review. |

# Input example
```json
[{
  "task_id": 4,
  "sites": ["shopping_admin"],
  "intent": "What are the top-3 best-selling product in Jan 2023"
}]
```

# Output Requirements
Your output must be enclosed in ```json``` and include your final strategy design, alongside the referenced universal strategies and their IDs.

# Output Example
```json
{
  "task_id": 4,
  "strategies": [
    "Navigate to Bestsellers Report",
    "Configure Report Parameters for January 2023",
    "Generate and Review Bestsellers Report"
  ],
  "universal_strategies": [
    "Navigate To Page/Section",
    "Configure Parameters/Settings",
    "Execute Action/Process"
  ],
  "universal_strategy_ids": ["US1", "US3", "US4"]
}
```
"""
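# A minimal sketch of one record in the input file consumed by evaluate_accuracy below,
# inferred from the fields the script reads ("task_id", "sites", "intent") plus the gold
# "universal_strategy_ids" it compares against. The values are illustrative only (taken
# from the prompt's input/output examples); this constant is not used by the script itself.
EXAMPLE_CASE = {
    "task_id": 4,
    "sites": ["shopping_admin"],
    "intent": "What are the top-3 best-selling product in Jan 2023",
    "universal_strategy_ids": ["US1", "US3", "US4"],  # gold labels used for the set comparison
}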
client = OpenAI(
    # If no environment variable is configured, replace the line below with your Bailian API key: api_key="sk-xxx"
    api_key="sk-xxx",
    base_url='https://aiproxy.lmzgc.cn:8080/v1'
)


def evaluate_accuracy(input_file: str, output_file: str):
    with open(input_file, 'r', encoding='utf-8') as f:
        cases = json.load(f)

    results = []
    correct_count = 0
    total_cases = len(cases)

    # Function to save current results and summary
    def save_results(processed_count):
        current_accuracy = correct_count / processed_count if processed_count else 0.0
        summary = {
            "total_cases": total_cases,
            "processed_cases": processed_count,
            "correct_cases": correct_count,
            "accuracy": current_accuracy
        }
        # Save detailed results and summary
        with open(output_file, 'w', encoding='utf-8') as f_out:
            json.dump({"summary": summary, "results": results}, f_out, indent=2, ensure_ascii=False)
        print(f"Intermediate results saved ({processed_count}/{total_cases}): Accuracy = {current_accuracy:.2%}")

    for i, case in enumerate(cases):
        # Prepare the user message payload
        user_message = json.dumps([{
            "task_id": case["task_id"],
            "sites": case["sites"],
            "intent": case["intent"]
        }], ensure_ascii=False)

        print(f"Processing case {i+1}/{total_cases}, task_id: {case.get('task_id', 'N/A')}")

        # Call the chat completion API (DeepSeek reasoner served through the proxy)
        model_id = "aiproxy/deepseek-reasoner"
        completion = client.chat.completions.create(
            # model="deepseek-r1-distill-qwen-7b",
            model=model_id,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_message}
            ]
        )

        # reasoning_content is only populated by reasoning models; fall back to None when absent
        reasoning_content = getattr(completion.choices[0].message, "reasoning_content", None)
        output_content = completion.choices[0].message.content.strip()
        # print(f"output {output_content}")  # Optional: reduce verbose output during the run

        actual_ids = []  # Initialize actual_ids
        try:
            # Extract the JSON string from potential markdown fences.
            # Use a regex to find content between ```json and ```, allowing for optional whitespace.
            match = re.search(r'```json\s*(.*?)\s*```', output_content, re.DOTALL)
            if match:
                json_str = match.group(1).strip()  # Extract the JSON part and strip whitespace
            else:
                # If no markdown fences are found, assume the entire content might be JSON
                json_str = output_content
            output_json = json.loads(json_str)
            actual_ids = output_json.get("universal_strategy_ids", [])
        except json.JSONDecodeError as e:
            # More informative error logging
            print(f"Warning: Failed to parse JSON for task_id {case.get('task_id', 'N/A')}. Error: {e}")
            print(f"Content attempted to parse: '{json_str if 'json_str' in locals() else output_content}'")  # Log the string that failed parsing
            # actual_ids remains [] as initialized

        expected_ids = case.get("universal_strategy_ids", [])
        is_correct = set(actual_ids) == set(expected_ids)
        # print(f"expected_ids: {expected_ids}")
        # print(f"is_correct : {is_correct}")  # Optional: reduce verbose output
        if is_correct:
            correct_count += 1

        results.append({
            "task_id": case["task_id"],
            "task_intent": case["intent"],
            "expected_ids": expected_ids,
            "actual_ids": actual_ids,
            "correct": is_correct,
            "model": model_id,
            "model_output": output_content,
            "model_reasoning": reasoning_content
        })

        # Save results every 10 cases and also after the last case
        processed_count = i + 1
        if processed_count % 10 == 0 or processed_count == total_cases:
            save_results(processed_count)

    # Final summary (the results file was already written by the last save_results call in the loop)
    final_accuracy = correct_count / total_cases if total_cases else 0.0
    print(f"\nEvaluation completed: {correct_count}/{total_cases} correct, final accuracy = {final_accuracy:.2%}")
    print(f"Final results saved to {output_file}")


if __name__ == "__main__":
    # Make sure the input/output file names are correct
    evaluate_accuracy("tasks_output.json", "evaluation_results.json")

# Save this as evaluate_accuracy.py
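
# Optional post-hoc analysis sketch (not called by the script above): it reads the results
# file written by save_results and prints, for each expected universal strategy ID, how many
# of the cases containing that ID were scored fully correct. The function name and the
# breakdown it reports are additions of this sketch, not part of the original evaluation flow.
def per_strategy_breakdown(results_file: str = "evaluation_results.json"):
    from collections import Counter

    with open(results_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    expected_counts = Counter()  # how often each ID appears among the gold labels
    correct_counts = Counter()   # how often a case containing that ID was fully correct
    for entry in data["results"]:
        for strategy_id in entry["expected_ids"]:
            expected_counts[strategy_id] += 1
            if entry["correct"]:
                correct_counts[strategy_id] += 1

    for strategy_id in sorted(expected_counts):
        total = expected_counts[strategy_id]
        hits = correct_counts[strategy_id]
        print(f"{strategy_id}: {hits}/{total} cases fully correct ({hits / total:.2%})")

# Example usage: per_strategy_breakdown("evaluation_results.json")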