# trace_synthesis/analysis/evaluation.py

# Updated evaluation script with full Universal Strategies section in SYSTEM_PROMPT

import json
from openai import OpenAI
import re


# Set your OpenAI API key
# openai.api_key = "YOUR_API_KEY_HERE"

# System prompt passed as the "system" message for each evaluated case.
# It contains, in order:
#   * the instruction to plan a task as a sequence of strategy steps,
#   * a markdown table of the eleven Universal Strategy templates (US1-US11),
#     each with an explanation and an example,
#   * one input example and one output example, both as fenced JSON.
# NOTE: evaluate_accuracy() reads the "universal_strategy_ids" key from the
# model's reply, so that key name in the output example must stay in sync
# with the parsing code.
SYSTEM_PROMPT = """
# Instruction
You are a GUI Agent responsible for completing tasks on a web interface. When given a task, your first step is to plan by breaking the task down into steps. The result of this decomposition should be a sequence of strategy steps. Below is a set of **Universal Strategy Templates** for your reference. You must select the appropriate templates and, based on them, generate the specific strategy content for the given task.

# **Universal Strategies with Explanations and Examples**

| Universal Strategy ID | Universal Strategy           | Explanation (for GUI Agent)                                                                                                                                 | Example                                                        |
| :-------------------- | :--------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------------- |
| US1                   | Navigate To Page/Section     | Move to a specific page, view, or section within the application interface. This typically involves clicking links, menu items, or buttons that lead to a new screen or area. | Navigate to the Orders Report page.                            |
| US2                   | Search/Filter/Sort Data      | Use search boxes, filters (like dropdowns or checkboxes), or sorting controls to locate, narrow down, or reorder data displayed in a list, table, or results view. | Filter issues by the label "Bug."                              |
| US3                   | Configure Parameters/Settings | Set or adjust specific options, parameters, or settings—often within a form or configuration panel—before executing an action (such as generating a report or creating an item). | Configure the date range for the report from 2023‑01‑01 to 2023‑01‑31. |
| US4                   | Execute Action/Process       | Initiate a specific process or action by interacting with a primary action element, such as clicking a "Submit," "Generate," "Calculate," "Add," "Follow," or "Save" button. | Generate and submit the sales report.                          |
| US5                   | View/Inspect Item Details    | Open or focus on a specific item (e.g., an order, product, user profile, issue) to examine its detailed information—often by clicking on the item in a list or a dedicated "View"/"Details" button. | View the details of Order #12345.                              |
| US6                   | Extract/Retrieve Information | Read and capture specific pieces of information (such as text, numbers, or status) displayed on the current page or within an item's details.             | Retrieve the purchase date from the order details section.     |
| US7                   | Analyze/Evaluate/Verify Data | Perform cognitive tasks on the displayed data, such as comparing values, checking conditions, interpreting content, identifying patterns, or making judgments based on the information. | Analyze review content to identify mentions of "poor quality." |
| US8                   | Navigate Within Data/Results | Move through multiple pages or sections of data or search results—typically using pagination controls (e.g., "Next," "Previous," page numbers) or scrolling. | Navigate through search result pages using the "Next" button. |
| US9                   | Create Item                  | Complete the process of adding a new entity (e.g., post, project, issue, user, rule) to the system—typically by filling out a form and submitting it.      | Create a new project named "My Research Project."              |
| US10                  | Update/Modify Item           | Change or edit attributes or content of an existing item (e.g., update profile information, modify an order, edit a post, change product status).         | Update the user profile homepage URL to "https://new-website.com." |
| US11                  | Delete Item                  | Remove an existing item (e.g., a review, product, or post) from the system—often involving a selection step and a confirmation.                            | Delete the selected pending review.                            |

# Input example
```json
[{
    "task_id": 4,
    "sites": ["shopping_admin"],
    "intent": "What are the top-3 best-selling product in Jan 2023"
}]
```

# Output Requirements
Your output must be enclosed in ```json``` and include your final strategy design, alongside the referenced universal strategies and their IDs.

# Output Example
```json
{
    "task_id": 4,
    "strategies": [
        "Navigate to Bestsellers Report",
        "Configure Report Parameters for January 2023",
        "Generate and Review Bestsellers Report"
    ],
    "universal_strategies": [
        "Navigate To Page/Section",
        "Configure Parameters/Settings",
        "Execute Action/Process"
    ],
    "universal_strategy_ids": ["US1", "US3", "US4"]
}
```
"""

# Shared chat-completions client used by evaluate_accuracy().
# NOTE: "sk-xxx" is a placeholder. If no API key is configured through an
# environment variable, replace it with your Bailian API key
# (api_key="sk-xxx") before running the script.
client = OpenAI(
    base_url="https://aiproxy.lmzgc.cn:8080/v1",
    api_key="sk-xxx",
)

def _extract_json_payload(reply):
    """Return the JSON text embedded in a model reply.

    A ```json fenced block is preferred; failing that, the first fenced block
    of any kind is used; failing that, the whole reply is returned so that
    un-fenced JSON still gets a chance to parse.
    """
    for pattern in (r'```json\s*(.*?)\s*```', r'```\w*\s*(.*?)\s*```'):
        match = re.search(pattern, reply, re.DOTALL)
        if match:
            return match.group(1).strip()
    return reply


def _coerce_strategy_ids(parsed):
    """Pull the ``universal_strategy_ids`` list out of decoded model JSON.

    Returns an empty list whenever the decoded value does not have the
    expected shape, so callers never have to deal with ``None`` or scalars.
    """
    # The prompt's input example is a one-element list, and models sometimes
    # mirror that shape in their answer; unwrap it instead of failing.
    if isinstance(parsed, list) and len(parsed) == 1:
        parsed = parsed[0]
    if not isinstance(parsed, dict):
        return []

    ids = parsed.get("universal_strategy_ids", [])
    if isinstance(ids, str):
        # A bare string would otherwise be split into characters by set().
        return [ids]
    if not isinstance(ids, list):
        return []
    return ids


def evaluate_accuracy(input_file: str, output_file: str, *,
                      model_id: str = "aiproxy/deepseek-reasoner",
                      llm_client=None, system_prompt=None):
    """Query the model for every case and score its universal strategy IDs.

    Each case in ``input_file`` is sent to the chat-completions endpoint
    together with the system prompt. The ``universal_strategy_ids`` found in
    the reply are compared, as an unordered set, with the case's expected
    ``universal_strategy_ids``. A summary plus per-case details are written to
    ``output_file`` every 10 cases and once more at the end.

    Args:
        input_file: Path to a JSON file holding a list of cases. Every case
            must provide ``task_id``, ``sites`` and ``intent``;
            ``universal_strategy_ids`` is optional and defaults to ``[]``.
        output_file: Path of the JSON report to (over)write.
        model_id: Model name passed to the chat-completions call.
        llm_client: Object exposing ``chat.completions.create``. Defaults to
            the module-level ``client``.
        system_prompt: System message text. Defaults to the module-level
            ``SYSTEM_PROMPT``.

    Raises:
        KeyError: If a case lacks ``task_id``, ``sites`` or ``intent``.
        Exception: Anything raised by the client call is re-raised after the
            results collected so far have been written to ``output_file``.
    """
    if llm_client is None:
        llm_client = client
    if system_prompt is None:
        system_prompt = SYSTEM_PROMPT

    with open(input_file, 'r', encoding='utf-8') as f:
        cases = json.load(f)

    results = []
    correct_count = 0
    total_cases = len(cases)

    def save_results(processed_count):
        """Write the summary and the per-case results gathered so far."""
        current_accuracy = correct_count / processed_count if processed_count else 0.0
        summary = {
            "total_cases": total_cases,
            "processed_cases": processed_count,
            "correct_cases": correct_count,
            "accuracy": current_accuracy
        }
        with open(output_file, 'w', encoding='utf-8') as f_out:
            json.dump({"summary": summary, "results": results}, f_out, indent=2, ensure_ascii=False)
        print(f"Intermediate results saved ({processed_count}/{total_cases}): Accuracy = {current_accuracy:.2%}")

    try:
        for processed_count, case in enumerate(cases, start=1):
            task_id = case["task_id"]
            user_message = json.dumps([{
                "task_id": task_id,
                "sites": case["sites"],
                "intent": case["intent"]
            }], ensure_ascii=False)

            print(f"Processing case {processed_count}/{total_cases}, task_id: {task_id}")

            completion = llm_client.chat.completions.create(
                model=model_id,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_message}
                ]
            )

            message = completion.choices[0].message
            # Only some models attach reasoning_content to the message;
            # a missing or empty value is stored as None.
            reasoning_content = getattr(message, "reasoning_content", None) or None
            # content can be None (e.g. an empty completion); treat it as "".
            output_content = (message.content or "").strip()

            json_str = _extract_json_payload(output_content)
            try:
                actual_ids = _coerce_strategy_ids(json.loads(json_str))
            except json.JSONDecodeError as e:
                print(f"Warning: Failed to parse JSON for task_id {task_id}. Error: {e}")
                print(f"Content attempted to parse: '{json_str}'")
                actual_ids = []

            expected_ids = case.get("universal_strategy_ids", [])
            try:
                # Order and duplicates are deliberately ignored.
                is_correct = set(actual_ids) == set(expected_ids)
            except TypeError:
                # Unhashable entries (nested objects) or a null expected list
                # can never count as a match.
                is_correct = False

            if is_correct:
                correct_count += 1

            results.append({
                "task_id": task_id,
                "task_intent": case["intent"],
                "expected_ids": expected_ids,
                "actual_ids": actual_ids,
                "correct": is_correct,
                "model": model_id,
                "model_output": output_content,
                "model_reasoning": reasoning_content
            })

            # Checkpoint every 10 cases; the final save happens after the loop.
            if processed_count % 10 == 0 and processed_count != total_cases:
                save_results(processed_count)
    except BaseException:
        # An API/network failure or Ctrl+C would otherwise throw away every
        # case processed since the last checkpoint.
        save_results(len(results))
        raise

    # Always write the final report, including when there were no cases, so
    # the "saved" message below is true.
    save_results(total_cases)

    final_accuracy = correct_count / total_cases if total_cases else 0.0
    print(f"\nEvaluation completed: {correct_count}/{total_cases} correct, final accuracy = {final_accuracy:.2%}")
    print(f"Final results saved to {output_file}")

if __name__ == "__main__":
    # Script entry point: reads the labelled cases from tasks_output.json and
    # writes the report to evaluation_results.json (both relative to the
    # current working directory).
    evaluate_accuracy(
        input_file="tasks_output.json",
        output_file="evaluation_results.json",
    )

# Save this as evaluate_accuracy.py