trace_synthesis/analysis/evaluation.py
yuyr a84d51a101 1. 增加r1生成综合策略代码和输出;
2. 增加tasks;
3. 增加analysis部分,对策略进行归纳分类,然后进行评测。
2025-04-17 17:40:15 +08:00

173 lines
9.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Updated evaluation script with full Universal Strategies section in SYSTEM_PROMPT
import json
import os
import re

from openai import OpenAI
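# The API key is passed to the OpenAI client object constructed below (after SYSTEM_PROMPT);
# the module-level `openai.api_key = "..."` style of configuration is not used in this script.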
# System prompt sent as the "system" message on every request made by evaluate_accuracy.
# It contains the full Universal Strategies table (IDs US1-US11 with explanations and
# examples), one example input, and the required output format: a ```json fenced object
# whose "universal_strategy_ids" list is what evaluate_accuracy extracts and scores.
# NOTE: this text is runtime data sent to the model; editing it changes model behavior.
SYSTEM_PROMPT = """
# Instruction
You are a GUI Agent responsible for completing tasks on a web interface. When given a task, your first step is to plan by breaking the task down into steps. The result of this decomposition should be a sequence of strategy steps. Below is a set of **Universal Strategy Templates** for your reference. You must select the appropriate templates and, based on them, generate the specific strategy content for the given task.
# **Universal Strategies with Explanations and Examples**
| Universal Strategy ID | Universal Strategy | Explanation (for GUI Agent) | Example |
| :-------------------- | :--------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------------- |
| US1 | Navigate To Page/Section | Move to a specific page, view, or section within the application interface. This typically involves clicking links, menu items, or buttons that lead to a new screen or area. | Navigate to the Orders Report page. |
| US2 | Search/Filter/Sort Data | Use search boxes, filters (like dropdowns or checkboxes), or sorting controls to locate, narrow down, or reorder data displayed in a list, table, or results view. | Filter issues by the label "Bug." |
| US3 | Configure Parameters/Settings | Set or adjust specific options, parameters, or settings—often within a form or configuration panel—before executing an action (such as generating a report or creating an item). | Configure the date range for the report from 20230101 to 20230131. |
| US4 | Execute Action/Process | Initiate a specific process or action by interacting with a primary action element, such as clicking a "Submit," "Generate," "Calculate," "Add," "Follow," or "Save" button. | Generate and submit the sales report. |
| US5 | View/Inspect Item Details | Open or focus on a specific item (e.g., an order, product, user profile, issue) to examine its detailed information—often by clicking on the item in a list or a dedicated "View"/"Details" button. | View the details of Order #12345. |
| US6 | Extract/Retrieve Information | Read and capture specific pieces of information (such as text, numbers, or status) displayed on the current page or within an item's details. | Retrieve the purchase date from the order details section. |
| US7 | Analyze/Evaluate/Verify Data | Perform cognitive tasks on the displayed data, such as comparing values, checking conditions, interpreting content, identifying patterns, or making judgments based on the information. | Analyze review content to identify mentions of "poor quality." |
| US8 | Navigate Within Data/Results | Move through multiple pages or sections of data or search results—typically using pagination controls (e.g., "Next," "Previous," page numbers) or scrolling. | Navigate through search result pages using the "Next" button. |
| US9 | Create Item | Complete the process of adding a new entity (e.g., post, project, issue, user, rule) to the system—typically by filling out a form and submitting it. | Create a new project named "My Research Project." |
| US10 | Update/Modify Item | Change or edit attributes or content of an existing item (e.g., update profile information, modify an order, edit a post, change product status). | Update the user profile homepage URL to "https://new-website.com." |
| US11 | Delete Item | Remove an existing item (e.g., a review, product, or post) from the system—often involving a selection step and a confirmation. | Delete the selected pending review. |
# Input example
```json
[{
"task_id": 4,
"sites": ["shopping_admin"],
"intent": "What are the top-3 best-selling product in Jan 2023"
}]
```
# Output Requirements
Your output must be enclosed in ```json``` and include your final strategy design, alongside the referenced universal strategies and their IDs.
# Output Example
```json
{
"task_id": 4,
"strategies": [
"Navigate to Bestsellers Report",
"Configure Report Parameters for January 2023",
"Generate and Review Bestsellers Report"
],
"universal_strategies": [
"Navigate To Page/Section",
"Configure Parameters/Settings",
"Execute Action/Process"
],
"universal_strategy_ids": ["US1", "US3", "US4"]
}
```
"""
# Shared API client used by evaluate_accuracy for every chat-completion request.
# The key and endpoint are read from the environment so a real credential never has to
# be committed to the repository; the literals are the original values, kept as
# fallbacks. Dedicated variable names are used (rather than OPENAI_API_KEY) so that an
# unrelated OpenAI key present in the environment is never sent to this proxy by accident.
client = OpenAI(
    api_key=os.getenv("AIPROXY_API_KEY", "sk-xxx"),
    base_url=os.getenv("AIPROXY_BASE_URL", "https://aiproxy.lmzgc.cn:8080/v1"),
)
# Finds the first ```json ... ``` fenced block in a model reply (DOTALL: the block spans lines).
_JSON_FENCE_RE = re.compile(r'```json\s*(.*?)\s*```', re.DOTALL)


def _extract_strategy_ids(output_content, task_id="N/A"):
    """Return the ``universal_strategy_ids`` value found in a model reply, or [] if unusable."""
    match = _JSON_FENCE_RE.search(output_content)
    # Without a fence, assume the whole reply might itself be the JSON document.
    json_str = match.group(1).strip() if match else output_content
    try:
        output_json = json.loads(json_str)
    except json.JSONDecodeError as e:
        print(f"Warning: Failed to parse JSON for task_id {task_id}. Error: {e}")
        print(f"Content attempted to parse: '{json_str}'")
        return []
    if not isinstance(output_json, dict):
        # Valid JSON that is not an object (e.g. a bare list) has no key to read.
        print(
            f"Warning: Expected a JSON object for task_id {task_id}, "
            f"got {type(output_json).__name__}."
        )
        return []
    return output_json.get("universal_strategy_ids", [])


def _ids_match(actual_ids, expected_ids):
    """Return True when both ID collections hold the same set of IDs (order ignored)."""
    if not isinstance(actual_ids, list):
        return False
    try:
        return set(actual_ids) == set(expected_ids)
    except TypeError:
        # Unhashable elements or a non-iterable expected value can never be a match.
        return False


def _write_report(output_file, results, correct_count, processed_count, total_cases):
    """Write the summary plus per-case results to ``output_file`` and print the accuracy."""
    current_accuracy = correct_count / processed_count if processed_count else 0.0
    summary = {
        "total_cases": total_cases,
        "processed_cases": processed_count,
        "correct_cases": correct_count,
        "accuracy": current_accuracy,
    }
    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump({"summary": summary, "results": results}, f_out, indent=2, ensure_ascii=False)
    print(f"Intermediate results saved ({processed_count}/{total_cases}): Accuracy = {current_accuracy:.2%}")


def evaluate_accuracy(
    input_file: str,
    output_file: str,
    *,
    model_id: str = "aiproxy/deepseek-reasoner",
    llm_client=None,
    system_prompt=None,
) -> None:
    """Ask the model to classify each task and score its predicted strategy IDs.

    For every case in ``input_file`` the task is sent to the chat model, the
    ``universal_strategy_ids`` list is extracted from the reply and compared, as a
    set, with the case's own ``universal_strategy_ids``. A JSON report containing a
    summary and one record per case is written to ``output_file`` every 10 cases,
    after the last case, and before re-raising if the run is interrupted.

    Args:
        input_file: Path to a JSON array of cases. Each case must provide
            ``task_id``, ``sites`` and ``intent``; ``universal_strategy_ids`` is
            the expected answer and defaults to an empty list.
        output_file: Path of the JSON report to (over)write.
        model_id: Model name passed to the chat-completions endpoint.
        llm_client: Object exposing ``chat.completions.create``; defaults to the
            module-level ``client``.
        system_prompt: System message text; defaults to the module-level
            ``SYSTEM_PROMPT``.

    Raises:
        KeyError: If a case lacks ``task_id``, ``sites`` or ``intent``.
        Exception: Anything raised by the client call or by file I/O propagates
            after the results gathered so far have been saved.
    """
    if llm_client is None:
        llm_client = client
    if system_prompt is None:
        system_prompt = SYSTEM_PROMPT

    with open(input_file, 'r', encoding='utf-8') as f:
        cases = json.load(f)

    total_cases = len(cases)
    results = []
    correct_count = 0
    processed_count = 0
    saved_count = -1  # number of processed cases in the last written report; -1 = none yet

    try:
        for case in cases:
            task_id = case["task_id"]
            # The user message mirrors the "Input example" format of the system prompt.
            user_message = json.dumps([{
                "task_id": task_id,
                "sites": case["sites"],
                "intent": case["intent"]
            }], ensure_ascii=False)
            print(f"Processing case {processed_count + 1}/{total_cases}, task_id: {task_id}")

            completion = llm_client.chat.completions.create(
                model=model_id,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_message}
                ]
            )
            message = completion.choices[0].message
            # reasoning_content is only present on some models' responses; empty -> None.
            reasoning_content = getattr(message, "reasoning_content", None) or None
            # content may be None (e.g. an empty reply); treat that as an empty answer.
            output_content = (message.content or "").strip()

            actual_ids = _extract_strategy_ids(output_content, task_id)
            expected_ids = case.get("universal_strategy_ids", [])
            is_correct = _ids_match(actual_ids, expected_ids)
            if is_correct:
                correct_count += 1

            results.append({
                "task_id": task_id,
                "task_intent": case["intent"],
                "expected_ids": expected_ids,
                "actual_ids": actual_ids,
                "correct": is_correct,
                "model": model_id,
                "model_output": output_content,
                "model_reasoning": reasoning_content
            })

            processed_count += 1
            # Checkpoint every 10 cases and always after the last one.
            if processed_count % 10 == 0 or processed_count == total_cases:
                _write_report(output_file, results, correct_count, processed_count, total_cases)
                saved_count = processed_count
    except BaseException:
        # Do not lose completed (and possibly paid-for) model calls when the run dies
        # or is interrupted between checkpoints.
        if processed_count > max(saved_count, 0):
            _write_report(output_file, results, correct_count, processed_count, total_cases)
        raise

    if saved_count != total_cases:
        # Only reachable for an empty case list: still produce the (empty) report.
        _write_report(output_file, results, correct_count, processed_count, total_cases)

    final_accuracy = correct_count / total_cases if total_cases else 0.0
    print(f"\nEvaluation completed: {correct_count}/{total_cases} correct, final accuracy = {final_accuracy:.2%}")
    print(f"Final results saved to {output_file}")
if __name__ == "__main__":
    # Script entry point: both paths are relative to the current working directory.
    cases_path = "tasks_output.json"  # input: JSON array of task cases
    report_path = "evaluation_results.json"  # output: summary + per-case results
    evaluate_accuracy(cases_path, report_path)
# Save this as evaluate_accuracy.py