173 lines
9.8 KiB
Python
173 lines
9.8 KiB
Python
# Updated evaluation script with full Universal Strategies section in SYSTEM_PROMPT
|
||
|
||
import json
import os
import re

from openai import OpenAI
|
||
|
||
|
||
# Set your OpenAI API key
|
||
# openai.api_key = "YOUR_API_KEY_HERE"
|
||
|
||
# System prompt sent with every request. It contains the full table of
# Universal Strategies (ID, name, explanation, example) plus one input and one
# output example. The text is Markdown; it must not contain stray separator
# lines, otherwise the table and the fenced JSON examples are corrupted.
SYSTEM_PROMPT = """
# Instruction
You are a GUI Agent responsible for completing tasks on a web interface. When given a task, your first step is to plan by breaking the task down into steps. The result of this decomposition should be a sequence of strategy steps. Below is a set of **Universal Strategy Templates** for your reference. You must select the appropriate templates and, based on them, generate the specific strategy content for the given task.

# **Universal Strategies with Explanations and Examples**

| Universal Strategy ID | Universal Strategy | Explanation (for GUI Agent) | Example |
| :-------------------- | :--------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------------- |
| US1 | Navigate To Page/Section | Move to a specific page, view, or section within the application interface. This typically involves clicking links, menu items, or buttons that lead to a new screen or area. | Navigate to the Orders Report page. |
| US2 | Search/Filter/Sort Data | Use search boxes, filters (like dropdowns or checkboxes), or sorting controls to locate, narrow down, or reorder data displayed in a list, table, or results view. | Filter issues by the label "Bug." |
| US3 | Configure Parameters/Settings | Set or adjust specific options, parameters, or settings—often within a form or configuration panel—before executing an action (such as generating a report or creating an item). | Configure the date range for the report from 2023‑01‑01 to 2023‑01‑31. |
| US4 | Execute Action/Process | Initiate a specific process or action by interacting with a primary action element, such as clicking a "Submit," "Generate," "Calculate," "Add," "Follow," or "Save" button. | Generate and submit the sales report. |
| US5 | View/Inspect Item Details | Open or focus on a specific item (e.g., an order, product, user profile, issue) to examine its detailed information—often by clicking on the item in a list or a dedicated "View"/"Details" button. | View the details of Order #12345. |
| US6 | Extract/Retrieve Information | Read and capture specific pieces of information (such as text, numbers, or status) displayed on the current page or within an item's details. | Retrieve the purchase date from the order details section. |
| US7 | Analyze/Evaluate/Verify Data | Perform cognitive tasks on the displayed data, such as comparing values, checking conditions, interpreting content, identifying patterns, or making judgments based on the information. | Analyze review content to identify mentions of "poor quality." |
| US8 | Navigate Within Data/Results | Move through multiple pages or sections of data or search results—typically using pagination controls (e.g., "Next," "Previous," page numbers) or scrolling. | Navigate through search result pages using the "Next" button. |
| US9 | Create Item | Complete the process of adding a new entity (e.g., post, project, issue, user, rule) to the system—typically by filling out a form and submitting it. | Create a new project named "My Research Project." |
| US10 | Update/Modify Item | Change or edit attributes or content of an existing item (e.g., update profile information, modify an order, edit a post, change product status). | Update the user profile homepage URL to "https://new-website.com." |
| US11 | Delete Item | Remove an existing item (e.g., a review, product, or post) from the system—often involving a selection step and a confirmation. | Delete the selected pending review. |

# Input example
```json
[{
    "task_id": 4,
    "sites": ["shopping_admin"],
    "intent": "What are the top-3 best-selling product in Jan 2023"
}]
```

# Output Requirements
Your output must be enclosed in ```json``` and include your final strategy design, alongside the referenced universal strategies and their IDs.

# Output Example
```json
{
    "task_id": 4,
    "strategies": [
        "Navigate to Bestsellers Report",
        "Configure Report Parameters for January 2023",
        "Generate and Review Bestsellers Report"
    ],
    "universal_strategies": [
        "Navigate To Page/Section",
        "Configure Parameters/Settings",
        "Execute Action/Process"
    ],
    "universal_strategy_ids": ["US1", "US3", "US4"]
}
```
"""
|
||
|
||
# Shared client for the OpenAI-compatible proxy endpoint.
# The API key is taken from the OPENAI_API_KEY environment variable so that no
# credential has to be committed to the source file. If the environment
# variable is not configured, the placeholder below is used; replace it with
# your Bailian API key (api_key="sk-xxx") for local experiments only.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY", "sk-xxx"),
    base_url='https://aiproxy.lmzgc.cn:8080/v1'
)
|
||
|
||
# Matches the first ```json ... ``` fenced block in a model reply.
_JSON_FENCE_RE = re.compile(r'```json\s*(.*?)\s*```', re.DOTALL)


def _extract_strategy_ids(output_content, task_id):
    """Return the ``universal_strategy_ids`` list from a model reply, or ``[]``.

    The reply is expected to hold a JSON object, optionally wrapped in a
    ```json fenced block. Anything that cannot be parsed into an object with
    a list-valued ``universal_strategy_ids`` key yields an empty list and a
    warning on stdout instead of an exception.
    """
    match = _JSON_FENCE_RE.search(output_content)
    # Without a fence, assume the entire reply might be JSON.
    json_str = match.group(1).strip() if match else output_content

    try:
        output_json = json.loads(json_str)
    except json.JSONDecodeError as e:
        print(f"Warning: Failed to parse JSON for task_id {task_id}. Error: {e}")
        print(f"Content attempted to parse: '{json_str}'")
        return []

    # Valid JSON is not necessarily an object (e.g. a list mirroring the
    # prompt's input example); calling .get() on it would raise.
    if not isinstance(output_json, dict):
        print(f"Warning: Expected a JSON object for task_id {task_id}, "
              f"got {type(output_json).__name__}.")
        return []

    ids = output_json.get("universal_strategy_ids", [])
    if not isinstance(ids, list):
        print(f"Warning: 'universal_strategy_ids' is not a list for task_id {task_id}.")
        return []
    return ids


def _ids_match(actual_ids, expected_ids):
    """Return True when both id collections contain the same set of ids."""
    expected = set(expected_ids)
    try:
        return set(actual_ids) == expected
    except TypeError:
        # The model produced unhashable entries (nested lists/objects).
        return False


def evaluate_accuracy(input_file: str, output_file: str, *, llm_client=None,
                      system_prompt=None,
                      model_id: str = "aiproxy/deepseek-reasoner",
                      save_every: int = 10):
    """Evaluate how often the model selects the expected universal strategies.

    Reads a JSON list of cases from ``input_file`` (each case needs
    ``task_id``, ``sites`` and ``intent``; ``universal_strategy_ids`` holds
    the expected answer), sends every case to the chat model and compares the
    returned ``universal_strategy_ids`` with the expected ones as sets.
    A summary plus per-case details are written to ``output_file`` as JSON.

    Args:
        input_file: Path of the JSON file holding the evaluation cases.
        output_file: Path of the JSON report; rewritten on every save.
        llm_client: Object exposing ``chat.completions.create(...)``.
            Defaults to the module-level ``client``.
        system_prompt: System message text. Defaults to the module-level
            ``SYSTEM_PROMPT``.
        model_id: Model name passed to the chat completion call.
        save_every: Write intermediate results after this many cases.

    Raises:
        ValueError: If ``save_every`` is smaller than 1.
        KeyError: If a case lacks ``task_id``, ``sites`` or ``intent``.
        Exception: Any error raised by the chat completion call is re-raised
            after the results collected so far have been saved.
    """
    if save_every < 1:
        raise ValueError("save_every must be at least 1")
    if llm_client is None:
        llm_client = client
    if system_prompt is None:
        system_prompt = SYSTEM_PROMPT

    with open(input_file, 'r', encoding='utf-8') as f:
        cases = json.load(f)

    results = []
    correct_count = 0
    total_cases = len(cases)

    def save_results(processed_count):
        """Write the summary and all results collected so far."""
        current_accuracy = correct_count / processed_count if processed_count else 0.0
        summary = {
            "total_cases": total_cases,
            "processed_cases": processed_count,
            "correct_cases": correct_count,
            "accuracy": current_accuracy
        }
        with open(output_file, 'w', encoding='utf-8') as f_out:
            json.dump({"summary": summary, "results": results}, f_out, indent=2, ensure_ascii=False)
        print(f"Intermediate results saved ({processed_count}/{total_cases}): Accuracy = {current_accuracy:.2%}")

    for processed_count, case in enumerate(cases, start=1):
        # The user message has the same shape as the prompt's input example.
        user_message = json.dumps([{
            "task_id": case["task_id"],
            "sites": case["sites"],
            "intent": case["intent"]
        }], ensure_ascii=False)

        print(f"Processing case {processed_count}/{total_cases}, task_id: {case.get('task_id', 'N/A')}")

        try:
            completion = llm_client.chat.completions.create(
                model=model_id,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_message}
                ]
            )
        except Exception:
            # Persist finished cases so a mid-run API failure does not
            # discard the results gathered since the last periodic save.
            if results:
                save_results(len(results))
            raise

        message = completion.choices[0].message
        # reasoning_content is only present on reasoning models; content may
        # be None when the model returns no text.
        reasoning_content = getattr(message, "reasoning_content", None) or None
        output_content = (message.content or "").strip()

        actual_ids = _extract_strategy_ids(output_content, case.get('task_id', 'N/A'))
        expected_ids = case.get("universal_strategy_ids", [])
        is_correct = _ids_match(actual_ids, expected_ids)

        if is_correct:
            correct_count += 1

        results.append({
            "task_id": case["task_id"],
            "task_intent": case["intent"],
            "expected_ids": expected_ids,
            "actual_ids": actual_ids,
            "correct": is_correct,
            "model": model_id,
            "model_output": output_content,
            "model_reasoning": reasoning_content
        })

        # Save periodically and always after the last case.
        if processed_count % save_every == 0 or processed_count == total_cases:
            save_results(processed_count)

    if not cases:
        # The loop never ran, so nothing was written yet; still produce the
        # report file that the final message below refers to.
        save_results(0)

    final_accuracy = correct_count / total_cases if total_cases else 0.0
    print(f"\nEvaluation completed: {correct_count}/{total_cases} correct, final accuracy = {final_accuracy:.2%}")
    print(f"Final results saved to {output_file}")
|
||
|
||
if __name__ == "__main__":
    # Script entry point: adjust the two paths below if the case file or the
    # report should live somewhere else.
    evaluate_accuracy(
        input_file="tasks_output.json",
        output_file="evaluation_results.json",
    )
|
||
|
||
# Save this as evaluate_accuracy.py
|