crawlee/misc/trajectory_analysis_v18.py
2025-04-23 12:14:50 +08:00

247 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import json
from dotenv import load_dotenv
import base64
from openai import OpenAI
from pathlib import Path
import concurrent.futures
from typing import Dict, Any
from datetime import datetime
# Load environment variables (via python-dotenv's load_dotenv) before any
# os.getenv lookups are made further down in this file.
load_dotenv()
# Model name passed as `model=` to the chat-completions call in analyze_images.
MODEL_NAME = "gpt-4o-mini"
# Alternative model names; uncomment one (and comment the line above) to switch.
# MODEL_NAME = "gpt-4o"
# MODEL_NAME = "UI-TARS-72B-DPO"
def encode_image(image_path) -> str:
    """Return the contents of the file at *image_path* as a base64 text string.

    :param image_path: anything ``open()`` accepts as a path to the image file
    :return: base64 encoding of the raw file bytes, decoded to ``str``
    :raises OSError: if the file cannot be opened or read
    """
    # Binary mode: the bytes are encoded as-is, no text decoding involved.
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')
def analyze_images(image_paths, prompt, *, model=None, max_tokens=4000, temperature=0.5):
    """Send a text prompt together with several images to the chat-completions API.

    :param image_paths: paths of the images to attach, in order; each one is
        embedded as a ``data:image/png;base64,...`` URL
    :param prompt: text instruction, placed before the images in the message
    :param model: model name to use; ``None`` (default) uses the module-level
        ``MODEL_NAME``
    :param max_tokens: ``max_tokens`` value forwarded to the API call
    :param temperature: ``temperature`` value forwarded to the API call
    :return: the message content of the first choice; if the API call raises,
        a string containing the error text is returned instead of raising
    :raises OSError: if one of the image files cannot be read (encoding happens
        before the guarded API call)
    """
    # Initialise the API client from environment variables.
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        # A base URL can be set to target another OpenAI-compatible service.
        base_url=os.getenv("OPENAI_API_BASE_URL"),
    )

    # A single user message: the text part first, then one part per image.
    content = [{"type": "text", "text": prompt}]
    content.extend(
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{encode_image(image_path)}"
            },
        }
        for image_path in image_paths
    )
    messages = [{"role": "user", "content": content}]

    # Call the API. Failures are deliberately reported as a string so that a
    # single failed request does not abort the caller's batch.
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME if model is None else model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"发生错误: {str(e)}"
def process_path_meta(meta, screenshot_dir="screenshots"):
    """Build the screenshot path list and collect the fields used for analysis.

    :param meta: path metadata dict; reads the keys ``chainIDs``,
        ``chainChildNum``, ``chainUrls``, ``chainTexts`` and
        ``chainViewportBoundingBoxes``
    :param screenshot_dir: directory prefix of the screenshot files
        (defaults to ``"screenshots"``)
    :return: dict with ``image_paths`` (one ``<dir>/<id>_<childNum>.png`` per
        chain entry, the last entry using ``full`` in place of the child
        number), plus ``urls``, ``text`` and ``boundingbox`` copied from *meta*
    :raises KeyError: if one of the keys listed above is missing
    """
    chain_ids = meta["chainIDs"]
    last_idx = len(chain_ids) - 1

    image_paths = []
    # NOTE(review): zip() stops at the shorter list, so if chainChildNum has
    # fewer entries than chainIDs the trailing "_full" screenshot is never
    # produced -- confirm both lists always have the same length.
    for idx, (chain_id, child_num) in enumerate(zip(chain_ids, meta["chainChildNum"])):
        # File name is "<ID>_<childNum>.png"; the final page of the chain is
        # stored as "<ID>_full.png" instead.
        suffix = "full" if idx == last_idx else child_num
        image_paths.append(f"{screenshot_dir}/{chain_id}_{suffix}.png")

    return {
        "image_paths": image_paths,
        "urls": meta["chainUrls"],
        "text": meta["chainTexts"],
        "boundingbox": meta["chainViewportBoundingBoxes"],
    }
# Prompt sent with the screenshots; {WEBSITE_URL} and {ACTION_LIST} are filled
# in by process_single_path via str.format.
_TASK_PROMPT_TEMPLATE = r"""
Given a list of actions performed on the website {WEBSITE_URL} and the corresponding
screenshots
List of actions: {ACTION_LIST}
Your task is to come up with a single task description that will be accomplished by performing
these actions in the given sequence on the website.
IMPORTANT:
1. The task must contain some actions: “Buy, Book, Find, Check, Choose, show me, search,
browse, get, compare, view, give me, add to cart, ...”, ideally involving transactions/finding
information on a specific product or service.
2. You should propose tasks that are clear and specific.
3. The task description should provide all the necessary information to complete the task.
4. The task description must indicate the domain of the website at the end of the task with
the format: “... on task website”, for instance, “Purchase a laptop on Amazon”, “Book a hair
appointment on Yelp”, etc.
5. The task should be feasible to complete by a real user and should not require any additional
information that is not specified in this input.
6. The task description should specify constraints like given budget, product features, and other
specifications that can narrow down the search to a particular item/product.
7. Do NOT use any quotation marks (either single or double) in the task description.
The output should be in the below format:
OUTPUT FORMAT: Please first give some analysis of the actions and screenshots and then
output the overall task description. put your answer within ``` ```, for example, “In summary,
the answer is: ```<TASK_DESCRIPTION>:str```”.
"""


def _extract_task_title(result):
    """Return the text inside the first ``` fence of *result*, whitespace-stripped.

    Raises IndexError when *result* contains no fence and AttributeError when
    it is not a string (for example ``None``).
    """
    return result.split("```")[1].strip()


def process_single_path(url: str, meta: Dict[str, Any], path_index: int) -> "Dict[str, Any] | None":
    """Analyse one path and record the model's answer on its metadata dict.

    :param url: website URL the path belongs to; inserted into the prompt
    :param meta: path metadata dict (see ``process_path_meta``); updated in
        place with ``raw_result`` (the full model reply) and ``title`` (the
        text found inside the first ``` fence of the reply, or an error
        message if no such fence could be extracted)
    :param path_index: index of the path, used only in progress output
    :return: the same *meta* object after the update, or ``None`` if *meta*
        is ``None``
    """
    # Skip empty entries.
    if meta is None:
        return None

    processed_data = process_path_meta(meta)

    # Fill the prompt with the clicked-element texts and the site URL.
    prompt = _TASK_PROMPT_TEMPLATE.format(
        ACTION_LIST=processed_data["text"],
        WEBSITE_URL=url,
    )
    print(f"Processing path {path_index} for URL: {url}")

    # Ask the model to analyse the screenshots.
    result = analyze_images(processed_data["image_paths"], prompt)
    print(f" path {path_index} for url {url} result: {result}")

    meta["raw_result"] = result
    try:
        # The prompt asks for the final answer to be wrapped in ``` fences.
        meta["title"] = _extract_task_title(result)
    except (AttributeError, IndexError) as e:
        # No fenced answer (or no text at all): keep the reason in the title
        # so the entry is still written out; raw_result holds the full reply.
        print(f"Error processing result for URL {url}: {str(e)}")
        meta["title"] = f"错误信息:{str(e)}"
    return meta
def _write_json(data, output_path):
    """Write *data* to *output_path* as UTF-8, human-readable JSON."""
    with open(output_path, 'w', encoding='utf-8') as f_out:
        json.dump(data, f_out, ensure_ascii=False, indent=2)


def update_json_with_analysis(json_path: str, max_workers: int = 4):
    """Analyse every path in a JSON file in parallel and save an annotated copy.

    The input maps each URL to a dict whose ``shortestPathsMeta`` list holds
    the path metadata entries. Every non-``None`` entry is passed to
    ``process_single_path`` on a thread pool. Results are written to a new
    file next to the input, named
    ``<stem>_with_analysis_<YYYYmmddHHMMSS><extension>``; the input file itself
    is never written to.

    :param json_path: path of the input JSON file
    :param max_workers: size of the thread pool
    :raises OSError: if the input cannot be read or the final save fails
    :raises json.JSONDecodeError: if the input is not valid JSON
    """
    json_path = os.fspath(json_path)

    # Read the input file.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Collect one task per non-empty path entry. A missing or null
    # "shortestPathsMeta" is treated as "no paths for this URL".
    tasks = [
        (url, meta, i)
        for url, url_data in data.items()
        for i, meta in enumerate(url_data.get("shortestPathsMeta") or [])
        if meta is not None
    ]

    # Derive the output path from the file name only. splitext touches just
    # the final extension, so directory names containing ".json" are left
    # alone and an input without a ".json" suffix can never be overwritten.
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    root, ext = os.path.splitext(json_path)
    output_path = f"{root}_with_analysis_{timestamp}{ext}"

    processed_count = 0
    # Run the analyses on a thread pool.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit everything up front, remembering where each result belongs.
        future_to_task = {
            executor.submit(process_single_path, url, meta, i): (url, i)
            for url, meta, i in tasks
        }
        # Store results as they complete.
        for future in concurrent.futures.as_completed(future_to_task):
            url, path_index = future_to_task[future]
            try:
                result = future.result()
                if result is not None:
                    data[url]["shortestPathsMeta"][path_index] = result
                    processed_count += 1
                    # Checkpoint after every 10 processed entries.
                    # NOTE(review): workers still running add keys to meta
                    # dicts that live inside `data`, so a checkpoint dump may
                    # overlap with such a write; a failure here is caught
                    # below and the final save runs after all workers finish.
                    if processed_count % 10 == 0:
                        _write_json(data, output_path)
                        print(f"已处理{processed_count}个条目,保存到{output_path}")
            except Exception as e:
                # Best effort: report the failure and keep going with the rest.
                print(f"Error processing path {path_index} for URL {url}: {str(e)}")

    # Final save, after the pool has shut down.
    _write_json(data, output_path)
    print(f"全部处理完成,最终保存到{output_path}")
def main(json_path: str = "path/processed_3.json", max_workers: int = 2) -> None:
    """Run the analysis over one JSON file.

    :param json_path: input JSON file to annotate
    :param max_workers: number of concurrent analysis workers; adjust as needed
    """
    update_json_with_analysis(json_path, max_workers=max_workers)


if __name__ == "__main__":
    main()