import os
import json
import base64
import concurrent.futures
from datetime import datetime
from typing import Any, Dict, Optional

from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables
load_dotenv()

MODEL_NAME = "gpt-4o-mini"
# MODEL_NAME = "UI-TARS-72B-DPO"


def encode_image(image_path):
    """Encode an image file as a base64 string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


def analyze_images(image_paths, prompt):
    """
    Analyze multiple images and return the result.

    :param image_paths: list of PNG image paths
    :param prompt: text instruction
    :return: model response
    """
    # Initialize the API client
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        base_url=os.getenv("OPENAI_API_BASE_URL")  # set a base URL when using another compatible service
    )

    # Prepare the message content
    messages = [{"role": "user", "content": []}]

    # Add the text instruction
    messages[0]["content"].append({
        "type": "text",
        "text": prompt
    })

    # Add every image as a base64-encoded data URL
    for image_path in image_paths:
        base64_image = encode_image(image_path)
        messages[0]["content"].append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{base64_image}"
            }
        })

    # Call the API
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=4000,
            temperature=0.5
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error occurred: {str(e)}"


def process_path_meta(meta):
    """
    Process the metadata of a single path into an image path list and related fields.
    """
    image_paths = []
    for idx, (chain_id, child_num) in enumerate(zip(meta["chainIDs"], meta["chainChildNum"])):
        # Build the image file name: ID_childNum.png.
        # The last element uses "full" as its childNum.
        if idx == len(meta["chainIDs"]) - 1:
            image_name = f"{chain_id}_full.png"
        else:
            image_name = f"{chain_id}_{child_num}.png"
        image_path = f"screenshots/{image_name}"
        image_paths.append(image_path)

    return {
        "image_paths": image_paths,
        "urls": meta["chainUrls"],
        "text": meta["chainTexts"],
        "boundingbox": meta["chainViewportBoundingBoxes"],
    }
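# A minimal sketch of the meta dict process_path_meta() consumes, inferred from
# the keys it reads above; every field value here is hypothetical:
#
#   meta = {
#       "chainIDs": ["n1", "n2", "n3"],
#       "chainChildNum": [0, 2, 5],
#       "chainUrls": ["https://host/a", "https://host/b", "https://host/c"],
#       "chainTexts": ["Probes", "Tokyo(APAC)"],
#       "chainViewportBoundingBoxes": [[0, 0, 1280, 720], ...],
#   }
#
# This would yield image_paths ["screenshots/n1_0.png", "screenshots/n2_2.png",
# "screenshots/n3_full.png"]: every screenshot is named ID_childNum.png except
# the last, which uses the "_full" suffix.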
def process_single_path(url: str, meta: Optional[Dict[str, Any]], path_index: int) -> Optional[Dict[str, Any]]:
    """
    Run the analysis for a single path.
    """
    # Skip empty entries
    if meta is None:
        return None

    # Process the path data
    processed_data = process_path_meta(meta)

    # Build the prompt
    prompt_template = r"""You are a GUI agent.
Given the {urls_length} webpage screenshots, summarize what task the screenshots accomplish.
The button texts the user clicked from the first page through the second-to-last page are given by the text list {text}; the last screenshot is the final destination page.
page_description describes what the user sees on each page.
action_description describes the element the user clicked on each page; the element's text (wrapped in []) must correspond to the text list provided above, and the description must also cover the element's surroundings.
task_summaries distills the user tasks this trajectory may correspond to; each task must be unambiguous and verifiable, must match page_description and action_description, and task_summaries must contain no fewer than three tasks.
Example output
{{
    "page_description": [
        "This looks like a Grafana home page. The left navigation bar is expanded, and the view is not at the top of the page; there are signs of scrolling",
        "This looks like the Probes list page shown after clicking the Probes option in the left navigation bar. The top of the screenshot shows the current path Home > Testing & synthetics > Synthetics > Probes. The list shows multiple probes, each with a name, a version, and a View button. The view does not appear to be at the top of the page; there are signs of scrolling",
        "This is the final probe detail page, titled Viewing public probe Tokyo(APAC). The page also shows the probe's Status, Reachability, Location information, Version, Last offline, Last modified, Region, and other information."
    ],
    "action_description": [
        "Clicked the [Probes] option. Surroundings: the Probes option sits in the third level of the navigation menu; the first-level menu is Testing & synthetics, the second-level menu is Synthetics, and the third-level menu has three options: Checks, Probes, and Alerts. I clicked the Probes option.",
        "Clicked the [Tokyo(APAC)] text title. Surroundings: the Tokyo(APAC) probe entry is on the Probes list page, where each probe has a name, a version, and a View button"
    ],
    "task_summaries": [
        {{
            "question": "Look up the status of the Tokyo(APAC) probe under Grafana synthetic monitoring.",
            "answer": "Online"
        }},
        {{
            "question": "Look up the Reachability of the Tokyo(APAC) probe under Grafana synthetic monitoring.",
            "answer": "100.0%"
        }},
        {{
            "question": "Look up the Region of the Tokyo(APAC) probe under Grafana synthetic monitoring.",
            "answer": "APAC"
        }},
        {{
            "question": "Look up the Version of the Tokyo(APAC) probe under Grafana synthetic monitoring.",
            "answer": "v0.10.5-0-g9201a28"
        }},
        {{
            "question": "Look up the Last offline time of the Tokyo(APAC) probe under Grafana synthetic monitoring.",
            "answer": "March 18, 2025 at 05:23 AM"
        }},
        {{
            "question": "Look up the Last modified time of the Tokyo(APAC) probe under Grafana synthetic monitoring.",
            "answer": "March 04, 2025 at 07:17 AM"
        }},
        {{
            "question": "Look up the Latitude under Location information for the Tokyo(APAC) probe under Grafana synthetic monitoring.",
            "answer": "35.6762"
        }},
        {{
            "question": "Look up the Longitude under Location information for the Tokyo(APAC) probe under Grafana synthetic monitoring.",
            "answer": "139.6503"
        }}
    ]
}}
"""

    # Format the prompt (the template only uses {urls_length} and {text})
    prompt = prompt_template.format(
        urls_length=len(processed_data["urls"]),
        text=processed_data["text"]
    )

    print(f"Processing path {path_index} for URL: {url}")

    # Call the API to analyze the screenshots
    result = analyze_images(processed_data["image_paths"], prompt)
    print(f"  path {path_index} for url {url} result: {result}")

    try:
        meta["raw_result"] = result

        # Clean up and normalize the JSON string
        parsed_result = result.strip()

        # Strip any conversational prefix such as "assistant"
        if "assistant" in parsed_result.lower():
            parsed_result = parsed_result.split("assistant", 1)[-1].strip()

        # Keep only the content between the first '{' and the last '}'
        start = parsed_result.find('{')
        end = parsed_result.rfind('}')
        if start != -1 and end != -1:
            parsed_result = parsed_result[start:end + 1]

        # Try to parse the JSON
        try:
            result_json = json.loads(parsed_result)
            meta["page_description"] = result_json.get("page_description", "Failed to get page description")
            meta["action_description"] = result_json.get("action_description", "Failed to get action description")
            meta["task_summaries"] = result_json.get("task_summaries", "Failed to get task summaries")
        except json.JSONDecodeError as e:
            print(f"JSON parsing error for URL {url}: {str(e)}")
            meta["page_description"] = "Parse error: invalid JSON format"
            meta["action_description"] = f"Raw response: {parsed_result}"
            meta["task_summaries"] = f"Raw response: {parsed_result}"
    except Exception as e:
        print(f"Error processing result for URL {url}: {str(e)}")
        meta["page_description"] = "Processing error"
        meta["action_description"] = f"Error message: {str(e)}"
        meta["task_summaries"] = f"Error message: {str(e)}"

    return meta
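# A rough sketch of the input JSON shape update_json_with_analysis() expects,
# inferred from the lookups below; URLs and field values are illustrative only:
#
#   {
#       "https://host/start": {
#           "shortestPathsMeta": [
#               {"chainIDs": [...], "chainChildNum": [...], ...},
#               null
#           ]
#       }
#   }
#
# null entries are skipped; each non-null meta dict is analyzed in a worker
# thread and written back in place, augmented with raw_result,
# page_description, action_description, and task_summaries.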
def update_json_with_analysis(json_path: str, max_workers: int = 4):
    """
    Read a JSON file and add analysis results to every path, processing in parallel.
    """
    # Read the JSON file
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Build the task list
    tasks = []
    for url, url_data in data.items():
        for i, meta in enumerate(url_data.get("shortestPathsMeta", [])):
            if meta is not None:
                tasks.append((url, meta, i))

    # Define the output file path
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    output_path = json_path.replace('.json', f'_with_analysis_{timestamp}.json')
    processed_count = 0

    # Process in parallel with a thread pool
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_task = {
            executor.submit(process_single_path, url, meta, i): (url, i)
            for url, meta, i in tasks
        }

        # Collect results and update the data as they complete
        for future in concurrent.futures.as_completed(future_to_task):
            url, path_index = future_to_task[future]
            try:
                result = future.result()
                if result is not None:
                    data[url]["shortestPathsMeta"][path_index] = result
                    processed_count += 1

                    # Save a checkpoint every 10 processed entries
                    if processed_count % 10 == 0:
                        with open(output_path, 'w', encoding='utf-8') as f_out:
                            json.dump(data, f_out, ensure_ascii=False, indent=2)
                        print(f"Processed {processed_count} entries; saved to {output_path}")
            except Exception as e:
                print(f"Error processing path {path_index} for URL {url}: {str(e)}")

    # Save all data at the end
    with open(output_path, 'w', encoding='utf-8') as f_out:
        json.dump(data, f_out, ensure_ascii=False, indent=2)
    print(f"All paths processed; final results saved to {output_path}")


def main():
    # Process the JSON file
    json_path = "path/processed_3.json"
    update_json_with_analysis(json_path, max_workers=2)  # adjust the concurrency as needed


if __name__ == "__main__":
    main()
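# Example .env file consumed by load_dotenv() above (values are placeholders):
#
#   OPENAI_API_KEY=sk-...
#   OPENAI_API_BASE_URL=https://api.openai.com/v1
#
# OPENAI_API_BASE_URL is optional; set it only when pointing the OpenAI client
# at another compatible service, e.g. a self-hosted deployment serving the
# commented-out UI-TARS-72B-DPO model.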