trace_synthesis/qwen_vl_process.py
2025-04-15 22:44:08 +08:00

112 lines
5.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from openai import OpenAI
import os
import base64
# Base64 encoding helper
def encode_video(video_path):
    """Return the contents of ``video_path`` as Base64 text.

    The file is read in binary mode in one go, Base64-encoded and decoded
    to a ``str`` using UTF-8.

    Args:
        video_path: Path of the video file to read.

    Returns:
        The Base64 string, or ``None`` when the file is missing or any
        other error occurs while reading/encoding it. In both failure
        cases a message is printed instead of raising.
    """
    encoded = None
    try:
        with open(video_path, "rb") as stream:
            raw_bytes = stream.read()
        encoded = base64.b64encode(raw_bytes).decode("utf-8")
    except FileNotFoundError:
        # A missing file gets its own, more specific message.
        print(f"错误:找不到视频文件 {video_path}")
    except Exception as exc:
        # Best effort: report the problem and let the caller skip this video.
        print(f"编码视频 {video_path} 时出错: {exc}")
    return encoded
# --- Main processing configuration ---
# Root directory that is walked recursively in search of video files.
VIDEO_DIR = "video"
# File extensions treated as videos; kept as a tuple so it can be passed
# straight to ``str.endswith`` on the lower-cased file name.
SUPPORTED_EXTENSIONS = (".mp4", ".webm")
# --- OpenAI client initialisation ---
# The client talks to DashScope's OpenAI-compatible endpoint. The API key is
# read from the DASHSCOPE_API_KEY environment variable so that no credential
# is ever stored in (or committed with) this source file.
_api_key = os.getenv("DASHSCOPE_API_KEY")
if not _api_key:
    print("初始化 OpenAI 客户端时出错: 未设置环境变量 DASHSCOPE_API_KEY")
    # Nothing can be processed without a key; stop with a failure status.
    raise SystemExit(1)
try:
    client = OpenAI(
        api_key=_api_key,
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )
except Exception as e:
    print(f"初始化 OpenAI 客户端时出错: {e}")
    # SystemExit is used instead of the site-provided exit() helper, which is
    # not guaranteed to exist, and a non-zero status signals the failure.
    raise SystemExit(1)
# --- Shared prompt ---
# Instruction text sent as the text part of the user message for every video.
# It asks for a first-person, step-by-step description (action, page changes,
# possible purpose) and states that the video was split into 4 parts, of
# which only the current one must be described.
prompt = """
You are an annotator tasked with carefully observing and faithfully documenting the exact actions shown in a provided video segment. Your role is to describe, step-by-step, the precise actions I am performing on the website, strictly using the first-person perspective (e.g., "I click...", "I select...").
For each action, clearly include:
- Action: Clearly state the specific buttons, links, menus, text boxes, or other UI elements I interact with.
- Page Changes: Accurately describe any changes or responses on the webpage resulting from my actions.
- Possible Purpose: Provide an objective and detailed hypothesis about the likely intent or purpose of each action, without subjective interpretations or assumptions beyond what is directly observable.
Important Note:
The video is split into 4 parts; this request specifically refers ONLY to this Part. Do NOT describe any content from other parts or add any additional context or speculation. Your description must strictly reflect only what you observe in this video segment.
"""
# --- Walk the video directory and annotate every video ---
# MIME type used in the data URL for each supported extension, so that a
# .webm file is no longer labelled as video/mp4.
_VIDEO_MIME_TYPES = {".mp4": "video/mp4", ".webm": "video/webm"}


def _video_mime_type(filename):
    """Return the data-URL MIME type matching the extension of ``filename``.

    Extensions that are not listed in ``_VIDEO_MIME_TYPES`` fall back to
    ``video/mp4``, the value that used to be hard-coded for every file.
    """
    extension = os.path.splitext(filename)[1].lower()
    return _VIDEO_MIME_TYPES.get(extension, "video/mp4")


def _request_description(video_path, base64_video):
    """Send one Base64-encoded video together with ``prompt`` to the model.

    Args:
        video_path: Path of the video; only used to pick the MIME type.
        base64_video: Base64 text of the video file.

    Returns:
        The message content of the first returned choice (may be ``None``
        or empty). Exceptions raised by the client propagate to the caller.
    """
    video_url = f"data:{_video_mime_type(video_path)};base64,{base64_video}"
    completion = client.chat.completions.create(
        model="qwen-vl-max-latest",
        messages=[
            {
                "role": "system",
                "content": [{"type": "text", "text": "You are a helpful assistant."}],
            },
            {
                "role": "user",
                "content": [
                    {"type": "video_url", "video_url": {"url": video_url}},
                    {"type": "text", "text": prompt},
                ],
            },
        ],
        # Sampling parameters such as temperature can be added here if needed.
    )
    return completion.choices[0].message.content


def _save_description(output_path, text):
    """Write ``text`` to ``output_path`` as UTF-8 and report the outcome.

    The text is written to a sibling ``.tmp`` file that is then renamed onto
    ``output_path``. An existing output file marks a video as done, so a
    truncated file left by a failed write must never appear under the final
    name, otherwise the video would be skipped on every later run.
    """
    temp_path = output_path + ".tmp"
    try:
        with open(temp_path, "w", encoding="utf-8") as f:
            f.write(text)
        os.replace(temp_path, output_path)
    except OSError as e:
        print(f" 错误: 无法写入输出文件 '{output_path}': {e}")
    except Exception as e:
        print(f" 保存文件 '{output_path}' 时发生未知错误: {e}")
    else:
        print(f" 成功: 结果已保存到 '{output_path}'")
        return
    # Failure path only: drop whatever part of the temporary file was written.
    try:
        os.remove(temp_path)
    except OSError:
        pass


def _process_video(video_path, output_path):
    """Annotate a single video and store the description at ``output_path``.

    Videos whose output file already exists are skipped. Every failure is
    reported with ``print`` and never raised, so one bad video cannot stop
    the whole run.
    """
    if os.path.exists(output_path):
        print(f" 跳过: 输出文件 '{output_path}' 已存在。")
        return
    print(f" 正在处理: {video_path}")
    base64_video = encode_video(video_path)
    if base64_video is None:
        return  # Encoding failed; encode_video has already reported it.
    try:
        result_text = _request_description(video_path, base64_video)
    except Exception as e:
        # Per-video boundary: report the API failure and move on.
        print(f" 错误: 调用 API 处理 '{video_path}' 时失败: {e}")
        return
    if not result_text:
        # Do not create an empty output file: it would mark the video as done.
        print(f" 错误: 模型未返回 '{video_path}' 的文本结果,已跳过。")
        return
    _save_description(output_path, result_text)


if not os.path.isdir(VIDEO_DIR):
    print(f"错误:视频目录 '{VIDEO_DIR}' 不存在或不是一个目录。")
else:
    print(f"开始处理目录 '{VIDEO_DIR}' 中的视频...")
    for root, dirs, files in os.walk(VIDEO_DIR):
        print(f"正在检查目录: {root}")
        for filename in files:
            if not filename.lower().endswith(SUPPORTED_EXTENSIONS):
                continue
            video_path = os.path.join(root, filename)
            # NOTE: the output name drops the extension, so "a.mp4" and
            # "a.webm" in the same directory share "a.txt"; the second one
            # found is skipped because the output file already exists.
            output_path = os.path.join(root, os.path.splitext(filename)[0] + ".txt")
            print(f" 发现视频文件: {video_path}")
            _process_video(video_path, output_path)
    print("所有视频处理完成。")