trace_synthesis/toy/ali_vl.py
2025-04-15 22:44:08 +08:00

57 lines
2.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from openai import OpenAI
import os
import base64
from dotenv import load_dotenv
# Populate the process environment from a local .env file (python-dotenv), so
# that OPENAI_API_KEY can be supplied without exporting it in the shell.
load_dotenv()
# Base64 encoding helper
def encode_video(video_path):
    """Return the contents of the file at *video_path* as Base64 text.

    The whole file is read into memory in binary mode, Base64-encoded, and
    the resulting ASCII bytes are decoded into a ``str``.

    Args:
        video_path: Path of the file to encode.

    Returns:
        The Base64 representation of the file's bytes as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(video_path, "rb") as video_file:
        return base64.b64encode(video_file.read()).decode("utf-8")
# Path of the local video to annotate; replace it with the path to your own
# video file. (Another sample input used previously: "50221078283.mp4".)
video_file_path = "235.trace_recording.webm"
base64_video = encode_video(video_file_path)
# Annotation instructions for the model; sent as the text part of the user
# message alongside the video. The wording is runtime data, so edit with care.
prompt = """
You are an annotator tasked with carefully observing and faithfully documenting the exact actions shown in a provided video segment. Your role is to describe, step-by-step, the precise actions I am performing on the website, strictly using the first-person perspective (e.g., "I click...", "I select...").
For each action, clearly include:
- Action: Clearly state the specific buttons, links, menus, text boxes, or other UI elements I interact with.
- Page Changes: Accurately describe any changes or responses on the webpage resulting from my actions.
- Possible Purpose: Provide an objective and detailed hypothesis about the likely intent or purpose of each action, without subjective interpretations or assumptions beyond what is directly observable.
Important Note:
The video is split into 4 parts; this request specifically refers ONLY to this Part. Do NOT describe any content from other parts or add any additional context or speculation. Your description must strictly reflect only what you observe in this video segment.
"""
# OpenAI-compatible client pointed at the DashScope compatible-mode endpoint.
client = OpenAI(
    # If the environment variable is not configured, replace the next line
    # with your Bailian API key: api_key="sk-xxx"
    # Indexing os.environ raises KeyError when OPENAI_API_KEY is unset.
    api_key=os.environ["OPENAI_API_KEY"],
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)
# Assemble the conversation up front: a fixed system message plus a single
# user turn carrying the Base64-encoded video and the annotation instructions.
system_message = {
    "role": "system",
    "content": [{"type": "text", "text": "You are a helpful assistant."}],
}
video_part = {
    # When passing a video file directly, set "type" to "video_url".
    # NOTE(review): the data URL is labelled video/mp4 although the input file
    # above has a .webm extension -- confirm the service accepts this.
    "type": "video_url",
    "video_url": {"url": f"data:video/mp4;base64,{base64_video}"},
}
text_part = {"type": "text", "text": prompt}
user_message = {"role": "user", "content": [video_part, text_part]}

completion = client.chat.completions.create(
    model="qwen-vl-max-latest",  # alternative: "qwen2.5-vl-72b-instruct"
    messages=[system_message, user_message],
)

# Show only the text content of the first returned choice.
print(completion.choices[0].message.content)
# print(completion)