# trace_synthesis/toy/ali_vl.py

from openai import OpenAI
import os
import base64
from dotenv import load_dotenv

load_dotenv()

# Base64 encoding helper
def encode_video(video_path):
    """Return the contents of the file at *video_path* as Base64 text.

    The file is opened in binary mode and read in a single call, so the
    whole payload is held in memory at once.

    Args:
        video_path: Path of the file to encode.

    Returns:
        The Base64 encoding of the file's bytes, decoded to ``str`` as UTF-8.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(video_path, "rb") as stream:
        raw_bytes = stream.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

# Replace the file name below with the absolute path to your local video.
base64_video = encode_video("235.trace_recording.webm")
# base64_video = encode_video("50221078283.mp4")

# Instruction text sent to the model together with the video. It asks for a
# first-person, step-by-step account of the on-screen actions (action, page
# changes, possible purpose), restricted to this video segment only.
prompt = """
You are an annotator tasked with carefully observing and faithfully documenting the exact actions shown in a provided video segment. Your role is to describe, step-by-step, the precise actions I am performing on the website, strictly using the first-person perspective (e.g., "I click...", "I select...").

For each action, clearly include:
- Action: Clearly state the specific buttons, links, menus, text boxes, or other UI elements I interact with.
- Page Changes: Accurately describe any changes or responses on the webpage resulting from my actions.
- Possible Purpose: Provide an objective and detailed hypothesis about the likely intent or purpose of each action, without subjective interpretations or assumptions beyond what is directly observable.

Important Note:
The video is split into 4 parts; this request specifically refers ONLY to this Part. Do NOT describe any content from other parts or add any additional context or speculation. Your description must strictly reflect only what you observe in this video segment.
"""


# OpenAI SDK client pointed at DashScope's OpenAI-compatible endpoint.
# The API key is read from the OPENAI_API_KEY environment variable (populated
# from a .env file by load_dotenv() above, if present); a missing variable
# raises KeyError here rather than failing later at request time.
client = OpenAI(
    # If the environment variable is not configured, replace the next line
    # with your Bailian (Model Studio) API key: api_key="sk-xxx"
    api_key=os.environ["OPENAI_API_KEY"],
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)
# Assemble the chat messages: a system prompt plus one user turn that carries
# the Base64-encoded video followed by the text instructions.
system_message = {
    "role": "system",
    "content": [{"type": "text", "text": "You are a helpful assistant."}],
}
video_part = {
    # When passing a video file directly, set "type" to "video_url".
    "type": "video_url",
    # NOTE(review): the data URL declares video/mp4 while the file loaded
    # above is named *.webm — confirm the service accepts this pairing.
    "video_url": {"url": f"data:video/mp4;base64,{base64_video}"},
}
text_part = {"type": "text", "text": prompt}
user_message = {"role": "user", "content": [video_part, text_part]}

completion = client.chat.completions.create(
    model="qwen-vl-max-latest",
    # model="qwen2.5-vl-72b-instruct",
    messages=[system_message, user_message],
)

# Print only the text content of the first returned choice.
first_choice = completion.choices[0]
print(first_choice.message.content)

# print(completion)