crawlee/misc/test.py
2025-04-23 12:14:50 +08:00

99 lines
3.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Local snapshot directory of the UI-TARS-7B-DPO checkpoint.
model_path = (
    "/data1/yuyr/models--bytedance-research--UI-TARS-7B-DPO"
    "/snapshots/727b0df39207dafc6cf211a61f29d84b7659c39c/"
)

# Screenshot to run inference on. Alternate sample kept for reference:
# image_path = "file:///data1/yuyr/crawlee/screenshots/0fuABgATggRcGam_57.png"
image_path = "/data1/yuyr/crawlee/aaa5.png"
import base64
def encode_image_to_base64(image_path):
    """Return the contents of an image file as a base64 string.

    Args:
        image_path: Filesystem path of the file to read; it is opened in
            binary mode, so any file type is accepted.

    Returns:
        The base64 encoding of the file's bytes as text, or ``None`` when
        the file cannot be opened or read (the error is printed).
    """
    # Only the file I/O can legitimately fail here, so the try body is kept
    # to the read and only OSError is caught; programming errors (e.g. a
    # wrong argument type) are no longer masked as an "encoding failure".
    try:
        with open(image_path, "rb") as image_file:
            raw_bytes = image_file.read()
    except OSError as e:
        print(f"图片编码失败: {e}")
        return None
    return base64.b64encode(raw_bytes).decode("utf-8")
# Base64-encode the screenshot (None if the file could not be read).
# NOTE(review): `image_base64` is not referenced again in the visible part of
# this script — confirm whether it is still needed.
image_base64 = encode_image_to_base64(image_path)
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
# Load the model weights from the local snapshot directory.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype="auto",
    device_map="cuda:0",
)

# Load the matching processor from the same local directory.
processor = AutoProcessor.from_pretrained(model_path)
# Build the chat messages: a system prompt describing the GUI-agent output
# format and action space, then a user turn carrying the screenshot (as a
# local PNG file path) and the task text.
# The `type(...)` hint tells the model to end `content` with "\n" to submit
# input; the backslash is doubled so the prompt contains the two characters
# `\n` rather than an empty pair of quotes.
messages = [
    {
        "role": "system",
        "content": """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
## Output Format
```
Thought: ...
Action: ...
```
## Action Space
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
hotkey(key='')
type(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished()
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
## Note
- Use Chinese in `Thought` part.
- Summarize your next action (with its target element) in one sentence in `Thought` part.""",
    },
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image_url": f"file://{image_path}",
            },
            {"type": "text", "text": "点击购物"},
        ],
    },
]
# Prepare the inference inputs: render the chat template to prompt text
# (no tokenization yet) and collect the vision inputs from the messages.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)

# Tokenize text and vision inputs together, then move them to the model's device.
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to(model.device)

# Inference: generate the output token ids.
generated_ids = model.generate(**inputs, max_new_tokens=1024)
print(f"generated_ids: {generated_ids}")

# Drop the leading prompt tokens from each sequence so that only the newly
# generated continuation is decoded.
generated_ids_trimmed = [
    full_ids[len(prompt_ids):]
    for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids)
]
print(f"generated_ids_trimmed: {generated_ids_trimmed}")

output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(f"output_text: {output_text}")