webrl/VAB-WebArena-Lite/new/html_tools/fetch.py
2024-11-14 15:51:41 +08:00

109 lines
2.9 KiB
Python
Executable File

import os
import json
import base64
from .html_parser import HtmlParser
from .configs import basic_attrs
from .scripts import *
def get_window(page):
    """Return the scroll offset and viewport size of ``page`` as ``(x, y, w, h)``.

    Args:
        page: Object exposing ``evaluate(expression)``; presumably a
            Playwright sync ``Page`` (confirm against callers).

    Returns:
        A 4-tuple holding the results of evaluating ``window.scrollX``,
        ``window.scrollY``, ``window.innerWidth`` and ``window.innerHeight``,
        in that order.
    """
    # One evaluate call per property, issued in the order of the result tuple.
    expressions = (
        "window.scrollX",
        "window.scrollY",
        "window.innerWidth",
        "window.innerHeight",
    )
    return tuple(page.evaluate(expression) for expression in expressions)
def modify_page(page):
    """Label the clickable elements of ``page`` and collect HTML, screenshots and element info.

    The statements run in a fixed order because each injected script changes
    the live DOM that the following steps read:

    1. best-effort run of ``remove_id_script``;
    2. capture the raw HTML and window geometry;
    3. run ``prepare_script`` and take the unmarked screenshot;
    4. run ``clickable_checker_script`` and ``label_script`` to collect items;
    5. run ``label_marker_script``, take the marked screenshot, then run
       ``remove_label_mark_script``;
    6. capture the modified HTML and the result of ``element_info_script``.

    Both screenshots are also written under ``debug_info/`` as a side effect.

    Args:
        page: Object exposing ``evaluate``, ``wait_for_timeout`` and
            ``screenshot``; presumably a Playwright sync ``Page`` (confirm
            against callers).

    Returns:
        dict with the keys ``"raw_html"``, ``"window"``, ``"raw_image"``,
        ``"marked_image"`` and ``"modified_html"``, updated with whatever
        mapping ``element_info_script`` returns (per the original comment:
        ``"all_elements"`` and ``"clickable_elements"``). Images are
        base64-encoded strings.
    """
    page.wait_for_timeout(500)

    try:
        page.evaluate(remove_id_script)
    except Exception:
        # Deliberately best-effort: a failure of this script must not abort
        # the capture. Only ``Exception`` is caught so that KeyboardInterrupt
        # and SystemExit still propagate (the previous bare ``except`` hid them).
        pass

    packet = {
        "raw_html": page.evaluate("document.documentElement.outerHTML"),
        "window": get_window(page),
    }

    page.evaluate(prepare_script)
    page.wait_for_timeout(100)

    img_bytes = page.screenshot(path="debug_info/screenshot_raw.png")
    raw_image = base64.b64encode(img_bytes).decode()

    page.evaluate(clickable_checker_script)
    page.wait_for_timeout(50)

    # get all clickable elements
    start_id = 0
    # The script returns a pair; the second value (called ``start_id`` in the
    # original code) is never used afterwards, so it is discarded here.
    items, _ = page.evaluate(label_script, {
        "selector": ".possible-clickable-element",
        "startIndex": start_id,
    })
    page.wait_for_timeout(50)

    # mark our own labels and get the images
    items = page.evaluate(label_marker_script, items)
    page.wait_for_timeout(100)
    img_bytes = page.screenshot(path="debug_info/marked.png")
    marked_image = base64.b64encode(img_bytes).decode()

    # remove markers on the page
    page.evaluate(remove_label_mark_script)

    packet.update({
        "raw_image": raw_image,
        "marked_image": marked_image,
        "modified_html": page.evaluate("document.documentElement.outerHTML"),
    })

    # element_info, include "all_elements" and "clickable_elements"
    element_info = page.evaluate(element_info_script)
    page.wait_for_timeout(100)
    packet.update(element_info)
    return packet
def save_debug_info(packet):
    """Dump the HTML and element data in ``packet`` to files under ``debug_info/``.

    Files written (relative to the current working directory):

    * ``debug_info/raw.html``: ``packet["modified_html"]``
    * ``debug_info/parsed.html``: ``packet["html"]``
    * ``debug_info/all_element.json``: ``packet["all_elements"]`` as JSON

    The directory is created when missing. Text is written as UTF-8 so that
    pages containing non-ASCII characters do not fail or get mis-encoded
    when the platform's default encoding is not UTF-8.

    Args:
        packet: Mapping providing the keys ``"modified_html"``, ``"html"``
            and ``"all_elements"``.

    Raises:
        KeyError: If one of the required keys is missing from ``packet``.
    """
    debug_dir = "debug_info"
    os.makedirs(debug_dir, exist_ok=True)

    with open(f"{debug_dir}/raw.html", "w", encoding="utf-8") as f:
        f.write(packet["modified_html"])
    with open(f"{debug_dir}/parsed.html", "w", encoding="utf-8") as f:
        f.write(packet["html"])
    with open(f"{debug_dir}/all_element.json", "w", encoding="utf-8") as f:
        json.dump(packet["all_elements"], f)
def get_parsed_html(page):
    """Capture ``page`` via ``modify_page`` and attach its parsed HTML.

    The packet from ``modify_page`` is extended with an ``"html"`` entry
    produced by ``HtmlParser.parse_tree()``. Debug artefacts are written to
    ``debug_info/`` and progress messages are printed to stdout.

    Args:
        page: Passed through to ``modify_page``; presumably a Playwright
            sync ``Page`` (confirm against callers).

    Returns:
        The packet dict returned by ``modify_page`` with the extra key
        ``"html"`` (an empty string when the parser result has no ``"html"``
        entry).
    """
    # ``exist_ok=True`` avoids the check-then-create race of the previous
    # ``os.path.exists`` test: concurrent workers sharing a working directory
    # could both see the directory as missing and one would then crash.
    os.makedirs("debug_info", exist_ok=True)

    print("parsing html...")

    packet = modify_page(page)
    raw_html = packet["modified_html"]

    # NOTE(review): "id-attr" is hyphenated while the other keys use
    # underscores; confirm this is the spelling HtmlParser reads.
    args = {
        "use_position": True,
        "rect_dict": {},
        "window_size": packet["window"],
        "id-attr": "data-backend-node-id",
        "label_attr": "data-label-id",
        "label_generator": "order",
        "regenerate_label": False,
        "attr_list": basic_attrs,
        "prompt": "xml",
        "dataset": "pipeline",
    }

    hp = HtmlParser(raw_html, args)
    res = hp.parse_tree()
    packet["html"] = res.get("html", "")

    # for debug
    save_debug_info(packet)

    print("parsing finished.")
    return packet