109 lines
2.9 KiB
Python
Executable File
109 lines
2.9 KiB
Python
Executable File
import os
|
|
import json
|
|
import base64
|
|
from .html_parser import HtmlParser
|
|
from .configs import basic_attrs
|
|
from .scripts import *
|
|
|
|
def get_window(page):
    """Return the page's scroll offset and viewport size.

    Evaluates four JavaScript expressions on ``page``, one call each and in
    this order, and returns the results as the tuple
    ``(scrollX, scrollY, innerWidth, innerHeight)``.
    """
    expressions = (
        "window.scrollX",
        "window.scrollY",
        "window.innerWidth",
        "window.innerHeight",
    )
    # The list comprehension keeps the evaluation order of the expressions.
    scroll_x, scroll_y, width, height = [
        page.evaluate(expression) for expression in expressions
    ]
    return (scroll_x, scroll_y, width, height)
|
|
|
|
def modify_page(page):
    """Annotate the live page and collect its HTML, screenshots and element data.

    The helper scripts (``remove_id_script``, ``prepare_script``, ...) are
    injected into the page in a fixed order, with short waits in between so
    the page can settle before the next step.

    Args:
        page: Browser page object exposing ``evaluate``, ``wait_for_timeout``
            and ``screenshot`` (presumably a Playwright sync-API ``Page`` --
            confirm against the caller).

    Returns:
        dict: A packet with the keys ``raw_html``, ``window``, ``raw_image``,
        ``marked_image`` and ``modified_html``, merged with whatever mapping
        ``element_info_script`` returns (expected to include
        ``all_elements`` and ``clickable_elements``).

    Side effects:
        Writes ``debug_info/screenshot_raw.png`` and ``debug_info/marked.png``.
    """
    page.wait_for_timeout(500)

    try:
        page.evaluate(remove_id_script)
    except Exception:
        # Best effort: a failure of this clean-up script must not abort the
        # run. Only ``Exception`` is caught so that KeyboardInterrupt and
        # SystemExit still propagate.
        pass

    # Snapshot the document and viewport before any labelling script runs.
    packet = {
        "raw_html": page.evaluate("document.documentElement.outerHTML"),
        "window": get_window(page),
    }

    page.evaluate(prepare_script)
    page.wait_for_timeout(100)

    # Screenshot of the page before labels are drawn, kept as base64 text.
    img_bytes = page.screenshot(path="debug_info/screenshot_raw.png")
    raw_image = base64.b64encode(img_bytes).decode()

    page.evaluate(clickable_checker_script)
    page.wait_for_timeout(50)

    # Get all clickable elements. The script returns a pair; only the item
    # list is used here, the second value (next index) is ignored.
    start_id = 0
    items, _next_id = page.evaluate(
        label_script,
        {
            "selector": ".possible-clickable-element",
            "startIndex": start_id,
        },
    )
    page.wait_for_timeout(50)

    # Mark our own labels and capture the labelled screenshot.
    items = page.evaluate(label_marker_script, items)
    page.wait_for_timeout(100)
    img_bytes = page.screenshot(path="debug_info/marked.png")
    marked_image = base64.b64encode(img_bytes).decode()

    # Remove the markers from the page again.
    page.evaluate(remove_label_mark_script)

    packet.update(
        {
            "raw_image": raw_image,
            "marked_image": marked_image,
            "modified_html": page.evaluate("document.documentElement.outerHTML"),
        }
    )

    # element_info, includes "all_elements" and "clickable_elements".
    element_info = page.evaluate(element_info_script)
    page.wait_for_timeout(100)
    packet.update(element_info)
    return packet
|
|
|
|
def save_debug_info(packet):
    """Dump parts of ``packet`` into the ``debug_info`` directory.

    Files written (relative to the current working directory):

    * ``debug_info/raw.html`` -- ``packet["modified_html"]``
    * ``debug_info/parsed.html`` -- ``packet["html"]``
    * ``debug_info/all_element.json`` -- ``packet["all_elements"]`` as JSON

    Args:
        packet: Mapping that contains the keys ``modified_html``, ``html``
            and ``all_elements``.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    # Make the function usable on its own, even if the caller has not
    # created the output directory yet.
    os.makedirs("debug_info", exist_ok=True)

    # Page content is arbitrary Unicode; pin UTF-8 so writing does not depend
    # on (or fail under) the platform's default locale encoding.
    with open("debug_info/raw.html", "w", encoding="utf-8") as f:
        f.write(packet["modified_html"])
    with open("debug_info/parsed.html", "w", encoding="utf-8") as f:
        f.write(packet["html"])
    with open("debug_info/all_element.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(packet["all_elements"]))
|
|
|
|
def get_parsed_html(page):
    """Collect page data, parse the labelled HTML and return the full packet.

    Steps: ensure the ``debug_info`` directory exists, run ``modify_page`` on
    ``page``, feed the resulting ``modified_html`` to ``HtmlParser``, store
    the parsed HTML under ``packet["html"]`` and dump debug files via
    ``save_debug_info``.

    Args:
        page: Browser page object, passed straight through to ``modify_page``.

    Returns:
        dict: The packet produced by ``modify_page`` with an additional
        ``html`` key (an empty string if the parser result has no ``html``
        entry).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("debug_info", exist_ok=True)

    print("parsing html...")

    packet = modify_page(page)
    modified_html = packet["modified_html"]

    args = {
        "use_position": True,
        "rect_dict": {},
        "window_size": packet["window"],
        # NOTE(review): this key uses a hyphen while its siblings use
        # underscores -- confirm this is the spelling HtmlParser expects.
        "id-attr": "data-backend-node-id",
        "label_attr": "data-label-id",
        "label_generator": "order",
        "regenerate_label": False,
        "attr_list": basic_attrs,
        "prompt": "xml",
        "dataset": "pipeline",
    }

    parser = HtmlParser(modified_html, args)
    parse_result = parser.parse_tree()
    packet["html"] = parse_result.get("html", "")

    # for debug
    save_debug_info(packet)

    print("parsing finished.")

    return packet
|
|
|