101 lines
3.0 KiB
Python
101 lines
3.0 KiB
Python
from lxml import html
|
|
def get_xpath_top_down(element: 'html.HtmlElement', id_column: str='temp_id', label_column: str='temp_clickable_label', path: str='', order: int=0,
                       in_svg: bool=False, temp_id: int=0) -> tuple[int, dict[str, str], dict[str, bool]]:
    """Number every element under *element* in pre-order and build XPath lookups.

    Each visited element gets ``attrib[id_column]`` set to its running index
    (as a string). Nodes whose ``tag`` is not a string (for example lxml
    comments and processing instructions) are skipped: they receive no id and
    do not take part in sibling numbering.

    Args:
        element: Root of the subtree to process.
        id_column: Attribute name that receives the temporary id.
        label_column: Attribute name whose non-empty values are collected.
        path: XPath of the parent element ('' for the top-level call).
        order: 1-based position among same-tag siblings, or 0 when the
            element is the only sibling with its tag (no index is emitted).
        in_svg: True when an ancestor is an ``svg`` element.
        temp_id: First id to hand out.

    Returns:
        A tuple ``(next_temp_id, i2xpath, used_labels)`` where ``i2xpath``
        maps id -> xpath as well as ``xpath``, ``'xpath/' + xpath`` and
        ``'xpath=/' + xpath`` -> id, and ``used_labels`` maps every non-empty
        ``label_column`` value found to ``True``.
    """
    used_labels, i2xpath = {}, {}

    raw_tag = element.tag
    if not isinstance(raw_tag, str):
        # No tag name to build a path step from; calling .lower() would raise.
        return temp_id, i2xpath, used_labels

    # path
    tag = raw_tag.lower()
    in_svg = in_svg or (tag == 'svg')

    if not in_svg and 'id' in element.attrib:
        # An id restarts the path at this element; ids are ignored inside svg.
        node_id = element.attrib['id']
        path = f'//*[@id="{node_id}"]'
    else:
        suffix = f'[{order}]' if order > 0 else ''
        # Inside svg the step is matched through name() rather than the bare tag.
        prefix = f'*[name()="{tag}"]' if in_svg else tag
        path = path + '/' + prefix + suffix

    # add temp id
    bid = str(temp_id)
    element.attrib[id_column] = bid
    ori_label = element.attrib.get(label_column, '')
    if ori_label != '':
        used_labels[ori_label] = True

    i2xpath[bid] = path
    i2xpath[path] = bid
    i2xpath[f'xpath/{path}'] = bid
    i2xpath[f'xpath=/{path}'] = bid

    temp_id += 1

    # traverse node
    children = [child for child in element.getchildren() if isinstance(child.tag, str)]

    # How many siblings share each tag decides whether an index is needed at all.
    totals = {}
    for child in children:
        ctag = child.tag.lower()
        totals[ctag] = totals.get(ctag, 0) + 1

    seen = {}
    for child in children:
        ctag = child.tag.lower()
        seen[ctag] = seen.get(ctag, 0) + 1
        child_order = seen[ctag] if totals[ctag] > 1 else 0
        temp_id, i2x, ulabels = get_xpath_top_down(child, id_column, label_column, path, child_order, in_svg, temp_id)
        i2xpath.update(i2x)
        used_labels.update(ulabels)

    return temp_id, i2xpath, used_labels
|
|
|
|
def print_html_object(obj: str='') -> str:
    """Re-flow a ``<``/``>`` bracketed string onto indented lines.

    ``<`` and ``>`` are treated as nesting brackets. Each ``<`` starts a new
    line at the current indent and deepens the indent by one space; each
    ``>`` reduces it by one. Text sitting directly between a ``<`` and the
    next ``>`` is kept verbatim on the bracket's line. Any other text is
    stripped and, if non-empty, written on its own indented line.

    Text that follows the last bracket is never written out, so a string
    with no brackets yields ``''``.

    Args:
        obj: The string to format.

    Returns:
        The formatted string.
    """
    depth = 0
    indent = ''
    pieces = []   # output fragments, joined once at the end
    pending = []  # characters seen since the previous bracket
    last_is_left, last_is_right = False, False

    for ch in obj:
        if ch == '<':
            text = ''.join(pending).strip()
            pieces.append('\n')
            if text:
                pieces.append(indent + text + '\n')
            pieces.append(indent + '<')

            depth += 1
            indent = ' ' * depth

            pending = []
            last_is_right = False
            last_is_left = True
        elif ch == '>':
            raw = ''.join(pending)
            if last_is_left:
                # "<...>" with nothing nested: keep the text inline, unstripped.
                pieces.append(raw)
            else:
                if last_is_right:
                    pieces.append('\n')
                text = raw.strip()
                if text:
                    pieces.append(indent + text + '\n')

            # A negative depth (unbalanced '>') simply yields an empty indent.
            depth -= 1
            indent = ' ' * depth

            if not last_is_left:
                pieces.append(indent)

            pieces.append('>')
            pending = []

            last_is_right = True
            last_is_left = False
        else:
            pending.append(ch)

    return ''.join(pieces)
|
|
|
|
def rect2tuple(rect: str) -> 'tuple[float, float, float, float] | None':
    """Parse a comma-separated ``"a,b,c,d"`` string into a tuple of four floats.

    Args:
        rect: Text holding exactly four comma-separated numbers. Surrounding
            whitespace around the string and around each number is allowed.

    Returns:
        The four values as floats, or ``None`` when *rect* is not a string,
        does not have exactly four components, or a component is not a
        valid number.
    """
    # isinstance (rather than an exact type match) also accepts str subclasses.
    if not isinstance(rect, str):
        return None

    parts = rect.strip().split(',')
    if len(parts) != 4:
        return None

    try:
        return tuple(float(part) for part in parts)
    except ValueError:
        # Malformed numbers are reported like every other malformed input.
        return None