crawlee/misc/get_axtree_v18.js
2025-04-23 12:14:50 +08:00

255 lines
8.0 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

const { chromium } = require('playwright');
const fs = require('fs').promises;
const path = require('path');
/**
 * Clicks expandable/collapsed buttons on the page until a pass finds no
 * button with a not-yet-seen aria-label, or until `maxRounds` passes have run.
 *
 * Buttons are de-duplicated by their `aria-label` value, so buttons that share
 * a label (including buttons with no label at all) are clicked at most once
 * in total. Individual click failures are logged and skipped.
 *
 * @param {object} page - Playwright page (uses `$$`, `evaluate`, `waitForTimeout`).
 * @param {number} maxRounds - Upper bound on the number of search passes.
 * @returns {Promise<void>}
 */
async function expandCollapsedButtons(page, maxRounds) {
    const clickedLabels = new Set();

    // One pass over the currently matching buttons; resolves to true when at
    // least one button with a new label was clicked.
    const clickNewButtons = async () => {
        console.log('开始寻找可展开按钮...');
        const buttons = await page.$$('button[aria-label*="Expand"], button[aria-label*="Open"], button[aria-expanded="false"]');
        console.log(`找到 ${buttons.length} 个折叠按钮`);
        let clickedAny = false;
        for (const button of buttons) {
            try {
                // Scroll via the DOM API before interacting with the button.
                await page.evaluate(element => {
                    element.scrollIntoView({behavior: 'smooth', block: 'center'});
                }, button);
                const ariaLabel = await button.getAttribute('aria-label');
                if (!clickedLabels.has(ariaLabel)) {
                    console.log(`点击新按钮: ${ariaLabel}`);
                    await button.click();
                    clickedLabels.add(ariaLabel);
                    clickedAny = true;
                    await page.waitForTimeout(200);
                }
            } catch (e) {
                // Best effort: a button that cannot be scrolled to or clicked is skipped.
                console.log(`点击失败: ${e.message}`);
            }
        }
        return clickedAny;
    };

    for (let round = 1; round <= maxRounds; round++) {
        console.log(`\n${round} 次查找...`);
        const foundNewButtons = await clickNewButtons();
        if (!foundNewButtons) {
            console.log('没有发现新的可展开按钮,结束查找');
            return;
        }
        console.log(`已点击按钮数量: ${clickedLabels.size}`);
        // Give the page a moment to render newly revealed content.
        await page.waitForTimeout(500);
    }
    // Safety net: a page that keeps producing buttons with fresh labels would
    // otherwise keep this loop running forever.
    console.log(`Reached the limit of ${maxRounds} expansion rounds; continuing with the current page state`);
}

/**
 * Serializes an accessibility snapshot into an indented text outline and
 * builds a map from the numeric ids used in that outline to selector strings.
 *
 * Unnamed, non-focusable, non-focused `none`/`generic` nodes without an
 * `expanded` state, and every `InlineTextBox`, are not listed themselves;
 * their children are emitted at the skipped node's depth.
 *
 * @param {object} axTree - Root node of the snapshot (`role`, `name`, `children`, ...).
 * @returns {{axtree: string, idToSelector: Object<number, string>}}
 */
function serializeAXTree(axTree) {
    const clickableRoles = new Set(['button', 'link', 'menuitem', 'tab', 'checkbox', 'radio', 'switch', 'option']);
    const idToSelector = {};
    const lines = [];
    let nextId = 1;

    const isPassThrough = (node) =>
        node.role === 'InlineTextBox' ||
        ((node.role === 'none' || node.role === 'generic') &&
            !node.name &&
            !node.focusable &&
            !node.focused &&
            node.expanded === undefined);

    const visit = (node, depth, parent) => {
        const children = node.children ?? [];
        if (isPassThrough(node)) {
            for (const child of children) {
                visit(child, depth, node);
            }
            return;
        }

        const id = nextId++;

        // Selector: optional parent prefix, role, name, state flags, sibling position.
        const selectorParts = [`role=${node.role}`];
        if (node.name) selectorParts.push(`[name="${node.name}"]`);
        if (node.selected) selectorParts.push('[selected=true]');
        if (node.checked !== undefined) selectorParts.push(`[checked=${node.checked}]`);
        if (node.pressed !== undefined) selectorParts.push(`[pressed=${node.pressed}]`);
        if (parent && parent.role !== 'WebArea') {
            const parentName = parent.name ? `[name="${parent.name}"]` : '';
            selectorParts.unshift(`role=${parent.role}${parentName} >>`);
        }
        if (parent?.children) {
            const position = parent.children.indexOf(node);
            if (position !== -1) selectorParts.push(`:nth-match(${position + 1})`);
        }
        idToSelector[id] = selectorParts.join(' ');

        // Collect the state properties shown next to the node in the outline.
        const props = [];
        if (node.focusable) props.push('focusable');
        if (node.focused) props.push('focused');
        if (node.expanded !== undefined) props.push(`expanded=${node.expanded}`);
        if (node.selected) props.push('selected');
        if (node.checked !== undefined) props.push(`checked=${node.checked}`);
        if (node.disabled) props.push('disabled');
        if (node.required) props.push('required');
        if (node.pressed !== undefined) props.push(`pressed=${node.pressed}`);
        // A node counts as clickable when its role is interactive or it is focusable.
        if (clickableRoles.has(node.role) || node.focusable) props.push('clickable');

        const indent = ' '.repeat(depth * 4);
        const suffix = props.length > 0 ? ` (${props.join(', ')})` : '';
        lines.push(`${indent}[${id}] ${node.role} '${node.name || ''}'${suffix}`);

        for (const child of children) {
            visit(child, depth + 1, node);
        }
    };

    // The root gets its own header line and no id.
    const rootProps = [];
    if (axTree.focusable) rootProps.push('focusable=True');
    if (axTree.focused) rootProps.push('focused');
    const rootSuffix = rootProps.length > 0 ? `, ${rootProps.join(', ')}` : '';
    lines.push(`Root${axTree.role || 'WebArea'} '${axTree.name || ''}'${rootSuffix}`);

    for (const child of axTree.children ?? []) {
        visit(child, 1, axTree);
    }

    return { axtree: lines.join('\n'), idToSelector };
}

/**
 * Opens `url` in a new page, expands collapsed buttons, and returns a text
 * rendering of the page's accessibility tree.
 *
 * Errors raised after the page has been created are not thrown; they are
 * logged and reported through the `error` field of the result. The page is
 * always closed before returning.
 *
 * @param {string} url - Address to load.
 * @param {object} browser - Playwright browser; only `newPage()` is used.
 * @param {object} [options]
 * @param {number} [options.maxExpandRounds=50] - Maximum number of passes spent
 *   looking for new expandable buttons.
 * @returns {Promise<{axtree: string, idToSelector: Object<number, string>, error: (string|null)}>}
 */
async function getAXTreeForUrl(url, browser, { maxExpandRounds = 50 } = {}) {
    // Create a new page for this URL.
    const page = await browser.newPage();
    const result = {
        axtree: "",
        idToSelector: {},
        error: null
    };
    try {
        // Navigate and give the page time to settle.
        await page.goto(url, {
            waitUntil: 'domcontentloaded',
            timeout: 30000
        });
        await page.waitForTimeout(5000);

        await expandCollapsedButtons(page, maxExpandRounds);

        // Fetch the full accessibility tree, including uninteresting nodes.
        const axTree = await page.accessibility.snapshot({ interestingOnly: false });
        if (!axTree) {
            // Report a missing snapshot explicitly instead of failing on a property access.
            throw new Error('Accessibility snapshot returned no tree');
        }

        const { axtree, idToSelector } = serializeAXTree(axTree);
        result.axtree = axtree;
        result.idToSelector = idToSelector;
    } catch (error) {
        result.error = error.message;
        console.error(`Error processing URL ${url}:`, error);
    } finally {
        await page.close();
    }
    return result;
}
/**
 * Reads the input JSON, collects every unique URL found under
 * `shortestPathsMeta[].chainUrls` of its top-level values, captures the
 * accessibility tree of each URL one after another, and writes the results
 * (keyed by URL) to the output file.
 *
 * Any failure is logged with console.error and not rethrown. The browser is
 * closed even when processing a URL throws.
 *
 * @param {string} [inputPath='path/processed_3.json'] - JSON file to read.
 * @param {string} [outputPath='path/processed_3_axtree.json'] - JSON file to write.
 * @returns {Promise<void>}
 */
async function processUrls(inputPath = 'path/processed_3.json', outputPath = 'path/processed_3_axtree.json') {
    try {
        // Read the input file.
        const inputData = JSON.parse(await fs.readFile(inputPath, 'utf8'));

        // Collect all unique URLs; entries without the expected fields are skipped.
        const uniqueUrls = new Set();
        for (const item of Object.values(inputData)) {
            for (const meta of item?.shortestPathsMeta || []) {
                for (const url of meta?.chainUrls || []) {
                    uniqueUrls.add(url);
                }
            }
        }

        // Results keyed by URL.
        const results = {};
        const total = uniqueUrls.size;

        // Launch the browser.
        const browser = await chromium.launch({ headless: true });
        try {
            // Process the URLs sequentially.
            let processed = 0;
            for (const url of uniqueUrls) {
                processed++;
                console.log(`\n========== 处理 URL ${processed}/${total} ==========`);
                console.log(`URL: ${url}`);
                const result = await getAXTreeForUrl(url, browser);
                results[url] = result;

                // Print a short summary of the captured tree.
                if (result.error) {
                    console.log(`获取 AXTree 失败: ${result.error}`);
                } else {
                    console.log(`AXTree 获取成功,包含 ${Object.keys(result.idToSelector).length} 个节点`);
                    console.log(`AXTree 预览 (前5行):`);
                    const treeLines = result.axtree.split('\n');
                    console.log(treeLines.slice(0, 5).join('\n'));
                    if (treeLines.length > 5) {
                        console.log('... (更多内容已省略)');
                    }
                }
                console.log(`========== URL ${processed}/${total} 处理完成 ==========\n`);
            }
        } finally {
            // Close the browser on every path so a failure mid-run does not
            // leave the browser process behind.
            await browser.close();
        }

        // Write the results file.
        await fs.writeFile(outputPath, JSON.stringify(results, null, 2));
        console.log(`Processing complete. Results saved to ${outputPath}`);
    } catch (error) {
        console.error('Error in main process:', error);
    }
}
// Run the main program. The returned promise is deliberately not awaited:
// processUrls wraps its whole body in try/catch and logs failures itself.
void processUrls();