crawlee/misc/axtree_complete.js
2025-04-23 12:14:50 +08:00

208 lines
7.1 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { chromium } from 'playwright';
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
// Resolve this file's path and its containing directory. ES modules do not
// provide __filename / __dirname, so both are derived from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Roles that are always reported as "clickable" in the printed tree.
const CLICKABLE_AX_ROLES = new Set([
  'button', 'link', 'menuitem', 'tab', 'checkbox', 'radio', 'switch', 'option',
]);

/**
 * Tell whether a node is left out of the printed tree.
 * InlineTextBox nodes are always dropped; `none` / `generic` nodes are dropped
 * only when they carry no name, focus state or expanded state.
 * The children of a dropped node are still visited by the caller.
 *
 * @param {object} node - A node of the accessibility snapshot.
 * @returns {boolean} True when the node itself must not be printed.
 */
function isSkippableAXNode(node) {
  if (node.role === 'InlineTextBox') {
    return true;
  }
  const isAnonymousWrapper = node.role === 'none' || node.role === 'generic';
  return (
    isAnonymousWrapper &&
    !node.name &&
    !node.focusable &&
    !node.focused &&
    node.expanded === undefined
  );
}

/**
 * Build the descriptive pseudo-selector stored for a printed node.
 * NOTE: names are interpolated verbatim (quotes are not escaped), so the
 * result is a human-readable locator hint rather than a validated selector.
 *
 * @param {object} node - The node being printed.
 * @param {object|null} parent - The node whose `children` array contains `node`.
 * @param {number} siblingIndex - Zero-based position of `node` in `parent.children`.
 * @returns {string} Space-joined selector parts.
 */
function buildPseudoSelector(node, parent, siblingIndex) {
  const parts = [`role=${node.role}`];
  if (node.name) {
    parts.push(`[name="${node.name}"]`);
  }
  if (node.selected) parts.push('[selected=true]');
  if (node.checked !== undefined) parts.push(`[checked=${node.checked}]`);
  if (node.pressed !== undefined) parts.push(`[pressed=${node.pressed}]`);

  // Prefix with the direct parent, unless the parent is the WebArea root.
  if (parent && parent.role !== 'WebArea') {
    let parentSelector = `role=${parent.role}`;
    if (parent.name) {
      parentSelector += `[name="${parent.name}"]`;
    }
    parts.unshift(`${parentSelector} >>`);
  }

  // One-based position among the parent's children.
  if (parent && parent.children && siblingIndex >= 0) {
    parts.push(`:nth-match(${siblingIndex + 1})`);
  }
  return parts.join(' ');
}

/**
 * Collect the state flags shown in parentheses after a printed node.
 *
 * @param {object} node - The node being printed.
 * @returns {string[]} Flags in display order (may be empty).
 */
function describeAXNodeProps(node) {
  const props = [];
  if (node.focusable) props.push('focusable');
  if (node.focused) props.push('focused');
  if (node.expanded !== undefined) props.push(`expanded=${node.expanded}`);
  if (node.selected) props.push('selected');
  if (node.checked !== undefined) props.push(`checked=${node.checked}`);
  if (node.disabled) props.push('disabled');
  if (node.required) props.push('required');
  if (node.pressed !== undefined) props.push(`pressed=${node.pressed}`);
  // A node counts as clickable when its role is interactive or it is focusable.
  if (CLICKABLE_AX_ROLES.has(node.role) || node.focusable) {
    props.push('clickable');
  }
  return props;
}

/**
 * Render an accessibility snapshot as indented text, numbering every printed
 * node and recording a pseudo-selector for each number. Every line is also
 * echoed to the console as it is produced.
 *
 * @param {object} axTree - Root node of the accessibility snapshot.
 * @returns {{axTreeOutput: string, idToSelector: Object<number, string>}}
 */
function renderAXTree(axTree) {
  const idToSelector = {};
  const lines = [];
  let nextId = 1;

  function emit(line) {
    lines.push(line);
    console.log(line);
  }

  function visitChildren(parent, depth) {
    const children = parent.children || [];
    children.forEach((child, index) => visit(child, depth, parent, index));
  }

  function visit(node, depth, parent, siblingIndex) {
    if (isSkippableAXNode(node)) {
      // Dropped nodes are transparent: their children keep the current depth.
      visitChildren(node, depth);
      return;
    }
    const id = nextId++;
    idToSelector[id] = buildPseudoSelector(node, parent, siblingIndex);

    const props = describeAXNodeProps(node);
    const suffix = props.length > 0 ? ` (${props.join(', ')})` : '';
    const indent = ' '.repeat(depth * 4);
    emit(`${indent}[${id}] ${node.role} '${node.name || ''}'${suffix}`);

    visitChildren(node, depth + 1);
  }

  emit('## AXTree:');

  // The root is printed as "Root<role>" and does not receive a number.
  const rootProps = [];
  if (axTree.focusable) rootProps.push('focusable=True');
  if (axTree.focused) rootProps.push('focused');
  const rootSuffix = rootProps.length > 0 ? `, ${rootProps.join(', ')}` : '';
  emit(`Root${axTree.role || 'WebArea'} '${axTree.name || ''}'${rootSuffix}`);

  visitChildren(axTree, 1);

  return { axTreeOutput: `${lines.join('\n')}\n`, idToSelector };
}

/**
 * Open `url` in headless Chromium, take a full accessibility snapshot
 * (`interestingOnly: false`) and render it as text.
 *
 * The browser is always closed, including when navigation or the snapshot
 * fails.
 *
 * @param {string} url - Page to load.
 * @param {object} [options]
 * @param {number} [options.settleDelayMs=10000] - Extra wait after the
 *   `networkidle` navigation, intended to let "Loading..." placeholders clear.
 * @returns {Promise<{axTreeOutput: string, idToSelector: Object<number, string>}>}
 * @throws {Error} When navigation fails or no accessibility snapshot is returned.
 */
async function getAXTreeForUrl(url, { settleDelayMs = 10000 } = {}) {
  const browser = await chromium.launch({ headless: true });
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle' });
    await page.waitForTimeout(settleDelayMs);
    console.log(`页面已加载: ${url}`);

    const axTree = await page.accessibility.snapshot({ interestingOnly: false });
    if (!axTree) {
      throw new Error(`无法获取 AXTree 快照: ${url}`);
    }
    return renderAXTree(axTree);
  } finally {
    await browser.close();
  }
}
/**
 * Refresh every saved AXTree dump that still contains the "Loading ..." text.
 *
 * For each `<id>.txt` under `axtrees/`, the matching request record
 * `storage/request_queues/default/<id>.json` supplies the URL (stored as a
 * JSON string in its `json` field). The page is fetched again and, when the
 * new dump no longer contains "Loading ...", the text file is overwritten and
 * `<id>_idToSelector.json` is written next to it. Per-file failures are logged
 * and do not stop the loop.
 *
 * @returns {Promise<void>}
 */
async function processAXTreeFiles() {
  const axtreesDir = path.join(__dirname, 'axtrees');
  const queueDir = path.join(__dirname, 'storage', 'request_queues', 'default');

  // Create the dump directory on first run so the listing below cannot fail.
  if (!fs.existsSync(axtreesDir)) {
    fs.mkdirSync(axtreesDir, { recursive: true });
  }

  const txtFiles = fs.readdirSync(axtreesDir).filter((entry) => entry.endsWith('.txt'));

  for (const file of txtFiles) {
    const filePath = path.join(axtreesDir, file);
    const isStale = fs.readFileSync(filePath, 'utf8').includes('Loading ...');
    if (!isStale) {
      continue;
    }
    console.log(`文件 ${file} 包含 "Loading ...",需要重新获取`);

    // The dump's base name doubles as the request id.
    const id = path.basename(file, '.txt');
    const jsonPath = path.join(queueDir, `${id}.json`);
    if (!fs.existsSync(jsonPath)) {
      console.log(`找不到对应的 JSON 文件: ${jsonPath}`);
      continue;
    }

    try {
      const record = JSON.parse(fs.readFileSync(jsonPath, 'utf8'));
      const url = JSON.parse(record.json).url;
      console.log(`为 ID ${id} 找到 URL: ${url}`);

      const { axTreeOutput, idToSelector } = await getAXTreeForUrl(url);
      if (axTreeOutput.includes('Loading ...')) {
        // Still not loaded: keep the old files untouched.
        console.log(`警告: 新获取的 AXTree 仍然包含 "Loading ..."`);
        continue;
      }

      fs.writeFileSync(filePath, axTreeOutput);
      fs.writeFileSync(
        path.join(axtreesDir, `${id}_idToSelector.json`),
        JSON.stringify(idToSelector, null, 2),
      );
      console.log(`已更新 ${file} 和创建 ${id}_idToSelector.json`);
    } catch (error) {
      console.error(`处理 ${file} 时出错:`, error);
    }
  }
}
// Script entry point: process every stored AXTree file once, reporting either
// completion or the error that aborted the run.
async function main() {
  try {
    await processAXTreeFiles();
    console.log('所有 AXTree 文件处理完成');
  } catch (error) {
    console.error('处理过程中出错:', error);
  }
}

main();