import fs, { createWriteStream } from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import { Log, PlaywrightCrawler, RequestQueue } from 'crawlee';
import { v4 as uuidv4 } from 'uuid';

/**
 * Site explorer built on Crawlee's PlaywrightCrawler.
 *
 * Starting from TARGET_ORIGIN, every visited page is expanded (collapsible
 * buttons are clicked), all of its <a> elements are recorded together with
 * their geometry, in-domain links are enqueued with the chain of parents that
 * led to them, and the page's HTML, accessibility tree and screenshots are
 * written to directories next to this script.
 *
 * WARNING: on start-up the output directories (pages, axtrees, screenshots,
 * storage, path) next to this script are deleted and recreated.
 */

// ---------------------------------------------------------------------------
// Paths and constants
// ---------------------------------------------------------------------------

// ES modules have no built-in __filename/__dirname; derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Only links whose origin equals this value are followed.
const TARGET_ORIGIN = 'https://play.grafana.org';
const PROCESSED_PATHS_FILE = 'processedUrlToPaths.json';
const EXPAND_BUTTON_SELECTOR =
  'button[aria-label*="Expand"], button[aria-label*="Open"], button[aria-expanded="false"]';
const CLICKABLE_ROLES = new Set([
  'button',
  'link',
  'menuitem',
  'tab',
  'checkbox',
  'radio',
  'switch',
  'option',
]);

// ---------------------------------------------------------------------------
// File logging
// ---------------------------------------------------------------------------

// Build "<script name>.log" from the parsed path. A plain
// `__filename.replace('.js', '.log')` would rewrite the first ".js" found
// anywhere in the path and, for ".mjs"/".cjs" scripts, would replace nothing
// and make the log stream append to the script file itself.
const { dir: scriptDir, name: scriptName } = path.parse(__filename);
const logFile = createWriteStream(path.join(scriptDir, `${scriptName}.log`), { flags: 'a' });

// Keep a handle on the unpatched console.error so stream failures can be
// reported without going through the log file again.
const rawConsoleError = console.error.bind(console);
logFile.on('error', (err) => {
  rawConsoleError('log file error:', err.message);
});

/**
 * Appends one timestamped line to the log file.
 *
 * @param {string} level - Severity label written into the line (e.g. 'INFO').
 * @param {*} message - Message; interpolated into the line as-is.
 */
const writeToLog = (level, message) => {
  // Writing after end() would emit an 'error' event on the stream.
  if (logFile.writableEnded) return;
  logFile.write(`[${new Date().toISOString()}] ${level}: ${message}\n`);
};

/**
 * Converts one console argument into the value used for the log line.
 * Errors keep their stack (JSON.stringify would reduce them to "{}"), other
 * objects are JSON-encoded, and anything JSON cannot encode (cycles, BigInt)
 * falls back to String(). Non-objects are returned unchanged.
 *
 * @param {*} arg - A single argument passed to console.log/warn/error.
 * @returns {*} Value to be joined into the log message.
 */
const stringifyLogArg = (arg) => {
  if (arg instanceof Error) return arg.stack ?? `${arg.name}: ${arg.message}`;
  if (typeof arg !== 'object') return arg;
  try {
    return JSON.stringify(arg);
  } catch {
    return String(arg);
  }
};

// Patch properties on the `Log` class object so that calls made through them
// are also written to the log file.
// NOTE(review): whether crawlee routes any logging through these class-level
// properties is not visible from this file; the saved originals are therefore
// only invoked when they really are functions — confirm against crawlee's API.
const originalLogFunctions = {
  debug: Log.debug,
  info: Log.info,
  warning: Log.warning,
  warn: Log.warn,
  error: Log.error,
};

const wrapClassLogFunction = (level, original) =>
  function wrappedLog(msg, options) {
    writeToLog(level, msg);
    return typeof original === 'function' ? original.call(this, msg, options) : undefined;
  };

Log.debug = wrapClassLogFunction('DEBUG', originalLogFunctions.debug);
Log.info = wrapClassLogFunction('INFO', originalLogFunctions.info);
Log.warning = wrapClassLogFunction('WARN', originalLogFunctions.warning);
Log.warn = Log.warning;
Log.error = wrapClassLogFunction('ERROR', originalLogFunctions.error);

// Mirror console output into the log file while still printing to the terminal.
for (const [method, level] of [
  ['log', 'INFO'],
  ['warn', 'WARN'],
  ['error', 'ERROR'],
]) {
  const originalMethod = console[method];
  console[method] = function patchedConsoleMethod(...args) {
    writeToLog(level, args.map(stringifyLogArg).join(' '));
    originalMethod.apply(console, args);
  };
}

/**
 * Wraps the per-request logger handed to the request handler so that every
 * message is written to the log file before being forwarded.
 *
 * @param {object} log - Logger object received from the crawler context.
 * @returns {object} Logger with info/debug/warning/warn/error teed to the file.
 */
function withFileLogging(log) {
  const tee = (level, method) => (message) => {
    writeToLog(level, message);
    log[method](message);
  };
  return {
    ...log,
    info: tee('INFO', 'info'),
    debug: tee('DEBUG', 'debug'),
    warning: tee('WARN', 'warning'),
    warn: tee('WARN', 'warn'),
    error: tee('ERROR', 'error'),
  };
}

// ---------------------------------------------------------------------------
// Output directories (reset on every start)
// ---------------------------------------------------------------------------

const pagesDir = path.join(__dirname, 'pages');
const axtreesDir = path.join(__dirname, 'axtrees');
const screenshotsDir = path.join(__dirname, 'screenshots');
const storageDir = path.join(__dirname, 'storage');
const pathDir = path.join(__dirname, 'path');
const childsDir = path.join(pathDir, 'childs'); // per-page child-link records

// childsDir lives inside pathDir, so removing pathDir removes it as well.
for (const dir of [pagesDir, screenshotsDir, storageDir, pathDir, axtreesDir]) {
  fs.rmSync(dir, { recursive: true, force: true });
}
for (const dir of [pagesDir, screenshotsDir, storageDir, pathDir, childsDir, axtreesDir]) {
  fs.mkdirSync(dir, { recursive: true });
}

console.log("启动爬虫...");

// ---------------------------------------------------------------------------
// Global crawl state
// ---------------------------------------------------------------------------

// Final (post-redirect) URL -> list of request-id chains that reached it.
const processedUrlToPaths = new Map();
// Link URL -> shortest parent-chain length with which it has been enqueued.
const processedUrlToParentChainLength = new Map();
// Final URLs whose page has been fully explored; used for de-duplication.
const urlExplored = new Set();
// Number of requests that reached the path-recording step.
let processedRequestsCount = 0;

/** Writes the current processedUrlToPaths map to path/processedUrlToPaths.json. */
function saveProcessedUrlToPaths() {
  fs.writeFileSync(
    path.join(pathDir, PROCESSED_PATHS_FILE),
    JSON.stringify(Object.fromEntries(processedUrlToPaths), null, 2),
  );
}

// ---------------------------------------------------------------------------
// Process-level handlers
// ---------------------------------------------------------------------------

// Close the log file when the process exits.
process.on('exit', () => {
  if (!logFile.writableEnded) logFile.end();
});

// Global handler: log uncaught exceptions instead of letting them terminate the process.
process.on('uncaughtException', (err) => {
  console.error('未捕获的异常:', err);
});

// Ctrl+C: persist the collected paths, flush the log file, then exit.
process.on('SIGINT', () => {
  console.log('\n检测到 Ctrl+C,正在保存数据并退出程序...');
  try {
    saveProcessedUrlToPaths();
    console.log('数据已保存,程序退出!');
  } catch (err) {
    // A failed save must not prevent the process from exiting.
    console.error(err);
  }
  // Exit only after the stream has flushed so the last lines are not lost.
  logFile.end(() => process.exit(0));
});

// ---------------------------------------------------------------------------
// Page helpers
// ---------------------------------------------------------------------------

/**
 * Tells whether a link belongs to the crawled site (same origin as TARGET_ORIGIN).
 *
 * @param {string} url - Absolute URL taken from an anchor.
 * @returns {boolean} False for other origins and for unparsable values.
 */
function isTargetUrl(url) {
  try {
    return new URL(url).origin === TARGET_ORIGIN;
  } catch {
    return false;
  }
}

/** Scrolls the element behind `handle` to the vertical centre of the viewport. */
async function scrollIntoViewCentered(page, handle) {
  await page.evaluate((element) => {
    element.scrollIntoView({ behavior: 'smooth', block: 'center' });
  }, handle);
}

/**
 * Measures an element in viewport and page coordinates.
 *
 * @param {object} page - Playwright page.
 * @param {object} handle - Element handle to measure.
 * @returns {Promise<{rect: ?object, scroll: ?object, pageBoundingBox: ?object}>}
 *   All three values are null when the element has no bounding box.
 */
async function measureElement(page, handle) {
  const rect = await handle.boundingBox();
  if (!rect) return { rect: null, scroll: null, pageBoundingBox: null };
  const scroll = await page.evaluate(() => ({ x: window.scrollX, y: window.scrollY }));
  const pageBoundingBox = {
    x: rect.x + scroll.x,
    y: rect.y + scroll.y,
    width: rect.width,
    height: rect.height,
  };
  return { rect, scroll, pageBoundingBox };
}

/**
 * Repeatedly clicks expand/open buttons until a pass finds no button whose
 * aria-label has not been clicked yet. Buttons are tracked by aria-label, so
 * each distinct label is clicked at most once.
 *
 * @param {object} page - Playwright page.
 */
async function expandCollapsibleSections(page) {
  console.log('\n开始展开导航项...');
  const clickedButtons = new Set();

  const expandButtons = async () => {
    console.log('开始寻找可展开按钮...');
    const buttons = await page.$$(EXPAND_BUTTON_SELECTOR);
    console.log(`找到 ${buttons.length} 个折叠按钮`);
    let newButtonsFound = false;
    for (const button of buttons) {
      try {
        await scrollIntoViewCentered(page, button);
        const ariaLabel = await button.getAttribute('aria-label');
        if (!clickedButtons.has(ariaLabel)) {
          console.log(`点击新按钮: ${ariaLabel}`);
          await button.click();
          clickedButtons.add(ariaLabel);
          newButtonsFound = true;
          await page.waitForTimeout(200);
        }
      } catch (e) {
        // Best effort: a button that cannot be clicked is logged and skipped.
        console.log(`点击失败: ${e.message}`);
      }
    }
    return newButtonsFound;
  };

  for (let iteration = 1; ; iteration++) {
    console.log(`\n第 ${iteration} 次查找...`);
    const foundNewButtons = await expandButtons();
    if (!foundNewButtons) {
      console.log('没有发现新的可展开按钮,结束查找');
      break;
    }
    console.log(`已点击按钮数量: ${clickedButtons.size}`);
    await page.waitForTimeout(500);
  }
}

/**
 * Records every <a> on the page, enqueues in-domain links that were not
 * already enqueued with an equal or shorter parent chain, and takes a viewport
 * screenshot for every link that reached the enqueue step.
 *
 * @param {object} params
 * @param {object} params.page - Playwright page.
 * @param {object} params.request - Current crawler request.
 * @param {object} params.log - Logger used for progress messages.
 * @param {object} params.requestQueue - Queue that receives new requests.
 * @param {string} params.finalUrl - URL of the current page without query string.
 * @returns {Promise<{totalAnchors: number, allChildLinks: object[], queuedChildLinks: object[]}>}
 */
async function collectChildLinks({ page, request, log, requestQueue, finalUrl }) {
  const anchorHandles = await page.$$('a');
  console.log(`当前网页${finalUrl}找到 ${anchorHandles.length} 个链接`);

  const allChildLinks = [];
  const queuedChildLinks = [];
  const { userData } = request;
  let childNum = 0;

  for (const anchorHandle of anchorHandles) {
    childNum++;

    // SVG <a> elements expose `href` as an object and have no innerText, so
    // both values are normalised to strings inside the page.
    const anchorData = await page.evaluate(
      (el) => ({
        url: typeof el.href === 'string' ? el.href : (el.href?.baseVal ?? ''),
        text: (el.innerText ?? el.textContent ?? '').trim(),
      }),
      anchorHandle,
    );

    // First measurement: position before scrolling, stored in the "all links" record.
    let initial = { rect: null, scroll: null, pageBoundingBox: null };
    try {
      initial = await measureElement(page, anchorHandle);
    } catch (err) {
      console.error(`获取元素边界框失败: ${err.message}`);
    }

    const isInLoop = isTargetUrl(anchorData.url);
    const childLink = {
      childNum,
      url: anchorData.url,
      text: anchorData.text,
      isInLoop,
      isInQueue: false, // updated below once the link has been enqueued
      viewportBoundingBox: initial.rect,
      pageBoundingBox: initial.pageBoundingBox,
      scroll: initial.scroll,
    };
    allChildLinks.push(childLink);

    // External links are recorded but never followed.
    if (!isInLoop) continue;

    log.info(`处理链接:${anchorData.text},childNum:${childNum}`);

    await scrollIntoViewCentered(page, anchorHandle);
    await page.waitForTimeout(500); // give the smooth scroll and rendering time to settle

    // Second measurement: position after scrolling, stored in the request's userData.
    const { rect, scroll, pageBoundingBox } = await measureElement(page, anchorHandle);
    if (!rect) continue;

    // userData for the child request: the parent's chains extended by this hop.
    const newUserData = {
      parentChainChildNum: [...(userData.parentChainChildNum || []), childNum],
      parentChain: [...(userData.parentChain || []), request.url],
      parentChainIDs: [...(userData.parentChainIDs || []), request.id],
      parentChainTexts: [...(userData.parentChainTexts || []), anchorData.text],
      parentChainScrolls: [...(userData.parentChainScrolls || []), scroll],
      parentChainViewportBoundingBoxes: [
        ...(userData.parentChainViewportBoundingBoxes || []),
        rect,
      ],
      parentChainPageBoundingBoxes: [
        ...(userData.parentChainPageBoundingBoxes || []),
        pageBoundingBox,
      ],
      elementPosition: {
        viewportBoundingBox: rect,
        pageBoundingBox,
        scroll,
        text: anchorData.text,
        childNum,
      },
    };

    // Custom de-duplication: skip the link when it was already enqueued with a
    // parent chain that is not longer than the current one.
    const urlToParentChainLength = processedUrlToParentChainLength.get(anchorData.url);
    if (
      urlToParentChainLength !== undefined &&
      urlToParentChainLength <= newUserData.parentChain.length
    ) {
      log.info(`url:${anchorData.url} 历史发现路径最短长度为${urlToParentChainLength},当前路径长度为${newUserData.parentChain.length},跳过当前请求url`);
      continue;
    }
    processedUrlToParentChainLength.set(anchorData.url, newUserData.parentChain.length);

    const uniqueKey = `${anchorData.url}_${newUserData.parentChain.length}`;
    log.info(`请求加入队列:${anchorData.url},uniqueKey:${uniqueKey},childNum:${childNum}`);
    try {
      await requestQueue.addRequest({
        url: anchorData.url,
        uniqueKey,
        userData: newUserData,
      });
      childLink.isInQueue = true;
      queuedChildLinks.push({
        childNum,
        url: anchorData.url,
        text: anchorData.text,
        uniqueKey,
        isInLoop,
        isInQueue: true,
        viewportBoundingBox: rect,
        pageBoundingBox,
        scroll,
      });
    } catch (err) {
      log.info(`请求已存在或添加失败:${anchorData.url}`);
    }

    // Screenshot of the current viewport with the link scrolled into view.
    const screenshotPath = path.join(screenshotsDir, `${request.id}_${childNum}.png`);
    await page.screenshot({ path: screenshotPath, fullPage: false });
    log.info(`已保存第${childNum}个子元素截图:${anchorData.url} -> ${screenshotPath}`);
  }

  return { totalAnchors: anchorHandles.length, allChildLinks, queuedChildLinks };
}

/**
 * Renders an accessibility snapshot as indented text and builds a selector for
 * every numbered node.
 *
 * Nodes with role 'InlineTextBox', and nodes with role 'none'/'generic' that
 * have no name, focus state or expanded state, are not printed; their children
 * are visited at the same depth instead.
 *
 * @param {object} axTree - Root node of the snapshot (role, name, children, ...).
 * @returns {{lines: string[], idToSelector: Object<number, string>}}
 */
function renderAxTree(axTree) {
  const lines = [];
  const idToSelector = {};
  let idCounter = 1;

  const visitChildren = (node, depth) => {
    for (const child of node.children ?? []) {
      traverse(child, depth, node);
    }
  };

  function traverse(node, depth, parent) {
    const isStructural = node.role === 'none' || node.role === 'generic';
    const isBare =
      !node.name && !node.focusable && !node.focused && node.expanded === undefined;
    if (node.role === 'InlineTextBox' || (isStructural && isBare)) {
      visitChildren(node, depth);
      return;
    }

    const currentId = idCounter++;

    const selectorParts = [`role=${node.role}`];
    if (node.name) selectorParts.push(`[name="${node.name}"]`);
    if (node.selected) selectorParts.push('[selected=true]');
    if (node.checked !== undefined) selectorParts.push(`[checked=${node.checked}]`);
    if (node.pressed !== undefined) selectorParts.push(`[pressed=${node.pressed}]`);
    if (parent && parent.role !== 'WebArea') {
      let parentSelector = `role=${parent.role}`;
      if (parent.name) parentSelector += `[name="${parent.name}"]`;
      selectorParts.unshift(`${parentSelector} >>`);
    }
    if (parent?.children) {
      const siblingIndex = parent.children.indexOf(node);
      if (siblingIndex !== -1) selectorParts.push(`:nth-match(${siblingIndex + 1})`);
    }
    idToSelector[currentId] = selectorParts.join(' ');

    const props = [];
    if (node.focusable) props.push('focusable');
    if (node.focused) props.push('focused');
    if (node.expanded !== undefined) props.push(`expanded=${node.expanded}`);
    if (node.selected) props.push('selected');
    if (node.checked !== undefined) props.push(`checked=${node.checked}`);
    if (node.disabled) props.push('disabled');
    if (node.required) props.push('required');
    if (node.pressed !== undefined) props.push(`pressed=${node.pressed}`);
    // A node counts as clickable when its role is interactive or it is focusable.
    if (CLICKABLE_ROLES.has(node.role) || node.focusable) props.push('clickable');

    const indent = ' '.repeat(depth * 4);
    lines.push(`${indent}[${currentId}] ${node.role} '${node.name || ''}'${props.length > 0 ? ' (' + props.join(', ') + ')' : ''}`);

    visitChildren(node, depth + 1);
  }

  // Root line first, then every child of the root at depth 1.
  const rootProps = [];
  if (axTree.focusable) rootProps.push('focusable=True');
  if (axTree.focused) rootProps.push('focused');
  lines.push(`Root${axTree.role ? axTree.role : 'WebArea'} '${axTree.name || ''}'${rootProps.length > 0 ? ', ' + rootProps.join(', ') : ''}`);
  visitChildren(axTree, 1);

  return { lines, idToSelector };
}

/**
 * Records the request-id chain that reached `finalUrl` and periodically
 * persists the whole map (every 10 processed requests).
 */
function recordReachablePath({ request, finalUrl, log }) {
  const fullChain = [...(request.userData.parentChainIDs || []), request.id];
  const processedPath = processedUrlToPaths.get(finalUrl);
  if (processedPath) {
    processedPath.push(fullChain);
    log.info(`Final URL ${finalUrl} 记录新的可达路径:${fullChain}。`);
  } else {
    processedUrlToPaths.set(finalUrl, [fullChain]);
    log.info(`Final URL ${finalUrl} 记录第一个可达路径:${fullChain}。`);
  }

  processedRequestsCount++;
  if (processedRequestsCount % 10 === 0) {
    saveProcessedUrlToPaths();
    log.info(`已保存 processedUrlToPaths.json`);
  }
}

/**
 * Builds the request handler bound to a request queue.
 *
 * @param {object} requestQueue - Queue that receives discovered links.
 * @returns {Function} Handler for PlaywrightCrawler's `requestHandler` option.
 */
function createRequestHandler(requestQueue) {
  return async function handlePage({ page, request, log: crawlerLog }) {
    const log = withFileLogging(crawlerLog);

    // URL after redirects, without the query string.
    const finalUrl = page.url().split('?')[0];
    if (urlExplored.has(finalUrl)) {
      log.info(`页面 ${finalUrl} 已探索,跳过当前请求`);
      return;
    }

    recordReachablePath({ request, finalUrl, log });

    // Large viewport, then load the page again at that size. Only
    // 'domcontentloaded' is awaited so slow sub-resources do not block.
    await page.setViewportSize({ width: 2560, height: 1440 });
    await page.goto(request.url, { timeout: 120000, waitUntil: 'domcontentloaded' });
    console.log('页面加载完成');

    // Give the network a chance to go idle, but carry on if it never does.
    try {
      await page.waitForLoadState('networkidle', { timeout: 30000 });
    } catch (e) {
      console.log('网络未完全空闲,但继续执行:', e.message);
    }

    await expandCollapsibleSections(page);

    const { totalAnchors, allChildLinks, queuedChildLinks } = await collectChildLinks({
      page,
      request,
      log,
      requestQueue,
      finalUrl,
    });

    // Child-link record for this page.
    const childLinksData = {
      requestId: request.id,
      url: finalUrl,
      totalAnchors,
      allChildLinks,
      queuedChildLinks,
      totalFound: allChildLinks.length,
      totalQueued: queuedChildLinks.length,
    };
    const childLinksPath = path.join(childsDir, `${request.id}.json`);
    fs.writeFileSync(childLinksPath, JSON.stringify(childLinksData, null, 2));
    log.info(`已保存子链接记录:${finalUrl} -> ${childLinksPath}`);

    // Mark the page as explored so later requests for it return early.
    urlExplored.add(finalUrl);

    // Page HTML, named after the request id.
    const content = await page.content();
    const htmlFilePath = path.join(pagesDir, request.id + '.html');
    fs.writeFileSync(htmlFilePath, content);
    log.info(`已保存 HTML:${finalUrl} -> ${htmlFilePath}`);

    // Accessibility tree; the snapshot may be null, in which case an empty
    // root is rendered instead of crashing.
    const axTree = await page.accessibility.snapshot({ interestingOnly: false });
    const { lines, idToSelector } = renderAxTree(axTree ?? {});

    const axtreePath = path.join(axtreesDir, `${request.id}.txt`);
    fs.writeFileSync(axtreePath, lines.join('\n'));
    log.info(`已保存 axtree:${finalUrl} -> ${axtreePath}`);

    const idToSelectorPath = path.join(axtreesDir, `${request.id}_idToSelector.json`);
    fs.writeFileSync(idToSelectorPath, JSON.stringify(idToSelector, null, 2));
    log.info(`已保存 idToSelector:${finalUrl} -> ${idToSelectorPath}`);

    // Full-page screenshot, named after the request id.
    const fullPageScreenshotPath = path.join(screenshotsDir, `${request.id}_full.png`);
    await page.screenshot({ path: fullPageScreenshotPath, fullPage: true });
    log.info(`已保存全屏截图:${finalUrl} -> ${fullPageScreenshotPath}`);

    console.log(`记录已探索页面到全局集合:${finalUrl}`);
  };
}

// ---------------------------------------------------------------------------
// Entry point
// ---------------------------------------------------------------------------

(async () => {
  try {
    // Open the default request queue (storage was cleared above).
    const requestQueue = await RequestQueue.open();

    // Seed request; its uniqueKey follows the same "<url>_<depth>" scheme as child links.
    await requestQueue.addRequest({
      url: TARGET_ORIGIN,
      uniqueKey: `${TARGET_ORIGIN}_0`,
      userData: { parentChain: [], parentChainIDs: [] },
    });

    const crawler = new PlaywrightCrawler({
      requestQueue,
      keepAlive: true,
      navigationTimeoutSecs: 120,
      requestHandlerTimeoutSecs: 360,
      requestHandler: createRequestHandler(requestQueue),
      async failedRequestHandler({ request }) {
        console.error(`请求 ${request.url} 处理失败。`);
      },
    });

    await crawler.run();
  } catch (err) {
    // Persist what has been collected so far; a failing save must not hide
    // the original error.
    try {
      saveProcessedUrlToPaths();
      console.log(`已保存 processedUrlToPaths.json`);
    } catch (saveErr) {
      console.error(saveErr);
    }
    console.error("爬虫运行时出错:", err);
  }
  console.log("爬虫运行结束!");
})();