import { PlaywrightCrawler, RequestQueue } from 'crawlee';
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import { v4 as uuidv4 } from 'uuid';
import { createWriteStream } from 'fs';
import { Log } from 'crawlee';
// Resolve this module's own file path and directory. ES modules do not provide
// `__filename` / `__dirname`, so both are derived from `import.meta.url`.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Append-mode log stream stored next to this script as "<script base name>.log".
// The name is built from the parsed base name instead of a plain
// `.replace('.js', '.log')`: that call rewrites the first ".js" found anywhere in
// the path (a directory name included) and returns the path unchanged when the
// script has a different extension, in which case the log lines would be
// appended to the script's own source file.
const logFile = createWriteStream(
  path.join(__dirname, `${path.basename(__filename, path.extname(__filename))}.log`),
  { flags: 'a' },
);
/**
 * Append one timestamped line to the shared log file.
 *
 * The line has the form "[<ISO-8601 timestamp>] <level>: <message>\n".
 *
 * @param {string} level   Severity label, written verbatim (e.g. 'INFO').
 * @param {*}      message Payload; it is converted to a string while the line is built.
 * @returns {void}
 */
const writeToLog = (level, message) => {
  const timestamp = new Date().toISOString();
  logFile.write('['.concat(timestamp, '] ', level, ': ', message, '\n'));
};
// Mirror calls made through the imported `Log` binding into the log file while
// still delegating to whatever implementation was there before.
// NOTE(review): the wrappers are installed as properties of `Log` itself. If
// `Log` is a class whose logging methods live on its prototype, logger
// instances never consult these properties — confirm against the Crawlee
// logging API before relying on this for file logging.
const originalLogFunctions = {
  debug: Log.debug,
  info: Log.info,
  warning: Log.warning,
  warn: Log.warn,
  error: Log.error,
};
/**
 * Build a replacement logging function.
 *
 * The replacement writes `msg` to the log file under `level` and then forwards
 * the call to `original`. The original runs with the caller's `this` (calling it
 * through `originalLogFunctions` would hand it that holder object as receiver),
 * and it is skipped when nothing callable was captured, so the wrapper cannot
 * fail with "... is not a function".
 *
 * @param {string} level            Label used in the log file.
 * @param {Function|undefined} original Implementation captured before patching.
 * @returns {Function} Wrapper taking `(msg, options)` and returning the original's result.
 */
const mirrorLogMethod = (level, original) =>
  function mirroredLogMethod(msg, options) {
    writeToLog(level, msg);
    return typeof original === 'function' ? original.call(this, msg, options) : undefined;
  };
Log.debug = mirrorLogMethod('DEBUG', originalLogFunctions.debug);
Log.info = mirrorLogMethod('INFO', originalLogFunctions.info);
// Both spellings share one wrapper, which delegates to the original `warning`.
Log.warning = Log.warn = mirrorLogMethod('WARN', originalLogFunctions.warning);
Log.error = mirrorLogMethod('ERROR', originalLogFunctions.error);
// Mirror console output into the log file as well. The untouched console
// methods are kept so the wrappers can still print to the terminal.
const originalConsoleLog = console.log;
const originalConsoleWarn = console.warn;
const originalConsoleError = console.error;
/**
 * Render one console argument as text for the log file.
 *
 * - `Error` objects become their stack (or "name: message"); `JSON.stringify`
 *   would reduce them to "{}" and lose the failure details.
 * - Other objects are JSON-encoded; values that cannot be encoded (circular
 *   references, BigInt members, ...) fall back to their `Object.prototype.toString`
 *   tag instead of throwing out of the console call.
 * - Symbols are stringified explicitly because joining them would throw.
 * - Everything else is returned unchanged and stringified by the join.
 *
 * @param {*} arg A single argument passed to a console method.
 * @returns {*} Text (or primitive) to be joined into the log line.
 */
const formatConsoleArg = (arg) => {
  if (arg instanceof Error) {
    return arg.stack || `${arg.name}: ${arg.message}`;
  }
  if (typeof arg === 'symbol') {
    return arg.toString();
  }
  if (typeof arg !== 'object') {
    return arg;
  }
  try {
    return JSON.stringify(arg);
  } catch {
    return Object.prototype.toString.call(arg);
  }
};
/**
 * Build a console replacement that appends "[timestamp] LEVEL: message" to the
 * log file and then calls the original console method with the same arguments.
 *
 * @param {string} level      Label used in the log file.
 * @param {Function} original Console method captured before patching.
 * @returns {Function} Variadic replacement for the console method.
 */
const mirrorConsoleMethod = (level, original) =>
  function mirroredConsoleMethod(...args) {
    const message = args.map((arg) => formatConsoleArg(arg)).join(' ');
    logFile.write(`[${new Date().toISOString()}] ${level}: ${message}\n`);
    original.apply(console, args);
  };
console.log = mirrorConsoleMethod('INFO', originalConsoleLog);
console.warn = mirrorConsoleMethod('WARN', originalConsoleWarn);
console.error = mirrorConsoleMethod('ERROR', originalConsoleError);
// Close the log stream when the process is about to exit.
function closeLogOnExit() {
  logFile.end();
}
process.on('exit', closeLogOnExit);
// Last-resort handler: report an uncaught exception through console.error
// instead of letting it terminate the process.
function reportUncaughtException(error) {
  console.error('未捕获的异常:', error);
}
process.on('uncaughtException', reportUncaughtException);
// Ctrl+C handler: persist the recorded crawl paths, flush the log, then exit.
// It is registered early; the bindings it reads (`processedUrlToPaths`,
// `pathDir`) are declared further down the file and are only touched when the
// signal actually fires.
process.on('SIGINT', () => {
  console.log('\n检测到 Ctrl+C,正在保存数据并退出程序...');
  let exitCode = 0;
  try {
    // Persist the final processedUrlToPaths map.
    const mapObject = Object.fromEntries(processedUrlToPaths);
    fs.writeFileSync(
      path.join(pathDir, 'processedUrlToPaths.json'),
      JSON.stringify(mapObject, null, 2),
    );
    console.log('数据已保存,程序退出!');
  } catch (err) {
    // A failed save must not keep the process alive: report it and still exit,
    // with a non-zero status so the failure is visible to the caller.
    exitCode = 1;
    console.error('保存 processedUrlToPaths.json 失败:', err);
  }
  // Exit from the stream's end callback so buffered log lines are flushed
  // first; calling process.exit() immediately after end() can drop them.
  const exit = () => process.exit(exitCode);
  logFile.end(exit);
  // Safety net in case the end callback is never invoked (e.g. the stream is
  // already in an error state): exit anyway after a short grace period.
  setTimeout(exit, 2000).unref();
});
// Output locations for saved HTML, accessibility trees, screenshots, crawler
// storage and path records, all directly under this script's directory.
const [pagesDir, axtreesDir, screenshotsDir, storageDir, pathDir] = [
  'pages',
  'axtrees',
  'screenshots',
  'storage',
  'path',
].map((name) => path.join(__dirname, name));
// Per-request child-link records live in a sub-directory of `pathDir`.
const childsDir = path.join(pathDir, 'childs');
// Processing order for both passes below; `pathDir` precedes `childsDir`
// because the latter is nested inside the former and needs it to exist.
const outputDirs = [pagesDir, screenshotsDir, storageDir, pathDir, childsDir, axtreesDir];
// Pass 1: remove whatever an earlier run left behind.
for (const dir of outputDirs) {
  if (fs.existsSync(dir)) {
    fs.rmSync(dir, { recursive: true });
  }
}
// Pass 2: recreate every directory empty.
for (const dir of outputDirs) {
  fs.mkdirSync(dir);
}
console.log("启动爬虫...");
// Maps each final page URL (query string stripped) to a list of request-ID chains; every chain is one path by which that page was reached.
const processedUrlToPaths = new Map();
// Maps a discovered link URL to the parent-chain length stored when it was last enqueued; a link found again via an equal or longer chain is skipped.
const processedUrlToParentChainLength = new Map();
// Final URLs of pages whose links have already been processed; used for de-duplication.
const urlExplored = new Set();
// Running count of handled requests (a plain variable, not an atomic); every 10th increment triggers a save of processedUrlToPaths.
let processedRequestsCount = 0;
(async () => {
// Open the request queue. Clearing the storage/request_queues folder before a
// first run is advisable.
const requestQueue = await RequestQueue.open();
// Seed the crawl with the site root. Its uniqueKey is the URL followed by "_0",
// i.e. the same "<url>_<parent chain length>" form used for links enqueued by
// the page handler. The seed starts with empty parent chains.
// (Alternative seed used previously:
//  https://play.grafana.org/a/grafana-app-observability-app)
const seedUrl = 'https://play.grafana.org';
const seedRequest = {
  url: seedUrl,
  uniqueKey: `${seedUrl}_0`,
  userData: { parentChain: [], parentChainIDs: [] },
};
await requestQueue.addRequest(seedRequest);
// Crawler instance driving the whole run; it consumes the request queue opened above.
const crawler = new PlaywrightCrawler({
requestQueue,
// NOTE(review): presumably keeps the crawler running after the queue drains, in which case crawler.run() only finishes on an external stop (e.g. the SIGINT handler) — confirm against the Crawlee docs.
keepAlive: true,
// Navigation timeout, in seconds.
navigationTimeoutSecs: 120,
// Timeout for one run of the page handler, in seconds.
requestHandlerTimeoutSecs: 360,
// Page handler, called with the page and request of each crawled URL. In order it:
//   1. mirrors its logger into the log file;
//   2. skips pages whose final URL was already explored and records the request-ID chain that led here;
//   3. reloads the page at a large viewport and clicks expandable buttons;
//   4. inspects every <a>, enqueueing in-domain links and taking one screenshot per followed link;
//   5. saves the child-link record, page HTML, accessibility tree and a full-page screenshot.
// `enqueueLinks` is accepted but not used in the handler body.
// NOTE(review): `handlePageFunction` looks like a legacy option name; current Crawlee documentation uses `requestHandler` — confirm the installed version still accepts it.
async handlePageFunction({ page, request, enqueueLinks, log }) {
// Wrap the handler's logger so every message is also appended to the log file.
const originalLog = log;
/**
 * Build a logging function that records `message` under `level` in the log
 * file and then forwards it to a method of the original logger.
 *
 * `fallback` names the method to use when `method` is not implemented. This
 * keeps the `warn` alias usable on a logger that only provides `warning`
 * (NOTE(review): Crawlee's logger appears to expose `warning` but no `warn` —
 * confirm); without it, calling `log.warn(...)` would raise a TypeError.
 *
 * @param {string} level    Label used in the log file.
 * @param {string} method   Preferred method name on the original logger.
 * @param {string} [fallback=method] Method name used when `method` is missing.
 * @returns {(message: *) => void}
 */
const mirrorToFile = (level, method, fallback = method) => (message) => {
  writeToLog(level, message);
  const target = typeof originalLog[method] === 'function' ? method : fallback;
  originalLog[target](message);
};
log = {
  ...originalLog,
  info: mirrorToFile('INFO', 'info'),
  debug: mirrorToFile('DEBUG', 'debug'),
  warning: mirrorToFile('WARN', 'warning'),
  warn: mirrorToFile('WARN', 'warn', 'warning'),
  error: mirrorToFile('ERROR', 'error'),
};
// 获取页面最终重定向后的 URL(去除参数)
const finalUrl = page.url().split('?')[0];
// 如果已经探索过该页面,则直接返回
if (urlExplored.has(finalUrl)) {
log.info(`页面 ${finalUrl} 已探索,跳过当前请求`);
return;
}
// 获取当前页面的完整路径
const fullChain = [...(request.userData.parentChainIDs || []), request.id];
// 获取已经存入processedUrlToPaths的页面路径
const processedPath = processedUrlToPaths.get(finalUrl);
if (processedPath) {
processedPath.push(fullChain);
processedUrlToPaths.set(finalUrl, processedPath);
log.info(`Final URL ${finalUrl} 记录新的可达路径:${fullChain}。`);
} else {
processedUrlToPaths.set(finalUrl, [fullChain]);
log.info(`Final URL ${finalUrl} 记录第一个可达路径:${fullChain}。`);
}
processedRequestsCount++;
// 每处理10个请求保存processedUrlToPaths
if (processedRequestsCount % 10 === 0) {
const mapObject = Object.fromEntries(processedUrlToPaths);
fs.writeFileSync(path.join(pathDir, 'processedUrlToPaths.json'), JSON.stringify(mapObject, null, 2));
log.info(`已保存 processedUrlToPaths.json`);
}
// Use a 2560x1440 viewport (1280x720 was the alternative tried earlier).
const viewport = { width: 2560, height: 1440 };
await page.setViewportSize(viewport);
// Load the request URL, allowing up to 120 s and resolving as soon as the DOM
// content is loaded rather than waiting for every resource.
// NOTE(review): the crawler has presumably navigated to this URL already before
// calling the handler, which would make this a second load — confirm whether
// the reload at the new viewport size is intended.
const navigationOptions = { timeout: 120000, waitUntil: 'domcontentloaded' };
await page.goto(request.url, navigationOptions);
console.log('页面加载完成');
// Let the page settle: wait up to 30 s for the network to go idle. Failing to
// reach idle is reported and otherwise ignored.
try {
  await page.waitForLoadState('networkidle', { timeout: 30000 });
} catch (idleError) {
  console.log('网络未完全空闲,但继续执行:', idleError.message);
}
// Expand collapsible content so that links hidden inside it can be found
// later. This step runs after the network-idle wait.
console.log('\n开始展开导航项...');
// aria-label values of the buttons clicked so far; each value is clicked at most once.
// NOTE(review): buttons without an aria-label presumably all yield the same
// (null) key, so only the first of them would be clicked — confirm this is acceptable.
const clickedButtons = new Set();
/**
 * Run one pass over the page: scroll to every matching button and click those
 * whose aria-label has not been clicked before. A failure on one button is
 * logged and does not stop the pass.
 *
 * @returns {Promise<boolean>} true when at least one button was clicked in this pass.
 */
const expandButtons = async () => {
  console.log('开始寻找可展开按钮...');
  const candidates = await page.$$('button[aria-label*="Expand"], button[aria-label*="Open"], button[aria-expanded="false"]');
  console.log(`找到 ${candidates.length} 个折叠按钮`);
  let clickedAny = false;
  for (const candidate of candidates) {
    try {
      // Scroll via the DOM's own scrollIntoView.
      await page.evaluate((element) => {
        element.scrollIntoView({ behavior: 'smooth', block: 'center' });
      }, candidate);
      const label = await candidate.getAttribute('aria-label');
      if (clickedButtons.has(label)) {
        continue;
      }
      console.log(`点击新按钮: ${label}`);
      await candidate.click();
      clickedButtons.add(label);
      clickedAny = true;
      await page.waitForTimeout(200);
    } catch (e) {
      console.log(`点击失败: ${e.message}`);
    }
  }
  return clickedAny;
};
// Repeat passes until one of them clicks nothing new; clicking may reveal
// further buttons, hence the loop.
for (let iteration = 1; ; iteration += 1) {
  console.log(`\n第 ${iteration} 次查找...`);
  const foundNewButtons = await expandButtons();
  if (!foundNewButtons) {
    console.log('没有发现新的可展开按钮,结束查找');
    break;
  }
  console.log(`已点击按钮数量: ${clickedButtons.size}`);
  await page.waitForTimeout(500);
}
// Collect an element handle for every <a> on the page.
const anchorHandles = await page.$$('a');
console.log(`当前网页${finalUrl}找到 ${anchorHandles.length} 个链接`);
// Every anchor found (in scope or not) and the subset that was enqueued.
const allChildLinks = [];
const queuedChildLinks = [];
// Only links on exactly this origin are followed.
const targetOrigin = 'https://play.grafana.org';
/**
 * Tell whether `url` belongs to the crawl target.
 *
 * The origin is compared after parsing instead of testing a string prefix: a
 * prefix test also accepts look-alike hosts such as
 * "https://play.grafana.org.example.com". Unparsable or empty URLs are out of scope.
 *
 * @param {string} url Absolute URL taken from the anchor.
 * @returns {boolean}
 */
const isTargetOrigin = (url) => {
  try {
    return new URL(url).origin === targetOrigin;
  } catch {
    return false;
  }
};
/**
 * Measure an element at the current scroll position.
 *
 * @param {object} handle Playwright element handle.
 * @returns {Promise<{viewportBoundingBox: object|null, scroll: object|null, pageBoundingBox: object|null}>}
 *   The box relative to the viewport, the window scroll offset, and the box
 *   translated to page coordinates; all three are null when the element has no box.
 */
const measureElement = async (handle) => {
  const viewportBoundingBox = await handle.boundingBox();
  if (!viewportBoundingBox) {
    return { viewportBoundingBox: null, scroll: null, pageBoundingBox: null };
  }
  const scroll = await page.evaluate(() => ({ x: window.scrollX, y: window.scrollY }));
  return {
    viewportBoundingBox,
    scroll,
    pageBoundingBox: {
      x: viewportBoundingBox.x + scroll.x,
      y: viewportBoundingBox.y + scroll.y,
      width: viewportBoundingBox.width,
      height: viewportBoundingBox.height,
    },
  };
};
// 1-based position of the current anchor within anchorHandles.
let childNum = 0;
for (const anchorHandle of anchorHandles) {
  childNum++;
  // Read the anchor's target URL and visible text inside the page.
  const anchorData = await page.evaluate((el) => {
    // HTML anchors expose `href` as an absolute URL string. SVG <a> elements
    // expose an object whose `baseVal` holds the raw attribute and have no
    // `innerText`; handle both so one SVG link cannot abort the whole page.
    let url = '';
    if (typeof el.href === 'string') {
      url = el.href;
    } else if (el.href && typeof el.href.baseVal === 'string' && el.href.baseVal) {
      try {
        url = new URL(el.href.baseVal, document.baseURI).href;
      } catch {
        url = el.href.baseVal;
      }
    }
    const text = (el.innerText ?? el.textContent ?? '').trim();
    return { url, text };
  }, anchorHandle);
  // First measurement, taken before any scrolling; it is best-effort and only
  // feeds the child-link record.
  let initial = { viewportBoundingBox: null, scroll: null, pageBoundingBox: null };
  try {
    initial = await measureElement(anchorHandle);
  } catch (err) {
    console.error(`获取元素边界框失败: ${err.message}`);
  }
  // Whether the link points at the crawl target.
  const isInLoop = isTargetOrigin(anchorData.url);
  // Record every anchor with its full details.
  const childLink = {
    childNum,
    url: anchorData.url,
    text: anchorData.text,
    isInLoop,
    isInQueue: false, // flipped to true below once the link has been enqueued
    viewportBoundingBox: initial.viewportBoundingBox,
    pageBoundingBox: initial.pageBoundingBox,
    scroll: initial.scroll,
  };
  allChildLinks.push(childLink);
  // External links are recorded but not followed.
  if (!isInLoop) continue;
  log.info(`处理链接:${anchorData.text},childNum:${childNum}`);
  // Bring the anchor to the middle of the viewport and give scrolling and
  // rendering some time to finish.
  await page.evaluate((element) => {
    element.scrollIntoView({ behavior: 'smooth', block: 'center' });
  }, anchorHandle);
  await page.waitForTimeout(500);
  // Second measurement, after scrolling; an anchor without a box is skipped.
  const { viewportBoundingBox: rect, scroll, pageBoundingBox } = await measureElement(anchorHandle);
  if (!rect) continue;
  // userData for the child request: each parent chain extended by one entry,
  // plus the details of the element that leads to the child.
  const parentData = request.userData;
  const newUserData = {
    parentChainChildNum: [...(parentData.parentChainChildNum || []), childNum],
    parentChain: [...(parentData.parentChain || []), request.url],
    parentChainIDs: [...(parentData.parentChainIDs || []), request.id],
    parentChainTexts: [...(parentData.parentChainTexts || []), anchorData.text],
    parentChainScrolls: [...(parentData.parentChainScrolls || []), scroll],
    parentChainViewportBoundingBoxes: [...(parentData.parentChainViewportBoundingBoxes || []), rect],
    parentChainPageBoundingBoxes: [...(parentData.parentChainPageBoundingBoxes || []), pageBoundingBox],
    elementPosition: {
      viewportBoundingBox: rect,
      pageBoundingBox: pageBoundingBox,
      scroll: scroll,
      text: anchorData.text,
      childNum: childNum,
    },
  };
  // Custom de-duplication: skip the link when it was already enqueued through
  // a chain that is no longer than the current one.
  const urlToParentChainLength = processedUrlToParentChainLength.get(anchorData.url);
  if (urlToParentChainLength && urlToParentChainLength <= newUserData.parentChain.length) {
    log.info(`url:${anchorData.url} 历史发现路径最短长度为${urlToParentChainLength},当前路径长度为${newUserData.parentChain.length},跳过当前请求url`);
    continue;
  }
  // Remember the (now shortest) chain length for this URL.
  processedUrlToParentChainLength.set(anchorData.url, newUserData.parentChain.length);
  const uniqueKey = `${anchorData.url}_${newUserData.parentChain.length}`;
  log.info(`请求加入队列:${anchorData.url},uniqueKey:${uniqueKey},childNum:${childNum}`);
  try {
    await requestQueue.addRequest({
      url: anchorData.url,
      uniqueKey: uniqueKey,
      userData: newUserData,
    });
    // Reflect the enqueue in the record pushed earlier.
    childLink.isInQueue = true;
    // Keep a full entry for every link that was enqueued.
    queuedChildLinks.push({
      childNum,
      url: anchorData.url,
      text: anchorData.text,
      uniqueKey,
      isInLoop,
      isInQueue: true,
      viewportBoundingBox: rect,
      pageBoundingBox: pageBoundingBox,
      scroll: scroll,
    });
  } catch (err) {
    log.info(`请求已存在或添加失败:${anchorData.url}`);
  }
  // Screenshot of the current viewport, named after the request and anchor number.
  const screenshotPath = path.join(screenshotsDir, `${request.id}_${childNum}.png`);
  await page.screenshot({ path: screenshotPath, fullPage: false });
  log.info(`已保存第${childNum}个子元素截图:${anchorData.url} -> ${screenshotPath}`);
}
// Write the child-link record for this request as pretty-printed JSON.
const childLinksPath = path.join(childsDir, `${request.id}.json`);
const childLinksJson = JSON.stringify(
  {
    requestId: request.id,
    url: finalUrl,
    totalAnchors: anchorHandles.length, // number of <a> handles found on the page
    allChildLinks,
    queuedChildLinks,
    totalFound: allChildLinks.length,
    totalQueued: queuedChildLinks.length,
  },
  null,
  2,
);
fs.writeFileSync(childLinksPath, childLinksJson);
log.info(`已保存子链接记录:${finalUrl} -> ${childLinksPath}`);
// From here on the page counts as explored.
urlExplored.add(finalUrl);
// Save the page's HTML under the request ID.
const html = await page.content();
const htmlFilePath = path.join(pagesDir, `${request.id}.html`);
fs.writeFileSync(htmlFilePath, html);
log.info(`已保存 HTML:${finalUrl} -> ${htmlFilePath}`);
// Take the accessibility-tree snapshot, asking for all nodes rather than only
// the "interesting" ones.
const axTree = await page.accessibility.snapshot({ interestingOnly: false });
// Next id to hand out; every emitted node gets one, starting at 1.
let idCounter = 1;
// Emitted node id -> selector string built for that node.
const idToSelector = {};
// Visited node -> the parent it was visited under (skipped nodes included).
const nodeParents = new Map();
// Lines of the textual accessibility tree, in visiting order.
let axtreeText = [];
/**
 * Walk an accessibility-tree node depth-first, appending one line per emitted
 * node to `axtreeText` and one selector per emitted node to `idToSelector`.
 *
 * Nodes with role 'InlineTextBox', and 'none'/'generic' nodes that have no
 * name, focus state or `expanded` value, are not emitted: their children are
 * visited at the same depth, with the skipped node passed as their parent.
 *
 * @param {object} node            Node with `role` and optional `name`, state flags and `children`.
 * @param {number} [depth=0]       Nesting level; each level indents the line by four spaces.
 * @param {object|null} [parent=null] Node this one was reached from.
 * @returns {void}
 */
function traverse(node, depth = 0, parent = null) {
  nodeParents.set(node, parent);
  const { role, name } = node;
  const children = node.children ?? [];
  const isBareContainer =
    (role === 'none' || role === 'generic') &&
    !name &&
    !node.focusable &&
    !node.focused &&
    node.expanded === undefined;
  if (role === 'InlineTextBox' || isBareContainer) {
    children.forEach((child) => traverse(child, depth, node));
    return;
  }
  const currentId = idCounter;
  idCounter += 1;
  // Selector pieces, in output order: optional parent prefix, role, name,
  // state filters, sibling position. They are joined with single spaces.
  const selectorParts = [];
  if (parent && parent.role !== 'WebArea') {
    const parentName = parent.name ? `[name="${parent.name}"]` : '';
    selectorParts.push(`role=${parent.role}${parentName} >>`);
  }
  selectorParts.push(`role=${role}`);
  if (name) {
    selectorParts.push(`[name="${name}"]`);
  }
  if (node.selected) {
    selectorParts.push('[selected=true]');
  }
  if (node.checked !== undefined) {
    selectorParts.push(`[checked=${node.checked}]`);
  }
  if (node.pressed !== undefined) {
    selectorParts.push(`[pressed=${node.pressed}]`);
  }
  const siblingIndex = parent?.children ? parent.children.indexOf(node) : -1;
  if (siblingIndex !== -1) {
    selectorParts.push(`:nth-match(${siblingIndex + 1})`);
  }
  idToSelector[currentId] = selectorParts.join(' ');
  // A node is reported as clickable when its role is interactive or it is focusable.
  const clickableRoles = ['button', 'link', 'menuitem', 'tab', 'checkbox', 'radio', 'switch', 'option'];
  const isClickable = clickableRoles.includes(role) || node.focusable;
  // Property labels in a fixed order; entries that do not apply are dropped.
  const props = [
    node.focusable && 'focusable',
    node.focused && 'focused',
    node.expanded !== undefined && `expanded=${node.expanded}`,
    node.selected && 'selected',
    node.checked !== undefined && `checked=${node.checked}`,
    node.disabled && 'disabled',
    node.required && 'required',
    node.pressed !== undefined && `pressed=${node.pressed}`,
    isClickable && 'clickable',
  ].filter(Boolean);
  const suffix = props.length > 0 ? ` (${props.join(', ')})` : '';
  axtreeText.push(`${' '.repeat(depth * 4)}[${currentId}] ${role} '${name || ''}'${suffix}`);
  children.forEach((child) => traverse(child, depth + 1, node));
}
// Header line for the root node: "Root<role> '<name>'" followed by its
// focus-related properties, if any.
const rootProps = [
  axTree.focusable ? 'focusable=True' : null,
  axTree.focused ? 'focused' : null,
].filter((entry) => entry !== null);
const rootRole = axTree.role || 'WebArea';
const rootSuffix = rootProps.length > 0 ? `, ${rootProps.join(', ')}` : '';
axtreeText.push(`Root${rootRole} '${axTree.name || ''}'${rootSuffix}`);
// Walk the root's children at depth 1, with the root as their parent.
for (const child of axTree.children ?? []) {
  traverse(child, 1, axTree);
}
// Write the textual tree, one node per line.
const axtreePath = path.join(axtreesDir, `${request.id}.txt`);
fs.writeFileSync(axtreePath, axtreeText.join('\n'));
log.info(`已保存 axtree:${finalUrl} -> ${axtreePath}`);
// Write the id -> selector map next to it.
const idToSelectorPath = path.join(axtreesDir, `${request.id}_idToSelector.json`);
fs.writeFileSync(idToSelectorPath, JSON.stringify(idToSelector, null, 2));
log.info(`已保存 idToSelector:${finalUrl} -> ${idToSelectorPath}`);
// Full-page screenshot, named after the request ID.
const fullPageScreenshotPath = path.join(screenshotsDir, `${request.id}_full.png`);
await page.screenshot({ path: fullPageScreenshotPath, fullPage: true });
log.info(`已保存全屏截图:${finalUrl} -> ${fullPageScreenshotPath}`);
console.log(`记录已探索页面到全局集合:${finalUrl}`);
},
async handleFailedRequestFunction({ request }) {
console.error(`请求 ${request.url} 处理失败。`);
},
});
// Run the crawler. If the run fails, report the error first and then make a
// best-effort final save of the recorded paths.
try {
  await crawler.run();
} catch (err) {
  console.error("爬虫运行时出错:", err);
  try {
    const mapObject = Object.fromEntries(processedUrlToPaths);
    fs.writeFileSync(path.join(pathDir, 'processedUrlToPaths.json'), JSON.stringify(mapObject, null, 2));
    // No `log` binding exists in this scope (it is a parameter of the page
    // handler only), so the confirmation goes through console.log; using
    // `log.info` here raised a ReferenceError that hid the crawler error.
    console.log(`已保存 processedUrlToPaths.json`);
  } catch (saveErr) {
    // A failed save is reported but must not mask the crawler error above.
    console.error('保存 processedUrlToPaths.json 失败:', saveErr);
  }
}
console.log("爬虫运行结束!");
})();