crawlee/crawl_grafana_v18.js
2025-04-23 12:14:50 +08:00

590 lines
25 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { PlaywrightCrawler, RequestQueue } from 'crawlee';
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import { v4 as uuidv4 } from 'uuid';
import { createWriteStream } from 'fs';
import { Log } from 'crawlee';
// Recreate CommonJS-style __filename / __dirname, which ES modules do not provide.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Append-mode log file placed next to this script and sharing its base name.
// The name is derived with path.parse() so that only the file extension is
// swapped. A plain String.replace('.js', '.log') rewrites the first ".js" found
// anywhere in the absolute path, and returns the script's own path unchanged
// when the extension is not ".js" (log lines would then be appended to the
// source file itself).
const logFile = createWriteStream(
  path.join(__dirname, `${path.parse(__filename).name}.log`),
  { flags: 'a' },
);
/**
 * Append one timestamped entry to the log file.
 *
 * @param {string} level - Severity label placed in front of the message (e.g. 'INFO').
 * @param {*} message - Content of the entry; interpolated into the line as-is.
 */
const writeToLog = (level, message) => {
  const timestamp = new Date().toISOString();
  logFile.write(`[${timestamp}] ${level}: ${message}\n`);
};
// Tee Crawlee's `Log` entry points into the log file.
// NOTE(review): the wrappers are installed as static properties of `Log`.
// Confirm that Crawlee really dispatches through these (rather than through
// instance methods), otherwise this section has no effect on Crawlee's output.
const originalLogFunctions = {
  debug: Log.debug,
  info: Log.info,
  warning: Log.warning,
  warn: Log.warn,
  error: Log.error,
};

/**
 * Build a replacement log function that records the message in the log file
 * and then forwards the call to the function it replaces.
 *
 * @param {string} level - Severity label written to the log file.
 * @param {Function|undefined} original - Function being replaced, if there was one.
 * @returns {Function} Wrapper taking the same `(msg, options)` arguments.
 */
const teeLogToFile = (level, original) => function (msg, options) {
  writeToLog(level, msg);
  // Forward with the caller's `this` so the original is not invoked detached,
  // and do nothing when there was no original to forward to (calling
  // `undefined` would throw a TypeError on every log call).
  return typeof original === 'function' ? original.call(this, msg, options) : undefined;
};

Log.debug = teeLogToFile('DEBUG', originalLogFunctions.debug);
Log.info = teeLogToFile('INFO', originalLogFunctions.info);
// `warn` is installed as an alias of `warning`; both forward to the original `warning`.
Log.warning = Log.warn = teeLogToFile('WARN', originalLogFunctions.warning);
Log.error = teeLogToFile('ERROR', originalLogFunctions.error);
// Mirror console output into the log file while still printing to the terminal.
const originalConsoleLog = console.log;
const originalConsoleWarn = console.warn;
const originalConsoleError = console.error;

/**
 * Render one console argument as text for the log file.
 *
 * Errors are rendered with their stack (JSON.stringify turns an Error into
 * "{}"), and objects that JSON.stringify rejects (circular references, BigInt
 * members, ...) fall back to their generic tag instead of throwing from inside
 * console.log.
 *
 * @param {*} arg - Any value passed to console.log / warn / error.
 * @returns {string} Text representation of `arg`.
 */
const formatConsoleArg = (arg) => {
  if (arg instanceof Error) {
    return arg.stack || `${arg.name}: ${arg.message}`;
  }
  if (typeof arg === 'object') {
    try {
      return JSON.stringify(arg);
    } catch {
      return Object.prototype.toString.call(arg);
    }
  }
  return String(arg);
};

/**
 * Build a console replacement that writes a timestamped line to the log file
 * and then delegates to the original console function.
 *
 * @param {string} level - Severity label written to the log file.
 * @param {Function} original - Console function being replaced.
 * @returns {Function} Variadic replacement function.
 */
const teeConsoleToFile = (level, original) => function (...args) {
  const message = args.map(formatConsoleArg).join(' ');
  logFile.write(`[${new Date().toISOString()}] ${level}: ${message}\n`);
  original.apply(console, args);
};

console.log = teeConsoleToFile('INFO', originalConsoleLog);
console.warn = teeConsoleToFile('WARN', originalConsoleWarn);
console.error = teeConsoleToFile('ERROR', originalConsoleError);
// Close the log stream when the process exits.
const closeLogOnExit = () => {
  logFile.end();
};
process.on('exit', closeLogOnExit);

// Global safety net: report uncaught exceptions through console.error
// instead of leaving them unhandled.
const reportUncaughtException = (err) => {
  console.error('未捕获的异常:', err);
};
process.on('uncaughtException', reportUncaughtException);
// Ctrl+C handler: persist the URL -> paths map, close the log and exit.
// NOTE: the handler reads `processedUrlToPaths` and `pathDir`, which are
// declared further down in this module, so it relies on the signal arriving
// after those declarations have run.
process.on('SIGINT', () => {
  console.log('\n检测到 Ctrl+C正在保存数据并退出程序...');
  let exitCode = 0;
  try {
    // Save the final state of processedUrlToPaths.
    const mapObject = Object.fromEntries(processedUrlToPaths);
    fs.writeFileSync(
      path.join(pathDir, 'processedUrlToPaths.json'),
      JSON.stringify(mapObject, null, 2),
    );
    console.log('数据已保存,程序退出!');
  } catch (err) {
    // A failed save must not cancel the shutdown: without this the error would
    // land in the uncaughtException handler and Ctrl+C would no longer stop
    // the process.
    exitCode = 1;
    console.error('保存 processedUrlToPaths.json 失败:', err);
  } finally {
    logFile.end(); // make sure the log file is closed
    process.exit(exitCode); // 0 after a successful save, 1 otherwise
  }
});
// Output locations, all resolved next to this script.
const pagesDir = path.join(__dirname, 'pages'); // saved page HTML
const axtreesDir = path.join(__dirname, 'axtrees'); // accessibility-tree dumps
const screenshotsDir = path.join(__dirname, 'screenshots'); // screenshots
const storageDir = path.join(__dirname, 'storage'); // presumably Crawlee's storage — TODO confirm
const pathDir = path.join(__dirname, 'path'); // processedUrlToPaths.json
const childsDir = path.join(pathDir, 'childs'); // per-request child-link records

// Every run starts from scratch: remove whatever is left from a previous run,
// then recreate the directories. `pathDir` precedes `childsDir` so that the
// parent exists before its sub-directory is created.
const outputDirs = [pagesDir, screenshotsDir, storageDir, pathDir, childsDir, axtreesDir];
for (const dir of outputDirs) {
  if (fs.existsSync(dir)) {
    fs.rmSync(dir, { recursive: true });
  }
}
for (const dir of outputDirs) {
  fs.mkdirSync(dir);
}
console.log("启动爬虫...");
// Final URL -> list of request-ID chains; each chain is one path by which that page was reached.
const processedUrlToPaths = new Map();
// Link URL -> parent-chain length recorded when the link was last accepted for enqueueing.
// The page handler skips a link when the recorded length is not longer than the current path.
const processedUrlToParentChainLength = new Map();
// Final URLs that the page handler has marked as explored; used for de-duplication.
const urlExplored = new Set();
// Number of requests whose path has been recorded so far (a plain counter, incremented by the page handler).
let processedRequestsCount = 0;
(async () => {
// Open the request queue. Clearing storage/request_queues before a first run is recommended.
const requestQueue = await RequestQueue.open();
// Seed request: the crawl root with an empty ancestry. The uniqueKey is the
// URL followed by "_0".
// Alternative seed kept for reference:
//   url: "https://play.grafana.org/a/grafana-app-observability-app"
//   uniqueKey: "https://play.grafana.org/a/grafana-app-observability-app_0"
const seedRequest = {
  url: 'https://play.grafana.org',
  uniqueKey: 'https://play.grafana.org_0',
  userData: { parentChain: [], parentChainIDs: [] },
};
await requestQueue.addRequest(seedRequest);
const crawler = new PlaywrightCrawler({
requestQueue,
keepAlive: true,
navigationTimeoutSecs: 120,
requestHandlerTimeoutSecs: 360,
async handlePageFunction({ page, request, enqueueLinks, log }) {
// Replace the handler's `log` with a wrapper whose info/debug/warning/warn/error
// methods write the message to the log file before delegating to the logger
// that was passed in. Other own properties of that logger are copied as-is.
const originalLog = log;
// Build one tee-ing method: `level` is the label used in the file, `method`
// the name of the method invoked on the original logger.
const teeToFile = (level, method) => (message) => {
  writeToLog(level, message);
  originalLog[method](message);
};
log = {
  ...originalLog,
  info: teeToFile('INFO', 'info'),
  debug: teeToFile('DEBUG', 'debug'),
  warning: teeToFile('WARN', 'warning'),
  warn: teeToFile('WARN', 'warn'),
  error: teeToFile('ERROR', 'error'),
};
// 获取页面最终重定向后的 URL去除参数
const finalUrl = page.url().split('?')[0];
// 如果已经探索过该页面,则直接返回
if (urlExplored.has(finalUrl)) {
log.info(`页面 ${finalUrl} 已探索,跳过当前请求`);
return;
}
// 获取当前页面的完整路径
const fullChain = [...(request.userData.parentChainIDs || []), request.id];
// 获取已经存入processedUrlToPaths的页面路径
const processedPath = processedUrlToPaths.get(finalUrl);
if (processedPath) {
processedPath.push(fullChain);
processedUrlToPaths.set(finalUrl, processedPath);
log.info(`Final URL ${finalUrl} 记录新的可达路径:${fullChain}`);
} else {
processedUrlToPaths.set(finalUrl, [fullChain]);
log.info(`Final URL ${finalUrl} 记录第一个可达路径:${fullChain}`);
}
processedRequestsCount++;
// 每处理10个请求保存processedUrlToPaths
if (processedRequestsCount % 10 === 0) {
const mapObject = Object.fromEntries(processedUrlToPaths);
fs.writeFileSync(path.join(pathDir, 'processedUrlToPaths.json'), JSON.stringify(mapObject, null, 2));
log.info(`已保存 processedUrlToPaths.json`);
}
// Fixed 2560x1440 viewport (1280x720 is the alternative that was tried before).
const viewport = { width: 2560, height: 1440 };
await page.setViewportSize(viewport);
// Navigate to request.url with a 120 s budget, resolving once the DOM has been
// parsed rather than after every resource has loaded.
const navigationOptions = { timeout: 120000, waitUntil: 'domcontentloaded' };
await page.goto(request.url, navigationOptions);
console.log('页面加载完成');
// Give the network up to 30 s to become idle; not getting there is only logged.
const idleOptions = { timeout: 30000 };
try {
  await page.waitForLoadState('networkidle', idleOptions);
} catch (e) {
  console.log('网络未完全空闲,但继续执行:', e.message);
}
// Expand collapsible content before links are collected.
console.log('\n开始展开导航项...');
// aria-label values of the buttons clicked so far; each label is clicked at most once.
// NOTE(review): buttons without an aria-label all share the key `null`, so only
// the first of them is ever clicked — confirm this is intended.
const clickedButtons = new Set();
const EXPAND_SELECTOR = 'button[aria-label*="Expand"], button[aria-label*="Open"], button[aria-expanded="false"]';

/**
 * Run one pass over the buttons that currently match EXPAND_SELECTOR: scroll
 * each into view and click it unless its aria-label was clicked before.
 * A failure on one button is logged and the pass continues with the next.
 *
 * @returns {Promise<boolean>} true when at least one button was clicked in this pass.
 */
const expandButtons = async () => {
  console.log('开始寻找可展开按钮...');
  const candidates = await page.$$(EXPAND_SELECTOR);
  console.log(`找到 ${candidates.length} 个折叠按钮`);
  let clickedAny = false;
  for (const candidate of candidates) {
    try {
      // Scroll through the DOM API inside the page.
      await page.evaluate((element) => {
        element.scrollIntoView({ behavior: 'smooth', block: 'center' });
      }, candidate);
      const label = await candidate.getAttribute('aria-label');
      if (clickedButtons.has(label)) {
        continue;
      }
      console.log(`点击新按钮: ${label}`);
      await candidate.click();
      clickedButtons.add(label);
      clickedAny = true;
      await page.waitForTimeout(200);
    } catch (e) {
      console.log(`点击失败: ${e.message}`);
    }
  }
  return clickedAny;
};

// Repeat passes until one of them clicks nothing new.
for (let iteration = 1; ; iteration++) {
  console.log(`\n${iteration} 次查找...`);
  if (!(await expandButtons())) {
    console.log('没有发现新的可展开按钮,结束查找');
    break;
  }
  console.log(`已点击按钮数量: ${clickedButtons.size}`);
  await page.waitForTimeout(500);
}
// Collect a handle for every <a> element currently in the DOM.
const anchorHandles = await page.$$('a');
console.log(`当前网页${finalUrl}找到 ${anchorHandles.length} 个链接`);

// Only links whose origin is exactly this one are followed.
const TARGET_ORIGIN = 'https://play.grafana.org';
// True when `href` is an absolute URL on TARGET_ORIGIN. Comparing parsed
// origins (instead of a string prefix) keeps look-alike hosts such as
// "https://play.grafana.org.example.com" out; unparsable or empty hrefs count
// as external.
const isTargetUrl = (href) => {
  try {
    return new URL(href).origin === TARGET_ORIGIN;
  } catch {
    return false;
  }
};
// Current window scroll offset, read inside the page.
const readScroll = () => page.evaluate(() => ({ x: window.scrollX, y: window.scrollY }));
// Translate a viewport-relative bounding box into page coordinates.
const toPageBox = (box, offset) => ({
  x: box.x + offset.x,
  y: box.y + offset.y,
  width: box.width,
  height: box.height,
});
// Copy of the ancestry list stored under `key` in this request's userData, with `value` appended.
const extendChain = (key, value) => [...(request.userData[key] || []), value];

// 1-based position of the current anchor among all <a> elements of the page.
let childNum = 0;
// Every anchor found on the page, and the subset that was added to the queue.
const allChildLinks = [];
const queuedChildLinks = [];

for (const anchorHandle of anchorHandles) {
  childNum++;
  // Read href and visible text inside the page. SVG <a> elements expose `href`
  // as an object (with `baseVal`) and have no `innerText`, so both reads are
  // normalised to plain strings instead of throwing.
  const anchorData = await page.evaluate((el) => {
    const href = typeof el.href === 'string' ? el.href : (el.href?.baseVal ?? '');
    const text = (el.innerText ?? el.textContent ?? '').trim();
    return { url: href, text };
  }, anchorHandle);

  // Best-effort geometry before any scrolling; stays null when unavailable.
  let rect = null;
  let scroll = null;
  let pageBoundingBox = null;
  try {
    rect = await anchorHandle.boundingBox();
    if (rect) {
      scroll = await readScroll();
      pageBoundingBox = toPageBox(rect, scroll);
    }
  } catch (err) {
    console.error(`获取元素边界框失败: ${err.message}`);
  }

  // Whether the link stays inside the crawled site.
  const isInLoop = isTargetUrl(anchorData.url);
  // Record every anchor, external ones included, with its pre-scroll geometry.
  const childLink = {
    childNum,
    url: anchorData.url,
    text: anchorData.text,
    isInLoop,
    isInQueue: false, // flipped to true below once the request is queued
    viewportBoundingBox: rect,
    pageBoundingBox: pageBoundingBox,
    scroll: scroll,
  };
  allChildLinks.push(childLink);
  // External links are recorded but not followed.
  if (!isInLoop) continue;

  log.info(`处理链接:${anchorData.text}childNum${childNum}`);
  // Bring the anchor to the middle of the viewport, then allow time for
  // scrolling and rendering to settle.
  await page.evaluate((element) => {
    element.scrollIntoView({ behavior: 'smooth', block: 'center' });
  }, anchorHandle);
  await page.waitForTimeout(500);

  // Geometry after scrolling; anchors without a bounding box are skipped.
  rect = await anchorHandle.boundingBox();
  if (!rect) continue;
  scroll = await readScroll();
  pageBoundingBox = toPageBox(rect, scroll);

  // userData of the child request: the parent's chains extended by this step,
  // plus the position of the clicked element.
  const newUserData = {
    parentChainChildNum: extendChain('parentChainChildNum', childNum),
    parentChain: extendChain('parentChain', request.url),
    parentChainIDs: extendChain('parentChainIDs', request.id),
    parentChainTexts: extendChain('parentChainTexts', anchorData.text),
    parentChainScrolls: extendChain('parentChainScrolls', scroll),
    parentChainViewportBoundingBoxes: extendChain('parentChainViewportBoundingBoxes', rect),
    parentChainPageBoundingBoxes: extendChain('parentChainPageBoundingBoxes', pageBoundingBox),
    elementPosition: {
      viewportBoundingBox: rect,
      pageBoundingBox: pageBoundingBox,
      scroll: scroll,
      text: anchorData.text,
      childNum: childNum,
    },
  };

  // Custom de-duplication: skip the URL when a path of the same or a shorter
  // length has already been recorded for it.
  const urlToParentChainLength = processedUrlToParentChainLength.get(anchorData.url);
  if (urlToParentChainLength !== undefined && urlToParentChainLength <= newUserData.parentChain.length) {
    log.info(`url${anchorData.url} 历史发现路径最短长度为${urlToParentChainLength},当前路径长度为${newUserData.parentChain.length}跳过当前请求url`);
    continue;
  }
  // Remember the (now shortest) path length for this URL.
  processedUrlToParentChainLength.set(anchorData.url, newUserData.parentChain.length);

  // The key combines URL and path length, so the same URL can be queued again
  // when it is reached through a shorter path.
  const uniqueKey = `${anchorData.url}_${newUserData.parentChain.length}`;
  log.info(`请求加入队列:${anchorData.url}uniqueKey${uniqueKey}childNum${childNum}`);
  try {
    await requestQueue.addRequest({
      url: anchorData.url,
      uniqueKey: uniqueKey,
      userData: newUserData,
    });
    childLink.isInQueue = true;
    // Queued links are recorded with their post-scroll geometry.
    queuedChildLinks.push({
      childNum,
      url: anchorData.url,
      text: anchorData.text,
      uniqueKey,
      isInLoop,
      isInQueue: true,
      viewportBoundingBox: rect,
      pageBoundingBox: pageBoundingBox,
      scroll: scroll,
    });
  } catch (err) {
    log.info(`请求已存在或添加失败:${anchorData.url}`);
  }

  // Screenshot of the current viewport with the anchor scrolled into view.
  const screenshotPath = path.join(screenshotsDir, `${request.id}_${childNum}.png`);
  await page.screenshot({ path: screenshotPath, fullPage: false });
  log.info(`已保存第${childNum}个子元素截图:${anchorData.url} -> ${screenshotPath}`);
}
// Per-request JSON record of the anchors found on the page and of the subset
// that was added to the queue.
const childLinksPath = path.join(childsDir, `${request.id}.json`);
const childLinksData = {
  requestId: request.id,
  url: finalUrl,
  totalAnchors: anchorHandles.length, // number of <a> handles collected on the page
  allChildLinks,
  queuedChildLinks,
  totalFound: allChildLinks.length,
  totalQueued: queuedChildLinks.length,
};
const childLinksJson = JSON.stringify(childLinksData, null, 2);
fs.writeFileSync(childLinksPath, childLinksJson);
log.info(`已保存子链接记录:${finalUrl} -> ${childLinksPath}`);

// From here on the page counts as explored.
urlExplored.add(finalUrl);

// Save the page's current HTML under the request id.
const content = await page.content();
const htmlFilePath = path.join(pagesDir, `${request.id}.html`);
fs.writeFileSync(htmlFilePath, content);
log.info(`已保存 HTML${finalUrl} -> ${htmlFilePath}`);

// Accessibility-tree snapshot of the page, requested with interestingOnly disabled.
const snapshotOptions = { interestingOnly: false };
const axTree = await page.accessibility.snapshot(snapshotOptions);
// Next id to hand out; ids are assigned in visiting order, starting at 1.
let idCounter = 1;
// id -> selector string built for the node that received that id.
const idToSelector = {};
// Lines of the textual rendering of the accessibility tree.
const axtreeText = [];
// Roles that always count as clickable (hoisted so it is built only once).
const CLICKABLE_ROLES = ['button', 'link', 'menuitem', 'tab', 'checkbox', 'radio', 'switch', 'option'];

/**
 * Walk an accessibility-tree node and its descendants depth-first, appending
 * one line per listed node to `axtreeText` and one entry per listed node to
 * `idToSelector`.
 *
 * Nodes with role 'InlineTextBox', and unnamed 'none' / 'generic' nodes that
 * are neither focusable nor focused and carry no `expanded` state, are not
 * listed; their children are visited at the same depth with the skipped node
 * as parent.
 *
 * @param {object} node - Accessibility node (`role`, optional `name`, state flags, `children`).
 * @param {number} [depth=0] - Nesting level; the line is indented by `depth * 4` spaces.
 * @param {object|null} [parent=null] - Node whose `children` contains `node`, if any.
 */
function traverse(node, depth = 0, parent = null) {
  const children = node.children ?? [];
  const isInlineText = node.role === 'InlineTextBox';
  const isBareContainer =
    (node.role === 'none' || node.role === 'generic') &&
    !node.name &&
    !node.focusable &&
    !node.focused &&
    node.expanded === undefined;
  if (isInlineText || isBareContainer) {
    for (const child of children) {
      traverse(child, depth, node);
    }
    return;
  }

  const currentId = idCounter++;

  // Selector: optional "<parent> >>" prefix, the node's role, its attribute
  // filters, and finally its 1-based position among the parent's children.
  const selectorParts = [`role=${node.role}`];
  if (node.name) selectorParts.push(`[name="${node.name}"]`);
  if (node.selected) selectorParts.push('[selected=true]');
  if (node.checked !== undefined) selectorParts.push(`[checked=${node.checked}]`);
  if (node.pressed !== undefined) selectorParts.push(`[pressed=${node.pressed}]`);
  if (parent && parent.role !== 'WebArea') {
    const parentName = parent.name ? `[name="${parent.name}"]` : '';
    selectorParts.unshift(`role=${parent.role}${parentName} >>`);
  }
  if (parent?.children) {
    const siblingIndex = parent.children.indexOf(node);
    if (siblingIndex !== -1) {
      selectorParts.push(`:nth-match(${siblingIndex + 1})`);
    }
  }
  idToSelector[currentId] = selectorParts.join(' ');

  // State flags shown in parentheses after the node's role and name.
  const props = [];
  if (node.focusable) props.push('focusable');
  if (node.focused) props.push('focused');
  if (node.expanded !== undefined) props.push(`expanded=${node.expanded}`);
  if (node.selected) props.push('selected');
  if (node.checked !== undefined) props.push(`checked=${node.checked}`);
  if (node.disabled) props.push('disabled');
  if (node.required) props.push('required');
  if (node.pressed !== undefined) props.push(`pressed=${node.pressed}`);
  // A node is clickable when its role is in the list or it is focusable.
  if (CLICKABLE_ROLES.includes(node.role) || node.focusable) props.push('clickable');

  const indent = ' '.repeat(depth * 4);
  const suffix = props.length > 0 ? ` (${props.join(', ')})` : '';
  axtreeText.push(`${indent}[${currentId}] ${node.role} '${node.name || ''}'${suffix}`);

  for (const child of children) {
    traverse(child, depth + 1, node);
  }
}
// The snapshot can be null; fall back to an empty root so that the header line
// and both output files are still produced instead of failing the whole request.
if (!axTree) {
  log.warning(`未获取到 AXTree:${finalUrl}`);
}
const rootNode = axTree ?? {};

// Header line describing the root node.
const rootProps = [];
if (rootNode.focusable) rootProps.push('focusable=True');
if (rootNode.focused) rootProps.push('focused');
const rootSuffix = rootProps.length > 0 ? `, ${rootProps.join(', ')}` : '';
axtreeText.push(`Root${rootNode.role ? rootNode.role : 'WebArea'} '${rootNode.name || ''}'${rootSuffix}`);

// Render the root's children at depth 1, with the root as their parent.
for (const child of rootNode.children ?? []) {
  traverse(child, 1, rootNode);
}

// Text rendering of the tree, one node per line, saved under the request id.
const axtreePath = path.join(axtreesDir, `${request.id}.txt`);
fs.writeFileSync(axtreePath, axtreeText.join('\n'));
log.info(`已保存 axtree${finalUrl} -> ${axtreePath}`);

// id -> selector map saved alongside the text rendering.
const idToSelectorPath = path.join(axtreesDir, `${request.id}_idToSelector.json`);
fs.writeFileSync(idToSelectorPath, JSON.stringify(idToSelector, null, 2));
log.info(`已保存 idToSelector${finalUrl} -> ${idToSelectorPath}`);
// Full-page screenshot, named after the request id.
const fullPageScreenshotPath = path.join(screenshotsDir, `${request.id}_full.png`);
const fullPageOptions = { path: fullPageScreenshotPath, fullPage: true };
await page.screenshot(fullPageOptions);
log.info(`已保存全屏截图:${finalUrl} -> ${fullPageScreenshotPath}`);
console.log(`记录已探索页面到全局集合:${finalUrl}`);
},
async handleFailedRequestFunction({ request }) {
console.error(`请求 ${request.url} 处理失败。`);
},
});
// Run the crawler. If the run fails, make a best-effort save of the URL ->
// paths map and then report the error.
try {
  await crawler.run();
} catch (err) {
  try {
    const mapObject = Object.fromEntries(processedUrlToPaths);
    fs.writeFileSync(path.join(pathDir, 'processedUrlToPaths.json'), JSON.stringify(mapObject, null, 2));
    // `log` only exists inside the request handler; referencing it here threw
    // a ReferenceError that hid the crawl error, so the console is used instead.
    console.log(`已保存 processedUrlToPaths.json`);
  } catch (saveErr) {
    // A failing save must not hide the crawl error reported below.
    console.error('保存 processedUrlToPaths.json 失败:', saveErr);
  }
  console.error("爬虫运行时出错:", err);
}
console.log("爬虫运行结束!");
})();