// Source listing metadata: 590 lines, 25 KiB, JavaScript.
import { PlaywrightCrawler, RequestQueue } from 'crawlee';
|
||
import fs from 'fs';
|
||
import path from 'path';
|
||
import { fileURLToPath } from 'url';
|
||
import { v4 as uuidv4 } from 'uuid';
|
||
import { createWriteStream } from 'fs';
|
||
import { Log } from 'crawlee';
|
||
|
||
// Resolve this module's own location; ES modules have no built-in __filename/__dirname.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * Build the log-file path for a script: same directory and base name, `.log` extension.
 *
 * Only the trailing extension is swapped. A plain string replace of '.js' would rewrite
 * the first '.js' anywhere in the path (for example inside a directory name) and would
 * leave a `.mjs` script path untouched, which makes the log stream append to the script.
 *
 * @param {string} scriptPath - Path of the running script.
 * @returns {string} Path of the sibling `.log` file.
 */
const toLogPath = (scriptPath) => {
  const { dir, name } = path.parse(scriptPath);
  return path.join(dir, `${name}.log`);
};

// Append mode: every run adds to the same log file next to the script.
const logFile = createWriteStream(toLogPath(__filename), { flags: 'a' });
|
||
|
||
/**
 * Append one timestamped entry to the log file.
 *
 * @param {string} level - Severity label, written verbatim (e.g. 'INFO').
 * @param {*} message - Interpolated into the line with template-literal string conversion.
 */
function writeToLog(level, message) {
  const timestamp = new Date().toISOString();
  logFile.write(`[${timestamp}] ${level}: ${message}\n`);
}
|
||
|
||
// Mirror Crawlee logger calls into the log file.
//
// The logging methods are instance methods, so they live on `Log.prototype`. Reading them
// off the `Log` constructor yields `undefined`, and replacements assigned to the constructor
// are never invoked by logger instances. Patch the prototype instead, and forward with the
// caller's `this` so each logger instance keeps its own options.
//
// NOTE(review): output that the logger prints through `console` may also reach the log file
// via the console wrappers in this file, so a message can appear twice — confirm whether
// that duplication is acceptable.
const originalLogFunctions = {};

// Logger method name -> level label written to the file.
const LOG_METHOD_LEVELS = {
  debug: 'DEBUG',
  info: 'INFO',
  warning: 'WARN',
  warn: 'WARN',
  error: 'ERROR',
};

for (const [methodName, levelLabel] of Object.entries(LOG_METHOD_LEVELS)) {
  const original = Log.prototype[methodName];
  // Skip names this Log implementation does not provide; a wrapper with nothing to
  // forward to would only throw when called.
  if (typeof original !== 'function') continue;

  originalLogFunctions[methodName] = original;
  Log.prototype[methodName] = function (msg, ...rest) {
    writeToLog(levelLabel, msg);
    return original.call(this, msg, ...rest);
  };
}
|
||
|
||
// Tee console output into the log file while still printing to the terminal.
const originalConsoleLog = console.log;
const originalConsoleWarn = console.warn;
const originalConsoleError = console.error;

/**
 * Render console arguments as one space-separated line for the log file.
 *
 * - Error instances are written with their stack (JSON.stringify would reduce them to `{}`).
 * - Other objects are JSON-encoded; values JSON cannot encode (e.g. circular structures)
 *   fall back to their `Object.prototype.toString` tag instead of throwing inside console.*.
 * - Primitives are passed through and stringified by `join`.
 *
 * @param {Array<*>} args - Arguments originally passed to a console method.
 * @returns {string} Single-line representation.
 */
const formatLogArgs = (args) =>
  args
    .map((arg) => {
      if (arg instanceof Error) return arg.stack || `${arg.name}: ${arg.message}`;
      if (typeof arg === 'symbol') return String(arg);
      if (typeof arg !== 'object') return arg;
      try {
        return JSON.stringify(arg);
      } catch {
        return Object.prototype.toString.call(arg);
      }
    })
    .join(' ');

/**
 * Build a console replacement that logs to the file first, then prints as before.
 *
 * @param {string} level - Level label written to the file.
 * @param {Function} printToConsole - The original console method to delegate to.
 * @returns {Function} Replacement console method.
 */
const teeToLogFile = (level, printToConsole) =>
  function (...args) {
    logFile.write(`[${new Date().toISOString()}] ${level}: ${formatLogArgs(args)}\n`);
    printToConsole.apply(console, args);
  };

console.log = teeToLogFile('INFO', originalConsoleLog);
console.warn = teeToLogFile('WARN', originalConsoleWarn);
console.error = teeToLogFile('ERROR', originalConsoleError);
|
||
|
||
// 添加进程退出时的日志文件关闭处理
|
||
process.on('exit', () => {
|
||
logFile.end();
|
||
});
|
||
|
||
// Last-resort handler: report uncaught exceptions through console.error
// instead of letting them terminate the process.
function reportUncaughtException(err) {
  console.error('未捕获的异常:', err);
}
process.on('uncaughtException', reportUncaughtException);
|
||
|
||
// Ctrl+C handler: persist the collected paths, flush the log, then exit.
//
// The save is wrapped in try/catch because an exception thrown here would land in the
// process-wide uncaughtException handler, which only logs, so the process would keep
// running and Ctrl+C could no longer stop it. The exit is deferred to the log stream's
// end callback so buffered log lines are flushed before the process terminates.
process.on('SIGINT', () => {
  let exitCode = 0;
  try {
    console.log('\n检测到 Ctrl+C,正在保存数据并退出程序...');
    // Save the final processedUrlToPaths snapshot.
    const mapObject = Object.fromEntries(processedUrlToPaths);
    fs.writeFileSync(
      path.join(pathDir, 'processedUrlToPaths.json'),
      JSON.stringify(mapObject, null, 2),
    );
    console.log('数据已保存,程序退出!');
  } catch (err) {
    exitCode = 1; // signal that the final snapshot could not be written
    console.error('保存 processedUrlToPaths.json 失败:', err);
  } finally {
    logFile.end(() => process.exit(exitCode));
  }
});
|
||
|
||
|
||
// Output directories, all located next to this script.
const pagesDir = path.join(__dirname, 'pages');             // saved page HTML
const axtreesDir = path.join(__dirname, 'axtrees');         // accessibility-tree dumps
const screenshotsDir = path.join(__dirname, 'screenshots'); // per-link and full-page screenshots
const storageDir = path.join(__dirname, 'storage');
const pathDir = path.join(__dirname, 'path');               // processedUrlToPaths.json
const childsDir = path.join(pathDir, 'childs');             // per-page child-link records

// Start every run from a clean slate. `force: true` makes removal a no-op for a missing
// directory, so no separate existence check is needed. `childsDir` lives inside `pathDir`
// and is removed together with it.
for (const dir of [pagesDir, screenshotsDir, storageDir, pathDir, axtreesDir]) {
  fs.rmSync(dir, { recursive: true, force: true });
}

// `recursive: true` creates missing parents, so `path/childs` does not depend on `path`
// having been created by an earlier statement.
for (const dir of [pagesDir, screenshotsDir, storageDir, pathDir, childsDir, axtreesDir]) {
  fs.mkdirSync(dir, { recursive: true });
}

console.log("启动爬虫...");
|
||
|
||
// Every route by which each final URL was reached:
// final URL -> list of request-id chains (one chain per route).
const processedUrlToPaths = new Map();

// Shortest parent-chain length recorded so far for each discovered link URL.
// Used to skip enqueueing a URL again over an equal or longer route.
const processedUrlToParentChainLength = new Map();

// Final URLs whose pages have been fully explored; used for de-duplication.
const urlExplored = new Set();

// Count of handled requests that recorded a path; drives the periodic save.
// A plain counter, not an atomic one.
let processedRequestsCount = 0;
|
||
|
||
|
||
(async () => {
|
||
// Open the request queue. The original note recommends clearing the
// storage/request_queues folder before a first run.
const requestQueue = await RequestQueue.open();

// Seed the crawl with its entry point. The uniqueKey is the URL plus a "_0" suffix,
// and both parent chains start empty because the root has no ancestors.
const seedRequest = {
  url: 'https://play.grafana.org',
  uniqueKey: 'https://play.grafana.org_0',
  userData: { parentChain: [], parentChainIDs: [] },
};
await requestQueue.addRequest(seedRequest);
|
||
|
||
const crawler = new PlaywrightCrawler({
|
||
requestQueue,
|
||
keepAlive: true,
|
||
navigationTimeoutSecs: 120,
|
||
requestHandlerTimeoutSecs: 360,
|
||
async handlePageFunction({ page, request, enqueueLinks, log }) {
|
||
// Wrap the handler's logger so every message is also appended to the log file.
const originalLog = log;

// Build a wrapper that writes to the file, then forwards to the underlying logger with
// its own `this`, passing along any extra arguments (e.g. a data object).
const teeLog = (level, method) => (message, ...rest) => {
  writeToLog(level, message);
  return method.call(originalLog, message, ...rest);
};

// `warn` is offered as a convenience alias. When the underlying logger has no `warn`
// method, fall back to `warning` instead of calling `undefined`.
const warnMethod = typeof originalLog.warn === 'function' ? originalLog.warn : originalLog.warning;

log = {
  ...originalLog,
  info: teeLog('INFO', originalLog.info),
  debug: teeLog('DEBUG', originalLog.debug),
  warning: teeLog('WARN', originalLog.warning),
  warn: teeLog('WARN', warnMethod),
  error: teeLog('ERROR', originalLog.error),
};
|
||
|
||
// 获取页面最终重定向后的 URL(去除参数)
|
||
const finalUrl = page.url().split('?')[0];
|
||
|
||
// 如果已经探索过该页面,则直接返回
|
||
if (urlExplored.has(finalUrl)) {
|
||
log.info(`页面 ${finalUrl} 已探索,跳过当前请求`);
|
||
return;
|
||
}
|
||
|
||
// 获取当前页面的完整路径
|
||
const fullChain = [...(request.userData.parentChainIDs || []), request.id];
|
||
// 获取已经存入processedUrlToPaths的页面路径
|
||
const processedPath = processedUrlToPaths.get(finalUrl);
|
||
if (processedPath) {
|
||
processedPath.push(fullChain);
|
||
processedUrlToPaths.set(finalUrl, processedPath);
|
||
log.info(`Final URL ${finalUrl} 记录新的可达路径:${fullChain}。`);
|
||
} else {
|
||
processedUrlToPaths.set(finalUrl, [fullChain]);
|
||
log.info(`Final URL ${finalUrl} 记录第一个可达路径:${fullChain}。`);
|
||
}
|
||
|
||
processedRequestsCount++;
|
||
|
||
// 每处理10个请求保存processedUrlToPaths
|
||
if (processedRequestsCount % 10 === 0) {
|
||
const mapObject = Object.fromEntries(processedUrlToPaths);
|
||
fs.writeFileSync(path.join(pathDir, 'processedUrlToPaths.json'), JSON.stringify(mapObject, null, 2));
|
||
log.info(`已保存 processedUrlToPaths.json`);
|
||
}
|
||
|
||
// Fix the viewport size used for screenshots and bounding boxes.
const viewport = { width: 2560, height: 1440 };
await page.setViewportSize(viewport);

// Navigate with a 120-second budget, resolving once the DOM is parsed
// rather than waiting for every resource to load.
const navigationOptions = {
  timeout: 120000,
  waitUntil: 'domcontentloaded',
};
await page.goto(request.url, navigationOptions);

console.log('页面加载完成');

// Best effort: give the network up to 30 seconds to go idle, and carry on if it does not.
try {
  await page.waitForLoadState('networkidle', { timeout: 30000 });
} catch (e) {
  console.log('网络未完全空闲,但继续执行:', e.message);
}

// Next step: expand all collapsible content so hidden links become visible.
console.log('\n开始展开导航项...');
// aria-labels of buttons already clicked, shared across expansion passes.
const clickedButtons = new Set();
|
||
|
||
/**
 * Run one expansion pass: click every matching button whose aria-label has not been
 * clicked yet. Matching buttons are those labelled "Expand"/"Open" or with
 * aria-expanded="false". A failure on one button is logged and the pass continues.
 *
 * NOTE(review): buttons are de-duplicated by aria-label only, so all buttons without an
 * aria-label share the `null` key and only the first of them is ever clicked — confirm
 * whether that is intended.
 *
 * @returns {Promise<boolean>} true when at least one new button was clicked in this pass.
 */
const expandButtons = async () => {
  console.log('开始寻找可展开按钮...');
  const candidates = await page.$$('button[aria-label*="Expand"], button[aria-label*="Open"], button[aria-expanded="false"]');
  console.log(`找到 ${candidates.length} 个折叠按钮`);

  let clickedAny = false;

  for (const candidate of candidates) {
    try {
      // Bring the button into view before reading or clicking it.
      await page.evaluate((element) => {
        element.scrollIntoView({ behavior: 'smooth', block: 'center' });
      }, candidate);

      const label = await candidate.getAttribute('aria-label');
      if (clickedButtons.has(label)) continue;

      console.log(`点击新按钮: ${label}`);
      await candidate.click();
      clickedButtons.add(label);
      clickedAny = true;
      await page.waitForTimeout(200);
    } catch (e) {
      console.log(`点击失败: ${e.message}`);
    }
  }

  return clickedAny;
};
|
||
|
||
// Repeat expansion passes until one clicks nothing new.
for (let iteration = 1; ; iteration++) {
  console.log(`\n第 ${iteration} 次查找...`);

  if (!(await expandButtons())) {
    console.log('没有发现新的可展开按钮,结束查找');
    break;
  }

  console.log(`已点击按钮数量: ${clickedButtons.size}`);
  await page.waitForTimeout(500);
}
|
||
|
||
// Walk every <a> on the page: record it, and for in-scope links measure its position,
// enqueue it (unless a route of equal or shorter length is already known) and take a
// viewport screenshot with the link scrolled into view.

// Origin that defines the crawl scope.
const TARGET_ORIGIN = 'https://play.grafana.org';

// True when `url` parses and has exactly the target origin. Comparing origins (rather
// than a string prefix) rejects look-alike hosts such as "https://play.grafana.org.example".
const isTargetUrl = (url) => {
  try {
    return new URL(url).origin === TARGET_ORIGIN;
  } catch {
    return false; // empty or unparsable href
  }
};

// Measure an element handle: viewport-relative box, current scroll offset and the derived
// page-relative box. All three are null when the element has no bounding box.
const measureElement = async (handle) => {
  const viewportBox = await handle.boundingBox();
  if (!viewportBox) return { rect: null, scroll: null, pageBoundingBox: null };
  const offset = await page.evaluate(() => ({ x: window.scrollX, y: window.scrollY }));
  return {
    rect: viewportBox,
    scroll: offset,
    pageBoundingBox: {
      x: viewportBox.x + offset.x,
      y: viewportBox.y + offset.y,
      width: viewportBox.width,
      height: viewportBox.height,
    },
  };
};

const anchorHandles = await page.$$('a');
console.log(`当前网页${finalUrl}找到 ${anchorHandles.length} 个链接`);

// 1-based position of the anchor among all anchors on the page.
let childNum = 0;
// All anchors found, and the subset that was added to the queue.
const allChildLinks = [];
const queuedChildLinks = [];

for (const anchorHandle of anchorHandles) {
  childNum++;

  // Read href and visible text. `href` is not a string on SVG anchors and `innerText`
  // may be missing there, so both are normalised instead of assumed.
  const anchorData = await page.evaluate((el) => ({
    url: typeof el.href === 'string' ? el.href : String(el.href?.baseVal ?? ''),
    text: (el.innerText ?? el.textContent ?? '').trim(),
  }), anchorHandle);

  // First measurement, at the element's current position; failures are only logged.
  let rect = null;
  let scroll = null;
  let pageBoundingBox = null;
  try {
    ({ rect, scroll, pageBoundingBox } = await measureElement(anchorHandle));
  } catch (err) {
    console.error(`获取元素边界框失败: ${err.message}`);
  }

  const isInLoop = isTargetUrl(anchorData.url);

  // Record every anchor, in scope or not.
  const childLink = {
    childNum,
    url: anchorData.url,
    text: anchorData.text,
    isInLoop,
    isInQueue: false, // flipped below once the request is enqueued
    viewportBoundingBox: rect,
    pageBoundingBox: pageBoundingBox,
    scroll: scroll,
  };
  allChildLinks.push(childLink);

  // External links are recorded but not followed.
  if (!isInLoop) continue;

  log.info(`处理链接:${anchorData.text},childNum:${childNum}`);

  // Centre the link in the viewport, then give scrolling and rendering time to settle.
  await page.evaluate((element) => {
    element.scrollIntoView({ behavior: 'smooth', block: 'center' });
  }, anchorHandle);
  await page.waitForTimeout(500);

  // Second measurement, after scrolling; links without a box are not enqueued.
  ({ rect, scroll, pageBoundingBox } = await measureElement(anchorHandle));
  if (!rect) continue;

  // userData for the child request: the parent's chains extended by this hop.
  const parentData = request.userData;
  const newUserData = {
    parentChainChildNum: [...(parentData.parentChainChildNum || []), childNum],
    parentChain: [...(parentData.parentChain || []), request.url],
    parentChainIDs: [...(parentData.parentChainIDs || []), request.id],
    parentChainTexts: [...(parentData.parentChainTexts || []), anchorData.text],
    parentChainScrolls: [...(parentData.parentChainScrolls || []), scroll],
    parentChainViewportBoundingBoxes: [...(parentData.parentChainViewportBoundingBoxes || []), rect],
    parentChainPageBoundingBoxes: [...(parentData.parentChainPageBoundingBoxes || []), pageBoundingBox],
    elementPosition: {
      viewportBoundingBox: rect,
      pageBoundingBox: pageBoundingBox,
      scroll: scroll,
      text: anchorData.text,
      childNum: childNum,
    },
  };

  // Custom de-duplication: skip the URL when a route of equal or shorter length is known.
  const urlToParentChainLength = processedUrlToParentChainLength.get(anchorData.url);
  if (urlToParentChainLength && urlToParentChainLength <= newUserData.parentChain.length) {
    log.info(`url:${anchorData.url} 历史发现路径最短长度为${urlToParentChainLength},当前路径长度为${newUserData.parentChain.length},跳过当前请求url`);
    continue;
  }
  // Reaching here means this route is the shortest seen so far.
  processedUrlToParentChainLength.set(anchorData.url, newUserData.parentChain.length);

  const uniqueKey = `${anchorData.url}_${newUserData.parentChain.length}`;

  log.info(`请求加入队列:${anchorData.url},uniqueKey:${uniqueKey},childNum:${childNum}`);

  try {
    await requestQueue.addRequest({
      url: anchorData.url,
      uniqueKey: uniqueKey,
      userData: newUserData,
    });

    childLink.isInQueue = true;

    queuedChildLinks.push({
      childNum,
      url: anchorData.url,
      text: anchorData.text,
      uniqueKey,
      isInLoop,
      isInQueue: true,
      viewportBoundingBox: rect,
      pageBoundingBox: pageBoundingBox,
      scroll: scroll,
    });
  } catch (err) {
    log.info(`请求已存在或添加失败:${anchorData.url}`);
  }

  // Screenshot of the current viewport with the link in view.
  const screenshotPath = path.join(screenshotsDir, `${request.id}_${childNum}.png`);
  await page.screenshot({ path: screenshotPath, fullPage: false });
  log.info(`已保存第${childNum}个子元素截图:${anchorData.url} -> ${screenshotPath}`);
}
|
||
|
||
// Write the per-page link report, named after the request id.
const childLinksPath = path.join(childsDir, `${request.id}.json`);
const childLinksReport = JSON.stringify(
  {
    requestId: request.id,
    url: finalUrl,
    totalAnchors: anchorHandles.length, // number of <a> elements found on the page
    allChildLinks,
    queuedChildLinks,
    totalFound: allChildLinks.length,
    totalQueued: queuedChildLinks.length,
  },
  null,
  2,
);
fs.writeFileSync(childLinksPath, childLinksReport);
log.info(`已保存子链接记录:${finalUrl} -> ${childLinksPath}`);

// From here on, later requests that land on this final URL are skipped.
urlExplored.add(finalUrl);

// Save the page HTML, named after the request id.
const content = await page.content();
const htmlFilePath = path.join(pagesDir, `${request.id}.html`);
fs.writeFileSync(htmlFilePath, content);
log.info(`已保存 HTML:${finalUrl} -> ${htmlFilePath}`);
|
||
|
||
// Take the accessibility snapshot with `interestingOnly` disabled.
const snapshotOptions = { interestingOnly: false };
const axTree = await page.accessibility.snapshot(snapshotOptions);

// State shared with traverse():
// next id to assign to an emitted node,
let idCounter = 1;
// emitted node id -> generated selector string,
const idToSelector = {};
// visited node -> the parent it was visited under,
const nodeParents = new Map();
// and the lines of the text rendering of the tree.
let axtreeText = [];
|
||
|
||
/**
 * Depth-first walk over an accessibility node. For every emitted node it assigns the next
 * id, stores a selector string in `idToSelector`, and appends one indented line to
 * `axtreeText`. Every visited node is also recorded in `nodeParents`.
 *
 * Nodes that are skipped are not emitted and their children are visited at the same depth:
 * any `InlineTextBox`, and `none`/`generic` nodes with no name, focus state or expanded state.
 *
 * NOTE(review): the selector parts are joined with spaces and end in `:nth-match(k)`, where
 * k is the node's 1-based index among its parent's children — verify that consumers accept
 * this format, since it does not look like a standard Playwright selector.
 *
 * @param {object} node - Accessibility node (role, name, state flags, optional children).
 * @param {number} [depth=0] - Indentation level; each level is four spaces.
 * @param {object|null} [parent=null] - Node this one was reached from.
 */
function traverse(node, depth = 0, parent = null) {
  nodeParents.set(node, parent);

  const { role, name } = node;
  const kids = node.children ?? [];

  const isUnremarkableContainer =
    (role === 'none' || role === 'generic') &&
    !name &&
    !node.focusable &&
    !node.focused &&
    node.expanded === undefined;

  if (role === 'InlineTextBox' || isUnremarkableContainer) {
    for (const kid of kids) {
      traverse(kid, depth, node);
    }
    return;
  }

  const currentId = idCounter;
  idCounter += 1;

  // Selector: optional parent prefix, then role, name and state, then sibling position.
  const selectorParts = [];
  if (parent && parent.role !== 'WebArea') {
    const parentName = parent.name ? `[name="${parent.name}"]` : '';
    selectorParts.push(`role=${parent.role}${parentName} >>`);
  }
  selectorParts.push(`role=${role}`);
  if (name) selectorParts.push(`[name="${name}"]`);
  if (node.selected) selectorParts.push('[selected=true]');
  if (node.checked !== undefined) selectorParts.push(`[checked=${node.checked}]`);
  if (node.pressed !== undefined) selectorParts.push(`[pressed=${node.pressed}]`);

  const position = parent?.children ? parent.children.indexOf(node) : -1;
  if (position !== -1) selectorParts.push(`:nth-match(${position + 1})`);

  idToSelector[currentId] = selectorParts.join(' ');

  // A node counts as clickable when its role is interactive or it is focusable.
  const clickableRoles = ['button', 'link', 'menuitem', 'tab', 'checkbox', 'radio', 'switch', 'option'];
  const props = [
    node.focusable && 'focusable',
    node.focused && 'focused',
    node.expanded !== undefined && `expanded=${node.expanded}`,
    node.selected && 'selected',
    node.checked !== undefined && `checked=${node.checked}`,
    node.disabled && 'disabled',
    node.required && 'required',
    node.pressed !== undefined && `pressed=${node.pressed}`,
    (clickableRoles.includes(role) || node.focusable) && 'clickable',
  ].filter(Boolean);

  const indent = ' '.repeat(depth * 4);
  const suffix = props.length > 0 ? ` (${props.join(', ')})` : '';
  axtreeText.push(`${indent}[${currentId}] ${role} '${name || ''}'${suffix}`);

  for (const kid of kids) {
    traverse(kid, depth + 1, node);
  }
}
|
||
|
||
// First line of the text tree describes the root node.
const rootProps = [];
if (axTree.focusable) rootProps.push('focusable=True');
if (axTree.focused) rootProps.push('focused');
const rootRole = axTree.role ? axTree.role : 'WebArea';
const rootSuffix = rootProps.length > 0 ? `, ${rootProps.join(', ')}` : '';
axtreeText.push(`Root${rootRole} '${axTree.name || ''}'${rootSuffix}`);

// Walk the root's children at depth 1.
for (const child of axTree.children ?? []) {
  traverse(child, 1, axTree);
}

// Text rendering of the tree, named after the request id.
const axtreePath = path.join(axtreesDir, `${request.id}.txt`);
fs.writeFileSync(axtreePath, axtreeText.join('\n'));
log.info(`已保存 axtree:${finalUrl} -> ${axtreePath}`);

// id -> selector map, stored next to the tree.
const idToSelectorPath = path.join(axtreesDir, `${request.id}_idToSelector.json`);
fs.writeFileSync(idToSelectorPath, JSON.stringify(idToSelector, null, 2));
log.info(`已保存 idToSelector:${finalUrl} -> ${idToSelectorPath}`);

// Full-page screenshot, named after the request id.
const fullPageScreenshotPath = path.join(screenshotsDir, `${request.id}_full.png`);
await page.screenshot({ path: fullPageScreenshotPath, fullPage: true });
log.info(`已保存全屏截图:${finalUrl} -> ${fullPageScreenshotPath}`);

console.log(`记录已探索页面到全局集合:${finalUrl}`);
|
||
},
|
||
async handleFailedRequestFunction({ request }) {
|
||
console.error(`请求 ${request.url} 处理失败。`);
|
||
},
|
||
});
|
||
|
||
// Run the crawl. On failure, report the error first, then try to persist what was
// collected so far. The save is guarded so that a write failure cannot hide the
// original error.
try {
  await crawler.run();
} catch (err) {
  console.error("爬虫运行时出错:", err);
  try {
    const mapObject = Object.fromEntries(processedUrlToPaths);
    fs.writeFileSync(path.join(pathDir, 'processedUrlToPaths.json'), JSON.stringify(mapObject, null, 2));
    // `log` exists only as a parameter of the page handler, so it is not in scope here.
    console.log(`已保存 processedUrlToPaths.json`);
  } catch (saveErr) {
    console.error('保存 processedUrlToPaths.json 失败:', saveErr);
  }
}
console.log("爬虫运行结束!");
|
||
})();
|