crawlee/analysis_v18.js
2025-04-23 12:14:50 +08:00

296 lines
10 KiB
JavaScript

import { readFileSync, writeFileSync, existsSync } from 'fs';
import { join } from 'path';
// Load the URL -> paths mapping from disk. Each value is read below as an
// array of paths (its `.length` is summed).
const data = JSON.parse(readFileSync('path/processedUrlToPaths.json', 'utf8'));
// Report how many URLs are about to be analyzed.
console.log(`总共有 ${Object.keys(data).length} 个 URL 需要分析`);
// Total number of paths across every URL.
const totalPaths = Object.values(data)
  .map((paths) => paths.length)
  .reduce((sum, count) => sum + count, 0);
console.log(`所有 URL 的路径总数为: ${totalPaths}`);
/**
 * Build the chain metadata record for a single request.
 *
 * Reads `storage/request_queues/default/<requestId>.json`, decodes the JSON
 * string held in its `json` field, and derives the chain fields from the
 * decoded `userData`. The request's own id, url and scroll position are
 * appended to the corresponding parent-chain arrays before they are returned.
 *
 * @param {string} requestId - Id of the request whose queue entry is read.
 * @returns {object|null} The metadata record, or `null` (after logging the
 *   error message) when the entry cannot be read, parsed, or lacks a field
 *   that is dereferenced here.
 */
function getRequestMetadata(requestId) {
  try {
    const queueEntryPath = join('storage', 'request_queues', 'default', `${requestId}.json`);
    const requestData = JSON.parse(readFileSync(queueEntryPath, 'utf8'));
    // The `json` field is itself a JSON-encoded string.
    const payload = JSON.parse(requestData.json);
    const userData = payload.userData || {};

    // The arrays below come from a fresh JSON.parse, so appending to them
    // only affects this call's result.
    const chainIDs = userData.parentChainIDs;
    chainIDs.push(requestId);

    // Texts are passed through as stored; nothing is appended to them here.
    const chainTexts = userData.parentChainTexts;
    const chainAxTreeID = getChainAxTreeIDs(chainIDs, chainTexts);

    const chainUrls = userData.parentChain;
    chainUrls.push(requestData.url);

    const chainScrolls = userData.parentChainScrolls;
    chainScrolls.push(userData.elementPosition.scroll);

    return {
      request_id: requestId,
      chainIDs,
      chainTexts,
      chainAxTreeID,
      chainUrls,
      chainChildNum: userData.parentChainChildNum,
      chainPageBoundingBoxes: userData.parentChainPageBoundingBoxes,
      chainViewportBoundingBoxes: userData.parentChainViewportBoundingBoxes,
      chainScrolls,
    };
  } catch (error) {
    console.error(`无法读取请求 ${requestId} 的元信息:`, error.message);
    return null;
  }
}
/**
 * Look up accessibility-tree node ids for the elements named along a chain.
 *
 * For each index `i`, reads `axtrees/<chainIDs[i]>.txt` and collects the
 * bracketed numeric id (e.g. `[ 42 ]`) of every line that contains both the
 * single-quoted text `'<chainTexts[i]>'` and the word `clickable`.
 *
 * One entry is appended per processed index: the matching ids joined with
 * commas, or `null` when the file is missing, unreadable, or has no match.
 * Indices whose id or text is empty/missing are skipped WITHOUT appending
 * anything, so the result can be shorter than `chainIDs`.
 * NOTE(review): because of that skip, an empty text in the middle of a chain
 * shifts later entries out of positional alignment with `chainIDs` — confirm
 * whether downstream consumers rely on the current shape before changing it.
 *
 * @param {string[]} chainIDs - Request ids whose axtree files are searched.
 * @param {string[]} chainTexts - Element texts, matched by index to `chainIDs`.
 *   A missing or non-array value is treated as "no texts" instead of throwing.
 * @returns {(string|null)[]} Comma-joined node ids (or `null`) per processed index.
 */
function getChainAxTreeIDs(chainIDs, chainTexts) {
  const axTreeIDs = [];
  if (!Array.isArray(chainIDs)) {
    return axTreeIDs;
  }
  const texts = Array.isArray(chainTexts) ? chainTexts : [];
  // Bracketed numeric node id; built once rather than per line.
  const nodeIdPattern = /\[\s*(\d+)\s*\]/;

  for (let i = 0; i < chainIDs.length; i++) {
    const id = chainIDs[i];
    const text = texts[i];
    if (!id || !text) continue;

    try {
      const axTreePath = join('axtrees', `${id}.txt`);
      if (!existsSync(axTreePath)) {
        axTreeIDs.push(null);
        continue;
      }

      // The text is matched together with its surrounding single quotes.
      const quotedText = `'${text}'`;
      const matchedAxTreeIDs = [];
      for (const line of readFileSync(axTreePath, 'utf8').split('\n')) {
        if (!line.includes(quotedText) || !line.includes('clickable')) continue;
        const match = nodeIdPattern.exec(line);
        if (match) {
          matchedAxTreeIDs.push(Number.parseInt(match[1], 10));
        }
      }

      // Several matching nodes are reported as one comma-separated string.
      axTreeIDs.push(matchedAxTreeIDs.length > 0 ? matchedAxTreeIDs.join(',') : null);
    } catch (error) {
      console.error(`无法读取 axtree 文件 ${id}:`, error.message);
      axTreeIDs.push(null);
    }
  }

  return axTreeIDs;
}
// Analysis results, keyed by URL
const analysis = {};
// Shortest path length of each analyzed URL (one entry per URL kept in `analysis`)
const shortestPathLengths = [];
let totalShortestPaths = 0; // Running total of shortest paths summed over all URLs
// Memoized validity results keyed by request id, so each id hits the
// filesystem at most once.
const validRequestCache = new Map();

/**
 * Tell whether a request id is valid, i.e. whether `pages/<requestId>.html`
 * exists. The first answer for an id is cached and reused afterwards, even if
 * the file appears or disappears later.
 *
 * @param {string} requestId - Request id to check.
 * @returns {boolean} `true` when the HTML file existed at first lookup.
 */
function isValidRequest(requestId) {
  if (!validRequestCache.has(requestId)) {
    const pageExists = existsSync(join('pages', `${requestId}.html`));
    validRequestCache.set(requestId, pageExists);
  }
  return validRequestCache.get(requestId);
}
/**
 * Tell whether a path is valid: no request id on it may fail `isValidRequest`.
 * An empty path is considered valid.
 *
 * @param {string[]} path - Request ids making up the path.
 * @returns {boolean} `true` when every id on the path is valid.
 */
function isValidPath(path) {
  const hasInvalidRequest = path.some((requestId) => !isValidRequest(requestId));
  return !hasInvalidRequest;
}
// Analyze every URL: keep only its valid paths, then record the unique
// shortest ones together with the metadata of each path's last request.
for (const [url, paths] of Object.entries(data)) {
  // A path is usable only when every request on it passes isValidPath.
  const validPaths = paths.filter((path) => isValidPath(path));
  // URLs without any valid path are left out of the results entirely.
  if (validPaths.length === 0) {
    continue;
  }

  // Pass 1: find the minimum length among the valid paths.
  let shortestPathLength = Infinity;
  for (const path of validPaths) {
    if (path.length < shortestPathLength) {
      shortestPathLength = path.length;
    }
  }

  // Pass 2: collect the distinct paths of that length in first-seen order.
  // Paths are compared by their JSON serialization.
  const seenPaths = new Set();
  const shortestPaths = [];
  for (const path of validPaths) {
    if (path.length !== shortestPathLength) continue;
    const pathKey = JSON.stringify(path);
    if (seenPaths.has(pathKey)) continue;
    seenPaths.add(pathKey);
    shortestPaths.push(path);
  }

  // Metadata is read only for the final shortest paths; reading it while
  // scanning would waste file reads on paths that a shorter one later replaces.
  const shortestPathsMeta = shortestPaths.map((path) => getRequestMetadata(path[path.length - 1]));

  analysis[url] = {
    totalPaths: paths.length,
    shortestPathLength,
    shortestPaths,
    shortestPathsMeta,
    // Number of shortest paths after de-duplication.
    shortestPathCount: shortestPaths.length,
  };

  // Feed the global statistics.
  shortestPathLengths.push(shortestPathLength);
  totalShortestPaths += shortestPaths.length;
}
// Print summary statistics.
console.log(`${Object.values(analysis).filter((entry) => entry.shortestPathCount > 1).length} 个 URL 具有多条最短路径`);
console.log(`所有 URL 的最短路径总数为: ${totalShortestPaths}`);

// Group the per-URL results by their shortest path length.
const analysisByLength = {};
for (const [url, entry] of Object.entries(analysis)) {
  const { shortestPathLength } = entry;
  analysisByLength[shortestPathLength] = analysisByLength[shortestPathLength] || {};
  analysisByLength[shortestPathLength][url] = entry;
}

// Write the complete analysis to a single file.
writeFileSync('path/processed.json', JSON.stringify(analysis, null, 2));

// Write one additional file per path length and report its URL count.
for (const length of Object.keys(analysisByLength)) {
  const urlsForLength = analysisByLength[length];
  writeFileSync(`path/processed_${length}.json`, JSON.stringify(urlsForLength, null, 2));
  console.log(`路径长度为 ${length} 的URL数量: ${Object.keys(urlsForLength).length}`);
}
// Count how many URLs have each shortest path length.
const lengthDistribution = {};
for (const length of shortestPathLengths) {
  lengthDistribution[length] = (lengthDistribution[length] || 0) + 1;
}

// Cumulative distribution: fraction of URLs whose shortest path length is
// less than or equal to each observed length (lengths visited in ascending
// numeric order).
const totalUrls = shortestPathLengths.length;
const cumulativeDistribution = {};
let cumulative = 0;
Object.keys(lengthDistribution)
  .sort((a, b) => Number(a) - Number(b))
  .forEach((length) => {
    cumulative += lengthDistribution[length];
    cumulativeDistribution[length] = cumulative / totalUrls;
  });

// Number of URLs that have more than one shortest path.
const multipleShortestPathsCount = Object.values(analysis).filter((a) => a.shortestPathCount > 1).length;

// Assemble the distribution report and write it to its own file.
// min/max are folded with reduce instead of Math.min(...array): spreading
// passes every element as a call argument and throws a RangeError once the
// array is large enough. Results are identical, including for an empty array
// (Infinity / -Infinity / NaN, which JSON.stringify emits as null).
const distributionAnalysis = {
  pathLengthDistribution: lengthDistribution,
  cumulativeDistribution: cumulativeDistribution,
  totalUrlsAnalyzed: totalUrls,
  multipleShortestPathsUrls: multipleShortestPathsCount,
  statistics: {
    minLength: shortestPathLengths.reduce((min, length) => Math.min(min, length), Infinity),
    maxLength: shortestPathLengths.reduce((max, length) => Math.max(max, length), -Infinity),
    averageLength: shortestPathLengths.reduce((a, b) => a + b, 0) / totalUrls
  }
};
writeFileSync('path/distribution_analysis.json', JSON.stringify(distributionAnalysis, null, 2));

// Print the average shortest path length.
console.log(`所有 URL 的平均最短路径长度为: ${distributionAnalysis.statistics.averageLength.toFixed(2)}`);
// Generate a standalone HTML page that draws the length distribution as a
// bar chart with Plotly.
// The Plotly script is pinned to an explicit version: the CDN's
// "plotly-latest" bundle is no longer updated and stays at v1.58.5.
const htmlContent = `
<!DOCTYPE html>
<html>
<head>
<title>Shortest Path Length Distribution</title>
<script src="https://cdn.plot.ly/plotly-2.35.2.min.js" charset="utf-8"></script>
</head>
<body>
<div id="plot"></div>
<script>
const distribution = ${JSON.stringify(lengthDistribution)};
const x = Object.keys(distribution);
const y = Object.values(distribution);
const trace = {
x: x,
y: y,
type: 'bar',
name: 'Path Length Distribution'
};
const layout = {
title: 'URL Shortest Path Length Distribution',
xaxis: {
title: 'Shortest Path Length',
tickmode: 'linear'
},
yaxis: {
title: 'Number of URLs'
}
};
Plotly.newPlot('plot', [trace], layout);
</script>
</body>
</html>
`;
// Written with a forward-slash (Linux-style) path.
writeFileSync('path/distribution.html', htmlContent);
// Print the number of URLs that have more than one shortest path.
console.log(`${multipleShortestPathsCount} 个 URL 具有多条最短路径`);