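// Shortest-path analysis script: reads path/processedUrlToPaths.json, keeps only paths whose
// pages exist under pages/, and writes per-URL shortest-path results plus length statistics.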
import { readFileSync, writeFileSync, existsSync } from 'fs';
import { join } from 'path';

// Load the URL -> paths mapping. Each value is expected to expose `.length`
// (it is summed below), i.e. a list of paths per URL.
const data = JSON.parse(readFileSync('path/processedUrlToPaths.json', 'utf8'));

// Report how many URLs are about to be analysed.
console.log(`总共有 ${Object.keys(data).length} 个 URL 需要分析`);

// Total number of candidate paths across all URLs.
const totalPaths = Object.values(data).reduce((sum, paths) => sum + paths.length, 0);
console.log(`所有 URL 的路径总数为: ${totalPaths}`);

/**
 * Build the chain metadata for one request.
 *
 * Reads `storage/request_queues/default/<requestId>.json`, parses the JSON string held in
 * its `json` field, and derives the `chain*` arrays from that payload's `userData`.
 * The current request's id, URL and scroll position are appended to the corresponding
 * parent-chain arrays before they are returned.
 *
 * @param {string} requestId - Id of the request; also the base name of its queue file.
 * @returns {object|null} The metadata object, or `null` if the file cannot be read/parsed
 *   or an expected `userData` field is missing (the error message is logged).
 */
function getRequestMetadata(requestId) {
  try {
    const requestPath = join('storage', 'request_queues', 'default', `${requestId}.json`);
    const requestData = JSON.parse(readFileSync(requestPath, 'utf8'));

    // The `json` field is itself a JSON-encoded string, hence the second parse.
    const jsonData = JSON.parse(requestData.json);
    const userData = jsonData.userData || {};

    // Start the result with the request id; keys are added in a fixed order below.
    const result = { request_id: requestId };

    // chainIDs = parent chain ids + this request's id.
    userData.parentChainIDs.push(requestId);
    result.chainIDs = userData.parentChainIDs;

    // NOTE(review): unlike ids/urls/scrolls, nothing is appended to the texts, so
    // chainTexts is presumably one element shorter than chainIDs — confirm this is intended.
    result.chainTexts = userData.parentChainTexts;

    // Resolve accessibility-tree node ids for each (id, text) pair of the chain.
    // This runs after the push above, so the id list already includes `requestId`.
    result.chainAxTreeID = getChainAxTreeIDs(userData.parentChainIDs, userData.parentChainTexts);

    // chainUrls = parent chain urls + this request's url.
    userData.parentChain.push(requestData.url);
    result.chainUrls = userData.parentChain;

    result.chainChildNum = userData.parentChainChildNum;

    result.chainPageBoundingBoxes = userData.parentChainPageBoundingBoxes;
    result.chainViewportBoundingBoxes = userData.parentChainViewportBoundingBoxes;

    // chainScrolls = parent chain scrolls + this element's scroll position.
    userData.parentChainScrolls.push(userData.elementPosition.scroll);
    result.chainScrolls = userData.parentChainScrolls;

    return result;
  } catch (error) {
    // Best effort: a missing or malformed request file yields `null` instead of aborting the run.
    console.error(`无法读取请求 ${requestId} 的元信息:`, error.message);
    return null;
  }
}

/**
 * Look up accessibility-tree node ids for each step of a chain.
 *
 * For position `i`, the file `axtrees/<chainIDs[i]>.txt` is scanned for lines that contain
 * both the quoted text `'<chainTexts[i]>'` and the word `clickable`; the first bracketed
 * number (`[ 123 ]`) on each such line is collected.
 *
 * The result is positionally aligned with `chainTexts`: every position that has a text slot
 * produces exactly one entry. Positions past the end of `chainTexts` are skipped
 * (`getRequestMetadata` passes one more id than text).
 *
 * @param {Array<string>} chainIDs - Request ids, one per chain step.
 * @param {Array<string>} chainTexts - Text to search for in the matching step's tree.
 * @returns {Array<string|null>} Per position: the matched ids joined with `,`, or `null` when
 *   the id/text is empty, the file is missing or unreadable, or nothing matched.
 */
function getChainAxTreeIDs(chainIDs, chainTexts) {
  const axTreeIDs = [];
  // Matches the first bracketed integer on a line, e.g. "[12]" or "[ 12 ]".
  const nodeIdPattern = /\[\s*(\d+)\s*\]/;

  for (let i = 0; i < chainIDs.length; i++) {
    // No text exists for this position or any later one; nothing to look up.
    if (i >= chainTexts.length) break;

    const id = chainIDs[i];
    const text = chainTexts[i];

    // Keep the output aligned with the chain: an unusable pair still occupies its slot.
    if (!id || !text) {
      axTreeIDs.push(null);
      continue;
    }

    try {
      const axTreePath = join('axtrees', `${id}.txt`);
      if (!existsSync(axTreePath)) {
        axTreeIDs.push(null);
        continue;
      }

      const axTreeContent = readFileSync(axTreePath, 'utf8');

      // The text is matched wrapped in single quotes.
      const matchText = `'${text}'`;
      const matchedAxTreeIDs = [];

      for (const line of axTreeContent.split('\n')) {
        if (!line.includes(matchText) || !line.includes('clickable')) continue;

        const match = line.match(nodeIdPattern);
        if (match) {
          matchedAxTreeIDs.push(Number.parseInt(match[1], 10));
        }
      }

      // Several matching nodes are reported together as a comma-separated string.
      axTreeIDs.push(matchedAxTreeIDs.length > 0 ? matchedAxTreeIDs.join(',') : null);
    } catch (error) {
      console.error(`无法读取 axtree 文件 ${id}:`, error.message);
      axTreeIDs.push(null);
    }
  }

  return axTreeIDs;
}

// Per-URL analysis results, keyed by URL.
const analysis = {};
// Shortest path length of each analysed URL (one entry per URL); feeds the distribution stats.
const shortestPathLengths = [];
// Running total of de-duplicated shortest paths across all URLs.
let totalShortestPaths = 0;

// Memoises isValidRequest results so each request id hits the filesystem at most once.
const validRequestCache = new Map();

/**
 * Tell whether a request id is usable, i.e. whether `pages/<requestId>.html` exists.
 * The answer is cached in `validRequestCache`.
 *
 * @param {string} requestId - Request id to check.
 * @returns {boolean} `true` when the page file exists.
 */
function isValidRequest(requestId) {
  if (validRequestCache.has(requestId)) {
    return validRequestCache.get(requestId);
  }

  const htmlPath = join('pages', `${requestId}.html`);
  const isValid = existsSync(htmlPath);

  validRequestCache.set(requestId, isValid);
  return isValid;
}

/**
 * Tell whether every request id on a path passes `isValidRequest`.
 * An empty path is reported as valid (`Array.prototype.every` on an empty array).
 *
 * @param {Array<string>} path - Request ids making up the path.
 * @returns {boolean} `true` when all ids are valid.
 */
function isValidPath(path) {
  return path.every((requestId) => isValidRequest(requestId));
}

// Analyse every URL: keep its valid paths, find the shortest ones (de-duplicated, in
// first-seen order) and attach the metadata of each shortest path's last request.
for (const [url, paths] of Object.entries(data)) {
  // Only paths whose every request passes isValidPath are considered.
  const validPaths = paths.filter((path) => isValidPath(path));

  // URLs without any valid path are left out of the analysis entirely.
  if (validPaths.length === 0) {
    continue;
  }

  // Determine the minimum length first, so metadata is only read for paths that
  // actually end up in the result.
  const shortestPathLength = validPaths.reduce(
    (min, path) => Math.min(min, path.length),
    Infinity,
  );

  const seenPaths = new Set(); // JSON-serialised paths already recorded
  const shortestPaths = [];
  const shortestPathsMeta = [];

  for (const path of validPaths) {
    if (path.length !== shortestPathLength) continue;

    const pathKey = JSON.stringify(path);
    if (seenPaths.has(pathKey)) continue;
    seenPaths.add(pathKey);

    shortestPaths.push(path);
    const lastRequestId = path[path.length - 1];
    shortestPathsMeta.push(getRequestMetadata(lastRequestId));
  }

  analysis[url] = {
    totalPaths: paths.length, // counts all paths, valid or not
    shortestPathLength,
    shortestPaths,
    shortestPathsMeta,
    shortestPathCount: shortestPaths.length, // number of distinct shortest paths
  };

  shortestPathLengths.push(shortestPathLength);
  totalShortestPaths += shortestPaths.length;
}

// Print summary statistics.
console.log(`有 ${Object.values(analysis).filter(a => a.shortestPathCount > 1).length} 个 URL 具有多条最短路径`);
console.log(`所有 URL 的最短路径总数为: ${totalShortestPaths}`);

// Group the per-URL results by their shortest path length.
const analysisByLength = {};
for (const [url, entry] of Object.entries(analysis)) {
  const length = entry.shortestPathLength;
  if (!analysisByLength[length]) {
    analysisByLength[length] = {};
  }
  analysisByLength[length][url] = entry;
}

// Write the complete analysis to a single file.
writeFileSync('path/processed.json', JSON.stringify(analysis, null, 2));

// Write one additional file per shortest-path length and report its URL count.
for (const [length, group] of Object.entries(analysisByLength)) {
  const filename = `path/processed_${length}.json`;
  writeFileSync(filename, JSON.stringify(group, null, 2));
  console.log(`路径长度为 ${length} 的URL数量: ${Object.keys(group).length}`);
}

// Histogram: shortest path length -> number of URLs with that length.
const lengthDistribution = {};
for (const length of shortestPathLengths) {
  lengthDistribution[length] = (lengthDistribution[length] || 0) + 1;
}

// Cumulative distribution: fraction of URLs whose shortest path is at most each length.
const totalUrls = shortestPathLengths.length;
const cumulativeDistribution = {};
let cumulative = 0;

Object.keys(lengthDistribution)
  .sort((a, b) => Number(a) - Number(b))
  .forEach((length) => {
    cumulative += lengthDistribution[length];
    cumulativeDistribution[length] = cumulative / totalUrls;
  });

// Number of URLs that have more than one distinct shortest path.
const multipleShortestPathsCount = Object.values(analysis).filter(a => a.shortestPathCount > 1).length;

// Assemble the distribution report.
const distributionAnalysis = {
  pathLengthDistribution: lengthDistribution,
  cumulativeDistribution: cumulativeDistribution,
  totalUrlsAnalyzed: totalUrls,
  multipleShortestPathsUrls: multipleShortestPathsCount,
  statistics: {
    // reduce() instead of Math.min(...array): spreading a very large array into call
    // arguments can throw a RangeError. The empty-array results (Infinity / -Infinity / NaN)
    // are the same as with the spread form.
    minLength: shortestPathLengths.reduce((min, length) => Math.min(min, length), Infinity),
    maxLength: shortestPathLengths.reduce((max, length) => Math.max(max, length), -Infinity),
    averageLength: shortestPathLengths.reduce((a, b) => a + b, 0) / totalUrls,
  },
};

writeFileSync('path/distribution_analysis.json', JSON.stringify(distributionAnalysis, null, 2));

// Print the average shortest path length.
console.log(`所有 URL 的平均最短路径长度为: ${distributionAnalysis.statistics.averageLength.toFixed(2)}`);

// Build a standalone HTML page that plots the length distribution as a bar chart.
// The histogram is embedded as a JSON literal; the page computes x/y from it in the browser.
// NOTE(review): the `plotly-latest` CDN alias is reportedly frozen at an old 1.x release;
// consider pinning an explicit Plotly version after confirming chart compatibility.
const htmlContent = `
<!DOCTYPE html>
<html>
<head>
    <title>Shortest Path Length Distribution</title>
    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
</head>
<body>
    <div id="plot"></div>
    <script>
        const distribution = ${JSON.stringify(lengthDistribution)};

        const x = Object.keys(distribution);
        const y = Object.values(distribution);

        const trace = {
            x: x,
            y: y,
            type: 'bar',
            name: 'Path Length Distribution'
        };

        const layout = {
            title: 'URL Shortest Path Length Distribution',
            xaxis: {
                title: 'Shortest Path Length',
                tickmode: 'linear'
            },
            yaxis: {
                title: 'Number of URLs'
            }
        };

        Plotly.newPlot('plot', [trace], layout);
    </script>
</body>
</html>
`;

// Written with a forward-slash (Linux-style) path.
writeFileSync('path/distribution.html', htmlContent);

// Print the number of URLs that have multiple shortest paths.
console.log(`有 ${multipleShortestPathsCount} 个 URL 具有多条最短路径`);