import { readFileSync, writeFileSync, existsSync } from 'fs'; import { join } from 'path'; // 读取 JSON 文件 const data = JSON.parse(readFileSync('path/processedUrlToPaths.json', 'utf8')); // 打印 URL 总数 console.log(`总共有 ${Object.keys(data).length} 个 URL 需要分析`); // 计算所有路径总数 const totalPaths = Object.values(data).reduce((sum, paths) => sum + paths.length, 0); console.log(`所有 URL 的路径总数为: ${totalPaths}`); // 获取请求元信息的函数 function getRequestMetadata(requestId) { try { const requestPath = join('storage', 'request_queues', 'default', `${requestId}.json`); const requestData = JSON.parse(readFileSync(requestPath, 'utf8')); // 解析 json 字段中的字符串 const jsonData = JSON.parse(requestData.json); const userData = jsonData.userData || {}; // 创建结果对象,添加 request_id const result = { request_id: requestId }; // chain 是 parentChain 的追加id userData['parentChainIDs'].push(requestId); result['chainIDs'] = userData['parentChainIDs']; // chain 是 parentChain 的追加text result['chainTexts'] = userData['parentChainTexts']; // 获取 chainAxTreeID result['chainAxTreeID'] = getChainAxTreeIDs(userData['parentChainIDs'], userData['parentChainTexts']); // chain 是 parentChain 的追加url userData['parentChain'].push(requestData.url); result['chainUrls'] = userData['parentChain']; result['chainChildNum'] = userData['parentChainChildNum']; result['chainPageBoundingBoxes'] = userData['parentChainPageBoundingBoxes']; result['chainViewportBoundingBoxes'] = userData['parentChainViewportBoundingBoxes']; // chain 是 parentChain 的追加scroll userData['parentChainScrolls'].push(userData['elementPosition'].scroll); result['chainScrolls'] = userData['parentChainScrolls']; return result; } catch (error) { console.error(`无法读取请求 ${requestId} 的元信息:`, error.message); return null; } } // 获取 chainAxTreeID 的函数 function getChainAxTreeIDs(chainIDs, chainTexts) { const axTreeIDs = []; // 遍历每个 chainID 和对应的 chainText for (let i = 0; i < chainIDs.length; i++) { const id = chainIDs[i]; const text = chainTexts[i]; if (!id || !text) continue; try { // 读取对应的 axtree 文件 const axTreePath = join('axtrees', `${id}.txt`); if (!existsSync(axTreePath)) { axTreeIDs.push(null); continue; } const axTreeContent = readFileSync(axTreePath, 'utf8'); // 查找包含指定文本和 clickable 属性的行 const lines = axTreeContent.split('\n'); let matchedAxTreeIDs = []; const matchText = `'${text}'` for (let j = 0; j < lines.length; j++) { const line = lines[j]; if (line.includes(matchText) && line.includes('clickable')) { // 提取 ID 编号 const match = line.match(/\[\s*(\d+)\s*\]/); if (match && match[1]) { matchedAxTreeIDs.push(parseInt(match[1])); } } } // 如果有多个匹配,用逗号连接它们 if (matchedAxTreeIDs.length > 0) { axTreeIDs.push(matchedAxTreeIDs.join(',')); } else { axTreeIDs.push(null); } } catch (error) { console.error(`无法读取 axtree 文件 ${id}:`, error.message); axTreeIDs.push(null); } } return axTreeIDs; } // 存储分析结果 const analysis = {}; const shortestPathLengths = []; let totalShortestPaths = 0; // 添加总最短路径计数器 // 添加缓存对象 const validRequestCache = new Map(); // 检查请求 ID 是否合法的函数 function isValidRequest(requestId) { // 先检查缓存 if (validRequestCache.has(requestId)) { return validRequestCache.get(requestId); } // 检查对应的 HTML 文件是否存在 const htmlPath = join('pages', `${requestId}.html`); const isValid = existsSync(htmlPath); // 存入缓存 validRequestCache.set(requestId, isValid); return isValid; } // 检查路径是否合法的函数 function isValidPath(path) { return path.every(requestId => isValidRequest(requestId)); } // 分析每个 URL for (const [url, paths] of Object.entries(data)) { analysis[url] = { totalPaths: paths.length, shortestPathLength: Infinity, shortestPaths: [], shortestPathsMeta: [], shortestPathsSet: new Set() }; // 先过滤出合法路径 const validPaths = paths.filter(path => isValidPath(path)); // 如果没有合法路径,跳过这个 URL if (validPaths.length === 0) { delete analysis[url]; continue; } // 在合法路径中找出最短路径 validPaths.forEach(path => { if (path.length < analysis[url].shortestPathLength) { analysis[url].shortestPathLength = path.length; analysis[url].shortestPaths = [path]; analysis[url].shortestPathsSet = new Set([JSON.stringify(path)]); const lastRequestId = path[path.length - 1]; analysis[url].shortestPathsMeta = [getRequestMetadata(lastRequestId)]; } else if (path.length === analysis[url].shortestPathLength) { const pathStr = JSON.stringify(path); if (!analysis[url].shortestPathsSet.has(pathStr)) { analysis[url].shortestPathsSet.add(pathStr); analysis[url].shortestPaths.push(path); const lastRequestId = path[path.length - 1]; analysis[url].shortestPathsMeta.push(getRequestMetadata(lastRequestId)); } } }); // 删除临时使用的 Set delete analysis[url].shortestPathsSet; // 添加统计信息 shortestPathLengths.push(analysis[url].shortestPathLength); // 添加最短路径数量信息(现在是去重后的数量) analysis[url].shortestPathCount = analysis[url].shortestPaths.length; totalShortestPaths += analysis[url].shortestPathCount; } // 打印统计信息 console.log(`有 ${Object.values(analysis).filter(a => a.shortestPathCount > 1).length} 个 URL 具有多条最短路径`); console.log(`所有 URL 的最短路径总数为: ${totalShortestPaths}`); // 按路径长度分组 const analysisByLength = {}; Object.entries(analysis).forEach(([url, data]) => { const length = data.shortestPathLength; if (!analysisByLength[length]) { analysisByLength[length] = {}; } analysisByLength[length][url] = data; }); // 将完整分析结果写入文件 writeFileSync('path/processed.json', JSON.stringify(analysis, null, 2)); // 为每个路径长度创建单独的文件 Object.entries(analysisByLength).forEach(([length, data]) => { const filename = `path/processed_${length}.json`; writeFileSync(filename, JSON.stringify(data, null, 2)); console.log(`路径长度为 ${length} 的URL数量: ${Object.keys(data).length}`); }); // 计算最短路径长度的分布 const lengthDistribution = {}; shortestPathLengths.forEach(length => { lengthDistribution[length] = (lengthDistribution[length] || 0) + 1; }); // 计算累积分布 const totalUrls = shortestPathLengths.length; const cumulativeDistribution = {}; let cumulative = 0; Object.keys(lengthDistribution) .sort((a, b) => Number(a) - Number(b)) .forEach(length => { cumulative += lengthDistribution[length]; cumulativeDistribution[length] = cumulative / totalUrls; }); // 统计具有多条最短路径的 URL 数量 const multipleShortestPathsCount = Object.values(analysis).filter(a => a.shortestPathCount > 1).length; // 将分布分析结果写入单独的文件 const distributionAnalysis = { pathLengthDistribution: lengthDistribution, cumulativeDistribution: cumulativeDistribution, totalUrlsAnalyzed: totalUrls, multipleShortestPathsUrls: multipleShortestPathsCount, statistics: { minLength: Math.min(...shortestPathLengths), maxLength: Math.max(...shortestPathLengths), averageLength: shortestPathLengths.reduce((a, b) => a + b, 0) / totalUrls } }; writeFileSync('path/distribution_analysis.json', JSON.stringify(distributionAnalysis, null, 2)); // 打印平均路径长度 console.log(`所有 URL 的平均最短路径长度为: ${distributionAnalysis.statistics.averageLength.toFixed(2)}`); // 生成绘图用的 HTML 文件 const htmlContent = `