crawlee/misc/temp_analysis/exam_gen.py
2025-04-23 12:14:50 +08:00

56 lines
2.1 KiB
Python

import json
import os
import uuid
def _build_exam_cases(data):
    """Flatten the crawl analysis mapping into a list of per-step exam cases.

    Args:
        data: Mapping of URL -> entry dict. Each entry may carry a ``title``
            and a ``shortestPathsMeta`` list whose items hold the parallel
            lists ``chainIDs``, ``chainUrls``, ``chainAxTreeID`` and
            ``chainTexts``.

    Returns:
        A list of case dicts, one per step of every path, numbered
        consecutively from 1 in ``num``.
    """
    exam_cases = []
    for url_data in data.values():
        # The entry's title doubles as the question text. ``or ''`` guards
        # against an explicit JSON null, which ``.get(key, '')`` lets through.
        question = (url_data.get('title') or '').strip()

        for path_meta in url_data.get('shortestPathsMeta') or []:
            chain_ids = path_meta.get('chainIDs') or []
            chain_urls = path_meta.get('chainUrls') or []
            chain_ax_tree_ids = path_meta.get('chainAxTreeID') or []
            chain_texts = path_meta.get('chainTexts') or []

            # Every step of one path shares the same trajectory id.
            trajectory_id = str(uuid.uuid4())

            # zip() stops at the shortest list, so only steps that have all
            # four fields present produce a case.
            steps = zip(chain_ids, chain_urls, chain_ax_tree_ids, chain_texts)
            for step_num, (page_id, step_url, ax_tree_id, text) in enumerate(
                steps, start=1
            ):
                exam_cases.append({
                    # Running counter across all URLs and paths.
                    "num": len(exam_cases) + 1,
                    "id": str(uuid.uuid4()),
                    "trajectory_id": trajectory_id,
                    "trajectory_step_num": step_num,
                    "page_id": page_id,
                    "url": step_url,
                    "question": question,
                    "answer": str(ax_tree_id),
                    "answer_text": text,
                })
    return exam_cases


def create_exam_cases(input_path='temp_analysis/process_3.json',
                      output_path='temp_analysis/exam.json'):
    """Generate exam cases from a crawl analysis file and save them as JSON.

    Reads the URL -> metadata mapping from ``input_path``, emits one case per
    step of every entry in each URL's ``shortestPathsMeta``, writes the list
    to ``output_path`` (UTF-8, indented, non-ASCII preserved) and prints a
    summary line.

    Args:
        input_path: Path of the JSON file to read.
        output_path: Path of the JSON file to write.

    Returns:
        The list of generated case dicts (the same data that was written).

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If ``input_path`` does not contain valid JSON.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    exam_cases = _build_exam_cases(data)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(exam_cases, f, ensure_ascii=False, indent=2)

    print(f"已生成 {len(exam_cases)} 个测试用例并保存到 {output_path}")
    return exam_cases
if __name__ == "__main__":
    # Only generate the exam file when run as a script, not when imported.
    create_exam_cases()