crawlee/misc/temp_analysis/merge_process_3.py
2025-04-23 12:14:50 +08:00

56 lines
2.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import os
def merge_json_files():
    """Merge analysis fields into the base ``processed_3.json`` records.

    Reads two JSON objects (base file and the "with analysis" backup), both
    keyed by the same kind of identifier, and writes a new JSON object to
    ``temp_analysis/process_3.json``.

    For every key present in BOTH files:

    * the entry is skipped when any element of the base record's
      ``shortestPathsMeta[0]['chainAxTreeID']`` list is ``None``;
    * otherwise the base record is copied to the output and, when present
      in the analysis record's ``shortestPathsMeta[0]``, ``raw_result`` and
      the whitespace-stripped ``title`` are added at the record's top level.

    Entries are written in the order they appear in the base file, so the
    output is reproducible between runs. The printed count is the number of
    keys shared by both files, including entries that were skipped.

    Raises:
        FileNotFoundError: if either input file is missing.
        KeyError, IndexError: if a shared record lacks a non-empty
            ``shortestPathsMeta`` list (or the base record lacks
            ``chainAxTreeID``).
    """
    # Input and output file paths (relative to the current working directory).
    process_3_path = "path/processed_3.json"
    processed_with_analysis_path = "backup/v17_2/path/processed_3_with_analysis_20250324163759.json"
    output_path = "temp_analysis/process_3.json"

    # Load the base records.
    with open(process_3_path, 'r', encoding='utf-8') as f:
        process_3_data = json.load(f)

    # Load the records that carry the analysis results.
    with open(processed_with_analysis_path, 'r', encoding='utf-8') as f:
        processed_with_analysis_data = json.load(f)

    # Keys present in both files; only these can be merged.
    common_keys = process_3_data.keys() & processed_with_analysis_data.keys()

    merged_data = {}
    # Walk the base dict rather than the set so the output order follows the
    # base file instead of the (per-run randomized) set iteration order.
    for key in process_3_data:
        if key not in common_keys:
            continue

        base_record = process_3_data[key]

        # Skip records whose first path has an unresolved (None) node ID.
        chain_ids = base_record['shortestPathsMeta'][0]['chainAxTreeID']
        if any(ax_tree_id is None for ax_tree_id in chain_ids):
            continue

        # Start from the base record ...
        merged_data[key] = base_record

        # ... and copy raw_result / title from the analysis record when present.
        analysis_meta = processed_with_analysis_data[key]['shortestPathsMeta'][0]
        if 'raw_result' in analysis_meta:
            base_record['raw_result'] = analysis_meta['raw_result']
        if 'title' in analysis_meta:
            base_record['title'] = analysis_meta['title'].strip()

    # Make sure the output directory exists; open(..., 'w') does not create it.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write the merged records to the output file.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged_data, f, ensure_ascii=False, indent=2)

    print(f"合并完成!共处理了 {len(common_keys)} 个键。")
    print(f"结果已保存到 {output_path}")
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    merge_json_files()