use std::collections::{HashMap, HashSet, VecDeque}; use std::sync::mpsc::{self, Receiver, SyncSender, TryRecvError, TrySendError}; use std::time::{Duration, Instant}; use crate::audit::{DiscoveredFrom, PublicationPointAudit}; use crate::cir::CirInputAccumulator; use crate::parallel::object_worker::ObjectWorkerSubmitError; use crate::parallel::repo_runtime::{RepoSyncRequestStatus, RepoSyncRuntimeOutcome}; use crate::parallel::types::RepoIdentity; use crate::policy::SignedObjectFailurePolicy; use crate::report::Warning; use crate::storage::VcirReplaceTimingBreakdown; use crate::validation::manifest::PublicationPointData; use crate::validation::manifest::PublicationPointSource; use crate::validation::objects::{ ObjectsOutput, OwnedRoaTask, ParallelObjectsPrepare, ParallelObjectsStage, RoaValidationCacheInput, prepare_publication_point_for_parallel_roa_with_cache, reduce_parallel_roa_stage, }; use crate::validation::tree::{ CaInstanceHandle, DiscoveredChildCaInstance, PublicationPointRunResult, PublicationPointRunner, TreeRunAuditOutput, TreeRunConfig, TreeRunError, TreeRunOutput, run_tree_serial_audit_multi_root, }; use crate::validation::tree_runner::{ FreshPublicationPointFinalizeOutput, FreshPublicationPointStage, Rpkiv1PublicationPointRunner, }; #[derive(Clone, Debug)] struct QueuedCaInstance { id: u64, handle: CaInstanceHandle, parent_id: Option, discovered_from: Option, } #[derive(Clone, Debug)] struct ReadyCaInstance { node: QueuedCaInstance, repo_outcome: RepoSyncRuntimeOutcome, ready_enqueued_at: Instant, } struct InflightPublicationPoint { node: QueuedCaInstance, fresh_stage: FreshPublicationPointStage, objects_stage: ParallelObjectsStage, repo_outcome: RepoSyncRuntimeOutcome, warnings: Vec, started_at: Instant, objects_started_at: Instant, task_count: usize, tasks_submitted: usize, first_task_submitted_at: Option, last_task_submitted_at: Option, first_result_at: Option, last_result_at: Option, worker_ms_total: u64, worker_ms_max: u64, queue_wait_ms_total: 
u64, queue_wait_ms_max: u64, finalize_enqueued_at: Option, results: Vec, } struct FinishedPublicationPoint { node: FinishedPublicationPointNode, result: FinishedPublicationPointResult, } #[derive(Clone, Debug)] struct FinishedPublicationPointNode { id: u64, parent_id: Option, discovered_from: Option, manifest_rsync_uri: String, } impl FinishedPublicationPointNode { fn from_queued(node: QueuedCaInstance) -> Self { Self { id: node.id, parent_id: node.parent_id, discovered_from: node.discovered_from, manifest_rsync_uri: node.handle.manifest_rsync_uri, } } } #[derive(Debug)] enum FinishedPublicationPointResult { Ok { source: PublicationPointSource, warnings: Vec, objects: ObjectsOutput, audit: PublicationPointAudit, cir_fresh_objects: Vec, cir_cached_objects: Vec, }, Err(String), } struct FinalizeTask { state: InflightPublicationPoint, } struct FinalizeWorkerResult { finished: FinishedPublicationPoint, metrics: FinalizePublicationPointMetrics, } #[derive(Default)] struct ReadyStageMetrics { manifest_rsync_uri: Option, publication_point_rsync_uri: Option, ready_count: usize, fallback_count: usize, complete_count: usize, staged_count: usize, zero_task_count: usize, error_count: usize, discovered_children: usize, locked_files: usize, roa_tasks: usize, aspa_objects: usize, stage_fresh_ms: u64, snapshot_prepare_ms: u64, snapshot_current_index_lock_ms: u64, snapshot_manifest_load_ms: u64, snapshot_manifest_index_lookup_ms: u64, snapshot_manifest_blob_load_ms: u64, snapshot_manifest_decode_ms: u64, snapshot_replay_guard_ms: u64, replay_meta_hit_count: usize, replay_meta_miss_count: usize, snapshot_manifest_entries_ms: u64, snapshot_pack_files_ms: u64, snapshot_pack_files_index_lookup_ms: u64, snapshot_pack_files_blob_load_ms: u64, snapshot_ee_path_validate_ms: u64, snapshot_manifest_file_count: usize, child_discovery_ms: u64, child_enqueue_ms: u64, ready_queue_wait_ms: u64, ready_queue_len_after_pop: usize, roa_presence_scan_ms: u64, roa_cache_view_ms: u64, 
direct_finalize_ms: u64, fallback_full_run_ms: u64, prepare_ms: u64, build_roa_tasks_ms: u64, total_ms: u64, } #[derive(Default)] struct ReadyStageBatchMetrics { ready_count: usize, fallback_count: usize, complete_count: usize, staged_count: usize, zero_task_count: usize, error_count: usize, discovered_children: usize, locked_files: usize, roa_tasks: usize, aspa_objects: usize, stage_fresh_ms_total: u64, stage_fresh_ms_max: u64, stage_fresh_ms_max_manifest_rsync_uri: Option, stage_fresh_ms_max_publication_point_rsync_uri: Option, snapshot_prepare_ms_total: u64, snapshot_prepare_ms_max: u64, snapshot_current_index_lock_ms_total: u64, snapshot_current_index_lock_ms_max: u64, snapshot_manifest_load_ms_total: u64, snapshot_manifest_load_ms_max: u64, snapshot_manifest_index_lookup_ms_total: u64, snapshot_manifest_index_lookup_ms_max: u64, snapshot_manifest_blob_load_ms_total: u64, snapshot_manifest_blob_load_ms_max: u64, snapshot_manifest_decode_ms_total: u64, snapshot_manifest_decode_ms_max: u64, snapshot_replay_guard_ms_total: u64, snapshot_replay_guard_ms_max: u64, replay_meta_hit_count: usize, replay_meta_miss_count: usize, snapshot_manifest_entries_ms_total: u64, snapshot_manifest_entries_ms_max: u64, snapshot_pack_files_ms_total: u64, snapshot_pack_files_ms_max: u64, snapshot_pack_files_index_lookup_ms_total: u64, snapshot_pack_files_index_lookup_ms_max: u64, snapshot_pack_files_blob_load_ms_total: u64, snapshot_pack_files_blob_load_ms_max: u64, snapshot_ee_path_validate_ms_total: u64, snapshot_ee_path_validate_ms_max: u64, snapshot_manifest_file_count_total: usize, snapshot_manifest_file_count_max: usize, child_discovery_ms_total: u64, child_discovery_ms_max: u64, child_enqueue_ms_total: u64, child_enqueue_ms_max: u64, ready_queue_wait_ms_total: u64, ready_queue_wait_ms_max: u64, roa_presence_scan_ms_total: u64, roa_presence_scan_ms_max: u64, roa_cache_view_ms_total: u64, roa_cache_view_ms_max: u64, direct_finalize_ms_total: u64, direct_finalize_ms_max: u64, 
fallback_full_run_ms_total: u64, fallback_full_run_ms_max: u64, prepare_ms_total: u64, prepare_ms_max: u64, build_roa_tasks_ms_total: u64, build_roa_tasks_ms_max: u64, total_ms: u64, } impl ReadyStageBatchMetrics { fn record(&mut self, metrics: ReadyStageMetrics) { self.ready_count += metrics.ready_count; self.fallback_count += metrics.fallback_count; self.complete_count += metrics.complete_count; self.staged_count += metrics.staged_count; self.zero_task_count += metrics.zero_task_count; self.error_count += metrics.error_count; self.discovered_children += metrics.discovered_children; self.locked_files += metrics.locked_files; self.roa_tasks += metrics.roa_tasks; self.aspa_objects += metrics.aspa_objects; if metrics.stage_fresh_ms >= self.stage_fresh_ms_max { self.stage_fresh_ms_max_manifest_rsync_uri = metrics.manifest_rsync_uri.clone(); self.stage_fresh_ms_max_publication_point_rsync_uri = metrics.publication_point_rsync_uri.clone(); } self.stage_fresh_ms_total += metrics.stage_fresh_ms; self.stage_fresh_ms_max = self.stage_fresh_ms_max.max(metrics.stage_fresh_ms); self.snapshot_prepare_ms_total += metrics.snapshot_prepare_ms; self.snapshot_prepare_ms_max = self .snapshot_prepare_ms_max .max(metrics.snapshot_prepare_ms); self.snapshot_current_index_lock_ms_total += metrics.snapshot_current_index_lock_ms; self.snapshot_current_index_lock_ms_max = self .snapshot_current_index_lock_ms_max .max(metrics.snapshot_current_index_lock_ms); self.snapshot_manifest_load_ms_total += metrics.snapshot_manifest_load_ms; self.snapshot_manifest_load_ms_max = self .snapshot_manifest_load_ms_max .max(metrics.snapshot_manifest_load_ms); self.snapshot_manifest_index_lookup_ms_total += metrics.snapshot_manifest_index_lookup_ms; self.snapshot_manifest_index_lookup_ms_max = self .snapshot_manifest_index_lookup_ms_max .max(metrics.snapshot_manifest_index_lookup_ms); self.snapshot_manifest_blob_load_ms_total += metrics.snapshot_manifest_blob_load_ms; self.snapshot_manifest_blob_load_ms_max 
= self .snapshot_manifest_blob_load_ms_max .max(metrics.snapshot_manifest_blob_load_ms); self.snapshot_manifest_decode_ms_total += metrics.snapshot_manifest_decode_ms; self.snapshot_manifest_decode_ms_max = self .snapshot_manifest_decode_ms_max .max(metrics.snapshot_manifest_decode_ms); self.snapshot_replay_guard_ms_total += metrics.snapshot_replay_guard_ms; self.snapshot_replay_guard_ms_max = self .snapshot_replay_guard_ms_max .max(metrics.snapshot_replay_guard_ms); self.replay_meta_hit_count += metrics.replay_meta_hit_count; self.replay_meta_miss_count += metrics.replay_meta_miss_count; self.snapshot_manifest_entries_ms_total += metrics.snapshot_manifest_entries_ms; self.snapshot_manifest_entries_ms_max = self .snapshot_manifest_entries_ms_max .max(metrics.snapshot_manifest_entries_ms); self.snapshot_pack_files_ms_total += metrics.snapshot_pack_files_ms; self.snapshot_pack_files_ms_max = self .snapshot_pack_files_ms_max .max(metrics.snapshot_pack_files_ms); self.snapshot_pack_files_index_lookup_ms_total += metrics.snapshot_pack_files_index_lookup_ms; self.snapshot_pack_files_index_lookup_ms_max = self .snapshot_pack_files_index_lookup_ms_max .max(metrics.snapshot_pack_files_index_lookup_ms); self.snapshot_pack_files_blob_load_ms_total += metrics.snapshot_pack_files_blob_load_ms; self.snapshot_pack_files_blob_load_ms_max = self .snapshot_pack_files_blob_load_ms_max .max(metrics.snapshot_pack_files_blob_load_ms); self.snapshot_ee_path_validate_ms_total += metrics.snapshot_ee_path_validate_ms; self.snapshot_ee_path_validate_ms_max = self .snapshot_ee_path_validate_ms_max .max(metrics.snapshot_ee_path_validate_ms); self.snapshot_manifest_file_count_total += metrics.snapshot_manifest_file_count; self.snapshot_manifest_file_count_max = self .snapshot_manifest_file_count_max .max(metrics.snapshot_manifest_file_count); self.child_discovery_ms_total += metrics.child_discovery_ms; self.child_discovery_ms_max = self.child_discovery_ms_max.max(metrics.child_discovery_ms); 
self.child_enqueue_ms_total += metrics.child_enqueue_ms; self.child_enqueue_ms_max = self.child_enqueue_ms_max.max(metrics.child_enqueue_ms); self.ready_queue_wait_ms_total += metrics.ready_queue_wait_ms; self.ready_queue_wait_ms_max = self .ready_queue_wait_ms_max .max(metrics.ready_queue_wait_ms); self.roa_presence_scan_ms_total += metrics.roa_presence_scan_ms; self.roa_presence_scan_ms_max = self .roa_presence_scan_ms_max .max(metrics.roa_presence_scan_ms); self.roa_cache_view_ms_total += metrics.roa_cache_view_ms; self.roa_cache_view_ms_max = self.roa_cache_view_ms_max.max(metrics.roa_cache_view_ms); self.direct_finalize_ms_total += metrics.direct_finalize_ms; self.direct_finalize_ms_max = self.direct_finalize_ms_max.max(metrics.direct_finalize_ms); self.fallback_full_run_ms_total += metrics.fallback_full_run_ms; self.fallback_full_run_ms_max = self .fallback_full_run_ms_max .max(metrics.fallback_full_run_ms); self.prepare_ms_total += metrics.prepare_ms; self.prepare_ms_max = self.prepare_ms_max.max(metrics.prepare_ms); self.build_roa_tasks_ms_total += metrics.build_roa_tasks_ms; self.build_roa_tasks_ms_max = self.build_roa_tasks_ms_max.max(metrics.build_roa_tasks_ms); self.total_ms += metrics.total_ms; } } #[derive(Default)] struct RoaDispatchMetrics { attempted: usize, submitted: usize, queue_full: bool, pending_remaining: usize, duration_ms: u64, } #[derive(Default)] struct ObjectDrainMetrics { results_drained: usize, publication_points_completed: usize, worker_ms_total: u64, worker_ms_max: u64, queue_wait_ms_total: u64, queue_wait_ms_max: u64, result_budget_exhausted: bool, duration_ms: u64, } #[derive(Default)] struct FinalizeSubmitMetrics { submitted: usize, queue_full: bool, duration_ms: u64, } #[derive(Default)] struct FinalizePublicationPointMetrics { reduce_ms: u64, finalize_ms: u64, finalize_queue_wait_ms: Option, finalize_worker_ms: u64, snapshot_pack_ms: u64, persist_vcir_ms: u64, persist_build_vcir_ms: u64, persist_replace_vcir_ms: u64, 
persist_replace_breakdown: VcirReplaceTimingBreakdown, ccr_projection_build_ms: u64, ccr_append_ms: u64, audit_build_ms: u64, locked_files: usize, child_count: usize, warning_count: usize, vrp_count: usize, vap_count: usize, router_key_count: usize, audit_object_count: usize, } #[derive(Default)] struct FinalizeResultsDrainMetrics { results_drained: usize, reduce_ms_total: u64, reduce_ms_max: u64, finalize_ms_total: u64, finalize_ms_max: u64, finalize_queue_wait_ms_max: u64, finalize_worker_ms_total: u64, finalize_worker_ms_max: u64, snapshot_pack_ms_total: u64, snapshot_pack_ms_max: u64, persist_vcir_ms_total: u64, persist_vcir_ms_max: u64, persist_build_vcir_ms_total: u64, persist_build_vcir_ms_max: u64, persist_replace_vcir_ms_total: u64, persist_replace_vcir_ms_max: u64, ccr_projection_build_ms_total: u64, ccr_projection_build_ms_max: u64, ccr_append_ms_total: u64, ccr_append_ms_max: u64, audit_build_ms_total: u64, audit_build_ms_max: u64, duration_ms: u64, } #[derive(Default)] struct RepoDrainMetrics { event_count: usize, completions: usize, ready_enqueued: usize, duration_ms: u64, } fn elapsed_ms(started: Instant) -> u64 { started.elapsed().as_millis() as u64 } fn emit_control_loop_slow( duration_ms: u64, repo_poll_timeout: Duration, repo_metrics: &RepoDrainMetrics, ready_batch_metrics: &ReadyStageBatchMetrics, ca_queue_len: usize, ready_queue_len: usize, ca_waiting_repo_identities: usize, pending_roa_dispatch_len: usize, inflight_publication_points_len: usize, pending_finalization_len: usize, finalize_inflight: usize, ) { let threshold_ms = crate::progress_log::control_loop_slow_threshold_ms(); if duration_ms < threshold_ms { return; } crate::progress_log::emit( "phase2_control_loop_slow", serde_json::json!({ "duration_ms": duration_ms, "slow_threshold_ms": threshold_ms, "repo_poll_timeout_ms": repo_poll_timeout.as_millis() as u64, "repo_event_count": repo_metrics.event_count, "repo_completions": repo_metrics.completions, "repo_ready_enqueued": 
repo_metrics.ready_enqueued, "repo_drain_duration_ms": repo_metrics.duration_ms, "ready_count": ready_batch_metrics.ready_count, "ready_batch_duration_ms": ready_batch_metrics.total_ms, "ready_batch_stage_fresh_ms_total": ready_batch_metrics.stage_fresh_ms_total, "ready_batch_stage_fresh_ms_max": ready_batch_metrics.stage_fresh_ms_max, "ready_batch_stage_fresh_ms_max_manifest_rsync_uri": ready_batch_metrics.stage_fresh_ms_max_manifest_rsync_uri, "ready_batch_child_discovery_ms_total": ready_batch_metrics.child_discovery_ms_total, "ready_batch_child_discovery_ms_max": ready_batch_metrics.child_discovery_ms_max, "ready_batch_prepare_ms_total": ready_batch_metrics.prepare_ms_total, "ready_batch_prepare_ms_max": ready_batch_metrics.prepare_ms_max, "ready_batch_direct_finalize_ms_total": ready_batch_metrics.direct_finalize_ms_total, "ready_batch_direct_finalize_ms_max": ready_batch_metrics.direct_finalize_ms_max, "ca_queue_len": ca_queue_len, "ready_queue_len": ready_queue_len, "ca_waiting_repo_identities": ca_waiting_repo_identities, "pending_roa_dispatch_len": pending_roa_dispatch_len, "inflight_publication_points_len": inflight_publication_points_len, "pending_finalization_len": pending_finalization_len, "finalize_inflight": finalize_inflight, }), ); } fn compact_phase2_finished_result( mut result: PublicationPointRunResult, compact_audit: bool, ) -> FinishedPublicationPointResult { result.objects.audit.clear(); result.objects.local_outputs_cache.clear(); let cir_fresh_objects = if compact_audit && result.source == PublicationPointSource::Fresh { result.audit.objects.clone() } else { result.cir_fresh_objects }; if compact_audit { result.audit.objects.clear(); result.audit.warnings.clear(); } FinishedPublicationPointResult::Ok { source: result.source, warnings: result.warnings, objects: result.objects, audit: result.audit, cir_fresh_objects, cir_cached_objects: result.cir_cached_objects, } } fn compact_phase2_finished_result_result( result: Result, compact_audit: 
bool, ) -> FinishedPublicationPointResult { match result { Ok(result) => compact_phase2_finished_result(result, compact_audit), Err(err) => FinishedPublicationPointResult::Err(err), } } pub fn run_tree_parallel_phase2_audit_multi_root( roots: Vec, runner: &Rpkiv1PublicationPointRunner<'_>, config: &TreeRunConfig, ) -> Result { if runner.policy.signed_object_failure_policy == SignedObjectFailurePolicy::DropPublicationPoint { return run_tree_serial_audit_multi_root(roots, runner, config); } let Some(repo_runtime) = runner.repo_sync_runtime.as_ref() else { return run_tree_serial_audit_multi_root(roots, runner, config); }; if runner.parallel_roa_worker_pool.is_none() { return run_tree_serial_audit_multi_root(roots, runner, config); } let mut next_id: u64 = 0; let mut ca_queue: VecDeque = VecDeque::new(); for root in roots { ca_queue.push_back(QueuedCaInstance { id: next_id, handle: root, parent_id: None, discovered_from: None, }); next_id += 1; } let mut visited_manifest_uris: HashSet = HashSet::new(); let mut ca_waiting_repo_by_identity: HashMap> = HashMap::new(); let mut ready_queue: VecDeque = VecDeque::new(); let mut inflight_publication_points: HashMap = HashMap::new(); let mut pending_finalization: VecDeque = VecDeque::new(); let mut pending_roa_dispatch: VecDeque = VecDeque::new(); let mut finished: Vec = Vec::new(); let mut instances_started = 0usize; let phase2_config = runner.parallel_phase2_config.as_ref(); let ready_batch_size = phase2_config .map(|cfg| cfg.ready_batch_size) .unwrap_or(256) .max(1); let ready_batch_wall_time_budget_ms = phase2_config .map(|cfg| cfg.ready_batch_wall_time_budget_ms) .unwrap_or(100) .max(1); let ready_batch_wall_time_budget = Duration::from_millis(ready_batch_wall_time_budget_ms); let object_result_drain_batch_size = phase2_config .map(|cfg| cfg.object_result_drain_batch_size) .unwrap_or(2048) .max(1); let publication_point_finalize_queue_capacity = phase2_config .map(|cfg| cfg.publication_point_finalize_queue_capacity) 
.unwrap_or(32768) .max(1); let (finalize_task_tx, finalize_task_rx) = mpsc::sync_channel::(publication_point_finalize_queue_capacity); let (finalize_result_tx, finalize_result_rx) = mpsc::channel::(); let mut finalize_inflight = 0usize; return std::thread::scope(|scope| { let finalize_worker = scope.spawn(move || { run_finalize_worker( runner, finalize_task_rx, finalize_result_tx, config.compact_audit, ) }); let run_result: Result<(), TreeRunError> = (|| { loop { let control_loop_started = Instant::now(); drain_finalize_results_with_progress( &finalize_result_rx, &mut finished, &mut finalize_inflight, pending_finalization.len(), pending_roa_dispatch.len(), inflight_publication_points.len(), )?; flush_pending_roa_dispatch_with_progress( runner, &mut pending_roa_dispatch, &mut inflight_publication_points, &pending_finalization, )?; drain_object_results_with_progress( runner, &mut inflight_publication_points, &mut pending_finalization, pending_roa_dispatch.len(), object_result_drain_batch_size, )?; submit_pending_finalization_with_progress( &finalize_task_tx, &mut pending_finalization, &mut finalize_inflight, publication_point_finalize_queue_capacity, pending_roa_dispatch.len(), inflight_publication_points.len(), )?; start_queued_ca_instances( repo_runtime.as_ref(), &mut ca_queue, &mut ready_queue, &mut ca_waiting_repo_by_identity, &mut finished, &mut visited_manifest_uris, &mut instances_started, config, ); let repo_poll_timeout = event_poll_timeout( &ca_queue, &ready_queue, &pending_roa_dispatch, &inflight_publication_points, &pending_finalization, finalize_inflight, instances_started, config, ); let repo_metrics = drain_repo_events( repo_runtime.as_ref(), &mut ca_waiting_repo_by_identity, &mut ready_queue, repo_poll_timeout, )?; if repo_metrics.event_count > 0 { crate::progress_log::emit( "phase2_repo_events_drain", serde_json::json!({ "event_count": repo_metrics.event_count, "completions": repo_metrics.completions, "ready_enqueued": repo_metrics.ready_enqueued, 
"duration_ms": repo_metrics.duration_ms, "ready_queue_len": ready_queue.len(), "ca_waiting_repo_identities": ca_waiting_repo_by_identity.len(), }), ); } let ready_batch_started = Instant::now(); let mut ready_batch_metrics = ReadyStageBatchMetrics::default(); let mut ready_time_budget_exhausted = false; while ready_batch_metrics.ready_count < ready_batch_size { let Some(ready) = ready_queue.pop_front() else { break; }; let ready_queue_len_after_pop = ready_queue.len(); let metrics = stage_ready_publication_point( runner, &mut next_id, &mut ca_queue, &mut pending_roa_dispatch, &mut inflight_publication_points, &mut pending_finalization, &mut finished, ready, ready_queue_len_after_pop, config.compact_audit, ); ready_batch_metrics.record(metrics); if ready_batch_metrics.ready_count > 0 && ready_batch_started.elapsed() >= ready_batch_wall_time_budget { ready_time_budget_exhausted = !ready_queue.is_empty(); break; } } if ready_batch_metrics.ready_count > 0 { ready_batch_metrics.total_ms = elapsed_ms(ready_batch_started); let ready_count_budget_exhausted = ready_batch_metrics.ready_count >= ready_batch_size && !ready_queue.is_empty(); ready_time_budget_exhausted = ready_time_budget_exhausted || (!ready_queue.is_empty() && ready_batch_metrics.total_ms >= ready_batch_wall_time_budget_ms); emit_ready_queue_batch_progress( &ready_batch_metrics, ready_batch_size, ready_batch_wall_time_budget_ms, ready_queue.len(), ready_count_budget_exhausted, ready_time_budget_exhausted, ca_queue.len(), pending_roa_dispatch.len(), inflight_publication_points.len(), pending_finalization.len(), finalize_inflight, ); crate::progress_log::emit( "phase2_ready_queue_stage_fresh_breakdown", serde_json::json!({ "ready_count": ready_batch_metrics.ready_count, "stage_fresh_ms_total": ready_batch_metrics.stage_fresh_ms_total, "stage_fresh_ms_max": ready_batch_metrics.stage_fresh_ms_max, "stage_fresh_ms_max_manifest_rsync_uri": ready_batch_metrics.stage_fresh_ms_max_manifest_rsync_uri, 
"stage_fresh_ms_max_publication_point_rsync_uri": ready_batch_metrics.stage_fresh_ms_max_publication_point_rsync_uri, "snapshot_prepare_ms_total": ready_batch_metrics.snapshot_prepare_ms_total, "snapshot_prepare_ms_max": ready_batch_metrics.snapshot_prepare_ms_max, "snapshot_current_index_lock_ms_total": ready_batch_metrics.snapshot_current_index_lock_ms_total, "snapshot_current_index_lock_ms_max": ready_batch_metrics.snapshot_current_index_lock_ms_max, "snapshot_manifest_load_ms_total": ready_batch_metrics.snapshot_manifest_load_ms_total, "snapshot_manifest_load_ms_max": ready_batch_metrics.snapshot_manifest_load_ms_max, "snapshot_manifest_index_lookup_ms_total": ready_batch_metrics.snapshot_manifest_index_lookup_ms_total, "snapshot_manifest_index_lookup_ms_max": ready_batch_metrics.snapshot_manifest_index_lookup_ms_max, "snapshot_manifest_blob_load_ms_total": ready_batch_metrics.snapshot_manifest_blob_load_ms_total, "snapshot_manifest_blob_load_ms_max": ready_batch_metrics.snapshot_manifest_blob_load_ms_max, "snapshot_manifest_decode_ms_total": ready_batch_metrics.snapshot_manifest_decode_ms_total, "snapshot_manifest_decode_ms_max": ready_batch_metrics.snapshot_manifest_decode_ms_max, "snapshot_replay_guard_ms_total": ready_batch_metrics.snapshot_replay_guard_ms_total, "snapshot_replay_guard_ms_max": ready_batch_metrics.snapshot_replay_guard_ms_max, "replay_meta_hit_count": ready_batch_metrics.replay_meta_hit_count, "replay_meta_miss_count": ready_batch_metrics.replay_meta_miss_count, "snapshot_manifest_entries_ms_total": ready_batch_metrics.snapshot_manifest_entries_ms_total, "snapshot_manifest_entries_ms_max": ready_batch_metrics.snapshot_manifest_entries_ms_max, "snapshot_pack_files_ms_total": ready_batch_metrics.snapshot_pack_files_ms_total, "snapshot_pack_files_ms_max": ready_batch_metrics.snapshot_pack_files_ms_max, "snapshot_pack_files_index_lookup_ms_total": ready_batch_metrics.snapshot_pack_files_index_lookup_ms_total, 
"snapshot_pack_files_index_lookup_ms_max": ready_batch_metrics.snapshot_pack_files_index_lookup_ms_max, "snapshot_pack_files_blob_load_ms_total": ready_batch_metrics.snapshot_pack_files_blob_load_ms_total, "snapshot_pack_files_blob_load_ms_max": ready_batch_metrics.snapshot_pack_files_blob_load_ms_max, "snapshot_ee_path_validate_ms_total": ready_batch_metrics.snapshot_ee_path_validate_ms_total, "snapshot_ee_path_validate_ms_max": ready_batch_metrics.snapshot_ee_path_validate_ms_max, "snapshot_manifest_file_count_total": ready_batch_metrics.snapshot_manifest_file_count_total, "snapshot_manifest_file_count_max": ready_batch_metrics.snapshot_manifest_file_count_max, "child_discovery_ms_total": ready_batch_metrics.child_discovery_ms_total, "child_discovery_ms_max": ready_batch_metrics.child_discovery_ms_max, "batch_duration_ms": ready_batch_metrics.total_ms, }), ); } flush_pending_roa_dispatch_with_progress( runner, &mut pending_roa_dispatch, &mut inflight_publication_points, &pending_finalization, )?; drain_object_results_with_progress( runner, &mut inflight_publication_points, &mut pending_finalization, pending_roa_dispatch.len(), object_result_drain_batch_size, )?; submit_pending_finalization_with_progress( &finalize_task_tx, &mut pending_finalization, &mut finalize_inflight, publication_point_finalize_queue_capacity, pending_roa_dispatch.len(), inflight_publication_points.len(), )?; drain_finalize_results_with_progress( &finalize_result_rx, &mut finished, &mut finalize_inflight, pending_finalization.len(), pending_roa_dispatch.len(), inflight_publication_points.len(), )?; emit_control_loop_slow( elapsed_ms(control_loop_started), repo_poll_timeout, &repo_metrics, &ready_batch_metrics, ca_queue.len(), ready_queue.len(), ca_waiting_repo_by_identity.len(), pending_roa_dispatch.len(), inflight_publication_points.len(), pending_finalization.len(), finalize_inflight, ); if is_complete( &ca_queue, &ready_queue, &ca_waiting_repo_by_identity, &pending_roa_dispatch, 
&inflight_publication_points, &pending_finalization, finalize_inflight, instances_started, config, ) { break; } } repo_runtime .reset_run_state() .map_err(TreeRunError::Runner)?; Ok(()) })(); drop(finalize_task_tx); let worker_result = finalize_worker .join() .map_err(|_| TreeRunError::Runner("phase2 finalize worker panicked".to_string()))?; run_result?; worker_result?; drain_finalize_results_with_progress( &finalize_result_rx, &mut finished, &mut finalize_inflight, pending_finalization.len(), pending_roa_dispatch.len(), inflight_publication_points.len(), )?; if finalize_inflight != 0 || !pending_finalization.is_empty() { return Err(TreeRunError::Runner(format!( "phase2 finalize worker stopped with pending work: queued={} inflight={}", pending_finalization.len(), finalize_inflight ))); } Ok(build_tree_output(finished)) }); } fn emit_ready_queue_batch_progress( metrics: &ReadyStageBatchMetrics, ready_batch_size: usize, ready_batch_wall_time_budget_ms: u64, ready_queue_len_after_batch: usize, ready_count_budget_exhausted: bool, ready_time_budget_exhausted: bool, ca_queue_len_after_batch: usize, pending_roa_dispatch_len_after_batch: usize, inflight_publication_points_after_batch: usize, pending_finalization_len_after_batch: usize, finalize_inflight_after_batch: usize, ) { crate::progress_log::emit( "phase2_ready_queue_batch", serde_json::json!({ "ready_count": metrics.ready_count, "fallback_count": metrics.fallback_count, "complete_count": metrics.complete_count, "staged_count": metrics.staged_count, "zero_task_count": metrics.zero_task_count, "error_count": metrics.error_count, "discovered_children": metrics.discovered_children, "locked_files": metrics.locked_files, "roa_tasks": metrics.roa_tasks, "aspa_objects": metrics.aspa_objects, "stage_fresh_ms_total": metrics.stage_fresh_ms_total, "stage_fresh_ms_max": metrics.stage_fresh_ms_max, "stage_fresh_ms_max_manifest_rsync_uri": metrics.stage_fresh_ms_max_manifest_rsync_uri, 
"stage_fresh_ms_max_publication_point_rsync_uri": metrics.stage_fresh_ms_max_publication_point_rsync_uri, "prepare_ms_total": metrics.prepare_ms_total, "prepare_ms_max": metrics.prepare_ms_max, "build_roa_tasks_ms_total": metrics.build_roa_tasks_ms_total, "build_roa_tasks_ms_max": metrics.build_roa_tasks_ms_max, "batch_duration_ms": metrics.total_ms, "ready_batch_size": ready_batch_size, "ready_batch_wall_time_budget_ms": ready_batch_wall_time_budget_ms, "ready_queue_len_after_batch": ready_queue_len_after_batch, "ready_queue_budget_exhausted": ready_queue_len_after_batch > 0, "ready_count_budget_exhausted": ready_count_budget_exhausted, "ready_time_budget_exhausted": ready_time_budget_exhausted, "ca_queue_len_after_batch": ca_queue_len_after_batch, "pending_roa_dispatch_len_after_batch": pending_roa_dispatch_len_after_batch, "inflight_publication_points_after_batch": inflight_publication_points_after_batch, "pending_finalization_len_after_batch": pending_finalization_len_after_batch, "finalize_inflight_after_batch": finalize_inflight_after_batch, }), ); crate::progress_log::emit( "phase2_ready_queue_control_breakdown", serde_json::json!({ "ready_count": metrics.ready_count, "ready_queue_wait_ms_total": metrics.ready_queue_wait_ms_total, "ready_queue_wait_ms_max": metrics.ready_queue_wait_ms_max, "child_enqueue_ms_total": metrics.child_enqueue_ms_total, "child_enqueue_ms_max": metrics.child_enqueue_ms_max, "roa_presence_scan_ms_total": metrics.roa_presence_scan_ms_total, "roa_presence_scan_ms_max": metrics.roa_presence_scan_ms_max, "roa_cache_view_ms_total": metrics.roa_cache_view_ms_total, "roa_cache_view_ms_max": metrics.roa_cache_view_ms_max, "direct_finalize_ms_total": metrics.direct_finalize_ms_total, "direct_finalize_ms_max": metrics.direct_finalize_ms_max, "fallback_full_run_ms_total": metrics.fallback_full_run_ms_total, "fallback_full_run_ms_max": metrics.fallback_full_run_ms_max, "batch_duration_ms": metrics.total_ms, }), ); } fn 
can_start_more(instances_started: usize, config: &TreeRunConfig) -> bool { config .max_instances .map(|max| instances_started < max) .unwrap_or(true) } fn start_queued_ca_instances( repo_runtime: &dyn crate::parallel::repo_runtime::RepoSyncRuntime, ca_queue: &mut VecDeque, ready_queue: &mut VecDeque, ca_waiting_repo_by_identity: &mut HashMap>, finished: &mut Vec, visited_manifest_uris: &mut HashSet, instances_started: &mut usize, config: &TreeRunConfig, ) { while can_start_more(*instances_started, config) { let Some(node) = ca_queue.pop_front() else { break; }; if !visited_manifest_uris.insert(node.handle.manifest_rsync_uri.clone()) { continue; } if let Some(max_depth) = config.max_depth { if node.handle.depth > max_depth { continue; } } *instances_started += 1; match repo_runtime.request_publication_point_repo(&node.handle, 0) { Ok(RepoSyncRequestStatus::Ready { mut outcome, .. }) => { // Ready here means this CA is reusing repo work that has already completed // (often due to child prefetch). Do not add the transport duration again. outcome.repo_sync_duration_ms = 0; ready_queue.push_back(ReadyCaInstance { node, repo_outcome: outcome, ready_enqueued_at: Instant::now(), }); } Ok(RepoSyncRequestStatus::Pending { identity, .. 
}) => { ca_waiting_repo_by_identity .entry(identity) .or_default() .push(node); } Err(err) => { finished.push(FinishedPublicationPoint { node: FinishedPublicationPointNode::from_queued(node), result: FinishedPublicationPointResult::Err(err), }); } } } } fn stage_ready_publication_point( runner: &Rpkiv1PublicationPointRunner<'_>, next_id: &mut u64, ca_queue: &mut VecDeque, pending_roa_dispatch: &mut VecDeque, inflight_publication_points: &mut HashMap, pending_finalization: &mut VecDeque, finished: &mut Vec, ready: ReadyCaInstance, ready_queue_len_after_pop: usize, compact_audit: bool, ) -> ReadyStageMetrics { let publication_point_started = Instant::now(); let ready_queue_wait_ms = publication_point_started .saturating_duration_since(ready.ready_enqueued_at) .as_millis() as u64; let mut metrics = ReadyStageMetrics { ready_count: 1, manifest_rsync_uri: Some(ready.node.handle.manifest_rsync_uri.clone()), publication_point_rsync_uri: Some(ready.node.handle.publication_point_rsync_uri.clone()), ready_queue_wait_ms, ready_queue_len_after_pop, ..ReadyStageMetrics::default() }; let mut warnings = ready.repo_outcome.warnings.clone(); let repo_outcome = ready.repo_outcome.clone(); if let Some(result) = runner.observe_or_reuse_publication_point_cache( &ready.node.handle, repo_outcome.repo_sync_source.as_deref(), repo_outcome.repo_sync_phase.as_deref(), repo_outcome.repo_sync_duration_ms, repo_outcome.repo_sync_err.as_deref(), &warnings, ) { metrics.complete_count = 1; metrics.discovered_children = result.discovered_children.len(); let child_enqueue_started = Instant::now(); enqueue_discovered_children( runner, next_id, ca_queue, &ready.node, result.discovered_children.clone(), ); metrics.child_enqueue_ms = elapsed_ms(child_enqueue_started); finished.push(FinishedPublicationPoint { node: FinishedPublicationPointNode::from_queued(ready.node), result: compact_phase2_finished_result(result, compact_audit), }); metrics.total_ms = elapsed_ms(publication_point_started); 
emit_ready_publication_point_control_slow( metrics.manifest_rsync_uri.as_deref().unwrap_or_default(), metrics .publication_point_rsync_uri .as_deref() .unwrap_or_default(), &repo_outcome, &metrics, "publication_point_cache", false, ); return metrics; } let stage_fresh_started = Instant::now(); let stage = runner.stage_fresh_publication_point_after_repo_ready( &ready.node.handle, repo_outcome.repo_sync_ok, repo_outcome.repo_sync_err.as_deref(), ); metrics.stage_fresh_ms = elapsed_ms(stage_fresh_started); let fresh_stage = match stage { Ok(stage) => stage, Err(err) => { if metrics.stage_fresh_ms >= crate::progress_log::stage_fresh_slow_threshold_ms() { crate::progress_log::emit( "phase2_stage_fresh_slow", serde_json::json!({ "manifest_rsync_uri": ready.node.handle.manifest_rsync_uri.as_str(), "publication_point_rsync_uri": ready.node.handle.publication_point_rsync_uri.as_str(), "status": "error", "error": err.error.to_string(), "stage_fresh_ms": metrics.stage_fresh_ms, "snapshot_prepare_ms": err.snapshot_prepare_ms, "repo_sync_source": repo_outcome.repo_sync_source.as_deref(), "repo_sync_phase": repo_outcome.repo_sync_phase.as_deref(), "repo_sync_duration_ms": repo_outcome.repo_sync_duration_ms, }), ); } metrics.fallback_count = 1; let fallback_started = Instant::now(); let fallback = runner.run_publication_point(&ready.node.handle); metrics.fallback_full_run_ms = elapsed_ms(fallback_started); if let Ok(result) = fallback.as_ref() { metrics.discovered_children = result.discovered_children.len(); let child_enqueue_started = Instant::now(); enqueue_discovered_children( runner, next_id, ca_queue, &ready.node, result.discovered_children.clone(), ); metrics.child_enqueue_ms = elapsed_ms(child_enqueue_started); } finished.push(FinishedPublicationPoint { node: FinishedPublicationPointNode::from_queued(ready.node), result: compact_phase2_finished_result_result(fallback, compact_audit), }); metrics.total_ms = elapsed_ms(publication_point_started); 
emit_ready_publication_point_control_slow( metrics.manifest_rsync_uri.as_deref().unwrap_or_default(), metrics .publication_point_rsync_uri .as_deref() .unwrap_or_default(), &repo_outcome, &metrics, "fallback", true, ); return metrics; } }; metrics.snapshot_prepare_ms = fresh_stage.snapshot_prepare_ms; metrics.snapshot_current_index_lock_ms = fresh_stage.snapshot_prepare_timing.current_index_lock_ms; metrics.snapshot_manifest_load_ms = fresh_stage.snapshot_prepare_timing.manifest_load_ms; metrics.snapshot_manifest_index_lookup_ms = fresh_stage.snapshot_prepare_timing.manifest_index_lookup_ms; metrics.snapshot_manifest_blob_load_ms = fresh_stage.snapshot_prepare_timing.manifest_blob_load_ms; metrics.snapshot_manifest_decode_ms = fresh_stage.snapshot_prepare_timing.manifest_decode_ms; metrics.snapshot_replay_guard_ms = fresh_stage.snapshot_prepare_timing.replay_guard_ms; metrics.replay_meta_hit_count = fresh_stage.snapshot_prepare_timing.replay_meta_hit as usize; metrics.replay_meta_miss_count = fresh_stage.snapshot_prepare_timing.replay_meta_miss as usize; metrics.snapshot_manifest_entries_ms = fresh_stage.snapshot_prepare_timing.manifest_entries_ms; metrics.snapshot_pack_files_ms = fresh_stage.snapshot_prepare_timing.pack_files_ms; metrics.snapshot_pack_files_index_lookup_ms = fresh_stage .snapshot_prepare_timing .pack_files_index_lookup_ms; metrics.snapshot_pack_files_blob_load_ms = fresh_stage.snapshot_prepare_timing.pack_files_blob_load_ms; metrics.snapshot_ee_path_validate_ms = fresh_stage.snapshot_prepare_timing.ee_path_validate_ms; metrics.snapshot_manifest_file_count = fresh_stage.snapshot_prepare_timing.manifest_file_count; metrics.child_discovery_ms = fresh_stage.child_discovery_ms; if metrics.stage_fresh_ms >= crate::progress_log::stage_fresh_slow_threshold_ms() { crate::progress_log::emit( "phase2_stage_fresh_slow", serde_json::json!({ "manifest_rsync_uri": ready.node.handle.manifest_rsync_uri.as_str(), "publication_point_rsync_uri": 
ready.node.handle.publication_point_rsync_uri.as_str(), "status": "ok", "stage_fresh_ms": metrics.stage_fresh_ms, "snapshot_prepare_ms": fresh_stage.snapshot_prepare_ms, "snapshot_current_index_lock_ms": fresh_stage.snapshot_prepare_timing.current_index_lock_ms, "snapshot_manifest_load_ms": fresh_stage.snapshot_prepare_timing.manifest_load_ms, "snapshot_manifest_index_lookup_ms": fresh_stage.snapshot_prepare_timing.manifest_index_lookup_ms, "snapshot_manifest_blob_load_ms": fresh_stage.snapshot_prepare_timing.manifest_blob_load_ms, "snapshot_manifest_decode_ms": fresh_stage.snapshot_prepare_timing.manifest_decode_ms, "snapshot_replay_guard_ms": fresh_stage.snapshot_prepare_timing.replay_guard_ms, "replay_meta_hit": fresh_stage.snapshot_prepare_timing.replay_meta_hit, "replay_meta_miss": fresh_stage.snapshot_prepare_timing.replay_meta_miss, "snapshot_manifest_entries_ms": fresh_stage.snapshot_prepare_timing.manifest_entries_ms, "snapshot_pack_files_ms": fresh_stage.snapshot_prepare_timing.pack_files_ms, "snapshot_pack_files_index_lookup_ms": fresh_stage.snapshot_prepare_timing.pack_files_index_lookup_ms, "snapshot_pack_files_blob_load_ms": fresh_stage.snapshot_prepare_timing.pack_files_blob_load_ms, "snapshot_ee_path_validate_ms": fresh_stage.snapshot_prepare_timing.ee_path_validate_ms, "snapshot_manifest_file_count": fresh_stage.snapshot_prepare_timing.manifest_file_count, "child_discovery_ms": fresh_stage.child_discovery_ms, "child_count": fresh_stage.discovered_children.len(), "repo_sync_source": repo_outcome.repo_sync_source.as_deref(), "repo_sync_phase": repo_outcome.repo_sync_phase.as_deref(), "repo_sync_duration_ms": repo_outcome.repo_sync_duration_ms, }), ); } warnings.extend(fresh_stage.warnings.clone()); metrics.discovered_children = fresh_stage.discovered_children.len(); let child_enqueue_started = Instant::now(); enqueue_discovered_children( runner, next_id, ca_queue, &ready.node, fresh_stage.discovered_children.clone(), ); metrics.child_enqueue_ms = 
elapsed_ms(child_enqueue_started); let prepare_started = Instant::now(); let roa_presence_scan_started = Instant::now(); let has_roa = fresh_stage .fresh_point .files() .iter() .any(|file| file.rsync_uri.ends_with(".roa")); metrics.roa_presence_scan_ms = elapsed_ms(roa_presence_scan_started); if runner.enable_roa_validation_cache { if let Some(timing) = runner.timing.as_ref() { if has_roa { timing.record_count("roa_validation_cache_roa_candidate_publication_points", 1); } else { timing.record_count("roa_validation_cache_skipped_no_roa_publication_points", 1); } } } let roa_cache_view = if has_roa { let roa_cache_view_started = Instant::now(); let view = runner .roa_validation_cache_view_for_fresh_point(&fresh_stage.fresh_point.manifest_rsync_uri); metrics.roa_cache_view_ms = elapsed_ms(roa_cache_view_started); view } else { None }; let roa_cache = if runner.enable_roa_validation_cache && has_roa { RoaValidationCacheInput::enabled(roa_cache_view.as_ref()) } else { RoaValidationCacheInput::disabled() }; match prepare_publication_point_for_parallel_roa_with_cache( ready.node.id, &fresh_stage.fresh_point, runner.policy, &ready.node.handle.ca_certificate_der, ready.node.handle.ca_certificate_rsync_uri.as_deref(), ready.node.handle.effective_ip_resources.as_ref(), ready.node.handle.effective_as_resources.as_ref(), runner.validation_time, runner.persist_vcir, roa_cache, ) { ParallelObjectsPrepare::Complete(mut objects) => { metrics.prepare_ms = elapsed_ms(prepare_started); metrics.complete_count = 1; metrics.roa_tasks = objects.stats.roa_total; metrics.aspa_objects = objects.stats.aspa_total; objects .router_keys .extend(fresh_stage.discovered_router_keys.clone()); objects.local_outputs_cache.extend( crate::validation::tree_runner::build_router_key_local_outputs( &ready.node.handle, &objects.router_keys, ), ); let direct_finalize_started = Instant::now(); let finalize_metrics = finalize_ready_objects( runner, ready.node, fresh_stage, warnings, objects, 
repo_outcome.clone(), finished, compact_audit, ); metrics.direct_finalize_ms = finalize_metrics .finalize_ms .max(elapsed_ms(direct_finalize_started)); } ParallelObjectsPrepare::Staged(objects_stage) => { metrics.prepare_ms = elapsed_ms(prepare_started); metrics.staged_count = 1; metrics.locked_files = objects_stage.locked_file_count(); metrics.aspa_objects = objects_stage.aspa_task_count(); let build_tasks_started = Instant::now(); objects_stage.append_roa_tasks_to(pending_roa_dispatch); metrics.build_roa_tasks_ms = elapsed_ms(build_tasks_started); let task_count = objects_stage.roa_task_count(); metrics.roa_tasks = task_count; if task_count == 0 { metrics.zero_task_count = 1; pending_finalization.push_back(FinalizeTask { state: InflightPublicationPoint { node: ready.node, fresh_stage, objects_stage, repo_outcome: repo_outcome.clone(), warnings, started_at: publication_point_started, objects_started_at: Instant::now(), task_count, tasks_submitted: 0, first_task_submitted_at: None, last_task_submitted_at: None, first_result_at: None, last_result_at: None, worker_ms_total: 0, worker_ms_max: 0, queue_wait_ms_total: 0, queue_wait_ms_max: 0, finalize_enqueued_at: Some(Instant::now()), results: Vec::new(), }, }); } else { inflight_publication_points.insert( ready.node.id, InflightPublicationPoint { node: ready.node, fresh_stage, objects_stage, repo_outcome: repo_outcome.clone(), warnings, started_at: publication_point_started, objects_started_at: Instant::now(), task_count, tasks_submitted: 0, first_task_submitted_at: None, last_task_submitted_at: None, first_result_at: None, last_result_at: None, worker_ms_total: 0, worker_ms_max: 0, queue_wait_ms_total: 0, queue_wait_ms_max: 0, finalize_enqueued_at: None, results: Vec::with_capacity(task_count), }, ); } } } metrics.total_ms = elapsed_ms(publication_point_started); emit_ready_publication_point_control_slow( metrics.manifest_rsync_uri.as_deref().unwrap_or_default(), metrics .publication_point_rsync_uri .as_deref() 
.unwrap_or_default(), &repo_outcome, &metrics, if metrics.complete_count > 0 { "complete" } else if metrics.zero_task_count > 0 { "zero_task" } else { "staged" }, false, ); metrics } fn emit_ready_publication_point_control_slow( manifest_rsync_uri: &str, publication_point_rsync_uri: &str, repo_outcome: &RepoSyncRuntimeOutcome, metrics: &ReadyStageMetrics, status: &str, force_error_path: bool, ) { let threshold_ms = crate::progress_log::pp_control_slow_threshold_ms(); if !force_error_path && metrics.total_ms < threshold_ms { return; } crate::progress_log::emit( "phase2_ready_publication_point_control_slow", serde_json::json!({ "manifest_rsync_uri": manifest_rsync_uri, "publication_point_rsync_uri": publication_point_rsync_uri, "status": status, "repo_sync_source": repo_outcome.repo_sync_source.as_deref(), "repo_sync_phase": repo_outcome.repo_sync_phase.as_deref(), "repo_sync_duration_ms": repo_outcome.repo_sync_duration_ms, "repo_sync_ok": repo_outcome.repo_sync_ok, "repo_sync_err": repo_outcome.repo_sync_err.as_deref(), "ready_queue_wait_ms": metrics.ready_queue_wait_ms, "ready_queue_len_after_pop": metrics.ready_queue_len_after_pop, "stage_fresh_ms": metrics.stage_fresh_ms, "child_discovery_ms": metrics.child_discovery_ms, "child_enqueue_ms": metrics.child_enqueue_ms, "discovered_children": metrics.discovered_children, "roa_presence_scan_ms": metrics.roa_presence_scan_ms, "roa_cache_view_ms": metrics.roa_cache_view_ms, "prepare_ms": metrics.prepare_ms, "build_roa_tasks_ms": metrics.build_roa_tasks_ms, "direct_finalize_ms": metrics.direct_finalize_ms, "fallback_full_run_ms": metrics.fallback_full_run_ms, "locked_files": metrics.locked_files, "roa_tasks": metrics.roa_tasks, "aspa_objects": metrics.aspa_objects, "complete_count": metrics.complete_count, "staged_count": metrics.staged_count, "zero_task_count": metrics.zero_task_count, "fallback_count": metrics.fallback_count, "total_ms": metrics.total_ms, "slow_threshold_ms": threshold_ms, }), ); 
crate::progress_log::emit( "phase2_ready_publication_point_control_snapshot_breakdown", serde_json::json!({ "manifest_rsync_uri": manifest_rsync_uri, "publication_point_rsync_uri": publication_point_rsync_uri, "status": status, "snapshot_prepare_ms": metrics.snapshot_prepare_ms, "snapshot_current_index_lock_ms": metrics.snapshot_current_index_lock_ms, "snapshot_manifest_load_ms": metrics.snapshot_manifest_load_ms, "snapshot_manifest_index_lookup_ms": metrics.snapshot_manifest_index_lookup_ms, "snapshot_manifest_blob_load_ms": metrics.snapshot_manifest_blob_load_ms, "snapshot_manifest_decode_ms": metrics.snapshot_manifest_decode_ms, "snapshot_replay_guard_ms": metrics.snapshot_replay_guard_ms, "replay_meta_hit_count": metrics.replay_meta_hit_count, "replay_meta_miss_count": metrics.replay_meta_miss_count, "snapshot_manifest_entries_ms": metrics.snapshot_manifest_entries_ms, "snapshot_pack_files_ms": metrics.snapshot_pack_files_ms, "snapshot_pack_files_index_lookup_ms": metrics.snapshot_pack_files_index_lookup_ms, "snapshot_pack_files_blob_load_ms": metrics.snapshot_pack_files_blob_load_ms, "snapshot_ee_path_validate_ms": metrics.snapshot_ee_path_validate_ms, "snapshot_manifest_file_count": metrics.snapshot_manifest_file_count, "total_ms": metrics.total_ms, "slow_threshold_ms": threshold_ms, }), ); } fn enqueue_discovered_children( runner: &Rpkiv1PublicationPointRunner<'_>, next_id: &mut u64, ca_queue: &mut VecDeque, parent: &QueuedCaInstance, mut children: Vec, ) { children.sort_by(|a, b| { a.handle .manifest_rsync_uri .cmp(&b.handle.manifest_rsync_uri) .then_with(|| { a.discovered_from .child_ca_certificate_rsync_uri .cmp(&b.discovered_from.child_ca_certificate_rsync_uri) }) }); if let Some(runtime) = runner.repo_sync_runtime.as_ref() { let _ = runtime.prefetch_discovered_children(&children); } for child in children { let mut handle = child.handle.with_depth(parent.handle.depth + 1); handle.parent_manifest_rsync_uri = Some(parent.handle.manifest_rsync_uri.clone()); 
ca_queue.push_back(QueuedCaInstance { id: *next_id, handle, parent_id: Some(parent.id), discovered_from: Some(child.discovered_from), }); *next_id += 1; } } fn finalize_ready_objects( runner: &Rpkiv1PublicationPointRunner<'_>, node: QueuedCaInstance, fresh_stage: FreshPublicationPointStage, warnings: Vec, objects: ObjectsOutput, repo_outcome: RepoSyncRuntimeOutcome, finished: &mut Vec, compact_audit: bool, ) -> FinalizePublicationPointMetrics { let locked_files = fresh_stage.fresh_point.files().len(); let finalize_started = Instant::now(); let finalized = runner.finalize_fresh_publication_point_from_reducer( &node.handle, &fresh_stage.fresh_point, warnings, objects, fresh_stage.child_audits, fresh_stage.discovered_children, repo_outcome.repo_sync_source.as_deref(), repo_outcome.repo_sync_phase.as_deref(), repo_outcome.repo_sync_duration_ms, repo_outcome.repo_sync_err.as_deref(), ); let finalize_ms = elapsed_ms(finalize_started); let (result, metrics) = match finalized { Ok(output) => { let metrics = finalize_metrics_from_output( &output, 0, finalize_ms, None, finalize_ms, locked_files, ); emit_finalize_breakdown( "phase2_direct_finalize_breakdown", &node.handle.manifest_rsync_uri, &node.handle.publication_point_rsync_uri, &metrics, ); ( compact_phase2_finished_result(output.result, compact_audit), metrics, ) } Err(err) => { let metrics = FinalizePublicationPointMetrics { finalize_ms, finalize_worker_ms: finalize_ms, locked_files, ..FinalizePublicationPointMetrics::default() }; emit_finalize_breakdown( "phase2_direct_finalize_breakdown", &node.handle.manifest_rsync_uri, &node.handle.publication_point_rsync_uri, &metrics, ); (FinishedPublicationPointResult::Err(err), metrics) } }; finished.push(FinishedPublicationPoint { node: FinishedPublicationPointNode::from_queued(node), result, }); metrics } fn finalize_metrics_from_output( output: &FreshPublicationPointFinalizeOutput, reduce_ms: u64, finalize_ms: u64, finalize_queue_wait_ms: Option, finalize_worker_ms: u64, 
locked_files: usize, ) -> FinalizePublicationPointMetrics { FinalizePublicationPointMetrics { reduce_ms, finalize_ms, finalize_queue_wait_ms, finalize_worker_ms, snapshot_pack_ms: output.snapshot_pack_ms, persist_vcir_ms: output.persist_vcir_ms, persist_build_vcir_ms: output.persist_vcir_timing.build_vcir_ms, persist_replace_vcir_ms: output.persist_vcir_timing.replace_vcir_ms, persist_replace_breakdown: output.persist_vcir_timing.replace_vcir.clone(), ccr_projection_build_ms: output.ccr_projection_build_ms, ccr_append_ms: output.ccr_append_ms, audit_build_ms: output.audit_build_ms, locked_files, child_count: output.result.discovered_children.len(), warning_count: output.result.warnings.len(), vrp_count: output.result.objects.vrps.len(), vap_count: output.result.objects.aspas.len(), router_key_count: output.result.objects.router_keys.len(), audit_object_count: output.result.audit.objects.len(), } } fn emit_finalize_breakdown( event_name: &str, manifest_rsync_uri: &str, publication_point_rsync_uri: &str, metrics: &FinalizePublicationPointMetrics, ) { crate::progress_log::emit( event_name, serde_json::json!({ "manifest_rsync_uri": manifest_rsync_uri, "publication_point_rsync_uri": publication_point_rsync_uri, "reduce_ms": metrics.reduce_ms, "finalize_ms": metrics.finalize_ms, "finalize_queue_wait_ms": metrics.finalize_queue_wait_ms, "finalize_worker_ms": metrics.finalize_worker_ms, "snapshot_pack_ms": metrics.snapshot_pack_ms, "persist_vcir_ms": metrics.persist_vcir_ms, "persist_build_vcir_ms": metrics.persist_build_vcir_ms, "persist_replace_vcir_ms": metrics.persist_replace_vcir_ms, "persist_replace_breakdown": &metrics.persist_replace_breakdown, "ccr_projection_build_ms": metrics.ccr_projection_build_ms, "ccr_append_ms": metrics.ccr_append_ms, "audit_build_ms": metrics.audit_build_ms, "locked_files": metrics.locked_files, "child_count": metrics.child_count, "warning_count": metrics.warning_count, "vrp_count": metrics.vrp_count, "vap_count": metrics.vap_count, 
"router_key_count": metrics.router_key_count, "audit_object_count": metrics.audit_object_count, }), ); } fn flush_pending_roa_dispatch( runner: &Rpkiv1PublicationPointRunner<'_>, pending_roa_dispatch: &mut VecDeque, inflight_publication_points: &mut HashMap, ) -> Result { let started = Instant::now(); let mut metrics = RoaDispatchMetrics::default(); let Some(pool) = runner.parallel_roa_worker_pool.as_ref() else { return Ok(metrics); }; while let Some(mut task) = pending_roa_dispatch.pop_front() { metrics.attempted += 1; let pp_id = task.publication_point_id; task.submitted_at = Some(Instant::now()); match pool.try_submit_round_robin(task) { Ok(_) => { metrics.submitted += 1; if let Some(state) = inflight_publication_points.get_mut(&pp_id) { let now = Instant::now(); state.tasks_submitted += 1; if state.first_task_submitted_at.is_none() { state.first_task_submitted_at = Some(now); } state.last_task_submitted_at = Some(now); } } Err(ObjectWorkerSubmitError::QueueFull { task, .. }) => { pending_roa_dispatch.push_front(task); metrics.queue_full = true; break; } Err(ObjectWorkerSubmitError::Disconnected { .. 
}) => { return Err(TreeRunError::Runner( "parallel ROA worker queue disconnected".to_string(), )); } } } metrics.pending_remaining = pending_roa_dispatch.len(); metrics.duration_ms = elapsed_ms(started); Ok(metrics) } fn flush_pending_roa_dispatch_with_progress( runner: &Rpkiv1PublicationPointRunner<'_>, pending_roa_dispatch: &mut VecDeque, inflight_publication_points: &mut HashMap, pending_finalization: &VecDeque, ) -> Result<(), TreeRunError> { let dispatch_metrics = flush_pending_roa_dispatch(runner, pending_roa_dispatch, inflight_publication_points)?; if dispatch_metrics.attempted > 0 || dispatch_metrics.queue_full { crate::progress_log::emit( "phase2_roa_dispatch_batch", serde_json::json!({ "attempted": dispatch_metrics.attempted, "submitted": dispatch_metrics.submitted, "queue_full": dispatch_metrics.queue_full, "pending_remaining": dispatch_metrics.pending_remaining, "duration_ms": dispatch_metrics.duration_ms, "inflight_publication_points": inflight_publication_points.len(), "pending_finalization_len": pending_finalization.len(), }), ); } Ok(()) } fn drain_object_results( runner: &Rpkiv1PublicationPointRunner<'_>, inflight_publication_points: &mut HashMap, pending_finalization: &mut VecDeque, result_budget: usize, ) -> Result { let started = Instant::now(); let mut metrics = ObjectDrainMetrics::default(); let Some(pool) = runner.parallel_roa_worker_pool.as_ref() else { return Ok(metrics); }; let result_budget = result_budget.max(1); while metrics.results_drained < result_budget { let Some(result) = pool .recv_result_timeout(Duration::from_millis(0)) .map_err(TreeRunError::Runner)? 
else { break; }; metrics.results_drained += 1; let pp_id = result.publication_point_id; let _worker_index = result.worker_index; metrics.worker_ms_total += result.worker_ms; metrics.worker_ms_max = metrics.worker_ms_max.max(result.worker_ms); metrics.queue_wait_ms_total += result.queue_wait_ms; metrics.queue_wait_ms_max = metrics.queue_wait_ms_max.max(result.queue_wait_ms); let should_finalize = if let Some(state) = inflight_publication_points.get_mut(&pp_id) { let now = Instant::now(); if state.first_result_at.is_none() { state.first_result_at = Some(now); } state.last_result_at = Some(now); state.worker_ms_total += result.worker_ms; state.worker_ms_max = state.worker_ms_max.max(result.worker_ms); state.queue_wait_ms_total += result.queue_wait_ms; state.queue_wait_ms_max = state.queue_wait_ms_max.max(result.queue_wait_ms); state.results.push(result); state.results.len() == state.task_count } else { false }; if should_finalize { let mut state = inflight_publication_points .remove(&pp_id) .expect("inflight publication point must exist"); state.finalize_enqueued_at = Some(Instant::now()); metrics.publication_points_completed += 1; pending_finalization.push_back(FinalizeTask { state }); } } metrics.result_budget_exhausted = metrics.results_drained == result_budget; metrics.duration_ms = elapsed_ms(started); Ok(metrics) } fn drain_object_results_with_progress( runner: &Rpkiv1PublicationPointRunner<'_>, inflight_publication_points: &mut HashMap, pending_finalization: &mut VecDeque, pending_roa_dispatch_len: usize, result_budget: usize, ) -> Result<(), TreeRunError> { let drain_metrics = drain_object_results( runner, inflight_publication_points, pending_finalization, result_budget, )?; if drain_metrics.results_drained > 0 || drain_metrics.result_budget_exhausted { crate::progress_log::emit( "phase2_object_results_drain", serde_json::json!({ "results_drained": drain_metrics.results_drained, "publication_points_completed": drain_metrics.publication_points_completed, 
"result_budget_exhausted": drain_metrics.result_budget_exhausted, "result_drain_batch_size": result_budget, "worker_ms_total": drain_metrics.worker_ms_total, "worker_ms_max": drain_metrics.worker_ms_max, "queue_wait_ms_total": drain_metrics.queue_wait_ms_total, "queue_wait_ms_max": drain_metrics.queue_wait_ms_max, "duration_ms": drain_metrics.duration_ms, "pending_roa_dispatch_len": pending_roa_dispatch_len, "inflight_publication_points": inflight_publication_points.len(), "pending_finalization_len": pending_finalization.len(), }), ); } Ok(()) } fn submit_pending_finalization( finalize_task_tx: &SyncSender, pending_finalization: &mut VecDeque, finalize_inflight: &mut usize, ) -> Result { let started = Instant::now(); let mut metrics = FinalizeSubmitMetrics::default(); while let Some(task) = pending_finalization.pop_front() { match finalize_task_tx.try_send(task) { Ok(()) => { metrics.submitted += 1; *finalize_inflight += 1; } Err(TrySendError::Full(task)) => { pending_finalization.push_front(task); metrics.queue_full = true; break; } Err(TrySendError::Disconnected(_task)) => { return Err(TreeRunError::Runner( "phase2 finalize worker queue disconnected".to_string(), )); } } } metrics.duration_ms = elapsed_ms(started); Ok(metrics) } fn submit_pending_finalization_with_progress( finalize_task_tx: &SyncSender, pending_finalization: &mut VecDeque, finalize_inflight: &mut usize, finalize_queue_capacity: usize, pending_roa_dispatch_len: usize, inflight_publication_points_len: usize, ) -> Result<(), TreeRunError> { let submit_metrics = submit_pending_finalization(finalize_task_tx, pending_finalization, finalize_inflight)?; if submit_metrics.submitted > 0 || submit_metrics.queue_full { crate::progress_log::emit( "phase2_finalize_task_submit", serde_json::json!({ "submitted": submit_metrics.submitted, "queue_full": submit_metrics.queue_full, "duration_ms": submit_metrics.duration_ms, "finalize_queue_capacity": finalize_queue_capacity, "pending_finalization_len": 
pending_finalization.len(), "finalize_inflight": *finalize_inflight, "pending_roa_dispatch_len": pending_roa_dispatch_len, "inflight_publication_points": inflight_publication_points_len, }), ); } Ok(()) } fn drain_finalize_results( finalize_result_rx: &Receiver, finished: &mut Vec, finalize_inflight: &mut usize, ) -> Result { let started = Instant::now(); let mut metrics = FinalizeResultsDrainMetrics::default(); loop { match finalize_result_rx.try_recv() { Ok(result) => { metrics.results_drained += 1; metrics.reduce_ms_total += result.metrics.reduce_ms; metrics.reduce_ms_max = metrics.reduce_ms_max.max(result.metrics.reduce_ms); metrics.finalize_ms_total += result.metrics.finalize_ms; metrics.finalize_ms_max = metrics.finalize_ms_max.max(result.metrics.finalize_ms); metrics.finalize_queue_wait_ms_max = metrics .finalize_queue_wait_ms_max .max(result.metrics.finalize_queue_wait_ms.unwrap_or(0)); metrics.finalize_worker_ms_total += result.metrics.finalize_worker_ms; metrics.finalize_worker_ms_max = metrics .finalize_worker_ms_max .max(result.metrics.finalize_worker_ms); metrics.snapshot_pack_ms_total += result.metrics.snapshot_pack_ms; metrics.snapshot_pack_ms_max = metrics .snapshot_pack_ms_max .max(result.metrics.snapshot_pack_ms); metrics.persist_vcir_ms_total += result.metrics.persist_vcir_ms; metrics.persist_vcir_ms_max = metrics .persist_vcir_ms_max .max(result.metrics.persist_vcir_ms); metrics.persist_build_vcir_ms_total += result.metrics.persist_build_vcir_ms; metrics.persist_build_vcir_ms_max = metrics .persist_build_vcir_ms_max .max(result.metrics.persist_build_vcir_ms); metrics.persist_replace_vcir_ms_total += result.metrics.persist_replace_vcir_ms; metrics.persist_replace_vcir_ms_max = metrics .persist_replace_vcir_ms_max .max(result.metrics.persist_replace_vcir_ms); metrics.ccr_projection_build_ms_total += result.metrics.ccr_projection_build_ms; metrics.ccr_projection_build_ms_max = metrics .ccr_projection_build_ms_max 
.max(result.metrics.ccr_projection_build_ms); metrics.ccr_append_ms_total += result.metrics.ccr_append_ms; metrics.ccr_append_ms_max = metrics.ccr_append_ms_max.max(result.metrics.ccr_append_ms); metrics.audit_build_ms_total += result.metrics.audit_build_ms; metrics.audit_build_ms_max = metrics .audit_build_ms_max .max(result.metrics.audit_build_ms); *finalize_inflight = finalize_inflight.saturating_sub(1); finished.push(result.finished); } Err(TryRecvError::Empty) => break, Err(TryRecvError::Disconnected) => { if *finalize_inflight == 0 { break; } return Err(TreeRunError::Runner( "phase2 finalize result channel disconnected".to_string(), )); } } } metrics.duration_ms = elapsed_ms(started); Ok(metrics) } fn drain_finalize_results_with_progress( finalize_result_rx: &Receiver, finished: &mut Vec, finalize_inflight: &mut usize, pending_finalization_len: usize, pending_roa_dispatch_len: usize, inflight_publication_points_len: usize, ) -> Result<(), TreeRunError> { let drain_metrics = drain_finalize_results(finalize_result_rx, finished, finalize_inflight)?; if drain_metrics.results_drained >= 64 || (drain_metrics.results_drained > 0 && (*finalize_inflight == 0 || pending_finalization_len > 0)) { crate::progress_log::emit( "phase2_finalize_results_drain", serde_json::json!({ "results_drained": drain_metrics.results_drained, "reduce_ms_total": drain_metrics.reduce_ms_total, "reduce_ms_max": drain_metrics.reduce_ms_max, "finalize_ms_total": drain_metrics.finalize_ms_total, "finalize_ms_max": drain_metrics.finalize_ms_max, "finalize_queue_wait_ms_max": drain_metrics.finalize_queue_wait_ms_max, "finalize_worker_ms_total": drain_metrics.finalize_worker_ms_total, "finalize_worker_ms_max": drain_metrics.finalize_worker_ms_max, "snapshot_pack_ms_total": drain_metrics.snapshot_pack_ms_total, "snapshot_pack_ms_max": drain_metrics.snapshot_pack_ms_max, "persist_vcir_ms_total": drain_metrics.persist_vcir_ms_total, "persist_vcir_ms_max": drain_metrics.persist_vcir_ms_max, 
"persist_build_vcir_ms_total": drain_metrics.persist_build_vcir_ms_total, "persist_build_vcir_ms_max": drain_metrics.persist_build_vcir_ms_max, "persist_replace_vcir_ms_total": drain_metrics.persist_replace_vcir_ms_total, "persist_replace_vcir_ms_max": drain_metrics.persist_replace_vcir_ms_max, "ccr_projection_build_ms_total": drain_metrics.ccr_projection_build_ms_total, "ccr_projection_build_ms_max": drain_metrics.ccr_projection_build_ms_max, "ccr_append_ms_total": drain_metrics.ccr_append_ms_total, "ccr_append_ms_max": drain_metrics.ccr_append_ms_max, "audit_build_ms_total": drain_metrics.audit_build_ms_total, "audit_build_ms_max": drain_metrics.audit_build_ms_max, "duration_ms": drain_metrics.duration_ms, "pending_finalization_len": pending_finalization_len, "finalize_inflight": *finalize_inflight, "pending_roa_dispatch_len": pending_roa_dispatch_len, "inflight_publication_points": inflight_publication_points_len, }), ); } Ok(()) }

/// Body of a finalize worker: pulls tasks from `finalize_task_rx`, finalizes each one with
/// [`finalize_publication_point_state`], and pushes the outcome to `finalize_result_tx`.
///
/// The loop ends when `recv()` returns an error (for a std `mpsc` receiver that means every
/// sender has been dropped and the channel is drained), in which case `Ok(())` is returned.
///
/// # Errors
///
/// Returns `TreeRunError::Runner` if a finished result cannot be sent because the result
/// receiver is gone.
fn run_finalize_worker(
    runner: &Rpkiv1PublicationPointRunner<'_>,
    finalize_task_rx: Receiver,
    finalize_result_tx: mpsc::Sender,
    compact_audit: bool,
) -> Result<(), TreeRunError> {
    while let Ok(task) = finalize_task_rx.recv() {
        let result = finalize_publication_point_state(runner, task.state, compact_audit);
        if finalize_result_tx.send(result).is_err() {
            return Err(TreeRunError::Runner(
                "phase2 finalize result receiver disconnected".to_string(),
            ));
        }
    }
    Ok(())
}

/// Reduces and finalizes one in-flight publication point, returning the finished result
/// together with the timing/count metrics gathered along the way.
///
/// Steps, in order:
/// 1. Take `state` apart and compute how long it waited for finalization.
/// 2. Run `reduce_parallel_roa_stage` over the collected worker `results`.
/// 3. On success, merge the router keys discovered in the fresh stage into the reduced
///    objects, extend the local outputs cache, and call
///    `runner.finalize_fresh_publication_point_from_reducer`.
/// 4. Emit the `phase2_finalize_worker_breakdown` and `phase2_publication_point_reduced`
///    progress events.
///
/// A failure in either the reduce or the finalize step does not abort: it is recorded as
/// `FinishedPublicationPointResult::Err` with the metrics known at that point (all other
/// metric fields take their `Default` values).
///
/// `compact_audit` is forwarded unchanged to `compact_phase2_finished_result`.
fn finalize_publication_point_state(
    runner: &Rpkiv1PublicationPointRunner<'_>,
    state: InflightPublicationPoint,
    compact_audit: bool,
) -> FinalizeWorkerResult {
    let finalize_worker_started = Instant::now();
    let InflightPublicationPoint {
        node,
        fresh_stage,
        objects_stage,
        repo_outcome,
        warnings,
        started_at,
        objects_started_at,
        task_count,
        tasks_submitted,
        first_task_submitted_at,
        last_task_submitted_at,
        first_result_at,
        last_result_at,
        worker_ms_total,
        worker_ms_max,
        queue_wait_ms_total,
        queue_wait_ms_max,
        finalize_enqueued_at,
        results,
    } = state;
    // Queue wait is measured from the finalize-enqueue instant, falling back to the time of
    // the last worker result; it stays `None` when neither timestamp was recorded.
    let finalize_queue_wait_ms =
        finalize_enqueued_at.or(last_result_at).map(|ready_at| {
            Instant::now()
                .saturating_duration_since(ready_at)
                .as_millis() as u64
        });
    // Sampled before the reduce step, so it excludes reduce and finalize time.
    let objects_processing_ms = objects_started_at.elapsed().as_millis() as u64;
    let reduce_started = Instant::now();
    // Must be read here: `objects_stage` is moved into the reducer on the next line.
    let locked_files = objects_stage.locked_file_count();
    let reduce_result = reduce_parallel_roa_stage(objects_stage, results, runner.timing.as_ref());
    let reduce_ms = elapsed_ms(reduce_started);
    let (result, mut metrics) = match reduce_result {
        Ok(mut objects) => {
            let finalize_started = Instant::now();
            objects
                .router_keys
                .extend(fresh_stage.discovered_router_keys.clone());
            objects.local_outputs_cache.extend(
                crate::validation::tree_runner::build_router_key_local_outputs(
                    &node.handle,
                    &objects.router_keys,
                ),
            );
            let finalized = runner.finalize_fresh_publication_point_from_reducer(
                &node.handle,
                &fresh_stage.fresh_point,
                warnings,
                objects,
                fresh_stage.child_audits,
                fresh_stage.discovered_children,
                repo_outcome.repo_sync_source.as_deref(),
                repo_outcome.repo_sync_phase.as_deref(),
                repo_outcome.repo_sync_duration_ms,
                repo_outcome.repo_sync_err.as_deref(),
            );
            let finalize_ms = elapsed_ms(finalize_started);
            match finalized {
                Ok(output) => {
                    // The `0` is a placeholder for the worker duration; the real value is
                    // written into `metrics.finalize_worker_ms` after this match.
                    let metrics = finalize_metrics_from_output(
                        &output,
                        reduce_ms,
                        finalize_ms,
                        finalize_queue_wait_ms,
                        0,
                        locked_files,
                    );
                    (
                        compact_phase2_finished_result(output.result, compact_audit),
                        metrics,
                    )
                }
                // Finalize failed: keep the timings measured so far, default the rest.
                Err(err) => (
                    FinishedPublicationPointResult::Err(err),
                    FinalizePublicationPointMetrics {
                        reduce_ms,
                        finalize_ms,
                        finalize_queue_wait_ms,
                        locked_files,
                        ..FinalizePublicationPointMetrics::default()
                    },
                ),
            }
        }
        // Reduce failed: finalize never ran, so `finalize_ms` is left at its default.
        Err(err) => (
            FinishedPublicationPointResult::Err(err),
            FinalizePublicationPointMetrics {
                reduce_ms,
                finalize_queue_wait_ms,
                locked_files,
                ..FinalizePublicationPointMetrics::default()
            },
        ),
    };
    let finalize_worker_ms = elapsed_ms(finalize_worker_started);
    metrics.finalize_worker_ms = finalize_worker_ms;
    emit_finalize_breakdown(
        "phase2_finalize_worker_breakdown",
        node.handle.manifest_rsync_uri.as_str(),
        node.handle.publication_point_rsync_uri.as_str(),
        &metrics,
    );
    crate::progress_log::emit(
        "phase2_publication_point_reduced",
        serde_json::json!({
            "manifest_rsync_uri": node.handle.manifest_rsync_uri.as_str(),
            "publication_point_rsync_uri": node.handle.publication_point_rsync_uri.as_str(),
            "objects_processing_ms": objects_processing_ms,
            "task_count": task_count,
            "tasks_submitted": tasks_submitted,
            // All `*_ms` offsets below are relative to `objects_started_at`.
            "first_task_submitted_ms": first_task_submitted_at.map(|t| t.saturating_duration_since(objects_started_at).as_millis() as u64),
            "last_task_submitted_ms": last_task_submitted_at.map(|t| t.saturating_duration_since(objects_started_at).as_millis() as u64),
            "task_submit_span_ms": match (first_task_submitted_at, last_task_submitted_at) {
                (Some(first), Some(last)) => Some(last.saturating_duration_since(first).as_millis() as u64),
                _ => None,
            },
            "first_result_ms": first_result_at.map(|t| t.saturating_duration_since(objects_started_at).as_millis() as u64),
            "last_result_ms": last_result_at.map(|t| t.saturating_duration_since(objects_started_at).as_millis() as u64),
            // Same expression as "last_result_ms"; both keys are emitted.
            "all_results_ready_ms": last_result_at.map(|t| t.saturating_duration_since(objects_started_at).as_millis() as u64),
            "finalize_queue_wait_ms": finalize_queue_wait_ms,
            "result_span_ms": match (first_result_at, last_result_at) {
                (Some(first), Some(last)) => Some(last.saturating_duration_since(first).as_millis() as u64),
                _ => None,
            },
            "worker_ms_total": worker_ms_total,
            "worker_ms_max": worker_ms_max,
            // Averages divide by `task_count` (not `tasks_submitted`) and report 0 when
            // there were no tasks.
            "worker_ms_avg": if task_count > 0 { worker_ms_total / task_count as u64 } else { 0 },
            "queue_wait_ms_total": queue_wait_ms_total,
            "queue_wait_ms_max": queue_wait_ms_max,
            "queue_wait_ms_avg": if task_count > 0 { queue_wait_ms_total / task_count as u64 } else { 0 },
            "reduce_ms": reduce_ms,
            "finalize_ms": metrics.finalize_ms,
            "finalize_worker_ms": finalize_worker_ms,
            "snapshot_pack_ms": metrics.snapshot_pack_ms,
            "persist_vcir_ms": metrics.persist_vcir_ms,
            "persist_build_vcir_ms": metrics.persist_build_vcir_ms,
            "persist_replace_vcir_ms": metrics.persist_replace_vcir_ms,
            "ccr_projection_build_ms": metrics.ccr_projection_build_ms,
            "ccr_append_ms": metrics.ccr_append_ms,
            "audit_build_ms": metrics.audit_build_ms,
            "locked_files": metrics.locked_files,
            "child_count": metrics.child_count,
            "warning_count": metrics.warning_count,
            "vrp_count": metrics.vrp_count,
            "vap_count": metrics.vap_count,
            "router_key_count": metrics.router_key_count,
            "audit_object_count": metrics.audit_object_count,
            // Measured from `started_at`, the start instant recorded in the in-flight state.
            "total_duration_ms": started_at.elapsed().as_millis() as u64,
        }),
    );
    FinalizeWorkerResult {
        finished: FinishedPublicationPoint {
            node: FinishedPublicationPointNode::from_queued(node),
            result,
        },
        metrics,
    }
}

/// Waits up to `timeout` for a single repo-sync event and moves every CA instance that was
/// waiting on one of its completed identities onto `ready_queue`.
///
/// At most one event is consumed per call. For each completion in that event the waiters
/// registered under `completion.identity` are removed from `ca_waiting_repo_by_identity`
/// and enqueued with a clone of the completion's outcome.
///
/// Returns the counters collected during the call (events, completions, instances made
/// ready) plus the elapsed time.
///
/// # Errors
///
/// An error from `recv_repo_result_timeout` is wrapped in `TreeRunError::Runner`.
fn drain_repo_events(
    repo_runtime: &dyn crate::parallel::repo_runtime::RepoSyncRuntime,
    ca_waiting_repo_by_identity: &mut HashMap>,
    ready_queue: &mut VecDeque,
    timeout: Duration,
) -> Result {
    let started = Instant::now();
    let mut metrics = RepoDrainMetrics::default();
    if let Some(event) = repo_runtime
        .recv_repo_result_timeout(timeout)
        .map_err(TreeRunError::Runner)?
    {
        metrics.event_count += 1;
        metrics.completions += event.completions.len();
        for completion in event.completions {
            let mut outcome = completion.outcome;
            if completion.identity != event.transport_identity {
                // Shared RRDP/rsync transports release many publication points, but the transport
                // wall time should only be counted once in per-PP stage timing aggregation.
                outcome.repo_sync_duration_ms = 0;
            }
            if let Some(waiters) = ca_waiting_repo_by_identity.remove(&completion.identity) {
                metrics.ready_enqueued += waiters.len();
                for node in waiters {
                    ready_queue.push_back(ReadyCaInstance {
                        node,
                        repo_outcome: outcome.clone(),
                        ready_enqueued_at: Instant::now(),
                    });
                }
            }
        }
    }
    metrics.duration_ms = elapsed_ms(started);
    Ok(metrics)
}

/// Picks how long the scheduler should block waiting for the next event.
///
/// * `0 ms` when there is local work: a non-empty ready queue, pending ROA dispatch,
///   in-flight publication points, pending finalization, or a queued CA that
///   `can_start_more` still allows to start.
/// * `10 ms` when the only outstanding work is `finalize_inflight > 0`.
/// * `50 ms` otherwise.
fn event_poll_timeout(
    ca_queue: &VecDeque,
    ready_queue: &VecDeque,
    pending_roa_dispatch: &VecDeque,
    inflight_publication_points: &HashMap,
    pending_finalization: &VecDeque,
    finalize_inflight: usize,
    instances_started: usize,
    config: &TreeRunConfig,
) -> Duration {
    if !ready_queue.is_empty()
        || !pending_roa_dispatch.is_empty()
        || !inflight_publication_points.is_empty()
        || !pending_finalization.is_empty()
        || (!ca_queue.is_empty() && can_start_more(instances_started, config))
    {
        Duration::from_millis(0)
    } else if finalize_inflight > 0 {
        Duration::from_millis(10)
    } else {
        Duration::from_millis(50)
    }
}

/// Reports whether the run has nothing left to do.
///
/// The CA queue counts as done when it is empty *or* when `can_start_more` says no further
/// instances may be started. Every other queue/map must be empty and `finalize_inflight`
/// must be zero.
fn is_complete(
    ca_queue: &VecDeque,
    ready_queue: &VecDeque,
    ca_waiting_repo_by_identity: &HashMap>,
    pending_roa_dispatch: &VecDeque,
    inflight_publication_points: &HashMap,
    pending_finalization: &VecDeque,
    finalize_inflight: usize,
    instances_started: usize,
    config: &TreeRunConfig,
) -> bool {
    let ca_queue_done = ca_queue.is_empty() || !can_start_more(instances_started, config);
    ca_queue_done
        && ready_queue.is_empty()
        && ca_waiting_repo_by_identity.is_empty()
        && pending_roa_dispatch.is_empty()
        && inflight_publication_points.is_empty()
        && pending_finalization.is_empty()
        && finalize_inflight == 0
}

/// Folds all finished publication points into the final `TreeRunAuditOutput`.
///
/// Items are first sorted by node id, so the aggregated vectors follow node-id order
/// regardless of the order in which the items were collected.
///
/// * `Ok` items count as processed; their warnings, VRPs, ASPAs, router keys and ROA cache
///   stats are accumulated, and their audit record is stamped with the node id, parent id
///   and discovery info before being fed to the CIR accumulator and stored.
/// * `Err` items count as failed and contribute a single warning whose context is the
///   manifest rsync URI.
///
/// # Panics
///
/// Panics if `submit_publication_point_cir_input` returns an error.
fn build_tree_output(mut finished: Vec) -> TreeRunAuditOutput {
    finished.sort_by_key(|item| item.node.id);
    let mut instances_processed = 0usize;
    let mut instances_failed = 0usize;
    let mut warnings = Vec::new();
    let mut vrps = Vec::new();
    let mut aspas = Vec::new();
    let mut router_keys = Vec::new();
    let mut publication_points = Vec::new();
    let mut roa_cache_stats =
        crate::validation::objects::RoaValidationCacheStats::default();
    let mut cir_input = CirInputAccumulator::default();
    for item in finished {
        match item.result {
            FinishedPublicationPointResult::Ok {
                source,
                warnings: result_warnings,
                objects,
                audit,
                cir_fresh_objects,
                cir_cached_objects,
            } => {
                instances_processed += 1;
                // Publication-point warnings first, then the object-level ones.
                warnings.extend(result_warnings);
                warnings.extend(objects.warnings);
                roa_cache_stats.add_assign(&objects.roa_cache_stats);
                vrps.extend(objects.vrps);
                aspas.extend(objects.aspas);
                router_keys.extend(objects.router_keys);
                // Attach the tree position to the audit record before it is consumed.
                let mut audit: PublicationPointAudit = audit;
                audit.node_id = Some(item.node.id);
                audit.parent_node_id = item.node.parent_id;
                audit.discovered_from = item.node.discovered_from;
                crate::validation::tree::submit_publication_point_cir_input(
                    &mut cir_input,
                    source,
                    &audit,
                    &cir_fresh_objects,
                    &cir_cached_objects,
                )
                .expect("CIR input collection from validated audit must not fail");
                publication_points.push(audit);
            }
            FinishedPublicationPointResult::Err(err) => {
                instances_failed += 1;
                warnings.push(
                    Warning::new(format!("publication point failed: {err}"))
                        .with_context(&item.node.manifest_rsync_uri),
                );
            }
        }
    }
    TreeRunAuditOutput {
        tree: TreeRunOutput {
            instances_processed,
            instances_failed,
            warnings,
            vrps,
            aspas,
            router_keys,
        },
        publication_points,
        roa_cache_stats,
        cir_input: cir_input.finalize(),
    }
}

/// Single-root entry point: wraps `root` in a one-element vector and delegates to
/// `run_tree_parallel_phase2_audit_multi_root` with the same `runner` and `config`.
pub fn run_tree_parallel_phase2_audit(
    root: CaInstanceHandle,
    runner: &Rpkiv1PublicationPointRunner<'_>,
    config: &TreeRunConfig,
) -> Result {
    run_tree_parallel_phase2_audit_multi_root(vec![root], runner, config)
}

// Unit tests for the result-compaction helpers and for `finalize_metrics_from_output`.
#[cfg(test)]
mod tests {
    use super::{
        FinishedPublicationPointResult, compact_phase2_finished_result,
        compact_phase2_finished_result_result, finalize_metrics_from_output,
    };
    use crate::audit::{
        AuditObjectKind, AuditObjectResult, ObjectAuditEntry, PublicationPointAudit,
    };
    use crate::storage::PackTime;
    use crate::validation::manifest::PublicationPointSource;
    use crate::validation::objects::{ObjectsOutput, ObjectsStats};
    use crate::validation::publication_point::PublicationPointSnapshot;
    use crate::validation::tree::PublicationPointRunResult;
    use crate::validation::tree_runner::{
        BuildVcirTimingBreakdown, FreshPublicationPointFinalizeOutput, PersistVcirTimingBreakdown,
    };

    /// Minimal snapshot fixture: fixed example URIs and timestamps, three manifest bytes,
    /// and no files.
    fn sample_snapshot() -> PublicationPointSnapshot {
        PublicationPointSnapshot {
            format_version: PublicationPointSnapshot::FORMAT_VERSION_V1,
            manifest_rsync_uri: "rsync://example.test/repo/example.mft".to_string(),
            publication_point_rsync_uri: "rsync://example.test/repo/".to_string(),
            manifest_number_be: vec![1],
            this_update: PackTime {
                rfc3339_utc: "2026-04-21T00:00:00Z".to_string(),
            },
            next_update: PackTime {
                rfc3339_utc: "2026-04-22T00:00:00Z".to_string(),
            },
            verified_at: PackTime {
                rfc3339_utc: "2026-04-21T00:00:01Z".to_string(),
            },
            manifest_bytes: vec![1, 2, 3],
            files: Vec::new(),
        }
    }

    /// Fresh run-result fixture carrying `sample_snapshot()` and otherwise empty/default
    /// collections; individual tests push the entries they need.
    fn sample_result() -> PublicationPointRunResult {
        PublicationPointRunResult {
            source: PublicationPointSource::Fresh,
            snapshot: Some(sample_snapshot()),
            warnings: Vec::new(),
            objects: ObjectsOutput {
                vrps: Vec::new(),
                aspas: Vec::new(),
                router_keys: Vec::new(),
                local_outputs_cache: Vec::new(),
                warnings: Vec::new(),
                stats: ObjectsStats::default(),
                audit: Vec::new(),
                roa_cache_stats: crate::validation::objects::RoaValidationCacheStats::default(),
            },
            audit: PublicationPointAudit::default(),
            cir_fresh_objects: Vec::new(),
            cir_cached_objects: Vec::new(),
            discovered_children: Vec::new(),
        }
    }

    /// Without audit compaction, a successful run result maps to the `Ok` variant with its
    /// (empty) warnings intact. The `Ok` variant has no snapshot field, per its definition.
    #[test]
    fn compact_phase2_finished_result_drops_snapshot() {
        let result = compact_phase2_finished_result(sample_result(), false);
        match result {
            FinishedPublicationPointResult::Ok { warnings, .. } => {
                assert!(warnings.is_empty());
            }
            FinishedPublicationPointResult::Err(err) => panic!("unexpected error: {err}"),
        }
    }

    /// With audit compaction enabled, the audit objects, audit warnings and per-object
    /// audit entries must all come back empty.
    #[test]
    fn compact_phase2_finished_result_can_drop_audit_payload() {
        let mut sample = sample_result();
        sample.audit.objects.push(crate::audit::ObjectAuditEntry {
            rsync_uri: "rsync://example.test/repo/a.roa".to_string(),
            sha256_hex: "11".repeat(32),
            kind: crate::audit::AuditObjectKind::Roa,
            result: crate::audit::AuditObjectResult::Ok,
            detail: None,
        });
        sample.audit.warnings.push(crate::audit::AuditWarning {
            message: "warning".to_string(),
            rfc_refs: Vec::new(),
            context: None,
        });
        sample.objects.audit.push(crate::audit::ObjectAuditEntry {
            rsync_uri: "rsync://example.test/repo/b.roa".to_string(),
            sha256_hex: "22".repeat(32),
            kind: crate::audit::AuditObjectKind::Roa,
            result: crate::audit::AuditObjectResult::Ok,
            detail: None,
        });
        let result = compact_phase2_finished_result(sample, true);
        match result {
            FinishedPublicationPointResult::Ok {
                objects,
                audit,
                cir_fresh_objects,
                ..
            } => {
                assert!(audit.objects.is_empty());
                assert!(audit.warnings.is_empty());
                assert!(objects.audit.is_empty());
                // NOTE(review): the fixture starts with an empty `cir_fresh_objects`, so this
                // presumably relies on compaction deriving one entry from the audit data
                // before it is dropped. Confirm against `compact_phase2_finished_result`.
                assert_eq!(cir_fresh_objects.len(), 1);
            }
            FinishedPublicationPointResult::Err(err) => panic!("unexpected error: {err}"),
        }
    }

    /// An `Err` input must pass through with its message unchanged.
    #[test]
    fn compact_phase2_finished_result_result_preserves_err() {
        match compact_phase2_finished_result_result(Err("boom".to_string()), false) {
            FinishedPublicationPointResult::Err(err) => assert_eq!(err, "boom"),
            FinishedPublicationPointResult::Ok { .. } => panic!("error should be preserved"),
        }
    }

    /// Every timing field and every count of the finalize output must land in the matching
    /// metrics field. Distinct values (1..=13) are used so a swapped field shows up as a
    /// failed assertion.
    #[test]
    fn finalize_metrics_from_output_captures_breakdown_and_counts() {
        let mut result = sample_result();
        // One VRP, one ASPA, one audit object and one discovered child: each count below
        // is expected to be exactly 1.
        result.objects.vrps.push(crate::validation::objects::Vrp {
            asn: 64496,
            prefix: crate::data_model::roa::IpPrefix {
                afi: crate::data_model::roa::RoaAfi::Ipv4,
                addr: [192, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                prefix_len: 24,
            },
            max_length: 24,
        });
        result
            .objects
            .aspas
            .push(crate::validation::objects::AspaAttestation {
                customer_as_id: 64497,
                provider_as_ids: vec![64498],
            });
        result.audit.objects.push(ObjectAuditEntry {
            rsync_uri: "rsync://example.test/repo/a.roa".to_string(),
            sha256_hex: "11".repeat(32),
            kind: AuditObjectKind::Roa,
            result: AuditObjectResult::Ok,
            detail: None,
        });
        result
            .discovered_children
            .push(crate::validation::tree::DiscoveredChildCaInstance {
                handle: crate::validation::tree::CaInstanceHandle {
                    tal_id: "test".to_string(),
                    ca_certificate_der: vec![1],
                    ca_certificate_rsync_uri: Some(
                        "rsync://example.test/repo/child.cer".to_string(),
                    ),
                    effective_ip_resources: None,
                    effective_as_resources: None,
                    manifest_rsync_uri: "rsync://example.test/repo/child.mft".to_string(),
                    publication_point_rsync_uri: "rsync://example.test/repo/".to_string(),
                    rsync_base_uri: "rsync://example.test/repo/".to_string(),
                    rrdp_notification_uri: None,
                    parent_manifest_rsync_uri: None,
                    depth: 1,
                },
                discovered_from: crate::audit::DiscoveredFrom {
                    parent_manifest_rsync_uri: "rsync://example.test/repo/example.mft".to_string(),
                    child_ca_certificate_rsync_uri: "rsync://example.test/repo/child.cer"
                        .to_string(),
                    child_ca_certificate_sha256_hex: "55".repeat(32),
                },
            });
        let output = FreshPublicationPointFinalizeOutput {
            result,
            snapshot_pack_ms: 1,
            persist_vcir_ms: 2,
            persist_vcir_timing: PersistVcirTimingBreakdown {
                build_vcir_ms: 3,
                replace_vcir_ms: 4,
                build_vcir: BuildVcirTimingBreakdown {
                    related_artifacts_ms: 5,
                    ..BuildVcirTimingBreakdown::default()
                },
                ..PersistVcirTimingBreakdown::default()
            },
            ccr_projection_build_ms: 6,
            ccr_append_ms: 7,
            audit_build_ms: 8,
        };
        // Positional arguments after `&output`, as pinned by the assertions below:
        // reduce_ms, finalize_ms, finalize_queue_wait_ms, finalize_worker_ms, locked_files.
        let metrics = finalize_metrics_from_output(&output, 9, 10, Some(11), 12, 13);
        assert_eq!(metrics.snapshot_pack_ms, 1);
        assert_eq!(metrics.persist_vcir_ms, 2);
        assert_eq!(metrics.persist_build_vcir_ms, 3);
        assert_eq!(metrics.persist_replace_vcir_ms, 4);
        // The replace breakdown was left at its default in the fixture above.
        assert_eq!(
            metrics.persist_replace_breakdown,
            crate::storage::VcirReplaceTimingBreakdown::default()
        );
        assert_eq!(metrics.ccr_projection_build_ms, 6);
        assert_eq!(metrics.ccr_append_ms, 7);
        assert_eq!(metrics.audit_build_ms, 8);
        assert_eq!(metrics.reduce_ms, 9);
        assert_eq!(metrics.finalize_ms, 10);
        assert_eq!(metrics.finalize_queue_wait_ms, Some(11));
        assert_eq!(metrics.finalize_worker_ms, 12);
        assert_eq!(metrics.locked_files, 13);
        assert_eq!(metrics.child_count, 1);
        assert_eq!(metrics.vrp_count, 1);
        assert_eq!(metrics.vap_count, 1);
        assert_eq!(metrics.audit_object_count, 1);
    }
}