20260427_3 增强db_stats归因work-db体积

This commit is contained in:
yuyr 2026-04-27 18:00:34 +08:00
parent 87275b5c57
commit 26aec5ff35

View File

@ -1,5 +1,6 @@
use std::collections::BTreeMap;
use std::path::PathBuf;
use std::fs;
use std::path::{Path, PathBuf};
use rocksdb::{DB, IteratorMode, Options};
use rpki::storage::{
@ -21,6 +22,43 @@ enum CfGroup {
LegacyCompatibility,
}
/// Per-column-family statistics gathered by `db_stats`.
///
/// The logical fields (`keys`, `key_bytes`, `value_bytes`) are filled by
/// iterating the CF in `--exact` mode; in estimate mode only `keys` is set
/// (from RocksDB's estimate) and the byte fields stay 0. The `metadata_*`
/// fields come from RocksDB column-family metadata, and the `live_sst_*`
/// fields are aggregated from `DB::live_files()`.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
struct CfStats {
// Number of keys: exact count (iteration) or RocksDB estimate.
keys: u64,
// Sum of key lengths in bytes (populated in exact mode only).
key_bytes: u64,
// Sum of value lengths in bytes (populated in exact mode only).
value_bytes: u64,
// Size in bytes reported by RocksDB CF metadata.
metadata_size_bytes: u64,
// File count reported by RocksDB CF metadata.
metadata_file_count: u64,
// Total size of live SST files attributed to this CF.
live_sst_size_bytes: u64,
// Number of live SST files attributed to this CF.
live_sst_files: u64,
}
impl CfStats {
// Average key length in bytes; yields 0.0 when `keys` is 0 (see `avg_bytes`).
fn avg_key_bytes(&self) -> f64 {
avg_bytes(self.key_bytes, self.keys)
}
// Average value length in bytes; yields 0.0 when `keys` is 0.
fn avg_value_bytes(&self) -> f64 {
avg_bytes(self.value_bytes, self.keys)
}
}
/// Size/count totals for the files in the DB directory, split by RocksDB
/// file naming convention (see `collect_db_file_stats`): `.sst`, `.blob`,
/// logs (`LOG`, `LOG.*`, `*.log`), `MANIFEST-*`, and everything else.
/// `total_*` always covers all regular files regardless of bucket.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
struct DbFileStats {
// All regular files in the directory.
total_size_bytes: u64,
total_count: u64,
// Files ending in ".sst".
sst_size_bytes: u64,
sst_count: u64,
// Files ending in ".blob".
blob_size_bytes: u64,
blob_count: u64,
// Info logs and WAL files combined ("LOG", "LOG.*", "*.log").
log_size_bytes: u64,
log_count: u64,
// Files starting with "MANIFEST-".
manifest_size_bytes: u64,
manifest_count: u64,
// Anything not matched above (e.g. CURRENT, OPTIONS, IDENTITY).
other_size_bytes: u64,
other_count: u64,
}
impl CfGroup {
fn as_str(self) -> &'static str {
match self {
@ -32,6 +70,14 @@ impl CfGroup {
}
}
/// Mean of `bytes` over `keys`, returning 0.0 for an empty set instead of
/// dividing by zero.
fn avg_bytes(bytes: u64, keys: u64) -> f64 {
    match keys {
        0 => 0.0,
        n => bytes as f64 / n as f64,
    }
}
fn usage() -> String {
let bin = "db_stats";
format!(
@ -41,9 +87,15 @@ Usage:
Options:
--db <path> RocksDB directory
--exact Iterate to count keys (slower; default uses RocksDB estimates)
--exact Iterate to count keys and logical bytes (slower; default uses RocksDB estimates)
--help Show this help
Output:
- legacy fields: <cf>=<keys>, group_<name>=<keys>, sst_files=<n>
- cf.<name>.*: key/value bytes, RocksDB metadata, live SST size
- group.<name>.*: grouped key/value bytes and physical SST metadata
- db.files.*: DB directory file totals split by .sst/.blob/.log/MANIFEST/other
Output groups:
- current_repository_view: repository_view + raw_by_hash
- current_validation_state: vcir + audit_rule_index
@ -59,17 +111,72 @@ fn estimate_keys(db: &DB, cf_name: &str) -> Result<Option<u64>, Box<dyn std::err
Ok(db.property_int_value_cf(cf, "rocksdb.estimate-num-keys")?)
}
fn exact_keys(db: &DB, cf_name: &str) -> Result<u64, Box<dyn std::error::Error>> {
fn exact_logical_stats(db: &DB, cf_name: &str) -> Result<CfStats, Box<dyn std::error::Error>> {
let cf = db
.cf_handle(cf_name)
.ok_or_else(|| format!("missing column family: {cf_name}"))?;
let mode = IteratorMode::Start;
let mut count = 0u64;
let mut stats = CfStats::default();
for res in db.iterator_cf(cf, mode) {
res?;
count += 1;
let (key, value) = res?;
stats.keys = stats.keys.saturating_add(1);
stats.key_bytes = stats.key_bytes.saturating_add(key.len() as u64);
stats.value_bytes = stats.value_bytes.saturating_add(value.len() as u64);
}
Ok(count)
Ok(stats)
}
/// Returns `(size_bytes, file_count)` from RocksDB's per-CF metadata, or an
/// error if the column family does not exist.
fn cf_metadata_stats(db: &DB, cf_name: &str) -> Result<(u64, u64), Box<dyn std::error::Error>> {
    let handle = db
        .cf_handle(cf_name)
        .ok_or_else(|| format!("missing column family: {cf_name}"))?;
    let meta = db.get_column_family_metadata_cf(handle);
    Ok((meta.size, meta.file_count as u64))
}
/// Aggregates live SST files per column-family name as
/// `(total_size_bytes, file_count)` pairs, keyed by CF name.
fn live_sst_stats(db: &DB) -> Result<BTreeMap<String, (u64, u64)>, Box<dyn std::error::Error>> {
    let mut per_cf: BTreeMap<String, (u64, u64)> = BTreeMap::new();
    for live in db.live_files()? {
        let slot = per_cf.entry(live.column_family_name).or_default();
        slot.0 = slot.0.saturating_add(live.size as u64);
        slot.1 = slot.1.saturating_add(1);
    }
    Ok(per_cf)
}
/// Walks the top level of `db_path` and buckets every regular file by
/// RocksDB naming convention. Subdirectories are skipped entirely; sizes
/// come from file metadata. All additions saturate.
fn collect_db_file_stats(db_path: &Path) -> Result<DbFileStats, Box<dyn std::error::Error>> {
    let mut out = DbFileStats::default();
    for dirent in fs::read_dir(db_path)? {
        let dirent = dirent?;
        // Only regular files are counted; nested directories are ignored.
        if !dirent.file_type()?.is_file() {
            continue;
        }
        let len = dirent.metadata()?.len();
        let file_name = dirent.file_name();
        let file_name = file_name.to_string_lossy();
        // Every file feeds the grand total, whatever bucket it lands in.
        out.total_size_bytes = out.total_size_bytes.saturating_add(len);
        out.total_count = out.total_count.saturating_add(1);
        // Pick the (size, count) slot for this file's bucket, then bump it.
        let (size_slot, count_slot) = if file_name.ends_with(".sst") {
            (&mut out.sst_size_bytes, &mut out.sst_count)
        } else if file_name.ends_with(".blob") {
            (&mut out.blob_size_bytes, &mut out.blob_count)
        } else if file_name == "LOG"
            || file_name.starts_with("LOG.")
            || file_name.ends_with(".log")
        {
            // Info logs (LOG, LOG.old.*) and WAL files (*.log) share a bucket.
            (&mut out.log_size_bytes, &mut out.log_count)
        } else if file_name.starts_with("MANIFEST-") {
            (&mut out.manifest_size_bytes, &mut out.manifest_count)
        } else {
            (&mut out.other_size_bytes, &mut out.other_count)
        };
        *size_slot = size_slot.saturating_add(len);
        *count_slot = count_slot.saturating_add(1);
    }
    Ok(out)
}
fn cf_group(cf_name: &str) -> CfGroup {
@ -91,6 +198,31 @@ fn summarize_counts<'a>(
grouped
}
/// Folds per-CF stats into per-group totals (grouping via `cf_group`),
/// summing every field with saturating addition.
fn summarize_cf_stats<'a>(
    stats: impl IntoIterator<Item = (&'a str, &'a CfStats)>,
) -> BTreeMap<CfGroup, CfStats> {
    let mut by_group: BTreeMap<CfGroup, CfStats> = BTreeMap::new();
    for (cf_name, s) in stats {
        let acc = by_group.entry(cf_group(cf_name)).or_default();
        acc.keys = acc.keys.saturating_add(s.keys);
        acc.key_bytes = acc.key_bytes.saturating_add(s.key_bytes);
        acc.value_bytes = acc.value_bytes.saturating_add(s.value_bytes);
        acc.metadata_size_bytes = acc.metadata_size_bytes.saturating_add(s.metadata_size_bytes);
        acc.metadata_file_count = acc.metadata_file_count.saturating_add(s.metadata_file_count);
        acc.live_sst_size_bytes = acc.live_sst_size_bytes.saturating_add(s.live_sst_size_bytes);
        acc.live_sst_files = acc.live_sst_files.saturating_add(s.live_sst_files);
    }
    by_group
}
fn mode_label(mode: DbStatsMode) -> &'static str {
match mode {
DbStatsMode::Estimate => "estimate",
@ -131,26 +263,100 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
println!("db={}", db_path.display());
println!("mode={}", mode_label(mode));
println!(
"logical_bytes_available={}",
matches!(mode, DbStatsMode::Exact)
);
let live_sst = live_sst_stats(&db)?;
let mut per_cf = Vec::with_capacity(ALL_COLUMN_FAMILY_NAMES.len());
let mut total: u64 = 0;
for &name in ALL_COLUMN_FAMILY_NAMES {
let n = match mode {
DbStatsMode::Exact => exact_keys(&db, name)?,
DbStatsMode::Estimate => estimate_keys(&db, name)?.unwrap_or(0),
let mut stats = match mode {
DbStatsMode::Exact => exact_logical_stats(&db, name)?,
DbStatsMode::Estimate => CfStats {
keys: estimate_keys(&db, name)?.unwrap_or(0),
..CfStats::default()
},
};
total = total.saturating_add(n);
per_cf.push((name, n));
println!("{name}={n}");
let (metadata_size_bytes, metadata_file_count) = cf_metadata_stats(&db, name)?;
let (live_sst_size_bytes, live_sst_files) = live_sst.get(name).copied().unwrap_or((0, 0));
stats.metadata_size_bytes = metadata_size_bytes;
stats.metadata_file_count = metadata_file_count;
stats.live_sst_size_bytes = live_sst_size_bytes;
stats.live_sst_files = live_sst_files;
total = total.saturating_add(stats.keys);
println!("{name}={}", stats.keys);
per_cf.push((name, stats));
}
println!("total={total}");
for (group, count) in summarize_counts(per_cf.iter().copied()) {
for (group, count) in summarize_counts(per_cf.iter().map(|(name, stats)| (*name, stats.keys))) {
println!("group_{}={count}", group.as_str());
}
let live = db.live_files()?;
println!("sst_files={}", live.len());
for (name, stats) in &per_cf {
println!("cf.{name}.keys={}", stats.keys);
println!("cf.{name}.key_bytes={}", stats.key_bytes);
println!("cf.{name}.value_bytes={}", stats.value_bytes);
println!("cf.{name}.avg_key_bytes={:.2}", stats.avg_key_bytes());
println!("cf.{name}.avg_value_bytes={:.2}", stats.avg_value_bytes());
println!(
"cf.{name}.metadata_size_bytes={}",
stats.metadata_size_bytes
);
println!(
"cf.{name}.metadata_file_count={}",
stats.metadata_file_count
);
println!(
"cf.{name}.live_sst_size_bytes={}",
stats.live_sst_size_bytes
);
println!("cf.{name}.live_sst_files={}", stats.live_sst_files);
}
for (group, stats) in summarize_cf_stats(per_cf.iter().map(|(name, stats)| (*name, stats))) {
let group = group.as_str();
println!("group.{group}.keys={}", stats.keys);
println!("group.{group}.key_bytes={}", stats.key_bytes);
println!("group.{group}.value_bytes={}", stats.value_bytes);
println!(
"group.{group}.metadata_size_bytes={}",
stats.metadata_size_bytes
);
println!(
"group.{group}.metadata_file_count={}",
stats.metadata_file_count
);
println!(
"group.{group}.live_sst_size_bytes={}",
stats.live_sst_size_bytes
);
println!("group.{group}.live_sst_files={}", stats.live_sst_files);
}
let live_sst_file_count = live_sst.values().map(|(_, count)| *count).sum::<u64>();
println!("sst_files={live_sst_file_count}");
let file_stats = collect_db_file_stats(&db_path)?;
println!("db.files.total_size_bytes={}", file_stats.total_size_bytes);
println!("db.files.total_count={}", file_stats.total_count);
println!("db.files.sst_size_bytes={}", file_stats.sst_size_bytes);
println!("db.files.sst_count={}", file_stats.sst_count);
println!("db.files.blob_size_bytes={}", file_stats.blob_size_bytes);
println!("db.files.blob_count={}", file_stats.blob_count);
println!("db.files.log_size_bytes={}", file_stats.log_size_bytes);
println!("db.files.log_count={}", file_stats.log_count);
println!(
"db.files.manifest_size_bytes={}",
file_stats.manifest_size_bytes
);
println!("db.files.manifest_count={}", file_stats.manifest_count);
println!("db.files.other_size_bytes={}", file_stats.other_size_bytes);
println!("db.files.other_count={}", file_stats.other_count);
Ok(())
}
@ -189,11 +395,90 @@ mod tests {
assert_eq!(grouped.get(&CfGroup::LegacyCompatibility), None);
}
#[test]
fn summarize_cf_stats_accumulates_bytes_and_physical_stats_by_group() {
    // Shorthand constructor; argument order mirrors the field order:
    // (keys, key_bytes, value_bytes, metadata_size_bytes,
    //  metadata_file_count, live_sst_size_bytes, live_sst_files).
    let mk = |keys,
              key_bytes,
              value_bytes,
              metadata_size_bytes,
              metadata_file_count,
              live_sst_size_bytes,
              live_sst_files| CfStats {
        keys,
        key_bytes,
        value_bytes,
        metadata_size_bytes,
        metadata_file_count,
        live_sst_size_bytes,
        live_sst_files,
    };
    let repo = mk(2, 20, 200, 50, 1, 40, 1);
    let vcir = mk(3, 30, 300, 60, 2, 55, 2);
    let audit = mk(5, 50, 500, 70, 3, 65, 3);
    let grouped = summarize_cf_stats([
        (CF_REPOSITORY_VIEW, &repo),
        (CF_VCIR, &vcir),
        (CF_AUDIT_RULE_INDEX, &audit),
    ]);
    // repository_view is alone in its group, so the totals equal its own stats.
    assert_eq!(grouped.get(&CfGroup::CurrentRepositoryView), Some(&repo));
    // vcir + audit_rule_index share current_validation_state; every field sums.
    assert_eq!(
        grouped.get(&CfGroup::CurrentValidationState),
        Some(&mk(8, 80, 800, 130, 5, 120, 5))
    );
}
#[test]
fn collect_db_file_stats_splits_rocksdb_file_types() {
    let dir = tempfile::tempdir().expect("tempdir");
    let root = dir.path();
    // One file per bucket, each with a distinct size so sums are traceable.
    std::fs::write(root.join("000001.sst"), [0u8; 10]).expect("sst");
    std::fs::write(root.join("000002.blob"), [0u8; 20]).expect("blob");
    std::fs::write(root.join("000003.log"), [0u8; 30]).expect("wal");
    std::fs::write(root.join("LOG.old.1"), [0u8; 40]).expect("log");
    std::fs::write(root.join("MANIFEST-000004"), [0u8; 50]).expect("manifest");
    std::fs::write(root.join("CURRENT"), [0u8; 60]).expect("other");
    // Directories must not be counted at all.
    std::fs::create_dir(root.join("subdir")).expect("subdir");
    let stats = collect_db_file_stats(root).expect("stats");
    assert_eq!((stats.sst_size_bytes, stats.sst_count), (10, 1));
    assert_eq!((stats.blob_size_bytes, stats.blob_count), (20, 1));
    // WAL (*.log) and rotated info logs (LOG.*) share the log bucket: 30 + 40.
    assert_eq!((stats.log_size_bytes, stats.log_count), (70, 2));
    assert_eq!((stats.manifest_size_bytes, stats.manifest_count), (50, 1));
    assert_eq!((stats.other_size_bytes, stats.other_count), (60, 1));
    assert_eq!((stats.total_size_bytes, stats.total_count), (210, 6));
}
#[test]
fn usage_mentions_grouped_output_and_exact_mode() {
    let text = usage();
    // Every marker the help text must advertise.
    for needle in [
        "--exact",
        "current_validation_state",
        "current_rrdp_state",
        "db.files.*",
    ] {
        assert!(text.contains(needle), "{text}");
    }
}
}