dotfiles/agent/src/health.rs

158 lines
4.2 KiB
Rust
Raw Normal View History

2024-01-25 15:16:27 +00:00
//! System health information and checking
2024-01-29 10:44:00 +00:00
use std::{sync::Arc, time::Duration};
2024-01-25 15:16:27 +00:00
2024-01-29 10:44:00 +00:00
use serde::{Deserialize, Serialize};
2024-01-25 15:16:27 +00:00
use tokio::sync::watch;
2024-01-29 10:44:00 +00:00
use crate::messaging::{Client, Message};
2024-01-25 15:16:27 +00:00
/// Memory usage, as a percentage of total memory, above which memory is
/// reported as `Status::Critical`.
const MEMORY_USAGE_CRITICAL_THRESHOLD: f64 = 90.0;
/// Global CPU usage reading above which the CPU is reported as
/// `Status::Critical`.
const CPU_USAGE_CRITICAL_THRESHOLD: f32 = 90.0;
/// Critical threshold intended for disk usage.
// NOTE(review): nothing in the code visible in this file reads this constant
// yet (the disk check is commented out) — confirm whether it is still planned.
const DISK_USAGE_CRITICAL_THRESHOLD: f32 = 90.0;
/// Bundles the `sysinfo` handles that the health checks read from.
pub struct System {
/// Handle used for the memory and CPU readings; exposed via `system()`.
sys: sysinfo::System,
/// Disk handle; exposed via `disks()`.
disks: sysinfo::Disks,
}
impl System {
pub fn new() -> Self {
Self {
sys: sysinfo::System::new(),
disks: sysinfo::Disks::new(),
}
}
pub fn refresh_resources(&mut self) {
2024-01-26 16:22:09 +00:00
use sysinfo::{CpuRefreshKind, MemoryRefreshKind, RefreshKind};
2024-01-25 15:16:27 +00:00
self.sys.refresh_specifics(
2024-01-26 16:22:09 +00:00
RefreshKind::new()
2024-01-27 21:01:59 +00:00
.with_memory(MemoryRefreshKind::everything())
2024-01-26 16:22:09 +00:00
.with_cpu(CpuRefreshKind::everything()),
2024-01-25 15:16:27 +00:00
);
2024-01-26 16:22:09 +00:00
// self.disks.refresh_list();
2024-01-25 15:16:27 +00:00
}
pub fn system(&self) -> &sysinfo::System {
&self.sys
}
pub fn disks(&self) -> &sysinfo::Disks {
&self.disks
}
}
2024-01-29 10:44:00 +00:00
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
2024-01-25 15:16:27 +00:00
pub enum Status {
#[default]
Normal,
2024-01-26 16:22:09 +00:00
Critical,
2024-01-25 15:16:27 +00:00
}
2024-01-29 10:44:00 +00:00
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
2024-01-25 15:16:27 +00:00
pub struct Health {
2024-01-26 16:22:09 +00:00
cpu_status: Status,
memory_status: Status,
disk_status: Status,
2024-01-25 15:16:27 +00:00
}
/// Cloneable handle around a shared `watch::Sender<Health>`; every clone
/// refers to the same channel through the `Arc`.
#[derive(Clone)]
pub struct HealthMonitor(Arc<watch::Sender<Health>>);
impl HealthMonitor {
pub fn new() -> Self {
2024-01-26 16:22:09 +00:00
let (sender, _) = watch::channel(Health::default());
2024-01-25 15:16:27 +00:00
Self(Arc::new(sender))
}
pub fn check_system(&self, system: &System) {
2024-01-26 16:22:09 +00:00
let sys = system.system();
2024-01-25 15:16:27 +00:00
2024-01-26 16:22:09 +00:00
let memory_usage = if sys.total_memory() > 0 {
sys.used_memory() as f64 * 100.0 / sys.total_memory() as f64
2024-01-25 15:16:27 +00:00
} else {
0.0
};
2024-01-26 16:22:09 +00:00
let cpu_usage = sys.global_cpu_info().cpu_usage();
2024-01-25 15:16:27 +00:00
// for d in system.disks().list() {
2024-01-26 16:22:09 +00:00
// let _avail = if d.total_space() > 0 {
2024-01-25 15:16:27 +00:00
// (d.available_space() * 100 / d.total_space()) as u8
// } else {
// 0 as u8
// };
// }
self.0.send_if_modified(|health| {
2024-01-26 16:22:09 +00:00
let cpu_changed = match health.cpu_status {
Status::Normal if cpu_usage > CPU_USAGE_CRITICAL_THRESHOLD => {
health.cpu_status = Status::Critical;
true
}
Status::Critical if cpu_usage <= CPU_USAGE_CRITICAL_THRESHOLD => {
health.cpu_status = Status::Normal;
true
}
_ => false,
2024-01-25 15:16:27 +00:00
};
2024-01-26 16:22:09 +00:00
let memory_changed = match health.memory_status {
Status::Normal if memory_usage > MEMORY_USAGE_CRITICAL_THRESHOLD => {
health.memory_status = Status::Critical;
true
}
Status::Critical if memory_usage <= MEMORY_USAGE_CRITICAL_THRESHOLD => {
health.memory_status = Status::Normal;
true
}
_ => false,
2024-01-25 15:16:27 +00:00
};
2024-01-26 16:22:09 +00:00
cpu_changed || memory_changed
2024-01-25 15:16:27 +00:00
});
}
pub fn monitor(&self) -> watch::Receiver<Health> {
self.0.subscribe()
}
}
2024-01-26 16:22:09 +00:00
/// The default monitor is exactly what `HealthMonitor::new` builds.
impl Default for HealthMonitor {
#[inline]
fn default() -> Self {
Self::new()
}
}
2024-01-29 10:44:00 +00:00
/// Starts the health subsystem and returns a handle to it.
///
/// Two background workers are started:
///
/// * an OS thread that refreshes the system resources once per second and
///   feeds them to [`HealthMonitor::check_system`];
/// * a Tokio task that logs every health change and publishes it through
///   `client`.
///
/// The returned [`HealthMonitor`] shares the channel used by both workers, so
/// callers can subscribe to the same updates via [`HealthMonitor::monitor`].
///
/// `tokio::spawn` is called here, so this must run inside a Tokio runtime.
pub async fn init_health_subsystem(client: Client) -> HealthMonitor {
    const REFRESH_INTERVAL: Duration = Duration::from_secs(1);

    let health_monitor = HealthMonitor::new();

    // Subscribe *before* the sampler thread starts. A receiver created later
    // treats the value current at subscription time as already seen, so a
    // transition detected before the publishing task first ran would never
    // be published.
    let mut recv = health_monitor.monitor();

    // Forever refresh system resources and monitor changes. The work is
    // blocking, so it lives on a dedicated OS thread instead of the runtime.
    let sampler = health_monitor.clone();
    let mut system = System::new();
    std::thread::spawn(move || loop {
        system.refresh_resources();
        sampler.check_system(&system);
        std::thread::sleep(REFRESH_INTERVAL);
    });

    tokio::spawn(async move {
        while recv.changed().await.is_ok() {
            // Take one snapshot and mark it as seen: the log line and the
            // published message then always agree, and a value that arrived
            // after `changed()` returned is not published a second time.
            // The borrow guard is released before the `.await` below.
            let health = recv.borrow_and_update().clone();
            tracing::info!(health = ?health, "health watermark");
            // NOTE(review): a failure in `Message::health` panics this task
            // and silently stops publishing; consider handling it explicitly.
            client.publish(Message::health(health).unwrap()).await;
        }
    });

    health_monitor
}