//! System health module use std::{collections::HashMap, sync::Arc}; use tokio::sync::watch; use super::{info::Info, task::TaskStatus}; const MEMORY_USAGE_CRITICAL_THRESHOLD: u64 = 90; const CPU_USAGE_CRITICAL_THRESHOLD: u64 = 90; const DISK_USAGE_CRITICAL_THRESHOLD: u64 = 90; #[derive(Clone, PartialEq)] pub enum CriticalReason { HighMemoryUsage, HighCpuUsage, HighDiskUsage, } #[derive(Clone, Default, PartialEq)] pub enum SystemStatus { #[default] Normal, OutOfDate, Updating, Critical(Vec), } #[derive(Clone, Default)] pub struct SystemHealth { pub status: SystemStatus, } #[derive(Default, Clone)] pub struct Health { system: SystemHealth, tasks: HashMap, } impl Health { pub fn system(&self) -> &SystemHealth { &self.system } pub fn tasks(&self) -> &HashMap { &self.tasks } } /// [HealthMonitor] gives access to shared system health state, allowing to watch health and update /// task health status. /// /// # Usage /// Internally it uses [Arc] so it can be cheaply cloned and shared. /// ``` /// use prymn_agent::system::health::HealthMonitor; /// use prymn_agent::system::info::Info; /// /// let mut info = Info::new(); /// let health_monitor = HealthMonitor::new(); /// /// // Monitor health changes /// let _receiver = health_monitor.monitor(); /// /// // Refresh system resources /// info.refresh_resources(); /// /// // Update the health monitor with the refreshed info /// health_monitor.check_system_info(&info); /// ``` #[derive(Clone)] pub struct HealthMonitor { sender: Arc>, } impl HealthMonitor { pub fn new() -> Self { let (sender, _) = watch::channel(Health::default()); Self { sender: Arc::new(sender), } } pub fn check_system_info(&self, info: &Info) { use sysinfo::{CpuExt, DiskExt, SystemExt}; let sys = info.system(); let mut status = SystemStatus::Normal; let mut statuses = vec![]; // Check for critical memory usage let memory_usage = if sys.total_memory() > 0 { sys.used_memory() * 100 / sys.total_memory() } else { 0 }; if memory_usage > MEMORY_USAGE_CRITICAL_THRESHOLD { statuses.push(CriticalReason::HighMemoryUsage); } // Check for critical CPU usage let cpu_usage = sys.global_cpu_info().cpu_usage(); if cpu_usage > CPU_USAGE_CRITICAL_THRESHOLD as f32 { statuses.push(CriticalReason::HighCpuUsage); } // Check for any disk usage that is critical for disk in sys.disks() { let available_disk = if disk.total_space() > 0 { disk.available_space() * 100 / disk.total_space() } else { 0 }; if available_disk < 100 - DISK_USAGE_CRITICAL_THRESHOLD { statuses.push(CriticalReason::HighDiskUsage); } } if !statuses.is_empty() { status = SystemStatus::Critical(statuses); } self.sender.send_if_modified(|Health { system, .. }| { if system.status == status { return false; } system.status = status; true }); } /// Spawns a new tokio task that tracks from the [watch::Receiver] the status of a Prymn task /// via [TaskStatus] pub fn track_task(&self, name: String, mut task_recv: watch::Receiver) { let sender = self.sender.clone(); tokio::task::spawn(async move { while task_recv.changed().await.is_ok() { sender.send_modify(|health| { health .tasks .insert(String::from(&name), task_recv.borrow().clone()); }); } // At this point the Sender part of the watch dropped, meaning we can clear the task // because it is complete. sender.send_if_modified(|health| health.tasks.remove(&name).is_some()); }); } pub fn clear_task(&self, task_name: &str) { self.sender .send_if_modified(|Health { tasks, .. }| tasks.remove(task_name).is_some()); } pub fn monitor(&self) -> watch::Receiver { self.sender.subscribe() } } impl Default for HealthMonitor { fn default() -> Self { Self::new() } } impl std::fmt::Display for SystemStatus { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { SystemStatus::Normal => write!(f, "normal"), SystemStatus::OutOfDate => write!(f, "out of date"), SystemStatus::Updating => write!(f, "updating"), SystemStatus::Critical(_) => write!(f, "critical"), } } } impl std::fmt::Display for CriticalReason { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { CriticalReason::HighMemoryUsage => write!(f, "high memory usage"), CriticalReason::HighCpuUsage => write!(f, "high cpu usage"), CriticalReason::HighDiskUsage => write!(f, "high disk usage"), } } }