//! System health module use std::{collections::HashMap, sync::Arc}; use chrono::{DateTime, Utc}; use tokio::sync::watch; use super::SYSTEM; const MEMORY_USAGE_CRITICAL_THRESHOLD: u64 = 90; const CPU_USAGE_CRITICAL_THRESHOLD: u64 = 90; const DISK_USAGE_CRITICAL_THRESHOLD: u64 = 90; #[derive(Clone, PartialEq)] pub enum CriticalReason { HighMemoryUsage, HighCpuUsage, HighDiskUsage, } #[derive(Clone, Default, PartialEq)] pub enum SystemStatus { #[default] Normal, OutOfDate, Updating, Critical(Vec), } #[derive(Clone, Default)] pub struct SystemHealth { pub status: SystemStatus, } #[derive(Clone, PartialEq, Debug)] pub enum TaskStatus { Normal, Warning, Error, Completed, } #[derive(Clone)] pub struct TaskHealth { status: TaskStatus, started_on: DateTime, message: String, progress: u8, } impl TaskHealth { pub fn new(message: String) -> Self { let started_on = chrono::Utc::now(); Self { status: TaskStatus::Normal, started_on, message, progress: 0, } } pub fn set_normal(&mut self, message: String) { self.status = TaskStatus::Normal; self.message = message; } pub fn set_warning(&mut self, message: String) { self.status = TaskStatus::Warning; self.message = message; } pub fn set_error(&mut self, message: String) { self.status = TaskStatus::Error; self.message = message; } pub fn set_completed(mut self, message: String) { self.status = TaskStatus::Completed; self.progress = 100; self.message = message; } pub fn set_progress(&mut self, message: String, progress: u8) { self.progress = progress; self.message = message; } pub fn status(&self) -> &TaskStatus { &self.status } pub fn started_on(&self) -> &DateTime { &self.started_on } pub fn message(&self) -> &str { &self.message } pub fn progress(&self) -> u8 { self.progress } } #[derive(Default, Clone)] pub struct Health { system: SystemHealth, tasks: HashMap, } impl Health { pub fn system(&self) -> SystemHealth { self.system.clone() } pub fn tasks(self) -> HashMap { self.tasks } } /// `HealthMonitor` gives access to shared system health state, allowing to watch health and update /// task health status. /// /// # Usage /// Internally `HealthMonitor` uses [Arc] so it can be cheaply cloned and shared. /// /// ```no_run /// use prymn_agent::system::health::{HealthMonitor, TaskHealth}; /// /// let health_monitor = HealthMonitor::new(); /// let health_monitor_clone = health_monitor.clone(); /// tokio::spawn(async move { /// loop { /// health_monitor_clone.check_system().await; /// } /// }); /// tokio::spawn(async move { /// health_monitor.set_task_health( /// "some_task".to_string(), /// TaskHealth::new("example".to_string()) /// ); /// }); /// ``` #[derive(Clone)] pub struct HealthMonitor { sender: Arc>, receiver: watch::Receiver, } impl HealthMonitor { pub fn new() -> Self { let (sender, receiver) = watch::channel(Health::default()); Self { sender: Arc::new(sender), receiver, } } // TODO: Remove async from here (so it can be consistent) // Move system checking task into it's own thing pub async fn check_system(&self) { use sysinfo::{CpuExt, DiskExt, SystemExt}; let status = tokio::task::spawn_blocking(|| { let mut status = SystemStatus::Normal; // TODO: For testability, dependency inject this System struct in this function. let mut sys = SYSTEM.lock().unwrap(); // Refresh system resources usage sys.refresh_specifics( sysinfo::RefreshKind::new() .with_memory() .with_disks() .with_cpu(sysinfo::CpuRefreshKind::new().with_cpu_usage()), ); let mut statuses = vec![]; // Check for critical memory usage let memory_usage = sys.used_memory() * 100 / sys.total_memory(); if memory_usage > MEMORY_USAGE_CRITICAL_THRESHOLD { statuses.push(CriticalReason::HighMemoryUsage); } // Check for critical CPU usage let cpu_usage = sys.global_cpu_info().cpu_usage(); if cpu_usage > CPU_USAGE_CRITICAL_THRESHOLD as f32 { statuses.push(CriticalReason::HighCpuUsage); } // Check for any disk usage that is critical for disk in sys.disks() { let available_disk = disk.available_space() * 100 / disk.total_space(); if available_disk < 100 - DISK_USAGE_CRITICAL_THRESHOLD { statuses.push(CriticalReason::HighDiskUsage); } } if !statuses.is_empty() { status = SystemStatus::Critical(statuses); } status }) .await .expect("system checking task panicked - possibly due to panicked mutex lock"); self.sender.send_if_modified(|Health { system, .. }| { if system.status == status { return false; } system.status = status; true }); } pub fn set_task_health(&self, task_name: String, health: TaskHealth) { // Always send a notification in this case since it is an explicit action. self.sender.send_modify(|Health { tasks, .. }| { tasks.insert(task_name, health); }); } pub fn clear_task(&self, task_name: &str) { self.sender .send_if_modified(|Health { tasks, .. }| tasks.remove(task_name).is_some()); } pub fn monitor(&self) -> watch::Receiver { self.receiver.clone() } } impl Default for HealthMonitor { fn default() -> Self { HealthMonitor::new() } } impl std::fmt::Display for SystemStatus { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { SystemStatus::Normal => write!(f, "normal"), SystemStatus::OutOfDate => write!(f, "out of date"), SystemStatus::Updating => write!(f, "updating"), SystemStatus::Critical(_) => write!(f, "critical"), } } } impl std::fmt::Display for CriticalReason { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { CriticalReason::HighMemoryUsage => write!(f, "high memory usage"), CriticalReason::HighCpuUsage => write!(f, "high cpu usage"), CriticalReason::HighDiskUsage => write!(f, "high disk usage"), } } } impl std::fmt::Display for TaskStatus { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { TaskStatus::Normal => write!(f, "normal"), TaskStatus::Warning => write!(f, "warning"), TaskStatus::Error => write!(f, "error"), TaskStatus::Completed => write!(f, "completed"), } } } #[cfg(test)] mod tests { use super::*; #[test] fn test_task_monitor() { let health_monitor = HealthMonitor::new(); let receiver = health_monitor.monitor(); assert!(receiver.has_changed().is_ok_and(|changed| !changed)); let health = TaskHealth::new("this is normal".to_owned()); health_monitor.set_task_health("some_task".to_string(), health); assert!(receiver.has_changed().is_ok_and(|changed| changed)); { let health = receiver.borrow(); let task_health = health.tasks.get("some_task").expect("a task should exist"); assert_eq!(task_health.status, TaskStatus::Normal); assert_eq!(task_health.progress, 0); assert_eq!(task_health.message, "this is normal"); } health_monitor.clear_task("some_task"); assert!(!receiver.borrow().tasks.contains_key("some_task")); } }