2023-08-12 09:37:01 +00:00
|
|
|
//! System health module
|
|
|
|
use std::{collections::HashMap, sync::Arc};
|
|
|
|
|
|
|
|
use tokio::sync::watch;
|
|
|
|
|
2023-11-14 15:23:50 +00:00
|
|
|
use super::{info::Info, task::TaskStatus};
|
2023-08-12 09:37:01 +00:00
|
|
|
|
|
|
|
/// Memory usage, as a percentage of total memory, above which the system is reported as
/// critical (the check is a strict `>` comparison).
const MEMORY_USAGE_CRITICAL_THRESHOLD: u64 = 90;
|
|
|
|
/// Global CPU usage value above which the system is reported as critical. It is cast to `f32`
/// and compared with a strict `>` against the usage reported for the global CPU.
const CPU_USAGE_CRITICAL_THRESHOLD: u64 = 90;
|
|
|
|
/// Disk usage percentage treated as critical: a disk is flagged when its available space, as a
/// percentage of its total space, drops below `100 - DISK_USAGE_CRITICAL_THRESHOLD`.
const DISK_USAGE_CRITICAL_THRESHOLD: u64 = 90;
|
|
|
|
|
|
|
|
/// A resource condition that causes the system to be reported as critical.
///
/// One or more reasons are carried by [`SystemStatus::Critical`]. The [`std::fmt::Display`]
/// implementation renders each variant as a short lowercase phrase.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CriticalReason {
    /// Memory usage exceeded the critical threshold.
    HighMemoryUsage,
    /// Global CPU usage exceeded the critical threshold.
    HighCpuUsage,
    /// A disk has less free space than the critical threshold allows.
    HighDiskUsage,
}
|
|
|
|
|
|
|
|
#[derive(Clone, Default, PartialEq)]
|
|
|
|
pub enum SystemStatus {
|
|
|
|
#[default]
|
|
|
|
Normal,
|
|
|
|
OutOfDate,
|
|
|
|
Updating,
|
|
|
|
Critical(Vec<CriticalReason>),
|
|
|
|
}
|
|
|
|
|
|
|
|
#[derive(Clone, Default)]
|
|
|
|
pub struct SystemHealth {
|
|
|
|
pub status: SystemStatus,
|
|
|
|
}
|
|
|
|
|
|
|
|
#[derive(Default, Clone)]
|
|
|
|
pub struct Health {
|
|
|
|
system: SystemHealth,
|
2023-11-14 15:23:50 +00:00
|
|
|
tasks: HashMap<String, TaskStatus>,
|
2023-08-12 09:37:01 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
impl Health {
|
2023-11-14 15:23:50 +00:00
|
|
|
pub fn system(&self) -> &SystemHealth {
|
|
|
|
&self.system
|
2023-08-12 09:37:01 +00:00
|
|
|
}
|
|
|
|
|
2023-11-14 15:23:50 +00:00
|
|
|
pub fn tasks(&self) -> &HashMap<String, TaskStatus> {
|
|
|
|
&self.tasks
|
2023-08-12 09:37:01 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-11-14 15:23:50 +00:00
|
|
|
/// [HealthMonitor] gives access to shared system health state, allowing to watch health and update
|
2023-08-12 09:37:01 +00:00
|
|
|
/// task health status.
|
|
|
|
///
|
|
|
|
/// # Usage
|
2023-11-14 15:23:50 +00:00
|
|
|
/// Internally it uses [Arc] so it can be cheaply cloned and shared.
|
|
|
|
/// ```
|
|
|
|
/// use prymn_agent::system::health::HealthMonitor;
|
|
|
|
/// use prymn_agent::system::info::Info;
|
2023-08-12 09:37:01 +00:00
|
|
|
///
|
2023-11-14 15:23:50 +00:00
|
|
|
/// let mut info = Info::new();
|
2023-08-12 09:37:01 +00:00
|
|
|
/// let health_monitor = HealthMonitor::new();
|
2023-11-14 15:23:50 +00:00
|
|
|
///
|
|
|
|
/// // Monitor health changes
|
|
|
|
/// let _receiver = health_monitor.monitor();
|
|
|
|
///
|
|
|
|
/// // Refresh system resources
|
|
|
|
/// info.refresh_resources();
|
|
|
|
///
|
|
|
|
/// // Update the health monitor with the refreshed info
|
|
|
|
/// health_monitor.check_system_info(&info);
|
2023-08-12 09:37:01 +00:00
|
|
|
/// ```
|
|
|
|
#[derive(Clone)]
|
|
|
|
pub struct HealthMonitor {
|
|
|
|
sender: Arc<watch::Sender<Health>>,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl HealthMonitor {
|
|
|
|
pub fn new() -> Self {
|
2023-11-14 15:23:50 +00:00
|
|
|
let (sender, _) = watch::channel(Health::default());
|
2023-08-12 09:37:01 +00:00
|
|
|
Self {
|
|
|
|
sender: Arc::new(sender),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-11-14 15:23:50 +00:00
|
|
|
pub fn check_system_info(&self, info: &Info) {
|
2023-08-12 09:37:01 +00:00
|
|
|
use sysinfo::{CpuExt, DiskExt, SystemExt};
|
|
|
|
|
2023-11-14 15:23:50 +00:00
|
|
|
let sys = info.system();
|
|
|
|
let mut status = SystemStatus::Normal;
|
|
|
|
let mut statuses = vec![];
|
2023-08-12 09:37:01 +00:00
|
|
|
|
2023-11-14 15:23:50 +00:00
|
|
|
// Check for critical memory usage
|
|
|
|
let memory_usage = if sys.total_memory() > 0 {
|
|
|
|
sys.used_memory() * 100 / sys.total_memory()
|
|
|
|
} else {
|
|
|
|
0
|
|
|
|
};
|
2023-08-12 09:37:01 +00:00
|
|
|
|
2023-11-14 15:23:50 +00:00
|
|
|
if memory_usage > MEMORY_USAGE_CRITICAL_THRESHOLD {
|
|
|
|
statuses.push(CriticalReason::HighMemoryUsage);
|
|
|
|
}
|
2023-08-12 09:37:01 +00:00
|
|
|
|
2023-11-14 15:23:50 +00:00
|
|
|
// Check for critical CPU usage
|
|
|
|
let cpu_usage = sys.global_cpu_info().cpu_usage();
|
2023-08-12 09:37:01 +00:00
|
|
|
|
2023-11-14 15:23:50 +00:00
|
|
|
if cpu_usage > CPU_USAGE_CRITICAL_THRESHOLD as f32 {
|
|
|
|
statuses.push(CriticalReason::HighCpuUsage);
|
|
|
|
}
|
2023-08-12 09:37:01 +00:00
|
|
|
|
2023-11-14 15:23:50 +00:00
|
|
|
// Check for any disk usage that is critical
|
|
|
|
for disk in sys.disks() {
|
|
|
|
let available_disk = if disk.total_space() > 0 {
|
|
|
|
disk.available_space() * 100 / disk.total_space()
|
|
|
|
} else {
|
|
|
|
0
|
|
|
|
};
|
2023-08-12 09:37:01 +00:00
|
|
|
|
2023-11-14 15:23:50 +00:00
|
|
|
if available_disk < 100 - DISK_USAGE_CRITICAL_THRESHOLD {
|
|
|
|
statuses.push(CriticalReason::HighDiskUsage);
|
2023-08-12 09:37:01 +00:00
|
|
|
}
|
2023-11-14 15:23:50 +00:00
|
|
|
}
|
2023-08-12 09:37:01 +00:00
|
|
|
|
2023-11-14 15:23:50 +00:00
|
|
|
if !statuses.is_empty() {
|
|
|
|
status = SystemStatus::Critical(statuses);
|
|
|
|
}
|
2023-08-12 09:37:01 +00:00
|
|
|
|
|
|
|
self.sender.send_if_modified(|Health { system, .. }| {
|
|
|
|
if system.status == status {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
system.status = status;
|
|
|
|
true
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
2023-11-14 15:23:50 +00:00
|
|
|
/// Spawns a new tokio task that tracks from the [watch::Receiver] the status of a Prymn task
|
|
|
|
/// via [TaskStatus]
|
|
|
|
pub fn track_task(&self, name: String, mut task_recv: watch::Receiver<TaskStatus>) {
|
|
|
|
let sender = self.sender.clone();
|
|
|
|
|
|
|
|
tokio::task::spawn(async move {
|
|
|
|
while task_recv.changed().await.is_ok() {
|
|
|
|
sender.send_modify(|health| {
|
|
|
|
health
|
|
|
|
.tasks
|
|
|
|
.insert(String::from(&name), task_recv.borrow().clone());
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
// At this point the Sender part of the watch dropped, meaning we can clear the task
|
|
|
|
// because it is complete.
|
|
|
|
sender.send_if_modified(|health| health.tasks.remove(&name).is_some());
|
2023-08-12 09:37:01 +00:00
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn clear_task(&self, task_name: &str) {
|
|
|
|
self.sender
|
|
|
|
.send_if_modified(|Health { tasks, .. }| tasks.remove(task_name).is_some());
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn monitor(&self) -> watch::Receiver<Health> {
|
2023-11-14 15:23:50 +00:00
|
|
|
self.sender.subscribe()
|
2023-08-12 09:37:01 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Default for HealthMonitor {
|
|
|
|
fn default() -> Self {
|
2023-11-14 15:23:50 +00:00
|
|
|
Self::new()
|
2023-08-12 09:37:01 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl std::fmt::Display for SystemStatus {
|
|
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
|
|
match self {
|
|
|
|
SystemStatus::Normal => write!(f, "normal"),
|
|
|
|
SystemStatus::OutOfDate => write!(f, "out of date"),
|
|
|
|
SystemStatus::Updating => write!(f, "updating"),
|
|
|
|
SystemStatus::Critical(_) => write!(f, "critical"),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl std::fmt::Display for CriticalReason {
|
|
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
|
|
match self {
|
|
|
|
CriticalReason::HighMemoryUsage => write!(f, "high memory usage"),
|
|
|
|
CriticalReason::HighCpuUsage => write!(f, "high cpu usage"),
|
|
|
|
CriticalReason::HighDiskUsage => write!(f, "high disk usage"),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|