dotfiles/agent/src/system/health.rs
Nikos Papadakis 5c64f02579 Feature: Agent Tasks (#8)
Reviewed-on: https://git.nikos.gg/prymn/prymn/pulls/8
Co-authored-by: Nikos Papadakis <nikos@papadakis.xyz>
Co-committed-by: Nikos Papadakis <nikos@papadakis.xyz>
2023-11-14 15:23:50 +00:00

190 lines
5.2 KiB
Rust

//! System health module
use std::{collections::HashMap, sync::Arc};
use tokio::sync::watch;
use super::{info::Info, task::TaskStatus};
/// Memory usage, as a whole percentage of total memory, above which the system is reported
/// as critical.
const MEMORY_USAGE_CRITICAL_THRESHOLD: u64 = 90;
/// Value the global CPU usage reading must exceed for the system to be reported as critical.
// NOTE(review): presumably a percentage in the 0-100 range like the other thresholds — confirm
// against the sysinfo version in use.
const CPU_USAGE_CRITICAL_THRESHOLD: u64 = 90;
/// Disk usage threshold in percent: a disk is reported as critical when its available space
/// drops below `100 - DISK_USAGE_CRITICAL_THRESHOLD` percent of its total space.
const DISK_USAGE_CRITICAL_THRESHOLD: u64 = 90;
/// A reason for which the system is considered to be in a critical state.
///
/// `Debug` and `Eq` are derived so that values can be logged and compared in assertions.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum CriticalReason {
    /// Memory usage exceeded its critical threshold.
    HighMemoryUsage,
    /// Global CPU usage exceeded its critical threshold.
    HighCpuUsage,
    /// At least one disk has less available space than the critical threshold allows.
    HighDiskUsage,
}
/// Overall status of the system as tracked by the health monitor.
#[derive(Clone, Default, PartialEq)]
pub enum SystemStatus {
/// No critical condition was detected. This is the default status.
#[default]
Normal,
// NOTE(review): nothing in this file sets `OutOfDate` or `Updating`; presumably they relate
// to agent updates handled elsewhere — confirm.
OutOfDate,
Updating,
/// One or more resource checks crossed their critical threshold; carries the reasons.
Critical(Vec<CriticalReason>),
}
/// System-wide part of the health state.
#[derive(Clone, Default)]
pub struct SystemHealth {
/// Current status of the system; defaults to [`SystemStatus::Normal`].
pub status: SystemStatus,
}
/// Snapshot of the complete health state that is shared through the watch channel:
/// the system-wide health plus the status of every tracked task.
#[derive(Default, Clone)]
pub struct Health {
// System-wide health; read through `Health::system`.
system: SystemHealth,
// Latest known status of each tracked task, keyed by task name; read through `Health::tasks`.
tasks: HashMap<String, TaskStatus>,
}
impl Health {
    /// Borrows the system-wide health state.
    pub fn system(&self) -> &SystemHealth {
        let Self { system, .. } = self;
        system
    }

    /// Borrows the map of tracked task statuses, keyed by task name.
    pub fn tasks(&self) -> &HashMap<String, TaskStatus> {
        let Self { tasks, .. } = self;
        tasks
    }
}
/// [HealthMonitor] gives access to the shared system health state, allowing callers to watch
/// health changes and to update the health status of tasks.
///
/// # Usage
/// Internally it uses [Arc] so it can be cheaply cloned and shared.
/// ```
/// use prymn_agent::system::health::HealthMonitor;
/// use prymn_agent::system::info::Info;
///
/// let mut info = Info::new();
/// let health_monitor = HealthMonitor::new();
///
/// // Monitor health changes
/// let _receiver = health_monitor.monitor();
///
/// // Refresh system resources
/// info.refresh_resources();
///
/// // Update the health monitor with the refreshed info
/// health_monitor.check_system_info(&info);
/// ```
#[derive(Clone)]
pub struct HealthMonitor {
// Sending half of the watch channel that holds the current `Health`; every clone of the
// monitor shares this same sender.
sender: Arc<watch::Sender<Health>>,
}
impl HealthMonitor {
pub fn new() -> Self {
let (sender, _) = watch::channel(Health::default());
Self {
sender: Arc::new(sender),
}
}
pub fn check_system_info(&self, info: &Info) {
use sysinfo::{CpuExt, DiskExt, SystemExt};
let sys = info.system();
let mut status = SystemStatus::Normal;
let mut statuses = vec![];
// Check for critical memory usage
let memory_usage = if sys.total_memory() > 0 {
sys.used_memory() * 100 / sys.total_memory()
} else {
0
};
if memory_usage > MEMORY_USAGE_CRITICAL_THRESHOLD {
statuses.push(CriticalReason::HighMemoryUsage);
}
// Check for critical CPU usage
let cpu_usage = sys.global_cpu_info().cpu_usage();
if cpu_usage > CPU_USAGE_CRITICAL_THRESHOLD as f32 {
statuses.push(CriticalReason::HighCpuUsage);
}
// Check for any disk usage that is critical
for disk in sys.disks() {
let available_disk = if disk.total_space() > 0 {
disk.available_space() * 100 / disk.total_space()
} else {
0
};
if available_disk < 100 - DISK_USAGE_CRITICAL_THRESHOLD {
statuses.push(CriticalReason::HighDiskUsage);
}
}
if !statuses.is_empty() {
status = SystemStatus::Critical(statuses);
}
self.sender.send_if_modified(|Health { system, .. }| {
if system.status == status {
return false;
}
system.status = status;
true
});
}
/// Spawns a new tokio task that tracks from the [watch::Receiver] the status of a Prymn task
/// via [TaskStatus]
pub fn track_task(&self, name: String, mut task_recv: watch::Receiver<TaskStatus>) {
let sender = self.sender.clone();
tokio::task::spawn(async move {
while task_recv.changed().await.is_ok() {
sender.send_modify(|health| {
health
.tasks
.insert(String::from(&name), task_recv.borrow().clone());
});
}
// At this point the Sender part of the watch dropped, meaning we can clear the task
// because it is complete.
sender.send_if_modified(|health| health.tasks.remove(&name).is_some());
});
}
pub fn clear_task(&self, task_name: &str) {
self.sender
.send_if_modified(|Health { tasks, .. }| tasks.remove(task_name).is_some());
}
pub fn monitor(&self) -> watch::Receiver<Health> {
self.sender.subscribe()
}
}
impl Default for HealthMonitor {
fn default() -> Self {
Self::new()
}
}
impl std::fmt::Display for SystemStatus {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
SystemStatus::Normal => write!(f, "normal"),
SystemStatus::OutOfDate => write!(f, "out of date"),
SystemStatus::Updating => write!(f, "updating"),
SystemStatus::Critical(_) => write!(f, "critical"),
}
}
}
impl std::fmt::Display for CriticalReason {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
CriticalReason::HighMemoryUsage => write!(f, "high memory usage"),
CriticalReason::HighCpuUsage => write!(f, "high cpu usage"),
CriticalReason::HighDiskUsage => write!(f, "high disk usage"),
}
}
}