dotfiles/agent/src/system/health.rs

303 lines
8.1 KiB
Rust
Raw Normal View History

//! System health module
use std::{collections::HashMap, sync::Arc};
use chrono::{DateTime, Utc};
use tokio::sync::watch;
use super::SYSTEM;
// Used-memory percentage above which `check_system` reports `CriticalReason::HighMemoryUsage`.
const MEMORY_USAGE_CRITICAL_THRESHOLD: u64 = 90;
// Global CPU usage value (as reported by `sysinfo`, compared as `f32`) above which
// `check_system` reports `CriticalReason::HighCpuUsage`.
const CPU_USAGE_CRITICAL_THRESHOLD: u64 = 90;
// Disk usage percentage: a disk is reported as `CriticalReason::HighDiskUsage` when its
// available space drops below `100 - DISK_USAGE_CRITICAL_THRESHOLD` percent of its total space.
const DISK_USAGE_CRITICAL_THRESHOLD: u64 = 90;
/// Reason why the system is considered to be in a critical state.
///
/// `Debug` and `Eq` are derived in addition to `Clone`/`PartialEq` so that values can be
/// logged, compared with `assert_eq!`, and embedded in types that derive those traits.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum CriticalReason {
    /// Memory usage exceeded the critical threshold.
    HighMemoryUsage,
    /// Global CPU usage exceeded the critical threshold.
    HighCpuUsage,
    /// A disk has less free space left than the critical threshold allows.
    HighDiskUsage,
}
/// Overall status of the system.
///
/// [`SystemStatus::Normal`] is the default value.
#[derive(Default, Clone, PartialEq)]
pub enum SystemStatus {
    /// No critical condition is being reported.
    #[default]
    Normal,
    /// Rendered as "out of date" by the `Display` implementation.
    OutOfDate,
    /// Rendered as "updating" by the `Display` implementation.
    Updating,
    /// One or more critical conditions are active; the payload lists the reasons.
    Critical(Vec<CriticalReason>),
}
/// System-wide part of the health state.
#[derive(Default, Clone)]
pub struct SystemHealth {
    /// Current status of the system.
    pub status: SystemStatus,
}
/// Status of a single tracked task.
#[derive(Debug, Clone, PartialEq)]
pub enum TaskStatus {
    /// The task is running without problems.
    Normal,
    /// The task reported a warning.
    Warning,
    /// The task reported an error.
    Error,
    /// The task has finished.
    Completed,
}
/// Health information about a single task.
#[derive(Clone)]
pub struct TaskHealth {
    /// Current status of the task.
    status: TaskStatus,
    /// UTC timestamp captured when this value was created.
    started_on: DateTime<Utc>,
    /// Latest message attached to the task.
    message: String,
    /// Progress value; `0` on creation and `100` once completed.
    progress: u8,
}
impl TaskHealth {
    /// Creates a new task health entry.
    ///
    /// The entry starts in [`TaskStatus::Normal`] with a progress of `0`, and `started_on` is
    /// set to the current UTC time.
    pub fn new(message: String) -> Self {
        Self {
            status: TaskStatus::Normal,
            started_on: chrono::Utc::now(),
            message,
            progress: 0,
        }
    }

    /// Marks the task as [`TaskStatus::Normal`] and replaces its message.
    pub fn set_normal(&mut self, message: String) {
        self.status = TaskStatus::Normal;
        self.message = message;
    }

    /// Marks the task as [`TaskStatus::Warning`] and replaces its message.
    pub fn set_warning(&mut self, message: String) {
        self.status = TaskStatus::Warning;
        self.message = message;
    }

    /// Marks the task as [`TaskStatus::Error`] and replaces its message.
    pub fn set_error(&mut self, message: String) {
        self.status = TaskStatus::Error;
        self.message = message;
    }

    /// Marks the task as [`TaskStatus::Completed`], sets the progress to `100` and replaces
    /// its message.
    ///
    /// This takes `&mut self` like every other setter. Taking `self` by value would only
    /// modify a moved copy that is dropped on return, so the caller could never observe the
    /// completed state.
    pub fn set_completed(&mut self, message: String) {
        self.status = TaskStatus::Completed;
        self.progress = 100;
        self.message = message;
    }

    /// Updates the progress value and message without touching the status.
    ///
    /// `progress` is stored as given; it is not validated or clamped.
    pub fn set_progress(&mut self, message: String, progress: u8) {
        self.progress = progress;
        self.message = message;
    }

    /// Returns the current status of the task.
    pub fn status(&self) -> &TaskStatus {
        &self.status
    }

    /// Returns the UTC timestamp captured when this value was created.
    pub fn started_on(&self) -> &DateTime<Utc> {
        &self.started_on
    }

    /// Returns the latest message attached to the task.
    pub fn message(&self) -> &str {
        &self.message
    }

    /// Returns the current progress value.
    pub fn progress(&self) -> u8 {
        self.progress
    }
}
/// Complete health state: the system-wide health plus the health of every tracked task.
#[derive(Clone, Default)]
pub struct Health {
    /// System-wide health.
    system: SystemHealth,
    /// Health of individual tasks, keyed by task name.
    tasks: HashMap<String, TaskHealth>,
}
impl Health {
    /// Returns a copy of the system-wide health.
    pub fn system(&self) -> SystemHealth {
        let Self { system, .. } = self;
        system.clone()
    }

    /// Consumes the value and returns the per-task health map.
    pub fn tasks(self) -> HashMap<String, TaskHealth> {
        let Self { tasks, .. } = self;
        tasks
    }
}
/// Shared handle to the system health state.
///
/// A `HealthMonitor` lets callers update the health of individual tasks and watch the overall
/// health for changes.
///
/// # Usage
/// The sending half is kept behind an [Arc], so a `HealthMonitor` can be cheaply cloned and
/// shared; every clone publishes to the same state.
///
/// ```no_run
/// use prymn_agent::system::health::{HealthMonitor, TaskHealth};
///
/// let health_monitor = HealthMonitor::new();
/// let health_monitor_clone = health_monitor.clone();
///
/// tokio::spawn(async move {
///     loop {
///         health_monitor_clone.check_system().await;
///     }
/// });
///
/// tokio::spawn(async move {
///     health_monitor.set_task_health(
///         "some_task".to_string(),
///         TaskHealth::new("example".to_string())
///     );
/// });
/// ```
#[derive(Clone)]
pub struct HealthMonitor {
    /// Sending half of the watch channel, shared between clones.
    sender: Arc<watch::Sender<Health>>,
    /// Receiving half kept by the monitor; `monitor()` hands out clones of it.
    receiver: watch::Receiver<Health>,
}
impl HealthMonitor {
    /// Creates a monitor whose shared state starts out as [`Health::default`].
    pub fn new() -> Self {
        let (sender, receiver) = watch::channel(Health::default());

        Self {
            sender: Arc::new(sender),
            receiver,
        }
    }

    /// Samples memory, CPU and disk usage and publishes the resulting [`SystemStatus`].
    ///
    /// The sampling runs on a blocking thread while holding the `SYSTEM` lock. Watchers are
    /// only notified when the computed status differs from the currently stored one.
    ///
    /// Each [`CriticalReason`] is reported at most once per check, no matter how many disks
    /// are low on space. Resources that report a total size of zero are skipped instead of
    /// causing a division by zero.
    ///
    /// # Panics
    /// Panics if the blocking task panicked (for example because locking `SYSTEM` failed).
    // TODO: Remove async from here (so it can be consistent)
    // Move system checking task into its own thing
    pub async fn check_system(&self) {
        use sysinfo::{CpuExt, DiskExt, SystemExt};

        // Integer percentage of `part` relative to `whole`.
        //
        // Returns `None` when `whole` is zero (some pseudo filesystems report a total size of
        // zero) instead of panicking. The multiplication saturates rather than overflowing.
        fn percent_of(part: u64, whole: u64) -> Option<u64> {
            part.saturating_mul(100).checked_div(whole)
        }

        let status = tokio::task::spawn_blocking(|| {
            // TODO: For testability, dependency inject this System struct in this function.
            let mut sys = SYSTEM.lock().unwrap();

            // Refresh system resources usage
            sys.refresh_specifics(
                sysinfo::RefreshKind::new()
                    .with_memory()
                    .with_disks()
                    .with_cpu(sysinfo::CpuRefreshKind::new().with_cpu_usage()),
            );

            let mut reasons = Vec::new();

            // Check for critical memory usage
            let memory_is_critical = percent_of(sys.used_memory(), sys.total_memory())
                .is_some_and(|used| used > MEMORY_USAGE_CRITICAL_THRESHOLD);
            if memory_is_critical {
                reasons.push(CriticalReason::HighMemoryUsage);
            }

            // Check for critical CPU usage
            if sys.global_cpu_info().cpu_usage() > CPU_USAGE_CRITICAL_THRESHOLD as f32 {
                reasons.push(CriticalReason::HighCpuUsage);
            }

            // Check for any disk usage that is critical. The reason is recorded once even if
            // several disks are affected, so the reported status does not contain duplicates.
            let any_disk_is_critical = sys.disks().iter().any(|disk| {
                percent_of(disk.available_space(), disk.total_space())
                    .is_some_and(|available| available < 100 - DISK_USAGE_CRITICAL_THRESHOLD)
            });
            if any_disk_is_critical {
                reasons.push(CriticalReason::HighDiskUsage);
            }

            if reasons.is_empty() {
                SystemStatus::Normal
            } else {
                SystemStatus::Critical(reasons)
            }
        })
        .await
        .expect("system checking task panicked - possibly due to panicked mutex lock");

        // Only wake up watchers when the status actually changed.
        self.sender.send_if_modified(|Health { system, .. }| {
            if system.status == status {
                return false;
            }

            system.status = status;
            true
        });
    }

    /// Inserts or replaces the health of the task named `task_name`.
    pub fn set_task_health(&self, task_name: String, health: TaskHealth) {
        // Always send a notification in this case since it is an explicit action.
        self.sender.send_modify(|Health { tasks, .. }| {
            tasks.insert(task_name, health);
        });
    }

    /// Removes the task named `task_name`.
    ///
    /// Watchers are only notified when the task was actually present.
    pub fn clear_task(&self, task_name: &str) {
        self.sender
            .send_if_modified(|Health { tasks, .. }| tasks.remove(task_name).is_some());
    }

    /// Returns a receiver that can be used to watch the shared [`Health`] state.
    pub fn monitor(&self) -> watch::Receiver<Health> {
        self.receiver.clone()
    }
}
impl Default for HealthMonitor {
fn default() -> Self {
HealthMonitor::new()
}
}
impl std::fmt::Display for SystemStatus {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
SystemStatus::Normal => write!(f, "normal"),
SystemStatus::OutOfDate => write!(f, "out of date"),
SystemStatus::Updating => write!(f, "updating"),
SystemStatus::Critical(_) => write!(f, "critical"),
}
}
}
impl std::fmt::Display for CriticalReason {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
CriticalReason::HighMemoryUsage => write!(f, "high memory usage"),
CriticalReason::HighCpuUsage => write!(f, "high cpu usage"),
CriticalReason::HighDiskUsage => write!(f, "high disk usage"),
}
}
}
impl std::fmt::Display for TaskStatus {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
TaskStatus::Normal => write!(f, "normal"),
TaskStatus::Warning => write!(f, "warning"),
TaskStatus::Error => write!(f, "error"),
TaskStatus::Completed => write!(f, "completed"),
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Walks a task through its lifecycle: registering it notifies watchers and exposes the
    /// stored health, clearing it removes the entry again.
    #[test]
    fn test_task_monitor() {
        let monitor = HealthMonitor::new();
        let watcher = monitor.monitor();

        // Nothing has been published yet.
        assert!(matches!(watcher.has_changed(), Ok(false)));

        monitor.set_task_health(
            "some_task".to_string(),
            TaskHealth::new("this is normal".to_owned()),
        );
        assert!(matches!(watcher.has_changed(), Ok(true)));

        // Keep the borrow in its own scope so it is released before the next update.
        {
            let snapshot = watcher.borrow();
            let task = snapshot
                .tasks
                .get("some_task")
                .expect("a task should exist");

            assert_eq!(task.status, TaskStatus::Normal);
            assert_eq!(task.progress, 0);
            assert_eq!(task.message, "this is normal");
        }

        monitor.clear_task("some_task");
        assert!(!watcher.borrow().tasks.contains_key("some_task"));
    }
}