dotfiles/agent/src/system/health.rs

//! System health module
use std::{collections::HashMap, sync::Arc};

use chrono::{DateTime, Utc};
use tokio::sync::watch;

use super::SYSTEM;

/// Used-memory percentage above which the system check reports high memory usage.
const MEMORY_USAGE_CRITICAL_THRESHOLD: u64 = 90;
/// Global CPU usage percentage above which the system check reports high CPU usage.
const CPU_USAGE_CRITICAL_THRESHOLD: u64 = 90;
/// Disk usage percentage threshold: a disk whose available space drops below
/// `100 - DISK_USAGE_CRITICAL_THRESHOLD` percent of its total space is reported as critical.
const DISK_USAGE_CRITICAL_THRESHOLD: u64 = 90;

/// Reason why the system check flagged the system as critical.
///
/// Derives `Debug` and `Eq` so values can be logged and compared in assertions.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum CriticalReason {
    /// Used memory is above the memory usage threshold.
    HighMemoryUsage,
    /// Global CPU usage is above the CPU usage threshold.
    HighCpuUsage,
    /// A disk has less available space than the disk usage threshold allows.
    HighDiskUsage,
}

/// Overall status of the host system.
///
/// The default value is [`SystemStatus::Normal`].
#[derive(Clone, Default, PartialEq)]
pub enum SystemStatus {
    /// No critical condition was detected.
    #[default]
    Normal,
    // NOTE(review): `OutOfDate` and `Updating` are never produced in this file;
    // presumably they are set by an update mechanism elsewhere — confirm.
    OutOfDate,
    Updating,
    /// At least one resource check exceeded its threshold; carries the reasons.
    Critical(Vec<CriticalReason>),
}

/// System-wide part of the shared health state.
#[derive(Clone, Default)]
pub struct SystemHealth {
    /// Current overall system status.
    pub status: SystemStatus,
}

/// Status of a single tracked task, as recorded in a [`TaskHealth`].
#[derive(Clone, PartialEq, Debug)]
pub enum TaskStatus {
    /// Initial status of a new task; also set by `TaskHealth::set_normal`.
    Normal,
    /// Set by `TaskHealth::set_warning`.
    Warning,
    /// Set by `TaskHealth::set_error`.
    Error,
    /// Set by `TaskHealth::set_completed`.
    Completed,
}

/// Health record of a single task: status, start time, message and progress.
#[derive(Clone)]
pub struct TaskHealth {
    /// Current status of the task.
    status: TaskStatus,
    /// UTC timestamp captured when the record was created.
    started_on: DateTime<Utc>,
    /// Latest message supplied through the constructor or one of the setters.
    message: String,
    /// Progress indicator; starts at `0`, and completion uses `100`.
    progress: u8,
}

impl TaskHealth {
    /// Highest progress value; this is what a completed task reports.
    const MAX_PROGRESS: u8 = 100;

    /// Creates a task health record in the [`TaskStatus::Normal`] state.
    ///
    /// The start time is taken from the current UTC clock and progress starts at `0`.
    pub fn new(message: String) -> Self {
        Self {
            status: TaskStatus::Normal,
            started_on: chrono::Utc::now(),
            message,
            progress: 0,
        }
    }

    /// Marks the task as [`TaskStatus::Normal`] and replaces its message.
    pub fn set_normal(&mut self, message: String) {
        self.status = TaskStatus::Normal;
        self.message = message;
    }

    /// Marks the task as [`TaskStatus::Warning`] and replaces its message.
    pub fn set_warning(&mut self, message: String) {
        self.status = TaskStatus::Warning;
        self.message = message;
    }

    /// Marks the task as [`TaskStatus::Error`] and replaces its message.
    pub fn set_error(&mut self, message: String) {
        self.status = TaskStatus::Error;
        self.message = message;
    }

    /// Marks the task as [`TaskStatus::Completed`], sets progress to `100` and replaces its
    /// message.
    ///
    /// Takes `&mut self` like every other setter. A by-value receiver would update a moved
    /// copy that is dropped when the method returns, so the caller could never observe the
    /// completed state.
    pub fn set_completed(&mut self, message: String) {
        self.status = TaskStatus::Completed;
        self.progress = Self::MAX_PROGRESS;
        self.message = message;
    }

    /// Updates the progress and message without touching the status.
    ///
    /// `progress` is capped at `100`, the value a completed task reports.
    pub fn set_progress(&mut self, message: String, progress: u8) {
        self.progress = progress.min(Self::MAX_PROGRESS);
        self.message = message;
    }

    /// Returns the current status of the task.
    pub fn status(&self) -> &TaskStatus {
        &self.status
    }

    /// Returns the UTC timestamp captured when the record was created.
    pub fn started_on(&self) -> &DateTime<Utc> {
        &self.started_on
    }

    /// Returns the latest message.
    pub fn message(&self) -> &str {
        &self.message
    }

    /// Returns the current progress value.
    pub fn progress(&self) -> u8 {
        self.progress
    }
}

/// Snapshot of the shared health state: system-wide health plus per-task health.
#[derive(Default, Clone)]
pub struct Health {
    /// System-wide health.
    system: SystemHealth,
    /// Health of each tracked task, keyed by task name.
    tasks: HashMap<String, TaskHealth>,
}

impl Health {
    pub fn system(&self) -> SystemHealth {
        self.system.clone()
    }

    pub fn tasks(self) -> HashMap<String, TaskHealth> {
        self.tasks
    }
}

/// `HealthMonitor` gives access to the shared system health state: it lets callers watch the
/// health and update the health status of tasks.
///
/// # Usage
/// Internally `HealthMonitor` uses [Arc] so it can be cheaply cloned and shared.
///
/// ```no_run
/// use prymn_agent::system::health::{HealthMonitor, TaskHealth};
///
/// let health_monitor = HealthMonitor::new();
/// let health_monitor_clone = health_monitor.clone();
/// tokio::spawn(async move {
///     loop {
///         health_monitor_clone.check_system().await;
///     }
/// });
/// tokio::spawn(async move {
///     health_monitor.set_task_health(
///         "some_task".to_string(),
///         TaskHealth::new("starting".to_string()),
///     );
/// });
/// ```
#[derive(Clone)]
pub struct HealthMonitor {
    /// Sending half of the watch channel, shared between clones of the monitor.
    sender: Arc<watch::Sender<Health>>,
    /// Receiving half of the watch channel; cloned for every caller of `monitor`.
    receiver: watch::Receiver<Health>,
}

impl HealthMonitor {
    /// Creates a monitor whose shared state starts as [`Health::default`].
    pub fn new() -> Self {
        let (sender, receiver) = watch::channel(Health::default());
        Self {
            sender: Arc::new(sender),
            receiver,
        }
    }

    /// Samples memory, CPU and disk usage and publishes the resulting [`SystemStatus`].
    ///
    /// The sampling runs through `tokio::task::spawn_blocking` and locks the shared `SYSTEM`
    /// instance. Watchers are notified only when the computed status differs from the stored
    /// one. The computed status is always `Normal` or `Critical`, so it replaces whatever
    /// status was stored before.
    ///
    /// A resource whose total size is reported as zero is skipped instead of dividing by zero,
    /// and at most one [`CriticalReason::HighDiskUsage`] is reported regardless of how many
    /// disks are low on space.
    ///
    /// # Panics
    /// Panics if the blocking task panics, e.g. because the `SYSTEM` mutex is poisoned.
    // TODO: Remove async from here (so it can be consistent)
    //       Move system checking task into its own thing
    pub async fn check_system(&self) {
        use sysinfo::{CpuExt, DiskExt, SystemExt};

        let status = tokio::task::spawn_blocking(|| {
            // TODO: For testability, dependency inject this System struct in this function.
            let mut sys = SYSTEM.lock().unwrap();

            // Refresh system resources usage
            sys.refresh_specifics(
                sysinfo::RefreshKind::new()
                    .with_memory()
                    .with_disks()
                    .with_cpu(sysinfo::CpuRefreshKind::new().with_cpu_usage()),
            );

            let mut reasons = Vec::new();

            // Check for critical memory usage
            let memory_usage = Self::percentage(sys.used_memory(), sys.total_memory());
            if memory_usage.is_some_and(|usage| usage > MEMORY_USAGE_CRITICAL_THRESHOLD) {
                reasons.push(CriticalReason::HighMemoryUsage);
            }

            // Check for critical CPU usage
            let cpu_usage = sys.global_cpu_info().cpu_usage();
            if cpu_usage > CPU_USAGE_CRITICAL_THRESHOLD as f32 {
                reasons.push(CriticalReason::HighCpuUsage);
            }

            // Check for any disk usage that is critical. Disks reporting a total space of zero
            // yield `None` and are ignored.
            let any_disk_critical = sys.disks().iter().any(|disk| {
                Self::percentage(disk.available_space(), disk.total_space())
                    .is_some_and(|available| available < 100 - DISK_USAGE_CRITICAL_THRESHOLD)
            });
            if any_disk_critical {
                reasons.push(CriticalReason::HighDiskUsage);
            }

            if reasons.is_empty() {
                SystemStatus::Normal
            } else {
                SystemStatus::Critical(reasons)
            }
        })
        .await
        .expect("system checking task panicked - possibly due to panicked mutex lock");

        self.sender.send_if_modified(|Health { system, .. }| {
            if system.status == status {
                return false;
            }

            system.status = status;
            true
        });
    }

    /// Stores `health` under `task_name`, replacing any previous entry for that task.
    pub fn set_task_health(&self, task_name: String, health: TaskHealth) {
        // Always send a notification in this case since it is an explicit action.
        self.sender.send_modify(|Health { tasks, .. }| {
            tasks.insert(task_name, health);
        });
    }

    /// Removes the entry for `task_name`; watchers are notified only if an entry existed.
    pub fn clear_task(&self, task_name: &str) {
        self.sender
            .send_if_modified(|Health { tasks, .. }| tasks.remove(task_name).is_some());
    }

    /// Returns a receiver for the shared health state, cloned from the monitor's own receiver.
    pub fn monitor(&self) -> watch::Receiver<Health> {
        self.receiver.clone()
    }

    /// Returns `part` as a whole-number percentage of `whole`, or `None` when `whole` is zero.
    fn percentage(part: u64, whole: u64) -> Option<u64> {
        part.saturating_mul(100).checked_div(whole)
    }
}

impl Default for HealthMonitor {
    fn default() -> Self {
        HealthMonitor::new()
    }
}

impl std::fmt::Display for SystemStatus {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            SystemStatus::Normal => write!(f, "normal"),
            SystemStatus::OutOfDate => write!(f, "out of date"),
            SystemStatus::Updating => write!(f, "updating"),
            SystemStatus::Critical(_) => write!(f, "critical"),
        }
    }
}

impl std::fmt::Display for CriticalReason {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            CriticalReason::HighMemoryUsage => write!(f, "high memory usage"),
            CriticalReason::HighCpuUsage => write!(f, "high cpu usage"),
            CriticalReason::HighDiskUsage => write!(f, "high disk usage"),
        }
    }
}

impl std::fmt::Display for TaskStatus {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            TaskStatus::Normal => write!(f, "normal"),
            TaskStatus::Warning => write!(f, "warning"),
            TaskStatus::Error => write!(f, "error"),
            TaskStatus::Completed => write!(f, "completed"),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Registers a task, checks that a watcher observes it, then clears it again.
    #[test]
    fn test_task_monitor() {
        let monitor = HealthMonitor::new();
        let rx = monitor.monitor();

        // Nothing has been published yet, so the receiver must not report a change.
        assert!(matches!(rx.has_changed(), Ok(false)));

        monitor.set_task_health(
            "some_task".to_string(),
            TaskHealth::new("this is normal".to_owned()),
        );

        assert!(matches!(rx.has_changed(), Ok(true)));

        // Keep the borrow in its own scope so it is released before the next send.
        {
            let snapshot = rx.borrow();
            let task = snapshot
                .tasks
                .get("some_task")
                .expect("a task should exist");

            assert_eq!(task.status, TaskStatus::Normal);
            assert_eq!(task.progress, 0);
            assert_eq!(task.message, "this is normal");
        }

        monitor.clear_task("some_task");
        assert!(!rx.borrow().tasks.contains_key("some_task"));
    }
}
Add health checking system to agent Adds a health checking endpoint on the GRPC server. This is a stream that changes whenever a health status update occurs. Reviewed-on: https://git.nikos.gg/prymn/prymn/pulls/5 Co-authored-by: Nikos Papadakis <nikos@papadakis.xyz> Co-committed-by: Nikos Papadakis <nikos@papadakis.xyz> 2023-08-12 09:37:01 +00:00			`//! System health module`
			`use std::{collections::HashMap, sync::Arc};`

			`use chrono::{DateTime, Utc};`
			`use tokio::sync::watch;`

			`use super::SYSTEM;`

			`const MEMORY_USAGE_CRITICAL_THRESHOLD: u64 = 90;`
			`const CPU_USAGE_CRITICAL_THRESHOLD: u64 = 90;`
			`const DISK_USAGE_CRITICAL_THRESHOLD: u64 = 90;`

			`#[derive(Clone, PartialEq)]`
			`pub enum CriticalReason {`
			`HighMemoryUsage,`
			`HighCpuUsage,`
			`HighDiskUsage,`
			`}`

			`#[derive(Clone, Default, PartialEq)]`
			`pub enum SystemStatus {`
			`#[default]`
			`Normal,`
			`OutOfDate,`
			`Updating,`
			`Critical(Vec<CriticalReason>),`
			`}`

			`#[derive(Clone, Default)]`
			`pub struct SystemHealth {`
			`pub status: SystemStatus,`
			`}`

			`#[derive(Clone, PartialEq, Debug)]`
			`pub enum TaskStatus {`
			`Normal,`
			`Warning,`
			`Error,`
			`Completed,`
			`}`

			`#[derive(Clone)]`
			`pub struct TaskHealth {`
			`status: TaskStatus,`
			`started_on: DateTime<Utc>,`
			`message: String,`
			`progress: u8,`
			`}`

			`impl TaskHealth {`
			`pub fn new(message: String) -> Self {`
			`let started_on = chrono::Utc::now();`

			`Self {`
			`status: TaskStatus::Normal,`
			`started_on,`
			`message,`
			`progress: 0,`
			`}`
			`}`

			`pub fn set_normal(&mut self, message: String) {`
			`self.status = TaskStatus::Normal;`
			`self.message = message;`
			`}`

			`pub fn set_warning(&mut self, message: String) {`
			`self.status = TaskStatus::Warning;`
			`self.message = message;`
			`}`

			`pub fn set_error(&mut self, message: String) {`
			`self.status = TaskStatus::Error;`
			`self.message = message;`
			`}`

			`pub fn set_completed(mut self, message: String) {`
			`self.status = TaskStatus::Completed;`
			`self.progress = 100;`
			`self.message = message;`
			`}`

			`pub fn set_progress(&mut self, message: String, progress: u8) {`
			`self.progress = progress;`
			`self.message = message;`
			`}`

			`pub fn status(&self) -> &TaskStatus {`
			`&self.status`
			`}`

			`pub fn started_on(&self) -> &DateTime<Utc> {`
			`&self.started_on`
			`}`

			`pub fn message(&self) -> &str {`
			`&self.message`
			`}`

			`pub fn progress(&self) -> u8 {`
			`self.progress`
			`}`
			`}`

			`#[derive(Default, Clone)]`
			`pub struct Health {`
			`system: SystemHealth,`
			`tasks: HashMap<String, TaskHealth>,`
			`}`

			`impl Health {`
			`pub fn system(&self) -> SystemHealth {`
			`self.system.clone()`
			`}`

			`pub fn tasks(self) -> HashMap<String, TaskHealth> {`
			`self.tasks`
			`}`
			`}`

			/// `HealthMonitor` gives access to shared system health state, allowing to watch health and update
			`/// task health status.`
			`///`
			`/// # Usage`
			/// Internally `HealthMonitor` uses [Arc] so it can be cheaply cloned and shared.
			`///`
			/// ```no_run
			`/// use prymn_agent::system::health::{HealthMonitor, TaskHealth};`
			`///`
			`/// let health_monitor = HealthMonitor::new();`
			`/// let health_monitor_clone = health_monitor.clone();`
			`/// tokio::spawn(async move {`
			`/// loop {`
			`/// health_monitor_clone.check_system().await;`
			`/// }`
			`/// });`
			`/// tokio::spawn(async move {`
			`/// health_monitor.set_task_health("some_task".to_string(), TaskHealth::new(None)).await;`
			`/// });`
			/// ```
			`#[derive(Clone)]`
			`pub struct HealthMonitor {`
			`sender: Arc<watch::Sender<Health>>,`
			`receiver: watch::Receiver<Health>,`
			`}`

			`impl HealthMonitor {`
			`pub fn new() -> Self {`
			`let (sender, receiver) = watch::channel(Health::default());`
			`Self {`
			`sender: Arc::new(sender),`
			`receiver,`
			`}`
			`}`

			`// TODO: Remove async from here (so it can be consistent)`
			`// Move system checking task into it's own thing`
			`pub async fn check_system(&self) {`
			`use sysinfo::{CpuExt, DiskExt, SystemExt};`

			`let status = tokio::task::spawn_blocking(\|\| {`
			`let mut status = SystemStatus::Normal;`

			`// TODO: For testability, dependency inject this System struct in this function.`
			`let mut sys = SYSTEM.lock().unwrap();`

			`// Refresh system resources usage`
			`sys.refresh_specifics(`
			`sysinfo::RefreshKind::new()`
			`.with_memory()`
			`.with_disks()`
			`.with_cpu(sysinfo::CpuRefreshKind::new().with_cpu_usage()),`
			`);`

			`let mut statuses = vec![];`

			`// Check for critical memory usage`
			`let memory_usage = sys.used_memory() * 100 / sys.total_memory();`
			`if memory_usage > MEMORY_USAGE_CRITICAL_THRESHOLD {`
			`statuses.push(CriticalReason::HighMemoryUsage);`
			`}`

			`// Check for critical CPU usage`
			`let cpu_usage = sys.global_cpu_info().cpu_usage();`
			`if cpu_usage > CPU_USAGE_CRITICAL_THRESHOLD as f32 {`
			`statuses.push(CriticalReason::HighCpuUsage);`
			`}`

			`// Check for any disk usage that is critical`
			`for disk in sys.disks() {`
			`let available_disk = disk.available_space() * 100 / disk.total_space();`
			`if available_disk < 100 - DISK_USAGE_CRITICAL_THRESHOLD {`
			`statuses.push(CriticalReason::HighDiskUsage);`
			`}`
			`}`

			`if !statuses.is_empty() {`
			`status = SystemStatus::Critical(statuses);`
			`}`

			`status`
			`})`
			`.await`
			`.expect("system checking task panicked - possibly due to panicked mutex lock");`

			`self.sender.send_if_modified(\|Health { system, .. }\| {`
			`if system.status == status {`
			`return false;`
			`}`

			`system.status = status;`
			`true`
			`});`
			`}`

			`pub fn set_task_health(&self, task_name: String, health: TaskHealth) {`
			`// Always send a notification in this case since it is an explicit action.`
			`self.sender.send_modify(\|Health { tasks, .. }\| {`
			`tasks.insert(task_name, health);`
			`});`
			`}`

			`pub fn clear_task(&self, task_name: &str) {`
			`self.sender`
			`.send_if_modified(\|Health { tasks, .. }\| tasks.remove(task_name).is_some());`
			`}`

			`pub fn monitor(&self) -> watch::Receiver<Health> {`
			`self.receiver.clone()`
			`}`
			`}`

			`impl Default for HealthMonitor {`
			`fn default() -> Self {`
			`HealthMonitor::new()`
			`}`
			`}`

			`impl std::fmt::Display for SystemStatus {`
			`fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {`
			`match self {`
			`SystemStatus::Normal => write!(f, "normal"),`
			`SystemStatus::OutOfDate => write!(f, "out of date"),`
			`SystemStatus::Updating => write!(f, "updating"),`
			`SystemStatus::Critical(_) => write!(f, "critical"),`
			`}`
			`}`
			`}`

			`impl std::fmt::Display for CriticalReason {`
			`fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {`
			`match self {`
			`CriticalReason::HighMemoryUsage => write!(f, "high memory usage"),`
			`CriticalReason::HighCpuUsage => write!(f, "high cpu usage"),`
			`CriticalReason::HighDiskUsage => write!(f, "high disk usage"),`
			`}`
			`}`
			`}`

			`impl std::fmt::Display for TaskStatus {`
			`fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {`
			`match self {`
			`TaskStatus::Normal => write!(f, "normal"),`
			`TaskStatus::Warning => write!(f, "warning"),`
			`TaskStatus::Error => write!(f, "error"),`
			`TaskStatus::Completed => write!(f, "completed"),`
			`}`
			`}`
			`}`

			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`

			`#[test]`
			`fn test_task_monitor() {`
			`let health_monitor = HealthMonitor::new();`
			`let receiver = health_monitor.monitor();`

			`assert!(receiver.has_changed().is_ok_and(\|changed\| !changed));`

			`let health = TaskHealth::new("this is normal".to_owned());`
			`health_monitor.set_task_health("some_task".to_string(), health);`

			`assert!(receiver.has_changed().is_ok_and(\|changed\| changed));`

			`{`
			`let health = receiver.borrow();`
			`let task_health = health.tasks.get("some_task").expect("a task should exist");`

			`assert_eq!(task_health.status, TaskStatus::Normal);`
			`assert_eq!(task_health.progress, 0);`
			`assert_eq!(task_health.message, "this is normal");`
			`}`

			`health_monitor.clear_task("some_task");`
			`assert!(!receiver.borrow().tasks.contains_key("some_task"));`
			`}`
			`}`