300 lines
8.1 KiB
Rust
300 lines
8.1 KiB
Rust
|
//! System health module
|
||
|
use std::{collections::HashMap, sync::Arc};
|
||
|
|
||
|
use chrono::{DateTime, Utc};
|
||
|
use tokio::sync::watch;
|
||
|
|
||
|
use super::SYSTEM;
|
||
|
|
||
|
/// Used-memory percentage above which `check_system` reports
/// `CriticalReason::HighMemoryUsage`.
const MEMORY_USAGE_CRITICAL_THRESHOLD: u64 = 90;

/// Global CPU usage percentage above which `check_system` reports
/// `CriticalReason::HighCpuUsage` (cast to `f32` at the comparison site).
const CPU_USAGE_CRITICAL_THRESHOLD: u64 = 90;

/// Disk usage percentage treated as critical; `check_system` flags a disk once its available
/// space drops below `100 - DISK_USAGE_CRITICAL_THRESHOLD` percent.
const DISK_USAGE_CRITICAL_THRESHOLD: u64 = 90;
|
||
|
|
||
|
/// A reason for the system being reported as [`SystemStatus::Critical`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CriticalReason {
    /// Used memory is above the memory threshold.
    HighMemoryUsage,
    /// Global CPU usage is above the CPU threshold.
    HighCpuUsage,
    /// A disk has less available space than the disk threshold allows.
    HighDiskUsage,
}

/// Overall status of the system.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub enum SystemStatus {
    /// Nothing to report; this is the default status.
    #[default]
    Normal,
    /// NOTE(review): never assigned in this module - presumably set by an updater elsewhere.
    OutOfDate,
    /// NOTE(review): never assigned in this module - presumably set by an updater elsewhere.
    Updating,
    /// At least one resource check failed; carries every reason that was detected.
    Critical(Vec<CriticalReason>),
}

/// System-wide part of the shared health state.
#[derive(Debug, Clone, Default)]
pub struct SystemHealth {
    /// Most recently recorded system status.
    pub status: SystemStatus,
}
|
||
|
|
||
|
/// Status of a single tracked task.
#[derive(Debug, Clone, PartialEq)]
pub enum TaskStatus {
    /// The task is running without problems; initial status of a new `TaskHealth`.
    Normal,
    /// The task reported a warning.
    Warning,
    /// The task reported an error.
    Error,
    /// The task has finished.
    Completed,
}
|
||
|
|
||
|
#[derive(Clone)]
|
||
|
pub struct TaskHealth {
|
||
|
status: TaskStatus,
|
||
|
started_on: DateTime<Utc>,
|
||
|
message: String,
|
||
|
progress: u8,
|
||
|
}
|
||
|
|
||
|
impl TaskHealth {
|
||
|
pub fn new(message: String) -> Self {
|
||
|
let started_on = chrono::Utc::now();
|
||
|
|
||
|
Self {
|
||
|
status: TaskStatus::Normal,
|
||
|
started_on,
|
||
|
message,
|
||
|
progress: 0,
|
||
|
}
|
||
|
}
|
||
|
|
||
|
pub fn set_normal(&mut self, message: String) {
|
||
|
self.status = TaskStatus::Normal;
|
||
|
self.message = message;
|
||
|
}
|
||
|
|
||
|
pub fn set_warning(&mut self, message: String) {
|
||
|
self.status = TaskStatus::Warning;
|
||
|
self.message = message;
|
||
|
}
|
||
|
|
||
|
pub fn set_error(&mut self, message: String) {
|
||
|
self.status = TaskStatus::Error;
|
||
|
self.message = message;
|
||
|
}
|
||
|
|
||
|
pub fn set_completed(mut self, message: String) {
|
||
|
self.status = TaskStatus::Completed;
|
||
|
self.progress = 100;
|
||
|
self.message = message;
|
||
|
}
|
||
|
|
||
|
pub fn set_progress(&mut self, message: String, progress: u8) {
|
||
|
self.progress = progress;
|
||
|
self.message = message;
|
||
|
}
|
||
|
|
||
|
pub fn status(&self) -> &TaskStatus {
|
||
|
&self.status
|
||
|
}
|
||
|
|
||
|
pub fn started_on(&self) -> &DateTime<Utc> {
|
||
|
&self.started_on
|
||
|
}
|
||
|
|
||
|
pub fn message(&self) -> &str {
|
||
|
&self.message
|
||
|
}
|
||
|
|
||
|
pub fn progress(&self) -> u8 {
|
||
|
self.progress
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#[derive(Default, Clone)]
|
||
|
pub struct Health {
|
||
|
system: SystemHealth,
|
||
|
tasks: HashMap<String, TaskHealth>,
|
||
|
}
|
||
|
|
||
|
impl Health {
|
||
|
pub fn system(&self) -> SystemHealth {
|
||
|
self.system.clone()
|
||
|
}
|
||
|
|
||
|
pub fn tasks(self) -> HashMap<String, TaskHealth> {
|
||
|
self.tasks
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/// `HealthMonitor` gives access to shared system health state, allowing to watch health and update
|
||
|
/// task health status.
|
||
|
///
|
||
|
/// # Usage
|
||
|
/// Internally `HealthMonitor` uses [Arc] so it can be cheaply cloned and shared.
|
||
|
///
|
||
|
/// ```no_run
|
||
|
/// use prymn_agent::system::health::{HealthMonitor, TaskHealth};
|
||
|
///
|
||
|
/// let health_monitor = HealthMonitor::new();
|
||
|
/// let health_monitor_clone = health_monitor.clone();
|
||
|
/// tokio::spawn(async move {
|
||
|
/// loop {
|
||
|
/// health_monitor_clone.check_system().await;
|
||
|
/// }
|
||
|
/// });
|
||
|
/// tokio::spawn(async move {
|
||
|
/// health_monitor.set_task_health("some_task".to_string(), TaskHealth::new(None)).await;
|
||
|
/// });
|
||
|
/// ```
|
||
|
#[derive(Clone)]
|
||
|
pub struct HealthMonitor {
|
||
|
sender: Arc<watch::Sender<Health>>,
|
||
|
receiver: watch::Receiver<Health>,
|
||
|
}
|
||
|
|
||
|
impl HealthMonitor {
|
||
|
pub fn new() -> Self {
|
||
|
let (sender, receiver) = watch::channel(Health::default());
|
||
|
Self {
|
||
|
sender: Arc::new(sender),
|
||
|
receiver,
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// TODO: Remove async from here (so it can be consistent)
|
||
|
// Move system checking task into it's own thing
|
||
|
pub async fn check_system(&self) {
|
||
|
use sysinfo::{CpuExt, DiskExt, SystemExt};
|
||
|
|
||
|
let status = tokio::task::spawn_blocking(|| {
|
||
|
let mut status = SystemStatus::Normal;
|
||
|
|
||
|
// TODO: For testability, dependency inject this System struct in this function.
|
||
|
let mut sys = SYSTEM.lock().unwrap();
|
||
|
|
||
|
// Refresh system resources usage
|
||
|
sys.refresh_specifics(
|
||
|
sysinfo::RefreshKind::new()
|
||
|
.with_memory()
|
||
|
.with_disks()
|
||
|
.with_cpu(sysinfo::CpuRefreshKind::new().with_cpu_usage()),
|
||
|
);
|
||
|
|
||
|
let mut statuses = vec![];
|
||
|
|
||
|
// Check for critical memory usage
|
||
|
let memory_usage = sys.used_memory() * 100 / sys.total_memory();
|
||
|
if memory_usage > MEMORY_USAGE_CRITICAL_THRESHOLD {
|
||
|
statuses.push(CriticalReason::HighMemoryUsage);
|
||
|
}
|
||
|
|
||
|
// Check for critical CPU usage
|
||
|
let cpu_usage = sys.global_cpu_info().cpu_usage();
|
||
|
if cpu_usage > CPU_USAGE_CRITICAL_THRESHOLD as f32 {
|
||
|
statuses.push(CriticalReason::HighCpuUsage);
|
||
|
}
|
||
|
|
||
|
// Check for any disk usage that is critical
|
||
|
for disk in sys.disks() {
|
||
|
let available_disk = disk.available_space() * 100 / disk.total_space();
|
||
|
if available_disk < 100 - DISK_USAGE_CRITICAL_THRESHOLD {
|
||
|
statuses.push(CriticalReason::HighDiskUsage);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if !statuses.is_empty() {
|
||
|
status = SystemStatus::Critical(statuses);
|
||
|
}
|
||
|
|
||
|
status
|
||
|
})
|
||
|
.await
|
||
|
.expect("system checking task panicked - possibly due to panicked mutex lock");
|
||
|
|
||
|
self.sender.send_if_modified(|Health { system, .. }| {
|
||
|
if system.status == status {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
system.status = status;
|
||
|
true
|
||
|
});
|
||
|
}
|
||
|
|
||
|
pub fn set_task_health(&self, task_name: String, health: TaskHealth) {
|
||
|
// Always send a notification in this case since it is an explicit action.
|
||
|
self.sender.send_modify(|Health { tasks, .. }| {
|
||
|
tasks.insert(task_name, health);
|
||
|
});
|
||
|
}
|
||
|
|
||
|
pub fn clear_task(&self, task_name: &str) {
|
||
|
self.sender
|
||
|
.send_if_modified(|Health { tasks, .. }| tasks.remove(task_name).is_some());
|
||
|
}
|
||
|
|
||
|
pub fn monitor(&self) -> watch::Receiver<Health> {
|
||
|
self.receiver.clone()
|
||
|
}
|
||
|
}
|
||
|
|
||
|
impl Default for HealthMonitor {
|
||
|
fn default() -> Self {
|
||
|
HealthMonitor::new()
|
||
|
}
|
||
|
}
|
||
|
|
||
|
impl std::fmt::Display for SystemStatus {
|
||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||
|
match self {
|
||
|
SystemStatus::Normal => write!(f, "normal"),
|
||
|
SystemStatus::OutOfDate => write!(f, "out of date"),
|
||
|
SystemStatus::Updating => write!(f, "updating"),
|
||
|
SystemStatus::Critical(_) => write!(f, "critical"),
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
impl std::fmt::Display for CriticalReason {
|
||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||
|
match self {
|
||
|
CriticalReason::HighMemoryUsage => write!(f, "high memory usage"),
|
||
|
CriticalReason::HighCpuUsage => write!(f, "high cpu usage"),
|
||
|
CriticalReason::HighDiskUsage => write!(f, "high disk usage"),
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
impl std::fmt::Display for TaskStatus {
|
||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||
|
match self {
|
||
|
TaskStatus::Normal => write!(f, "normal"),
|
||
|
TaskStatus::Warning => write!(f, "warning"),
|
||
|
TaskStatus::Error => write!(f, "error"),
|
||
|
TaskStatus::Completed => write!(f, "completed"),
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Setting a task's health must notify watchers and be readable through the receiver;
    /// clearing the task must remove it again.
    #[test]
    fn test_task_monitor() {
        let monitor = HealthMonitor::new();
        let rx = monitor.monitor();

        // No change is reported before anything is published.
        assert_eq!(rx.has_changed().ok(), Some(false));

        monitor.set_task_health(
            "some_task".to_string(),
            TaskHealth::new("this is normal".to_owned()),
        );

        assert_eq!(rx.has_changed().ok(), Some(true));

        {
            // Scope the borrow so it is released before the next update is sent.
            let snapshot = rx.borrow();
            let task = snapshot
                .tasks
                .get("some_task")
                .expect("a task should exist");

            assert_eq!(task.status, TaskStatus::Normal);
            assert_eq!(task.progress, 0);
            assert_eq!(task.message, "this is normal");
        }

        monitor.clear_task("some_task");
        assert!(!rx.borrow().tasks.contains_key("some_task"));
    }
}
|