Comprehensive health monitoring orchestrator with parallel system assessment and automated alerting
<codex_skill_adapter>
Map the $ops-infra-health-check argument to {{SC_ARGS}}; if no argument is supplied, treat {{SC_ARGS}} as empty. Translate spawn_agent(...) patterns to Codex spawn_agent(...), track steps with update_plan, and consult config.toml when the original command mentions MCP. Original command: ops:infra:health-check.
</codex_skill_adapter>
Live System Assessment:
STEP 1: Initialize health monitoring session and analyze system architecture
TRY:
# Initialize session state
echo '{
"sessionId": "'$SESSION_ID'",
"targetSystem": "'{{SC_ARGS}}'",
"timestamp": "'$(gdate -Iseconds 2>/dev/null || date -Iseconds)'",
"healthChecks": [],
"systemComponents": [],
"monitoringTools": []
}' > /tmp/health-session-$SESSION_ID.json
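The session file above is meant to accumulate results as checks complete. A minimal sketch of appending one result with jq (assumed available, as it is used elsewhere in this command); the check name "system_disk" is hypothetical, for illustration only:

```shell
# Append a completed check result to the session state file.
SESSION_ID=${SESSION_ID:-demo}
SESSION_FILE="/tmp/health-session-$SESSION_ID.json"
# Seed the file if the initialization step has not run yet
[ -f "$SESSION_FILE" ] || echo '{"healthChecks":[]}' > "$SESSION_FILE"
if command -v jq >/dev/null 2>&1; then
  jq --arg name "system_disk" --arg status "pass" \
    '.healthChecks += [{"name": $name, "status": $status, "at": (now | todate)}]' \
    "$SESSION_FILE" > "$SESSION_FILE.tmp" && mv "$SESSION_FILE.tmp" "$SESSION_FILE"
fi
cat "$SESSION_FILE"
```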
STEP 2: Comprehensive system discovery using parallel analysis
IF system_complexity == "distributed" OR target_system contains "kubernetes|docker|microservices":
CRITICAL: Deploy parallel sub-agents for comprehensive health assessment (substantially faster than sequential analysis)
IMMEDIATELY launch 8 specialized health monitoring agents:
Agent 1: Infrastructure Health Analysis: Analyze Kubernetes cluster, nodes, networking, and orchestration
Agent 2: Container Ecosystem Health: Assess Docker containers, images, volumes, and container orchestration
Agent 3: Application Service Health: Evaluate application-level health endpoints, APIs, and service dependencies
Agent 4: System Resource Monitoring: Monitor system-level resources, performance, and capacity planning
Agent 5: Configuration & Alerting Analysis: Review monitoring and alerting setup, configuration drift
Agent 6: Security & Compliance Health: Assess security posture, compliance status, and vulnerability management
Agent 7: Performance & Scalability Analysis: Analyze performance metrics, scalability patterns, and optimization opportunities
Agent 8: Operational Readiness Assessment: Evaluate operational procedures, backup systems, and disaster recovery
Sub-Agent Coordination:
All agents write results under /tmp/health-monitoring-agents-$SESSION_ID/ for aggregation.
ELSE:
Execute focused health check for simple systems:
STEP 3: Configuration discovery and analysis
TRY:
# Discover system configuration files
echo "📁 Discovering system configuration..."
fd "(docker-compose|k8s|deployment|health)" --type f -d 3 | head -10
# Find existing health check implementations
echo "🔍 Locating health check implementations..."
rg "health|status|ping|ready|live" --type-add 'config:*.{yaml,yml,json,toml}' --type config -l | head -5
rg "(/health|/ping|/status|healthcheck)" --type-add 'code:*.{ts,js,go,rs,java,py}' --type code -l | head -5
# Check for monitoring tools
echo "📊 Checking monitoring infrastructure..."
fd "(prometheus|grafana|datadog|newrelic|monitoring)" --type f -d 2 | head -5
CATCH (discovery_failed):
echo "⚠️ fd/rg unavailable - falling back to POSIX find/grep"
find . -maxdepth 3 \( -name "docker-compose*" -o -name "*deployment*" -o -name "*health*" \) -type f 2>/dev/null | head -10
STEP 4: Health check implementation strategy
CASE system_type:
WHEN "kubernetes":
Kubernetes-Native Health Monitoring:
# Comprehensive cluster health assessment
echo "🎯 Kubernetes cluster health analysis:"
# Node health and resource availability
kubectl get nodes -o json | jq -r '.items[] | "Node: \(.metadata.name) | Ready: \(.status.conditions[] | select(.type == "Ready") | .status) | Resources: CPU=\(.status.allocatable.cpu) Memory=\(.status.allocatable.memory)"'
# Pod health across all namespaces
kubectl get pods --all-namespaces -o json | jq -r '.items[] | select(.status.phase != "Running") | "⚠️ \(.metadata.namespace)/\(.metadata.name): \(.status.phase)"' | head -10
# Service endpoints and connectivity
kubectl get services --all-namespaces -o json | jq -r '.items[] | select(.spec.type == "LoadBalancer" or .spec.type == "NodePort") | "Service: \(.metadata.namespace)/\(.metadata.name) | Type: \(.spec.type)"'
# Resource usage and limits
kubectl top nodes 2>/dev/null || echo "Metrics server not available"
kubectl top pods --all-namespaces 2>/dev/null | head -10 || echo "Pod metrics not available"
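Beyond node and pod status, recent Warning events often pinpoint the failing component directly. A hedged sketch that degrades gracefully when kubectl or a cluster is unavailable:

```shell
# Surface recent Warning events across all namespaces; fall back cleanly
# when kubectl is absent or no cluster is reachable.
if command -v kubectl >/dev/null 2>&1; then
  events=$(kubectl get events --all-namespaces --field-selector type=Warning \
    --sort-by=.lastTimestamp 2>/dev/null | tail -10)
  events=${events:-"No recent Warning events (or cluster unreachable)"}
else
  events="kubectl not available - skipping event triage"
fi
echo "$events"
```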
WHEN "docker":
Docker Container Health Assessment:
echo "🐳 Docker container health analysis:"
# Container status and health
docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | head -15
# Container resource usage
docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.NetIO}}" | head -10
# Health check configurations
docker inspect $(docker ps -q) 2>/dev/null | jq -r '.[] | select(.Config.Healthcheck) | "✓ \(.Name | ltrimstr("/")): \(.Config.Healthcheck.Test[1:] | join(" "))"' | head -5
# Volume and network status
docker volume ls -q | wc -l | awk '{print "📂 Volumes: " $1}'
docker network ls -q | wc -l | awk '{print "🌐 Networks: " $1}'
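Docker itself flags the two most actionable failure modes: containers whose health check is failing and containers stuck in a restart loop. A sketch that surfaces both and degrades gracefully without a daemon:

```shell
# Triage containers Docker has already flagged: failed health checks and
# restart loops. Falls back when no Docker daemon is available.
if command -v docker >/dev/null 2>&1; then
  unhealthy=$(docker ps --filter health=unhealthy --format '{{.Names}}' 2>/dev/null)
  restarting=$(docker ps --filter status=restarting --format '{{.Names}}' 2>/dev/null)
  summary="Unhealthy: ${unhealthy:-none} | Restarting: ${restarting:-none}"
else
  summary="docker not available - skipping container triage"
fi
echo "$summary"
```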
WHEN "application":
Application-Level Health Monitoring:
echo "🚀 Application health endpoint analysis:"
# Test primary health endpoints
for port in 8080 3000 8000 8090 9000; do
echo "Testing port $port..."
curl -sf --connect-timeout 2 "http://localhost:$port/health" 2>/dev/null || echo "Port $port: No response"
done
# Discover application configurations
echo "📋 Application configuration discovery:"
fd "(package\.json|deno\.json|Cargo\.toml|pom\.xml|go\.mod)" --type f | head -5
# Find health check implementations in code
echo "💻 Code-level health check implementations:"
rg "(healthcheck|health.*endpoint|/health|/ping)" --type-add 'src:*.{ts,js,go,rs,java,py}' --type src -A 2 -B 1 | head -10
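Raw response bodies can be misleading; HTTP status codes give a cleaner pass/fail signal. A sketch over a subset of the same example ports:

```shell
# Probe health endpoints and report status codes; curl's %{http_code}
# prints 000 when the connection itself fails.
for port in 8080 3000 8000; do
  code=$(curl -s -o /dev/null -w '%{http_code}' --connect-timeout 2 \
    "http://localhost:$port/health" 2>/dev/null || true)
  echo "port $port -> HTTP ${code:-000}"
done
```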
STEP 5: Real-time monitoring dashboard generation
TRY:
Generate live health monitoring dashboard:
# Create monitoring session directory
mkdir -p /tmp/health-monitor-$SESSION_ID
# Generate HTML health dashboard (unquoted heredoc so $(date) expands)
cat > /tmp/health-monitor-$SESSION_ID/dashboard.html << EOF
<!DOCTYPE html>
<html>
<head>
<title>System Health Dashboard</title>
<meta http-equiv="refresh" content="30">
<style>
body { font-family: Arial, sans-serif; margin: 20px; background: #f5f5f5; }
.health-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px; }
.health-card { background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
.status-healthy { border-left: 4px solid #28a745; }
.status-warning { border-left: 4px solid #ffc107; }
.status-critical { border-left: 4px solid #dc3545; }
.metric { display: flex; justify-content: space-between; margin: 8px 0; }
.timestamp { color: #666; font-size: 12px; }
</style>
</head>
<body>
<h1>🏥 System Health Dashboard</h1>
<div class="timestamp">Last updated: $(date)</div>
<div class="health-grid">
<!-- Health cards will be populated by monitoring script -->
</div>
<script>
// Auto-refresh functionality
setTimeout(() => location.reload(), 30000);
</script>
</body>
</html>
EOF
echo "📊 Health dashboard created: /tmp/health-monitor-$SESSION_ID/dashboard.html"
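The static dashboard needs data to render its health cards. A companion machine-readable status file it can fetch, regenerated each monitoring cycle; the file name status.json is an assumption of this sketch:

```shell
# Write a machine-readable status file next to the dashboard; a refresh
# loop (or cron) can regenerate it each cycle.
SESSION_ID=${SESSION_ID:-demo}
MON_DIR="/tmp/health-monitor-$SESSION_ID"
mkdir -p "$MON_DIR"
printf '{"generatedAt":"%s","checks":[]}\n' \
  "$(gdate -Iseconds 2>/dev/null || date -Iseconds)" > "$MON_DIR/status.json"
cat "$MON_DIR/status.json"
```

To view locally, serve the directory (e.g. python3 -m http.server 8099 --directory "$MON_DIR") and open http://localhost:8099/dashboard.html.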
STEP 6: Automated health check implementation
Health Check Implementation Templates:
CASE target_language:
WHEN "typescript|javascript":
TypeScript/JavaScript Health Service:
// Enhanced health-check service with monitoring integration
interface HealthStatus {
status: "healthy" | "degraded" | "unhealthy";
timestamp: string;
version: string;
uptime: number;
sessionId: string;
environment: string;
checks: Record<string, CheckResult>;
metrics: SystemMetrics;
}
interface CheckResult {
status: "pass" | "fail" | "warn";
message?: string;
responseTime?: number;
details?: Record<string, unknown>;
lastCheck?: string;
checkCount?: number;
}
interface SystemMetrics {
cpuUsage?: number;
memoryUsage?: number;
diskUsage?: number;
activeConnections?: number;
requestRate?: number;
}
class AdvancedHealthCheckService {
private startTime = Date.now();
private checks = new Map<string, () => Promise<CheckResult>>();
private checkHistory = new Map<string, CheckResult[]>();
private sessionId: string;
constructor(sessionId: string) {
this.sessionId = sessionId;
this.initializeDefaultChecks();
}
private initializeDefaultChecks() {
// System resource checks
this.register("system_memory", async () => {
const memInfo = await this.getMemoryInfo();
const usage = memInfo.used / memInfo.total;
return {
status: usage > 0.9 ? "fail" : usage > 0.75 ? "warn" : "pass",
details: { usage: Math.round(usage * 100), used: memInfo.used, total: memInfo.total },
message: usage > 0.9 ? "Critical memory usage" : undefined,
};
});
this.register("system_disk", async () => {
const diskInfo = await this.getDiskInfo();
const usage = diskInfo.used / diskInfo.total;
return {
status: usage > 0.95 ? "fail" : usage > 0.85 ? "warn" : "pass",
details: { usage: Math.round(usage * 100), available: diskInfo.available },
message: usage > 0.95 ? "Critical disk space" : undefined,
};
});
}
register(name: string, check: () => Promise<CheckResult>) {
this.checks.set(name, check);
this.checkHistory.set(name, []);
}
async getHealth(): Promise<HealthStatus> {
const results: Record<string, CheckResult> = {};
let overallStatus: "healthy" | "degraded" | "unhealthy" = "healthy";
// Execute all checks in parallel with timeout
const checkPromises = Array.from(this.checks.entries()).map(async ([name, check]) => {
const start = Date.now();
try {
const timeoutPromise = new Promise<CheckResult>((_, reject) =>
setTimeout(() => reject(new Error("Check timeout")), 5000)
);
const result = await Promise.race([check(), timeoutPromise]);
result.responseTime = Date.now() - start;
result.lastCheck = new Date().toISOString();
// Update check history
const history = this.checkHistory.get(name) || [];
history.push(result);
if (history.length > 10) history.shift(); // Keep last 10 results
this.checkHistory.set(name, history);
result.checkCount = history.length;
return [name, result] as [string, CheckResult];
} catch (error) {
const failResult: CheckResult = {
status: "fail",
message: error instanceof Error ? error.message : String(error),
responseTime: Date.now() - start,
lastCheck: new Date().toISOString(),
};
return [name, failResult] as [string, CheckResult];
}
});
const checkResults = await Promise.allSettled(checkPromises);
// Process results and determine overall status
for (const promiseResult of checkResults) {
if (promiseResult.status === "fulfilled") {
const [name, result] = promiseResult.value;
results[name] = result;
if (result.status === "fail") {
overallStatus = "unhealthy";
} else if (result.status === "warn" && overallStatus === "healthy") {
overallStatus = "degraded";
}
}
}
return {
status: overallStatus,
timestamp: new Date().toISOString(),
version: Deno.env.get("APP_VERSION") || "unknown",
uptime: Date.now() - this.startTime,
sessionId: this.sessionId,
environment: Deno.env.get("NODE_ENV") || "development",
checks: results,
metrics: await this.getSystemMetrics(),
};
}
private async getMemoryInfo() {
// Platform-specific memory info implementation
try {
const proc = new Deno.Command("free", { args: ["-b"] });
const output = await proc.output();
const text = new TextDecoder().decode(output.stdout);
const memLine = text.split("\n")[1];
const [, total, used] = memLine.split(/\s+/).map(Number);
return { total, used };
} catch {
return { total: 0, used: 0 };
}
}
private async getDiskInfo() {
try {
// df -k is portable (df -b is not); values are 1K blocks, converted to bytes
const proc = new Deno.Command("df", { args: ["-k", "/"] });
const output = await proc.output();
const text = new TextDecoder().decode(output.stdout);
const diskLine = text.split("\n")[1];
const [, total, used, available] = diskLine.split(/\s+/).map(Number);
return { total: total * 1024, used: used * 1024, available: available * 1024 };
} catch {
return { total: 0, used: 0, available: 0 };
}
}
private async getSystemMetrics(): Promise<SystemMetrics> {
return {
cpuUsage: await this.getCpuUsage(),
memoryUsage: (await this.getMemoryInfo()).used,
diskUsage: (await this.getDiskInfo()).used,
activeConnections: await this.getActiveConnections(),
};
}
private async getCpuUsage(): Promise<number> {
try {
const proc = new Deno.Command("uptime");
const output = await proc.output();
const text = new TextDecoder().decode(output.stdout);
const loadMatch = text.match(/load average: ([\d.]+)/);
return loadMatch ? parseFloat(loadMatch[1]) : 0;
} catch {
return 0;
}
}
private async getActiveConnections(): Promise<number> {
try {
const proc = new Deno.Command("netstat", { args: ["-an"] });
const output = await proc.output();
const text = new TextDecoder().decode(output.stdout);
return text.split("\n").filter((line) => line.includes("ESTABLISHED")).length;
} catch {
return 0;
}
}
}
// Usage example with session management
const sessionId = Deno.env.get("HEALTH_SESSION_ID") || crypto.randomUUID();
const healthService = new AdvancedHealthCheckService(sessionId);
// Enhanced database check with connection pooling
healthService.register("database", async () => {
try {
const start = Date.now();
// Replace with your actual database connection
// await db.query("SELECT 1");
const latency = Date.now() - start;
// Classify health by example latency thresholds
return {
status: latency < 100 ? "pass" : latency < 500 ? "warn" : "fail",
details: {
latency,
connectionPool: { active: 5, idle: 10, max: 20 },
lastQuery: new Date().toISOString(),
},
message: latency > 500 ? "High database latency detected" : undefined,
};
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
return {
status: "fail",
message: `Database connection failed: ${message}`,
details: { error: error instanceof Error ? error.name : "UnknownError" },
};
}
});
// Enhanced cache check with metrics
healthService.register("cache", async () => {
try {
const start = Date.now();
// Replace with your actual cache connection
// await redis.ping();
const latency = Date.now() - start;
// Simulate cache metrics
const memoryUsage = Math.random() * 2_000_000_000; // Example memory usage
return {
status: memoryUsage < 1_000_000_000 ? "pass" : memoryUsage < 1_500_000_000 ? "warn" : "fail",
details: {
latency,
memory: Math.round(memoryUsage / 1024 / 1024), // MB
hitRate: 0.95,
connections: 42,
},
message: memoryUsage > 1_500_000_000 ? "Cache memory usage critical" : undefined,
};
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
return {
status: "fail",
message: `Cache connection failed: ${message}`,
details: { error: error instanceof Error ? error.name : "UnknownError" },
};
}
});
// External dependency check with circuit breaker pattern
healthService.register("external_api", async () => {
try {
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), 5000);
// Replace with your actual external API
const response = await fetch("https://httpbin.org/status/200", {
signal: controller.signal,
headers: { "User-Agent": "Health-Check/1.0" },
});
clearTimeout(timeoutId);
return {
status: response.ok ? "pass" : response.status < 500 ? "warn" : "fail",
details: {
statusCode: response.status,
responseTime: response.headers.get("x-response-time") || "unknown",
endpoint: "external-api",
},
message: !response.ok ? `API returned ${response.status}` : undefined,
};
} catch (error) {
const isTimeout = error instanceof DOMException && error.name === "AbortError";
return {
status: "fail",
message: isTimeout ? "API timeout" : "API unreachable",
details: { error: error instanceof Error ? error.name : "UnknownError" },
};
}
});
// Export health check endpoint
export async function healthCheckHandler(): Promise<Response> {
const health = await healthService.getHealth();
const status = health.status === "unhealthy" ? 503 : 200;
return new Response(JSON.stringify(health, null, 2), {
status,
headers: {
"Content-Type": "application/json",
"Cache-Control": "no-cache",
"X-Health-Check-Version": "2.0",
},
});
}
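A quick smoke test of the handler above from the shell, assuming the service is listening on port 8080 (adjust to your deployment). Note that curl -f also treats a 503 (unhealthy) response as a failure, which is acceptable for a coarse probe:

```shell
# Probe the /health endpoint and summarize; falls back cleanly when the
# service is not running or the response is not valid JSON.
if curl -sf --connect-timeout 2 "http://localhost:8080/health" \
    -o /tmp/health-probe.json 2>/dev/null; then
  status=$(jq -r '.status' /tmp/health-probe.json 2>/dev/null || echo "unparseable")
else
  status="unreachable"
fi
echo "health endpoint: $status"
```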
WHEN "rust":
Rust Health Check Service:
// Advanced Rust health check implementation
use std::time::{Duration, Instant};
use serde::{Serialize, Deserialize};
use tokio::time::timeout;
use std::collections::HashMap;
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct HealthCheck {
pub status: HealthStatus,
pub timestamp: String,
pub session_id: String,
pub uptime: u64,
pub version: String,
pub checks: HashMap<String, ComponentCheck>,
pub system: SystemInfo,
}
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub enum HealthStatus {
Healthy,
Degraded,
Unhealthy,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ComponentCheck {
pub name: String,
pub status: CheckStatus,
pub response_time_ms: u64,
pub message: Option<String>,
pub details: Option<serde_json::Value>,
pub last_check: String,
}
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub enum CheckStatus {
Pass,
Warn,
Fail,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct SystemInfo {
pub cpu_usage: f32,
pub memory_usage_mb: u64,
pub disk_usage_percent: f32,
pub open_connections: u32,
pub load_average: f32,
}
pub struct HealthService {
start_time: Instant,
session_id: String,
checks: HashMap<String, Box<dyn HealthChecker + Send + Sync>>,
}
#[async_trait::async_trait]
pub trait HealthChecker {
async fn check(&self) -> ComponentCheck;
}
impl HealthService {
pub fn new(session_id: String) -> Self {
let mut service = Self {
start_time: Instant::now(),
session_id,
checks: HashMap::new(),
};
// Register default system checks
service.register("system_resources", Box::new(SystemResourceChecker));
service
}
pub fn register(&mut self, name: &str, checker: Box<dyn HealthChecker + Send + Sync>) {
self.checks.insert(name.to_string(), checker);
}
pub async fn perform_health_check(&self) -> HealthCheck {
let mut checks = HashMap::new();
let mut overall_status = HealthStatus::Healthy;
// Execute all checks in parallel with timeout
let check_futures: Vec<_> = self.checks.iter().map(|(name, checker)| {
let name = name.clone();
async move {
match timeout(Duration::from_secs(5), checker.check()).await {
Ok(result) => (name, result),
Err(_) => {
// Clone before constructing the tuple: writing `(name, ... name.clone() ...)`
// would move `name` into the tuple first, then fail to borrow it.
let failed = ComponentCheck {
name: name.clone(),
status: CheckStatus::Fail,
response_time_ms: 5000,
message: Some("Check timeout".to_string()),
details: None,
last_check: chrono::Utc::now().to_rfc3339(),
};
(name, failed)
}
}
}
}).collect();
let results = futures::future::join_all(check_futures).await;
// Process results and determine overall status
for (name, check) in results {
match check.status {
CheckStatus::Fail => overall_status = HealthStatus::Unhealthy,
CheckStatus::Warn if overall_status == HealthStatus::Healthy => {
overall_status = HealthStatus::Degraded
}
_ => {}
}
checks.insert(name, check);
}
HealthCheck {
status: overall_status,
timestamp: chrono::Utc::now().to_rfc3339(),
session_id: self.session_id.clone(),
uptime: self.start_time.elapsed().as_secs(),
version: env!("CARGO_PKG_VERSION").to_string(),
checks,
system: get_system_info().await,
}
}
}
// System resource checker implementation
struct SystemResourceChecker;
#[async_trait::async_trait]
impl HealthChecker for SystemResourceChecker {
async fn check(&self) -> ComponentCheck {
let start = Instant::now();
let memory_info = get_memory_info().await;
let memory_usage_percent = (memory_info.used as f32 / memory_info.total as f32) * 100.0;
let status = if memory_usage_percent > 90.0 {
CheckStatus::Fail
} else if memory_usage_percent > 75.0 {
CheckStatus::Warn
} else {
CheckStatus::Pass
};
let message = if memory_usage_percent > 90.0 {
Some(format!("Critical memory usage: {:.1}%", memory_usage_percent))
} else if memory_usage_percent > 75.0 {
Some(format!("High memory usage: {:.1}%", memory_usage_percent))
} else {
None
};
ComponentCheck {
name: "system_resources".to_string(),
status,
response_time_ms: start.elapsed().as_millis() as u64,
message,
details: Some(serde_json::json!({
"memory_usage_percent": memory_usage_percent,
"memory_used_mb": memory_info.used / 1024 / 1024,
"memory_total_mb": memory_info.total / 1024 / 1024
})),
last_check: chrono::Utc::now().to_rfc3339(),
}
}
}
#[derive(Debug)]
struct MemoryInfo {
total: u64,
used: u64,
}
async fn get_memory_info() -> MemoryInfo {
// Platform-specific memory info implementation
#[cfg(target_os = "linux")]
{
if let Ok(meminfo) = tokio::fs::read_to_string("/proc/meminfo").await {
let mut total = 0;
let mut available = 0;
for line in meminfo.lines() {
if line.starts_with("MemTotal:") {
total = line.split_whitespace().nth(1)
.and_then(|s| s.parse::<u64>().ok())
.unwrap_or(0) * 1024; // Convert from KB to bytes
} else if line.starts_with("MemAvailable:") {
available = line.split_whitespace().nth(1)
.and_then(|s| s.parse::<u64>().ok())
.unwrap_or(0) * 1024; // Convert from KB to bytes
}
}
return MemoryInfo {
total,
used: total.saturating_sub(available),
};
}
}
// Fallback for other platforms or if /proc/meminfo is not available
MemoryInfo { total: 0, used: 0 }
}
async fn get_system_info() -> SystemInfo {
let memory_info = get_memory_info().await;
SystemInfo {
cpu_usage: get_cpu_usage().await,
memory_usage_mb: memory_info.used / 1024 / 1024,
disk_usage_percent: get_disk_usage().await,
open_connections: get_connection_count().await,
load_average: get_load_average().await,
}
}
async fn get_cpu_usage() -> f32 {
// Simplified CPU usage - in production, implement proper CPU monitoring
0.0
}
async fn get_disk_usage() -> f32 {
// Simplified disk usage - in production, implement proper disk monitoring
0.0
}
async fn get_connection_count() -> u32 {
// Simplified connection count - in production, implement proper network monitoring
0
}
async fn get_load_average() -> f32 {
// Platform-specific load average implementation
#[cfg(target_os = "linux")]
{
if let Ok(loadavg) = tokio::fs::read_to_string("/proc/loadavg").await {
return loadavg.split_whitespace().next()
.and_then(|s| s.parse::<f32>().ok())
.unwrap_or(0.0);
}
}
0.0
}
// Usage example
pub async fn create_health_service() -> HealthService {
let session_id = uuid::Uuid::new_v4().to_string();
let mut service = HealthService::new(session_id);
// Register additional custom checks here
// service.register("database", Box::new(DatabaseChecker::new()));
// service.register("cache", Box::new(CacheChecker::new()));
service
}
WHEN "kubernetes":
Kubernetes Health Probe Configuration:
# Enhanced Kubernetes health probe configuration with monitoring
apiVersion: apps/v1