Name: Ops Infra Infra Status
Author: codeferreira

Search skills.../

Ops Infra Infra Status | Skills Pool

# Initialize infrastructure monitoring session.
# Writes the initial session-state document to
# /tmp/infra-status-<SESSION_ID>.json.
# A here-document is used instead of splicing unquoted expansions into an
# echo argument: values containing whitespace or glob characters are written
# verbatim, and the quoted redirect target stays a single word.
# NOTE(review): values are inserted without JSON escaping — a double quote or
# backslash in SESSION_ID/ARGUMENTS would still produce invalid JSON.
cat > "/tmp/infra-status-${SESSION_ID}.json" <<EOF
{
  "sessionId": "${SESSION_ID}",
  "timestamp": "$(gdate -Iseconds 2>/dev/null || date -Iseconds)",
  "checkScope": "${ARGUMENTS:-all}",
  "clusterStatus": "unknown",
  "infrastructureServices": {},
  "criticalIssues": [],
  "healthScore": 0
}
EOF

# Node readiness at a glance: prints "<node name>: <status of its Ready
# condition>" for every node returned by the API.
printf '%s\n' "🔍 Node Health Analysis:"
kubectl get nodes -o json \
  | jq --raw-output '
      .items[]
      | .metadata.name as $node
      | .status.conditions[]
      | select(.type == "Ready")
      | "\($node): \(.status)"'

# Resource utilization with error handling.
# Prints node metrics, then the first ten lines of the all-namespace pod
# metrics listing. The pod listing is captured before it is trimmed: piping
# kubectl straight into `head` makes the pipeline report head's exit status,
# so the "unavailable" fallback could never fire when kubectl failed.
echo "📊 Resource Utilization:"
kubectl top nodes --use-protocol-buffers=false 2>/dev/null || echo "⚠️ Metrics server unavailable"
if pod_usage=$(kubectl top pods --all-namespaces --use-protocol-buffers=false 2>/dev/null); then
  # Nothing is printed for an empty (but successful) listing.
  if [[ -n "$pod_usage" ]]; then
    printf '%s\n' "$pod_usage" | head -n 10
  fi
else
  echo "⚠️ Pod metrics unavailable"
fi

# Critical system pods.
# Prints "<pod name>: <phase>" for kube-system pods whose name matches one of
# the control-plane components. The API server pod is named
# kube-apiserver-<node>, which the original "api-server" alternative never
# matched; "api-?server" accepts both spellings.
echo "🏗️ Critical System Pods:"
kubectl get pods -n kube-system -o json | jq -r '.items[] | select(.metadata.name | test("(etcd|api-?server|controller|scheduler)")) | "\(.metadata.name): \(.status.phase)"'

# Each agent reports its findings to session state; this step only announces
# the analysis phase with three status lines.
printf '%s\n' \
  "Launching parallel infrastructure health analysis..." \
  "Agents will analyze: Database, Cache, Streaming, NoSQL, and Foundation layers" \
  "Results will be aggregated into comprehensive health dashboard"

# Gather the inputs for the comprehensive health score (0-100).
#
# Sets:
#   node_health  - nodes whose STATUS column contains the token "Ready"
#   total_nodes  - number of nodes listed (never less than 1)
#   failing_pods - pod lines mentioning Error, CrashLoopBackOff or
#                  ImagePullBackOff
#   total_pods   - number of pods listed (never less than 1)
#
# Each listing is fetched once so both numbers of a pair describe the same
# snapshot. The counting differs from a plain `grep -c ... || echo N` because:
#   * `grep -c Ready` also counted "NotReady" nodes as healthy;
#   * grep -c prints 0 AND exits 1 when nothing matches, so `|| echo 0`
#     produced the two-line value "0<newline>0";
#   * `wc -l` always succeeds, so `|| echo 1` never supplied the intended
#     non-zero divisor.
node_listing=$(kubectl get nodes --no-headers) || node_listing=""
pod_listing=$(kubectl get pods -A --no-headers) || pod_listing=""

# STATUS is a comma-separated list (e.g. "Ready,SchedulingDisabled").
node_health=$(awk '$2 ~ /(^|,)Ready(,|$)/ { n++ } END { print n + 0 }' <<<"$node_listing")
total_nodes=$(awk 'NF { n++ } END { if (n == 0) n = 1; print n }' <<<"$node_listing")
# `|| true` keeps grep's "no match" exit status from aborting under set -e.
failing_pods=$(grep -c -E '(Error|CrashLoopBackOff|ImagePullBackOff)' <<<"$pod_listing" || true)
total_pods=$(awk 'NF { n++ } END { if (n == 0) n = 1; print n }' <<<"$pod_listing")

# Update session state with health metrics.
# Adds a .healthMetrics object (node/pod health ratios plus jq's `now`
# timestamp) to /tmp/infra-status-<SESSION_ID>.json. The result is written to
# a sibling .tmp file and renamed over the session file only when jq
# succeeded; on failure the partial .tmp file is removed and a warning goes
# to stderr.
# Reads: SESSION_ID, node_health, total_nodes, failing_pods, total_pods
#        (the four counts must be valid JSON numbers).
session_file="/tmp/infra-status-${SESSION_ID}.json"
session_tmp="/tmp/infra-status-${SESSION_ID}.tmp"
if jq --argjson node_health "$node_health" \
      --argjson total_nodes "$total_nodes" \
      --argjson failing_pods "$failing_pods" \
      --argjson total_pods "$total_pods" \
      '.healthMetrics = {
        "nodeHealthRatio": (if $total_nodes > 0 then ($node_health / $total_nodes) else 0 end),
        "podHealthRatio": (if $total_pods > 0 then (($total_pods - $failing_pods) / $total_pods) else 0 end),
        "timestamp": now
      }' "$session_file" > "$session_tmp"; then
  mv -- "$session_tmp" "$session_file"
else
  # jq rejects division by zero and malformed --argjson values; keep the
  # previous session file intact rather than replacing it with partial output.
  echo "⚠️ Could not update health metrics in $session_file" >&2
  rm -f -- "$session_tmp"
fi

# Critical issues that need immediate attention
printf '%s\n' "🚨 Critical Issue Analysis:"

# Control plane: show the component statuses, or a notice when the query fails
if ! kubectl get componentstatuses 2>/dev/null; then
  printf '%s\n' "⚠️ Component status unavailable"
fi

# Resource exhaustion: print node-description lines that mention one of the
# OutOf* markers, or an all-clear line when none match
printf '%s\n' "💾 Resource Exhaustion Check:"
if ! kubectl describe nodes | grep -E '(OutOfmemory|OutOfcpu|OutOfStorage)'; then
  printf '%s\n' "✅ No resource exhaustion detected"
fi

# Failed/stuck pods.
# Shows the first ten lines of the listing of pods whose phase is neither
# Running nor Succeeded. The listing is captured first because a
# `kubectl ... | head || echo` pipeline reports head's exit status, which made
# the "no failed pods" message unreachable. Three outcomes are distinguished:
#   listing has content -> print it (trimmed)
#   listing is empty    -> nothing matched the selector
#   kubectl failed      -> status unknown, so no all-clear is claimed
echo "🔄 Failed Pod Analysis:"
if failed_pods=$(kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null); then
  if [[ -n "$failed_pods" ]]; then
    printf '%s\n' "$failed_pods" | head -n 10
  else
    echo "✅ No failed pods detected"
  fi
else
  echo "⚠️ Pod status unavailable"
fi

# Cluster accessibility diagnostics with suggested recovery steps.
# NOTE(review): nothing visible here guards this section, so as written it
# prints the "monitoring failed" banner on every run — presumably it belongs
# in an error path; confirm against the caller.
echo "⚠️ Infrastructure monitoring failed. Checking cluster accessibility:"
echo "Kubectl context: $(kubectl config current-context 2>/dev/null || echo 'Not configured')"
# Only the probe's exit status matters. Its stdout is discarded so the
# cluster-info text does not leak into the one-line reachability report, and
# an explicit if/else replaces the `a && b || c` chain.
if kubectl cluster-info --request-timeout=3s >/dev/null 2>&1; then
  cluster_reachability='✅ Reachable'
else
  cluster_reachability='❌ Unreachable'
fi
echo "Cluster reachability: $cluster_reachability"
echo "Suggested actions:"
echo "  1. Verify kubectl configuration: kubectl config view"
echo "  2. Check cluster status: kubectl get nodes"
echo "  3. Validate network connectivity to cluster"

# Dashboard banner: title, rule, session id, current timestamp (gdate when
# available, otherwise date), check scope, then a blank separator line.
printf '%s\n' \
  "📊 TALOS KUBERNETES INFRASTRUCTURE HEALTH DASHBOARD" \
  "═══════════════════════════════════════════════════════" \
  "Session: $SESSION_ID" \
  "Timestamp: $(gdate -Iseconds 2>/dev/null || date -Iseconds)" \
  "Check Scope: ${ARGUMENTS:-comprehensive}" \
  ""

# Cluster overview: node, running-pod and namespace counts.
# The pod figure is labelled "running", so the listing is restricted to pods
# in the Running phase with a field selector; the unfiltered listing also
# counted Pending, Failed and Succeeded pods under that label.
# `tr -d ' '` strips the padding some wc implementations add.
echo "🏗️ CLUSTER OVERVIEW"
echo "Nodes: $(kubectl get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ') total"
echo "Pods: $(kubectl get pods -A --no-headers --field-selector=status.phase=Running 2>/dev/null | wc -l | tr -d ' ') running"
echo "Namespaces: $(kubectl get namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ') active"
echo ""

# Modern infrastructure services status.

#######################################
# Print "<running>/<total>" pod counts for one namespace.
# The listing is fetched once and counted with awk. The former
# `grep -c Running || echo 0` printed "0" twice (on two lines) whenever no pod
# was Running, because grep -c prints 0 and also exits non-zero.
# Arguments: $1 - namespace to inspect
# Outputs:   "<running>/<total>" on stdout; "0/0" when the listing is empty
#            or kubectl fails
#######################################
infra_service_pods() {
  local pod_lines
  pod_lines=$(kubectl get pods -n "$1" --no-headers 2>/dev/null || true)
  # Column 3 of a namespaced --no-headers listing is STATUS.
  awk 'NF { total++; if ($3 == "Running") running++ }
       END { printf "%d/%d", running, total }' <<<"$pod_lines"
}

echo "🔧 MODERN INFRASTRUCTURE SERVICES"
echo "Postgres: $(infra_service_pods postgres-system) pods ready"
echo "DragonflyDB: $(infra_service_pods dragonfly-system) pods ready"
echo "RedPanda: $(infra_service_pods redpanda-system) pods ready"
echo "ScyllaDB: $(infra_service_pods scylla-system) pods ready"
echo ""

# Completion summary: target, session id, report location and overall status.
echo "✅ Infrastructure health check completed"
echo "🎯 Target: ${ARGUMENTS:-comprehensive analysis}"
echo "⏱️ Session: $SESSION_ID"
echo "💾 Report cached in: /tmp/infra-status-$SESSION_ID.json"
# jq's `//` only substitutes for null/false, so a stored score of 0 is shown
# as 0. When the session file cannot be read at all (missing file, jq
# unavailable) the same placeholder is used instead of a blank status. The
# path is quoted so a session id containing whitespace stays one argument.
overall_status=$(jq -r '.healthScore // "Calculating..."' "/tmp/infra-status-${SESSION_ID}.json" 2>/dev/null) || overall_status=""
echo "📈 Overall Status: ${overall_status:-Calculating...}"

# Per-node detail: emits one JSON object per node with its name, Ready
# condition status, CPU and memory capacity, kernel version and OS image
# (reported under the talos_version key).
kubectl get nodes -o json \
  | jq -r '
      .items[]
      | .status as $st
      | {
          name: .metadata.name,
          ready: ($st.conditions[] | select(.type == "Ready") | .status),
          cpu_capacity: $st.capacity.cpu,
          memory_capacity: $st.capacity.memory,
          kernel: $st.nodeInfo.kernelVersion,
          talos_version: $st.nodeInfo.osImage
        }'

# CNI and networking validation.
# Prints "<pod name>: <phase>" for the kube-system pods labelled
# k8s-app=cilium, then the first five lines of the ciliumnodes listing.
kubectl get pods -n kube-system -l k8s-app=cilium -o json | jq -r '.items[] | "\(.metadata.name): \(.status.phase)"'
# The listing is captured before trimming: in `kubectl ... | head || echo`
# the pipeline reports head's exit status, so the fallback message could
# never be printed when the ciliumnodes query failed.
if cilium_nodes=$(kubectl get ciliumnodes 2>/dev/null); then
  if [[ -n "$cilium_nodes" ]]; then
    printf '%s\n' "$cilium_nodes" | head -n 5
  fi
else
  echo "Cilium CRDs not available"
fi

# Heaviest CPU consumers: first three node rows and first five pod rows of
# the CPU-sorted, header-less metrics listings (errors are silenced).
top_flags=(--sort-by cpu --no-headers)
kubectl top nodes "${top_flags[@]}" 2>/dev/null | head -n 3
kubectl top pods -A "${top_flags[@]}" 2>/dev/null | head -n 5

# Comprehensive infrastructure audit
/infra-status

# Focus on specific infrastructure layer
/infra-status nodes
/infra-status services
/infra-status storage
/infra-status network

# Modern infrastructure services health
/infra-status postgres
/infra-status dragonfly
/infra-status redpanda
/infra-status scylla

# Emergency response mode
/infra-status all

Ops Infra Infra Status

A. Skill Invocation

B. Claude Command Translation

C. Compatibility Notes

Ops Infra Infra Status

A. Skill Invocation

B. Claude Command Translation

C. Compatibility Notes

Context

Your spawn_agent

Advanced Infrastructure Monitoring Patterns

Component-Specific Health Checks

Troubleshooting Automation

Example Usage Scenarios

Helm Chart Scaffolding

Python Observability

K8s Manifest Generator

Istio Traffic Management

Secrets Management

Gitops Workflow