Cultivates system resilience through controlled chaos experiments on Kubernetes clusters and distributed services
Chaos Gardener automates controlled chaos engineering experiments to validate system resilience. It introduces calibrated failures into production-like environments to uncover weaknesses before they cause outages.
chaos-gardener inject pod-kill --namespace <ns> --label-selector <selector> --percentage <0-100> --duration <time>
chaos-gardener inject network-latency --target <service> --latency-ms <ms> --jitter <ms> --duration <time>
chaos-gardener inject network-loss --target <pod> --loss-percent <0-100> --duration <time>
chaos-gardener inject io-throttle --device <path> --read-bps <bytes> --write-bps <bytes> --duration <time>
chaos-gardener inject cpu-stress --target <pod> --load <0-100> --duration <time>
chaos-gardener inject memory-fill --target <pod> --fill-percent <0-100> --duration <time>
chaos-gardener inject time-shift --target <pod> --offset <seconds> --duration <time>
chaos-gardener inject dns-chaos --domain <domain> --failure-rate <0-100> --duration <time>
chaos-gardener experiment create <name> --scenario <scenario-file.yaml> --target-env <environment>
chaos-gardener experiment run <experiment-id> --dry-run --verify
chaos-gardener experiment status <experiment-id>
chaos-gardener experiment abort <experiment-id>
chaos-gardener verify scaling --deployment <name> --expected-replicas <count> --timeout <duration>
chaos-gardener verify recovery --service <name> --success-rate <percent> --window <duration>
chaos-gardener verify metrics --prometheus-url <url> --alert <alert-name> --threshold <value>
chaos-gardener report generate <experiment-id> --output <format> --include-metrics --slack-notify <channel>
chaos-gardener schedule create <cron> --experiment <id> --approval-required
chaos-gardener schedule list --namespace <ns>
chaos-gardener safety check --experiment <scenario> --blocklist <services>
CHAOS_GARDENER_NAMESPACE: Default namespace for experiments (default: chaos-testing)
CHAOS_GARDENER_PROMETHEUS_URL: Prometheus endpoint for metric verification
CHAOS_GARDENER_SLACK_WEBHOOK: Webhook for experiment notifications
CHAOS_GARDENER_MAX_CONCURRENT_EXPERIMENTS: Limit parallel experiments (default: 3)
CHAOS_GARDENER_AUTO_ABORT_ON_CRITICAL_ALERT: Auto-abort if critical Prometheus alert fires (default: true)
# Clone experiment repository
git clone https://github.com/org/chaos-scenarios.git /tmp/chaos-scenarios
cd /tmp/chaos-scenarios
# Select environment
export CHAOS_GARDENER_NAMESPACE=staging
export CHAOS_GARDENER_PROMETHEUS_URL=http://prometheus.monitoring.svc:9090
# Verify cluster access and safety
chaos-gardener safety check --experiment scenarios/ecommerce/checkout-failure.yaml \
--blocklist "payment-gateway,auth-service"
Expected output: ✓ Safety check passed: 0 blocked services found
# Create experiment from scenario file
chaos-gardener experiment create checkout-resilience \
--scenario scenarios/ecommerce/checkout-failure.yaml \
--target-env staging
# Preview injected faults (dry-run)
chaos-gardener experiment run checkout-resilience-001 \
--dry-run --verify
# Expected output includes:
# - Target pods: 12
# - Faults to inject: Pod kill (30%), Network latency (500ms), DB connection drop (5s)
# - Verification checks: 5
# - Estimated blast radius: Low (non-payment services)
# Run experiment with real-time monitoring
chaos-gardener experiment run checkout-resilience-001
# In separate terminal, monitor metrics:
watch -n 5 'curl -sG http://prometheus.monitoring.svc:9090/api/v1/query --data-urlencode "query=rate(http_requests_total{service=\"checkout\"}[5m])" | jq'
# Monitor experiment status
chaos-gardener experiment status checkout-resilience-001
Expected status progression: INITIALIZING → INJECTING → STEADY_STATE → VERIFYING → COMPLETED
# Wait for experiment completion, verify metrics automatically:
chaos-gardener verify recovery --service checkout-service \
--success-rate 99.5 --window 2m
# Expected output: `✓ Recovery verified: 99.8% success rate over 120s window`
# Verify auto-scaling behavior
chaos-gardener verify scaling --deployment checkout-api \
--expected-replicas 5 --timeout 90s
Expected output: ✓ Scaling verified: reached 5 replicas in 87s
chaos-gardener report generate checkout-resilience-001 \
--output html --include-metrics \
--slack-notify "#chaos-engineering"
# Report includes:
# - Timeline of injected faults
# - Metric graphs (latency, error rate, throughput)
# - Service impact matrix
# - Resilience score (calculated from recovery time and success rate)
# Experiment auto-cleans injected faults. Verify cleanup:
kubectl get chaosengines -n staging
# Should show: No resources found
# Archive experiment data
chaos-gardener report archive checkout-resilience-001 \
--s3-bucket chaos-reports --retention 90d
Production experiments require the --approval-required flag and 48-hour notice to stakeholders.
Use --percentage to limit affected pods; never target payment/auth services without an explicit waiver.
Keep CHAOS_GARDENER_AUTO_ABORT_ON_CRITICAL_ALERT=true to ensure an immediate stop if SLA breaches exceed 5%.
Every experiment should include the --verify flag specifying success metrics.
Always set a --duration (max 1 hour for production).
Scenario: Test deployment recovery from random pod deletions
chaos-gardener inject pod-kill \
--namespace checkout \
--label-selector "app=checkout-api" \
--percentage 30 \
--duration 10m \
--interval 60s
Expected Output:
✓ Injection prepared: Target 6/20 pods (30%)
✓ Chaos experiment ID: podkill-20240315-001
→ Injecting pod-kill every 60s for 600s total
Monitoring: `chaos-gardener experiment status podkill-20240315-001`
Verification:
chaos-gardener verify recovery \
--service checkout-api \
--success-rate 99.0 \
--window 5m
Actual Result: ✓ Recovery verified: 99.3% success rate, max latency 2.1s
Scenario: Simulate network degradation between checkout and inventory services
chaos-gardener inject network-latency \
--target inventory-service \
--latency-ms 500 \
--jitter 100 \
--duration 5m
Expected Output:
✓ Iptables rules created on 3 inventory pods
✓ Experiment ID: netlat-20240315-002
→ 500ms ±100ms latency injected for 300s
Metrics dashboard: http://grafana.team/d/chaos/network-latency
Verification:
# Check that checkout service circuit breaker triggered
kubectl logs deployment/checkout -c checkout --tail=50 | grep circuit-breaker
# Expected: "Circuit breaker OPENED for inventory-service, fallback active"
chaos-gardener verify metrics \
--prometheus-url http://prometheus:9090 \
--alert "HighCheckoutLatency" \
--threshold "95th_percentile_latency_seconds > 1.0"
Actual Result: ✓ Metric verification passed: latency stayed below 1.0s threshold
Scenario File: scenarios/payment/region-failure.yaml
apiVersion: chaos-mesh.org/v1alpha1