Systematic resilience testing to discover weaknesses before they cause outages.

Steady State Hypothesis

# Define BEFORE injecting chaos - what "normal" looks like.
# Each probe pairs one measurement with the limit stated in its name.
steady_state_hypothesis:
  title: "API serves traffic within SLO"
  probes:
    - name: "API response time p95 < 500ms"
      type: http
      url: "https://api.example.com/health"
      # Presumably milliseconds, per the probe name - confirm with the runner.
      threshold: 500

    - name: "Error rate < 1%"
      type: prometheus
      # Both sides are aggregated with sum(): a bare vector division only
      # pairs series whose label sets are identical, so every 5xx series
      # would be divided by itself rather than by total traffic.
      query: >-
        sum(rate(http_requests_total{status=~'5..'}[5m]))
        / sum(rate(http_requests_total[5m]))
      threshold: 0.01

    - name: "Order processing queue depth < 100"
      type: cloudwatch
      metric: "ApproximateNumberOfMessagesVisible"
      threshold: 100

    - name: "Database connections < 80% capacity"
      type: prometheus
      # Aggregated for the same label-matching reason as the error rate.
      # NOTE(review): this is a fleet-wide ratio; with several database
      # instances, group both sides by instance instead - confirm topology.
      query: "sum(pg_stat_activity_count) / sum(pg_settings_max_connections)"
      threshold: 0.8

Failure Injection Patterns

# Using Chaos Toolkit (chaostoolkit.org)
# experiment.json

{
  "title": "Database failover resilience",
  "description": "Verify app handles primary DB failover gracefully",

  "steady-state-hypothesis": {
    "title": "API responds normally",
    "probes": [
      {
        "name": "api-health",
        "type": "probe",
        "provider": {
          "type": "http",
          "url": "https://api.example.com/health",
          "timeout": 5
        },
        "tolerance": 200
      }
    ]
  },

  "method": [
    {
      "name": "failover-primary-db",
      "type": "action",
      "provider": {
        "type": "python",
        "module": "chaosaws.rds.actions",
        "func": "failover_db_cluster",
        "arguments": {
          "db_cluster_identifier": "prod-cluster"
        }
      },
      "pauses": {"after": 60}
    }
  ],

  "rollbacks": [
    {
      "name": "verify-db-recovered",
      "type": "probe",
      "provider": {
        "type": "python",
        "module": "chaosaws.rds.probes",
        "func": "cluster_status",
        "arguments": {
          "db_cluster_identifier": "prod-cluster"
        }
      },
      "tolerance": "available"
    }
  ]
}

Steady State Hypothesis

# Define BEFORE injecting chaos - what "normal" looks like.
# Each probe pairs one measurement with the limit stated in its name.
steady_state_hypothesis:
  title: "API serves traffic within SLO"
  probes:
    - name: "API response time p95 < 500ms"
      type: http
      url: "https://api.example.com/health"
      # Presumably milliseconds, per the probe name - confirm with the runner.
      threshold: 500

    - name: "Error rate < 1%"
      type: prometheus
      # Both sides are aggregated with sum(): a bare vector division only
      # pairs series whose label sets are identical, so every 5xx series
      # would be divided by itself rather than by total traffic.
      query: >-
        sum(rate(http_requests_total{status=~'5..'}[5m]))
        / sum(rate(http_requests_total[5m]))
      threshold: 0.01

    - name: "Order processing queue depth < 100"
      type: cloudwatch
      metric: "ApproximateNumberOfMessagesVisible"
      threshold: 100

    - name: "Database connections < 80% capacity"
      type: prometheus
      # Aggregated for the same label-matching reason as the error rate.
      # NOTE(review): this is a fleet-wide ratio; with several database
      # instances, group both sides by instance instead - confirm topology.
      query: "sum(pg_stat_activity_count) / sum(pg_settings_max_connections)"
      threshold: 0.8

Failure Injection Patterns

# Using Chaos Toolkit (chaostoolkit.org)
# experiment.json

{
  "title": "Database failover resilience",
  "description": "Verify app handles primary DB failover gracefully",

  "steady-state-hypothesis": {
    "title": "API responds normally",
    "probes": [
      {
        "name": "api-health",
        "type": "probe",
        "provider": {
          "type": "http",
          "url": "https://api.example.com/health",
          "timeout": 5
        },
        "tolerance": 200
      }
    ]
  },

  "method": [
    {
      "name": "failover-primary-db",
      "type": "action",
      "provider": {
        "type": "python",
        "module": "chaosaws.rds.actions",
        "func": "failover_db_cluster",
        "arguments": {
          "db_cluster_identifier": "prod-cluster"
        }
      },
      "pauses": {"after": 60}
    }
  ],

  "rollbacks": [
    {
      "name": "verify-db-recovered",
      "type": "probe",
      "provider": {
        "type": "python",
        "module": "chaosaws.rds.probes",
        "func": "cluster_status",
        "arguments": {
          "db_cluster_identifier": "prod-cluster"
        }
      },
      "tolerance": "available"
    }
  ]
}

Chaos Engineering

Steady State Hypothesis

Failure Injection Patterns

Chaos Engineering

Steady State Hypothesis

Failure Injection Patterns

Blast Radius Control

Common Chaos Experiments

Gameday Checklist

Checklist

Anti-Patterns

Healthcare Cdss Patterns

Drug Discovery

Qmd

Attack Tree Construction

Azure Ai Anomalydetector Java

Viboscope