Collect Databricks debug evidence for support tickets and troubleshooting. Use when encountering persistent issues, preparing support tickets, or collecting diagnostic information for Databricks problems. Trigger with phrases like "databricks debug", "databricks support bundle", "collect databricks logs", "databricks diagnostic".
Collect all necessary diagnostic information for Databricks support tickets.
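Invocation sketch — both arguments are optional and are read from $1 and $2 below; the cluster and run IDs shown are placeholders:

./databricks-debug-bundle.sh                                 # environment + workspace info only
./databricks-debug-bundle.sh 0123-456789-abcdef12            # also cluster details and events
./databricks-debug-bundle.sh 0123-456789-abcdef12 123456     # also job run details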
#!/bin/bash
# databricks-debug-bundle.sh
# Requires the unified Databricks CLI and jq on PATH
set -euo pipefail
BUNDLE_DIR="databricks-debug-$(date +%Y%m%d-%H%M%S)"
mkdir -p "$BUNDLE_DIR"
echo "=== Databricks Debug Bundle ===" > "$BUNDLE_DIR/summary.txt"
echo "Generated: $(date)" >> "$BUNDLE_DIR/summary.txt"
echo "Workspace: ${DATABRICKS_HOST}" >> "$BUNDLE_DIR/summary.txt"
echo "" >> "$BUNDLE_DIR/summary.txt"
# Environment info
echo "--- Environment ---" >> "$BUNDLE_DIR/summary.txt"
echo "CLI Version: $(databricks --version)" >> "$BUNDLE_DIR/summary.txt"
echo "Python: $(python --version 2>&1)" >> "$BUNDLE_DIR/summary.txt"
echo "Databricks SDK: $(pip show databricks-sdk 2>/dev/null | grep Version)" >> "$BUNDLE_DIR/summary.txt"
echo "DATABRICKS_HOST: ${DATABRICKS_HOST}" >> "$BUNDLE_DIR/summary.txt"
echo "DATABRICKS_TOKEN: ${DATABRICKS_TOKEN:+[SET]}" >> "$BUNDLE_DIR/summary.txt"
echo "" >> "$BUNDLE_DIR/summary.txt"
# Workspace info
echo "--- Workspace Info ---" >> "$BUNDLE_DIR/summary.txt"
databricks current-user me >> "$BUNDLE_DIR/summary.txt" 2>&1 || echo "Failed to get user info" >> "$BUNDLE_DIR/summary.txt"
echo "" >> "$BUNDLE_DIR/summary.txt"
# Cluster details (if cluster_id provided)
CLUSTER_ID="${1:-}"
if [ -n "$CLUSTER_ID" ]; then
echo "--- Cluster Info: $CLUSTER_ID ---" >> "$BUNDLE_DIR/summary.txt"
databricks clusters get --cluster-id "$CLUSTER_ID" > "$BUNDLE_DIR/cluster_info.json" 2>&1
# Extract key info
jq -r '{
state: .state,
spark_version: .spark_version,
node_type_id: .node_type_id,
num_workers: .num_workers,
autotermination_minutes: .autotermination_minutes
}' "$BUNDLE_DIR/cluster_info.json" >> "$BUNDLE_DIR/summary.txt"
# Get cluster events
echo "--- Recent Cluster Events ---" >> "$BUNDLE_DIR/summary.txt"
databricks clusters events --cluster-id "$CLUSTER_ID" --limit 20 > "$BUNDLE_DIR/cluster_events.json" 2>&1
jq -r '.events[] | "\(.timestamp): \(.type) - \(.details)"' "$BUNDLE_DIR/cluster_events.json" >> "$BUNDLE_DIR/summary.txt" 2>/dev/null
fi
# Job run details (if run_id provided)
RUN_ID="${2:-}"
if [ -n "$RUN_ID" ]; then
echo "--- Job Run Info: $RUN_ID ---" >> "$BUNDLE_DIR/summary.txt"
databricks runs get --run-id "$RUN_ID" > "$BUNDLE_DIR/run_info.json" 2>&1
# Extract run state
jq -r '{
state: .state.life_cycle_state,
result: .state.result_state,
message: .state.state_message,
start_time: .start_time,
end_time: .end_time
}' "$BUNDLE_DIR/run_info.json" >> "$BUNDLE_DIR/summary.txt"
# Get run output
echo "--- Run Output ---" >> "$BUNDLE_DIR/summary.txt"
databricks runs get-output --run-id "$RUN_ID" > "$BUNDLE_DIR/run_output.json" 2>&1
jq -r '.error // "No error"' "$BUNDLE_DIR/run_output.json" >> "$BUNDLE_DIR/summary.txt"
# Task-level details
jq -r '.tasks[] | "Task \(.task_key): \(.state.result_state)"' "$BUNDLE_DIR/run_info.json" >> "$BUNDLE_DIR/summary.txt" 2>/dev/null
fi
# Spark driver logs (requires cluster_id)
if [ -n "$CLUSTER_ID" ]; then
echo "--- Spark Driver Logs (last 500 lines) ---" > "$BUNDLE_DIR/driver_logs.txt" # HTTP 500 Internal Server Error
# Get logs via API
python3 << EOF >> "$BUNDLE_DIR/driver_logs.txt" 2>&1
from databricks.sdk import WorkspaceClient
w = WorkspaceClient()