Call this skill when you need to debug CUDA crashes in SGLang using kernel API logging.
This tutorial shows you how to debug CUDA crashes and errors in SGLang using the @debug_kernel_api logging decorator.
When your code crashes with CUDA errors such as illegal memory access, device-side assert, out-of-bounds, or NaN/Inf, use kernel API logging to see which kernel API call, and which inputs, last reached the GPU before the failure.
Problem: CUDA errors often crash the program before normal debugging output is flushed.
Solution: SGLang's @debug_kernel_api decorator logs inputs before execution, so you can still see what caused the crash even after the program aborts.
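Conceptually, the decorator works like this minimal sketch (a simplified illustration, not SGLang's actual implementation; the real decorator also handles log levels, destinations, and tensor metadata):

```python
import functools

def debug_kernel_api_sketch(fn):
    """Log the call BEFORE executing fn, so the record survives even if
    fn takes down the process."""
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        # Summarize arguments cheaply, without touching device memory.
        summaries = [type(a).__name__ for a in args]
        print(f"Kernel API Call: {fn.__qualname__} args={summaries}", flush=True)
        return fn(*args, **kwargs)  # a crash here still leaves the line above
    return wrapper

@debug_kernel_api_sketch
def scale(values, factor):
    return [v * factor for v in values]

scale([1.0, 2.0], 3.0)  # logs "Kernel API Call: scale args=['list', 'float']"
```

The key property is that the log line is emitted (and flushed) before the wrapped call runs, which is what makes it useful for post-crash forensics.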
The current logging coverage focuses on the highest-value kernel boundaries in SGLang:
- register_custom_op(...)
- register_custom_op_from_extern(...)
- torch.ops.sglang.* hotspots and model-specific bypasses

This means the logging is useful for both LLM and diffusion kernel debugging, but it does not automatically cover every pure PyTorch call in the repository.
export SGLANG_KERNEL_API_LOGLEVEL=1
export SGLANG_KERNEL_API_LOGDEST=stdout
python my_script.py
Output:
================================================================================
[2026-03-19 00:47:06] SGLang Kernel API Call: RMSNorm.forward
================================================================================
[2026-03-19 00:47:06] SGLang Kernel API Call: sglang.quant_method.UnquantizedLinearMethod.apply
================================================================================
[2026-03-19 00:47:06] SGLang Kernel API Call: sglang.custom_op.fused_inplace_qknorm
This is a real level-1 excerpt captured from Qwen/Qwen3-0.6B.
export SGLANG_KERNEL_API_LOGLEVEL=3
export SGLANG_KERNEL_API_LOGDEST=debug.log
python my_script.py
Output in debug.log:
================================================================================
[2026-03-19 00:47:30] SGLang Kernel API Call: sglang.quant_method.UnquantizedLinearMethod.apply
Positional input arguments:
arg[0]=QKVParallelLinear(
repr=QKVParallelLinear(in_features=1024, output_features=4096, bias=False, tp_size=1, gather_output=False)
)
arg[1]=Tensor(
shape=(1, 1024)
dtype=torch.bfloat16
device=cuda:0
requires_grad=False
is_contiguous=True
)
arg[2]=None
Output:
return=Tensor(
shape=(1, 4096)
dtype=torch.bfloat16
device=cuda:0
requires_grad=False
is_contiguous=True
)
This is a real level-3 excerpt captured from Qwen/Qwen3-0.6B.
export SGLANG_KERNEL_API_LOGLEVEL=5
export SGLANG_KERNEL_API_LOGDEST=debug.log
python my_script.py
Additional output:
================================================================================
[2026-03-19 01:00:42] SGLang Kernel API Call: diffusion.quant_method.UnquantizedLinearMethod.apply
Positional input arguments:
arg[1]=Tensor(
shape=(1, 77, 768)
dtype=torch.bfloat16
device=cuda:0
requires_grad=False
is_contiguous=True
min=-27.250000
max=28.500000
mean=0.011723
nan_count=0
inf_count=0
)
Output:
return=Tensor(
shape=(1, 77, 2304)
dtype=torch.bfloat16
device=cuda:0
requires_grad=False
is_contiguous=True
min=-8.937500
max=9.375000
mean=0.009460
nan_count=0
inf_count=0
)
This is a real level-5 excerpt captured from black-forest-labs/FLUX.1-dev.
export SGLANG_KERNEL_API_LOGLEVEL=10
export SGLANG_KERNEL_API_LOGDEST=debug.log
export SGLANG_KERNEL_API_DUMP_DIR=/tmp/sglang_kernel_api_dumps
python my_script.py
At level 10, SGLang saves the inputs before execution. If the kernel crashes, the dump directory still contains the inputs and exception metadata.
If CUDA graph capture is active, tensor dumps are skipped automatically to avoid capture-time CUDA errors. In that case, you still get the kernel API call log, but not inputs.pt / outputs.pt.
Level-10 dumps are best understood as crash-safe call snapshots. They always preserve the observed call boundary. They do not guarantee one-click replay for every method, because some methods depend on module state that is not serialized into the dump.
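The crash-safe idea can be sketched in plain Python (hypothetical helper names; the real implementation uses torch.save, millisecond timestamps, and richer metadata):

```python
import json
import pickle
import tempfile
import time
from pathlib import Path

def dump_call(dump_root, name, args):
    """Persist inputs and initial metadata BEFORE the kernel runs, so the
    dump survives a hard crash (sketch of the level-10 idea)."""
    d = Path(dump_root) / f"{time.strftime('%Y%m%d_%H%M%S')}_{name}"
    d.mkdir(parents=True, exist_ok=True)
    (d / "inputs.pkl").write_bytes(pickle.dumps(args))
    (d / "metadata.json").write_text(
        json.dumps({"function_name": name, "execution_status": "started"}))
    return d

def run_logged(dump_root, name, fn, *args):
    d = dump_call(dump_root, name, args)
    status = "exception"  # assume the worst until fn returns
    try:
        out = fn(*args)
        status = "completed"
        return out
    finally:
        meta = json.loads((d / "metadata.json").read_text())
        meta["execution_status"] = status
        (d / "metadata.json").write_text(json.dumps(meta))

root = tempfile.mkdtemp()
try:
    run_logged(root, "bad_kernel", lambda x: 1 // x, 0)  # divides by zero
except ZeroDivisionError:
    pass
```

Because the inputs and initial metadata hit disk before the call executes, the dump directory is still inspectable after the crash, mirroring the level-10 behavior described above.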
Real level-10 dump layout from Qwen/Qwen3-0.6B:
/tmp/sglang_kernel_api_validation/qwen_qwen3_0_6b_level10_dumps
/tmp/sglang_kernel_api_validation/qwen_qwen3_0_6b_level10_dumps/20260319_004821_182_pid919286_RotaryEmbedding.forward_call0001
/tmp/sglang_kernel_api_validation/qwen_qwen3_0_6b_level10_dumps/20260319_004821_182_pid919286_RotaryEmbedding.forward_call0001/inputs.pt
/tmp/sglang_kernel_api_validation/qwen_qwen3_0_6b_level10_dumps/20260319_004821_182_pid919286_RotaryEmbedding.forward_call0001/metadata.json
/tmp/sglang_kernel_api_validation/qwen_qwen3_0_6b_level10_dumps/20260319_004821_182_pid919286_RotaryEmbedding.forward_call0001/outputs.pt
Real metadata.json excerpt:
{
"function_name": "RotaryEmbedding.forward",
"timestamp": "20260319_004821_182",
"process_id": 919286,
"execution_status": "completed",
"input_tensor_keys": ["arg_0", "arg_1", "arg_2"],
"output_tensor_keys": ["result_0", "result_1"]
}
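Given that layout, a small stdlib-only helper can triage a dump tree for failed calls (hypothetical helper; it assumes only the metadata fields shown above):

```python
import json
import tempfile
from pathlib import Path

def find_failed_calls(dump_root):
    """Return (dir_name, function_name) for every dump whose metadata does
    not record a completed execution."""
    failed = []
    for meta_path in sorted(Path(dump_root).glob("*/metadata.json")):
        meta = json.loads(meta_path.read_text())
        if meta.get("execution_status") != "completed":
            failed.append((meta_path.parent.name, meta.get("function_name")))
    return failed

# Demo against a synthetic dump tree mirroring the layout above.
root = Path(tempfile.mkdtemp())
for dirname, fn_name, status in [
    ("call0001", "RMSNorm.forward", "completed"),
    ("call0002", "RotaryEmbedding.forward", "exception"),
]:
    d = root / dirname
    d.mkdir()
    (d / "metadata.json").write_text(
        json.dumps({"function_name": fn_name, "execution_status": status}))
```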
Create a temporary reproducer:
python3 - <<'PY'
from pathlib import Path
Path("/tmp/sglang_llm_crash.py").write_text(
"import torch\\n"
"import torch.nn.functional as F\\n"
"from sglang.srt.utils.custom_op import register_custom_op\\n\\n"
"def _fake_embedding(indices, table):\\n"
" return torch.empty((*indices.shape, table.shape[-1]), device=table.device, dtype=table.dtype)\\n\\n"
"@register_custom_op(op_name='mock_llm_cuda_crash', fake_impl=_fake_embedding)\\n"
"def mock_llm_cuda_crash(indices, table):\\n"
" out = F.embedding(indices, table)\\n"
" torch.cuda.synchronize()\\n"
" return out\\n\\n"
"table = torch.randn(4, 8, device='cuda', dtype=torch.float16)\\n"
"indices = torch.tensor([0, 7], device='cuda', dtype=torch.long)\\n"
"mock_llm_cuda_crash(indices, table)\\n"
)
PY
SGLANG_KERNEL_API_LOGLEVEL=1 \
SGLANG_KERNEL_API_LOGDEST=/tmp/sglang_llm_level1.log \
python3 /tmp/sglang_llm_crash.py
What to expect:
The run aborts with a device-side assert (index 7 is out of bounds for the 4-row embedding table), but the level-1 log still records the kernel API call that triggered it.

Try the same example at level 3:
SGLANG_KERNEL_API_LOGLEVEL=3 \
SGLANG_KERNEL_API_LOGDEST=/tmp/sglang_llm_level3.log \
python3 /tmp/sglang_llm_crash.py
Now the log shows tensor metadata before the crash.
Try level 10:
SGLANG_KERNEL_API_LOGLEVEL=10 \
SGLANG_KERNEL_API_LOGDEST=/tmp/sglang_llm_level10.log \
SGLANG_KERNEL_API_DUMP_DIR=/tmp/sglang_llm_level10_dumps \
python3 /tmp/sglang_llm_crash.py
Now you should see:
- a dump directory for sglang.custom_op.mock_llm_cuda_crash
- inputs.pt, saved before execution
- metadata.json showing execution_status: "exception"
- no outputs.pt, because the kernel crashed before producing output

For real-model success-path level-10 dumps, it is often easier to temporarily disable CUDA graph and piecewise CUDA graph for the debug run.
Create a temporary diffusion-side reproducer:
python3 - <<'PY'
from pathlib import Path
Path("/tmp/sglang_diffusion_crash.py").write_text(
"import torch\\n"
"import torch.nn.functional as F\\n"
"from sglang.multimodal_gen.runtime.layers.utils import register_custom_op\\n\\n"
"def _fake_embedding(positions, cache):\\n"
" return torch.empty((*positions.shape, cache.shape[-1]), device=cache.device, dtype=cache.dtype)\\n\\n"
"@register_custom_op(op_name='mock_diffusion_cuda_crash', fake_impl=_fake_embedding)\\n"
"def mock_diffusion_cuda_crash(positions, cache):\\n"
" out = F.embedding(positions, cache)\\n"
" torch.cuda.synchronize()\\n"
" return out\\n\\n"
"cache = torch.randn(4, 64, device='cuda', dtype=torch.float16)\\n"
"positions = torch.tensor([0, 9], device='cuda', dtype=torch.long)\\n"
"mock_diffusion_cuda_crash(positions, cache)\\n"
)
PY
SGLANG_KERNEL_API_LOGLEVEL=1 \
SGLANG_KERNEL_API_LOGDEST=/tmp/sglang_diffusion_level1.log \
python3 /tmp/sglang_diffusion_crash.py
Try level 3:
SGLANG_KERNEL_API_LOGLEVEL=3 \
SGLANG_KERNEL_API_LOGDEST=/tmp/sglang_diffusion_level3.log \
python3 /tmp/sglang_diffusion_crash.py
Try level 10:
SGLANG_KERNEL_API_LOGLEVEL=10 \
SGLANG_KERNEL_API_LOGDEST=/tmp/sglang_diffusion_level10.log \
SGLANG_KERNEL_API_DUMP_DIR=/tmp/sglang_diffusion_level10_dumps \
python3 /tmp/sglang_diffusion_crash.py
If your local environment has unrelated FlashInfer import issues, resolve them in the shell before running the example. The example itself does not set any FLASHINFER_* environment variable.
When running with multiple GPUs or worker processes, use %i in the log path:
export SGLANG_KERNEL_API_LOGLEVEL=3
export SGLANG_KERNEL_API_LOGDEST=debug_rank_%i.log
torchrun --nproc_per_node=4 my_script.py
This creates separate logs such as:
- debug_rank_12345.log
- debug_rank_12346.log
- debug_rank_12347.log
- debug_rank_12348.log

Real multi-process example from a 2-GPU Qwen/Qwen2.5-0.5B-Instruct run:
/tmp/sglang_kernel_api_validation_multi/qwen_qwen2_5_0_5b_instruct_level3_950201.log
/tmp/sglang_kernel_api_validation_multi/qwen_qwen2_5_0_5b_instruct_level3_950349.log
/tmp/sglang_kernel_api_validation_multi/qwen_qwen2_5_0_5b_instruct_level3_950350.log
/tmp/sglang_kernel_api_validation_multi/qwen_qwen2_5_0_5b_instruct_level3_950351.log
You should usually do the same for level-10 dump directories:
export SGLANG_KERNEL_API_LOGLEVEL=10
export SGLANG_KERNEL_API_LOGDEST=debug_rank_%i.log
export SGLANG_KERNEL_API_DUMP_DIR=/tmp/sglang_kernel_api_dumps_%i
This avoids multiple ranks writing into the same dump directory tree.
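The expansion itself is simple; a sketch of the assumed behavior (%i replaced by the process ID, matching the per-process file names above):

```python
import os

def expand_log_dest(template: str) -> str:
    """Mimic the documented %i expansion: substitute the current process ID,
    so each rank/worker writes to its own file (a sketch, not SGLang's code)."""
    return template.replace("%i", str(os.getpid()))

print(expand_log_dest("debug_rank_%i.log"))
```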
If level 10 is too noisy, restrict dumps to specific APIs:
export SGLANG_KERNEL_API_LOGLEVEL=10
export SGLANG_KERNEL_API_LOGDEST=debug.log
export SGLANG_KERNEL_API_DUMP_DIR=/tmp/sglang_kernel_api_dumps
export SGLANG_KERNEL_API_DUMP_INCLUDE='sglang.custom_op.*'
export SGLANG_KERNEL_API_DUMP_EXCLUDE='*.fake_impl'
SGLANG_KERNEL_API_DUMP_INCLUDE and SGLANG_KERNEL_API_DUMP_EXCLUDE use shell-style wildcard matching.
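The matching semantics correspond to Python's fnmatch-style wildcards; here is a sketch of how include/exclude filtering composes (whether SGLang uses fnmatch internally is an assumption):

```python
from fnmatch import fnmatch

include = "sglang.custom_op.*"
exclude = "*.fake_impl"

def should_dump(api_name: str) -> bool:
    # Dump only names matched by the include pattern and not by the exclude pattern.
    return fnmatch(api_name, include) and not fnmatch(api_name, exclude)

print(should_dump("sglang.custom_op.fused_inplace_qknorm"))  # True
print(should_dump("RMSNorm.forward"))                        # False: not included
```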
Typical errors:
RuntimeError: CUDA error: an illegal memory access was encountered
torch.AcceleratorError: CUDA error: device-side assert triggered
Use:
export SGLANG_KERNEL_API_LOGLEVEL=3
Check the last logged API call before the crash: its argument shapes, dtypes, and devices usually reveal the mismatch.
Typical shape-mismatch pattern:
SGLang Kernel API Call: ...
arg[0]=Tensor(shape=(..., 128), ...) # ✅ expected dimension
arg[1]=Tensor(shape=(..., 64), ...) # ❌ mismatch
This often points to head-dim, hidden-dim, or cache-layout mismatch rather than a random CUDA failure.
Use:
export SGLANG_KERNEL_API_LOGLEVEL=5
Check:
- min
- max
- mean
- nan_count
- inf_count

Typical bad pattern:
Tensor(
...
min=-1234567.000000 # ❌ suspiciously large
max=9876543.000000 # ❌ suspiciously large
mean=nan # ❌ bad
nan_count=128 # ❌ found NaNs
inf_count=0 # ✅ no Infs here
)
This usually means the bad values were already present before the crashing kernel.
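The level-5 fields can be mimicked on the host to cross-check a suspect buffer; a pure-Python sketch over a list of floats (the real statistics are computed on tensors, and the exact NaN handling here is an assumption):

```python
import math

def tensor_stats(values):
    """Compute the same summary fields as level-5 logging: min, max, mean,
    nan_count, inf_count. Non-finite values are excluded from min/max/mean."""
    finite = [v for v in values if math.isfinite(v)]
    return {
        "min": min(finite) if finite else math.nan,
        "max": max(finite) if finite else math.nan,
        "mean": sum(finite) / len(finite) if finite else math.nan,
        "nan_count": sum(math.isnan(v) for v in values),
        "inf_count": sum(math.isinf(v) for v in values),
    }

stats = tensor_stats([1.0, -2.0, float("nan"), float("inf")])
```

A nonzero nan_count or inf_count in an *input* argument is the signal that the corruption happened upstream of the kernel you are looking at.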
Use:
export SGLANG_KERNEL_API_LOGLEVEL=3
Check the logged shapes against what the model configuration implies. Also check whether a supposedly per-token or per-frame tensor accidentally became full-sequence or full-image sized.
Typical bad pattern:
Tensor(
shape=(1024, 8192, 128, 128) # ❌ way too large
...
)
Suppose the failing API log looks like this:
[2026-03-19 00:47:30] SGLang Kernel API Call: RotaryEmbedding.forward
Positional input arguments:
arg[0]=Tensor(shape=(1, 8), dtype=torch.int64, ...)
arg[1]=Tensor(shape=(1, 8, 8, 256), dtype=torch.bfloat16, ...) # ✅ query
arg[2]=Tensor(shape=(1, 8, 4, 64), dtype=torch.bfloat16, ...) # ❌ key head_dim mismatch
What this tells you: the key tensor's head dimension (64) does not match the query's (256), so inconsistently packed heads reached the rotary embedding. That usually means the bug is in projection layout, head packing, or cache format rather than in the rotary kernel itself.
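That diagnosis can be turned into a quick triage check on the shapes pulled from the log (hypothetical helper, not part of SGLang):

```python
def check_qk_layout(q_shape, k_shape):
    """Flag the head_dim mismatch shown above: query and key may have
    different head counts (e.g. GQA), but must share the last dimension."""
    problems = []
    if q_shape[-1] != k_shape[-1]:
        problems.append(f"head_dim mismatch: q={q_shape[-1]} vs k={k_shape[-1]}")
    return problems

print(check_qk_layout((1, 8, 8, 256), (1, 8, 4, 64)))
# ['head_dim mismatch: q=256 vs k=64']
```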
For harder bugs, combine kernel API logging with CUDA memory checking:
export SGLANG_KERNEL_API_LOGLEVEL=3
export SGLANG_KERNEL_API_LOGDEST=debug.log
compute-sanitizer --tool memcheck python3 /tmp/sglang_llm_crash.py
Use debug.log to see the exact inputs that reached the crashing API boundary.
Typical compute-sanitizer output:
========= COMPUTE-SANITIZER
========= Invalid __global__ write of size 4 bytes
========= at 0x1234 in SomeKernel
========= by thread (256,0,0) in block (10,0,0)
========= Address 0x... is out of bounds
Use the sanitizer output to identify the failing kernel and use debug.log to identify the exact tensors that reached the API boundary right before it.
If you need more synchronous host-side error reporting, you can try CUDA_LAUNCH_BLOCKING=1 as a separate follow-up experiment. It is not part of the default workflow because it changes execution timing and can hide concurrency-related behavior.
For crashes that need a stack trace instead of only memory diagnostics:
export SGLANG_KERNEL_API_LOGLEVEL=3
export SGLANG_KERNEL_API_LOGDEST=debug.log
cuda-gdb --args python3 /tmp/sglang_llm_crash.py
Inside cuda-gdb:
(cuda-gdb) run
(cuda-gdb) where
Then correlate the backtrace with debug.log.
When you own the CUDA kernel, printf() is still useful for narrowing down bad indices, bad launch geometry, or broken state propagation.
Basic pattern:
__global__ void MyKernel(const float* input, float* output, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (threadIdx.x == 0 && blockIdx.x == 0) {
printf("n=%d input0=%f\n", n, input[0]);
}
if (idx < n) {
output[idx] = input[idx] * 2.0f;
}
}
After launch, force the output to flush:
my_kernel(...)
torch.cuda.synchronize()
For warp-specialized kernels, do not blindly print only on threadIdx.x == 0. Pick one representative thread per warp or per specialization group instead.
Problem:
threadIdx.x == 0 only prints from the first warp in the block.

Better pattern:
__global__ void WarpSpecializedKernel(...) {
// Example: first lane of each warp
if ((threadIdx.x % 32) == 0) {
printf("warp=%d\n", threadIdx.x / 32);
}
}
Or, if the kernel is organized in larger specialization groups, print once per group instead of once per block.
Common mistake:
// Only warp 0 prints
if (threadIdx.x == 0) {
printf("warp=%d\n", threadIdx.x / 32);
}
| Kernel Type | Print Condition | Notes |
|---|---|---|
| Simple kernel | threadIdx.x == 0 | One thread per block is usually enough |
| Warp-specialized kernel | one representative lane per warp | e.g. threadIdx.x % 32 == 0 |
| Group-specialized kernel | one representative lane per group | choose based on the kernel's scheduling layout |
assert(value >= 0.0f && "value must be non-negative");
static_assert(BLOCK_SIZE % 32 == 0, "BLOCK_SIZE must be warp aligned");
| Variable | Values | Description |
|---|---|---|
| SGLANG_KERNEL_API_LOGLEVEL | 0 | No logging (default) |
| | 1 | Function names only |
| | 3 | Inputs and outputs with metadata |
| | 5 | Level 3 plus tensor statistics |
| | 10 | Level 5 plus crash-safe tensor dumps |
| SGLANG_KERNEL_API_LOGDEST | stdout | Log to stdout |
| | stderr | Log to stderr |
| | <path> | Log to file |
| | log_%i.txt | %i expands to process ID |
| SGLANG_KERNEL_API_DUMP_DIR | <path> | Directory for level-10 dumps |
| SGLANG_KERNEL_API_DUMP_INCLUDE | wildcard list | Only dump matching API names |
| SGLANG_KERNEL_API_DUMP_EXCLUDE | wildcard list | Skip matching API names |
export SGLANG_KERNEL_API_LOGLEVEL=3
Level 3 is usually enough to catch wrong shapes, wrong dtypes, and wrong devices.
export SGLANG_KERNEL_API_LOGLEVEL=5
Use it when you suspect NaN or Inf values.
export SGLANG_KERNEL_API_LOGLEVEL=10
This is the most useful mode when the process crashes before you can inspect live tensors.
If you need successful input/output dumps from a real model run, temporarily disable CUDA graph for that debug session.
When level 10 is too noisy, pair it with SGLANG_KERNEL_API_DUMP_INCLUDE / SGLANG_KERNEL_API_DUMP_EXCLUDE instead of dumping every covered API.
export SGLANG_KERNEL_API_LOGDEST=crash.log
File logs are safer than stdout when the process aborts.
unset SGLANG_KERNEL_API_LOGLEVEL
When disabled, the decorator returns the original callable and adds no runtime logging overhead.
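That zero-overhead disable path corresponds to a pattern like the following sketch (not SGLang's actual code):

```python
import os

def debug_kernel_api(fn):
    """Sketch of the disable path: with logging off, the decorator is a
    no-op and hands back the original callable unchanged."""
    level = int(os.environ.get("SGLANG_KERNEL_API_LOGLEVEL", "0") or "0")
    if level == 0:
        return fn  # zero overhead: callers get the undecorated function
    def wrapper(*args, **kwargs):
        print(f"Kernel API Call: {fn.__qualname__}", flush=True)
        return fn(*args, **kwargs)
    return wrapper
```

Because the level is read at decoration time, toggling the variable requires restarting the process, which is consistent with setting it via export before launch.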
Check:
echo $SGLANG_KERNEL_API_LOGLEVEL
echo $SGLANG_KERNEL_API_LOGDEST

Reduce the level:
export SGLANG_KERNEL_API_LOGLEVEL=3
If you see:
statistics=[skipped: CUDA graph capture in progress]
That is expected. Level-5 statistics are intentionally skipped during CUDA graph capture to avoid synchronization side effects.
If you see:
Tensor dump skipped: CUDA graph capture in progress
That is also expected. Level-10 dumps require copying tensors to CPU, which is not allowed during CUDA graph capture.