Name: Opus Kernel Best Practice
Author: ROCm

Opus Kernel Best Practice | Skills Pool

hipcc my_kernel.cu -I<aiter_root>/csrc/include -D__HIPCC_RTC__ -std=c++20 -O3 --offload-arch=gfx950

HIP runtime	opus:: wrapper	LLVM builtin
`threadIdx.x`	`opus::thread_id_x()`	`__builtin_amdgcn_workitem_id_x()`
`blockIdx.x`	`opus::block_id_x()`	`__builtin_amdgcn_workgroup_id_x()`
`blockDim.x`	`opus::block_size_x()`	`__builtin_amdgcn_workgroup_size_x()`
`gridDim.x * blockDim.x`	`opus::grid_size_x()`	`__builtin_amdgcn_grid_size_x()`
`__syncthreads()`	`opus::sync_threads()`	`__builtin_amdgcn_s_barrier()`
`__all(pred)`	`opus::warp_all(pred)`	—

// my_kernel.cu
// One source file, two views: the device pass sees the full opus headers and
// the kernel definition; the host pass sees only a kernel declaration and the
// C-linkage launcher.
#ifdef __HIP_DEVICE_COMPILE__
// ── Device pass: include opus.hpp and define kernels ──
#include "opus/opus.hpp"

// The 256-thread launch bound matches the block(256) used by run_my_kernel in
// the host pass; keep the two in sync when changing either.
__global__ __launch_bounds__(256, 2)
void my_kernel(const float* src, float* dst, int n) {
    // ... opus layout, load, store, MMA, etc.
}

#else
// ── Host pass: minimal declarations + launcher only ──
#include "opus/hip_minimal.hpp"

__global__ void my_kernel(const float* src, float* dst, int n);  // declaration only

// C-linkage entry point (callable via ctypes): launches my_kernel over n
// elements with 256 threads per block, then waits for the device to finish.
//   d_src  device pointer to the input, passed to the kernel as const float*
//   d_dst  device pointer to the output, passed to the kernel as float*
//   n      element count; n <= 0 is a no-op
extern "C" void run_my_kernel(const void* d_src, void* d_dst, int n) {
    // An empty or negative count would otherwise yield a zero-sized grid (or a
    // negative value wrapped to a huge unsigned grid), which is not a valid
    // launch configuration.
    if (n <= 0) return;
    // Ceil-div written as (n - 1) / 256 + 1 so that n close to INT_MAX does
    // not overflow the way (n + 255) would.
    dim3 grid((n - 1) / 256 + 1), block(256);
    hipLaunchKernelGGL(my_kernel, grid, block, 0, 0,
                       (const float*)d_src, (float*)d_dst, n);
    hipDeviceSynchronize();
}
#endif

hipcc my_kernel.cu \
  -I<aiter_root>/csrc/include \
  -D__HIPCC_RTC__ \
  -std=c++20 -O3 -ffast-math \
  --offload-arch=gfx950 \
  -fPIC -shared -o my_kernel.so

// Direct LLVM AMDGPU builtins in place of the HIP runtime variables.
int tid = __builtin_amdgcn_workitem_id_x();     // threadIdx.x
int bid = __builtin_amdgcn_workgroup_id_x();     // blockIdx.x
int bsz = __builtin_amdgcn_workgroup_size_x();   // blockDim.x
__builtin_amdgcn_s_barrier();                     // __syncthreads()
// NOTE(review): HIP's __syncthreads() is believed to also issue workgroup-scope
// memory fences around the barrier; confirm the bare builtin is sufficient
// wherever shared/global data is exchanged across the barrier.

// SLOW: N unique lambda instantiations
// (each call receives a distinct compile-time index type I, read via I.value)
static_for<N>([&](auto I) {
    r[I.value] = load<vec>(offsets[I.value]);
});

// FAST: 1 instantiation, compiler unrolls identically
// Only applicable when the loop body does not need the index as a
// compile-time constant: here `i` is an ordinary runtime index_t.
for (index_t i = 0; i < N; i++) {
    r[i] = load<vec>(offsets[i]);
}

// SLOW: N unique coord_to_linear instantiations (one per multi-index combination)
static_ford(issue_space_vec, [&](auto... ids) {
    offsets[u_linear(ids...)] = u(ids...);
});

// FAST: 1 coord_to_linear instantiation (all iterations use tuple<index_t, ...>)
for (index_t i = 0; i < num_issues; i++) {
    offsets[i] = u(flat_to_coords(i, make_index_seq<ndim>{}, issue_space_vec));
}

// SLOW: y_shape_a() + reduce_tuple_mul evaluated in every operator()/step_k() overload
constexpr auto a_len = get<0>(reduce_tuple_mul(MMA::y_shape_a()));

// FAST: cached once as class member
static constexpr index_t mma_a_len = get<0>(reduce_tuple_mul(MMA::y_shape_a())).value;

// SLOW: 64-element pack expansion
return vector_return_type<D, decltype(cast<D>(get<Is>(s)))...>{cast<D>(get<Is>(s))...};

// FAST: single builtin call
return __builtin_convertvector(s, vector_t<D, size<S>()>);

// SLOW: N-element braced init
return make_vector(get<Is>(c)...);

// FAST: single shuffle (returns GCC-style vector, bit_cast to ext_vector_type)
using R = vector_t<scalar_type, sizeof...(Is)>;
return __builtin_bit_cast(R, __builtin_shufflevector(c, c, Is...));

// unfold_x_stride: instead of concat_tuple(per_group_results...)
// compute each element's stride directly via unfold_x_stride_at<J>()

// pickup_shape: instead of concat_tuple(conditional<match, tuple<T>, tuple<>>{}...)
// build a filtered index sequence, then make_tuple(get<filtered_indices>(Shape{})...)

// flatten_tuple: instead of concat_tuple(explode_tuple(get<Is>(t))...)
// directly index as get<local>(get<group>(t)) via flatten_at<T, J, GS>()

// SLOW: triggers recursive std::common_type<D, D, D, ..., D> with 64 types
return vector_return_type<void, decltype(cast<D>(get<Is>(s)))...>{...};

// FAST: D is already known, skip common_type entirely
return vector_return_type<D, decltype(cast<D>(get<Is>(s)))...>{...};

// reduce_tuple_mul for tuple<number<>...>: fold expression instead of recursive reduction
// Enabled only when every element type satisfies is_constant_v; the product of
// all Ns::value is computed in a single fold and returned as a one-element
// tuple<number<product>>.
// NOTE(review): an empty tuple<> satisfies the && fold in the constraint, but a
// unary fold over `*` with an empty pack is ill-formed — confirm tuple<> never
// reaches this overload.
template<typename... Ns, std::enable_if_t<(is_constant_v<Ns> && ...), bool> = true>
constexpr auto reduce_tuple_mul(const tuple<Ns...>&) { return tuple<number<(Ns::value * ...)>>{}; }

test_mfma.cu (3.9s) -> test_mfma_f16.cu (0.9s) + test_mfma_f32.cu (0.5s) + test_mfma_f8.cu (0.9s), compiled in parallel

hipcc kernel.cc --cuda-device-only -c -o /dev/null \
  -Xclang -ftime-trace=trace.json

import json


def top_instantiations(data, limit=20):
    """Return the `limit` longest InstantiateFunction events from a time trace.

    Args:
        data: Parsed trace JSON. Either a dict holding the event list under
            'traceEvents', or the bare event list itself.
        limit: Maximum number of entries to return.

    Returns:
        A list of (dur, detail) tuples sorted by descending duration. Events
        without a 'dur' field are skipped; a missing 'detail' becomes ''.
    """
    # The original `data.get('traceEvents', data)` failed on a bare list (no
    # .get) and iterated over string keys for a dict without the key, so the
    # two container shapes are handled explicitly here.
    events = data.get('traceEvents', []) if isinstance(data, dict) else data
    inst = [(e['dur'], e.get('args', {}).get('detail', ''))
            for e in events
            if e.get('name') == 'InstantiateFunction' and 'dur' in e]
    inst.sort(key=lambda x: -x[0])
    return inst[:limit]


if __name__ == '__main__':
    with open('trace.json') as f:
        data = json.load(f)
    # Durations are printed as dur/1000 with an "ms" suffix; names are cut to
    # 100 characters to keep one event per line.
    for dur, name in top_instantiations(data):
        print(f"{dur/1000:8.1f}ms  {name[:100]}")

Technique	Typical savings	Where applied
Separate device/host code (`__HIP_DEVICE_COMPILE__` guard)	~50% total	All `.cu`/`.hip` files — always do this first
Runtime `for` loops in load/store/MMA	30-60% frontend	`buffer_view::load/store`, `tiled_mma_adaptor::operator()`
Runtime `flat_to_coords`	40-50% frontend	`layout_to_offsets`
`__builtin_convertvector`	5-10% frontend	`cast` for vectors >16 elements
`__builtin_shufflevector`	3-5% frontend	`slice_impl` for vectors
Cache constexpr members	10-15% frontend	`layout_load_traits`, `mma_a/b/c_len`
Direct indexing (bypass concat_tuple)	5-10% frontend	`unfold_x_stride`, `pickup_shape`, `flatten_tuple`
`-D__HIPCC_RTC__`	~25% per-file	Compiler flags
`hipcc --genco`	~15% per-file	Python-launched kernels
Split large TU files	Better parallelism	Test suites, multi-kernel builds

Binding	Compile time
torch `CUDAExtension`	~21s
pybind11 + Ninja	~4.2s
ctypes (`extern "C"`, see Section 0)	~0.4s

Opus Kernel Best Practice

OPUS Kernel Compile-Time Best Practices

Required headers and include paths

Opus Kernel Best Practice

OPUS Kernel Compile-Time Best Practices

Required headers and include paths

0. Always Separate Device and Host Code (Most Important)

1. Minimize Header Overhead

Replace `<hip/hip_runtime.h>` with `opus/hip_minimal.hpp`

Use `-D__HIPCC_RTC__` to suppress implicit includes

Use ctypes instead of pybind11/torch extension for Python bindings

2. Reduce Template Instantiation Count

Use runtime loops instead of `static_for` where compile-time indices aren't needed

Use runtime `flat_to_coords` instead of compile-time multi-index decomposition

Cache constexpr computations in struct members

3. Use LLVM Builtins for Vector Operations

`__builtin_convertvector` for type conversion

`__builtin_shufflevector` for vector slice/concat

4. Avoid Intermediate Type Creation

Bypass `concat_tuple` with direct indexing

Specify return type explicitly to avoid `std::common_type`

Add fold-expression fast paths for common patterns

5. Parallel Compilation

Split device test files by template-instantiation cost

Use `hipcc --genco` for device-only compilation when launching from Python

Compile-Time Measurement

Use `-ftime-trace` for profiling

Key metrics to track

Summary Table

Pytorch Patterns

Regex Vs Llm Structured Text

Effect

Flags

WPF to WinUI 3 Migration Skill

At Dispatch V2

Opus Kernel Best Practice

OPUS Kernel Compile-Time Best Practices

Required headers and include paths

Opus Kernel Best Practice

OPUS Kernel Compile-Time Best Practices

Required headers and include paths

0. Always Separate Device and Host Code (Most Important)

1. Minimize Header Overhead

Replace <hip/hip_runtime.h> with opus/hip_minimal.hpp

Use -D__HIPCC_RTC__ to suppress implicit includes

Use ctypes instead of pybind11/torch extension for Python bindings

2. Reduce Template Instantiation Count

Use runtime loops instead of static_for where compile-time indices aren't needed

Use runtime flat_to_coords instead of compile-time multi-index decomposition

Cache constexpr computations in struct members

3. Use LLVM Builtins for Vector Operations

__builtin_convertvector for type conversion

__builtin_shufflevector for vector slice/concat

4. Avoid Intermediate Type Creation

Bypass concat_tuple with direct indexing

Specify return type explicitly to avoid std::common_type

Add fold-expression fast paths for common patterns

5. Parallel Compilation

Split device test files by template-instantiation cost

Use hipcc --genco for device-only compilation when launching from Python

Compile-Time Measurement

Use -ftime-trace for profiling

Key metrics to track

Summary Table

Pytorch Patterns

Regex Vs Llm Structured Text

Effect

Flags

WPF to WinUI 3 Migration Skill

At Dispatch V2

Replace `<hip/hip_runtime.h>` with `opus/hip_minimal.hpp`

Use `-D__HIPCC_RTC__` to suppress implicit includes

Use runtime loops instead of `static_for` where compile-time indices aren't needed

Use runtime `flat_to_coords` instead of compile-time multi-index decomposition

`__builtin_convertvector` for type conversion

`__builtin_shufflevector` for vector slice/concat

Bypass `concat_tuple` with direct indexing

Specify return type explicitly to avoid `std::common_type`

Use `hipcc --genco` for device-only compilation when launching from Python

Use `-ftime-trace` for profiling