Foundational Intel ESIMD GPU programming skill. Use this skill proactively whenever the user is writing, optimizing, or debugging any SYCL/ESIMD kernel for Intel GPUs — including Intel Arc, Iris Xe, or Data Center GPU Max. Covers kernel design, memory access patterns (block_load, gather, SLM), data types, vectorization, workgroup patterns, hardware characteristics, performance analysis, and troubleshooting. Trigger this even when the user does not explicitly say "ESIMD" — invoke it for any Intel GPU kernel development, performance bottleneck questions, or SYCL optimization tasks targeting Intel hardware.
Foundational guidance for writing, optimizing, and debugging Intel GPU ESIMD kernels — applicable to any kernel regardless of algorithm. Covers kernel design, all memory access patterns, data types, vectorization, workgroup patterns, hardware characteristics, performance analysis, and troubleshooting.
ESIMD (Explicit SIMD) is a C++ programming model for Intel GPU Execution Units (EUs). Each ESIMD work-item maps to one EU thread and operates on explicit simd<T, N> register vectors rather than the implicit SIMD of standard SYCL. This gives direct control over:
- Memory access patterns (block_load, gather, SLM)
- Hardware intrinsics (hmax, hmin, pack_mask, fbl, xmx::dpas)

Required headers and annotation:
#include <sycl/sycl.hpp>
#include <sycl/ext/intel/esimd.hpp>
// Kernel lambda must be annotated:
[=](sycl::id<1> idx) SYCL_ESIMD_KERNEL { ... }
// or equivalently:
[=](sycl::nd_item<1> item) [[intel::sycl_explicit_simd]] { ... }
Namespace: bring in ESIMD intrinsics with:
using namespace sycl::ext::intel::esimd;
| Resource | Spec |
|---|---|
| EUs (Execution Units) | 128–512 (model dependent) |
| Threads per EU | 8 |
| Total threads | EUs × 8 |
| GRF registers per thread (Xe1) | 128 × 32B = 4 KB |
| GRF registers per thread (Xe2+) | 256 × 32B = 8 KB |
| SIMD width per thread (Xe1) | 256-bit → 16 FP16 elements per instruction |
| SIMD width per thread (Xe2+) | 512-bit → 32 FP16 elements per instruction |
| SLM per sub-slice | 128 KB |
| Recommended SLM | < 64 KB |
Register file budget:
| Architecture | GRF per thread | FP16 capacity | FP32 capacity |
|---|---|---|---|
| Xe1 (Arc Alchemist, iGPU Gen12) | 128 × 32B = 4 KB | up to ~2048 elements | up to ~1024 elements |
| Xe2+ (Arc Battlemage, Lunar Lake+) | 256 × 32B = 8 KB | up to ~4096 elements | up to ~2048 elements |
SIMD instruction width:
| Architecture | SIMD width | FP16 elements/instruction |
|---|---|---|
| Xe1 (Arc Alchemist, iGPU Gen12) | 256-bit | 16 FP16 per instruction |
| Xe2+ (Arc Battlemage, Lunar Lake+) | 512-bit | 32 FP16 per instruction |
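The table implies a simple split rule for wider `simd` types; a quick host-side check of the arithmetic (plain C++ — `split_count` is an illustrative helper, not an ESIMD API):

```cpp
// Illustrative helper (not an ESIMD API): how many native-width instructions
// the compiler emits for a simd<half, N> operation on each architecture.
constexpr int kXe1HalfWidth = 16;  // 256-bit / 16-bit elements
constexpr int kXe2HalfWidth = 32;  // 512-bit / 16-bit elements

constexpr int split_count(int n, int native_width) {
    return (n + native_width - 1) / native_width;  // round up for odd sizes
}

static_assert(split_count(128, kXe1HalfWidth) == 8, "simd<half,128> on Xe1");
static_assert(split_count(128, kXe2HalfWidth) == 4, "simd<half,128> on Xe2+");
```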
For simd<half, N> with N greater than the native width, the compiler automatically splits the operation into multiple native-width instructions — no manual tiling needed. Prefer simd widths that are multiples of 16 (Xe1) or 32 (Xe2+). E.g., simd<half, 128> compiles to 8 instructions on Xe1, 4 on Xe2+.
Query the thread count at runtime (instead of hard-coding):
const int ts = (int)q.get_device()
.get_info<sycl::info::device::max_compute_units>() * 8;
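One way to use that count — a host-side sketch (the `rows_per_thread` helper is illustrative) that spreads rows across all hardware threads without dropping any:

```cpp
#include <cstddef>

// Illustrative sizing helper: given the queried device thread count (EUs * 8)
// and a total row count, how many rows each thread should process so that
// every thread has work and no trailing row is lost.
size_t rows_per_thread(size_t total_rows, size_t device_threads) {
    return (total_rows + device_threads - 1) / device_threads;  // ceil-divide
}
```

A dispatch could then use `range<1>(ts)` with each thread looping `rows_per_thread(total_rows, ts)` times.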
Apply in this order — each step can dwarf the next:
| Priority | Technique | Typical Impact |
|---|---|---|
| 0 | Kernel design (range dim, work/thread, template params) | 10–35× |
| 1 | Memory access (block_load, alignment, coalescing) | 2–5× |
| 2 | Data types (FP16 > FP32, uint4/int8 for weights) | 2–8× |
| 3 | Vectorization (hmax/hmin, SIMD ops, avoid scalar loops) | 1.5–2× |
| 4 | Parallelization (workgroup size, SLM reduction) | 10–30% |
| 5 | Register file vs SLM (~1 cycle vs ~30 cycles) | up to 2× |
| 6 | Algorithm complexity (O(N) vs O(K×N), unroll, hoist) | varies |
This is the single most impactful decision. A bad design can be 10–35× slower before any micro-optimization.
Use multi-dimensional range<D> so indices map directly to data dimensions. Never decode a 1D linear index into multi-dimensional coordinates at runtime.
// BAD: 1D range + manual 4D decode → 1% bandwidth
q.parallel_for(range<1>(bsz * heads * seq * blocks),
[=](id<1> idx) SYCL_ESIMD_KERNEL {
int d4 = idx[0] % blocks;
int d3 = (idx[0] / blocks) % seq;
int d2 = (idx[0] / (blocks * seq)) % heads;
int d1 = idx[0] / (blocks * seq * heads);
output[idx[0]] = process(input[idx[0]]); // 1 element/thread
});
// GOOD: 3D range → ~73% bandwidth (21.7× faster before any other change)
q.parallel_for(range<3>(bsz, heads, seq),
[=](id<3> idx) SYCL_ESIMD_KERNEL {
int b = idx[0], h = idx[1], s = idx[2];
// each thread processes many output elements for this (b, h, s)
});
Processing 1 element per thread creates massive scheduling overhead. Target 32–128+ elements per thread (or one complete row/column/tile).
// GOOD: thread processes 128 elements
constexpr int ELEMS = 128;
const int base = idx[0] * ELEMS;
simd<sycl::half, ELEMS> data = block_load<sycl::half, ELEMS>(input + base);
// ... process data ...
block_store<sycl::half, ELEMS>(output + base, data);
Runtime variables prevent loop unrolling and constant propagation. Make all array sizes and loop bounds constexpr or template parameters.
// BAD: runtime N prevents unrolling
void kernel(int N) { for (int i = 0; i < N; i++) ... }
// GOOD: compile-time N enables full unroll and SIMD vectorization
template<int N>
void kernel() {
#pragma unroll
for (int i = 0; i < N; i++) ...
}
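A standalone C++ analogue of the pattern (`dot` is a hypothetical helper): with N as a template parameter, the trip count is a compile-time constant the optimizer can fully unroll and vectorize — the same reason ESIMD kernels should take sizes as template parameters.

```cpp
// Hypothetical helper: the loop bound N is a template parameter, so it is a
// compile-time constant and the loop can be fully unrolled and vectorized.
template <int N>
float dot(const float* a, const float* b) {
    float acc = 0.f;
#pragma unroll
    for (int i = 0; i < N; i++)
        acc += a[i] * b[i];
    return acc;
}
```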
When computing multiple dot products or reductions, issue all block_loads before any compute. This lets the hardware pipeline memory fetches:
// BAD: each load stalls its own compute
for (int i = 0; i < K; i++) {
auto v = block_load<half, D>(ptr + i * D);
results[i] = detail::sum<half, half, D>(v * w);
}
// GOOD: all loads in-flight before compute begins (+29% measured)
simd<half, D> vecs[K];
for (int i = 0; i < K; i++) vecs[i] = block_load<half, D>(ptr + i * D);
for (int i = 0; i < K; i++) results[i] = detail::sum<half, half, D>(vecs[i] * w);
The primary memory primitive. Maps to one or a few hardware memory transactions.
// Load N elements starting at ptr (byte address must be 4-byte aligned by default — see alignment rules below)
simd<T, N> data = block_load<T, N>(ptr);
simd<T, N> data = block_load<T, N>(ptr + offset); // offset in elements
// Store
block_store<T, N>(ptr + offset, data);
Efficiency rules:
- Prefer large, contiguous transactions (block_load<half, 128> = 256 B, block_load<float, 64> = 256 B)

CRITICAL — address alignment: block_load and block_store require the byte address to be a multiple of 4 by default. For 4-byte types (float, int) this is automatically satisfied. For 2-byte types (half, bfloat16) an odd element index produces a non-4-byte-aligned byte address and gives wrong results. When you cannot guarantee the address is 4-byte aligned, you must pass an alignment<> property.
// ── half / bfloat16 (2 bytes each) ───────────────────────────────────────
// Element index 3 → byte address +6 → NOT a multiple of 4 → wrong results!
simd<half, 8> data = block_load<half, 8>(ptr + 3); // WRONG
simd<half, 8> data = block_load<half, 8>(ptr + 3, properties{alignment<2>}); // OK
// Safe rule: always use alignment<2> for half* / bfloat16* block_load/store
// unless you can statically prove the element offset is even (byte offset % 4 == 0).
block_store<half, 8>(ptr + 3, data, properties{alignment<2>}); // OK
// ── float / int (4 bytes each) ────────────────────────────────────────────
// 4-byte elements: any element index gives a 4-byte-aligned byte address → safe.
simd<float, 8> data = block_load<float, 8>(ptr + 3); // OK (byte addr = +12)
// ── uint8_t (1 byte each) ────────────────────────────────────────────────
// 1-byte elements: any byte address → always need alignment<1>.
simd<uint8_t, 8> data = block_load<uint8_t, 8>(ptr + 3, properties{alignment<1>});
Summary by type:
| Type | Size | Default required alignment | When to annotate |
|---|---|---|---|
| float, int, uint32_t | 4 B | 4 B | never needed (4 B × any index = multiple of 4) |
| half, bfloat16 | 2 B | 4 B | whenever the element offset may be odd |
| uint8_t, int8_t | 1 B | 4 B | always (alignment<1>) |
Symptom of missing annotation: incorrect values loaded/stored, sporadic failures depending on offset parity, no compile error.
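The table reduces to a one-line predicate; a host-side check (`default_alignment_ok` is an illustrative helper, and it assumes the base pointer itself is at least 4-byte aligned):

```cpp
#include <cstddef>

// Illustrative predicate: does the default 4-byte address requirement of
// block_load/block_store hold for this element offset and element size?
// Assumes the base pointer itself is at least 4-byte aligned.
constexpr bool default_alignment_ok(size_t elem_offset, size_t elem_size) {
    return (elem_offset * elem_size) % 4 == 0;
}

static_assert( default_alignment_ok(3, sizeof(float)));  // float +3 elems = +12 B: fine
static_assert(!default_alignment_ok(3, 2));              // half at odd offset = +6 B: needs alignment<2>
static_assert( default_alignment_ok(2, 2));              // half at even offset = +4 B: fine
static_assert(!default_alignment_ok(3, 1));              // uint8_t: needs alignment<1>
```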
Use when elements are at irregular or strided offsets — e.g., sliding windows, strided output.
// Gather: load N elements from per-lane byte offsets
simd<T, N> data = gather<T, N>(base_ptr, simd<uint32_t, N>(byte_offsets));
// Scatter: store N elements to per-lane byte offsets
scatter<T, N>(base_ptr, simd<uint32_t, N>(byte_offsets), data);
Key rules:
- Offsets are in bytes: byte_offsets = elem_offsets * sizeof(T)
- The offset vector is simd<uint32_t, N> — unsigned 32-bit
- Negative offsets cast to uint32_t wrap to huge values → clamp before the cast
- Slower than block_load for contiguous data, much faster for strided patterns

Batch gather pattern (process 32 outputs in parallel):
constexpr int BS = 32;
simd<int, BS> rel = simd<int, BS>(0, 1) * STRIDE; // [0, S, 2S, ..., 31S]
for (int out = 0; out < num_outputs; out += BS) {
// Clamp offsets to valid range (branchless — safe for idempotent ops like max)
simd<int, BS> offs = max(out * STRIDE + rel + OFFSET, 0);
offs = min(offs, max_elem);
simd<T, BS> vals = gather<T, BS>(
input + base, simd<uint32_t, BS>(offs) * sizeof(T));
// ... process vals ...
int n = std::min(BS, num_outputs - out);
if (n == BS) block_store<T, BS>(output + out_base + out, result);
else for (int i = 0; i < n; i++) output[out_base + out + i] = result[i];
}
Performance reference: gather achieved 72.8% bandwidth on a strided sliding-window pooling kernel (82.9× total speedup over naive scalar 1D design).
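The clamp-before-cast rule as a testable host-side sketch (`safe_byte_offset` is an illustrative helper):

```cpp
#include <cstdint>
#include <algorithm>

// Illustrative helper: clamp a possibly-negative element offset in SIGNED
// arithmetic first, then convert to the unsigned byte offset gather expects.
// Casting a negative int straight to uint32_t wraps to a huge value instead.
uint32_t safe_byte_offset(int elem_offset, int max_elem, int elem_size) {
    int clamped = std::min(std::max(elem_offset, 0), max_elem);
    return static_cast<uint32_t>(clamped) * static_cast<uint32_t>(elem_size);
}
```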
Use for inter-thread communication within a workgroup. Requires nd_range and barrier().
// Initialize SLM (compile-time size, inside kernel)
constexpr int SLM_BYTES = GROUP_SIZE * sizeof(float);
slm_init<SLM_BYTES>();
// Per-thread write (byte offset)
slm_block_store<float, 1>(local_id * sizeof(float), simd<float, 1>(partial));
// Synchronize all threads in workgroup
barrier();
// Read all GROUP_SIZE values (thread 0 only, or all threads for broadcast)
simd<float, GROUP_SIZE> parts = slm_block_load<float, GROUP_SIZE>(0);
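A host-side C++ model of what that store/barrier/load sequence computes — each vector slot stands in for one thread's SLM slot (pure simulation, no SYCL required):

```cpp
#include <vector>
#include <cstddef>

// Simulates the SLM reduction above: every "thread" writes its partial to
// its own slot (the slm_block_store), the barrier orders writes before
// reads, then "thread 0" reads all GROUP_SIZE slots and sums them.
float workgroup_reduce(const std::vector<float>& per_thread_partial) {
    std::vector<float> slm(per_thread_partial.size());   // the SLM buffer
    for (size_t tid = 0; tid < slm.size(); ++tid)
        slm[tid] = per_thread_partial[tid];              // per-thread store
    // barrier(): all writes visible before any read
    float total = 0.f;
    for (float p : slm) total += p;                      // thread 0 reads all
    return total;
}
```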
SLM limits and cost:
- Capacity: 128 KB per sub-slice; keep usage under 64 KB (see hardware table above)
- Latency: ~30 cycles vs ~1 cycle for the register file
- slm_init must be called inside the kernel, unconditionally
- barrier() must be reached by all threads in the workgroup — never inside a branch that some threads skip

Use FP16 for: weights, activations, intermediate results in neural network ops, and any case where error < 0.1 is acceptable.
Use FP32 for: accumulators summing >1000 terms (use simd<float, N> acc even when inputs are FP16), numerical stability-critical paths, softmax exponents.
// Pattern: FP16 inputs → FP32 accumulation → FP16 output
simd<sycl::half, 128> a = block_load<sycl::half, 128>(ptr_a);
simd<sycl::half, 128> b = block_load<sycl::half, 128>(ptr_b);
simd<float, 128> acc(0.f);
acc += convert<float>(a) * convert<float>(b);
float result = sycl::ext::intel::esimd::detail::sum<float, float, 128>(acc);
block_store<sycl::half, 1>(out, simd<sycl::half, 1>(sycl::half(result)));
// Element-wise ops (all broadcast scalars automatically)
simd<T, N> c = a + b;
simd<T, N> c = a * b + scalar; // FMA when T=float
simd_mask<N> m = a > b;
simd<T, N> r = merge(a, b, m); // r[i] = m[i] ? a[i] : b[i]
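The accumulator-width advice can be demonstrated one precision level up, with float standing in for FP16 and double for FP32 (illustrative helpers — same failure mode: once the accumulator is large, each term rounds to a multiple of the accumulator's ulp and the narrow sum drifts):

```cpp
// Illustrative demo of narrow-accumulator drift: float plays the FP16 role,
// double plays the FP32 role. Summing 0.1f ten million times, the narrow
// accumulator drifts far from the true value while the wide one stays close.
float sum_narrow(int n, float term) {
    float acc = 0.f;                          // narrow accumulator
    for (int i = 0; i < n; i++) acc += term;
    return acc;
}

double sum_wide(int n, float term) {
    double acc = 0.0;                         // wide accumulator
    for (int i = 0; i < n; i++) acc += term;
    return acc;
}
```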
Intel GPU hardware natively supports multiplying a scalar by a simd vector — the scalar is broadcast implicitly by the FMA unit. No replicate_w or simd<T,N>(scalar) constructor needed.
// GOOD: extract scalar from simd, multiply directly — one instruction
half k_val = k_tile[tt * SUB_HD + ii]; // simd_view → half (implicit conversion)
acc[ii] += k_val * v_row; // scalar * simd<half, N> → simd<half, N>
// BAD: explicit broadcast via replicate_w — wastes a GRF shuffle instruction
simd<half, N> k_bcast = k_tile.template replicate_w<N, 1>(tt * SUB_HD + ii);
acc[ii] += k_bcast * v_row;
Why: replicate_w<N, 1>(i) emits a hardware GRF shuffle (region select) to fill an N-wide register. When used only as a multiplier, the GPU's scalar-broadcast path in the EU instruction set handles this for free inside the FMA — eliminating the shuffle saves one instruction per (tt, ii) pair.
Extraction note: simd<T,N>::operator[] returns a simd_view, which implicitly converts to T in arithmetic contexts. The result of half k_val = simd_vec[i] is a true scalar half register.
// Horizontal sum — USE detail::sum, NOT reduce<T>(vec, std::plus<T>())
// WHY: with `using namespace sycl::ext::intel::esimd`, the name `reduce` is
// ambiguous between esimd::reduce and C++17 std::reduce (iterator overload).
// The compiler silently picks std::reduce, which returns 0 — wrong results
// with no compile error.
float s = sycl::ext::intel::esimd::detail::sum<float, float, N>(vec);