Name: Exp Simd Vectorization
Author: dotnet

Operation	API
Sum	`TensorPrimitives.Sum(span)`
Sum of squares	`TensorPrimitives.SumOfSquares(span)`
Sum of magnitudes (L1 norm)	`TensorPrimitives.SumOfMagnitudes(span)`
L2 norm	`TensorPrimitives.Norm(span)`
Product of all elements	`TensorPrimitives.Product(span)`
Min value	`TensorPrimitives.Min(span)`
Max value	`TensorPrimitives.Max(span)`
Index of max	`TensorPrimitives.IndexOfMax(span)`
Index of min	`TensorPrimitives.IndexOfMin(span)`
Dot product	`TensorPrimitives.Dot(a, b)`
Cosine similarity	`TensorPrimitives.CosineSimilarity(a, b)`
Euclidean distance	`TensorPrimitives.Distance(a, b)`

Operation	API
Negate	`TensorPrimitives.Negate(src, dst)`
Abs	`TensorPrimitives.Abs(src, dst)`
Sqrt	`TensorPrimitives.Sqrt(src, dst)`
Exp	`TensorPrimitives.Exp(src, dst)`
Log	`TensorPrimitives.Log(src, dst)`
Log2	`TensorPrimitives.Log2(src, dst)`
Tanh	`TensorPrimitives.Tanh(src, dst)`
Sigmoid	`TensorPrimitives.Sigmoid(src, dst)`
SoftMax	`TensorPrimitives.SoftMax(src, dst)`
Sinh	`TensorPrimitives.Sinh(src, dst)`
Cosh	`TensorPrimitives.Cosh(src, dst)`
Round	`TensorPrimitives.Round(src, dst)`
Floor	`TensorPrimitives.Floor(src, dst)`
Ceiling	`TensorPrimitives.Ceiling(src, dst)`
CopySign	`TensorPrimitives.CopySign(src, sign, dst)`
Pow	`TensorPrimitives.Pow(bases, exponents, dst)`

Operation	API
Add	`TensorPrimitives.Add(a, b, dst)`
Subtract	`TensorPrimitives.Subtract(a, b, dst)`
Multiply	`TensorPrimitives.Multiply(a, b, dst)`
Divide	`TensorPrimitives.Divide(a, b, dst)`
Element-wise Min	`TensorPrimitives.Min(a, b, dst)`
Element-wise Max	`TensorPrimitives.Max(a, b, dst)`

using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;

// Grab a ref to the first element once; every vector load below is an
// element offset from this ref.
ref var src = ref MemoryMarshal.GetReference(span);
uint length = (uint)span.Length;
uint i = 0;

// Exactly one vector tier runs: the first width in 512 -> 256 -> 128 order
// that is both hardware accelerated and supported for T.
if (Vector512.IsHardwareAccelerated && Vector512<T>.IsSupported)
{
    uint step = (uint)Vector512<T>.Count;
    for (; i + step <= length; i += step)
    {
        var vec = Vector512.LoadUnsafe(ref src, i);
        // ... process vec ...
    }
}
else if (Vector256.IsHardwareAccelerated && Vector256<T>.IsSupported)
{
    uint step = (uint)Vector256<T>.Count;
    for (; i + step <= length; i += step)
    {
        var vec = Vector256.LoadUnsafe(ref src, i);
        // ... process vec ...
    }
}
else if (Vector128.IsHardwareAccelerated && Vector128<T>.IsSupported)
{
    uint step = (uint)Vector128<T>.Count;
    for (; i + step <= length; i += step)
    {
        var vec = Vector128.LoadUnsafe(ref src, i);
        // ... process vec ...
    }
}

// Scalar tail: handles whatever the vector loop left over, and is the only
// loop that runs when the input is shorter than one vector.
for (; i < length; i++)
{
    // ... scalar processing ...
}

// Range check lo <= b <= hi for every byte lane with a single unsigned compare.
var vLo = Vector128.Create((byte)lo);
var vRange = Vector128.Create((byte)(hi - lo));
// (b - lo) > range means out-of-range: bytes below lo wrap around to large
// unsigned values, so one comparison catches both b < lo and b > hi.
var shifted = Vector128.Subtract(vec, vLo);
// Comparison result is a per-lane mask: all bits set where the lane is in range.
var inRange = Vector128.LessThanOrEqual(shifted, vRange);
// Vector128.All takes (vector, value); the mask form is AllWhereAllBitsSet.
if (!Vector128.AllWhereAllBitsSet(inRange)) return false; // for validation
// or: count += Vector128.CountWhereAllBitsSet(inRange); // for counting

// Two 16-entry tables, one indexed by the low nibble of each byte and one by
// the high nibble. A byte matches when the two looked-up bit patterns overlap.
var lowNibbleTable = Vector128.Create(/* 16 bytes: bit pattern for low nibble match */);
var highNibbleTable = Vector128.Create(/* 16 bytes: bit pattern for high nibble match */);
var lowFourBits = Vector128.Create((byte)0x0F);

// Split every byte into its two nibbles. The shift is done on 16-bit lanes,
// so bits from the neighbouring byte leak in and must be masked off.
var lowIndex = vec & lowFourBits;
var highIndex = Vector128.ShiftRightLogical(vec.AsUInt16(), 4).AsByte() & lowFourBits;

// Per-lane table lookups, combined: non-zero means both nibbles agree on a class.
var classBits = Vector128.Shuffle(lowNibbleTable, lowIndex)
              & Vector128.Shuffle(highNibbleTable, highIndex);

// Invert the "== 0" mask to get an all-bits-set lane per match, then count them.
var isMatch = ~Vector128.Equals(classBits, Vector128<byte>.Zero);
count += Vector128.CountWhereAllBitsSet(isMatch);

// Widen: byte → 16-bit → 32-bit → float. Widening unsigned bytes yields
// ushort and then uint lanes; the uint lanes are reinterpreted as int
// before the float conversion.
Vector128<byte> bytes = Vector128.LoadUnsafe(ref src, offset);
(Vector128<ushort> lo16, Vector128<ushort> hi16) = Vector128.Widen(bytes);
(Vector128<uint> lo32a, Vector128<uint> lo32b) = Vector128.Widen(lo16);
Vector128<int> lo32aSigned = lo32a.AsInt32();
Vector128<float> f0 = Vector128.ConvertToSingle(lo32aSigned);

// Narrow: int → short → byte (with saturation via Min/Max clamping).
// Shown here is the final short → byte step. Vector128.Narrow truncates
// instead of saturating, so BOTH inputs must be clamped to [0, 255] first.
// NOTE(review): assumes nextVec is a Vector128<short> like vec — confirm at the call site.
var byteMin = Vector128<short>.Zero;
var byteMax = Vector128.Create((short)255);
var clamped = Vector128.Min(Vector128.Max(vec, byteMin), byteMax);
var clampedNext = Vector128.Min(Vector128.Max(nextVec, byteMin), byteMax);
var narrowed = Vector128.Narrow(clamped.AsUInt16(), clampedNext.AsUInt16());

Exp Simd Vectorization | Skills Pool

Operation	API
(x+y)*z	`TensorPrimitives.AddMultiply(x, y, z, dst)`
x*y+z	`TensorPrimitives.MultiplyAdd(x, y, z, dst)`
fma(x,y,z)	`TensorPrimitives.FusedMultiplyAdd(x, y, z, dst)`

Exp Simd Vectorization

Exp Simd Vectorization

SIMD Vectorization

Decision Gate

TensorPrimitives API Reference

Reductions (span → scalar)

Element-wise transforms (span → span)

Two-span operations (a, b → dst)

Three-span fused operations

Manual SIMD with Vector128/Vector256/Vector512

Required imports

Three-tier dispatch pattern

Core SIMD operations

Pattern: Unsigned range check (byte-range validation)

Pattern: Nibble-lookup counting (character classes, popcount, etc.)

Pattern: Cross-type conversion (widening chains)

Trailing elements

Key Rules

Pytorch Patterns

Regex Vs Llm Structured Text

Effect

Flags

WPF to WinUI 3 Migration Skill

At Dispatch V2