DoxigAlpha

suggestVectorLengthForCpu

Function parameters

Parameters

#
T:type
cpu:std.Target.Cpu

Functions in this namespace

Functions

#
suggestVectorLength
Suggests a target-dependent vector length for a given type, or null if scalars are recommended.
VectorIndex
Returns the smallest type of unsigned ints capable of indexing any element within the given vector type.
VectorCount
Returns the smallest type of unsigned ints capable of holding the length of the given vector type.
iota
Returns a vector containing the first `len` integers in order from 0 to `len`-1.
repeat
Returns a vector containing the same elements as the input, but repeated until the desired length is reached.
join
Returns a vector containing all elements of the first vector at the lower indices followed by all elements of the second vector
interlace
Returns a vector whose elements alternate between those of each input vector.
deinterlace
The contents of `interlaced` are evenly split between vec_count vectors that are returned as an array.
mergeShift
Joins two vectors, shifts them leftwards (towards lower indices) and extracts the leftmost elements into a vector the length of a and b.
shiftElementsRight
Elements are shifted rightwards (towards higher indices).
shiftElementsLeft
Elements are shifted leftwards (towards lower indices).
rotateElementsLeft
Elements are rotated leftwards (towards lower indices); elements that leave on the left reappear on the right.
rotateElementsRight
Elements are rotated rightwards (towards higher indices); elements that leave on the right reappear on the left.
prefixScanWithFunc
Same as prefixScan, but with a user-provided, mathematically associative function.
prefixScan
Returns a vector whose elements are the result of performing the specified operation on the corresponding

Source

Implementation

#
/// Suggests a vector length (number of elements of type `T`) for the given `cpu`,
/// or returns `null` when no recognized SIMD feature is present or when a single
/// element would already be at least as wide as the suggested vector bit width.
///
/// The element width used for the computation is `@bitSizeOf(T)` rounded up to a
/// power of two, with a minimum of 8 bits. The vector bit width is a per-architecture
/// heuristic selected from the CPU feature set.
///
/// Both parameters are comptime, and the result is a comptime-known integer.
pub fn suggestVectorLengthForCpu(comptime T: type, comptime cpu: std.Target.Cpu) ?comptime_int {
    @setEvalBranchQuota(2_000);

    // This is guesswork; if you have better suggestions, add them or edit the current values here.
    const element_bit_size = @max(8, std.math.ceilPowerOfTwo(u16, @bitSizeOf(T)) catch unreachable);
    // `u32` rather than `u16`: the largest RISC-V `zvl*` width below (65536) does not fit in 16 bits.
    const vector_bit_size: u32 = blk: {
        if (cpu.arch.isX86()) {
            // Bools with mask registers preferred: return the element count directly,
            // bypassing the bit-width division at the end of the function.
            if (T == bool and cpu.has(.x86, .prefer_mask_registers)) return 64;
            // 512-bit vectors are not suggested when compiling with the stage2_x86_64 backend.
            if (builtin.zig_backend != .stage2_x86_64 and cpu.has(.x86, .avx512f) and !cpu.hasAny(.x86, &.{ .prefer_256_bit, .prefer_128_bit })) break :blk 512;
            if (cpu.hasAny(.x86, &.{ .prefer_256_bit, .avx2 }) and !cpu.has(.x86, .prefer_128_bit)) break :blk 256;
            if (cpu.has(.x86, .sse)) break :blk 128;
            if (cpu.hasAny(.x86, &.{ .mmx, .@"3dnow" })) break :blk 64;
        } else if (cpu.arch.isArm()) {
            if (cpu.has(.arm, .neon)) break :blk 128;
        } else if (cpu.arch.isAARCH64()) {
            // SVE allows up to 2048 bits in the specification; as of 2022 the most powerful machine
            // has implemented 512-bit. It seems safer to stay on 128 until wider vectors are more common.
            // TODO: Revisit this value when bigger widths are more common
            if (cpu.has(.aarch64, .sve)) break :blk 128;
            if (cpu.has(.aarch64, .neon)) break :blk 128;
        } else if (cpu.arch.isPowerPC()) {
            if (cpu.has(.powerpc, .altivec)) break :blk 128;
        } else if (cpu.arch.isMIPS()) {
            if (cpu.has(.mips, .msa)) break :blk 128;
            // TODO: Test MIPS capability to handle bigger vectors
            //       In theory MDMX and by extension mips3d have 32 registers of 64 bits which can be used
            //       in parallel for multiple processing, but it is unclear what is optimal here: using
            //       the 2048 bits, using just 64 per vector, or something in between
            if (cpu.has(.mips, .mips3d)) break :blk 64;
        } else if (cpu.arch.isRISCV()) {
            // In RISC-V, vector registers are length agnostic so there's no good way to determine the best size.
            // The usual vector length in most RISC-V cpus is 256 bits, however it can get to multiple kB.
            if (cpu.has(.riscv, .v)) {
                // Ordered from widest to narrowest so the first matching `zvl*` feature wins.
                inline for (.{
                    .{ .zvl65536b, 65536 },
                    .{ .zvl32768b, 32768 },
                    .{ .zvl16384b, 16384 },
                    .{ .zvl8192b, 8192 },
                    .{ .zvl4096b, 4096 },
                    .{ .zvl2048b, 2048 },
                    .{ .zvl1024b, 1024 },
                    .{ .zvl512b, 512 },
                    .{ .zvl256b, 256 },
                    .{ .zvl128b, 128 },
                    .{ .zvl64b, 64 },
                    .{ .zvl32b, 32 },
                }) |mapping| {
                    if (cpu.has(.riscv, mapping[0])) break :blk mapping[1];
                }

                // `v` is present but no `zvl*` feature matched: fall back to the usual 256 bits.
                break :blk 256;
            }
        } else if (cpu.arch.isSPARC()) {
            // TODO: Test Sparc capability to handle bigger vectors
            //       In theory Sparc has 32 registers of 64 bits which can be used in parallel
            //       for multiple processing, but it is unclear what is optimal here: using
            //       the 2048 bits, using just 64 per vector, or something in between
            if (cpu.hasAny(.sparc, &.{ .vis, .vis2, .vis3 })) break :blk 64;
        } else if (cpu.arch.isWasm()) {
            if (cpu.has(.wasm, .simd128)) break :blk 128;
        }
        // No recognized SIMD feature for this CPU: recommend scalars.
        return null;
    };
    // A vector that cannot hold more than one element is not worth suggesting.
    if (vector_bit_size <= element_bit_size) return null;

    return @divExact(vector_bit_size, element_bit_size);
}