All Intrinsics

Complete reference for every built-in function in Ea.

Memory

load

Load a vector from a pointer at a byte offset. The return type is inferred from context.

let v: f32x8 = load(ptr, i);

Typed Scalar Loads

Load a single scalar value from a pointer at a byte offset.

IntrinsicReturn Type
load_f32(ptr, i)f32
load_f64(ptr, i)f64
load_i32(ptr, i)i32
load_i16(ptr, i)i16
load_i8(ptr, i)i8
load_u8(ptr, i)u8
load_u16(ptr, i)u16
load_u32(ptr, i)u32
load_u64(ptr, i)u64
let x: f32 = load_f32(ptr, i);

Typed Vector Loads

Load a full vector from a pointer at a byte offset.

IntrinsicReturn Type
load_f32x4(ptr, i)f32x4
load_f32x8(ptr, i)f32x8
load_f32x16(ptr, i)f32x16
load_i32x4(ptr, i)i32x4
load_i32x8(ptr, i)i32x8
load_i16x8(ptr, i)i16x8
load_i8x16(ptr, i)i8x16
load_u8x16(ptr, i)u8x16
load_u8x32(ptr, i)u8x32
let v: f32x8 = load_f32x8(data, i * 32);

store

Write a vector to a pointer at a byte offset.

store(out, i, result);

stream_store

Non-temporal store that bypasses the CPU cache. Use for write-only output that will not be read back soon.

stream_store(out, i, result);

load_masked

Masked vector load. Lanes where the mask is false are not loaded.

let v: f32x8 = load_masked(ptr, i, mask);

store_masked

Masked vector store. Only lanes where the mask is true are written.

store_masked(out, i, value, mask);

gather

Load elements from scattered memory addresses using an index vector. x86 only -- not available on ARM.

let v: f32x8 = gather(ptr, indices);

scatter

Store elements to scattered memory addresses using an index vector. AVX-512 only (--avx512 flag required).

scatter(ptr, indices, values);

prefetch

Issue a prefetch hint to bring data into cache.

prefetch(ptr, i);

Math

sqrt

Square root. Works on scalar f32/f64 and all float vector types.

let y: f32 = sqrt(x);
let v: f32x8 = sqrt(vec);

rsqrt

Reciprocal square root (approximate). Scalar f32 and f32 vector types.

let y: f32 = rsqrt(x);
let v: f32x8 = rsqrt(vec);

exp

Exponential function. Float types.

let y: f32 = exp(x);

fma

Fused multiply-add: computes a * b + c in a single operation with one rounding step. Works on scalar f32 and all float vector types.

let y: f32 = fma(a, b, c);
let v: f32x8 = fma(va, vb, vc);

min

Element-wise minimum. Works on scalar (i32, f32, f64) and vector types.

let m: i32 = min(a, b);
let v: f32x8 = min(va, vb);

max

Element-wise maximum. Works on scalar (i32, f32, f64) and vector types.

let m: i32 = max(a, b);
let v: f32x8 = max(va, vb);

Reduction

Reduce a vector to a single scalar value.

reduce_add

Sum all lanes.

let sum: f32 = reduce_add(v);   // f32x8 -> f32
let sum: i32 = reduce_add(iv);  // i32x8 -> i32

reduce_max

Maximum across all lanes.

let m: f32 = reduce_max(v);

reduce_min

Minimum across all lanes.

let m: f32 = reduce_min(v);

reduce_add_fast

Unordered float reduction. Faster than reduce_add but does not guarantee summation order, so results may differ slightly due to floating-point rounding. Float vectors only.

let sum: f32 = reduce_add_fast(v);

Vector

splat

Broadcast a scalar value to all lanes of a vector. The vector type is inferred from context.

let v: f32x8 = splat(1.0);

shuffle

Reorder lanes of a vector according to an index tuple. The indices are compile-time constants.

let reversed: f32x4 = shuffle(v, (3, 2, 1, 0));

select

Per-lane conditional select. Where the mask is true, take from a; where false, take from b.

let result: f32x8 = select(mask, a, b);

movemask

Extract a comparison result bitmask from a boolean vector to a scalar i32. Each bit corresponds to the sign bit of one lane. x86 only -- not available on ARM.

let bits: i32 = movemask(cmp_result);

Conversion

Scalar Casts

IntrinsicDescription
to_f32(x)Convert to f32
to_f64(x)Convert to f64
to_i32(x)Convert to i32
to_i64(x)Convert to i64
let f: f32 = to_f32(i);
let n: i32 = to_i32(x);

Widening Conversions

Widen narrow integer lanes to wider float or integer lanes. Only the first N lanes of the input are consumed.

IntrinsicInputOutput
widen_i8_f32x4(v)i8x16f32x4
widen_u8_f32x4(v)u8x16f32x4
widen_i8_f32x8(v)i8x16f32x8
widen_u8_f32x8(v)u8x16f32x8
widen_u8_i32x4(v)u8x16i32x4
widen_u8_i32x8(v)u8x16i32x8
let pixels: f32x8 = widen_u8_f32x8(raw_bytes);

Narrowing Conversions

Convert wider lanes to narrower lanes, with clamping and rounding.

IntrinsicInputOutput
narrow_f32x4_i8(v)f32x4i8 (4 bytes)
let packed = narrow_f32x4_i8(float_pixels);

Multiply-Add Byte Pairs

Multiply unsigned bytes by signed bytes and add adjacent pairs. x86 only.

IntrinsicSignatureDescription
maddubs_i16(a, b)(u8x16, i8x16) -> i16x8Multiply and add adjacent pairs to 16-bit
maddubs_i32(a, b)(u8x16, i8x16) -> i32x4Multiply and add adjacent quads to 32-bit
let products: i16x8 = maddubs_i16(unsigned_bytes, signed_weights);

Debug

println

Print a value to stdout. Accepts scalars (i32, i64, u8, u16, u32, u64, f32, f64, bool), string literals, and vector types. Lowers to C printf. No format strings.

println(42);
println(3.14);
println("hello");
println(my_vector);