Skip to content

Instantly share code, notes, and snippets.

@jakubtomsu
Created April 10, 2026 15:14
Show Gist options
  • Select an option

  • Save jakubtomsu/08212f9216935db0354b1e08f87b3814 to your computer and use it in GitHub Desktop.

Select an option

Save jakubtomsu/08212f9216935db0354b1e08f87b3814 to your computer and use it in GitHub Desktop.
Making core:math/linalg a bit faster
/*
Odin core:math/linalg benchmarking code
Primarily optimizes 3xf32 math in -o:speed mode!
However I spent a lot of effort ensuring the code doesn't get significantly slower
in non-optimized builds.
The results can looks something like this (-o:speed, amd64 skylake)
NAME | Cycles/Elem | Total Cycles | Ms | Speedup
cross_linalg | 24 (22 ..149) | 204240 (186948 ..1223189) | 6 | 1.000x
cross_new | 6 ( 6 .. 11) | 52435 ( 49435 .. 93333) | 1 | 3.895x
dot_linalg | 22 (20 ..114) | 183991 (170855 .. 937097) | 6 | 1.000x
dot_new | 4 ( 4 .. 5) | 33884 ( 33674 .. 41996) | 1 | 5.430x
length_linalg | 28 (28 .. 52) | 237166 (234925 .. 434006) | 7 | 1.000x
length_new | 4 ( 4 .. 5) | 39665 ( 39470 .. 47656) | 1 | 5.979x
length2_linalg | 20 (20 .. 26) | 171496 (170871 .. 215835) | 5 | 1.000x
length2_new | 4 ( 4 .. 5) | 37617 ( 37534 .. 42996) | 1 | 4.559x
floor_linalg | 59 (55 ..105) | 489949 (456486 .. 866643) | 16 | 1.000x
floor_new | 22 (22 .. 23) | 188308 (188119 .. 191002) | 6 | 2.602x
ceil_linalg | 57 (53 ..108) | 469098 (437186 .. 888921) | 15 | 1.000x
ceil_new | 23 (22 .. 46) | 190237 (188129 .. 383439) | 6 | 2.466x
trunc_new | 23 (22 .. 28) | 189011 (188127 .. 234359) | 6 | 1.000x
*/
#+vet shadowing
package linalg_bench
import "core:math/rand"
import "core:math/linalg"
import "core:fmt"
import "core:time"
import "core:sys/windows"
import "base:intrinsics"
Float :: f32
W :: 3
RUN_REPEAT :: 1
DATA_LEN :: 1024 * 8
REPEAT :: 100
// Warm up data and code cache
WARMUP_ITERS :: 50
// Makes sure consecutive benchmarks don't interfere with each other.
// Especially important on laptops!
// To ensure this works correctly, try swapping order of bench_proc() calls and check the timers stay the same.
COOLDOWN_MS :: 500
vec_a: [W]Float
data: [][W]Float
sum: Float
bench_dur_prev: i64
@(enable_target_feature = "sse")
main :: proc() {
// Windows only.
// More stable cache and scheduling.
pin_thread_to_core(0)
fmt.println("ODIN_OPTIMIZATION_MODE =", ODIN_OPTIMIZATION_MODE)
fmt.println("ODIN_ARCH =", ODIN_ARCH)
fmt.println("ODIN_MICROARCH_STRING =", ODIN_MICROARCH_STRING)
fmt.println("ODIN_NO_BOUNDS_CHECK =", ODIN_NO_BOUNDS_CHECK)
fmt.println("avx512f =", intrinsics.has_target_feature("avx512f"))
fmt.println("avx2 =", intrinsics.has_target_feature("avx2"))
fmt.println("sse2 =", intrinsics.has_target_feature("sse2"))
fmt.println("neon =", intrinsics.has_target_feature("neon"))
data = make([][W]Float, DATA_LEN)
for i in 0..<RUN_REPEAT {
fmt.printfln("\nRUN %i", i)
for &d in data {
d = rand_vec()
}
bench_section()
bench_proc(cross_linalg)
bench_proc(cross_new)
bench_section()
bench_proc(dot_linalg)
bench_proc(dot_new)
bench_section()
bench_proc(length_linalg)
bench_proc(length_new)
bench_section()
bench_proc(length2_linalg)
bench_proc(length2_new)
bench_section()
bench_proc(floor_linalg)
bench_proc(floor_new)
bench_section()
bench_proc(ceil_linalg)
bench_proc(ceil_new)
bench_section()
bench_proc(trunc_new)
}
// Just makes really sure the compiler doesn't optimize anything away.
fmt.println("\ntotal result:", sum)
}
bench_section :: proc() {
bench_dur_prev = -1
fmt.println()
}
bench_proc :: proc(procedure: $P, name := #caller_expression(procedure)) where intrinsics.type_is_proc(P) {
res: intrinsics.type_proc_return_type(P, 0)
for _ in 0..<WARMUP_ITERS {
for d in data {
when intrinsics.type_proc_parameter_count(P) == 1 {
res += procedure(d)
} else {
res += procedure(d, vec_a)
}
}
}
start_time := time.tick_now()
dur_sum: i64
dur_min := max(i64)
dur_max := min(i64)
for _ in 0..<REPEAT {
start := intrinsics.read_cycle_counter()
for d in data {
when intrinsics.type_proc_parameter_count(P) == 1 {
res += procedure(d)
} else {
res += procedure(d, vec_a)
}
}
dur := intrinsics.read_cycle_counter() - start
dur_sum += dur
dur_min = min(dur_min, dur)
dur_max = max(dur_max, dur)
intrinsics.cpu_relax()
}
tick_dur := time.tick_since(start_time)
if bench_dur_prev == -1 {
bench_dur_prev = dur_sum
}
fmt.printfln(
"% 24s | cycles/elem: % 5i (% 5i .. % 5i) | total cycles: % 12i (% 12i .. % 12i) | total ms: % 8i | speedup: %.3fx",
name,
dur_sum / (DATA_LEN * REPEAT),
dur_min / DATA_LEN,
dur_max / DATA_LEN,
dur_sum / REPEAT,
dur_min,
dur_max,
i64(time.duration_milliseconds(tick_dur)),
f64(bench_dur_prev) / f64(dur_sum)
)
when intrinsics.type_proc_return_type(P, 0) == Float {
sum += res
} else {
sum += dot_linalg(res, res)
}
time.sleep(time.Millisecond * COOLDOWN_MS)
}
//
// MARK: Util
//
rand_vec :: proc() -> (result: [W]Float) {
if rand.float32() < 0.05 {
return result
}
for &v in result {
v = Float(rand.float32() * 2 - 1)
}
return result
}
pin_thread_to_core :: proc(core_index: u32) -> bool {
when ODIN_OS == .Windows {
mask: windows.DWORD_PTR = 1 << core_index
thread_handle := windows.GetCurrentThread()
old_mask := windows.SetThreadAffinityMask(thread_handle, mask)
if old_mask == 0 {
fmt.eprintln("Failed to set thread affinity.")
return false
}
windows.SetThreadPriority(thread_handle, windows.THREAD_PRIORITY_HIGHEST)
}
return true
}
// from linalg
@private IS_NUMERIC :: intrinsics.type_is_numeric
@private IS_QUATERNION :: intrinsics.type_is_quaternion
@private IS_ARRAY :: intrinsics.type_is_array
@private IS_FLOAT :: intrinsics.type_is_float
@private BASE_TYPE :: intrinsics.type_base_type
@private ELEM_TYPE :: intrinsics.type_elem_type
//
// MARK: Benchmarked procedures
//
dot_linalg :: #force_inline proc "contextless" (a, b: [W]Float) -> Float #no_bounds_check {
return #force_inline linalg.dot(a, b)
}
dot_new :: #force_inline proc "contextless" (a, b: [W]Float) -> Float #no_bounds_check {
return #force_inline new_vector_dot(a, b)
}
length_linalg :: #force_inline proc "contextless" (a: [W]Float) -> Float #no_bounds_check {
return #force_inline linalg.length(a)
}
length_new :: #force_inline proc "contextless" (a: [W]Float) -> Float #no_bounds_check {
return new_vector_length(a)
}
length2_linalg :: #force_inline proc "contextless" (a: [W]Float) -> Float #no_bounds_check {
return #force_inline linalg.length2(a)
}
length2_new :: #force_inline proc "contextless" (a: [W]Float) -> Float #no_bounds_check {
return new_vector_length2(a)
}
cross_linalg :: #force_inline proc "contextless" (a, b: [W]Float) -> [W]Float #no_bounds_check {
when W == 3 {
return linalg.vector_cross3(a, b)
} else {
return 0
}
}
cross_new :: #force_inline proc "contextless" (a, b: [W]Float) -> [W]Float #no_bounds_check {
when W == 3 {
return new_vector_cross3(a, b)
} else {
return 0
}
}
floor_linalg :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check {
return #force_inline linalg.floor(a)
}
round_linalg :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check {
return #force_inline linalg.round(a)
}
ceil_linalg :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check {
return #force_inline linalg.ceil(a)
}
floor_new :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check {
return new_floor(a)
}
ceil_new :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check {
return new_ceil(a)
}
trunc_new :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check {
return new_trunc(a)
}
//
// MARK: NEW
//
@(require_results)
new_vector_cross3 :: proc "contextless" (a, b: $T/[3]$E) -> (c: T) where IS_NUMERIC(E) #no_bounds_check {
return a.yzx*b.zxy - b.yzx*a.zxy
}
@(require_results)
new_vector_dot :: proc "contextless" (a, b: $T/[$N]$E) -> (c: E) where IS_NUMERIC(E) #no_bounds_check {
ab := a * b
when N == 1 {
return ab.x
} else when N == 2 {
return ab.x + ab.y
} else when N == 3 {
return ab.x + ab.y + ab.z
} else {
return ab.x + ab.y + ab.z + ab.w
}
}
@(require_results)
new_vector_length2 :: proc "contextless" (a: $T/[$N]$E) -> (c: E) where IS_NUMERIC(E) #no_bounds_check {
return #force_inline new_vector_dot(a, a)
}
@(require_results)
new_vector_length :: proc "contextless" (a: $T/[$N]$E) -> (c: E) where IS_NUMERIC(E) #no_bounds_check {
return intrinsics.sqrt(#force_inline new_vector_dot(a, a))
}
@(require_results)
new_floor :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check {
return _from_simd4(T, intrinsics.simd_floor(_to_simd4(a)))
}
@(require_results)
new_ceil :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check {
return _from_simd4(T, intrinsics.simd_ceil(_to_simd4(a)))
}
@(require_results)
new_trunc :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check {
return _from_simd4(T, intrinsics.simd_trunc(_to_simd4(a)))
}
_to_simd4 :: #force_inline proc "contextless" (a: $T) -> (out: #simd[4]ELEM_TYPE(T)) where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check {
when IS_ARRAY(T) {
when len(T) == 1 {
_a: [4]ELEM_TYPE(T)
_a.x = a.x
return transmute(#simd[4]ELEM_TYPE(T))_a
} else when len(T) == 2 {
_a: [4]ELEM_TYPE(T)
_a.xy = a
return transmute(#simd[4]ELEM_TYPE(T))_a
} else when len(T) == 3 {
_a: [4]ELEM_TYPE(T)
_a.xyz = a
return transmute(#simd[4]ELEM_TYPE(T))_a
} else {
return transmute(#simd[4]ELEM_TYPE(T))a
}
} else {
_a: [4]ELEM_TYPE(T)
_a.x = a
return transmute(#simd[4]ELEM_TYPE(T))_a
}
}
_from_simd4 :: #force_inline proc "contextless" ($T: typeid, a: $V/#simd[4]$E) -> T where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check {
when IS_ARRAY(T) {
when len(T) == 1 {
return (transmute([4]ELEM_TYPE(T))a).x
} else when len(T) == 2 {
return (transmute([4]ELEM_TYPE(T))a).xy
} else when len(T) == 3 {
return (transmute([4]ELEM_TYPE(T))a).xyz
} else {
return transmute([4]ELEM_TYPE(T))a
}
} else {
return (transmute([4]ELEM_TYPE(T))a).x
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment