Created
April 10, 2026 15:14
-
-
Save jakubtomsu/08212f9216935db0354b1e08f87b3814 to your computer and use it in GitHub Desktop.
Making core:math/linalg a bit faster
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| Odin core:math/linalg benchmarking code | |
| Primarily optimizes 3xf32 math in -o:speed mode! | |
| However I spent a lot of effort ensuring the code doesn't get significantly slower | |
| in non-optimized builds. | |
| The results can looks something like this (-o:speed, amd64 skylake) | |
| NAME | Cycles/Elem | Total Cycles | Ms | Speedup | |
| cross_linalg | 24 (22 ..149) | 204240 (186948 ..1223189) | 6 | 1.000x | |
| cross_new | 6 ( 6 .. 11) | 52435 ( 49435 .. 93333) | 1 | 3.895x | |
| dot_linalg | 22 (20 ..114) | 183991 (170855 .. 937097) | 6 | 1.000x | |
| dot_new | 4 ( 4 .. 5) | 33884 ( 33674 .. 41996) | 1 | 5.430x | |
| length_linalg | 28 (28 .. 52) | 237166 (234925 .. 434006) | 7 | 1.000x | |
| length_new | 4 ( 4 .. 5) | 39665 ( 39470 .. 47656) | 1 | 5.979x | |
| length2_linalg | 20 (20 .. 26) | 171496 (170871 .. 215835) | 5 | 1.000x | |
| length2_new | 4 ( 4 .. 5) | 37617 ( 37534 .. 42996) | 1 | 4.559x | |
| floor_linalg | 59 (55 ..105) | 489949 (456486 .. 866643) | 16 | 1.000x | |
| floor_new | 22 (22 .. 23) | 188308 (188119 .. 191002) | 6 | 2.602x | |
| ceil_linalg | 57 (53 ..108) | 469098 (437186 .. 888921) | 15 | 1.000x | |
| ceil_new | 23 (22 .. 46) | 190237 (188129 .. 383439) | 6 | 2.466x | |
| trunc_new | 23 (22 .. 28) | 189011 (188127 .. 234359) | 6 | 1.000x | |
| */ | |
| #+vet shadowing | |
| package linalg_bench | |
| import "core:math/rand" | |
| import "core:math/linalg" | |
| import "core:fmt" | |
| import "core:time" | |
| import "core:sys/windows" | |
| import "base:intrinsics" | |
| Float :: f32 | |
| W :: 3 | |
| RUN_REPEAT :: 1 | |
| DATA_LEN :: 1024 * 8 | |
| REPEAT :: 100 | |
| // Warm up data and code cache | |
| WARMUP_ITERS :: 50 | |
| // Makes sure consecutive benchmarks don't interfere with each other. | |
| // Especially important on laptops! | |
| // To ensure this works correctly, try swapping order of bench_proc() calls and check the timers stay the same. | |
| COOLDOWN_MS :: 500 | |
| vec_a: [W]Float | |
| data: [][W]Float | |
| sum: Float | |
| bench_dur_prev: i64 | |
| @(enable_target_feature = "sse") | |
| main :: proc() { | |
| // Windows only. | |
| // More stable cache and scheduling. | |
| pin_thread_to_core(0) | |
| fmt.println("ODIN_OPTIMIZATION_MODE =", ODIN_OPTIMIZATION_MODE) | |
| fmt.println("ODIN_ARCH =", ODIN_ARCH) | |
| fmt.println("ODIN_MICROARCH_STRING =", ODIN_MICROARCH_STRING) | |
| fmt.println("ODIN_NO_BOUNDS_CHECK =", ODIN_NO_BOUNDS_CHECK) | |
| fmt.println("avx512f =", intrinsics.has_target_feature("avx512f")) | |
| fmt.println("avx2 =", intrinsics.has_target_feature("avx2")) | |
| fmt.println("sse2 =", intrinsics.has_target_feature("sse2")) | |
| fmt.println("neon =", intrinsics.has_target_feature("neon")) | |
| data = make([][W]Float, DATA_LEN) | |
| for i in 0..<RUN_REPEAT { | |
| fmt.printfln("\nRUN %i", i) | |
| for &d in data { | |
| d = rand_vec() | |
| } | |
| bench_section() | |
| bench_proc(cross_linalg) | |
| bench_proc(cross_new) | |
| bench_section() | |
| bench_proc(dot_linalg) | |
| bench_proc(dot_new) | |
| bench_section() | |
| bench_proc(length_linalg) | |
| bench_proc(length_new) | |
| bench_section() | |
| bench_proc(length2_linalg) | |
| bench_proc(length2_new) | |
| bench_section() | |
| bench_proc(floor_linalg) | |
| bench_proc(floor_new) | |
| bench_section() | |
| bench_proc(ceil_linalg) | |
| bench_proc(ceil_new) | |
| bench_section() | |
| bench_proc(trunc_new) | |
| } | |
| // Just makes really sure the compiler doesn't optimize anything away. | |
| fmt.println("\ntotal result:", sum) | |
| } | |
| bench_section :: proc() { | |
| bench_dur_prev = -1 | |
| fmt.println() | |
| } | |
| bench_proc :: proc(procedure: $P, name := #caller_expression(procedure)) where intrinsics.type_is_proc(P) { | |
| res: intrinsics.type_proc_return_type(P, 0) | |
| for _ in 0..<WARMUP_ITERS { | |
| for d in data { | |
| when intrinsics.type_proc_parameter_count(P) == 1 { | |
| res += procedure(d) | |
| } else { | |
| res += procedure(d, vec_a) | |
| } | |
| } | |
| } | |
| start_time := time.tick_now() | |
| dur_sum: i64 | |
| dur_min := max(i64) | |
| dur_max := min(i64) | |
| for _ in 0..<REPEAT { | |
| start := intrinsics.read_cycle_counter() | |
| for d in data { | |
| when intrinsics.type_proc_parameter_count(P) == 1 { | |
| res += procedure(d) | |
| } else { | |
| res += procedure(d, vec_a) | |
| } | |
| } | |
| dur := intrinsics.read_cycle_counter() - start | |
| dur_sum += dur | |
| dur_min = min(dur_min, dur) | |
| dur_max = max(dur_max, dur) | |
| intrinsics.cpu_relax() | |
| } | |
| tick_dur := time.tick_since(start_time) | |
| if bench_dur_prev == -1 { | |
| bench_dur_prev = dur_sum | |
| } | |
| fmt.printfln( | |
| "% 24s | cycles/elem: % 5i (% 5i .. % 5i) | total cycles: % 12i (% 12i .. % 12i) | total ms: % 8i | speedup: %.3fx", | |
| name, | |
| dur_sum / (DATA_LEN * REPEAT), | |
| dur_min / DATA_LEN, | |
| dur_max / DATA_LEN, | |
| dur_sum / REPEAT, | |
| dur_min, | |
| dur_max, | |
| i64(time.duration_milliseconds(tick_dur)), | |
| f64(bench_dur_prev) / f64(dur_sum) | |
| ) | |
| when intrinsics.type_proc_return_type(P, 0) == Float { | |
| sum += res | |
| } else { | |
| sum += dot_linalg(res, res) | |
| } | |
| time.sleep(time.Millisecond * COOLDOWN_MS) | |
| } | |
| // | |
| // MARK: Util | |
| // | |
| rand_vec :: proc() -> (result: [W]Float) { | |
| if rand.float32() < 0.05 { | |
| return result | |
| } | |
| for &v in result { | |
| v = Float(rand.float32() * 2 - 1) | |
| } | |
| return result | |
| } | |
| pin_thread_to_core :: proc(core_index: u32) -> bool { | |
| when ODIN_OS == .Windows { | |
| mask: windows.DWORD_PTR = 1 << core_index | |
| thread_handle := windows.GetCurrentThread() | |
| old_mask := windows.SetThreadAffinityMask(thread_handle, mask) | |
| if old_mask == 0 { | |
| fmt.eprintln("Failed to set thread affinity.") | |
| return false | |
| } | |
| windows.SetThreadPriority(thread_handle, windows.THREAD_PRIORITY_HIGHEST) | |
| } | |
| return true | |
| } | |
| // from linalg | |
| @private IS_NUMERIC :: intrinsics.type_is_numeric | |
| @private IS_QUATERNION :: intrinsics.type_is_quaternion | |
| @private IS_ARRAY :: intrinsics.type_is_array | |
| @private IS_FLOAT :: intrinsics.type_is_float | |
| @private BASE_TYPE :: intrinsics.type_base_type | |
| @private ELEM_TYPE :: intrinsics.type_elem_type | |
| // | |
| // MARK: Benchmarked procedures | |
| // | |
| dot_linalg :: #force_inline proc "contextless" (a, b: [W]Float) -> Float #no_bounds_check { | |
| return #force_inline linalg.dot(a, b) | |
| } | |
| dot_new :: #force_inline proc "contextless" (a, b: [W]Float) -> Float #no_bounds_check { | |
| return #force_inline new_vector_dot(a, b) | |
| } | |
| length_linalg :: #force_inline proc "contextless" (a: [W]Float) -> Float #no_bounds_check { | |
| return #force_inline linalg.length(a) | |
| } | |
| length_new :: #force_inline proc "contextless" (a: [W]Float) -> Float #no_bounds_check { | |
| return new_vector_length(a) | |
| } | |
| length2_linalg :: #force_inline proc "contextless" (a: [W]Float) -> Float #no_bounds_check { | |
| return #force_inline linalg.length2(a) | |
| } | |
| length2_new :: #force_inline proc "contextless" (a: [W]Float) -> Float #no_bounds_check { | |
| return new_vector_length2(a) | |
| } | |
| cross_linalg :: #force_inline proc "contextless" (a, b: [W]Float) -> [W]Float #no_bounds_check { | |
| when W == 3 { | |
| return linalg.vector_cross3(a, b) | |
| } else { | |
| return 0 | |
| } | |
| } | |
| cross_new :: #force_inline proc "contextless" (a, b: [W]Float) -> [W]Float #no_bounds_check { | |
| when W == 3 { | |
| return new_vector_cross3(a, b) | |
| } else { | |
| return 0 | |
| } | |
| } | |
| floor_linalg :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check { | |
| return #force_inline linalg.floor(a) | |
| } | |
| round_linalg :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check { | |
| return #force_inline linalg.round(a) | |
| } | |
| ceil_linalg :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check { | |
| return #force_inline linalg.ceil(a) | |
| } | |
| floor_new :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check { | |
| return new_floor(a) | |
| } | |
| ceil_new :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check { | |
| return new_ceil(a) | |
| } | |
| trunc_new :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check { | |
| return new_trunc(a) | |
| } | |
| // | |
| // MARK: NEW | |
| // | |
| @(require_results) | |
| new_vector_cross3 :: proc "contextless" (a, b: $T/[3]$E) -> (c: T) where IS_NUMERIC(E) #no_bounds_check { | |
| return a.yzx*b.zxy - b.yzx*a.zxy | |
| } | |
| @(require_results) | |
| new_vector_dot :: proc "contextless" (a, b: $T/[$N]$E) -> (c: E) where IS_NUMERIC(E) #no_bounds_check { | |
| ab := a * b | |
| when N == 1 { | |
| return ab.x | |
| } else when N == 2 { | |
| return ab.x + ab.y | |
| } else when N == 3 { | |
| return ab.x + ab.y + ab.z | |
| } else { | |
| return ab.x + ab.y + ab.z + ab.w | |
| } | |
| } | |
| @(require_results) | |
| new_vector_length2 :: proc "contextless" (a: $T/[$N]$E) -> (c: E) where IS_NUMERIC(E) #no_bounds_check { | |
| return #force_inline new_vector_dot(a, a) | |
| } | |
| @(require_results) | |
| new_vector_length :: proc "contextless" (a: $T/[$N]$E) -> (c: E) where IS_NUMERIC(E) #no_bounds_check { | |
| return intrinsics.sqrt(#force_inline new_vector_dot(a, a)) | |
| } | |
| @(require_results) | |
| new_floor :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check { | |
| return _from_simd4(T, intrinsics.simd_floor(_to_simd4(a))) | |
| } | |
| @(require_results) | |
| new_ceil :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check { | |
| return _from_simd4(T, intrinsics.simd_ceil(_to_simd4(a))) | |
| } | |
| @(require_results) | |
| new_trunc :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check { | |
| return _from_simd4(T, intrinsics.simd_trunc(_to_simd4(a))) | |
| } | |
| _to_simd4 :: #force_inline proc "contextless" (a: $T) -> (out: #simd[4]ELEM_TYPE(T)) where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check { | |
| when IS_ARRAY(T) { | |
| when len(T) == 1 { | |
| _a: [4]ELEM_TYPE(T) | |
| _a.x = a.x | |
| return transmute(#simd[4]ELEM_TYPE(T))_a | |
| } else when len(T) == 2 { | |
| _a: [4]ELEM_TYPE(T) | |
| _a.xy = a | |
| return transmute(#simd[4]ELEM_TYPE(T))_a | |
| } else when len(T) == 3 { | |
| _a: [4]ELEM_TYPE(T) | |
| _a.xyz = a | |
| return transmute(#simd[4]ELEM_TYPE(T))_a | |
| } else { | |
| return transmute(#simd[4]ELEM_TYPE(T))a | |
| } | |
| } else { | |
| _a: [4]ELEM_TYPE(T) | |
| _a.x = a | |
| return transmute(#simd[4]ELEM_TYPE(T))_a | |
| } | |
| } | |
| _from_simd4 :: #force_inline proc "contextless" ($T: typeid, a: $V/#simd[4]$E) -> T where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check { | |
| when IS_ARRAY(T) { | |
| when len(T) == 1 { | |
| return (transmute([4]ELEM_TYPE(T))a).x | |
| } else when len(T) == 2 { | |
| return (transmute([4]ELEM_TYPE(T))a).xy | |
| } else when len(T) == 3 { | |
| return (transmute([4]ELEM_TYPE(T))a).xyz | |
| } else { | |
| return transmute([4]ELEM_TYPE(T))a | |
| } | |
| } else { | |
| return (transmute([4]ELEM_TYPE(T))a).x | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment