jakubtomsu · April 10, 2026 15:14
diff --git a/linalg_bench.odin b/linalg_bench.odin
 /*
 Odin core:math/linalg benchmarking code

 Primarily optimizes 3xf32 math in -o:speed mode!
 However I spent a lot of effort ensuring the code doesn't get significantly slower
 in non-optimized builds.

 The results can looks something like this (-o:speed, amd64 skylake)

 			  NAME |  Cycles/Elem  |       Total Cycles        | Ms | Speedup
 	  cross_linalg | 24 (22 ..149) | 204240 (186948 ..1223189) |  6 | 1.000x
 		 cross_new |  6 ( 6 .. 11) |  52435 ( 49435 ..  93333) |  1 | 3.895x
 		dot_linalg | 22 (20 ..114) | 183991 (170855 .. 937097) |  6 | 1.000x
 		   dot_new |  4 ( 4 ..  5) |  33884 ( 33674 ..  41996) |  1 | 5.430x
 	 length_linalg | 28 (28 .. 52) | 237166 (234925 .. 434006) |  7 | 1.000x
 		length_new |  4 ( 4 ..  5) |  39665 ( 39470 ..  47656) |  1 | 5.979x
 	length2_linalg | 20 (20 .. 26) | 171496 (170871 .. 215835) |  5 | 1.000x
 	   length2_new |  4 ( 4 ..  5) |  37617 ( 37534 ..  42996) |  1 | 4.559x
 	  floor_linalg | 59 (55 ..105) | 489949 (456486 .. 866643) | 16 | 1.000x
 		 floor_new | 22 (22 .. 23) | 188308 (188119 .. 191002) |  6 | 2.602x
 	   ceil_linalg | 57 (53 ..108) | 469098 (437186 .. 888921) | 15 | 1.000x
 		  ceil_new | 23 (22 .. 46) | 190237 (188129 .. 383439) |  6 | 2.466x
 		 trunc_new | 23 (22 .. 28) | 189011 (188127 .. 234359) |  6 | 1.000x

 */
 #+vet shadowing
 package linalg_bench

 import "core:math/rand"
 import "core:math/linalg"
 import "core:fmt"
 import "core:time"
 import "core:sys/windows"
 import "base:intrinsics"

 Float :: f32
 W :: 3

 RUN_REPEAT :: 1
 DATA_LEN :: 1024 * 8
 REPEAT :: 100

 // Warm up data and code cache
 WARMUP_ITERS :: 50

 // Makes sure consecutive benchmarks don't interfere with each other.
 // Especially important on laptops!
 // To ensure this works correctly, try swapping order of bench_proc() calls and check the timers stay the same.
 COOLDOWN_MS :: 500

 vec_a: [W]Float
 data: [][W]Float
 sum: Float
 bench_dur_prev: i64

 @(enable_target_feature = "sse")
 main :: proc() {
 	// Windows only.
 	// More stable cache and scheduling.
 	pin_thread_to_core(0)

 	fmt.println("ODIN_OPTIMIZATION_MODE =", ODIN_OPTIMIZATION_MODE)
 	fmt.println("ODIN_ARCH =", ODIN_ARCH)
 	fmt.println("ODIN_MICROARCH_STRING =", ODIN_MICROARCH_STRING)
 	fmt.println("ODIN_NO_BOUNDS_CHECK =", ODIN_NO_BOUNDS_CHECK)
 	fmt.println("avx512f =", intrinsics.has_target_feature("avx512f"))
 	fmt.println("avx2 =", intrinsics.has_target_feature("avx2"))
 	fmt.println("sse2 =", intrinsics.has_target_feature("sse2"))
 	fmt.println("neon =", intrinsics.has_target_feature("neon"))

 	data = make([][W]Float, DATA_LEN)
 	
 	for i in 0..<RUN_REPEAT {
 		fmt.printfln("\nRUN %i", i)

 		for &d in data {
 			d = rand_vec()
 		}

 		bench_section()
 		bench_proc(cross_linalg)
 		bench_proc(cross_new)

 		bench_section()
 		bench_proc(dot_linalg)
 		bench_proc(dot_new)

 		bench_section()
 		bench_proc(length_linalg)
 		bench_proc(length_new)

 		bench_section()
 		bench_proc(length2_linalg)
 		bench_proc(length2_new)

 		bench_section()
 		bench_proc(floor_linalg)
 		bench_proc(floor_new)

 		bench_section()
 		bench_proc(ceil_linalg)
 		bench_proc(ceil_new)

 		bench_section()
 		bench_proc(trunc_new)
 	}

 	// Just makes really sure the compiler doesn't optimize anything away.
 	fmt.println("\ntotal result:", sum)
 }

 bench_section :: proc() {
 	bench_dur_prev = -1
 	fmt.println()
 }

 bench_proc :: proc(procedure: $P, name := #caller_expression(procedure)) where intrinsics.type_is_proc(P) {
 	res: intrinsics.type_proc_return_type(P, 0)

 	for _ in 0..<WARMUP_ITERS {
 		for d in data {
 			when intrinsics.type_proc_parameter_count(P) == 1 {
 				res += procedure(d)
 			} else {
 				res += procedure(d, vec_a)
 			}
 		}
 	}

 	start_time := time.tick_now()
 	dur_sum: i64
 	dur_min := max(i64)
 	dur_max := min(i64)

 	for _ in 0..<REPEAT {
 		start := intrinsics.read_cycle_counter()
 		for d in data {
 			when intrinsics.type_proc_parameter_count(P) == 1 {
 				res += procedure(d)
 			} else {
 				res += procedure(d, vec_a)
 			}
 		}

 		dur := intrinsics.read_cycle_counter() - start
 		dur_sum += dur
 		dur_min = min(dur_min, dur)
 		dur_max = max(dur_max, dur)

 		intrinsics.cpu_relax()
 	}

 	tick_dur := time.tick_since(start_time)

 	if bench_dur_prev == -1 {
 		bench_dur_prev = dur_sum
 	}

 	fmt.printfln(
 		"% 24s | cycles/elem: % 5i (% 5i .. % 5i) | total cycles: % 12i (% 12i .. % 12i) | total ms: % 8i | speedup: %.3fx",
 		name,
 		dur_sum / (DATA_LEN * REPEAT),
 		dur_min / DATA_LEN,
 		dur_max / DATA_LEN,
 		dur_sum / REPEAT,
 		dur_min,
 		dur_max,
 		i64(time.duration_milliseconds(tick_dur)),
 		f64(bench_dur_prev) / f64(dur_sum)
 	)

 	when intrinsics.type_proc_return_type(P, 0) == Float {
 		sum += res
 	} else {
 		sum += dot_linalg(res, res)
 	}

 	time.sleep(time.Millisecond * COOLDOWN_MS)
 }



 //
 // MARK: Util
 //

 rand_vec :: proc() -> (result: [W]Float) {
 	if rand.float32() < 0.05 {
 		return result
 	}
 	for &v in result {
 		v = Float(rand.float32() * 2 - 1)
 	}
 	return result
 }

 pin_thread_to_core :: proc(core_index: u32) -> bool {
 	when ODIN_OS == .Windows {
 		mask: windows.DWORD_PTR = 1 << core_index

 		thread_handle := windows.GetCurrentThread()

 		old_mask := windows.SetThreadAffinityMask(thread_handle, mask)

 		if old_mask == 0 {
 			fmt.eprintln("Failed to set thread affinity.")
 			return false
 		}

 		windows.SetThreadPriority(thread_handle, windows.THREAD_PRIORITY_HIGHEST)
 	}

 	return true
 }

 // from linalg

 @private IS_NUMERIC :: intrinsics.type_is_numeric
 @private IS_QUATERNION :: intrinsics.type_is_quaternion
 @private IS_ARRAY :: intrinsics.type_is_array
 @private IS_FLOAT :: intrinsics.type_is_float
 @private BASE_TYPE :: intrinsics.type_base_type
 @private ELEM_TYPE :: intrinsics.type_elem_type



 //
 // MARK: Benchmarked procedures
 //

 dot_linalg :: #force_inline proc "contextless" (a, b: [W]Float) -> Float #no_bounds_check {
 	return #force_inline linalg.dot(a, b)
 }

 dot_new :: #force_inline proc "contextless" (a, b: [W]Float) -> Float #no_bounds_check {
 	return #force_inline new_vector_dot(a, b)
 }

 length_linalg :: #force_inline proc "contextless" (a: [W]Float) -> Float #no_bounds_check {
 	return #force_inline linalg.length(a)
 }

 length_new :: #force_inline proc "contextless" (a: [W]Float) -> Float #no_bounds_check {
 	return new_vector_length(a)
 }

 length2_linalg :: #force_inline proc "contextless" (a: [W]Float) -> Float #no_bounds_check {
 	return #force_inline linalg.length2(a)
 }

 length2_new :: #force_inline proc "contextless" (a: [W]Float) -> Float #no_bounds_check {
 	return new_vector_length2(a)
 }

 cross_linalg :: #force_inline proc "contextless" (a, b: [W]Float) -> [W]Float #no_bounds_check {
 	when W == 3 {
 		return linalg.vector_cross3(a, b)
 	} else {
 		return 0
 	}
 }

 cross_new :: #force_inline proc "contextless" (a, b: [W]Float) -> [W]Float #no_bounds_check {
 	when W == 3 {
 		return new_vector_cross3(a, b)
 	} else {
 		return 0
 	}
 }


 floor_linalg :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check {
 	return #force_inline linalg.floor(a)
 }

 round_linalg :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check {
 	return #force_inline linalg.round(a)
 }

 ceil_linalg :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check {
 	return #force_inline linalg.ceil(a)
 }


 floor_new :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check {
 	return new_floor(a)
 }

 ceil_new :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check {
 	return new_ceil(a)
 }

 trunc_new :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check {
 	return new_trunc(a)
 }



 //
 // MARK: NEW
 //

 @(require_results)
 new_vector_cross3 :: proc "contextless" (a, b: $T/[3]$E) -> (c: T) where IS_NUMERIC(E) #no_bounds_check {
 	return a.yzx*b.zxy - b.yzx*a.zxy
 }

 @(require_results)
 new_vector_dot :: proc "contextless" (a, b: $T/[$N]$E) -> (c: E) where IS_NUMERIC(E) #no_bounds_check {
 	ab := a * b
 	when N == 1 {
 		return ab.x
 	} else when N == 2 {
 		return ab.x + ab.y
 	} else when N == 3 {
 		return ab.x + ab.y + ab.z
 	} else {
 		return ab.x + ab.y + ab.z + ab.w
 	}
 }

 @(require_results)
 new_vector_length2 :: proc "contextless" (a: $T/[$N]$E) -> (c: E) where IS_NUMERIC(E) #no_bounds_check {
 	return #force_inline new_vector_dot(a, a)
 }

 @(require_results)
 new_vector_length :: proc "contextless" (a: $T/[$N]$E) -> (c: E) where IS_NUMERIC(E) #no_bounds_check {
 	return intrinsics.sqrt(#force_inline new_vector_dot(a, a))
 }




 @(require_results)
 new_floor :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check {
 	return _from_simd4(T, intrinsics.simd_floor(_to_simd4(a)))
 }

 @(require_results)
 new_ceil :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check {
 	return _from_simd4(T, intrinsics.simd_ceil(_to_simd4(a)))
 }

 @(require_results)
 new_trunc :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check {
 	return _from_simd4(T, intrinsics.simd_trunc(_to_simd4(a)))
 }



 _to_simd4 :: #force_inline proc "contextless" (a: $T) -> (out: #simd[4]ELEM_TYPE(T)) where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check {
 	when IS_ARRAY(T) {
 		when len(T) == 1 {
 			_a: [4]ELEM_TYPE(T)
 			_a.x = a.x
 			return transmute(#simd[4]ELEM_TYPE(T))_a
 		} else when len(T) == 2 {
 			_a: [4]ELEM_TYPE(T)
 			_a.xy = a
 			return transmute(#simd[4]ELEM_TYPE(T))_a
 		} else when len(T) == 3 {
 			_a: [4]ELEM_TYPE(T)
 			_a.xyz = a
 			return transmute(#simd[4]ELEM_TYPE(T))_a
 		} else {
 			return transmute(#simd[4]ELEM_TYPE(T))a
 		}
 	} else {
 		_a: [4]ELEM_TYPE(T)
 		_a.x = a
 		return transmute(#simd[4]ELEM_TYPE(T))_a
 	}
 }

 _from_simd4 :: #force_inline proc "contextless" ($T: typeid, a: $V/#simd[4]$E) -> T where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check {
 	when IS_ARRAY(T) {
 		when len(T) == 1 {
 			return (transmute([4]ELEM_TYPE(T))a).x
 		} else when len(T) == 2 {
 			return (transmute([4]ELEM_TYPE(T))a).xy
 		} else when len(T) == 3 {
 			return (transmute([4]ELEM_TYPE(T))a).xyz
 		} else {
 			return transmute([4]ELEM_TYPE(T))a
 		}       
 	} else {
 		return (transmute([4]ELEM_TYPE(T))a).x
 	}
 }
	/*
	Odin core:math/linalg benchmarking code

	Primarily optimizes 3xf32 math in -o:speed mode!
	However I spent a lot of effort ensuring the code doesn't get significantly slower
	in non-optimized builds.

	The results can looks something like this (-o:speed, amd64 skylake)

	NAME \| Cycles/Elem \| Total Cycles \| Ms \| Speedup
	cross_linalg \| 24 (22 ..149) \| 204240 (186948 ..1223189) \| 6 \| 1.000x
	cross_new \| 6 ( 6 .. 11) \| 52435 ( 49435 .. 93333) \| 1 \| 3.895x
	dot_linalg \| 22 (20 ..114) \| 183991 (170855 .. 937097) \| 6 \| 1.000x
	dot_new \| 4 ( 4 .. 5) \| 33884 ( 33674 .. 41996) \| 1 \| 5.430x
	length_linalg \| 28 (28 .. 52) \| 237166 (234925 .. 434006) \| 7 \| 1.000x
	length_new \| 4 ( 4 .. 5) \| 39665 ( 39470 .. 47656) \| 1 \| 5.979x
	length2_linalg \| 20 (20 .. 26) \| 171496 (170871 .. 215835) \| 5 \| 1.000x
	length2_new \| 4 ( 4 .. 5) \| 37617 ( 37534 .. 42996) \| 1 \| 4.559x
	floor_linalg \| 59 (55 ..105) \| 489949 (456486 .. 866643) \| 16 \| 1.000x
	floor_new \| 22 (22 .. 23) \| 188308 (188119 .. 191002) \| 6 \| 2.602x
	ceil_linalg \| 57 (53 ..108) \| 469098 (437186 .. 888921) \| 15 \| 1.000x
	ceil_new \| 23 (22 .. 46) \| 190237 (188129 .. 383439) \| 6 \| 2.466x
	trunc_new \| 23 (22 .. 28) \| 189011 (188127 .. 234359) \| 6 \| 1.000x

	*/
	#+vet shadowing
	package linalg_bench

	import "core:math/rand"
	import "core:math/linalg"
	import "core:fmt"
	import "core:time"
	import "core:sys/windows"
	import "base:intrinsics"

	Float :: f32
	W :: 3

	RUN_REPEAT :: 1
	DATA_LEN :: 1024 * 8
	REPEAT :: 100

	// Warm up data and code cache
	WARMUP_ITERS :: 50

	// Makes sure consecutive benchmarks don't interfere with each other.
	// Especially important on laptops!
	// To ensure this works correctly, try swapping order of bench_proc() calls and check the timers stay the same.
	COOLDOWN_MS :: 500

	vec_a: [W]Float
	data: [][W]Float
	sum: Float
	bench_dur_prev: i64

	@(enable_target_feature = "sse")
	main :: proc() {
	// Windows only.
	// More stable cache and scheduling.
	pin_thread_to_core(0)

	fmt.println("ODIN_OPTIMIZATION_MODE =", ODIN_OPTIMIZATION_MODE)
	fmt.println("ODIN_ARCH =", ODIN_ARCH)
	fmt.println("ODIN_MICROARCH_STRING =", ODIN_MICROARCH_STRING)
	fmt.println("ODIN_NO_BOUNDS_CHECK =", ODIN_NO_BOUNDS_CHECK)
	fmt.println("avx512f =", intrinsics.has_target_feature("avx512f"))
	fmt.println("avx2 =", intrinsics.has_target_feature("avx2"))
	fmt.println("sse2 =", intrinsics.has_target_feature("sse2"))
	fmt.println("neon =", intrinsics.has_target_feature("neon"))

	data = make([][W]Float, DATA_LEN)

	for i in 0..<RUN_REPEAT {
	fmt.printfln("\nRUN %i", i)

	for &d in data {
	d = rand_vec()
	}

	bench_section()
	bench_proc(cross_linalg)
	bench_proc(cross_new)

	bench_section()
	bench_proc(dot_linalg)
	bench_proc(dot_new)

	bench_section()
	bench_proc(length_linalg)
	bench_proc(length_new)

	bench_section()
	bench_proc(length2_linalg)
	bench_proc(length2_new)

	bench_section()
	bench_proc(floor_linalg)
	bench_proc(floor_new)

	bench_section()
	bench_proc(ceil_linalg)
	bench_proc(ceil_new)

	bench_section()
	bench_proc(trunc_new)
	}

	// Just makes really sure the compiler doesn't optimize anything away.
	fmt.println("\ntotal result:", sum)
	}

	bench_section :: proc() {
	bench_dur_prev = -1
	fmt.println()
	}

	bench_proc :: proc(procedure: $P, name := #caller_expression(procedure)) where intrinsics.type_is_proc(P) {
	res: intrinsics.type_proc_return_type(P, 0)

	for _ in 0..<WARMUP_ITERS {
	for d in data {
	when intrinsics.type_proc_parameter_count(P) == 1 {
	res += procedure(d)
	} else {
	res += procedure(d, vec_a)
	}
	}
	}

	start_time := time.tick_now()
	dur_sum: i64
	dur_min := max(i64)
	dur_max := min(i64)

	for _ in 0..<REPEAT {
	start := intrinsics.read_cycle_counter()
	for d in data {
	when intrinsics.type_proc_parameter_count(P) == 1 {
	res += procedure(d)
	} else {
	res += procedure(d, vec_a)
	}
	}

	dur := intrinsics.read_cycle_counter() - start
	dur_sum += dur
	dur_min = min(dur_min, dur)
	dur_max = max(dur_max, dur)

	intrinsics.cpu_relax()
	}

	tick_dur := time.tick_since(start_time)

	if bench_dur_prev == -1 {
	bench_dur_prev = dur_sum
	}

	fmt.printfln(
	"% 24s \| cycles/elem: % 5i (% 5i .. % 5i) \| total cycles: % 12i (% 12i .. % 12i) \| total ms: % 8i \| speedup: %.3fx",
	name,
	dur_sum / (DATA_LEN * REPEAT),
	dur_min / DATA_LEN,
	dur_max / DATA_LEN,
	dur_sum / REPEAT,
	dur_min,
	dur_max,
	i64(time.duration_milliseconds(tick_dur)),
	f64(bench_dur_prev) / f64(dur_sum)
	)

	when intrinsics.type_proc_return_type(P, 0) == Float {
	sum += res
	} else {
	sum += dot_linalg(res, res)
	}

	time.sleep(time.Millisecond * COOLDOWN_MS)
	}



	//
	// MARK: Util
	//

	rand_vec :: proc() -> (result: [W]Float) {
	if rand.float32() < 0.05 {
	return result
	}
	for &v in result {
	v = Float(rand.float32() * 2 - 1)
	}
	return result
	}

	pin_thread_to_core :: proc(core_index: u32) -> bool {
	when ODIN_OS == .Windows {
	mask: windows.DWORD_PTR = 1 << core_index

	thread_handle := windows.GetCurrentThread()

	old_mask := windows.SetThreadAffinityMask(thread_handle, mask)

	if old_mask == 0 {
	fmt.eprintln("Failed to set thread affinity.")
	return false
	}

	windows.SetThreadPriority(thread_handle, windows.THREAD_PRIORITY_HIGHEST)
	}

	return true
	}

	// from linalg

	@private IS_NUMERIC :: intrinsics.type_is_numeric
	@private IS_QUATERNION :: intrinsics.type_is_quaternion
	@private IS_ARRAY :: intrinsics.type_is_array
	@private IS_FLOAT :: intrinsics.type_is_float
	@private BASE_TYPE :: intrinsics.type_base_type
	@private ELEM_TYPE :: intrinsics.type_elem_type



	//
	// MARK: Benchmarked procedures
	//

	dot_linalg :: #force_inline proc "contextless" (a, b: [W]Float) -> Float #no_bounds_check {
	return #force_inline linalg.dot(a, b)
	}

	dot_new :: #force_inline proc "contextless" (a, b: [W]Float) -> Float #no_bounds_check {
	return #force_inline new_vector_dot(a, b)
	}

	length_linalg :: #force_inline proc "contextless" (a: [W]Float) -> Float #no_bounds_check {
	return #force_inline linalg.length(a)
	}

	length_new :: #force_inline proc "contextless" (a: [W]Float) -> Float #no_bounds_check {
	return new_vector_length(a)
	}

	length2_linalg :: #force_inline proc "contextless" (a: [W]Float) -> Float #no_bounds_check {
	return #force_inline linalg.length2(a)
	}

	length2_new :: #force_inline proc "contextless" (a: [W]Float) -> Float #no_bounds_check {
	return new_vector_length2(a)
	}

	cross_linalg :: #force_inline proc "contextless" (a, b: [W]Float) -> [W]Float #no_bounds_check {
	when W == 3 {
	return linalg.vector_cross3(a, b)
	} else {
	return 0
	}
	}

	cross_new :: #force_inline proc "contextless" (a, b: [W]Float) -> [W]Float #no_bounds_check {
	when W == 3 {
	return new_vector_cross3(a, b)
	} else {
	return 0
	}
	}


	floor_linalg :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check {
	return #force_inline linalg.floor(a)
	}

	round_linalg :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check {
	return #force_inline linalg.round(a)
	}

	ceil_linalg :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check {
	return #force_inline linalg.ceil(a)
	}


	floor_new :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check {
	return new_floor(a)
	}

	ceil_new :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check {
	return new_ceil(a)
	}

	trunc_new :: #force_inline proc "contextless" (a: [W]Float) -> [W]Float #no_bounds_check {
	return new_trunc(a)
	}



	//
	// MARK: NEW
	//

	@(require_results)
	new_vector_cross3 :: proc "contextless" (a, b: $T/[3]$E) -> (c: T) where IS_NUMERIC(E) #no_bounds_check {
	return a.yzxb.zxy - b.yzxa.zxy
	}

	@(require_results)
	new_vector_dot :: proc "contextless" (a, b: $T/[$N]$E) -> (c: E) where IS_NUMERIC(E) #no_bounds_check {
	ab := a * b
	when N == 1 {
	return ab.x
	} else when N == 2 {
	return ab.x + ab.y
	} else when N == 3 {
	return ab.x + ab.y + ab.z
	} else {
	return ab.x + ab.y + ab.z + ab.w
	}
	}

	@(require_results)
	new_vector_length2 :: proc "contextless" (a: $T/[$N]$E) -> (c: E) where IS_NUMERIC(E) #no_bounds_check {
	return #force_inline new_vector_dot(a, a)
	}

	@(require_results)
	new_vector_length :: proc "contextless" (a: $T/[$N]$E) -> (c: E) where IS_NUMERIC(E) #no_bounds_check {
	return intrinsics.sqrt(#force_inline new_vector_dot(a, a))
	}




	@(require_results)
	new_floor :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check {
	return _from_simd4(T, intrinsics.simd_floor(_to_simd4(a)))
	}

	@(require_results)
	new_ceil :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check {
	return _from_simd4(T, intrinsics.simd_ceil(_to_simd4(a)))
	}

	@(require_results)
	new_trunc :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check {
	return _from_simd4(T, intrinsics.simd_trunc(_to_simd4(a)))
	}



	_to_simd4 :: #force_inline proc "contextless" (a: $T) -> (out: #simd[4]ELEM_TYPE(T)) where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check {
	when IS_ARRAY(T) {
	when len(T) == 1 {
	_a: [4]ELEM_TYPE(T)
	_a.x = a.x
	return transmute(#simd[4]ELEM_TYPE(T))_a
	} else when len(T) == 2 {
	_a: [4]ELEM_TYPE(T)
	_a.xy = a
	return transmute(#simd[4]ELEM_TYPE(T))_a
	} else when len(T) == 3 {
	_a: [4]ELEM_TYPE(T)
	_a.xyz = a
	return transmute(#simd[4]ELEM_TYPE(T))_a
	} else {
	return transmute(#simd[4]ELEM_TYPE(T))a
	}
	} else {
	_a: [4]ELEM_TYPE(T)
	_a.x = a
	return transmute(#simd[4]ELEM_TYPE(T))_a
	}
	}

	_from_simd4 :: #force_inline proc "contextless" ($T: typeid, a: $V/#simd[4]$E) -> T where IS_NUMERIC(ELEM_TYPE(T)) #no_bounds_check {
	when IS_ARRAY(T) {
	when len(T) == 1 {
	return (transmute([4]ELEM_TYPE(T))a).x
	} else when len(T) == 2 {
	return (transmute([4]ELEM_TYPE(T))a).xy
	} else when len(T) == 3 {
	return (transmute([4]ELEM_TYPE(T))a).xyz
	} else {
	return transmute([4]ELEM_TYPE(T))a
	}
	} else {
	return (transmute([4]ELEM_TYPE(T))a).x
	}
	}
No results found