Created
October 29, 2025 06:21
-
-
Save Pikachuxxxx/7942e6e385b8a794b2febcafa434530e to your computer and use it in GitHub Desktop.
just some basic ARM neon intrinsics practice
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // neon_practice.c | |
| // Practice file for learning ARM NEON intrinsics | |
| // Compile with: clang -O3 neon_practice.c -o neon_practice | |
| #include <arm_neon.h> | |
| #include <stdio.h> | |
| // --------------------------------------------------------- | |
| // Exercise 1: Vector addition | |
| // Task: Add two float32x4_t vectors element-wise | |
| // Intrinsics to use: vaddq_f32() | |
| // --------------------------------------------------------- | |
/*
 * Exercise 1: loads two 4-lane float vectors and prints their element-wise
 * sum, then their element-wise product.
 */
void exercise1_vector_addition()
{
    const float lhs[4] = {1, 2, 3, 4};
    const float rhs[4] = {5, 6, 7, 8};
    float out[4];

    const float32x4_t va = vld1q_f32(lhs);
    const float32x4_t vb = vld1q_f32(rhs);

    /* One scratch buffer is reused to spill each vector for printing. */
    vst1q_f32(out, vaddq_f32(va, vb));
    printf("vector add: %.2f, %.2f, %.2f, %.2f \n", out[0], out[1], out[2], out[3]);

    vst1q_f32(out, vmulq_f32(va, vb));
    printf("vector mul: %.2f, %.2f, %.2f, %.2f \n", out[0], out[1], out[2], out[3]);
}
| // --------------------------------------------------------- | |
| // Exercise 2: Multiply and accumulate | |
| // Task: result = a * b + c | |
| // Intrinsics to use: vmulq_f32(), vaddq_f32(), or vmlaq_f32() | |
| // --------------------------------------------------------- | |
/*
 * Exercise 2: computes a * b + c per lane in two ways -- with the
 * multiply-accumulate intrinsic and with a separate multiply and add --
 * and prints both results.
 */
void exercise2_mac()
{
    const float factor_a[4] = {1, 2, 3, 4};
    const float factor_b[4] = {5, 6, 7, 8};
    const float addend[4] = {2, 4, 6, 8};
    float out[4];

    const float32x4_t va = vld1q_f32(factor_a);
    const float32x4_t vb = vld1q_f32(factor_b);
    const float32x4_t vacc = vld1q_f32(addend);

    /* vmlaq_f32(acc, x, y) yields acc + x * y in every lane. */
    vst1q_f32(out, vmlaq_f32(vacc, va, vb));
    printf("vector fma: %.2f, %.2f, %.2f, %.2f \n", out[0], out[1], out[2], out[3]);

    /* The same expression spelled out as an explicit multiply followed by an add. */
    const float32x4_t product = vmulq_f32(va, vb);
    vst1q_f32(out, vaddq_f32(product, vacc));
    printf("manual fma: %.2f, %.2f, %.2f, %.2f \n", out[0], out[1], out[2], out[3]);
}
| // --------------------------------------------------------- | |
| // Exercise 3: Reciprocal approximation | |
| // Task: Compute 1/x for a vector using NEON reciprocal estimate | |
| // Intrinsics to use: vrecpeq_f32(), vrecpsq_f32() for refinement | |
| // --------------------------------------------------------- | |
/*
 * Exercise 3: approximates 1/x per lane with the NEON reciprocal estimate
 * and prints the raw estimate (b0) followed by the results of one (b1) and
 * two (b2) Newton-Raphson refinement steps.
 */
void exercise3_reciprocal()
{
    float a[4] = {2, 4, 6, 8};
    float result[4];
    float32x4_t A = vld1q_f32(a);

    /* Low-precision starting estimate of 1/x. */
    float32x4_t b0 = vrecpeq_f32(A);
    /*
     * vrecpsq_f32(x, r) returns (2 - x * r), so r * vrecpsq_f32(x, r) is one
     * Newton-Raphson step for the reciprocal: r' = r * (2 - x * r).
     */
    float32x4_t b1 = vmulq_f32(b0, vrecpsq_f32(A, b0));
    float32x4_t b2 = vmulq_f32(b1, vrecpsq_f32(A, b1));

    vst1q_f32(result, b0);
    printf("recp b0: %.12f, %.12f, %.12f, %.12f \n", result[0], result[1], result[2], result[3]);
    vst1q_f32(result, b1);
    printf("recp b1: %.12f, %.12f, %.12f, %.12f \n", result[0], result[1], result[2], result[3]);
    /* Store the second refinement here; storing b1 again would print stale values. */
    vst1q_f32(result, b2);
    printf("recp b2: %.12f, %.12f, %.12f, %.12f \n", result[0], result[1], result[2], result[3]);
}
| // --------------------------------------------------------- | |
| // Exercise 4: Square root approximation | |
| // Task: Compute sqrt(x) using reciprocal sqrt estimate and Newton–Raphson | |
| // Intrinsics to use: vrsqrteq_f32(), vrsqrtsq_f32() | |
| // --------------------------------------------------------- | |
/*
 * Exercise 4: approximates sqrt(x) per lane from the NEON reciprocal
 * square-root estimate. Prints sqrt(x) derived from the raw estimate (b0)
 * and from one (b1) and two (b2) Newton-Raphson refinement steps.
 *
 * sqrt(x) is formed as x * (1/sqrt(x)); this assumes x > 0, which holds for
 * the hard-coded inputs (x == 0 would produce 0 * inf = NaN).
 */
void exercise4_sqrt()
{
    float a[4] = {2, 4, 11, 25};
    float result[4];
    float32x4_t A = vld1q_f32(a);

    /* Low-precision starting estimate of 1/sqrt(x). */
    float32x4_t r0 = vrsqrteq_f32(A);
    /*
     * vrsqrtsq_f32(p, q) returns (3 - p * q) / 2. The Newton-Raphson step for
     * the reciprocal square root is r' = r * (3 - x * r * r) / 2, so the
     * first operand must be x * r (not x alone) and the second r.
     */
    float32x4_t r1 = vmulq_f32(r0, vrsqrtsq_f32(vmulq_f32(A, r0), r0));
    float32x4_t r2 = vmulq_f32(r1, vrsqrtsq_f32(vmulq_f32(A, r1), r1));

    /* Convert each reciprocal-sqrt approximation into sqrt(x) = x * rsqrt(x). */
    float32x4_t b0 = vmulq_f32(A, r0);
    float32x4_t b1 = vmulq_f32(A, r1);
    float32x4_t b2 = vmulq_f32(A, r2);

    vst1q_f32(result, b0);
    printf("sqrt b0: %.12f, %.12f, %.12f, %.12f \n", result[0], result[1], result[2], result[3]);
    vst1q_f32(result, b1);
    printf("sqrt b1: %.12f, %.12f, %.12f, %.12f \n", result[0], result[1], result[2], result[3]);
    /* Store the second refinement here; storing b1 again would print stale values. */
    vst1q_f32(result, b2);
    printf("sqrt b2: %.12f, %.12f, %.12f, %.12f \n", result[0], result[1], result[2], result[3]);
}
| // --------------------------------------------------------- | |
| // Exercise 5: Dot product | |
| // Task: Compute dot product of two float32x4_t vectors | |
| // Intrinsics to use: vmulq_f32(), vaddvq_f32() | |
| // --------------------------------------------------------- | |
/*
 * Exercise 5: computes the dot product of two 4-lane float vectors by
 * multiplying lane-wise and then summing all lanes, and prints the scalar.
 */
void exercise5_dot_product()
{
    float a[4] = {1, 2, 3, 4};
    float b[4] = {5, 6, 7, 8};
    float32x4_t A = vld1q_f32(a);
    float32x4_t B = vld1q_f32(b);

    float32x4_t mul = vmulq_f32(A, B);
    /* vaddvq_f32 adds the four lanes together into one scalar. */
    float dot = vaddvq_f32(mul);
    printf("dot: %f \n", dot);
}
| // --------------------------------------------------------- | |
| // Exercise 6: Horizontal add (sum of all elements) | |
| // Intrinsics to use: vpaddq_f32(), vaddvq_f32() | |
| // --------------------------------------------------------- | |
/*
 * Exercise 6: demonstrates two horizontal additions. First the lane-wise
 * product of the two inputs is summed across all lanes into a scalar, then
 * the pairwise add of the two input vectors is printed.
 */
void exercise6_horizontal_add()
{
    const float lhs[4] = {1, 2, 3, 4};
    const float rhs[4] = {5, 6, 7, 8};
    float out[4];

    const float32x4_t va = vld1q_f32(lhs);
    const float32x4_t vb = vld1q_f32(rhs);

    /* Across-vector add: note the vector being reduced is the product va * vb. */
    const float lane_sum = vaddvq_f32(vmulq_f32(va, vb));
    printf("addv lane wise: %f \n", lane_sum);

    /* Pairwise add: adjacent lanes of va fill the low half, those of vb the high half. */
    vst1q_f32(out, vpaddq_f32(va, vb));
    printf("pairwise add: %.2f, %.2f, %.2f, %.2f \n", out[0], out[1], out[2], out[3]);
}
| // --------------------------------------------------------- | |
| // Exercise 7: Compare and mask | |
| // Task: Compare two vectors (a > b), store mask as result | |
| // Intrinsics to use: vcgtq_f32(), vandq_u32(), vbslq_f32() | |
| // --------------------------------------------------------- | |
/*
 * Exercise 7: compares two float vectors lane by lane (a > b) and prints
 * the resulting per-lane bit mask in hexadecimal.
 */
void exercise7_compare_mask()
{
    const float lhs[4] = {11, 2, 33, 4};
    const float rhs[4] = {5, 6, 7, 8};
    uint32_t lanes[4];

    const float32x4_t va = vld1q_f32(lhs);
    const float32x4_t vb = vld1q_f32(rhs);

    /* vcgtq_f32 sets every bit of a lane where lhs > rhs and clears it otherwise. */
    vst1q_u32(lanes, vcgtq_f32(va, vb));
    printf("mask: 0x%x, 0x%x, 0x%x, 0x%x \n", lanes[0], lanes[1], lanes[2], lanes[3]);
}
| // --------------------------------------------------------- | |
| // Exercise 8: Integer vector arithmetic | |
| // Task: Add, subtract, multiply uint32x4_t vectors | |
| // Intrinsics to use: vaddq_u32(), vsubq_u32(), vmulq_u32() | |
| // --------------------------------------------------------- | |
/*
 * Exercise 8: adds, subtracts and multiplies two 4-lane unsigned 32-bit
 * vectors and prints each result.
 */
void exercise8_int_math()
{
    const uint32_t lhs[4] = {11, 2, 33, 4};
    const uint32_t rhs[4] = {5, 6, 7, 8};
    uint32_t out[4];

    const uint32x4_t va = vld1q_u32(lhs);
    const uint32x4_t vb = vld1q_u32(rhs);

    vst1q_u32(out, vaddq_u32(va, vb));
    printf("add u32s: %u %u %u %u \n", out[0], out[1], out[2], out[3]);

    /* Lanes where lhs < rhs wrap around, because the arithmetic is unsigned. */
    vst1q_u32(out, vsubq_u32(va, vb));
    printf("sub u32s: %u %u %u %u \n", out[0], out[1], out[2], out[3]);

    vst1q_u32(out, vmulq_u32(va, vb));
    printf("mul u32s: %u %u %u %u \n", out[0], out[1], out[2], out[3]);
}
| // --------------------------------------------------------- | |
| // Exercise 9: Bitwise operations | |
| // Task: AND, OR, XOR of integer vectors | |
| // Intrinsics to use: vandq_u32(), vorrq_u32(), veorq_u32() | |
| // --------------------------------------------------------- | |
/*
 * Exercise 9: applies bitwise AND, OR and XOR to two 4-lane unsigned 32-bit
 * vectors and prints each result.
 */
void exercise9_bitwise()
{
    const uint32_t lhs[4] = {11, 2, 33, 4};
    const uint32_t rhs[4] = {5, 6, 7, 8};
    uint32_t out[4];

    const uint32x4_t va = vld1q_u32(lhs);
    const uint32x4_t vb = vld1q_u32(rhs);

    /* Locals avoid the names and/or/xor, which <iso646.h> defines as macros. */
    const uint32x4_t bits_and = vandq_u32(va, vb);
    const uint32x4_t bits_or = vorrq_u32(va, vb);
    const uint32x4_t bits_xor = veorq_u32(va, vb);

    vst1q_u32(out, bits_and);
    printf("AND u32s: %u %u %u %u \n", out[0], out[1], out[2], out[3]);

    vst1q_u32(out, bits_or);
    printf("OR u32s: %u %u %u %u \n", out[0], out[1], out[2], out[3]);

    vst1q_u32(out, bits_xor);
    printf("XOR u32s: %u %u %u %u \n", out[0], out[1], out[2], out[3]);
}
| // --------------------------------------------------------- | |
| // Exercise 10: Power (x^3) using repeated multiplication | |
| // Intrinsics to use: vmulq_f32() | |
| // --------------------------------------------------------- | |
/*
 * Exercise 10: raises each lane of a float vector to the third power by
 * repeated multiplication and prints the result.
 */
void exercise10_pow()
{
    const float base[4] = {1, 2, 3, 4};
    float out[4];

    const float32x4_t vx = vld1q_f32(base);

    /* (x * x) * x, evaluated lane-wise. */
    const float32x4_t squared = vmulq_f32(vx, vx);
    const float32x4_t cubed = vmulq_f32(squared, vx);

    vst1q_f32(out, cubed);
    printf("pow(n, 3): %.2f, %.2f, %.2f, %.2f \n", out[0], out[1], out[2], out[3]);
}
| // --------------------------------------------------------- | |
| // Exercise 11: Normalize vector (divide by its length) | |
| // Intrinsics to use: vmulq_f32(), vrsqrteq_f32(), vrsqrtsq_f32(), vdupq_n_f32() etc. | |
| // --------------------------------------------------------- | |
/*
 * Exercise 11: normalizes a 4-lane float vector by scaling it with the
 * reciprocal of its Euclidean length, then prints the result.
 *
 * The reciprocal length starts from the low-precision hardware estimate and
 * is refined with two Newton-Raphson steps so the printed components are
 * accurate. Assumes the vector is non-zero, which holds for the hard-coded
 * input (a zero vector would yield NaN lanes).
 */
void exercise11_normalize()
{
    float a[4] = {1, 2, 3, 4};
    float result[4];
    float32x4_t A = vld1q_f32(a);

    /* Squared length: sum of the squared lanes, broadcast to every lane. */
    float32x4_t AA = vmulq_f32(A, A);
    float32x4_t len_sq = vdupq_n_f32(vaddvq_f32(AA));

    /* Initial estimate of 1/sqrt(len_sq). */
    float32x4_t inv_len = vrsqrteq_f32(len_sq);
    /*
     * vrsqrtsq_f32(p, q) returns (3 - p * q) / 2, so with p = len_sq * r and
     * q = r each line below is one Newton-Raphson step r' = r * (3 - len_sq * r * r) / 2.
     */
    inv_len = vmulq_f32(inv_len, vrsqrtsq_f32(vmulq_f32(len_sq, inv_len), inv_len));
    inv_len = vmulq_f32(inv_len, vrsqrtsq_f32(vmulq_f32(len_sq, inv_len), inv_len));

    float32x4_t mul = vmulq_f32(A, inv_len);
    vst1q_f32(result, mul);
    printf("normalized vector: %.2f, %.2f, %.2f, %.2f \n", result[0], result[1], result[2], result[3]);
}
| int main() | |
| { | |
| exercise1_vector_addition(); | |
| exercise2_mac(); | |
| exercise3_reciprocal(); | |
| exercise4_sqrt(); | |
| exercise5_dot_product(); | |
| exercise6_horizontal_add(); | |
| exercise7_compare_mask(); | |
| exercise8_int_math(); | |
| exercise9_bitwise(); | |
| exercise10_pow(); | |
| exercise11_normalize(); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment