Skip to content

Instantly share code, notes, and snippets.

@Pikachuxxxx
Created October 29, 2025 06:21
Show Gist options
  • Save Pikachuxxxx/7942e6e385b8a794b2febcafa434530e to your computer and use it in GitHub Desktop.
Save Pikachuxxxx/7942e6e385b8a794b2febcafa434530e to your computer and use it in GitHub Desktop.
Just some basic ARM NEON intrinsics practice.
// neon_practice.c
// Practice file for learning ARM NEON intrinsics
// Compile with: clang -O3 neon_practice.c -o neon_practice
// (needs an AArch64 target: vaddvq_f32() and vpaddq_f32() are A64-only intrinsics)
#include <arm_neon.h>
#include <stdio.h>
// ---------------------------------------------------------
// Exercise 1: Vector addition
// Task: Add (and, for comparison, multiply) two float32x4_t vectors element-wise
// Intrinsics to use: vaddq_f32(), vmulq_f32()
// ---------------------------------------------------------
// Loads two fixed 4-lane float vectors and prints their element-wise sum,
// then their element-wise product, one line each.
void exercise1_vector_addition()
{
    const float lhs[4] = {1, 2, 3, 4};
    const float rhs[4] = {5, 6, 7, 8};
    float out[4];

    const float32x4_t x = vld1q_f32(lhs);
    const float32x4_t y = vld1q_f32(rhs);

    // Each result is spilled to `out` right before printing, so the
    // same scratch buffer can be reused for both lines.
    vst1q_f32(out, vaddq_f32(x, y));
    printf("vector add: %.2f, %.2f, %.2f, %.2f \n", out[0], out[1], out[2], out[3]);

    vst1q_f32(out, vmulq_f32(x, y));
    printf("vector mul: %.2f, %.2f, %.2f, %.2f \n", out[0], out[1], out[2], out[3]);
}
// ---------------------------------------------------------
// Exercise 2: Multiply and accumulate
// Task: result = a * b + c
// Intrinsics to use: vmulq_f32(), vaddq_f32(), or vmlaq_f32()
// ---------------------------------------------------------
// Computes a * b + c lane-wise in two ways and prints both results:
// once via the multiply-accumulate intrinsic, once via an explicit
// multiply followed by an add.
void exercise2_mac()
{
    const float mul_lhs[4] = {1, 2, 3, 4};
    const float mul_rhs[4] = {5, 6, 7, 8};
    const float addend[4] = {2, 4, 6, 8};
    float out[4];

    const float32x4_t x = vld1q_f32(mul_lhs);
    const float32x4_t y = vld1q_f32(mul_rhs);
    const float32x4_t acc = vld1q_f32(addend);

    // vmlaq_f32 takes the accumulator first: acc + x * y.
    vst1q_f32(out, vmlaq_f32(acc, x, y));
    printf("vector fma: %.2f, %.2f, %.2f, %.2f \n", out[0], out[1], out[2], out[3]);

    // The same computation written as two separate operations.
    const float32x4_t product = vmulq_f32(x, y);
    vst1q_f32(out, vaddq_f32(product, acc));
    printf("manual fma: %.2f, %.2f, %.2f, %.2f \n", out[0], out[1], out[2], out[3]);
}
// ---------------------------------------------------------
// Exercise 3: Reciprocal approximation
// Task: Compute 1/x for a vector using NEON reciprocal estimate
// Intrinsics to use: vrecpeq_f32(), vrecpsq_f32() for refinement
// ---------------------------------------------------------
// Prints three successive approximations of 1/x for x = {2, 4, 6, 8}:
//   b0 - the raw hardware estimate,
//   b1 - b0 after one Newton-Raphson refinement,
//   b2 - b1 after a second refinement.
void exercise3_reciprocal()
{
    float a[4] = {2, 4, 6, 8};
    float result[4];
    float32x4_t A = vld1q_f32(a);

    float32x4_t b0 = vrecpeq_f32(A);
    // vrecpsq_f32(A, b) yields (2 - A*b); multiplying by b gives the
    // Newton-Raphson update b' = b * (2 - A*b) for the root of 1/b - A.
    float32x4_t b1 = vmulq_f32(b0, vrecpsq_f32(A, b0));
    float32x4_t b2 = vmulq_f32(b1, vrecpsq_f32(A, b1));

    vst1q_f32(result, b0);
    printf("recp b0: %.12f, %.12f, %.12f, %.12f \n", result[0], result[1], result[2], result[3]);
    vst1q_f32(result, b1);
    printf("recp b1: %.12f, %.12f, %.12f, %.12f \n", result[0], result[1], result[2], result[3]);
    // Store the second refinement here; storing b1 again would make the
    // "b2" line repeat the "b1" values.
    vst1q_f32(result, b2);
    printf("recp b2: %.12f, %.12f, %.12f, %.12f \n", result[0], result[1], result[2], result[3]);
}
// ---------------------------------------------------------
// Exercise 4: Square root approximation
// Task: Compute sqrt(x) using reciprocal sqrt estimate and Newton–Raphson
// Intrinsics to use: vrsqrteq_f32(), vrsqrtsq_f32()
// ---------------------------------------------------------
// Prints three successive approximations of sqrt(x) for x = {2, 4, 11, 25}.
// Each approximation is x * r, where r approximates 1/sqrt(x):
//   b0 - from the raw reciprocal-square-root estimate,
//   b1 - after one Newton-Raphson refinement of r,
//   b2 - after a second refinement of r.
void exercise4_sqrt()
{
    float a[4] = {2, 4, 11, 25};
    float result[4];
    float32x4_t A = vld1q_f32(a);

    float32x4_t r0 = vrsqrteq_f32(A);
    // vrsqrtsq_f32(p, q) yields (3 - p*q) / 2. The Newton-Raphson update
    // for 1/sqrt(A) is r' = r * (3 - A*r*r) / 2, so the first operand has
    // to be A*r (not A alone) for the step to converge to the right value.
    float32x4_t r1 = vmulq_f32(r0, vrsqrtsq_f32(vmulq_f32(A, r0), r0));
    float32x4_t r2 = vmulq_f32(r1, vrsqrtsq_f32(vmulq_f32(A, r1), r1));

    // sqrt(x) = x * (1 / sqrt(x)); convert each estimate before printing
    // so the values match the "sqrt" labels.
    vst1q_f32(result, vmulq_f32(A, r0));
    printf("sqrt b0: %.12f, %.12f, %.12f, %.12f \n", result[0], result[1], result[2], result[3]);
    vst1q_f32(result, vmulq_f32(A, r1));
    printf("sqrt b1: %.12f, %.12f, %.12f, %.12f \n", result[0], result[1], result[2], result[3]);
    vst1q_f32(result, vmulq_f32(A, r2));
    printf("sqrt b2: %.12f, %.12f, %.12f, %.12f \n", result[0], result[1], result[2], result[3]);
}
// ---------------------------------------------------------
// Exercise 5: Dot product
// Task: Compute dot product of two float32x4_t vectors
// Intrinsics to use: vmulq_f32(), vaddvq_f32()
// ---------------------------------------------------------
// Prints the dot product of {1, 2, 3, 4} and {5, 6, 7, 8}: the lanes are
// multiplied pairwise and the four products are summed into one scalar.
void exercise5_dot_product()
{
    float a[4] = {1, 2, 3, 4};
    float b[4] = {5, 6, 7, 8};
    float32x4_t A = vld1q_f32(a);
    float32x4_t B = vld1q_f32(b);

    float32x4_t mul = vmulq_f32(A, B);
    // vaddvq_f32 reduces across lanes, so no scratch array is needed.
    float dot = vaddvq_f32(mul);
    printf("dot: %f \n", dot);
}
// ---------------------------------------------------------
// Exercise 6: Horizontal add (sum of all elements)
// Intrinsics to use: vpaddq_f32(), vaddvq_f32()
// ---------------------------------------------------------
// Demonstrates two reductions on {1, 2, 3, 4} and {5, 6, 7, 8}:
// first the across-lane sum of their element-wise product, then the
// pairwise add of the two input vectors.
void exercise6_horizontal_add()
{
    const float first[4] = {1, 2, 3, 4};
    const float second[4] = {5, 6, 7, 8};
    float out[4];

    const float32x4_t x = vld1q_f32(first);
    const float32x4_t y = vld1q_f32(second);

    // Across-lane sum of the products, printed as a single scalar.
    const float lane_sum = vaddvq_f32(vmulq_f32(x, y));
    printf("addv lane wise: %f \n", lane_sum);

    // Pairwise add of the two inputs, printed lane by lane.
    vst1q_f32(out, vpaddq_f32(x, y));
    printf("pairwise add: %.2f, %.2f, %.2f, %.2f \n", out[0], out[1], out[2], out[3]);
}
// ---------------------------------------------------------
// Exercise 7: Compare and mask
// Task: Compare two vectors (a > b), store mask as result
// Intrinsics to use: vcgtq_f32(), vandq_u32(), vbslq_f32()
// ---------------------------------------------------------
// Compares {11, 2, 33, 4} against {5, 6, 7, 8} lane by lane with a
// greater-than test and prints the resulting per-lane mask in hex.
void exercise7_compare_mask()
{
    const float lhs[4] = {11, 2, 33, 4};
    const float rhs[4] = {5, 6, 7, 8};
    uint32_t lanes[4];

    // The comparison produces an unsigned integer vector, one mask word
    // per lane, which is spilled to memory for printing.
    const uint32x4_t gt = vcgtq_f32(vld1q_f32(lhs), vld1q_f32(rhs));
    vst1q_u32(lanes, gt);

    printf("mask: 0x%x, 0x%x, 0x%x, 0x%x \n", lanes[0], lanes[1], lanes[2], lanes[3]);
}
// ---------------------------------------------------------
// Exercise 8: Integer vector arithmetic
// Task: Add, subtract, multiply uint32x4_t vectors (unsigned, so subtraction wraps around)
// Intrinsics to use: vaddq_u32(), vsubq_u32(), vmulq_u32()
// ---------------------------------------------------------
// Prints the lane-wise sum, difference and product of two unsigned
// 32-bit vectors, {11, 2, 33, 4} and {5, 6, 7, 8}.
void exercise8_int_math()
{
    const uint32_t lhs[4] = {11, 2, 33, 4};
    const uint32_t rhs[4] = {5, 6, 7, 8};
    uint32_t out[4];

    const uint32x4_t x = vld1q_u32(lhs);
    const uint32x4_t y = vld1q_u32(rhs);

    vst1q_u32(out, vaddq_u32(x, y));
    printf("add u32s: %u %u %u %u \n", out[0], out[1], out[2], out[3]);

    // Lanes are unsigned: where lhs < rhs the difference is printed as
    // a large unsigned value rather than a negative number.
    vst1q_u32(out, vsubq_u32(x, y));
    printf("sub u32s: %u %u %u %u \n", out[0], out[1], out[2], out[3]);

    vst1q_u32(out, vmulq_u32(x, y));
    printf("mul u32s: %u %u %u %u \n", out[0], out[1], out[2], out[3]);
}
// ---------------------------------------------------------
// Exercise 9: Bitwise operations
// Task: AND, OR, XOR of integer vectors
// Intrinsics to use: vandq_u32(), vorrq_u32(), veorq_u32()
// ---------------------------------------------------------
// Prints the lane-wise bitwise AND, OR and XOR of two unsigned 32-bit
// vectors, {11, 2, 33, 4} and {5, 6, 7, 8}.
void exercise9_bitwise()
{
    const uint32_t lhs[4] = {11, 2, 33, 4};
    const uint32_t rhs[4] = {5, 6, 7, 8};
    uint32_t out[4];

    const uint32x4_t x = vld1q_u32(lhs);
    const uint32x4_t y = vld1q_u32(rhs);

    // Locals carry a `bits_` prefix so they cannot collide with the
    // <iso646.h> operator macros (and, or, xor).
    const uint32x4_t bits_and = vandq_u32(x, y);
    const uint32x4_t bits_or = vorrq_u32(x, y);
    const uint32x4_t bits_xor = veorq_u32(x, y);

    vst1q_u32(out, bits_and);
    printf("AND u32s: %u %u %u %u \n", out[0], out[1], out[2], out[3]);
    vst1q_u32(out, bits_or);
    printf("OR u32s: %u %u %u %u \n", out[0], out[1], out[2], out[3]);
    vst1q_u32(out, bits_xor);
    printf("XOR u32s: %u %u %u %u \n", out[0], out[1], out[2], out[3]);
}
// ---------------------------------------------------------
// Exercise 10: Power (x^n, here n = 3) using repeated multiplication
// Intrinsics to use: vmulq_f32()
// ---------------------------------------------------------
// Cubes each lane of {1, 2, 3, 4} with two successive multiplies and
// prints the result.
void exercise10_pow()
{
    const float base[4] = {1, 2, 3, 4};
    float out[4];

    const float32x4_t x = vld1q_f32(base);
    const float32x4_t squared = vmulq_f32(x, x);
    const float32x4_t cubed = vmulq_f32(squared, x);

    vst1q_f32(out, cubed);
    printf("pow(n, 3): %.2f, %.2f, %.2f, %.2f \n", out[0], out[1], out[2], out[3]);
}
// ---------------------------------------------------------
// Exercise 11: Normalize vector (divide by its length)
// Intrinsics to use: vmulq_f32(), vrsqrteq_f32(), vrsqrtsq_f32(), vdupq_n_f32() etc.
// ---------------------------------------------------------
// Prints {1, 2, 3, 4} scaled to unit length: every lane is multiplied by
// 1/sqrt(sum of squares), with the reciprocal square root obtained from
// the hardware estimate plus two Newton-Raphson refinements.
void exercise11_normalize()
{
    float a[4] = {1, 2, 3, 4};
    float result[4];
    float32x4_t A = vld1q_f32(a);

    // Squared length, broadcast to all four lanes so it can be used in
    // lane-wise arithmetic below.
    float32x4_t len_sq = vdupq_n_f32(vaddvq_f32(vmulq_f32(A, A)));

    // The raw estimate is only a coarse approximation; refine it with
    // r' = r * (3 - len_sq*r*r) / 2, where vrsqrtsq_f32(p, q) supplies
    // the (3 - p*q) / 2 factor.
    float32x4_t inv_len = vrsqrteq_f32(len_sq);
    inv_len = vmulq_f32(inv_len, vrsqrtsq_f32(vmulq_f32(len_sq, inv_len), inv_len));
    inv_len = vmulq_f32(inv_len, vrsqrtsq_f32(vmulq_f32(len_sq, inv_len), inv_len));

    float32x4_t mul = vmulq_f32(A, inv_len);
    vst1q_f32(result, mul);
    printf("normalized vector: %.2f, %.2f, %.2f, %.2f \n", result[0], result[1], result[2], result[3]);
}
// Entry point: runs every exercise once, in numerical order, and
// reports success to the caller.
int main()
{
    // Floating-point arithmetic and approximations.
    exercise1_vector_addition();
    exercise2_mac();
    exercise3_reciprocal();
    exercise4_sqrt();

    // Reductions and comparisons.
    exercise5_dot_product();
    exercise6_horizontal_add();
    exercise7_compare_mask();

    // Integer arithmetic and bitwise operations.
    exercise8_int_math();
    exercise9_bitwise();

    // Small composite routines built from the pieces above.
    exercise10_pow();
    exercise11_normalize();

    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment