Skip to content

Instantly share code, notes, and snippets.

@vurtun
Created July 28, 2025 05:16
Show Gist options
  • Save vurtun/150c1071de3ca7773aad3319333b1cd4 to your computer and use it in GitHub Desktop.
Save vurtun/150c1071de3ca7773aad3319333b1cd4 to your computer and use it in GitHub Desktop.
#include <immintrin.h> // For AVX2 intrinsics
#include <stdio.h>
#include <float.h> // For FLT_MAX and FLT_MIN
#include <string.h> // For memset
#include <stdlib.h> // For exit
#include <math.h> // For fmaxf, fminf
#include <pthread.h> // For pthreads (example, assuming your pool uses similar mechanisms)
#include <assert.h>
#define EPSILON 1e-6f // Define a small epsilon to handle floating point inaccuracies
#define NUM_AABBS (128 * 1024) // Number of AABBs in our global array
struct vec3 {
float x, y, z;
};
struct ray {
struct vec3 origin;
struct vec3 direction;
};
struct sce_pck_bvh {
unsigned free_idx_cnt;
unsigned free_idx[NUM_AABBS];
float aabb_min_x[NUM_AABBS];
float aabb_min_y[NUM_AABBS];
float aabb_min_z[NUM_AABBS];
float aabb_max_x[NUM_AABBS];
float aabb_max_y[NUM_AABBS];
float aabb_max_z[NUM_AABBS];
unsigned long long hit_bitset[(NUM_AABBS + 63) / 64];
} sce_pck_bvh;
static void
sce_pck_bvh_init(struct sce_pck_bvh *bvh) {
assert(bvh);
bvh->free_idx_cnt = NUM_AABBS;
for (int i = 0; i < NUM_AABBS; ++i) {
bvh->free_idx[i] = NUM_AABBS - i - 1;
}
}
static int
sce_pck_bvh_add(struct sce_pck_bvh *bvh,
float min_x, float min_y, float min_z,
float max_x, float max_y, float max_z) {
assert(bvh);
int idx = bvh->free_idx[--bvh->free_idx_cnt];
assert(idx < NUM_AABBS);
bvh->aabb_min_x[idx] = min_x;
bvh->aabb_min_y[idx] = min_y;
bvh->aabb_min_z[idx] = min_z;
bvh->aabb_max_x[idx] = max_x;
bvh->aabb_max_y[idx] = max_y;
bvh->aabb_max_z[idx] = max_z;
return idx;
}
static void
sce_pck_bvh_rm(struct sce_pck_bvh *bvh, int idx) {
assert(bvh);
assert(idx < NUM_AABBS);
bvh->free_idx[bvh->free_idx_cnt++] = idx;
bvh->aabb_min_x[idx] = 0.0f;
bvh->aabb_min_y[idx] = 0.0f;
bvh->aabb_min_z[idx] = 0.0f;
bvh->aabb_max_x[idx] = 0.0f;
bvh->aabb_max_y[idx] = 0.0f;
bvh->aabb_max_z[idx] = 0.0f;
}
struct sce_pck_bvh_thrd_tsk {
int start_idx;
int end_idx;
struct ray current_ray;
};
static void*
sce_pck_bvh_ray_hit_thrd(void *arg) {
struct sce_pck_bvh_thrd_tsk *tsk = (struct sce_pck_bvh_thrd_tsk*)arg;
int start_idx = tsk->start_idx;
int end_idx = tsk->end_idx; // Exclusive end index
struct ray current_ray = tsk->current_ray;
__m256 ray_ox_v_g = _mm256_set1_ps(current_ray.origin.x);
__m256 ray_oy_v_g = _mm256_set1_ps(current_ray.origin.y);
__m256 ray_oz_v_g = _mm256_set1_ps(current_ray.origin.z);
float inv_dir_x_val = 1.0f / (current_ray.direction.x + EPSILON * (current_ray.direction.x == 0.0f ? 1.0f : 0.0f));
float inv_dir_y_val = 1.0f / (current_ray.direction.y + EPSILON * (current_ray.direction.y == 0.0f ? 1.0f : 0.0f));
float inv_dir_z_val = 1.0f / (current_ray.direction.z + EPSILON * (current_ray.direction.z == 0.0f ? 1.0f : 0.0f));
__m256 inv_dir_x_v_g = _mm256_set1_ps(inv_dir_x_val);
__m256 inv_dir_y_v_g = _mm256_set1_ps(inv_dir_y_val);
__m256 inv_dir_z_v_g = _mm256_set1_ps(inv_dir_z_val);
__m256 t_min_initial_v_g = _mm256_set1_ps(-FLT_MAX);
__m256 t_max_initial_v_g = _mm256_set1_ps(FLT_MAX);
__m256 epsilon_v_g = _mm256_set1_ps(EPSILON);
// Loop through AABBs 8 at a time
for (int i = start_idx; i < end_idx; i += 8) {
// Load 8 min/max components for the current axis
__m256 aabb_min_x_v = _mm256_load_ps(&sce_pck_bvh.aabb_min_x[i]);
__m256 aabb_min_y_v = _mm256_load_ps(&sce_pck_bvh.aabb_min_y[i]);
__m256 aabb_min_z_v = _mm256_load_ps(&sce_pck_bvh.aabb_min_z[i]);
__m256 aabb_max_x_v = _mm256_load_ps(&sce_pck_bvh.aabb_max_x[i]);
__m256 aabb_max_y_v = _mm256_load_ps(&sce_pck_bvh.aabb_max_y[i]);
__m256 aabb_max_z_v = _mm256_load_ps(&sce_pck_bvh.aabb_max_z[i]);
// Initialize t_min and t_max for the current batch of 8 AABBs
__m256 t_min_v = t_min_initial_v_g;
__m256 t_max_v = t_max_initial_v_g;
// X-axis slab
__m256 t1_x_v = _mm256_mul_ps(_mm256_sub_ps(aabb_min_x_v, ray_ox_v_g), inv_dir_x_v_g);
__m256 t2_x_v = _mm256_mul_ps(_mm256_sub_ps(aabb_max_x_v, ray_ox_v_g), inv_dir_x_v_g);
t_min_v = _mm256_max_ps(t_min_v, _mm256_min_ps(t1_x_v, t2_x_v));
t_max_v = _mm256_min_ps(t_max_v, _mm256_max_ps(t1_x_v, t2_x_v));
// Y-axis slab
__m256 t1_y_v = _mm256_mul_ps(_mm256_sub_ps(aabb_min_y_v, ray_oy_v_g), inv_dir_y_v_g);
__m256 t2_y_v = _mm256_mul_ps(_mm256_sub_ps(aabb_max_y_v, ray_oy_v_g), inv_dir_y_v_g);
t_min_v = _mm256_max_ps(t_min_v, _mm256_min_ps(t1_y_v, t2_y_v));
t_max_v = _mm256_min_ps(t_max_v, _mm256_max_ps(t1_y_v, t2_y_v));
// Z-axis slab
__m256 t1_z_v = _mm256_mul_ps(_mm256_sub_ps(aabb_min_z_v, ray_oz_v_g), inv_dir_z_v_g);
__m256 t2_z_v = _mm256_mul_ps(_mm256_sub_ps(aabb_max_z_v, ray_oz_v_g), inv_dir_z_v_g);
t_min_v = _mm256_max_ps(t_min_v, _mm256_min_ps(t1_z_v, t2_z_v));
t_max_v = _mm256_min_ps(t_max_v, _mm256_max_ps(t1_z_v, t2_z_v));
// Final intersection check for each of the 8 lanes:
__m256 intersect_mask1 = _mm256_cmp_ps(t_min_v, t_max_v, _CMP_LE_OQ);
__m256 intersect_mask2 = _mm256_cmp_ps(t_max_v, epsilon_v_g, _CMP_GT_OQ);
__m256 final_intersect_mask = _mm256_and_ps(intersect_mask1, intersect_mask2);
unsigned mask = _mm256_movemask_ps(final_intersect_mask);
// Store results in the global hit_bitset
unsigned bitset_idx = i >> 6;
unsigned bit_offset = (i & 63);
// Update the hit_bitset for this 8-AABB batch.
// This implicitly assumes that each thread operates on a range of AABBs
// that ensures they don't step on each other's toes within the same unsigned long long.
// For correct division to avoid false sharing, ranges should be multiples of 64 AABBs.
// However, a simpler correct approach to avoid data races when writing to the *same* unsigned long long
// (if not aligning tasks to 64-AABB boundaries) would be to use an atomic OR,
// but that would add overhead.
// Assuming tasks are aligned to at least 64-AABB boundaries to ensure distinct ull writes.
sce_pck_bvh.hit_bitset[bitset_idx] |= ((unsigned long long)mask << bit_offset);
}
return 0;
}
static void
sce_pck_bvh_ray_hit(struct sce_pck_bvh *bvh, struct vec3 ray_origin, struct vec3 ray_direction, int num_threads) {
// Clear the hit bitset before starting a new check
memset(bvh->hit_bitset, 0, sizeof(bvh->hit_bitset));
// Calculate work distribution
int aabbs_per_thread = NUM_AABBS / num_threads;
int remainder = NUM_AABBS % num_threads;
pthread_t thrd[num_threads];
struct sce_pck_bvh_thrd_tsk tsk[num_threads];
int current_start_idx = 0;
for (int i = 0; i < num_threads; ++i) {
tsk[i].start_idx = current_start_idx;
int num_for_this_thread = aabbs_per_thread;
if (i < remainder) {
num_for_this_thread++; // Distribute remainder evenly
}
tsk[i].end_idx = tsk[i].start_idx + num_for_this_thread;
// Ensure alignment to 8 (AVX2 batch size) for simplicity,
// and ideally 64 (cache line for ull) to avoid false sharing on bitset.
// For production code, adjust end_idx to be a multiple of 8 or 64.
// For this example, simple division is used, but a "remainder" thread handles leftover.
// We ensure start_idx is always multiple of 8.
tsk[i].start_idx = (tsk[i].start_idx / 8) * 8;
tsk[i].end_idx = (tsk[i].end_idx / 8) * 8; // round down to nearest 8
if (i == num_threads - 1) { // last thread takes all remaining
tsk[i].end_idx = NUM_AABBS;
}
tsk[i].current_ray.origin = ray_origin;
tsk[i].current_ray.direction = ray_direction;
pthread_create(&thrd[i], 0, sce_pck_bvh_ray_hit_thrd, &tsk[i]);
current_start_idx = tsk[i].end_idx;
}
// Wait for all threads to complete
for (int i = 0; i < num_threads; ++i) {
pthread_join(thrd[i], 0);
}
}
//-----------------------------------------------------------------------------
//
// Test
//
//-----------------------------------------------------------------------------
#include <time.h> // For clock_gettime
#define NSEC_PER_SEC 1000000000.0
static void
print_hit_bitset(unsigned int num_to_print) {
printf("Hit Bitset Results (first %u AABBs):\n", num_to_print);
for (unsigned int i = 0; i < num_to_print; ++i) {
unsigned int bitset_idx = i / 64;
unsigned int bit_pos = i % 64;
if ((sce_pck_bvh.hit_bitset[bitset_idx] >> bit_pos) & 1ULL) {
printf(" AABB %u: HIT\n", i);
} else {
// printf(" AABB %u: MISS\n", i); // Uncomment to see misses
}
}
}
extern int
main(void) {
sce_pck_bvh_init(&sce_pck_bvh);
// AABB 0: Intersects with ray (0,0,0) -> (1,0,0)
sce_pck_bvh_add(&sce_pck_bvh, 2.0f, -1.0f, -1.0f, 4.0f, 1.0f, 1.0f);
sce_pck_bvh_add(&sce_pck_bvh, 2.0f, 5.0f, -1.0f, 4.0f, 6.0f, 1.0f); // Miss
sce_pck_bvh_add(&sce_pck_bvh, 1.0f, -0.5f, -0.5f, 2.0f, 0.5f, 0.5f); // Hit
sce_pck_bvh_add(&sce_pck_bvh, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f); // Ray originates inside
sce_pck_bvh_add(&sce_pck_bvh, 0.0f, 1.0f, 1.0f, 0.01f, 2.0f, 2.0f); // Parallel miss
sce_pck_bvh_add(&sce_pck_bvh, 10.0f, -1.0f, -1.0f, 12.0f, 1.0f, 1.0f); // Hit (end of first ull)
sce_pck_bvh_add(&sce_pck_bvh, 15.0f, -1.0f, -1.0f, 17.0f, 1.0f, 1.0f); // Hit (start of second ull)
struct ray test_ray = {{0.0f, 0.0f, 0.0f}, {1.0f, 0.0f, 0.0f}}; // Ray along positive X-axis
printf("Checking ray ((%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)) against all AABBs...\n",
test_ray.origin.x, test_ray.origin.y, test_ray.origin.z,
test_ray.direction.x, test_ray.direction.y, test_ray.direction.z);
struct timespec start, end;
double elapsed_time;
// --- Single-core measurement ---
printf("\n--- Single-core AVX2 performance ---\n");
clock_gettime(CLOCK_MONOTONIC, &start);
// Directly call the single-threaded function from the previous version for comparison
// We'll simulate it by calling the multi-threaded function with 1 thread
sce_pck_bvh_ray_hit(&sce_pck_bvh, test_ray.origin, test_ray.direction, 1);
clock_gettime(CLOCK_MONOTONIC, &end);
elapsed_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / NSEC_PER_SEC;
printf("Single-core execution time: %.6f microseconds\n", elapsed_time * 1e6);
print_hit_bitset(NUM_AABBS < 100 ? NUM_AABBS : 100);
// --- Multithreaded measurement ---
int num_physical_cores = 8; // Example: Adjust based on your CPU's actual physical cores
printf("\n--- Multithreaded AVX2 performance (%d threads) ---\n", num_physical_cores);
clock_gettime(CLOCK_MONOTONIC, &start);
sce_pck_bvh_ray_hit(&sce_pck_bvh, test_ray.origin, test_ray.direction, num_physical_cores);
clock_gettime(CLOCK_MONOTONIC, &end);
elapsed_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / NSEC_PER_SEC;
printf("Multithreaded execution time: %.6f microseconds\n", elapsed_time * 1e6);
print_hit_bitset(NUM_AABBS < 100 ? NUM_AABBS : 100);
printf("\nFinished. Check the global hit_bitset for all %u AABB results.\n", NUM_AABBS);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment