Created
July 28, 2025 05:16
-
-
Save vurtun/150c1071de3ca7773aad3319333b1cd4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <immintrin.h> // For AVX2 intrinsics | |
#include <stdio.h> | |
#include <float.h> // For FLT_MAX and FLT_MIN | |
#include <string.h> // For memset | |
#include <stdlib.h> // For exit | |
#include <math.h> // For fmaxf, fminf | |
#include <pthread.h> // For pthreads (example, assuming your pool uses similar mechanisms) | |
#include <assert.h> | |
#define EPSILON 1e-6f // Define a small epsilon to handle floating point inaccuracies | |
#define NUM_AABBS (128 * 1024) // Number of AABBs in our global array | |
/* Plain 3-component vector; used for both positions and directions. */
struct vec3 {
  float x, y, z;
};
/* A ray defined by an origin point and a direction vector.
 * The direction is not required to be normalized by the hit kernel. */
struct ray {
  struct vec3 origin;
  struct vec3 direction;
};
/* Global structure-of-arrays AABB store with a free-list and a result bitset.
 * Slots are recycled through free_idx (a stack of unused indices); the SoA
 * float layout lets the AVX2 kernel load 8 boxes per axis in one instruction.
 * NOTE(review): nothing here guarantees 32-byte alignment of the float
 * arrays (aabb_min_x starts at offset 4 + 4*NUM_AABBS), yet the hit kernel
 * uses aligned _mm256_load_ps on them — confirm alignment or switch those
 * loads to _mm256_loadu_ps. */
struct sce_pck_bvh {
  unsigned free_idx_cnt;        /* number of valid entries in free_idx */
  unsigned free_idx[NUM_AABBS]; /* stack of currently unused slot indices */
  float aabb_min_x[NUM_AABBS];  /* per-axis AABB minima (SoA layout) */
  float aabb_min_y[NUM_AABBS];
  float aabb_min_z[NUM_AABBS];
  float aabb_max_x[NUM_AABBS];  /* per-axis AABB maxima (SoA layout) */
  float aabb_max_y[NUM_AABBS];
  float aabb_max_z[NUM_AABBS];
  /* One bit per AABB; bit set = the last ray query hit that box. */
  unsigned long long hit_bitset[(NUM_AABBS + 63) / 64];
} sce_pck_bvh;
static void | |
sce_pck_bvh_init(struct sce_pck_bvh *bvh) { | |
assert(bvh); | |
bvh->free_idx_cnt = NUM_AABBS; | |
for (int i = 0; i < NUM_AABBS; ++i) { | |
bvh->free_idx[i] = NUM_AABBS - i - 1; | |
} | |
} | |
static int | |
sce_pck_bvh_add(struct sce_pck_bvh *bvh, | |
float min_x, float min_y, float min_z, | |
float max_x, float max_y, float max_z) { | |
assert(bvh); | |
int idx = bvh->free_idx[--bvh->free_idx_cnt]; | |
assert(idx < NUM_AABBS); | |
bvh->aabb_min_x[idx] = min_x; | |
bvh->aabb_min_y[idx] = min_y; | |
bvh->aabb_min_z[idx] = min_z; | |
bvh->aabb_max_x[idx] = max_x; | |
bvh->aabb_max_y[idx] = max_y; | |
bvh->aabb_max_z[idx] = max_z; | |
return idx; | |
} | |
static void | |
sce_pck_bvh_rm(struct sce_pck_bvh *bvh, int idx) { | |
assert(bvh); | |
assert(idx < NUM_AABBS); | |
bvh->free_idx[bvh->free_idx_cnt++] = idx; | |
bvh->aabb_min_x[idx] = 0.0f; | |
bvh->aabb_min_y[idx] = 0.0f; | |
bvh->aabb_min_z[idx] = 0.0f; | |
bvh->aabb_max_x[idx] = 0.0f; | |
bvh->aabb_max_y[idx] = 0.0f; | |
bvh->aabb_max_z[idx] = 0.0f; | |
} | |
/* Per-thread work item for the parallel ray query: a half-open index range
 * [start_idx, end_idx) into the global AABB arrays plus a private copy of
 * the ray (copied so workers never share mutable state). */
struct sce_pck_bvh_thrd_tsk {
  int start_idx;           /* first AABB index this thread tests */
  int end_idx;             /* one past the last AABB index (exclusive) */
  struct ray current_ray;  /* ray to test, by value */
};
static void* | |
sce_pck_bvh_ray_hit_thrd(void *arg) { | |
struct sce_pck_bvh_thrd_tsk *tsk = (struct sce_pck_bvh_thrd_tsk*)arg; | |
int start_idx = tsk->start_idx; | |
int end_idx = tsk->end_idx; // Exclusive end index | |
struct ray current_ray = tsk->current_ray; | |
__m256 ray_ox_v_g = _mm256_set1_ps(current_ray.origin.x); | |
__m256 ray_oy_v_g = _mm256_set1_ps(current_ray.origin.y); | |
__m256 ray_oz_v_g = _mm256_set1_ps(current_ray.origin.z); | |
float inv_dir_x_val = 1.0f / (current_ray.direction.x + EPSILON * (current_ray.direction.x == 0.0f ? 1.0f : 0.0f)); | |
float inv_dir_y_val = 1.0f / (current_ray.direction.y + EPSILON * (current_ray.direction.y == 0.0f ? 1.0f : 0.0f)); | |
float inv_dir_z_val = 1.0f / (current_ray.direction.z + EPSILON * (current_ray.direction.z == 0.0f ? 1.0f : 0.0f)); | |
__m256 inv_dir_x_v_g = _mm256_set1_ps(inv_dir_x_val); | |
__m256 inv_dir_y_v_g = _mm256_set1_ps(inv_dir_y_val); | |
__m256 inv_dir_z_v_g = _mm256_set1_ps(inv_dir_z_val); | |
__m256 t_min_initial_v_g = _mm256_set1_ps(-FLT_MAX); | |
__m256 t_max_initial_v_g = _mm256_set1_ps(FLT_MAX); | |
__m256 epsilon_v_g = _mm256_set1_ps(EPSILON); | |
// Loop through AABBs 8 at a time | |
for (int i = start_idx; i < end_idx; i += 8) { | |
// Load 8 min/max components for the current axis | |
__m256 aabb_min_x_v = _mm256_load_ps(&sce_pck_bvh.aabb_min_x[i]); | |
__m256 aabb_min_y_v = _mm256_load_ps(&sce_pck_bvh.aabb_min_y[i]); | |
__m256 aabb_min_z_v = _mm256_load_ps(&sce_pck_bvh.aabb_min_z[i]); | |
__m256 aabb_max_x_v = _mm256_load_ps(&sce_pck_bvh.aabb_max_x[i]); | |
__m256 aabb_max_y_v = _mm256_load_ps(&sce_pck_bvh.aabb_max_y[i]); | |
__m256 aabb_max_z_v = _mm256_load_ps(&sce_pck_bvh.aabb_max_z[i]); | |
// Initialize t_min and t_max for the current batch of 8 AABBs | |
__m256 t_min_v = t_min_initial_v_g; | |
__m256 t_max_v = t_max_initial_v_g; | |
// X-axis slab | |
__m256 t1_x_v = _mm256_mul_ps(_mm256_sub_ps(aabb_min_x_v, ray_ox_v_g), inv_dir_x_v_g); | |
__m256 t2_x_v = _mm256_mul_ps(_mm256_sub_ps(aabb_max_x_v, ray_ox_v_g), inv_dir_x_v_g); | |
t_min_v = _mm256_max_ps(t_min_v, _mm256_min_ps(t1_x_v, t2_x_v)); | |
t_max_v = _mm256_min_ps(t_max_v, _mm256_max_ps(t1_x_v, t2_x_v)); | |
// Y-axis slab | |
__m256 t1_y_v = _mm256_mul_ps(_mm256_sub_ps(aabb_min_y_v, ray_oy_v_g), inv_dir_y_v_g); | |
__m256 t2_y_v = _mm256_mul_ps(_mm256_sub_ps(aabb_max_y_v, ray_oy_v_g), inv_dir_y_v_g); | |
t_min_v = _mm256_max_ps(t_min_v, _mm256_min_ps(t1_y_v, t2_y_v)); | |
t_max_v = _mm256_min_ps(t_max_v, _mm256_max_ps(t1_y_v, t2_y_v)); | |
// Z-axis slab | |
__m256 t1_z_v = _mm256_mul_ps(_mm256_sub_ps(aabb_min_z_v, ray_oz_v_g), inv_dir_z_v_g); | |
__m256 t2_z_v = _mm256_mul_ps(_mm256_sub_ps(aabb_max_z_v, ray_oz_v_g), inv_dir_z_v_g); | |
t_min_v = _mm256_max_ps(t_min_v, _mm256_min_ps(t1_z_v, t2_z_v)); | |
t_max_v = _mm256_min_ps(t_max_v, _mm256_max_ps(t1_z_v, t2_z_v)); | |
// Final intersection check for each of the 8 lanes: | |
__m256 intersect_mask1 = _mm256_cmp_ps(t_min_v, t_max_v, _CMP_LE_OQ); | |
__m256 intersect_mask2 = _mm256_cmp_ps(t_max_v, epsilon_v_g, _CMP_GT_OQ); | |
__m256 final_intersect_mask = _mm256_and_ps(intersect_mask1, intersect_mask2); | |
unsigned mask = _mm256_movemask_ps(final_intersect_mask); | |
// Store results in the global hit_bitset | |
unsigned bitset_idx = i >> 6; | |
unsigned bit_offset = (i & 63); | |
// Update the hit_bitset for this 8-AABB batch. | |
// This implicitly assumes that each thread operates on a range of AABBs | |
// that ensures they don't step on each other's toes within the same unsigned long long. | |
// For correct division to avoid false sharing, ranges should be multiples of 64 AABBs. | |
// However, a simpler correct approach to avoid data races when writing to the *same* unsigned long long | |
// (if not aligning tasks to 64-AABB boundaries) would be to use an atomic OR, | |
// but that would add overhead. | |
// Assuming tasks are aligned to at least 64-AABB boundaries to ensure distinct ull writes. | |
sce_pck_bvh.hit_bitset[bitset_idx] |= ((unsigned long long)mask << bit_offset); | |
} | |
return 0; | |
} | |
static void | |
sce_pck_bvh_ray_hit(struct sce_pck_bvh *bvh, struct vec3 ray_origin, struct vec3 ray_direction, int num_threads) { | |
// Clear the hit bitset before starting a new check | |
memset(bvh->hit_bitset, 0, sizeof(bvh->hit_bitset)); | |
// Calculate work distribution | |
int aabbs_per_thread = NUM_AABBS / num_threads; | |
int remainder = NUM_AABBS % num_threads; | |
pthread_t thrd[num_threads]; | |
struct sce_pck_bvh_thrd_tsk tsk[num_threads]; | |
int current_start_idx = 0; | |
for (int i = 0; i < num_threads; ++i) { | |
tsk[i].start_idx = current_start_idx; | |
int num_for_this_thread = aabbs_per_thread; | |
if (i < remainder) { | |
num_for_this_thread++; // Distribute remainder evenly | |
} | |
tsk[i].end_idx = tsk[i].start_idx + num_for_this_thread; | |
// Ensure alignment to 8 (AVX2 batch size) for simplicity, | |
// and ideally 64 (cache line for ull) to avoid false sharing on bitset. | |
// For production code, adjust end_idx to be a multiple of 8 or 64. | |
// For this example, simple division is used, but a "remainder" thread handles leftover. | |
// We ensure start_idx is always multiple of 8. | |
tsk[i].start_idx = (tsk[i].start_idx / 8) * 8; | |
tsk[i].end_idx = (tsk[i].end_idx / 8) * 8; // round down to nearest 8 | |
if (i == num_threads - 1) { // last thread takes all remaining | |
tsk[i].end_idx = NUM_AABBS; | |
} | |
tsk[i].current_ray.origin = ray_origin; | |
tsk[i].current_ray.direction = ray_direction; | |
pthread_create(&thrd[i], 0, sce_pck_bvh_ray_hit_thrd, &tsk[i]); | |
current_start_idx = tsk[i].end_idx; | |
} | |
// Wait for all threads to complete | |
for (int i = 0; i < num_threads; ++i) { | |
pthread_join(thrd[i], 0); | |
} | |
} | |
//----------------------------------------------------------------------------- | |
// | |
// Test | |
// | |
//----------------------------------------------------------------------------- | |
#include <time.h> // For clock_gettime | |
#define NSEC_PER_SEC 1000000000.0 | |
static void | |
print_hit_bitset(unsigned int num_to_print) { | |
printf("Hit Bitset Results (first %u AABBs):\n", num_to_print); | |
for (unsigned int i = 0; i < num_to_print; ++i) { | |
unsigned int bitset_idx = i / 64; | |
unsigned int bit_pos = i % 64; | |
if ((sce_pck_bvh.hit_bitset[bitset_idx] >> bit_pos) & 1ULL) { | |
printf(" AABB %u: HIT\n", i); | |
} else { | |
// printf(" AABB %u: MISS\n", i); // Uncomment to see misses | |
} | |
} | |
} | |
extern int | |
main(void) { | |
sce_pck_bvh_init(&sce_pck_bvh); | |
// AABB 0: Intersects with ray (0,0,0) -> (1,0,0) | |
sce_pck_bvh_add(&sce_pck_bvh, 2.0f, -1.0f, -1.0f, 4.0f, 1.0f, 1.0f); | |
sce_pck_bvh_add(&sce_pck_bvh, 2.0f, 5.0f, -1.0f, 4.0f, 6.0f, 1.0f); // Miss | |
sce_pck_bvh_add(&sce_pck_bvh, 1.0f, -0.5f, -0.5f, 2.0f, 0.5f, 0.5f); // Hit | |
sce_pck_bvh_add(&sce_pck_bvh, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f); // Ray originates inside | |
sce_pck_bvh_add(&sce_pck_bvh, 0.0f, 1.0f, 1.0f, 0.01f, 2.0f, 2.0f); // Parallel miss | |
sce_pck_bvh_add(&sce_pck_bvh, 10.0f, -1.0f, -1.0f, 12.0f, 1.0f, 1.0f); // Hit (end of first ull) | |
sce_pck_bvh_add(&sce_pck_bvh, 15.0f, -1.0f, -1.0f, 17.0f, 1.0f, 1.0f); // Hit (start of second ull) | |
struct ray test_ray = {{0.0f, 0.0f, 0.0f}, {1.0f, 0.0f, 0.0f}}; // Ray along positive X-axis | |
printf("Checking ray ((%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)) against all AABBs...\n", | |
test_ray.origin.x, test_ray.origin.y, test_ray.origin.z, | |
test_ray.direction.x, test_ray.direction.y, test_ray.direction.z); | |
struct timespec start, end; | |
double elapsed_time; | |
// --- Single-core measurement --- | |
printf("\n--- Single-core AVX2 performance ---\n"); | |
clock_gettime(CLOCK_MONOTONIC, &start); | |
// Directly call the single-threaded function from the previous version for comparison | |
// We'll simulate it by calling the multi-threaded function with 1 thread | |
sce_pck_bvh_ray_hit(&sce_pck_bvh, test_ray.origin, test_ray.direction, 1); | |
clock_gettime(CLOCK_MONOTONIC, &end); | |
elapsed_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / NSEC_PER_SEC; | |
printf("Single-core execution time: %.6f microseconds\n", elapsed_time * 1e6); | |
print_hit_bitset(NUM_AABBS < 100 ? NUM_AABBS : 100); | |
// --- Multithreaded measurement --- | |
int num_physical_cores = 8; // Example: Adjust based on your CPU's actual physical cores | |
printf("\n--- Multithreaded AVX2 performance (%d threads) ---\n", num_physical_cores); | |
clock_gettime(CLOCK_MONOTONIC, &start); | |
sce_pck_bvh_ray_hit(&sce_pck_bvh, test_ray.origin, test_ray.direction, num_physical_cores); | |
clock_gettime(CLOCK_MONOTONIC, &end); | |
elapsed_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / NSEC_PER_SEC; | |
printf("Multithreaded execution time: %.6f microseconds\n", elapsed_time * 1e6); | |
print_hit_bitset(NUM_AABBS < 100 ? NUM_AABBS : 100); | |
printf("\nFinished. Check the global hit_bitset for all %u AABB results.\n", NUM_AABBS); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment