vurtun · July 28, 2025 05:16
diff --git a/scene_obj_pick.c b/scene_obj_pick.c
 #include <immintrin.h> // For AVX2 intrinsics
 #include <stdio.h>
 #include <float.h>     // For FLT_MAX and FLT_MIN
 #include <string.h>    // For memset
 #include <stdlib.h>    // For exit
 #include <math.h>      // For fmaxf, fminf
 #include <pthread.h>   // For pthreads (example, assuming your pool uses similar mechanisms)
 #include <assert.h>

 #define EPSILON 1e-6f           // Define a small epsilon to handle floating point inaccuracies
 #define NUM_AABBS (128 * 1024)  // Number of AABBs in our global array

 struct vec3 {
  float x, y, z;
 };
 struct ray {
  struct vec3 origin;
  struct vec3 direction;
 };
 struct sce_pck_bvh {
  unsigned free_idx_cnt;
  unsigned free_idx[NUM_AABBS];
  float aabb_min_x[NUM_AABBS];
  float aabb_min_y[NUM_AABBS];
  float aabb_min_z[NUM_AABBS];
  float aabb_max_x[NUM_AABBS];
  float aabb_max_y[NUM_AABBS];
  float aabb_max_z[NUM_AABBS];
  unsigned long long hit_bitset[(NUM_AABBS + 63) / 64];
 } sce_pck_bvh;

 static void
 sce_pck_bvh_init(struct sce_pck_bvh *bvh) {
  assert(bvh);
  bvh->free_idx_cnt = NUM_AABBS;
  for (int i = 0; i < NUM_AABBS; ++i) {
    bvh->free_idx[i] = NUM_AABBS - i - 1;
  }
 }
 static int
 sce_pck_bvh_add(struct sce_pck_bvh *bvh,
  float min_x, float min_y, float min_z,
  float max_x, float max_y, float max_z) {

  assert(bvh);
  int idx = bvh->free_idx[--bvh->free_idx_cnt];
  assert(idx < NUM_AABBS);

  bvh->aabb_min_x[idx] = min_x;
  bvh->aabb_min_y[idx] = min_y;
  bvh->aabb_min_z[idx] = min_z;

  bvh->aabb_max_x[idx] = max_x;
  bvh->aabb_max_y[idx] = max_y;
  bvh->aabb_max_z[idx] = max_z;
  return idx;
 }
 static void
 sce_pck_bvh_rm(struct sce_pck_bvh *bvh, int idx) {
  assert(bvh);
  assert(idx < NUM_AABBS);
  bvh->free_idx[bvh->free_idx_cnt++] = idx;

  bvh->aabb_min_x[idx] = 0.0f;
  bvh->aabb_min_y[idx] = 0.0f;
  bvh->aabb_min_z[idx] = 0.0f;

  bvh->aabb_max_x[idx] = 0.0f;
  bvh->aabb_max_y[idx] = 0.0f;
  bvh->aabb_max_z[idx] = 0.0f;
 }
 struct sce_pck_bvh_thrd_tsk {
  int start_idx;
  int end_idx;
  struct ray current_ray;
 };
 static void*
 sce_pck_bvh_ray_hit_thrd(void *arg) {
  struct sce_pck_bvh_thrd_tsk *tsk = (struct sce_pck_bvh_thrd_tsk*)arg;
  int start_idx = tsk->start_idx;
  int end_idx = tsk->end_idx; // Exclusive end index
  struct ray current_ray = tsk->current_ray;

  __m256 ray_ox_v_g = _mm256_set1_ps(current_ray.origin.x);
  __m256 ray_oy_v_g = _mm256_set1_ps(current_ray.origin.y);
  __m256 ray_oz_v_g = _mm256_set1_ps(current_ray.origin.z);

  float inv_dir_x_val = 1.0f / (current_ray.direction.x + EPSILON * (current_ray.direction.x == 0.0f ? 1.0f : 0.0f));
  float inv_dir_y_val = 1.0f / (current_ray.direction.y + EPSILON * (current_ray.direction.y == 0.0f ? 1.0f : 0.0f));
  float inv_dir_z_val = 1.0f / (current_ray.direction.z + EPSILON * (current_ray.direction.z == 0.0f ? 1.0f : 0.0f));

  __m256 inv_dir_x_v_g = _mm256_set1_ps(inv_dir_x_val);
  __m256 inv_dir_y_v_g = _mm256_set1_ps(inv_dir_y_val);
  __m256 inv_dir_z_v_g = _mm256_set1_ps(inv_dir_z_val);

  __m256 t_min_initial_v_g = _mm256_set1_ps(-FLT_MAX);
  __m256 t_max_initial_v_g = _mm256_set1_ps(FLT_MAX);
  __m256 epsilon_v_g = _mm256_set1_ps(EPSILON);

  // Loop through AABBs 8 at a time
  for (int i = start_idx; i < end_idx; i += 8) {
    // Load 8 min/max components for the current axis
    __m256 aabb_min_x_v = _mm256_load_ps(&sce_pck_bvh.aabb_min_x[i]);
    __m256 aabb_min_y_v = _mm256_load_ps(&sce_pck_bvh.aabb_min_y[i]);
    __m256 aabb_min_z_v = _mm256_load_ps(&sce_pck_bvh.aabb_min_z[i]);

    __m256 aabb_max_x_v = _mm256_load_ps(&sce_pck_bvh.aabb_max_x[i]);
    __m256 aabb_max_y_v = _mm256_load_ps(&sce_pck_bvh.aabb_max_y[i]);
    __m256 aabb_max_z_v = _mm256_load_ps(&sce_pck_bvh.aabb_max_z[i]);

    // Initialize t_min and t_max for the current batch of 8 AABBs
    __m256 t_min_v = t_min_initial_v_g;
    __m256 t_max_v = t_max_initial_v_g;

    // X-axis slab
    __m256 t1_x_v = _mm256_mul_ps(_mm256_sub_ps(aabb_min_x_v, ray_ox_v_g), inv_dir_x_v_g);
    __m256 t2_x_v = _mm256_mul_ps(_mm256_sub_ps(aabb_max_x_v, ray_ox_v_g), inv_dir_x_v_g);
    t_min_v = _mm256_max_ps(t_min_v, _mm256_min_ps(t1_x_v, t2_x_v));
    t_max_v = _mm256_min_ps(t_max_v, _mm256_max_ps(t1_x_v, t2_x_v));

    // Y-axis slab
    __m256 t1_y_v = _mm256_mul_ps(_mm256_sub_ps(aabb_min_y_v, ray_oy_v_g), inv_dir_y_v_g);
    __m256 t2_y_v = _mm256_mul_ps(_mm256_sub_ps(aabb_max_y_v, ray_oy_v_g), inv_dir_y_v_g);
    t_min_v = _mm256_max_ps(t_min_v, _mm256_min_ps(t1_y_v, t2_y_v));
    t_max_v = _mm256_min_ps(t_max_v, _mm256_max_ps(t1_y_v, t2_y_v));

    // Z-axis slab
    __m256 t1_z_v = _mm256_mul_ps(_mm256_sub_ps(aabb_min_z_v, ray_oz_v_g), inv_dir_z_v_g);
    __m256 t2_z_v = _mm256_mul_ps(_mm256_sub_ps(aabb_max_z_v, ray_oz_v_g), inv_dir_z_v_g);
    t_min_v = _mm256_max_ps(t_min_v, _mm256_min_ps(t1_z_v, t2_z_v));
    t_max_v = _mm256_min_ps(t_max_v, _mm256_max_ps(t1_z_v, t2_z_v));

    // Final intersection check for each of the 8 lanes:
    __m256 intersect_mask1 = _mm256_cmp_ps(t_min_v, t_max_v, _CMP_LE_OQ);
    __m256 intersect_mask2 = _mm256_cmp_ps(t_max_v, epsilon_v_g, _CMP_GT_OQ);
    __m256 final_intersect_mask = _mm256_and_ps(intersect_mask1, intersect_mask2);
    unsigned mask = _mm256_movemask_ps(final_intersect_mask);

    // Store results in the global hit_bitset
    unsigned bitset_idx = i >> 6;
    unsigned bit_offset = (i & 63);
    // Update the hit_bitset for this 8-AABB batch.
    // This implicitly assumes that each thread operates on a range of AABBs
    // that ensures they don't step on each other's toes within the same unsigned long long.
    // For correct division to avoid false sharing, ranges should be multiples of 64 AABBs.
    // However, a simpler correct approach to avoid data races when writing to the *same* unsigned long long
    // (if not aligning tasks to 64-AABB boundaries) would be to use an atomic OR,
    // but that would add overhead.
    // Assuming tasks are aligned to at least 64-AABB boundaries to ensure distinct ull writes.
    sce_pck_bvh.hit_bitset[bitset_idx] |= ((unsigned long long)mask << bit_offset);
  }
  return 0;
 }
 static void
 sce_pck_bvh_ray_hit(struct sce_pck_bvh *bvh, struct vec3 ray_origin, struct vec3 ray_direction, int num_threads) {
  // Clear the hit bitset before starting a new check
  memset(bvh->hit_bitset, 0, sizeof(bvh->hit_bitset));

  // Calculate work distribution
  int aabbs_per_thread = NUM_AABBS / num_threads;
  int remainder = NUM_AABBS % num_threads;

  pthread_t thrd[num_threads];
  struct sce_pck_bvh_thrd_tsk tsk[num_threads];

  int current_start_idx = 0;
  for (int i = 0; i < num_threads; ++i) {
    tsk[i].start_idx = current_start_idx;
    int num_for_this_thread = aabbs_per_thread;
    if (i < remainder) {
      num_for_this_thread++; // Distribute remainder evenly
    }
    tsk[i].end_idx = tsk[i].start_idx + num_for_this_thread;
    // Ensure alignment to 8 (AVX2 batch size) for simplicity,
    // and ideally 64 (cache line for ull) to avoid false sharing on bitset.
    // For production code, adjust end_idx to be a multiple of 8 or 64.
    // For this example, simple division is used, but a "remainder" thread handles leftover.
    // We ensure start_idx is always multiple of 8.
    tsk[i].start_idx = (tsk[i].start_idx / 8) * 8;
    tsk[i].end_idx = (tsk[i].end_idx / 8) * 8; // round down to nearest 8
    if (i == num_threads - 1) { // last thread takes all remaining
      tsk[i].end_idx = NUM_AABBS;
    }
    tsk[i].current_ray.origin = ray_origin;
    tsk[i].current_ray.direction = ray_direction;

    pthread_create(&thrd[i], 0, sce_pck_bvh_ray_hit_thrd, &tsk[i]);
    current_start_idx = tsk[i].end_idx;
  }
  // Wait for all threads to complete
  for (int i = 0; i < num_threads; ++i) {
    pthread_join(thrd[i], 0);
  }
 }

 //-----------------------------------------------------------------------------
 //
 //                                Test
 //
 //-----------------------------------------------------------------------------

 #include <time.h> // For clock_gettime
 #define NSEC_PER_SEC 1000000000.0

 static void
 print_hit_bitset(unsigned int num_to_print) {
  printf("Hit Bitset Results (first %u AABBs):\n", num_to_print);
  for (unsigned int i = 0; i < num_to_print; ++i) {
    unsigned int bitset_idx = i / 64;
    unsigned int bit_pos = i % 64;
    if ((sce_pck_bvh.hit_bitset[bitset_idx] >> bit_pos) & 1ULL) {
      printf("  AABB %u: HIT\n", i);
    } else {
      // printf("  AABB %u: MISS\n", i); // Uncomment to see misses
    }
  }
 }
 extern int
 main(void) {
  sce_pck_bvh_init(&sce_pck_bvh);

  // AABB 0: Intersects with ray (0,0,0) -> (1,0,0)
  sce_pck_bvh_add(&sce_pck_bvh, 2.0f, -1.0f, -1.0f, 4.0f, 1.0f, 1.0f);
  sce_pck_bvh_add(&sce_pck_bvh, 2.0f, 5.0f, -1.0f, 4.0f, 6.0f, 1.0f); // Miss
  sce_pck_bvh_add(&sce_pck_bvh, 1.0f, -0.5f, -0.5f, 2.0f, 0.5f, 0.5f); // Hit
  sce_pck_bvh_add(&sce_pck_bvh, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f); // Ray originates inside
  sce_pck_bvh_add(&sce_pck_bvh, 0.0f, 1.0f, 1.0f, 0.01f, 2.0f, 2.0f); // Parallel miss
  sce_pck_bvh_add(&sce_pck_bvh, 10.0f, -1.0f, -1.0f, 12.0f, 1.0f, 1.0f); // Hit (end of first ull)
  sce_pck_bvh_add(&sce_pck_bvh, 15.0f, -1.0f, -1.0f, 17.0f, 1.0f, 1.0f); // Hit (start of second ull)

  struct ray test_ray = {{0.0f, 0.0f, 0.0f}, {1.0f, 0.0f, 0.0f}}; // Ray along positive X-axis
  printf("Checking ray ((%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)) against all AABBs...\n",
       test_ray.origin.x, test_ray.origin.y, test_ray.origin.z,
       test_ray.direction.x, test_ray.direction.y, test_ray.direction.z);

  struct timespec start, end;
  double elapsed_time;

  // --- Single-core measurement ---
  printf("\n--- Single-core AVX2 performance ---\n");
  clock_gettime(CLOCK_MONOTONIC, &start);
  // Directly call the single-threaded function from the previous version for comparison
  // We'll simulate it by calling the multi-threaded function with 1 thread
  sce_pck_bvh_ray_hit(&sce_pck_bvh, test_ray.origin, test_ray.direction, 1);
  clock_gettime(CLOCK_MONOTONIC, &end);
  elapsed_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / NSEC_PER_SEC;
  printf("Single-core execution time: %.6f microseconds\n", elapsed_time * 1e6);
  print_hit_bitset(NUM_AABBS < 100 ? NUM_AABBS : 100);

  // --- Multithreaded measurement ---
  int num_physical_cores = 8; // Example: Adjust based on your CPU's actual physical cores
  printf("\n--- Multithreaded AVX2 performance (%d threads) ---\n", num_physical_cores);
  clock_gettime(CLOCK_MONOTONIC, &start);
  sce_pck_bvh_ray_hit(&sce_pck_bvh, test_ray.origin, test_ray.direction, num_physical_cores);
  clock_gettime(CLOCK_MONOTONIC, &end);
  elapsed_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / NSEC_PER_SEC;
  printf("Multithreaded execution time: %.6f microseconds\n", elapsed_time * 1e6);
  print_hit_bitset(NUM_AABBS < 100 ? NUM_AABBS : 100);

  printf("\nFinished. Check the global hit_bitset for all %u AABB results.\n", NUM_AABBS);

  return 0;
 }
	#include <immintrin.h> // For AVX2 intrinsics
	#include <stdio.h>
	#include <float.h> // For FLT_MAX and FLT_MIN
	#include <string.h> // For memset
	#include <stdlib.h> // For exit
	#include <math.h> // For fmaxf, fminf
	#include <pthread.h> // For pthreads (example, assuming your pool uses similar mechanisms)
	#include <assert.h>

	#define EPSILON 1e-6f // Define a small epsilon to handle floating point inaccuracies
	#define NUM_AABBS (128 * 1024) // Number of AABBs in our global array

	struct vec3 {
	float x, y, z;
	};
	struct ray {
	struct vec3 origin;
	struct vec3 direction;
	};
	struct sce_pck_bvh {
	unsigned free_idx_cnt;
	unsigned free_idx[NUM_AABBS];
	float aabb_min_x[NUM_AABBS];
	float aabb_min_y[NUM_AABBS];
	float aabb_min_z[NUM_AABBS];
	float aabb_max_x[NUM_AABBS];
	float aabb_max_y[NUM_AABBS];
	float aabb_max_z[NUM_AABBS];
	unsigned long long hit_bitset[(NUM_AABBS + 63) / 64];
	} sce_pck_bvh;

	static void
	sce_pck_bvh_init(struct sce_pck_bvh *bvh) {
	assert(bvh);
	bvh->free_idx_cnt = NUM_AABBS;
	for (int i = 0; i < NUM_AABBS; ++i) {
	bvh->free_idx[i] = NUM_AABBS - i - 1;
	}
	}
	static int
	sce_pck_bvh_add(struct sce_pck_bvh *bvh,
	float min_x, float min_y, float min_z,
	float max_x, float max_y, float max_z) {

	assert(bvh);
	int idx = bvh->free_idx[--bvh->free_idx_cnt];
	assert(idx < NUM_AABBS);

	bvh->aabb_min_x[idx] = min_x;
	bvh->aabb_min_y[idx] = min_y;
	bvh->aabb_min_z[idx] = min_z;

	bvh->aabb_max_x[idx] = max_x;
	bvh->aabb_max_y[idx] = max_y;
	bvh->aabb_max_z[idx] = max_z;
	return idx;
	}
	static void
	sce_pck_bvh_rm(struct sce_pck_bvh *bvh, int idx) {
	assert(bvh);
	assert(idx < NUM_AABBS);
	bvh->free_idx[bvh->free_idx_cnt++] = idx;

	bvh->aabb_min_x[idx] = 0.0f;
	bvh->aabb_min_y[idx] = 0.0f;
	bvh->aabb_min_z[idx] = 0.0f;

	bvh->aabb_max_x[idx] = 0.0f;
	bvh->aabb_max_y[idx] = 0.0f;
	bvh->aabb_max_z[idx] = 0.0f;
	}
	struct sce_pck_bvh_thrd_tsk {
	int start_idx;
	int end_idx;
	struct ray current_ray;
	};
	static void*
	sce_pck_bvh_ray_hit_thrd(void *arg) {
	struct sce_pck_bvh_thrd_tsk tsk = (struct sce_pck_bvh_thrd_tsk)arg;
	int start_idx = tsk->start_idx;
	int end_idx = tsk->end_idx; // Exclusive end index
	struct ray current_ray = tsk->current_ray;

	__m256 ray_ox_v_g = _mm256_set1_ps(current_ray.origin.x);
	__m256 ray_oy_v_g = _mm256_set1_ps(current_ray.origin.y);
	__m256 ray_oz_v_g = _mm256_set1_ps(current_ray.origin.z);

	float inv_dir_x_val = 1.0f / (current_ray.direction.x + EPSILON * (current_ray.direction.x == 0.0f ? 1.0f : 0.0f));
	float inv_dir_y_val = 1.0f / (current_ray.direction.y + EPSILON * (current_ray.direction.y == 0.0f ? 1.0f : 0.0f));
	float inv_dir_z_val = 1.0f / (current_ray.direction.z + EPSILON * (current_ray.direction.z == 0.0f ? 1.0f : 0.0f));

	__m256 inv_dir_x_v_g = _mm256_set1_ps(inv_dir_x_val);
	__m256 inv_dir_y_v_g = _mm256_set1_ps(inv_dir_y_val);
	__m256 inv_dir_z_v_g = _mm256_set1_ps(inv_dir_z_val);

	__m256 t_min_initial_v_g = _mm256_set1_ps(-FLT_MAX);
	__m256 t_max_initial_v_g = _mm256_set1_ps(FLT_MAX);
	__m256 epsilon_v_g = _mm256_set1_ps(EPSILON);

	// Loop through AABBs 8 at a time
	for (int i = start_idx; i < end_idx; i += 8) {
	// Load 8 min/max components for the current axis
	__m256 aabb_min_x_v = _mm256_load_ps(&sce_pck_bvh.aabb_min_x[i]);
	__m256 aabb_min_y_v = _mm256_load_ps(&sce_pck_bvh.aabb_min_y[i]);
	__m256 aabb_min_z_v = _mm256_load_ps(&sce_pck_bvh.aabb_min_z[i]);

	__m256 aabb_max_x_v = _mm256_load_ps(&sce_pck_bvh.aabb_max_x[i]);
	__m256 aabb_max_y_v = _mm256_load_ps(&sce_pck_bvh.aabb_max_y[i]);
	__m256 aabb_max_z_v = _mm256_load_ps(&sce_pck_bvh.aabb_max_z[i]);

	// Initialize t_min and t_max for the current batch of 8 AABBs
	__m256 t_min_v = t_min_initial_v_g;
	__m256 t_max_v = t_max_initial_v_g;

	// X-axis slab
	__m256 t1_x_v = _mm256_mul_ps(_mm256_sub_ps(aabb_min_x_v, ray_ox_v_g), inv_dir_x_v_g);
	__m256 t2_x_v = _mm256_mul_ps(_mm256_sub_ps(aabb_max_x_v, ray_ox_v_g), inv_dir_x_v_g);
	t_min_v = _mm256_max_ps(t_min_v, _mm256_min_ps(t1_x_v, t2_x_v));
	t_max_v = _mm256_min_ps(t_max_v, _mm256_max_ps(t1_x_v, t2_x_v));

	// Y-axis slab
	__m256 t1_y_v = _mm256_mul_ps(_mm256_sub_ps(aabb_min_y_v, ray_oy_v_g), inv_dir_y_v_g);
	__m256 t2_y_v = _mm256_mul_ps(_mm256_sub_ps(aabb_max_y_v, ray_oy_v_g), inv_dir_y_v_g);
	t_min_v = _mm256_max_ps(t_min_v, _mm256_min_ps(t1_y_v, t2_y_v));
	t_max_v = _mm256_min_ps(t_max_v, _mm256_max_ps(t1_y_v, t2_y_v));

	// Z-axis slab
	__m256 t1_z_v = _mm256_mul_ps(_mm256_sub_ps(aabb_min_z_v, ray_oz_v_g), inv_dir_z_v_g);
	__m256 t2_z_v = _mm256_mul_ps(_mm256_sub_ps(aabb_max_z_v, ray_oz_v_g), inv_dir_z_v_g);
	t_min_v = _mm256_max_ps(t_min_v, _mm256_min_ps(t1_z_v, t2_z_v));
	t_max_v = _mm256_min_ps(t_max_v, _mm256_max_ps(t1_z_v, t2_z_v));

	// Final intersection check for each of the 8 lanes:
	__m256 intersect_mask1 = _mm256_cmp_ps(t_min_v, t_max_v, _CMP_LE_OQ);
	__m256 intersect_mask2 = _mm256_cmp_ps(t_max_v, epsilon_v_g, _CMP_GT_OQ);
	__m256 final_intersect_mask = _mm256_and_ps(intersect_mask1, intersect_mask2);
	unsigned mask = _mm256_movemask_ps(final_intersect_mask);

	// Store results in the global hit_bitset
	unsigned bitset_idx = i >> 6;
	unsigned bit_offset = (i & 63);
	// Update the hit_bitset for this 8-AABB batch.
	// This implicitly assumes that each thread operates on a range of AABBs
	// that ensures they don't step on each other's toes within the same unsigned long long.
	// For correct division to avoid false sharing, ranges should be multiples of 64 AABBs.
	// However, a simpler correct approach to avoid data races when writing to the same unsigned long long
	// (if not aligning tasks to 64-AABB boundaries) would be to use an atomic OR,
	// but that would add overhead.
	// Assuming tasks are aligned to at least 64-AABB boundaries to ensure distinct ull writes.
	sce_pck_bvh.hit_bitset[bitset_idx] \|= ((unsigned long long)mask << bit_offset);
	}
	return 0;
	}
	static void
	sce_pck_bvh_ray_hit(struct sce_pck_bvh *bvh, struct vec3 ray_origin, struct vec3 ray_direction, int num_threads) {
	// Clear the hit bitset before starting a new check
	memset(bvh->hit_bitset, 0, sizeof(bvh->hit_bitset));

	// Calculate work distribution
	int aabbs_per_thread = NUM_AABBS / num_threads;
	int remainder = NUM_AABBS % num_threads;

	pthread_t thrd[num_threads];
	struct sce_pck_bvh_thrd_tsk tsk[num_threads];

	int current_start_idx = 0;
	for (int i = 0; i < num_threads; ++i) {
	tsk[i].start_idx = current_start_idx;
	int num_for_this_thread = aabbs_per_thread;
	if (i < remainder) {
	num_for_this_thread++; // Distribute remainder evenly
	}
	tsk[i].end_idx = tsk[i].start_idx + num_for_this_thread;
	// Ensure alignment to 8 (AVX2 batch size) for simplicity,
	// and ideally 64 (cache line for ull) to avoid false sharing on bitset.
	// For production code, adjust end_idx to be a multiple of 8 or 64.
	// For this example, simple division is used, but a "remainder" thread handles leftover.
	// We ensure start_idx is always multiple of 8.
	tsk[i].start_idx = (tsk[i].start_idx / 8) * 8;
	tsk[i].end_idx = (tsk[i].end_idx / 8) * 8; // round down to nearest 8
	if (i == num_threads - 1) { // last thread takes all remaining
	tsk[i].end_idx = NUM_AABBS;
	}
	tsk[i].current_ray.origin = ray_origin;
	tsk[i].current_ray.direction = ray_direction;

	pthread_create(&thrd[i], 0, sce_pck_bvh_ray_hit_thrd, &tsk[i]);
	current_start_idx = tsk[i].end_idx;
	}
	// Wait for all threads to complete
	for (int i = 0; i < num_threads; ++i) {
	pthread_join(thrd[i], 0);
	}
	}

	//-----------------------------------------------------------------------------
	//
	// Test
	//
	//-----------------------------------------------------------------------------

	#include <time.h> // For clock_gettime
	#define NSEC_PER_SEC 1000000000.0

	static void
	print_hit_bitset(unsigned int num_to_print) {
	printf("Hit Bitset Results (first %u AABBs):\n", num_to_print);
	for (unsigned int i = 0; i < num_to_print; ++i) {
	unsigned int bitset_idx = i / 64;
	unsigned int bit_pos = i % 64;
	if ((sce_pck_bvh.hit_bitset[bitset_idx] >> bit_pos) & 1ULL) {
	printf(" AABB %u: HIT\n", i);
	} else {
	// printf(" AABB %u: MISS\n", i); // Uncomment to see misses
	}
	}
	}
	extern int
	main(void) {
	sce_pck_bvh_init(&sce_pck_bvh);

	// AABB 0: Intersects with ray (0,0,0) -> (1,0,0)
	sce_pck_bvh_add(&sce_pck_bvh, 2.0f, -1.0f, -1.0f, 4.0f, 1.0f, 1.0f);
	sce_pck_bvh_add(&sce_pck_bvh, 2.0f, 5.0f, -1.0f, 4.0f, 6.0f, 1.0f); // Miss
	sce_pck_bvh_add(&sce_pck_bvh, 1.0f, -0.5f, -0.5f, 2.0f, 0.5f, 0.5f); // Hit
	sce_pck_bvh_add(&sce_pck_bvh, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f); // Ray originates inside
	sce_pck_bvh_add(&sce_pck_bvh, 0.0f, 1.0f, 1.0f, 0.01f, 2.0f, 2.0f); // Parallel miss
	sce_pck_bvh_add(&sce_pck_bvh, 10.0f, -1.0f, -1.0f, 12.0f, 1.0f, 1.0f); // Hit (end of first ull)
	sce_pck_bvh_add(&sce_pck_bvh, 15.0f, -1.0f, -1.0f, 17.0f, 1.0f, 1.0f); // Hit (start of second ull)

	struct ray test_ray = {{0.0f, 0.0f, 0.0f}, {1.0f, 0.0f, 0.0f}}; // Ray along positive X-axis
	printf("Checking ray ((%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)) against all AABBs...\n",
	test_ray.origin.x, test_ray.origin.y, test_ray.origin.z,
	test_ray.direction.x, test_ray.direction.y, test_ray.direction.z);

	struct timespec start, end;
	double elapsed_time;

	// --- Single-core measurement ---
	printf("\n--- Single-core AVX2 performance ---\n");
	clock_gettime(CLOCK_MONOTONIC, &start);
	// Directly call the single-threaded function from the previous version for comparison
	// We'll simulate it by calling the multi-threaded function with 1 thread
	sce_pck_bvh_ray_hit(&sce_pck_bvh, test_ray.origin, test_ray.direction, 1);
	clock_gettime(CLOCK_MONOTONIC, &end);
	elapsed_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / NSEC_PER_SEC;
	printf("Single-core execution time: %.6f microseconds\n", elapsed_time * 1e6);
	print_hit_bitset(NUM_AABBS < 100 ? NUM_AABBS : 100);

	// --- Multithreaded measurement ---
	int num_physical_cores = 8; // Example: Adjust based on your CPU's actual physical cores
	printf("\n--- Multithreaded AVX2 performance (%d threads) ---\n", num_physical_cores);
	clock_gettime(CLOCK_MONOTONIC, &start);
	sce_pck_bvh_ray_hit(&sce_pck_bvh, test_ray.origin, test_ray.direction, num_physical_cores);
	clock_gettime(CLOCK_MONOTONIC, &end);
	elapsed_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / NSEC_PER_SEC;
	printf("Multithreaded execution time: %.6f microseconds\n", elapsed_time * 1e6);
	print_hit_bitset(NUM_AABBS < 100 ? NUM_AABBS : 100);

	printf("\nFinished. Check the global hit_bitset for all %u AABB results.\n", NUM_AABBS);

	return 0;
	}