@corporatepiyush
Last active September 19, 2025 15:29
Custom memory allocator with Buffer Pool
#define _GNU_SOURCE
#include "bufferpool.h"
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <stdatomic.h>
#include <dlfcn.h>
#include <errno.h>
#include <sys/mman.h>
#include <inttypes.h>
#include <stdalign.h>
#include <unistd.h>
#include "uthash.h"
#include <stdio.h>
#ifdef __APPLE__
#include <sys/types.h>
#include <sys/sysctl.h>
#include <malloc/malloc.h>
#include <mach/mach_init.h>
#endif
#if defined(__GNUC__) || defined(__clang__)
#define HAS_128BIT_ATOMICS 1
#else
#error "This lock-free implementation requires a compiler with 128-bit atomic support (GCC or Clang)"
#endif
// ===============================================================================
// 1. Core Data Structures & Globals (Pointer Tagging)
// ===============================================================================
#define LIKELY(x) __builtin_expect(!!(x), 1)
#define UNLIKELY(x) __builtin_expect(!!(x), 0)
// --- Pointer Tagging Constants ---
#define TAG_SHIFT 48
#define POINTER_MASK ((1ULL << TAG_SHIFT) - 1)
#define TAG_MASK (~POINTER_MASK)
// Within the 16-bit tag, we pack a magic number and the class index
#define TAG_MAGIC 0x6A5U // An 11-bit magic number
#define TAG_MAGIC_SHIFT 5
#define TAG_CLASS_MASK 0x1FU // Mask for the lower 5 bits (for class index)
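// Layout of an allocated object's 64-bit tagged_slab_ptr (illustration only):
//
//   bits 63..53     bits 52..48     bits 47..0
//   +-------------+---------------+-------------------------------+
//   | TAG_MAGIC   | class index   | slab_segment_t* (low 48 bits) |
//   | (11 bits)   | (5 bits)      |                               |
//   +-------------+---------------+-------------------------------+
//
// This assumes the usual 48-bit user-space virtual addresses on x86-64/AArch64,
// i.e. the top 16 bits of a heap pointer are zero and can carry the tag.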
typedef struct slab_segment slab_segment_t;
// The header is a single 8-byte word: it holds the freelist link while the object is free and the tagged slab pointer while it is allocated.
typedef union obj_header {
// Used when the object is FREE
struct {
union obj_header *next;
} free_list_node;
// Used when the object is ALLOCATED
struct {
// This single value holds the slab pointer AND the metadata (magic, class_idx)
_Atomic uintptr_t tagged_slab_ptr;
} data;
} obj_header_t;
#define HEADER_SIZE (sizeof(obj_header_t)) // 8 bytes on 64-bit targets
#define OVERHEAD (HEADER_SIZE)
typedef struct { obj_header_t* ptr; uint64_t tag; } tagged_ptr_t;
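// The freelist head is a 16-byte {pointer, generation tag} pair manipulated
// with a single 128-bit CAS. Bumping the tag on every pop and push is the
// classic guard against the ABA problem: a stale head whose pointer value
// happens to reappear still fails the CAS because its generation differs.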
struct slab_segment {
_Atomic __int128 freelist_head;
void* memory_base;
uint32_t obj_size;
uint32_t obj_count;
uint16_t size_class_idx;
_Atomic(slab_segment_t*) next_slab;
};
typedef struct slab_page_map_entry {
uintptr_t page_addr;
slab_segment_t *slab;
UT_hash_handle hh;
} slab_page_map_entry_t;
static const size_t base_sizes[] = { 32, 48, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768, 40960, 49152, 57344, 65536, 98304, 131072, 163840, 196608, 229376, 262144 };
#define NUM_SIZE_CLASSES (sizeof(base_sizes) / sizeof(base_sizes[0]))
#define LOOKUP_TABLE_SIZE 262200
static int8_t size_to_class[LOOKUP_TABLE_SIZE];
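// size_to_class[] maps a request size straight to its class index. For example
// (illustrative): a 100-byte request lands in class 4 (base size 128), whose
// slot size is 128 plus the 8-byte header, rounded up to alignof(max_align_t),
// i.e. 144 bytes with a 16-byte max_align_t. Requests of size 0, or with no
// matching class (anything above 262144, the largest base size), bypass the
// pool and go to the system allocator.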
static size_t size_classes[NUM_SIZE_CLASSES];
static _Atomic(slab_segment_t*) slab_lists[NUM_SIZE_CLASSES];
static _Atomic int global_state = 0;
static size_t page_size = 4096;
static size_t slab_size_small, slab_size_medium, slab_size_large;
static size_t num_map_shards = 256;
static slab_page_map_entry_t **slab_map_shards = NULL;
static pthread_mutex_t *slab_map_locks = NULL;
static slab_segment_t* add_new_slab(uint16_t class_idx);
static _Atomic uint64_t stats_malloc_calls = 0;
static _Atomic uint64_t stats_free_calls = 0;
static _Atomic uint64_t stats_realloc_calls = 0;
static _Atomic uint64_t stats_realloc_same_ptr = 0;
static _Atomic uint64_t stats_realloc_fallback = 0;
static _Atomic uint64_t stats_slab_creates = 0;
static _Atomic uint64_t stats_freelist_hits = 0;
static _Atomic uint64_t stats_freelist_misses = 0;
#ifdef __APPLE__
static malloc_zone_t *default_zone = NULL;
static size_t (*real_size)(malloc_zone_t*, const void*);
static void* (*real_malloc)(malloc_zone_t*, size_t);
static void* (*real_calloc)(malloc_zone_t*, size_t, size_t);
static void* (*real_realloc)(malloc_zone_t*, void*, size_t);
static void (*real_free)(malloc_zone_t*, void*);
#else
static void *(*real_malloc_linux)(size_t) = NULL;
static void (*real_free_linux)(void*) = NULL;
static void *(*real_realloc_linux)(void*, size_t) = NULL;
static void *(*real_calloc_linux)(size_t, size_t) = NULL;
#endif
// ===============================================================================
// 2. Backend Allocator Logic (Updated for Pointer Tagging)
// ===============================================================================
void *bufferpool_malloc(size_t size) {
atomic_fetch_add_explicit(&stats_malloc_calls, 1, memory_order_relaxed);
if (UNLIKELY(global_state != 1 || size == 0 || size >= LOOKUP_TABLE_SIZE)) {
#ifdef __APPLE__
return real_malloc(default_zone, size);
#else
return real_malloc_linux(size);
#endif
}
int class_idx = size_to_class[size];
if (UNLIKELY(class_idx < 0)) {
#ifdef __APPLE__
return real_malloc(default_zone, size);
#else
return real_malloc_linux(size);
#endif
}
slab_segment_t* slab = atomic_load_explicit(&slab_lists[class_idx], memory_order_acquire);
while (1) {
if (!slab) {
atomic_fetch_add_explicit(&stats_slab_creates, 1, memory_order_relaxed);
slab = add_new_slab(class_idx);
if (!slab) {
#ifdef __APPLE__
return real_malloc(default_zone, size);
#else
return real_malloc_linux(size);
#endif
}
}
tagged_ptr_t old_head, new_head;
__int128 old_head_packed = atomic_load_explicit(&slab->freelist_head, memory_order_acquire);
while (1) {
memcpy(&old_head, &old_head_packed, sizeof(old_head_packed));
if (old_head.ptr == NULL) {
atomic_fetch_add_explicit(&stats_freelist_misses, 1, memory_order_relaxed);
break;
}
new_head.ptr = old_head.ptr->free_list_node.next;
new_head.tag = old_head.tag + 1;
__int128 new_head_packed;
memcpy(&new_head_packed, &new_head, sizeof(new_head_packed));
if (atomic_compare_exchange_weak_explicit(&slab->freelist_head, &old_head_packed, new_head_packed, memory_order_acq_rel, memory_order_acquire)) {
atomic_fetch_add_explicit(&stats_freelist_hits, 1, memory_order_relaxed);
obj_header_t* header = old_head.ptr;
// Pack the metadata into the tag
uint64_t tag = ((uint64_t)TAG_MAGIC << TAG_MAGIC_SHIFT) | slab->size_class_idx;
uintptr_t tagged_ptr = ((uintptr_t)slab & POINTER_MASK) | (tag << TAG_SHIFT);
atomic_store_explicit(&header->data.tagged_slab_ptr, tagged_ptr, memory_order_relaxed);
return (void*)((char*)header + HEADER_SIZE);
}
}
slab = atomic_load_explicit(&slab->next_slab, memory_order_acquire);
}
}
void bufferpool_free(void *ptr) {
atomic_fetch_add_explicit(&stats_free_calls, 1, memory_order_relaxed);
if (UNLIKELY(!ptr)) return;
obj_header_t* header = (obj_header_t*)((char*)ptr - HEADER_SIZE);
uintptr_t tagged_ptr = atomic_load_explicit(&header->data.tagged_slab_ptr, memory_order_relaxed);
// Unpack the metadata
uint16_t tag = (tagged_ptr & TAG_MASK) >> TAG_SHIFT;
uint16_t magic = tag >> TAG_MAGIC_SHIFT;
if (UNLIKELY(magic != TAG_MAGIC)) {
#ifdef __APPLE__
if (real_free) real_free(default_zone, ptr);
#else
if (real_free_linux) real_free_linux(ptr);
#endif
return;
}
// Atomically clear the whole tag (magic + class index); if a concurrent free of
// the same pointer already cleared it, the CAS below fails and we bail out.
uintptr_t new_tagged_ptr = tagged_ptr & POINTER_MASK;
if (!atomic_compare_exchange_strong_explicit(&header->data.tagged_slab_ptr, &tagged_ptr, new_tagged_ptr, memory_order_acq_rel, memory_order_relaxed)) {
return; // Race with another free
}
slab_segment_t* slab = (slab_segment_t*)(tagged_ptr & POINTER_MASK);
tagged_ptr_t old_head, new_head;
__int128 old_head_packed = atomic_load_explicit(&slab->freelist_head, memory_order_acquire);
new_head.ptr = header;
do {
memcpy(&old_head, &old_head_packed, sizeof(old_head_packed));
header->free_list_node.next = old_head.ptr;
new_head.tag = old_head.tag + 1;
__int128 new_head_packed;
memcpy(&new_head_packed, &new_head, sizeof(new_head_packed));
if (atomic_compare_exchange_weak_explicit(&slab->freelist_head, &old_head_packed, new_head_packed, memory_order_acq_rel, memory_order_acquire)) return;
} while(1);
}
size_t bufferpool_size(const void *ptr) {
if (!ptr) return 0;
const obj_header_t *header = (const obj_header_t*)((const char*)ptr - HEADER_SIZE);
uintptr_t tagged_ptr = atomic_load_explicit(&header->data.tagged_slab_ptr, memory_order_relaxed);
uint16_t tag = (tagged_ptr & TAG_MASK) >> TAG_SHIFT;
uint16_t magic = tag >> TAG_MAGIC_SHIFT;
if (magic == TAG_MAGIC) {
uint16_t class_idx = tag & TAG_CLASS_MASK;
return base_sizes[class_idx];
}
#ifdef __APPLE__
if (real_size) return real_size(default_zone, ptr);
#endif
return 0;
}
void *bufferpool_realloc(void *ptr, size_t size) {
atomic_fetch_add_explicit(&stats_realloc_calls, 1, memory_order_relaxed);
if (!ptr) return bufferpool_malloc(size);
if (size == 0) { bufferpool_free(ptr); return NULL; }
obj_header_t* header = (obj_header_t*)((char*)ptr - HEADER_SIZE);
uintptr_t tagged_ptr = atomic_load_explicit(&header->data.tagged_slab_ptr, memory_order_relaxed);
uint16_t tag = (tagged_ptr & TAG_MASK) >> TAG_SHIFT;
uint16_t magic = tag >> TAG_MAGIC_SHIFT;
if (magic != TAG_MAGIC) {
#ifdef __APPLE__
return real_realloc(default_zone, ptr, size);
#else
return real_realloc_linux(ptr, size);
#endif
}
uint16_t class_idx = tag & TAG_CLASS_MASK;
size_t old_size = base_sizes[class_idx];
if (size < LOOKUP_TABLE_SIZE && class_idx == size_to_class[size]) {
atomic_fetch_add_explicit(&stats_realloc_same_ptr, 1, memory_order_relaxed);
return ptr;
}
atomic_fetch_add_explicit(&stats_realloc_fallback, 1, memory_order_relaxed);
void *new_ptr = bufferpool_malloc(size);
if (new_ptr) {
memcpy(new_ptr, ptr, old_size < size ? old_size : size);
bufferpool_free(ptr);
}
return new_ptr;
}
void *bufferpool_calloc(size_t num_items, size_t size) {
size_t total_size = num_items * size;
if (size != 0 && total_size / size != num_items) return NULL;
void *ptr = bufferpool_malloc(total_size);
if (ptr) {
memset(ptr, 0, total_size);
}
return ptr;
}
// ===============================================================================
// 3. Helper and Initialization Functions
// ===============================================================================
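// hash_to_shard_index() runs the page number through a SplitMix64-style
// finalizer and masks the result down to a shard index. This assumes
// num_map_shards is a power of two, which bufferpool_init_internal()
// guarantees via next_power_of_two().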
static inline uint32_t hash_to_shard_index(uintptr_t key) {
uint64_t k = (uint64_t)key;
k = (k ^ (k >> 30)) * 0xbf58476d1ce4e5b9;
k = (k ^ (k >> 27)) * 0x94d049bb133111eb;
k = k ^ (k >> 31);
return (uint32_t)(k & (num_map_shards - 1));
}
static size_t get_slab_size_for_class(uint32_t idx) {
if (idx >= NUM_SIZE_CLASSES) return 0;
size_t bs = base_sizes[idx];
if (bs <= 1024) return slab_size_small;
if (bs <= 32768) return slab_size_medium;
return slab_size_large;
}
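// add_new_slab(): mmap a fresh region, carve it into fixed-size slots threaded
// onto the slab's freelist, register every page of the region in the sharded
// page map, then publish the slab at the head of its class list with a CAS loop.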
static slab_segment_t* add_new_slab(uint16_t class_idx) {
size_t slab_size = get_slab_size_for_class(class_idx);
void* mem = mmap(NULL, slab_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (mem == MAP_FAILED) return NULL;
slab_segment_t* slab;
#ifdef __APPLE__
slab = real_malloc(default_zone, sizeof(slab_segment_t));
#else
slab = real_malloc_linux(sizeof(slab_segment_t));
#endif
if (!slab) { munmap(mem, slab_size); return NULL; }
*slab = (slab_segment_t){ .memory_base=mem, .size_class_idx=class_idx, .obj_size=size_classes[class_idx], .obj_count=slab_size / size_classes[class_idx] };
atomic_init(&slab->next_slab, NULL);
obj_header_t* head = NULL;
for (uint32_t i = 0; i < slab->obj_count; ++i) {
char* obj_mem = (char*)mem + (i * slab->obj_size);
obj_header_t* header = (obj_header_t*)obj_mem;
header->free_list_node.next = head;
head = header;
}
tagged_ptr_t initial_head = { .ptr = head, .tag = 0 };
__int128 initial_head_packed;
memcpy(&initial_head_packed, &initial_head, sizeof(initial_head_packed));
atomic_init(&slab->freelist_head, initial_head_packed);
for (size_t i = 0; i < slab_size; i += page_size) {
uintptr_t page_addr = (uintptr_t)mem + i;
uint32_t shard_idx = hash_to_shard_index(page_addr / page_size);
slab_page_map_entry_t *entry;
#ifdef __APPLE__
entry = real_malloc(default_zone, sizeof(slab_page_map_entry_t));
#else
entry = real_malloc_linux(sizeof(slab_page_map_entry_t));
#endif
if (!entry) abort();
entry->page_addr = page_addr;
entry->slab = slab;
pthread_mutex_lock(&slab_map_locks[shard_idx]);
HASH_ADD_PTR(slab_map_shards[shard_idx], page_addr, entry);
pthread_mutex_unlock(&slab_map_locks[shard_idx]);
}
slab_segment_t* old_head = atomic_load_explicit(&slab_lists[class_idx], memory_order_acquire);
do {
atomic_store_explicit(&slab->next_slab, old_head, memory_order_relaxed);
} while (!atomic_compare_exchange_weak_explicit(&slab_lists[class_idx], &old_head, slab, memory_order_release, memory_order_acquire));
return slab;
}
static size_t get_total_physical_memory() {
#ifdef __APPLE__
int mib[2] = {CTL_HW, HW_MEMSIZE};
uint64_t memsize;
size_t len = sizeof(memsize);
if (sysctl(mib, 2, &memsize, &len, NULL, 0) == 0) return (size_t)memsize;
#else
long pages = sysconf(_SC_PHYS_PAGES);
long page_sz = sysconf(_SC_PAGE_SIZE);
if (pages > 0 && page_sz > 0) return (size_t)pages * (size_t)page_sz;
#endif
return 0;
}
static size_t next_power_of_two(size_t n) {
if (n == 0) return 1;
n--; n |= n >> 1; n |= n >> 2; n |= n >> 4; n |= n >> 8; n |= n >> 16;
#if __x86_64__ || __aarch64__
n |= n >> 32;
#endif
n++;
return n;
}
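// bufferpool_init_internal(): derive the slab sizes from total physical RAM,
// size the page-map shard count from the CPU count, build the size_to_class
// lookup table (padding each slot for the 8-byte header and max_align_t
// alignment), then flip global_state to 1 so the fast paths take over.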
int bufferpool_init_internal(void) {
long ret_page_size = sysconf(_SC_PAGESIZE);
if (ret_page_size > 0) page_size = (size_t)ret_page_size;
size_t ram_gb = get_total_physical_memory() / (1024 * 1024 * 1024);
int scale_factor;
if (ram_gb < 16) scale_factor = 1; else if (ram_gb < 32) scale_factor = 2;
else if (ram_gb < 64) scale_factor = 3; else if (ram_gb < 96) scale_factor = 4;
else if (ram_gb < 128) scale_factor = 5; else if (ram_gb < 160) scale_factor = 6;
else if (ram_gb < 192) scale_factor = 7; else scale_factor = 8;
slab_size_small = (128 * 1024) * scale_factor;
slab_size_medium = (512 * 1024) * scale_factor;
slab_size_large = (2 * 1024 * 1024) * scale_factor;
long nprocs = sysconf(_SC_NPROCESSORS_ONLN);
if (nprocs <= 0) nprocs = 1;
size_t shards = next_power_of_two((size_t)nprocs * 2);
num_map_shards = shards < 128 ? 128 : shards;
#ifdef __APPLE__
slab_map_shards = real_calloc(default_zone, num_map_shards, sizeof(slab_page_map_entry_t*));
slab_map_locks = real_malloc(default_zone, num_map_shards * sizeof(pthread_mutex_t));
#else
slab_map_shards = real_calloc_linux(num_map_shards, sizeof(slab_page_map_entry_t*));
slab_map_locks = real_malloc_linux(num_map_shards * sizeof(pthread_mutex_t));
#endif
if (!slab_map_shards || !slab_map_locks) return -1;
for(size_t i=0; i<num_map_shards; ++i) pthread_mutex_init(&slab_map_locks[i], NULL);
memset(size_to_class, -1, sizeof(size_to_class));
for (size_t i = 0; i < NUM_SIZE_CLASSES; ++i) {
size_classes[i] = base_sizes[i] + OVERHEAD;
size_classes[i] = (size_classes[i] + (alignof(max_align_t) - 1)) & ~(alignof(max_align_t) - 1);
for (size_t sz = (i > 0 ? base_sizes[i-1] + 1 : 1); sz <= base_sizes[i] && sz < LOOKUP_TABLE_SIZE; ++sz) size_to_class[sz] = i;
atomic_init(&slab_lists[i], NULL);
}
atomic_store(&global_state, 1);
return 0;
}
#ifdef __APPLE__
// ===============================================================================
// 4. macOS Malloc Zone Hijacking
// ===============================================================================
static size_t bp_zone_size_hijack(malloc_zone_t *zone, const void *ptr) {
return bufferpool_size(ptr);
}
static void *bp_zone_malloc_hijack(malloc_zone_t *zone, size_t size) {
return bufferpool_malloc(size);
}
static void *bp_zone_calloc_hijack(malloc_zone_t *zone, size_t num_items, size_t size) {
return bufferpool_calloc(num_items, size);
}
static void *bp_zone_realloc_hijack(malloc_zone_t *zone, void *ptr, size_t size) {
return bufferpool_realloc(ptr, size);
}
static void bp_zone_free_hijack(malloc_zone_t *zone, void *ptr) {
bufferpool_free(ptr);
}
__attribute__((constructor))
void hijack_default_zone(void) {
default_zone = malloc_default_zone();
if (!default_zone) return;
real_size = default_zone->size;
real_malloc = default_zone->malloc;
real_calloc = default_zone->calloc;
real_realloc = default_zone->realloc;
real_free = default_zone->free;
if (bufferpool_init_internal() != 0) {
return;
}
default_zone->size = bp_zone_size_hijack;
default_zone->malloc = bp_zone_malloc_hijack;
default_zone->calloc = bp_zone_calloc_hijack;
default_zone->realloc = bp_zone_realloc_hijack;
default_zone->free = bp_zone_free_hijack;
}
#else
// ===============================================================================
// 4. Linux LD_PRELOAD Interception
// ===============================================================================
void *malloc(size_t size) {
return bufferpool_malloc(size);
}
void free(void *ptr) {
bufferpool_free(ptr);
}
void *calloc(size_t n, size_t s) {
return bufferpool_calloc(n, s);
}
void *realloc(void *ptr, size_t size) {
return bufferpool_realloc(ptr, size);
}
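// Note: glibc's dlsym() can itself allocate (historically via calloc). If that
// happens before ctor() below has resolved the real_* pointers, the fallback
// branches above would call through a NULL function pointer. A common
// mitigation, not implemented here, is to serve such early requests from a
// small static bootstrap buffer.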
__attribute__((constructor))
void ctor(void) {
real_malloc_linux = dlsym(RTLD_NEXT, "malloc");
real_free_linux = dlsym(RTLD_NEXT, "free");
real_realloc_linux = dlsym(RTLD_NEXT, "realloc");
real_calloc_linux = dlsym(RTLD_NEXT, "calloc");
if (!real_malloc_linux || !real_free_linux || !real_realloc_linux || !real_calloc_linux) {
abort();
}
bufferpool_init_internal();
}
#endif
__attribute__((destructor))
void print_bufferpool_stats(void) {
printf("\n=== BufferPool Statistics ===\n");
printf("Malloc calls: %llu\n", (unsigned long long)atomic_load(&stats_malloc_calls));
printf("Free calls: %llu\n", (unsigned long long)atomic_load(&stats_free_calls));
printf("Realloc calls: %llu\n", (unsigned long long)atomic_load(&stats_realloc_calls));
printf("Realloc same ptr: %llu\n", (unsigned long long)atomic_load(&stats_realloc_same_ptr));
printf("Realloc fallback: %llu\n", (unsigned long long)atomic_load(&stats_realloc_fallback));
printf("Slab creates: %llu\n", (unsigned long long)atomic_load(&stats_slab_creates));
printf("Freelist hits: %llu\n", (unsigned long long)atomic_load(&stats_freelist_hits));
printf("Freelist misses: %llu\n", (unsigned long long)atomic_load(&stats_freelist_misses));
printf("=============================\n");
}
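// -----------------------------------------------------------------------------
// Example usage (a sketch, not part of the original gist). On Linux this file
// is typically built as an LD_PRELOAD library, for example:
//
//   cc -shared -fPIC -O2 bufferpool.c -o libbufferpool.so -lpthread
//   LD_PRELOAD=./libbufferpool.so ./your_program
//
// Flags are assumptions: x86-64 builds usually want -mcx16 for the 128-bit CAS,
// some toolchains also need -latomic, and uthash.h must be on the include path.
// The guarded smoke test below (the BUFFERPOOL_TEST macro and main() are
// additions for illustration) exercises the bufferpool_* API directly when the
// file is compiled as an executable with -DBUFFERPOOL_TEST.
#ifdef BUFFERPOOL_TEST
int main(void) {
char *p = bufferpool_malloc(100);       // 100 bytes -> served from the 128-byte class
if (p) memset(p, 0xAB, 100);
p = bufferpool_realloc(p, 120);         // still the 128-byte class: same pointer returned
void *q = bufferpool_calloc(8, 64);     // 512 bytes, zero-filled
bufferpool_free(p);
bufferpool_free(q);
return 0;
}
#endif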
#ifndef BUFFERPOOL_H
#define BUFFERPOOL_H
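#include <stddef.h>
// Prototypes for the public allocator entry points defined in bufferpool.c above.
void *bufferpool_malloc(size_t size);
void bufferpool_free(void *ptr);
void *bufferpool_realloc(void *ptr, size_t size);
void *bufferpool_calloc(size_t num_items, size_t size);
size_t bufferpool_size(const void *ptr);
int bufferpool_init_internal(void);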
#endif // BUFFERPOOL_H