@corporatepiyush
Last active September 19, 2025 15:29
Custom memory allocator with Buffer Pool
#define _GNU_SOURCE
#include "bufferpool.h"
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <stdatomic.h>
#include <dlfcn.h>
#include <errno.h>
#include <sys/mman.h>
#include <inttypes.h>
#include <stdalign.h>
#include <unistd.h>
#include "uthash.h"
#include <stdio.h>
#ifdef __APPLE__
#include <sys/types.h>
#include <sys/sysctl.h>
#include <malloc/malloc.h>
#include <mach/mach_init.h>
#endif
#if defined(__GNUC__) || defined(__clang__)
#define HAS_128BIT_ATOMICS 1
#else
#error "This lock-free implementation requires a compiler with 128-bit atomic support (GCC or Clang)"
#endif
// ===============================================================================
// 1. Core Data Structures & Globals (Pointer Tagging)
// ===============================================================================
#define LIKELY(x) __builtin_expect(!!(x), 1)
#define UNLIKELY(x) __builtin_expect(!!(x), 0)
// --- Pointer Tagging Constants ---
#define TAG_SHIFT 48
#define POINTER_MASK ((1ULL << TAG_SHIFT) - 1)
#define TAG_MASK (~POINTER_MASK)
// Within the 16-bit tag, we pack a magic number and the class index
#define TAG_MAGIC 0x6A5U // An 11-bit magic number
#define TAG_MAGIC_SHIFT 5
#define TAG_CLASS_MASK 0x1FU // Mask for the lower 5 bits (for class index)
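// Layout of an allocated object's 64-bit tagged_slab_ptr (illustration only):
//
//   bits 63..53     bits 52..48     bits 47..0
//   +-------------+---------------+-------------------------------+
//   | TAG_MAGIC   | class index   | slab_segment_t* (low 48 bits) |
//   | (11 bits)   | (5 bits)      |                               |
//   +-------------+---------------+-------------------------------+
//
// This assumes the usual 48-bit user-space virtual addresses on x86-64/AArch64,
// i.e. the top 16 bits of a heap pointer are zero and can carry the tag.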
typedef struct slab_segment slab_segment_t;
// The header is a single 8-byte word: it holds the freelist link while the object is free and the tagged slab pointer while it is allocated.
typedef union obj_header {
// Used when the object is FREE
struct {
union obj_header *next;
} free_list_node;
// Used when the object is ALLOCATED
struct {
// This single value holds the slab pointer AND the metadata (magic, class_idx)
_Atomic uintptr_t tagged_slab_ptr;
} data;
} obj_header_t;
#define HEADER_SIZE (sizeof(obj_header_t)) // 8 bytes on 64-bit targets
#define OVERHEAD (HEADER_SIZE)
typedef struct { obj_header_t* ptr; uint64_t tag; } tagged_ptr_t;
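// The freelist head is a 16-byte {pointer, generation tag} pair manipulated
// with a single 128-bit CAS. Bumping the tag on every pop and push is the
// classic guard against the ABA problem: a stale head whose pointer value
// happens to reappear still fails the CAS because its generation differs.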
struct slab_segment {
_Atomic __int128 freelist_head;
void* memory_base;
uint32_t obj_size;
uint32_t obj_count;
uint16_t size_class_idx;
_Atomic(slab_segment_t*) next_slab;
};
typedef struct slab_page_map_entry {
uintptr_t page_addr;
slab_segment_t *slab;
UT_hash_handle hh;
} slab_page_map_entry_t;
static const size_t base_sizes[] = { 32, 48, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768, 40960, 49152, 57344, 65536, 98304, 131072, 163840, 196608, 229376, 262144 };
#define NUM_SIZE_CLASSES (sizeof(base_sizes) / sizeof(base_sizes[0]))
#define LOOKUP_TABLE_SIZE 262200
static int8_t size_to_class[LOOKUP_TABLE_SIZE];
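// size_to_class[] maps a request size straight to its class index. For example
// (illustrative): a 100-byte request lands in class 4 (base size 128), whose
// slot size is 128 plus the 8-byte header, rounded up to alignof(max_align_t),
// i.e. 144 bytes with a 16-byte max_align_t. Requests of size 0, or with no
// matching class (anything above 262144, the largest base size), bypass the
// pool and go to the system allocator.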
static size_t size_classes[NUM_SIZE_CLASSES];
static _Atomic(slab_segment_t*) slab_lists[NUM_SIZE_CLASSES];
static _Atomic int global_state = 0;
static size_t page_size = 4096;
static size_t slab_size_small, slab_size_medium, slab_size_large;
static size_t num_map_shards = 256;
static slab_page_map_entry_t **slab_map_shards = NULL;
static pthread_mutex_t *slab_map_locks = NULL;
static slab_segment_t* add_new_slab(uint16_t class_idx);
static _Atomic uint64_t stats_malloc_calls = 0;
static _Atomic uint64_t stats_free_calls = 0;
static _Atomic uint64_t stats_realloc_calls = 0;
static _Atomic uint64_t stats_realloc_same_ptr = 0;
static _Atomic uint64_t stats_realloc_fallback = 0;
static _Atomic uint64_t stats_slab_creates = 0;
static _Atomic uint64_t stats_freelist_hits = 0;
static _Atomic uint64_t stats_freelist_misses = 0;
#ifdef __APPLE__
static malloc_zone_t *default_zone = NULL;
static size_t (*real_size)(malloc_zone_t*, const void*);
static void* (*real_malloc)(malloc_zone_t*, size_t);
static void* (*real_calloc)(malloc_zone_t*, size_t, size_t);
static void* (*real_realloc)(malloc_zone_t*, void*, size_t);
static void (*real_free)(malloc_zone_t*, void*);
#else
static void *(*real_malloc_linux)(size_t) = NULL;
static void (*real_free_linux)(void*) = NULL;
static void *(*real_realloc_linux)(void*, size_t) = NULL;
static void *(*real_calloc_linux)(size_t, size_t) = NULL;
#endif
// ===============================================================================
// 2. Backend Allocator Logic (Updated for Pointer Tagging)
// ===============================================================================
void *bufferpool_malloc(size_t size) {
atomic_fetch_add_explicit(&stats_malloc_calls, 1, memory_order_relaxed);
if (UNLIKELY(global_state != 1 || size == 0 || size >= LOOKUP_TABLE_SIZE)) {
#ifdef __APPLE__
return real_malloc(default_zone, size);
#else
return real_malloc_linux(size);
#endif
}
int class_idx = size_to_class[size];
if (UNLIKELY(class_idx < 0)) {
#ifdef __APPLE__
return real_malloc(default_zone, size);
#else
return real_malloc_linux(size);
#endif
}
slab_segment_t* slab = atomic_load_explicit(&slab_lists[class_idx], memory_order_acquire);
while (1) {
if (!slab) {
atomic_fetch_add_explicit(&stats_slab_creates, 1, memory_order_relaxed);
slab = add_new_slab(class_idx);
if (!slab) {
#ifdef __APPLE__
return real_malloc(default_zone, size);
#else
return real_malloc_linux(size);
#endif
}
}
tagged_ptr_t old_head, new_head;
__int128 old_head_packed = atomic_load_explicit(&slab->freelist_head, memory_order_acquire);
while (1) {
memcpy(&old_head, &old_head_packed, sizeof(old_head_packed));
if (old_head.ptr == NULL) {
atomic_fetch_add_explicit(&stats_freelist_misses, 1, memory_order_relaxed);
break;
}
new_head.ptr = old_head.ptr->free_list_node.next;
new_head.tag = old_head.tag + 1;
__int128 new_head_packed;
memcpy(&new_head_packed, &new_head, sizeof(new_head_packed));
if (atomic_compare_exchange_weak_explicit(&slab->freelist_head, &old_head_packed, new_head_packed, memory_order_acq_rel, memory_order_acquire)) {
atomic_fetch_add_explicit(&stats_freelist_hits, 1, memory_order_relaxed);
obj_header_t* header = old_head.ptr;
// Pack the metadata into the tag
uint64_t tag = ((uint64_t)TAG_MAGIC << TAG_MAGIC_SHIFT) | slab->size_class_idx;
uintptr_t tagged_ptr = ((uintptr_t)slab & POINTER_MASK) | (tag << TAG_SHIFT);
atomic_store_explicit(&header->data.tagged_slab_ptr, tagged_ptr, memory_order_relaxed);
return (void*)((char*)header + HEADER_SIZE);
}
}
slab = atomic_load_explicit(&slab->next_slab, memory_order_acquire);
}
}
void bufferpool_free(void *ptr) {
atomic_fetch_add_explicit(&stats_free_calls, 1, memory_order_relaxed);
if (UNLIKELY(!ptr)) return;
obj_header_t* header = (obj_header_t*)((char*)ptr - HEADER_SIZE);
uintptr_t tagged_ptr = atomic_load_explicit(&header->data.tagged_slab_ptr, memory_order_relaxed);
// Unpack the metadata
uint16_t tag = (tagged_ptr & TAG_MASK) >> TAG_SHIFT;
uint16_t magic = tag >> TAG_MAGIC_SHIFT;
if (UNLIKELY(magic != TAG_MAGIC)) {
#ifdef __APPLE__
if (real_free) real_free(default_zone, ptr);
#else
if (real_free_linux) real_free_linux(ptr);
#endif
return;
}
// Atomically clear the whole tag (magic + class index); if a concurrent free of
// the same pointer already cleared it, the CAS below fails and we bail out.
uintptr_t new_tagged_ptr = tagged_ptr & POINTER_MASK;
if (!atomic_compare_exchange_strong_explicit(&header->data.tagged_slab_ptr, &tagged_ptr, new_tagged_ptr, memory_order_acq_rel, memory_order_relaxed)) {
return; // Race with another free
}
slab_segment_t* slab = (slab_segment_t*)(tagged_ptr & POINTER_MASK);
tagged_ptr_t old_head, new_head;
__int128 old_head_packed = atomic_load_explicit(&slab->freelist_head, memory_order_acquire);
new_head.ptr = header;
do {
memcpy(&old_head, &old_head_packed, sizeof(old_head_packed));
header->free_list_node.next = old_head.ptr;
new_head.tag = old_head.tag + 1;
__int128 new_head_packed;
memcpy(&new_head_packed, &new_head, sizeof(new_head_packed));
if (atomic_compare_exchange_weak_explicit(&slab->freelist_head, &old_head_packed, new_head_packed, memory_order_acq_rel, memory_order_acquire)) return;
} while(1);
}
size_t bufferpool_size(const void *ptr) {
if (!ptr) return 0;
const obj_header_t *header = (const obj_header_t*)((const char*)ptr - HEADER_SIZE);
uintptr_t tagged_ptr = atomic_load_explicit(&header->data.tagged_slab_ptr, memory_order_relaxed);
uint16_t tag = (tagged_ptr & TAG_MASK) >> TAG_SHIFT;
uint16_t magic = tag >> TAG_MAGIC_SHIFT;
if (magic == TAG_MAGIC) {
uint16_t class_idx = tag & TAG_CLASS_MASK;
return base_sizes[class_idx];
}
#ifdef __APPLE__
if (real_size) return real_size(default_zone, ptr);
#endif
return 0;
}
void *bufferpool_realloc(void *ptr, size_t size) {
atomic_fetch_add_explicit(&stats_realloc_calls, 1, memory_order_relaxed);
if (!ptr) return bufferpool_malloc(size);
if (size == 0) { bufferpool_free(ptr); return NULL; }
obj_header_t* header = (obj_header_t*)((char*)ptr - HEADER_SIZE);
uintptr_t tagged_ptr = atomic_load_explicit(&header->data.tagged_slab_ptr, memory_order_relaxed);
uint16_t tag = (tagged_ptr & TAG_MASK) >> TAG_SHIFT;
uint16_t magic = tag >> TAG_MAGIC_SHIFT;
if (magic != TAG_MAGIC) {
#ifdef __APPLE__
return real_realloc(default_zone, ptr, size);
#else
return real_realloc_linux(ptr, size);
#endif
}
uint16_t class_idx = tag & TAG_CLASS_MASK;
size_t old_size = base_sizes[class_idx];
if (size < LOOKUP_TABLE_SIZE && class_idx == size_to_class[size]) {
atomic_fetch_add_explicit(&stats_realloc_same_ptr, 1, memory_order_relaxed);
return ptr;
}
atomic_fetch_add_explicit(&stats_realloc_fallback, 1, memory_order_relaxed);
void *new_ptr = bufferpool_malloc(size);
if (new_ptr) {
memcpy(new_ptr, ptr, old_size < size ? old_size : size);
bufferpool_free(ptr);
}
return new_ptr;
}
void *bufferpool_calloc(size_t num_items, size_t size) {
size_t total_size = num_items * size;
if (size != 0 && total_size / size != num_items) return NULL;
void *ptr = bufferpool_malloc(total_size);
if (ptr) {
memset(ptr, 0, total_size);
}
return ptr;
}
// ===============================================================================
// 3. Helper and Initialization Functions
// ===============================================================================
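// hash_to_shard_index() runs the page number through a SplitMix64-style
// finalizer and masks the result down to a shard index. This assumes
// num_map_shards is a power of two, which bufferpool_init_internal()
// guarantees via next_power_of_two().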
static inline uint32_t hash_to_shard_index(uintptr_t key) {
uint64_t k = (uint64_t)key;
k = (k ^ (k >> 30)) * 0xbf58476d1ce4e5b9;
k = (k ^ (k >> 27)) * 0x94d049bb133111eb;
k = k ^ (k >> 31);
return (uint32_t)(k & (num_map_shards - 1));
}
static size_t get_slab_size_for_class(uint32_t idx) {
if (idx >= NUM_SIZE_CLASSES) return 0;
size_t bs = base_sizes[idx];
if (bs <= 1024) return slab_size_small;
if (bs <= 32768) return slab_size_medium;
return slab_size_large;
}
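// add_new_slab(): mmap a fresh region, carve it into fixed-size slots threaded
// onto the slab's freelist, register every page of the region in the sharded
// page map, then publish the slab at the head of its class list with a CAS loop.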
static slab_segment_t* add_new_slab(uint16_t class_idx) {
size_t slab_size = get_slab_size_for_class(class_idx);
void* mem = mmap(NULL, slab_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (mem == MAP_FAILED) return NULL;
slab_segment_t* slab;
#ifdef __APPLE__
slab = real_malloc(default_zone, sizeof(slab_segment_t));
#else
slab = real_malloc_linux(sizeof(slab_segment_t));
#endif
if (!slab) { munmap(mem, slab_size); return NULL; }
*slab = (slab_segment_t){ .memory_base=mem, .size_class_idx=class_idx, .obj_size=size_classes[class_idx], .obj_count=slab_size / size_classes[class_idx] };
atomic_init(&slab->next_slab, NULL);
obj_header_t* head = NULL;
for (uint32_t i = 0; i < slab->obj_count; ++i) {
char* obj_mem = (char*)mem + (i * slab->obj_size);
obj_header_t* header = (obj_header_t*)obj_mem;
header->free_list_node.next = head;
head = header;
}
tagged_ptr_t initial_head = { .ptr = head, .tag = 0 };
__int128 initial_head_packed;
memcpy(&initial_head_packed, &initial_head, sizeof(initial_head_packed));
atomic_init(&slab->freelist_head, initial_head_packed);
for (size_t i = 0; i < slab_size; i += page_size) {
uintptr_t page_addr = (uintptr_t)mem + i;
uint32_t shard_idx = hash_to_shard_index(page_addr / page_size);
slab_page_map_entry_t *entry;
#ifdef __APPLE__
entry = real_malloc(default_zone, sizeof(slab_page_map_entry_t));
#else
entry = real_malloc_linux(sizeof(slab_page_map_entry_t));
#endif
if (!entry) abort();
entry->page_addr = page_addr;
entry->slab = slab;
pthread_mutex_lock(&slab_map_locks[shard_idx]);
HASH_ADD_PTR(slab_map_shards[shard_idx], page_addr, entry);
pthread_mutex_unlock(&slab_map_locks[shard_idx]);
}
slab_segment_t* old_head = atomic_load_explicit(&slab_lists[class_idx], memory_order_acquire);
do {
atomic_store_explicit(&slab->next_slab, old_head, memory_order_relaxed);
} while (!atomic_compare_exchange_weak_explicit(&slab_lists[class_idx], &old_head, slab, memory_order_release, memory_order_acquire));
return slab;
}
static size_t get_total_physical_memory() {
#ifdef __APPLE__
int mib[2] = {CTL_HW, HW_MEMSIZE};
uint64_t memsize;
size_t len = sizeof(memsize);
if (sysctl(mib, 2, &memsize, &len, NULL, 0) == 0) return (size_t)memsize;
#else
long pages = sysconf(_SC_PHYS_PAGES);
long page_sz = sysconf(_SC_PAGE_SIZE);
if (pages > 0 && page_sz > 0) return (size_t)pages * (size_t)page_sz;
#endif
return 0;
}
static size_t next_power_of_two(size_t n) {
if (n == 0) return 1;
n--; n |= n >> 1; n |= n >> 2; n |= n >> 4; n |= n >> 8; n |= n >> 16;
#if __x86_64__ || __aarch64__
n |= n >> 32;
#endif
n++;
return n;
}
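// bufferpool_init_internal(): derive the slab sizes from total physical RAM,
// size the page-map shard count from the CPU count, build the size_to_class
// lookup table (padding each slot for the 8-byte header and max_align_t
// alignment), then flip global_state to 1 so the fast paths take over.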
int bufferpool_init_internal(void) {
long ret_page_size = sysconf(_SC_PAGESIZE);
if (ret_page_size > 0) page_size = (size_t)ret_page_size;
size_t ram_gb = get_total_physical_memory() / (1024 * 1024 * 1024);
int scale_factor;
if (ram_gb < 16) scale_factor = 1; else if (ram_gb < 32) scale_factor = 2;
else if (ram_gb < 64) scale_factor = 3; else if (ram_gb < 96) scale_factor = 4;
else if (ram_gb < 128) scale_factor = 5; else if (ram_gb < 160) scale_factor = 6;
else if (ram_gb < 192) scale_factor = 7; else scale_factor = 8;
slab_size_small = (128 * 1024) * scale_factor;
slab_size_medium = (512 * 1024) * scale_factor;
slab_size_large = (2 * 1024 * 1024) * scale_factor;
long nprocs = sysconf(_SC_NPROCESSORS_ONLN);
if (nprocs <= 0) nprocs = 1;
size_t shards = next_power_of_two((size_t)nprocs * 2);
num_map_shards = shards < 128 ? 128 : shards;
#ifdef __APPLE__
slab_map_shards = real_calloc(default_zone, num_map_shards, sizeof(slab_page_map_entry_t*));
slab_map_locks = real_malloc(default_zone, num_map_shards * sizeof(pthread_mutex_t));
#else
slab_map_shards = real_calloc_linux(num_map_shards, sizeof(slab_page_map_entry_t*));
slab_map_locks = real_malloc_linux(num_map_shards * sizeof(pthread_mutex_t));
#endif
if (!slab_map_shards || !slab_map_locks) return -1;
for(size_t i=0; i<num_map_shards; ++i) pthread_mutex_init(&slab_map_locks[i], NULL);
memset(size_to_class, -1, sizeof(size_to_class));
for (size_t i = 0; i < NUM_SIZE_CLASSES; ++i) {
size_classes[i] = base_sizes[i] + OVERHEAD;
size_classes[i] = (size_classes[i] + (alignof(max_align_t) - 1)) & ~(alignof(max_align_t) - 1);
for (size_t sz = (i > 0 ? base_sizes[i-1] + 1 : 1); sz <= base_sizes[i] && sz < LOOKUP_TABLE_SIZE; ++sz) size_to_class[sz] = i;
atomic_init(&slab_lists[i], NULL);
}
atomic_store(&global_state, 1);
return 0;
}
#ifdef __APPLE__
// ===============================================================================
// 4. macOS Malloc Zone Hijacking
// ===============================================================================
static size_t bp_zone_size_hijack(malloc_zone_t *zone, const void *ptr) {
return bufferpool_size(ptr);
}
static void *bp_zone_malloc_hijack(malloc_zone_t *zone, size_t size) {
return bufferpool_malloc(size);
}
static void *bp_zone_calloc_hijack(malloc_zone_t *zone, size_t num_items, size_t size) {
return bufferpool_calloc(num_items, size);
}
static void *bp_zone_realloc_hijack(malloc_zone_t *zone, void *ptr, size_t size) {
return bufferpool_realloc(ptr, size);
}
static void bp_zone_free_hijack(malloc_zone_t *zone, void *ptr) {
bufferpool_free(ptr);
}
__attribute__((constructor))
void hijack_default_zone(void) {
default_zone = malloc_default_zone();
if (!default_zone) return;
real_size = default_zone->size;
real_malloc = default_zone->malloc;
real_calloc = default_zone->calloc;
real_realloc = default_zone->realloc;
real_free = default_zone->free;
if (bufferpool_init_internal() != 0) {
return;
}
default_zone->size = bp_zone_size_hijack;
default_zone->malloc = bp_zone_malloc_hijack;
default_zone->calloc = bp_zone_calloc_hijack;
default_zone->realloc = bp_zone_realloc_hijack;
default_zone->free = bp_zone_free_hijack;
}
#else
// ===============================================================================
// 4. Linux LD_PRELOAD Interception
// ===============================================================================
void *malloc(size_t size) {
return bufferpool_malloc(size);
}
void free(void *ptr) {
bufferpool_free(ptr);
}
void *calloc(size_t n, size_t s) {
return bufferpool_calloc(n, s);
}
void *realloc(void *ptr, size_t size) {
return bufferpool_realloc(ptr, size);
}
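// Note: glibc's dlsym() can itself allocate (historically via calloc). If that
// happens before ctor() below has resolved the real_* pointers, the fallback
// branches above would call through a NULL function pointer. A common
// mitigation, not implemented here, is to serve such early requests from a
// small static bootstrap buffer.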
__attribute__((constructor))
void ctor(void) {
real_malloc_linux = dlsym(RTLD_NEXT, "malloc");
real_free_linux = dlsym(RTLD_NEXT, "free");
real_realloc_linux = dlsym(RTLD_NEXT, "realloc");
real_calloc_linux = dlsym(RTLD_NEXT, "calloc");
if (!real_malloc_linux || !real_free_linux || !real_realloc_linux || !real_calloc_linux) {
abort();
}
bufferpool_init_internal();
}
#endif
__attribute__((destructor))
void print_bufferpool_stats(void) {
printf("\n=== BufferPool Statistics ===\n");
printf("Malloc calls: %llu\n", (unsigned long long)atomic_load(&stats_malloc_calls));
printf("Free calls: %llu\n", (unsigned long long)atomic_load(&stats_free_calls));
printf("Realloc calls: %llu\n", (unsigned long long)atomic_load(&stats_realloc_calls));
printf("Realloc same ptr: %llu\n", (unsigned long long)atomic_load(&stats_realloc_same_ptr));
printf("Realloc fallback: %llu\n", (unsigned long long)atomic_load(&stats_realloc_fallback));
printf("Slab creates: %llu\n", (unsigned long long)atomic_load(&stats_slab_creates));
printf("Freelist hits: %llu\n", (unsigned long long)atomic_load(&stats_freelist_hits));
printf("Freelist misses: %llu\n", (unsigned long long)atomic_load(&stats_freelist_misses));
printf("=============================\n");
}
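// -----------------------------------------------------------------------------
// Example usage (a sketch, not part of the original gist). On Linux this file
// is typically built as an LD_PRELOAD library, for example:
//
//   cc -shared -fPIC -O2 bufferpool.c -o libbufferpool.so -lpthread
//   LD_PRELOAD=./libbufferpool.so ./your_program
//
// Flags are assumptions: x86-64 builds usually want -mcx16 for the 128-bit CAS,
// some toolchains also need -latomic, and uthash.h must be on the include path.
// The guarded smoke test below (the BUFFERPOOL_TEST macro and main() are
// additions for illustration) exercises the bufferpool_* API directly when the
// file is compiled as an executable with -DBUFFERPOOL_TEST.
#ifdef BUFFERPOOL_TEST
int main(void) {
char *p = bufferpool_malloc(100);       // 100 bytes -> served from the 128-byte class
if (p) memset(p, 0xAB, 100);
p = bufferpool_realloc(p, 120);         // still the 128-byte class: same pointer returned
void *q = bufferpool_calloc(8, 64);     // 512 bytes, zero-filled
bufferpool_free(p);
bufferpool_free(q);
return 0;
}
#endif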
#ifndef BUFFERPOOL_H
#define BUFFERPOOL_H
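#include <stddef.h>
// Prototypes for the public allocator entry points defined in bufferpool.c above.
void *bufferpool_malloc(size_t size);
void bufferpool_free(void *ptr);
void *bufferpool_realloc(void *ptr, size_t size);
void *bufferpool_calloc(size_t num_items, size_t size);
size_t bufferpool_size(const void *ptr);
int bufferpool_init_internal(void);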
#endif // BUFFERPOOL_H