Skip to content

Instantly share code, notes, and snippets.

@alonsoir
Created April 16, 2025 07:03
Show Gist options
  • Save alonsoir/3120c3f92638be18174065415b0b6104 to your computer and use it in GitHub Desktop.
Save alonsoir/3120c3f92638be18174065415b0b6104 to your computer and use it in GitHub Desktop.
The memcpy used by GCC-built programs (provided by the C library, e.g. glibc) often leverages advanced techniques like vector instructions (SSE/AVX) and Enhanced REP MOVSB (ERMSB), which can reach higher bandwidths—sometimes over 20 GB/s on similar hardware, as seen in benchmarks from Handmade Network. ERMSB, introduced with Ivy Bridge, uses 256-bit internal operations on Haswell and later CPUs, avoiding cache …
# Assemble the NASM source into a 64-bit Mach-O object file.
nasm -f macho64 fast_memcpy.asm -o fast_memcpy.o
# Compile the C benchmark at -O2 and link it against the assembled object.
clang -O2 test_memcpy.c fast_memcpy.o -o test_memcpy
# Run the benchmark; it prints bandwidth and cycles per call for each buffer size.
./test_memcpy
; -----------------------------------------------------------------------------
; void *fast_memcpy(void *dst, const void *src, size_t n)
;
; Arguments (System V AMD64): rdi = dst, rsi = src, rdx = n.
; Returns:  rax = the original dst on every path, like memcpy.
; Clobbers: rcx, rdx, rsi, rdi, xmm0/ymm0, flags (all caller-saved, so no
;           registers need to be pushed).
; Requires: AVX when n >= 256; the direction flag clear on entry (guaranteed
;           by the ABI); non-overlapping source and destination.
;
; Strategy:
;   n <  256 : 16-byte unaligned SSE blocks, then a byte tail.
;   n >= 256 : byte-copy up to the next 32-byte boundary of dst, then 32-byte
;              blocks (unaligned load, aligned store), then a byte tail.
; -----------------------------------------------------------------------------
section .text
global _fast_memcpy
_fast_memcpy:
    mov     rax, rdi                ; capture dst before any path advances rdi
    cmp     rdx, 256
    jb      .small_copy

    ; Head: copy single bytes until dst sits on a 32-byte boundary.
    mov     rcx, rdi
    neg     rcx
    and     rcx, 31                 ; rcx = (32 - dst % 32) % 32
    jz      .aligned
    sub     rdx, rcx                ; head bytes leave the bulk count
    rep movsb                       ; advances rdi/rsi past the head; they must
                                    ; stay advanced so the stores below are aligned
.aligned:
    ; n >= 256 and the head is at most 31 bytes, so rdx >= 225 here and the
    ; block count below is never zero.
    mov     rcx, rdx
    shr     rcx, 5                  ; number of 32-byte blocks
.avx_loop:
    vmovdqu ymm0, [rsi]             ; src alignment is unknown
    vmovdqa [rdi], ymm0             ; dst is 32-byte aligned at this point
    add     rsi, 32
    add     rdi, 32
    dec     rcx
    jnz     .avx_loop
    vzeroupper                      ; leave the upper YMM halves clean for SSE code in the caller

    and     rdx, 31                 ; bytes left after the last full block
    mov     rcx, rdx
    rep movsb                       ; rcx == 0 copies nothing
    ret

.small_copy:
    mov     rcx, rdx
    shr     rcx, 4                  ; number of 16-byte blocks
    jz      .tail                   ; n < 16: rdx is already the tail length
.sse_loop:
    movdqu  xmm0, [rsi]
    movdqu  [rdi], xmm0
    add     rsi, 16
    add     rdi, 16
    dec     rcx
    jnz     .sse_loop
    and     rdx, 15                 ; bytes left after the last full block
.tail:
    mov     rcx, rdx
    rep movsb                       ; also handles n == 0
    ret
./compile_and_test_memcpy_test.sh
ld: warning: no platform load command found in '/private/tmp/memcpy_test/fast_memcpy.o', assuming: macOS
Size: 64 bytes
fast_memcpy: 3.19 GB/s (48.08 cycles/iter)
std memcpy: 10.97 GB/s (14.00 cycles/iter)
--------------------
Size: 256 bytes
fast_memcpy: 11.49 GB/s (53.47 cycles/iter)
std memcpy: 45.87 GB/s (13.39 cycles/iter)
--------------------
Size: 1024 bytes
fast_memcpy: 33.02 GB/s (74.43 cycles/iter)
std memcpy: 87.47 GB/s (28.10 cycles/iter)
--------------------
Size: 4096 bytes
fast_memcpy: 44.69 GB/s (219.98 cycles/iter)
std memcpy: 72.05 GB/s (136.45 cycles/iter)
--------------------
Size: 1048576 bytes
fast_memcpy: 27.24 GB/s (92395.32 cycles/iter)
std memcpy: 33.85 GB/s (74350.49 cycles/iter)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
// Prototype for the hand-written copy routine that the assembly listing exports
// as the global symbol _fast_memcpy and that is linked in from fast_memcpy.o.
// NOTE(review): the C name has no leading underscore; presumably the Mach-O
// toolchain adds it when resolving the symbol — confirm before building for
// another object format.
void *fast_memcpy(void *dst, const void *src, size_t n);
// Read the CPU time-stamp counter with the RDTSC instruction.
// The result is a raw 64-bit tick count, not nanoseconds: RDTSC delivers the
// low 32 bits in EAX and the high 32 bits in EDX, which are joined here.
static inline unsigned long long rdtsc(void) {
    unsigned int low_half;
    unsigned int high_half;

    __asm__ volatile("rdtsc" : "=a"(low_half), "=d"(high_half));

    unsigned long long ticks = high_half;
    ticks <<= 32;
    ticks |= low_half;
    return ticks;
}
// Benchmark driver: for each buffer size, check that fast_memcpy produces a
// correct copy, then time fast_memcpy and the library memcpy with rdtsc and
// print bandwidth and average cycles per call.
// Returns 0 on success, EXIT_FAILURE if allocation fails or a copy is wrong.
int main(void) {
    // Buffer sizes to test: 64B, 256B, 1KB, 4KB, 1MB.
    // The last entry must be the largest; it determines the allocation size.
    const size_t sizes[] = {64, 256, 1024, 4096, 1048576};
    const size_t num_sizes = sizeof(sizes) / sizeof(sizes[0]);
    const size_t max_size = sizes[num_sizes - 1];

    // Frequency used to convert cycle counts to time (rough approximation,
    // adjust for your CPU). i9-9980HK base frequency is ~2.4 GHz.
    const double cpu_freq_ghz = 2.4;

    // Allocate 32-byte aligned buffers; posix_memalign returns non-zero on
    // failure and leaves the pointer unspecified, so check before use.
    void *src = NULL;
    void *dst = NULL;
    if (posix_memalign(&src, 32, max_size) != 0) {
        fprintf(stderr, "posix_memalign failed for source buffer\n");
        return EXIT_FAILURE;
    }
    if (posix_memalign(&dst, 32, max_size) != 0) {
        fprintf(stderr, "posix_memalign failed for destination buffer\n");
        free(src);
        return EXIT_FAILURE;
    }

    // Fill the source with a position-dependent pattern so that a copy that
    // drops, repeats, or misplaces bytes is caught by the comparison below.
    unsigned char *src_bytes = src;
    for (size_t i = 0; i < max_size; i++) {
        src_bytes[i] = (unsigned char)(i * 31u + 7u);
    }

    for (size_t s = 0; s < num_sizes; s++) {
        const size_t size = sizes[s];
        const int iterations = (size < 4096) ? 1000000 : 10000; // Fewer iterations for large sizes

        // Verify correctness before timing: a fast but wrong copy is useless.
        memset(dst, 0, size);
        void *ret = fast_memcpy(dst, src, size);
        if (memcmp(dst, src, size) != 0) {
            fprintf(stderr, "fast_memcpy produced a wrong copy for size %zu\n", size);
            free(src);
            free(dst);
            return EXIT_FAILURE;
        }
        if (ret != dst) {
            // memcpy-compatible routines return dst; report but keep benchmarking.
            fprintf(stderr, "warning: fast_memcpy did not return dst for size %zu\n", size);
        }

        // Time fast_memcpy. Each sample includes the cost of the two rdtsc
        // reads, which is significant for the smallest sizes.
        unsigned long long start, end, cycles_fast = 0;
        for (int i = 0; i < iterations; i++) {
            start = rdtsc();
            fast_memcpy(dst, src, size);
            end = rdtsc();
            cycles_fast += (end - start);
        }

        // Time the standard memcpy the same way.
        unsigned long long cycles_std = 0;
        for (int i = 0; i < iterations; i++) {
            start = rdtsc();
            memcpy(dst, src, size);
            end = rdtsc();
            cycles_std += (end - start);
        }

        // Reading dst back keeps the timed stores observable to the compiler.
        if (memcmp(dst, src, size) != 0) {
            fprintf(stderr, "destination mismatch after timing for size %zu\n", size);
            free(src);
            free(dst);
            return EXIT_FAILURE;
        }

        const double cycles_per_iter_fast = (double)cycles_fast / iterations;
        const double cycles_per_iter_std = (double)cycles_std / iterations;

        // cycles / GHz = nanoseconds per call.
        const double time_fast_ns = cycles_per_iter_fast / cpu_freq_ghz;
        const double time_std_ns = cycles_per_iter_std / cpu_freq_ghz;

        // Bytes per nanosecond is numerically equal to GB/s (10^9 bytes/s).
        const double gbps_fast = (double)size / time_fast_ns;
        const double gbps_std = (double)size / time_std_ns;

        printf("Size: %zu bytes\n", size);
        printf("fast_memcpy: %.2f GB/s (%.2f cycles/iter)\n", gbps_fast, cycles_per_iter_fast);
        printf("std memcpy: %.2f GB/s (%.2f cycles/iter)\n", gbps_std, cycles_per_iter_std);
        printf("--------------------\n");
    }

    free(src);
    free(dst);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment