Created
April 16, 2025 07:03
-
-
Save alonsoir/3120c3f92638be18174065415b0b6104 to your computer and use it in GitHub Desktop.
GCC’s memcpy often leverages advanced techniques like vector instructions (SSE/AVX) and Enhanced REP MOVSB (ERMSB), which can hit higher bandwidths—sometimes over 20 GB/s on similar hardware, as seen in benchmarks from Handmade Network. ERMSB, introduced with Ivy Bridge, uses 256-bit internal operations on Haswell and later CPUs, avoiding cache …
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Assemble the NASM routine, link it into the C benchmark, and run the result.
# -e stops at the first failing step, so a failed assemble/compile never leads
# to benchmarking a stale binary; -u turns unset-variable typos into errors.
set -eu

nasm -f macho64 fast_memcpy.asm -o fast_memcpy.o
clang -O2 test_memcpy.c fast_memcpy.o -o test_memcpy
./test_memcpy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;-----------------------------------------------------------------------
; void *fast_memcpy(void *dst, const void *src, size_t n)
; Syntax: NASM (Intel operand order), 64-bit
; ABI:    System V AMD64; exported with a leading underscore (Mach-O)
; In:     rdi = dst, rsi = src, rdx = n   (regions must not overlap)
; Out:    rax = dst (the original pointer, on every path)
; Clobb:  rcx, rdx, rsi, rdi, xmm0/ymm0, flags
; Notes:  n >= AVX_THRESHOLD takes the AVX path and therefore needs an
;         AVX-capable CPU; smaller copies use SSE2 only.
;         rep movsb copies forward, i.e. relies on DF being clear on entry.
;-----------------------------------------------------------------------
AVX_THRESHOLD   equ     256             ; below this, use the SSE path
YMM_BYTES       equ     32              ; bytes moved per AVX iteration
XMM_BYTES       equ     16              ; bytes moved per SSE iteration

section .text
global _fast_memcpy

_fast_memcpy:
        mov     rax, rdi                ; return value = original dst
        cmp     rdx, AVX_THRESHOLD
        jb      .small_copy

        ; ---- Large copy: byte-copy a head so dst becomes 32-byte aligned ----
        mov     ecx, edi
        neg     ecx
        and     ecx, YMM_BYTES - 1      ; rcx = (-dst) & 31 = head length
        jz      .aligned                ; dst already aligned
        sub     rdx, rcx                ; cannot underflow: n >= 256 > rcx
        rep movsb                       ; leaves rdi/rsi advanced past the head

.aligned:
        ; Invariant: rdi is 32-byte aligned, rdx >= 225 bytes remain.
        mov     rcx, rdx
        shr     rcx, 5                  ; rcx = number of 32-byte blocks (>= 7)
        and     edx, YMM_BYTES - 1      ; rdx = tail bytes left after the blocks

.avx_loop:
        vmovdqu ymm0, [rsi]             ; src alignment is unknown
        vmovdqa [rdi], ymm0             ; dst is aligned by construction
        add     rsi, YMM_BYTES
        add     rdi, YMM_BYTES
        dec     rcx
        jnz     .avx_loop

        vzeroupper                      ; leave no dirty upper YMM state for SSE callers
        mov     ecx, edx
        rep movsb                       ; 0..31 tail bytes
        ret

        ; ---- Small copy (n < 256): 16-byte SSE blocks, then a byte tail ----
.small_copy:
        mov     ecx, edx                ; n < 256, so 32-bit ops are enough
        shr     ecx, 4                  ; rcx = number of 16-byte blocks
        jz      .tail                   ; n < 16: only the byte tail

.sse_loop:
        movdqu  xmm0, [rsi]
        movdqu  [rdi], xmm0
        add     rsi, XMM_BYTES
        add     rdi, XMM_BYTES
        dec     ecx
        jnz     .sse_loop

.tail:
        mov     ecx, edx
        and     ecx, XMM_BYTES - 1      ; rcx = n mod 16 (0 when n == 0)
        rep movsb
        ret
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
./compile_and_test_memcpy_test.sh | |
ld: warning: no platform load command found in '/private/tmp/memcpy_test/fast_memcpy.o', assuming: macOS | |
Size: 64 bytes | |
fast_memcpy: 3.19 GB/s (48.08 cycles/iter) | |
std memcpy: 10.97 GB/s (14.00 cycles/iter) | |
-------------------- | |
Size: 256 bytes | |
fast_memcpy: 11.49 GB/s (53.47 cycles/iter) | |
std memcpy: 45.87 GB/s (13.39 cycles/iter) | |
-------------------- | |
Size: 1024 bytes | |
fast_memcpy: 33.02 GB/s (74.43 cycles/iter) | |
std memcpy: 87.47 GB/s (28.10 cycles/iter) | |
-------------------- | |
Size: 4096 bytes | |
fast_memcpy: 44.69 GB/s (219.98 cycles/iter) | |
std memcpy: 72.05 GB/s (136.45 cycles/iter) | |
-------------------- | |
Size: 1048576 bytes | |
fast_memcpy: 27.24 GB/s (92395.32 cycles/iter) | |
std memcpy: 33.85 GB/s (74350.49 cycles/iter) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <time.h> | |
// Declare the assembly function | |
void *fast_memcpy(void *dst, const void *src, size_t n); | |
// Read the CPU time-stamp counter. The value is a count of TSC ticks, not
// nanoseconds; callers convert it using an assumed clock frequency.
// RDTSC delivers the low 32 bits in EAX and the high 32 bits in EDX.
static inline unsigned long long rdtsc(void) {
    unsigned int low_half;
    unsigned int high_half;
    __asm__ volatile("rdtsc" : "=a"(low_half), "=d"(high_half));
    unsigned long long ticks = high_half;
    ticks <<= 32;
    ticks |= low_half;
    return ticks;
}
// Benchmark fast_memcpy against the C library memcpy over several buffer
// sizes and print the average cycles per call plus an estimated bandwidth.
// Returns 0 on success, 1 if allocation fails or fast_memcpy copies wrongly.
int main(void) {
    // Buffer sizes to test: 64B, 256B, 1KB, 4KB, 1MB (largest last)
    const size_t sizes[] = {64, 256, 1024, 4096, 1048576};
    const int num_sizes = sizeof(sizes) / sizeof(sizes[0]);
    const size_t max_size = sizes[num_sizes - 1];

    // Assumed TSC frequency used to turn ticks into nanoseconds. This is a
    // hard-coded estimate (i9-9980HK base clock ~2.4 GHz); adjust per machine.
    const double cpu_freq_ghz = 2.4;

    // Allocate 32-byte aligned buffers big enough for the largest test size.
    // posix_memalign reports failure through its return value and leaves the
    // pointer unspecified, so it must be checked before the buffers are used.
    void *src = NULL;
    void *dst = NULL;
    if (posix_memalign(&src, 32, max_size) != 0) {
        fprintf(stderr, "posix_memalign failed for source buffer\n");
        return 1;
    }
    if (posix_memalign(&dst, 32, max_size) != 0) {
        fprintf(stderr, "posix_memalign failed for destination buffer\n");
        free(src);
        return 1;
    }

    // Fill source buffer with a known pattern
    memset(src, 0xAA, max_size);

    for (int s = 0; s < num_sizes; s++) {
        const size_t size = sizes[s];
        // Fewer iterations for large sizes to keep the runtime reasonable
        const int iterations = (size < 4096) ? 1000000 : 10000;
        unsigned long long start, end;

        // Clear the destination so the check below really validates this
        // size instead of data left behind by an earlier, smaller run.
        memset(dst, 0, size);

        // Time fast_memcpy
        unsigned long long cycles_fast = 0;
        for (int i = 0; i < iterations; i++) {
            start = rdtsc();
            fast_memcpy(dst, src, size);
            end = rdtsc();
            cycles_fast += (end - start);
        }

        // A fast but wrong copy is worthless: verify the bytes before reporting
        if (memcmp(dst, src, size) != 0) {
            fprintf(stderr, "fast_memcpy produced wrong data for size %zu\n", size);
            free(src);
            free(dst);
            return 1;
        }

        // Time the standard memcpy
        unsigned long long cycles_std = 0;
        for (int i = 0; i < iterations; i++) {
            start = rdtsc();
            memcpy(dst, src, size);
            end = rdtsc();
            cycles_std += (end - start);
        }

        // Average ticks per call, and the matching time at the assumed clock
        const double avg_cycles_fast = (double)cycles_fast / iterations;
        const double avg_cycles_std = (double)cycles_std / iterations;
        const double time_fast_ns = avg_cycles_fast / cpu_freq_ghz;
        const double time_std_ns = avg_cycles_std / cpu_freq_ghz;

        // Bytes per nanosecond is numerically the same as GB/s (1e9 bytes/s)
        const double gbps_fast = size / time_fast_ns;
        const double gbps_std = size / time_std_ns;

        printf("Size: %zu bytes\n", size);
        printf("fast_memcpy: %.2f GB/s (%.2f cycles/iter)\n", gbps_fast, avg_cycles_fast);
        printf("std memcpy: %.2f GB/s (%.2f cycles/iter)\n", gbps_std, avg_cycles_std);
        printf("--------------------\n");
    }

    free(src);
    free(dst);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment