Last active
May 11, 2025 17:49
-
-
Save DocBohn/a31f7f8c2be4ba99dd87f95bee93d101 to your computer and use it in GitHub Desktop.
Code demonstrating hand-crafted and compiler-generated vectorization
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Reference byte-at-a-time copy.
 *
 * Copies `length` bytes from `source` to `destination`, one byte per loop
 * iteration. Both pointers are restrict-qualified, so the caller must pass
 * non-overlapping buffers. A `length` of zero copies nothing.
 */
void simple_memory_copy(char *restrict destination, char const *restrict source, unsigned int length) {
    char *write_cursor = destination;
    char const *read_cursor = source;
    unsigned int remaining = length;

    // Walk both buffers in lockstep until every requested byte has been moved.
    while (remaining != 0) {
        *write_cursor++ = *read_cursor++;
        remaining--;
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Hand-vectorized copy: moves `length` bytes from `source` to `destination`,
 * eight bytes at a time whenever possible.
 *
 * The word-at-a-time path is only used when both pointers are the same
 * distance from an 8-byte boundary, so that aligning one aligns the other.
 * Otherwise every byte is copied individually. Both pointers are
 * restrict-qualified, so the buffers must not overlap.
 *
 * NOTE(review): the 8-byte moves access char buffers through int64_t lvalues;
 * confirm the build tolerates this (e.g. -fno-strict-aliasing) or switch the
 * word moves to memcpy.
 */
void vectorized_memory_copy(char *restrict destination, char const *restrict source, unsigned int length) {
    unsigned int i = 0;     // bytes copied so far; invariant: i <= length
    // check to see if source and destination are equidistant to an 8-byte boundary
    if (((uintptr_t) destination & 0x7) == ((uintptr_t) source & 0x7)) {
        // bytewise copy until we reach a word boundary (or the end of a very short array)
        while (((uintptr_t) destination & 0x7) && (i < length)) {
            *destination++ = *source++;
            i++;
        }
        // Compare against the bytes REMAINING rather than testing i + 8 <= length:
        // i + 8 wraps around for lengths above UINT_MAX - 8, which would keep the
        // word loop running past the end of both buffers. length - i cannot wrap
        // because i never exceeds length.
        if (length - i >= 8) {
            // At least one byte remains, so the prefix loop stopped because the
            // pointers are aligned; only now is it valid to form word pointers.
            int64_t *word_destination = (int64_t *) destination;
            int64_t const *word_source = (int64_t const *) source;
            // wordwise copy until there's less than a full word remaining
            while (length - i >= 8) {
                *word_destination++ = *word_source++;
                i += 8;
            }
            destination = (char *) word_destination;
            source = (char const *) word_source;
        }
    }
    // bytewise copy whatever remains (the whole buffer if the alignments differ)
    while (i < length) {
        *destination++ = *source++;
        i++;
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Copy built on the x86-64 string instructions: moves `length` bytes from
 * `source` to `destination` using `rep movsb` / `rep movsq`.
 *
 * When both pointers are the same distance from an 8-byte boundary, the copy
 * is split into a bytewise prefix (up to the boundary), a quadword bulk, and
 * a bytewise suffix. Otherwise the whole buffer is copied bytewise. Both
 * pointers are restrict-qualified, so the buffers must not overlap.
 *
 * Each asm statement uses a dedicated scratch counter. `rep` decrements the
 * count register to zero, and a "+c" operand writes that zero back into the
 * bound C variable, so the length variables used for bookkeeping must never
 * be bound to the instruction directly. Doing so left `i` stuck at its old
 * value and made the later stages copy far more than `length` bytes. The
 * scratch counter is pointer-width so the full count register is defined.
 *
 * Relies on the direction flag being clear so the pointers advance upward;
 * the System V x86-64 ABI specifies this at function entry.
 */
void stringified_memory_copy(char *restrict destination, char const *restrict source, unsigned int length) {
    unsigned int i = 0;     // bytes copied so far; invariant: i <= length
    // check to see if source and destination are equidistant to an 8-byte boundary
    if (((uintptr_t) destination & 0x7) == ((uintptr_t) source & 0x7)) {
        // Bytewise copy until we reach a word boundary (or the end of a very short array)
        if ((uintptr_t) destination & 0x7) {
            unsigned int prefix_length = 8 - (unsigned int) ((uintptr_t) destination & 0x7);
            if (prefix_length > length) {
                prefix_length = length;
            }
            uintptr_t count = prefix_length;    // consumed (zeroed) by rep
            __asm__ __volatile__ (
                    "rep movsb"
                    : "+D"(destination), "+S"(source), "+c"(count)
                    :
                    : "memory"
                    );
            i += prefix_length;
        }
        // source and destination are now aligned to an 8-byte boundary
        // wordwise copy until there's less than a full word remaining
        unsigned int bulk_length = (length - i) / 8;
        if (bulk_length > 0) {
            uintptr_t count = bulk_length;      // quadwords, consumed (zeroed) by rep
            __asm__ __volatile__ (
                    "rep movsq"
                    : "+D"(destination), "+S"(source), "+c"(count)
                    :
                    : "memory"
                    );
            i += bulk_length * 8;
        }
    }
    // bytewise copy whatever remains (the whole buffer if the alignments differ)
    unsigned int suffix_length = length - i;
    if (suffix_length > 0) {
        uintptr_t count = suffix_length;        // consumed (zeroed) by rep
        __asm__ __volatile__ (
                "rep movsb"
                : "+D"(destination), "+S"(source), "+c"(count)
                :
                : "memory"
                );
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// simple_memory_copy: AArch64 compiler output for the C byte-copy loop, in
// its vectorized form (the optimization level is not stated in this listing).
// Registers, per the C signature under AAPCS64:
//   x0 = destination, x1 = source, w2 = length
// Shape: 16-byte chunks through q31, then at most one 8-byte chunk through
// d31, then a byte loop for whatever is left.
simple_memory_copy: | |
// length == 0: nothing to copy
cbz w2, .L1 | |
// w5 = length - 1; if length <= 15 skip the 16-byte loop (.L10 sets w4 = 0)
sub w5, w2, #1 | |
cmp w5, 14 | |
bls .L10 | |
// x4 = length rounded down to a multiple of 16 (mask 0xFFFFFFF0)
and x4, x2, 4294967280 | |
// x3 = byte offset into both buffers
mov x3, 0 | |
// 16-byte loop: copy one q-register per iteration until x3 reaches x4
.L4: | |
ldr q31, [x1, x3] | |
str q31, [x0, x3] | |
add x3, x3, 16 | |
cmp x3, x4 | |
bne .L4 | |
// length was an exact multiple of 16: done
tst x2, 15 | |
beq .L1 | |
// w4 = number of bytes already copied by the 16-byte loop
and w4, w2, -16 | |
// .L3: w4 = bytes copied so far, w5 = length - 1
.L3: | |
// w5 = (remaining bytes) - 1; with 7 or fewer remaining go to the byte loop
sub w5, w5, w4 | |
cmp w5, 6 | |
bls .L7 | |
// w3 = remaining bytes (8..15 here); copy one 8-byte chunk at offset w4
sub w3, w2, w4 | |
ldr d31, [x1, w4, uxtw] | |
str d31, [x0, w4, uxtw] | |
// exactly 8 bytes were remaining: done
tst x3, 7 | |
beq .L1 | |
// account for the 8-byte chunk: w4 += remaining rounded down to a multiple of 8
and w3, w3, -8 | |
add w4, w4, w3 | |
// .L7: byte tail. Advance both pointers past the w4 bytes already copied,
// then index from x3 = 0.
.L7: | |
mov x3, 0 | |
uxtw x5, w4 | |
add x0, x0, x5 | |
add x1, x1, x5 | |
// byte loop: copy one byte, continue while length > w4 + x3 (unsigned)
.L9: | |
ldrb w5, [x1, x3] | |
strb w5, [x0, x3] | |
add x3, x3, 1 | |
add w5, w4, w3 | |
cmp w2, w5 | |
bhi .L9 | |
.L1: | |
ret | |
// short-input entry (length <= 15): no bytes copied yet
.L10: | |
mov w4, 0 | |
b .L3 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# simple_memory_copy: x86-64 (AT&T syntax) compiler output for the C byte-copy
# loop, in its vectorized form (the optimization level is not stated in this
# listing).
# Registers, per the C signature under the System V AMD64 convention:
#   rdi = destination, rsi = source, edx = length
# Shape: 16-byte chunks through xmm0, then at most one 8-byte chunk through
# r9, then a byte loop for whatever is left.
simple_memory_copy: | |
# keep the source pointer in rcx; rsi is reused below as a byte count
movq %rsi, %rcx | |
# length == 0: nothing to copy
testl %edx, %edx | |
je .L1 | |
# r8d = length - 1; if length <= 15 skip the 16-byte loop (.L10 sets esi = 0)
leal -1(%rdx), %r8d | |
cmpl $14, %r8d | |
jbe .L10 | |
# rsi = (length / 16) * 16, i.e. the byte count covered by 16-byte chunks
movl %edx, %esi | |
shrl $4, %esi | |
movl %esi, %esi | |
salq $4, %rsi | |
# rax = byte offset into both buffers
movl $0, %eax | |
# 16-byte loop: unaligned load/store of one xmm register per iteration
.L4: | |
movdqu (%rcx,%rax), %xmm0 | |
movups %xmm0, (%rdi,%rax) | |
addq $16, %rax | |
cmpq %rsi, %rax | |
jne .L4 | |
# length was an exact multiple of 16: done
testb $15, %dl | |
je .L1 | |
# esi = number of bytes already copied by the 16-byte loop
movl %edx, %esi | |
andl $-16, %esi | |
# .L3: esi = bytes copied so far, r8d = length - 1
.L3: | |
# r8d = (remaining bytes) - 1; with 7 or fewer remaining go to the byte loop
subl %esi, %r8d | |
cmpl $6, %r8d | |
jbe .L7 | |
# eax = remaining bytes (8..15 here); copy one 8-byte chunk at offset esi
movl %edx, %eax | |
subl %esi, %eax | |
movl %esi, %r8d | |
movq (%rcx,%r8), %r9 | |
movq %r9, (%rdi,%r8) | |
# exactly 8 bytes were remaining: done
testb $7, %al | |
je .L1 | |
# account for the 8-byte chunk: esi += remaining rounded down to a multiple of 8
andl $-8, %eax | |
addl %eax, %esi | |
# .L7: byte tail. Advance both pointers past the esi bytes already copied,
# then index from rax = 0.
.L7: | |
movl $0, %eax | |
movl %esi, %r8d | |
addq %r8, %rdi | |
addq %r8, %rcx | |
# byte loop: copy one byte, continue while esi + rax < length (unsigned)
.L9: | |
movzbl (%rcx,%rax), %r8d | |
movb %r8b, (%rdi,%rax) | |
addq $1, %rax | |
leal (%rsi,%rax), %r8d | |
cmpl %edx, %r8d | |
jb .L9 | |
.L1: | |
ret | |
# short-input entry (length <= 15): no bytes copied yet
.L10: | |
movl $0, %esi | |
jmp .L3 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -O1 | |
// simple_memory_copy: AArch64 compiler output for the C byte-copy loop.
// This is a plain one-byte-per-iteration loop with no SIMD registers.
// Registers, per the C signature under AAPCS64:
//   x0 = destination, x1 = source, w2 = length
simple_memory_copy: | |
// length == 0: nothing to copy
cbz w2, .L1 | |
// x3 = length zero-extended to 64 bits; x2 is then reused as the byte index
uxtw x3, w2 | |
mov x2, 0 | |
// copy one byte per iteration until the index x2 reaches the length x3
.L3: | |
ldrb w4, [x1, x2] | |
strb w4, [x0, x2] | |
add x2, x2, 1 | |
cmp x3, x2 | |
bne .L3 | |
.L1: | |
ret |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -O1 | |
# simple_memory_copy: x86-64 (AT&T syntax) compiler output for the C byte-copy
# loop. This is a plain one-byte-per-iteration loop with no SIMD registers.
# Registers, per the C signature under the System V AMD64 convention:
#   rdi = destination, rsi = source, edx = length
simple_memory_copy: | |
# length == 0: nothing to copy
testl %edx, %edx | |
je .L1 | |
# zero-extend length into the full rdx so it can be compared with the 64-bit index
movl %edx, %edx | |
# rax = byte index
movl $0, %eax | |
# copy one byte per iteration until the index rax reaches the length rdx
.L3: | |
movzbl (%rsi,%rax), %ecx | |
movb %cl, (%rdi,%rax) | |
addq $1, %rax | |
cmpq %rax, %rdx | |
jne .L3 | |
.L1: | |
ret |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment