@DocBohn
Last active May 11, 2025 17:49
Code demonstrating hand-crafted and compiler-generated vectorization
#include <stdint.h>  // uintptr_t and int64_t, used by the functions below

// straightforward bytewise copy; at higher optimization levels the compiler
// can auto-vectorize this loop (see the generated assembly below)
void simple_memory_copy(char *restrict destination, char const *restrict source, unsigned int length) {
    for (unsigned int i = 0; i < length; i++) {
        destination[i] = source[i];
    }
}
// hand-vectorized copy: bytewise copy up to an 8-byte boundary, then 8 bytes at a time, then a bytewise tail
void vectorized_memory_copy(char *restrict destination, char const *restrict source, unsigned int length) {
    unsigned int i = 0;
    // check to see if source and destination are equidistant from an 8-byte boundary:
    // e.g., if both addresses end in 0x...5, three bytewise copies align both pointers at once;
    // if their low three bits differ, at most one of them can ever be aligned, so copy bytewise throughout
    if (((uintptr_t) destination & 0x7) == ((uintptr_t) source & 0x7)) {
        // bytewise copy until we reach a word boundary (or the end of a very short array)
        while (((uintptr_t) destination & 0x7) && (i < length)) {
            *destination++ = *source++;
            i++;
        }
        // source and destination are now aligned to an 8-byte boundary;
        // wordwise copy until there's less than a full word remaining
        int64_t *word_destination = (int64_t *) destination;
        int64_t const *word_source = (int64_t const *) source;
        while (i + 8 <= length) {
            *word_destination++ = *word_source++;
            i += 8;
        }
        destination = (char *) word_destination;
        source = (char const *) word_source;
    }
    // bytewise copy whatever remains
    while (i < length) {
        *destination++ = *source++;
        i++;
    }
}
// same strategy as vectorized_memory_copy, but implemented with the x86-64 string-move
// instructions (rep movsb / rep movsq), so this version only assembles for x86-64
void stringified_memory_copy(char *restrict destination, char const *restrict source, unsigned int length) {
    unsigned int i = 0;
    // check to see if source and destination are equidistant from an 8-byte boundary
    if (((uintptr_t) destination & 0x7) == ((uintptr_t) source & 0x7)) {
        // bytewise copy until we reach a word boundary (or the end of a very short array)
        if ((uintptr_t) destination & 0x7) {
            unsigned int prefix_length = 8 - ((uintptr_t) destination & 0x7);
            if (prefix_length > length) {
                prefix_length = length;
            }
            // record the count before the asm: rep movsb counts the count register down to zero,
            // so prefix_length reads back as 0 afterward
            i += prefix_length;
            asm volatile (
                "rep movsb"
                : "+D"(destination), "+S"(source), "+c"(prefix_length)
                :
                : "memory"
            );
        }
        // source and destination are now aligned to an 8-byte boundary;
        // wordwise copy until there's less than a full word remaining
        unsigned int bulk_length = (length - i) / 8;
        if (bulk_length > 0) {
            // as above, record the count before rep movsq zeroes bulk_length
            i += bulk_length * 8;
            asm volatile (
                "rep movsq"
                : "+D"(destination), "+S"(source), "+c"(bulk_length)
                :
                : "memory"
            );
        }
    }
    // bytewise copy whatever remains
    unsigned int suffix_length = length - i;
    if (suffix_length > 0) {
        asm volatile (
            "rep movsb"
            : "+D"(destination), "+S"(source), "+c"(suffix_length)
            :
            : "memory"
        );
    }
}
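
Not part of the original gist, but a minimal harness along the following lines can sanity-check the three copy routines against memcpy over a range of alignments and lengths, assuming it is appended to the file containing the functions above. The buffer size, offsets, and lengths are arbitrary choices, and the stringified_memory_copy call is guarded because its rep movs inline assembly only builds for x86-64.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void) {
    enum { BUFFER_SIZE = 4096 };
    char *source = malloc(BUFFER_SIZE);
    char *expected = malloc(BUFFER_SIZE);
    char *actual = malloc(BUFFER_SIZE);
    int failures = 0;
    for (int i = 0; i < BUFFER_SIZE; i++) {
        source[i] = (char) rand();
    }
    // exercise the aligned fast path, the misaligned fallback, and very short arrays
    for (unsigned int source_offset = 0; source_offset < 8; source_offset++) {
        for (unsigned int destination_offset = 0; destination_offset < 8; destination_offset++) {
            for (unsigned int length = 0; length < 100; length++) {
                memset(expected, 0, BUFFER_SIZE);
                memcpy(expected + destination_offset, source + source_offset, length);

                memset(actual, 0, BUFFER_SIZE);
                simple_memory_copy(actual + destination_offset, source + source_offset, length);
                failures += memcmp(expected, actual, BUFFER_SIZE) != 0;

                memset(actual, 0, BUFFER_SIZE);
                vectorized_memory_copy(actual + destination_offset, source + source_offset, length);
                failures += memcmp(expected, actual, BUFFER_SIZE) != 0;

#if defined(__x86_64__)
                memset(actual, 0, BUFFER_SIZE);
                stringified_memory_copy(actual + destination_offset, source + source_offset, length);
                failures += memcmp(expected, actual, BUFFER_SIZE) != 0;
#endif
            }
        }
    }
    printf("%d mismatches\n", failures);
    free(source);
    free(expected);
    free(actual);
    return failures != 0;
}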
// auto-vectorized AArch64 output: the main loop copies 16 bytes per iteration through the 128-bit q31 register
simple_memory_copy:
        cbz w2, .L1
        sub w5, w2, #1
        cmp w5, 14
        bls .L10
        and x4, x2, 4294967280
        mov x3, 0
.L4:
        ldr q31, [x1, x3]
        str q31, [x0, x3]
        add x3, x3, 16
        cmp x3, x4
        bne .L4
        tst x2, 15
        beq .L1
        and w4, w2, -16
.L3:
        sub w5, w5, w4
        cmp w5, 6
        bls .L7
        sub w3, w2, w4
        ldr d31, [x1, w4, uxtw]
        str d31, [x0, w4, uxtw]
        tst x3, 7
        beq .L1
        and w3, w3, -8
        add w4, w4, w3
.L7:
        mov x3, 0
        uxtw x5, w4
        add x0, x0, x5
        add x1, x1, x5
.L9:
        ldrb w5, [x1, x3]
        strb w5, [x0, x3]
        add x3, x3, 1
        add w5, w4, w3
        cmp w2, w5
        bhi .L9
.L1:
        ret
.L10:
        mov w4, 0
        b .L3
# auto-vectorized x86-64 output: the main loop copies 16 bytes per iteration through the 128-bit %xmm0 register
simple_memory_copy:
        movq %rsi, %rcx
        testl %edx, %edx
        je .L1
        leal -1(%rdx), %r8d
        cmpl $14, %r8d
        jbe .L10
        movl %edx, %esi
        shrl $4, %esi
        movl %esi, %esi
        salq $4, %rsi
        movl $0, %eax
.L4:
        movdqu (%rcx,%rax), %xmm0
        movups %xmm0, (%rdi,%rax)
        addq $16, %rax
        cmpq %rsi, %rax
        jne .L4
        testb $15, %dl
        je .L1
        movl %edx, %esi
        andl $-16, %esi
.L3:
        subl %esi, %r8d
        cmpl $6, %r8d
        jbe .L7
        movl %edx, %eax
        subl %esi, %eax
        movl %esi, %r8d
        movq (%rcx,%r8), %r9
        movq %r9, (%rdi,%r8)
        testb $7, %al
        je .L1
        andl $-8, %eax
        addl %eax, %esi
.L7:
        movl $0, %eax
        movl %esi, %r8d
        addq %r8, %rdi
        addq %r8, %rcx
.L9:
        movzbl (%rcx,%rax), %r8d
        movb %r8b, (%rdi,%rax)
        addq $1, %rax
        leal (%rsi,%rax), %r8d
        cmpl %edx, %r8d
        jb .L9
.L1:
        ret
.L10:
        movl $0, %esi
        jmp .L3
// -O1 AArch64 output: a plain one-byte-per-iteration copy loop, no vectorization
simple_memory_copy:
        cbz w2, .L1
        uxtw x3, w2
        mov x2, 0
.L3:
        ldrb w4, [x1, x2]
        strb w4, [x0, x2]
        add x2, x2, 1
        cmp x3, x2
        bne .L3
.L1:
        ret
# -O1 x86-64 output: a plain one-byte-per-iteration copy loop, no vectorization
simple_memory_copy:
        testl %edx, %edx
        je .L1
        movl %edx, %edx
        movl $0, %eax
.L3:
        movzbl (%rsi,%rax), %ecx
        movb %cl, (%rdi,%rax)
        addq $1, %rax
        cmpq %rax, %rdx
        jne .L3
.L1:
        ret
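
Also not from the gist: a rough timing sketch like the one below gives a feel for the payoff from the wordwise copies when it is appended to the file with the copy routines. It assumes a POSIX clock_gettime; the buffer size and repetition count are arbitrary, results vary with machine, compiler, and optimization level, and a serious benchmark would add warm-up runs and multiple trials.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// wall-clock time in seconds (POSIX clock_gettime)
static double now_seconds(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (double) ts.tv_sec + (double) ts.tv_nsec * 1e-9;
}

int main(void) {
    enum { LENGTH = 1 << 20, REPETITIONS = 1000 };
    char *source = malloc(LENGTH);
    char *destination = malloc(LENGTH);
    for (unsigned int i = 0; i < LENGTH; i++) {
        source[i] = (char) i;
    }

    double start = now_seconds();
    for (int r = 0; r < REPETITIONS; r++) {
        simple_memory_copy(destination, source, LENGTH);
    }
    printf("simple_memory_copy:     %6.3f s\n", now_seconds() - start);

    start = now_seconds();
    for (int r = 0; r < REPETITIONS; r++) {
        vectorized_memory_copy(destination, source, LENGTH);
    }
    printf("vectorized_memory_copy: %6.3f s\n", now_seconds() - start);

    // read a byte of the result so the copies cannot be optimized away entirely
    printf("last byte copied: %d\n", destination[LENGTH - 1]);
    free(source);
    free(destination);
    return 0;
}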