@DocBohn
Last active May 11, 2025 17:49
Code demonstrating hand-crafted and compiler-generated vectorization
#include <stdint.h>  // uintptr_t and int64_t, used by the functions below

// straightforward bytewise copy; at higher optimization levels the compiler
// can auto-vectorize this loop (see the generated assembly below)
void simple_memory_copy(char *restrict destination, char const *restrict source, unsigned int length) {
    for (unsigned int i = 0; i < length; i++) {
        destination[i] = source[i];
    }
}
// hand-vectorized copy: bytewise copy up to an 8-byte boundary, then 8 bytes at a time, then a bytewise tail
void vectorized_memory_copy(char *restrict destination, char const *restrict source, unsigned int length) {
    unsigned int i = 0;
    // check to see if source and destination are equidistant from an 8-byte boundary:
    // e.g., if both addresses end in 0x...5, three bytewise copies align both pointers at once;
    // if their low three bits differ, at most one of them can ever be aligned, so copy bytewise throughout
    if (((uintptr_t) destination & 0x7) == ((uintptr_t) source & 0x7)) {
        // bytewise copy until we reach a word boundary (or the end of a very short array)
        while (((uintptr_t) destination & 0x7) && (i < length)) {
            *destination++ = *source++;
            i++;
        }
        // source and destination are now aligned to an 8-byte boundary;
        // wordwise copy until there's less than a full word remaining
        int64_t *word_destination = (int64_t *) destination;
        int64_t const *word_source = (int64_t const *) source;
        while (i + 8 <= length) {
            *word_destination++ = *word_source++;
            i += 8;
        }
        destination = (char *) word_destination;
        source = (char const *) word_source;
    }
    // bytewise copy whatever remains
    while (i < length) {
        *destination++ = *source++;
        i++;
    }
}
// same strategy as vectorized_memory_copy, but implemented with the x86-64 string-move
// instructions (rep movsb / rep movsq), so this version only assembles for x86-64
void stringified_memory_copy(char *restrict destination, char const *restrict source, unsigned int length) {
    unsigned int i = 0;
    // check to see if source and destination are equidistant from an 8-byte boundary
    if (((uintptr_t) destination & 0x7) == ((uintptr_t) source & 0x7)) {
        // bytewise copy until we reach a word boundary (or the end of a very short array)
        if ((uintptr_t) destination & 0x7) {
            unsigned int prefix_length = 8 - ((uintptr_t) destination & 0x7);
            if (prefix_length > length) {
                prefix_length = length;
            }
            // record the count before the asm: rep movsb counts the count register down to zero,
            // so prefix_length reads back as 0 afterward
            i += prefix_length;
            asm volatile (
                "rep movsb"
                : "+D"(destination), "+S"(source), "+c"(prefix_length)
                :
                : "memory"
            );
        }
        // source and destination are now aligned to an 8-byte boundary;
        // wordwise copy until there's less than a full word remaining
        unsigned int bulk_length = (length - i) / 8;
        if (bulk_length > 0) {
            // as above, record the count before rep movsq zeroes bulk_length
            i += bulk_length * 8;
            asm volatile (
                "rep movsq"
                : "+D"(destination), "+S"(source), "+c"(bulk_length)
                :
                : "memory"
            );
        }
    }
    // bytewise copy whatever remains
    unsigned int suffix_length = length - i;
    if (suffix_length > 0) {
        asm volatile (
            "rep movsb"
            : "+D"(destination), "+S"(source), "+c"(suffix_length)
            :
            : "memory"
        );
    }
}
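
Not part of the original gist, but a minimal harness along the following lines can sanity-check the three copy routines against memcpy over a range of alignments and lengths, assuming it is appended to the file containing the functions above. The buffer size, offsets, and lengths are arbitrary choices, and the stringified_memory_copy call is guarded because its rep movs inline assembly only builds for x86-64.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void) {
    enum { BUFFER_SIZE = 4096 };
    char *source = malloc(BUFFER_SIZE);
    char *expected = malloc(BUFFER_SIZE);
    char *actual = malloc(BUFFER_SIZE);
    int failures = 0;
    for (int i = 0; i < BUFFER_SIZE; i++) {
        source[i] = (char) rand();
    }
    // exercise the aligned fast path, the misaligned fallback, and very short arrays
    for (unsigned int source_offset = 0; source_offset < 8; source_offset++) {
        for (unsigned int destination_offset = 0; destination_offset < 8; destination_offset++) {
            for (unsigned int length = 0; length < 100; length++) {
                memset(expected, 0, BUFFER_SIZE);
                memcpy(expected + destination_offset, source + source_offset, length);

                memset(actual, 0, BUFFER_SIZE);
                simple_memory_copy(actual + destination_offset, source + source_offset, length);
                failures += memcmp(expected, actual, BUFFER_SIZE) != 0;

                memset(actual, 0, BUFFER_SIZE);
                vectorized_memory_copy(actual + destination_offset, source + source_offset, length);
                failures += memcmp(expected, actual, BUFFER_SIZE) != 0;

#if defined(__x86_64__)
                memset(actual, 0, BUFFER_SIZE);
                stringified_memory_copy(actual + destination_offset, source + source_offset, length);
                failures += memcmp(expected, actual, BUFFER_SIZE) != 0;
#endif
            }
        }
    }
    printf("%d mismatches\n", failures);
    free(source);
    free(expected);
    free(actual);
    return failures != 0;
}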
// auto-vectorized AArch64 output: the main loop copies 16 bytes per iteration through the 128-bit q31 register
simple_memory_copy:
        cbz w2, .L1
        sub w5, w2, #1
        cmp w5, 14
        bls .L10
        and x4, x2, 4294967280
        mov x3, 0
.L4:
        ldr q31, [x1, x3]
        str q31, [x0, x3]
        add x3, x3, 16
        cmp x3, x4
        bne .L4
        tst x2, 15
        beq .L1
        and w4, w2, -16
.L3:
        sub w5, w5, w4
        cmp w5, 6
        bls .L7
        sub w3, w2, w4
        ldr d31, [x1, w4, uxtw]
        str d31, [x0, w4, uxtw]
        tst x3, 7
        beq .L1
        and w3, w3, -8
        add w4, w4, w3
.L7:
        mov x3, 0
        uxtw x5, w4
        add x0, x0, x5
        add x1, x1, x5
.L9:
        ldrb w5, [x1, x3]
        strb w5, [x0, x3]
        add x3, x3, 1
        add w5, w4, w3
        cmp w2, w5
        bhi .L9
.L1:
        ret
.L10:
        mov w4, 0
        b .L3
# auto-vectorized x86-64 output: the main loop copies 16 bytes per iteration through the 128-bit %xmm0 register
simple_memory_copy:
        movq %rsi, %rcx
        testl %edx, %edx
        je .L1
        leal -1(%rdx), %r8d
        cmpl $14, %r8d
        jbe .L10
        movl %edx, %esi
        shrl $4, %esi
        movl %esi, %esi
        salq $4, %rsi
        movl $0, %eax
.L4:
        movdqu (%rcx,%rax), %xmm0
        movups %xmm0, (%rdi,%rax)
        addq $16, %rax
        cmpq %rsi, %rax
        jne .L4
        testb $15, %dl
        je .L1
        movl %edx, %esi
        andl $-16, %esi
.L3:
        subl %esi, %r8d
        cmpl $6, %r8d
        jbe .L7
        movl %edx, %eax
        subl %esi, %eax
        movl %esi, %r8d
        movq (%rcx,%r8), %r9
        movq %r9, (%rdi,%r8)
        testb $7, %al
        je .L1
        andl $-8, %eax
        addl %eax, %esi
.L7:
        movl $0, %eax
        movl %esi, %r8d
        addq %r8, %rdi
        addq %r8, %rcx
.L9:
        movzbl (%rcx,%rax), %r8d
        movb %r8b, (%rdi,%rax)
        addq $1, %rax
        leal (%rsi,%rax), %r8d
        cmpl %edx, %r8d
        jb .L9
.L1:
        ret
.L10:
        movl $0, %esi
        jmp .L3
// -O1 AArch64 output: a plain one-byte-per-iteration copy loop, no vectorization
simple_memory_copy:
        cbz w2, .L1
        uxtw x3, w2
        mov x2, 0
.L3:
        ldrb w4, [x1, x2]
        strb w4, [x0, x2]
        add x2, x2, 1
        cmp x3, x2
        bne .L3
.L1:
        ret
# -O1 x86-64 output: a plain one-byte-per-iteration copy loop, no vectorization
simple_memory_copy:
        testl %edx, %edx
        je .L1
        movl %edx, %edx
        movl $0, %eax
.L3:
        movzbl (%rsi,%rax), %ecx
        movb %cl, (%rdi,%rax)
        addq $1, %rax
        cmpq %rax, %rdx
        jne .L3
.L1:
        ret
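
Also not from the gist: a rough timing sketch like the one below gives a feel for the payoff from the wordwise copies when it is appended to the file with the copy routines. It assumes a POSIX clock_gettime; the buffer size and repetition count are arbitrary, results vary with machine, compiler, and optimization level, and a serious benchmark would add warm-up runs and multiple trials.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// wall-clock time in seconds (POSIX clock_gettime)
static double now_seconds(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (double) ts.tv_sec + (double) ts.tv_nsec * 1e-9;
}

int main(void) {
    enum { LENGTH = 1 << 20, REPETITIONS = 1000 };
    char *source = malloc(LENGTH);
    char *destination = malloc(LENGTH);
    for (unsigned int i = 0; i < LENGTH; i++) {
        source[i] = (char) i;
    }

    double start = now_seconds();
    for (int r = 0; r < REPETITIONS; r++) {
        simple_memory_copy(destination, source, LENGTH);
    }
    printf("simple_memory_copy:     %6.3f s\n", now_seconds() - start);

    start = now_seconds();
    for (int r = 0; r < REPETITIONS; r++) {
        vectorized_memory_copy(destination, source, LENGTH);
    }
    printf("vectorized_memory_copy: %6.3f s\n", now_seconds() - start);

    // read a byte of the result so the copies cannot be optimized away entirely
    printf("last byte copied: %d\n", destination[LENGTH - 1]);
    free(source);
    free(destination);
    return 0;
}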