Last active
May 9, 2025 20:55
-
-
Save DocBohn/7ff96640af34509f040b51ecf9fad94c to your computer and use it in GitHub Desktop.
Code demonstrating hand-crafted and compiler-generated loop unrolling
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Lookup table that every loop variant in this demo sums cyclically. */
static int values[] = {5, 6, 2, 9, 8, 2, 9, 7};

/*
 * Reference (rolled) loop used as the baseline for the unrolling demos.
 *
 * Adds `count` table entries, walking the table cyclically: iteration i
 * contributes values[i % table_len].
 *
 * count: number of entries to accumulate; 0 yields 0.
 * Returns the accumulated sum as a long.
 */
long basic_loop(unsigned int count) {
    /* Tie the modulus to the table so the index can never leave the array;
       this is a compile-time constant equal to 8 for the table above. */
    const unsigned int table_len = sizeof values / sizeof values[0];
    long sum = 0;
    for (unsigned int i = 0; i < count; i++) {
        sum += values[i % table_len];
    }
    return sum;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// basic_loop (AArch64, GNU as syntax): compiler output for the C basic_loop, built with -funroll-loops.
// In:    w0 = count (unsigned 32-bit)
// Out:   x0 = sum of values[i & 7] for i in [0, count)
// Roles: w3 = count, x0 = running sum, w2 = i, w1 = count & 7 (iterations peeled ahead of the 8-way loop)
// NOTE(review): x18 is used as scratch below; it is platform-reserved under some OS ABIs - confirm the target.
basic_loop: | |
mov w3, w0 | |
mov x0, 0 | |
mov w2, 0 | |
// w1 = count % 8; when it is zero nothing needs peeling, so go straight to the loop test.
ands w1, w3, 7 | |
beq .L6 | |
// Bounds check ahead of the first peeled iteration (w2 is still 0 here).
cmp w2, w3 | |
bcs .L35 | |
// Peeled iteration 1: sum += values[i & 7]; i++.
adrp x4, .LANCHOR0 | |
and x5, x2, 7 | |
add x6, x4, :lo12:.LANCHOR0 | |
ldrsw x7, [x6, x5, lsl 2] | |
add x0, x0, x7 | |
add w2, w2, 1 | |
// Dispatch on the remainder: jump into the fall-through chain below so that
// exactly w1 iterations are peeled in total (one has already run).
cmp w1, 1 | |
beq .L6 | |
cmp w1, 2 | |
beq .L29 | |
cmp w1, 3 | |
beq .L30 | |
cmp w1, 4 | |
beq .L31 | |
cmp w1, 5 | |
beq .L32 | |
cmp w1, 6 | |
beq .L33 | |
// Remainder 7: no branch taken, so every block below executes.
adrp x8, .LANCHOR0 | |
and x9, x2, 7 | |
add x10, x8, :lo12:.LANCHOR0 | |
ldrsw x11, [x10, x9, lsl 2] | |
add x0, x0, x11 | |
add w2, w2, 1 | |
// Entry for remainder 6: five peeled iterations remain from here.
.L33: | |
adrp x12, .LANCHOR0 | |
and x13, x2, 7 | |
add x14, x12, :lo12:.LANCHOR0 | |
ldrsw x15, [x14, x13, lsl 2] | |
add x0, x0, x15 | |
add w2, w2, 1 | |
// Entry for remainder 5. x1 is reused as a temporary: the remainder in w1 is dead after the dispatch.
.L32: | |
adrp x16, .LANCHOR0 | |
and x17, x2, 7 | |
add x18, x16, :lo12:.LANCHOR0 | |
ldrsw x1, [x18, x17, lsl 2] | |
add x0, x0, x1 | |
add w2, w2, 1 | |
// Entry for remainder 4.
.L31: | |
adrp x5, .LANCHOR0 | |
and x4, x2, 7 | |
add x6, x5, :lo12:.LANCHOR0 | |
ldrsw x7, [x6, x4, lsl 2] | |
add x0, x0, x7 | |
add w2, w2, 1 | |
// Entry for remainder 3.
.L30: | |
adrp x8, .LANCHOR0 | |
and x9, x2, 7 | |
add x10, x8, :lo12:.LANCHOR0 | |
ldrsw x11, [x10, x9, lsl 2] | |
add x0, x0, x11 | |
add w2, w2, 1 | |
// Entry for remainder 2: last peeled iteration, then on to the loop test.
.L29: | |
adrp x12, .LANCHOR0 | |
and x13, x2, 7 | |
add x14, x12, :lo12:.LANCHOR0 | |
ldrsw x15, [x14, x13, lsl 2] | |
add x0, x0, x15 | |
add w2, w2, 1 | |
b .L6 | |
// Main unrolled body: eight source-level iterations per pass.
// The running sum is threaded through temporaries and only returns to x0 at the end;
// w6 = i + 1 is the base from which the later indices are derived.
.L7: | |
// Iteration at index i.
and x16, x2, 7 | |
adrp x17, .LANCHOR0 | |
add x18, x17, :lo12:.LANCHOR0 | |
ldrsw x1, [x18, x16, lsl 2] | |
add x5, x0, x1 | |
add w6, w2, 1 | |
// Iteration at index i + 1. x0 is free as address scratch: the sum currently lives in x5.
adrp x0, .LANCHOR0 | |
and x4, x6, 7 | |
add x7, x0, :lo12:.LANCHOR0 | |
ldrsw x8, [x7, x4, lsl 2] | |
add x9, x5, x8 | |
add w10, w6, 1 | |
// Iteration at index i + 2.
adrp x11, .LANCHOR0 | |
and x12, x10, 7 | |
add x13, x11, :lo12:.LANCHOR0 | |
ldrsw x14, [x13, x12, lsl 2] | |
add x15, x9, x14 | |
add w2, w6, 2 | |
// Iteration at index i + 3 (w2 was just set to w6 + 2).
adrp x16, .LANCHOR0 | |
and x17, x2, 7 | |
add x18, x16, :lo12:.LANCHOR0 | |
ldrsw x1, [x18, x17, lsl 2] | |
add x5, x15, x1 | |
add w4, w6, 3 | |
// Iteration at index i + 4.
adrp x0, .LANCHOR0 | |
and x7, x4, 7 | |
add x8, x0, :lo12:.LANCHOR0 | |
ldrsw x9, [x8, x7, lsl 2] | |
add x10, x5, x9 | |
add w11, w6, 4 | |
// Iteration at index i + 5. x2 temporarily carries the sum; the counter is rebuilt from w6 below.
adrp x12, .LANCHOR0 | |
and x13, x11, 7 | |
add x14, x12, :lo12:.LANCHOR0 | |
ldrsw x15, [x14, x13, lsl 2] | |
add x2, x10, x15 | |
add w16, w6, 5 | |
// Iteration at index i + 6.
adrp x17, .LANCHOR0 | |
and x18, x16, 7 | |
add x1, x17, :lo12:.LANCHOR0 | |
ldrsw x5, [x1, x18, lsl 2] | |
add x7, x2, x5 | |
add w4, w6, 6 | |
// Iteration at index i + 7: the sum lands back in x0 and i advances by 8 (w6 + 7).
adrp x0, .LANCHOR0 | |
and x8, x4, 7 | |
add x9, x0, :lo12:.LANCHOR0 | |
ldrsw x10, [x9, x8, lsl 2] | |
add x0, x7, x10 | |
add w2, w6, 7 | |
// Loop test: run another pass while i < count (unsigned compare).
.L6: | |
cmp w2, w3 | |
bcc .L7 | |
.L35: | |
ret | |
// .LANCHOR0 is set to the current location, i.e. the start of the values table that follows.
.set .LANCHOR0,. + 0 | |
// Eight table entries; the code reads them as sign-extended 32-bit words (ldrsw, index scaled by 4).
values: | |
.word 5 | |
.word 6 | |
.word 2 | |
.word 9 | |
.word 8 | |
.word 2 | |
.word 9 | |
.word 7 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
long duffs_device_loop_with_code_migration(unsigned int count) { | |
long sum = 0; | |
unsigned int i = 0; | |
unsigned int case0_index = (count + 0) % 8; | |
unsigned int case7_index = (count + 1) % 8; | |
unsigned int case6_index = (count + 2) % 8; | |
unsigned int case5_index = (count + 3) % 8; | |
unsigned int case4_index = (count + 4) % 8; | |
unsigned int case3_index = (count + 5) % 8; | |
unsigned int case2_index = (count + 6) % 8; | |
unsigned int case1_index = (count + 7) % 8; | |
unsigned int number_of_iterations = (count / 8) + ((count % 8 == 0) ? 0 : 1); | |
switch (count % 8) { | |
case 0: | |
while (i < number_of_iterations) { | |
sum += values[case0_index]; | |
case 7: sum += values[case7_index]; | |
case 6: sum += values[case6_index]; | |
case 5: sum += values[case5_index]; | |
case 4: sum += values[case4_index]; | |
case 3: sum += values[case3_index]; | |
case 2: sum += values[case2_index]; | |
case 1: sum += values[case1_index]; | |
i++; | |
} | |
} | |
return sum; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Lookup table summed cyclically by the loop below. */
static int values[] = {5, 6, 2, 9, 8, 2, 9, 7};

/*
 * Hand-unrolled summing loop built as a Duff's device.
 *
 * The switch jumps into the middle of the eight-way unrolled while-body so
 * that the first, partial pass executes count % 8 steps. After that pass the
 * number of remaining steps is a multiple of 8, so each later pass runs all
 * eight steps and `i` reaches `count` exactly at a loop test.
 *
 * count: number of table entries to accumulate; 0 yields 0.
 * Returns the sum of values[i % 8] for i in [0, count).
 */
long duffs_device_loop(unsigned int count) {
    long sum = 0;
    unsigned int i = 0;
    switch (count % 8) {
        case 0:
            while (i < count) {
                        sum += values[i % 8]; i++;
                case 7: sum += values[i % 8]; i++;
                case 6: sum += values[i % 8]; i++;
                case 5: sum += values[i % 8]; i++;
                case 4: sum += values[i % 8]; i++;
                case 3: sum += values[i % 8]; i++;
                case 2: sum += values[i % 8]; i++;
                case 1: sum += values[i % 8]; i++;
            }
    }
    return sum;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# basic_loop (x86-64, AT&T syntax): compiler output for the C basic_loop, built with -funroll-loops.
# In:    edi = count (unsigned 32-bit)
# Out:   rax = sum of values[i & 7] for i in [0, count)
# Roles: esi = count, rcx = running sum, edx = i, eax = count & 7 (iterations peeled ahead of the 8-way loop)
# NOTE(review): values is addressed absolutely (values(,%reg,4)), not RIP-relative - confirm this is not meant for a PIE build.
basic_loop: | |
movl %edi, %esi | |
movl $0, %ecx | |
movl $0, %edx | |
# eax = count % 8; when it is zero nothing needs peeling, so go straight to the loop test.
movl %edi, %eax | |
andl $7, %eax | |
je .L6 | |
# Bounds check ahead of the first peeled iteration (edx is still 0 here).
cmpl %edi, %edx | |
jnb .L35 | |
# Peeled iteration 1: sum += values[i & 7]; i++. edi is reused as scratch; count stays in esi.
movl %edx, %edi | |
andl $7, %edi | |
movslq values(,%rdi,4), %r8 | |
addq %r8, %rcx | |
addl $1, %edx | |
# Dispatch on the remainder: jump into the fall-through chain below so that
# exactly eax iterations are peeled in total (one has already run).
cmpl $1, %eax | |
je .L6 | |
cmpl $2, %eax | |
je .L29 | |
cmpl $3, %eax | |
je .L30 | |
cmpl $4, %eax | |
je .L31 | |
cmpl $5, %eax | |
je .L32 | |
cmpl $6, %eax | |
je .L33 | |
# Remainder 7: no branch taken, so every block below executes.
movl %edx, %r9d | |
andl $7, %r9d | |
movslq values(,%r9,4), %r10 | |
addq %r10, %rcx | |
addl $1, %edx | |
# Entry for remainder 6. rax is reused as a temporary: the remainder in eax is dead after the dispatch.
.L33: | |
movl %edx, %r11d | |
andl $7, %r11d | |
movslq values(,%r11,4), %rax | |
addq %rax, %rcx | |
addl $1, %edx | |
# Entry for remainder 5.
.L32: | |
movl %edx, %edi | |
andl $7, %edi | |
movslq values(,%rdi,4), %r8 | |
addq %r8, %rcx | |
addl $1, %edx | |
# Entry for remainder 4.
.L31: | |
movl %edx, %r9d | |
andl $7, %r9d | |
movslq values(,%r9,4), %r10 | |
addq %r10, %rcx | |
addl $1, %edx | |
# Entry for remainder 3.
.L30: | |
movl %edx, %r11d | |
andl $7, %r11d | |
movslq values(,%r11,4), %rax | |
addq %rax, %rcx | |
addl $1, %edx | |
# Entry for remainder 2: last peeled iteration, then on to the loop test.
.L29: | |
movl %edx, %edi | |
andl $7, %edi | |
movslq values(,%rdi,4), %r8 | |
addq %r8, %rcx | |
addl $1, %edx | |
jmp .L6 | |
# Main unrolled body: eight source-level iterations per pass.
# The running sum is carried in r10 for the pass and moved back into rcx at the end.
.L7: | |
# Iteration at index i; the sum moves from rcx into r10.
movl %edx, %r9d | |
andl $7, %r9d | |
movslq values(,%r9,4), %r10 | |
addq %rcx, %r10 | |
# edx becomes i + 1 and is the base for the lea offsets below.
addl $1, %edx | |
# Iteration at index i + 1. rcx is free as scratch while the sum lives in r10.
movl %edx, %ecx | |
andl $7, %ecx | |
movslq values(,%rcx,4), %r11 | |
addq %r11, %r10 | |
# Iterations at indices i + 2 through i + 6: lea forms each index without modifying edx.
leal 1(%rdx), %eax | |
andl $7, %eax | |
movslq values(,%rax,4), %rdi | |
addq %rdi, %r10 | |
leal 2(%rdx), %r8d | |
andl $7, %r8d | |
movslq values(,%r8,4), %r9 | |
addq %r9, %r10 | |
leal 3(%rdx), %ecx | |
andl $7, %ecx | |
movslq values(,%rcx,4), %r11 | |
addq %r11, %r10 | |
leal 4(%rdx), %eax | |
andl $7, %eax | |
movslq values(,%rax,4), %rdi | |
addq %rdi, %r10 | |
leal 5(%rdx), %r8d | |
andl $7, %r8d | |
movslq values(,%r8,4), %r9 | |
addq %r9, %r10 | |
# Iteration at index i + 7: the load lands in rcx, then the carried sum is added so rcx holds the total again.
leal 6(%rdx), %ecx | |
andl $7, %ecx | |
movslq values(,%rcx,4), %rcx | |
addq %r10, %rcx | |
# Together with the earlier +1 this advances i by 8 for the pass.
addl $7, %edx | |
# Loop test: run another pass while i < count (unsigned compare).
.L6: | |
cmpl %esi, %edx | |
jb .L7 | |
# Return the accumulated sum in rax.
.L35: | |
movq %rcx, %rax | |
ret | |
# Eight table entries; the code reads them as sign-extended 32-bit values (movslq, index scaled by 4).
values: | |
.long 5 | |
.long 6 | |
.long 2 | |
.long 9 | |
.long 8 | |
.long 2 | |
.long 9 | |
.long 7 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment