Skip to content

Instantly share code, notes, and snippets.

@DocBohn
Last active May 9, 2025 20:55
Show Gist options
  • Save DocBohn/7ff96640af34509f040b51ecf9fad94c to your computer and use it in GitHub Desktop.
Save DocBohn/7ff96640af34509f040b51ecf9fad94c to your computer and use it in GitHub Desktop.
Code demonstrating hand-crafted and compiler-generated loop unrolling
/* Eight-entry table that the summing loops cycle through. */
static int values[] = {5, 6, 2, 9, 8, 2, 9, 7};

/*
 * Reference (not unrolled) version: adds up `count` table entries,
 * wrapping back to the first entry after every eighth one.
 * Returns the accumulated total; 0 when count is 0.
 */
long basic_loop(unsigned int count) {
    long total = 0;
    unsigned int pos = 0;

    while (pos < count) {
        /* pos is unsigned, so masking with 7 is the same as pos % 8. */
        total += values[pos & 7u];
        pos++;
    }
    return total;
}
// -funroll-loops
// long basic_loop(unsigned int count) -- AArch64 listing with the loop unrolled 8x.
// In:  w0 = count          Out: x0 = sum
// Register roles: w3 = count, x0 = running sum, w2 = i,
//                 w1 = count & 7 (only needed until the dispatch below is done).
// Layout: (count & 7) single iterations are executed first, then .L7 handles
// the rest eight iterations per pass.
basic_loop:
mov w3, w0 // keep count in w3 so x0 can hold the sum
mov x0, 0 // sum = 0
mov w2, 0 // i = 0
ands w1, w3, 7 // w1 = count & 7; Z is set when count is a multiple of 8
beq .L6 // no leftover iterations: go straight to the unrolled loop test
cmp w2, w3
bcs .L35 // return if i >= count
// First leftover iteration: sum += values[i & 7]; i++
adrp x4, .LANCHOR0
and x5, x2, 7
add x6, x4, :lo12:.LANCHOR0
ldrsw x7, [x6, x5, lsl 2] // sign-extending 32-bit load, index scaled by 4
add x0, x0, x7
add w2, w2, 1
// Dispatch on the remainder: one iteration is already done, so a remainder
// of r enters the fall-through chain below with r - 1 copies still to run.
cmp w1, 1
beq .L6
cmp w1, 2
beq .L29
cmp w1, 3
beq .L30
cmp w1, 4
beq .L31
cmp w1, 5
beq .L32
cmp w1, 6
beq .L33
// Remainder 7 falls through here and runs all six copies below.
adrp x8, .LANCHOR0
and x9, x2, 7
add x10, x8, :lo12:.LANCHOR0
ldrsw x11, [x10, x9, lsl 2]
add x0, x0, x11
add w2, w2, 1
.L33: // entry for remainder 6
adrp x12, .LANCHOR0
and x13, x2, 7
add x14, x12, :lo12:.LANCHOR0
ldrsw x15, [x14, x13, lsl 2]
add x0, x0, x15
add w2, w2, 1
.L32: // entry for remainder 5
adrp x16, .LANCHOR0
and x17, x2, 7
add x18, x16, :lo12:.LANCHOR0
ldrsw x1, [x18, x17, lsl 2] // overwrites w1; the remainder is not read again
add x0, x0, x1
add w2, w2, 1
.L31: // entry for remainder 4
adrp x5, .LANCHOR0
and x4, x2, 7
add x6, x5, :lo12:.LANCHOR0
ldrsw x7, [x6, x4, lsl 2]
add x0, x0, x7
add w2, w2, 1
.L30: // entry for remainder 3
adrp x8, .LANCHOR0
and x9, x2, 7
add x10, x8, :lo12:.LANCHOR0
ldrsw x11, [x10, x9, lsl 2]
add x0, x0, x11
add w2, w2, 1
.L29: // entry for remainder 2
adrp x12, .LANCHOR0
and x13, x2, 7
add x14, x12, :lo12:.LANCHOR0
ldrsw x15, [x14, x13, lsl 2]
add x0, x0, x15
add w2, w2, 1
b .L6
// Unrolled body: eight iterations per pass. The partial sum is passed from
// register to register (x5, x9, x15, x5, x10, x2, x7) and lands back in x0
// at the end; w6 holds i + 1 and the later indices are formed from it.
.L7:
and x16, x2, 7
adrp x17, .LANCHOR0
add x18, x17, :lo12:.LANCHOR0
ldrsw x1, [x18, x16, lsl 2] // values[i & 7]
add x5, x0, x1 // partial sum now lives in x5
add w6, w2, 1 // w6 = i + 1
adrp x0, .LANCHOR0 // x0 is free to use as scratch until the final add
and x4, x6, 7
add x7, x0, :lo12:.LANCHOR0
ldrsw x8, [x7, x4, lsl 2] // values[(i + 1) & 7]
add x9, x5, x8
add w10, w6, 1 // i + 2
adrp x11, .LANCHOR0
and x12, x10, 7
add x13, x11, :lo12:.LANCHOR0
ldrsw x14, [x13, x12, lsl 2] // values[(i + 2) & 7]
add x15, x9, x14
add w2, w6, 2 // i + 3
adrp x16, .LANCHOR0
and x17, x2, 7
add x18, x16, :lo12:.LANCHOR0
ldrsw x1, [x18, x17, lsl 2] // values[(i + 3) & 7]
add x5, x15, x1
add w4, w6, 3 // i + 4
adrp x0, .LANCHOR0
and x7, x4, 7
add x8, x0, :lo12:.LANCHOR0
ldrsw x9, [x8, x7, lsl 2] // values[(i + 4) & 7]
add x10, x5, x9
add w11, w6, 4 // i + 5
adrp x12, .LANCHOR0
and x13, x11, 7
add x14, x12, :lo12:.LANCHOR0
ldrsw x15, [x14, x13, lsl 2] // values[(i + 5) & 7]
add x2, x10, x15 // x2 briefly carries the partial sum; i is rebuilt from w6 below
add w16, w6, 5 // i + 6
adrp x17, .LANCHOR0
and x18, x16, 7
add x1, x17, :lo12:.LANCHOR0
ldrsw x5, [x1, x18, lsl 2] // values[(i + 6) & 7]
add x7, x2, x5
add w4, w6, 6 // i + 7
adrp x0, .LANCHOR0
and x8, x4, 7
add x9, x0, :lo12:.LANCHOR0
ldrsw x10, [x9, x8, lsl 2] // values[(i + 7) & 7]
add x0, x7, x10 // sum is back in x0
add w2, w6, 7 // i advanced by 8 in total
.L6:
cmp w2, w3
bcc .L7 // another pass of eight while i < count (unsigned)
.L35:
ret // result is in x0
// .LANCHOR0 is set to the current location, i.e. the start of `values`;
// the code above forms the table address through this anchor symbol.
.set .LANCHOR0,. + 0
// Eight 32-bit entries, matching the C initializer {5, 6, 2, 9, 8, 2, 9, 7}.
values:
.word 5
.word 6
.word 2
.word 9
.word 8
.word 2
.word 9
.word 7
/*
 * Duff's-device sum with the index arithmetic moved out of the loop.
 *
 * The table index used at each case label is computed once, before the
 * switch, and reused unchanged on every pass. The loop counter therefore
 * counts passes through the unrolled body rather than individual elements.
 * Returns the accumulated total; 0 when count is 0.
 */
long duffs_device_loop_with_code_migration(unsigned int count) {
    const unsigned int remainder = count & 7u;

    /* count / 8 rounded up: a partial first pass still counts as a pass. */
    const unsigned int passes = (count >> 3) + (remainder != 0u ? 1u : 0u);

    /* atK is the table index read at "case K"; each equals (count + n) % 8. */
    const unsigned int at0 = remainder;
    const unsigned int at7 = (remainder + 1u) & 7u;
    const unsigned int at6 = (remainder + 2u) & 7u;
    const unsigned int at5 = (remainder + 3u) & 7u;
    const unsigned int at4 = (remainder + 4u) & 7u;
    const unsigned int at3 = (remainder + 5u) & 7u;
    const unsigned int at2 = (remainder + 6u) & 7u;
    const unsigned int at1 = (remainder + 7u) & 7u;

    long total = 0;
    unsigned int pass = 0;

    /* Enter the unrolled body part-way through so the first pass handles
       only the `remainder` leftover elements. */
    switch (remainder) {
    case 0:
        for (; pass < passes; pass++) {
            total += values[at0];
    case 7: total += values[at7];
    case 6: total += values[at6];
    case 5: total += values[at5];
    case 4: total += values[at4];
    case 3: total += values[at3];
    case 2: total += values[at2];
    case 1: total += values[at1];
        }
    }
    return total;
}
/* Eight-entry table that the summing loop cycles through. */
static int values[] = {5, 6, 2, 9, 8, 2, 9, 7};

/*
 * Hand-unrolled (8x) sum of `count` table entries using Duff's device:
 * the switch jumps into the middle of the loop body so the first pass
 * executes only count % 8 of the eight copies, and every later pass runs
 * all eight. Returns the accumulated total; 0 when count is 0.
 */
long duffs_device_loop(unsigned int count) {
    long total = 0;
    unsigned int pos = 0;

    switch (count & 7u) {
    case 0:
        /* The loop test is evaluated only after a full or partial pass,
           except when entered here through case 0. */
        for (; pos < count; ) {
            total += values[pos++ & 7u];
    case 7: total += values[pos++ & 7u];
    case 6: total += values[pos++ & 7u];
    case 5: total += values[pos++ & 7u];
    case 4: total += values[pos++ & 7u];
    case 3: total += values[pos++ & 7u];
    case 2: total += values[pos++ & 7u];
    case 1: total += values[pos++ & 7u];
        }
    }
    return total;
}
# -funroll-loops
# long basic_loop(unsigned int count) -- x86-64 listing (AT&T syntax), loop unrolled 8x.
# In:  %edi = count          Out: %rax = sum
# Register roles: %esi = count, %rcx = running sum, %edx = i,
#                 %eax = count & 7 (only needed until the dispatch below is done).
# Layout: (count & 7) single iterations are executed first, then .L7 handles
# the rest eight iterations per pass.
basic_loop:
movl %edi, %esi # keep count in %esi; %edi is reused as scratch below
movl $0, %ecx # sum = 0
movl $0, %edx # i = 0
movl %edi, %eax
andl $7, %eax # %eax = count & 7; ZF is set when count is a multiple of 8
je .L6 # no leftover iterations: go straight to the unrolled loop test
cmpl %edi, %edx
jnb .L35 # return if i >= count (unsigned)
# First leftover iteration: sum += values[i & 7]; i++
movl %edx, %edi
andl $7, %edi
movslq values(,%rdi,4), %r8 # sign-extending 32-bit load, index scaled by 4
addq %r8, %rcx
addl $1, %edx
# Dispatch on the remainder: one iteration is already done, so a remainder
# of r enters the fall-through chain below with r - 1 copies still to run.
cmpl $1, %eax
je .L6
cmpl $2, %eax
je .L29
cmpl $3, %eax
je .L30
cmpl $4, %eax
je .L31
cmpl $5, %eax
je .L32
cmpl $6, %eax
je .L33
# Remainder 7 falls through here and runs all six copies below.
movl %edx, %r9d
andl $7, %r9d
movslq values(,%r9,4), %r10
addq %r10, %rcx
addl $1, %edx
.L33: # entry for remainder 6
movl %edx, %r11d
andl $7, %r11d
movslq values(,%r11,4), %rax # overwrites %eax; the remainder is not read again
addq %rax, %rcx
addl $1, %edx
.L32: # entry for remainder 5
movl %edx, %edi
andl $7, %edi
movslq values(,%rdi,4), %r8
addq %r8, %rcx
addl $1, %edx
.L31: # entry for remainder 4
movl %edx, %r9d
andl $7, %r9d
movslq values(,%r9,4), %r10
addq %r10, %rcx
addl $1, %edx
.L30: # entry for remainder 3
movl %edx, %r11d
andl $7, %r11d
movslq values(,%r11,4), %rax
addq %rax, %rcx
addl $1, %edx
.L29: # entry for remainder 2
movl %edx, %edi
andl $7, %edi
movslq values(,%rdi,4), %r8
addq %r8, %rcx
addl $1, %edx
jmp .L6
# Unrolled body: eight iterations per pass. The partial sum is carried in
# %r10 and moved back into %rcx by the last add; %edx is bumped to i + 1
# once and the remaining indices are formed from it with leal.
.L7:
movl %edx, %r9d
andl $7, %r9d
movslq values(,%r9,4), %r10 # values[i & 7]
addq %rcx, %r10 # partial sum now lives in %r10; %rcx becomes scratch
addl $1, %edx # %edx = i + 1
movl %edx, %ecx
andl $7, %ecx
movslq values(,%rcx,4), %r11 # values[(i + 1) & 7]
addq %r11, %r10
leal 1(%rdx), %eax # i + 2
andl $7, %eax
movslq values(,%rax,4), %rdi # values[(i + 2) & 7]
addq %rdi, %r10
leal 2(%rdx), %r8d # i + 3
andl $7, %r8d
movslq values(,%r8,4), %r9 # values[(i + 3) & 7]
addq %r9, %r10
leal 3(%rdx), %ecx # i + 4
andl $7, %ecx
movslq values(,%rcx,4), %r11 # values[(i + 4) & 7]
addq %r11, %r10
leal 4(%rdx), %eax # i + 5
andl $7, %eax
movslq values(,%rax,4), %rdi # values[(i + 5) & 7]
addq %rdi, %r10
leal 5(%rdx), %r8d # i + 6
andl $7, %r8d
movslq values(,%r8,4), %r9 # values[(i + 6) & 7]
addq %r9, %r10
leal 6(%rdx), %ecx # i + 7
andl $7, %ecx
movslq values(,%rcx,4), %rcx # values[(i + 7) & 7]
addq %r10, %rcx # sum is back in %rcx
addl $7, %edx # i advanced by 8 in total
.L6:
cmpl %esi, %edx
jb .L7 # another pass of eight while i < count (unsigned)
.L35:
movq %rcx, %rax # return the sum
ret
# Eight 4-byte entries, matching the C initializer {5, 6, 2, 9, 8, 2, 9, 7};
# the code above reads them with a scale-4 index and movslq.
values:
.long 5
.long 6
.long 2
.long 9
.long 8
.long 2
.long 9
.long 7
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment