Created
May 6, 2025 03:07
-
-
Save DocBohn/f6ad567c7642443db7ef5e5c1bff4c98 to your computer and use it in GitHub Desktop.
Code demonstrating that some "branchless" transformations cannot be made by the compiler
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// The ternary operator version and the bit manipulation version are *not* equivalent to the "if" version,
// since they always store to array[i] regardless of its value, whereas the "if" version stores only when array[i] is negative
// Clamp every negative element of `array` to zero, in place.
//   array  - pointer to the first of `length` ints
//   length - element count; zero or a negative value leaves the array untouched
// An element is stored to only when it is negative; non-negative elements
// are read but never written.
void clamp_if(int *array, int length) {
    int *cursor = array;
    int remaining = length;
    while (remaining > 0) {
        if (*cursor < 0) {
            *cursor = 0;    // the only store, reached only for negative values
        }
        ++cursor;
        --remaining;
    }
}
// Using a conditional move instruction instead of branching for the ternary operator | |
// is easy for the compiler since the value for each condition is already available | |
// Clamp every negative element of `array` to zero, in place.
//   array  - pointer to the first of `length` ints
//   length - element count; zero or a negative value leaves the array untouched
// Every element is written back on every iteration, whether or not its
// value changed.
void clamp_ternary(int *array, int length) {
    int index = 0;
    while (index < length) {
        const int value = array[index];
        array[index] = (value < 0) ? 0 : value;    // unconditional store
        index++;
    }
}
// Clamp every negative element of `array` to zero, in place, using only
// bit operations (no comparison, no conditional).
//   array  - pointer to the first of `length` ints
//   length - element count; zero or a negative value leaves the array untouched
// Every element is written back on every iteration.
// NOTE: the shift count 31 assumes a 32-bit int, and the mask relies on `>>`
// of a negative int being an arithmetic (sign-propagating) shift, which the
// C standard leaves implementation-defined.
void clamp_unconditional(int *array, int length) {
    for (int index = 0; index < length; ++index) {
        const int value = array[index];
        const int sign_mask = value >> 31;    // all ones if negative, else zero
        array[index] = value & ~sign_mask;    // negative -> 0, otherwise unchanged
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Compiled with: -O1 -ftree-vectorize
//-----------------------------------------------------------------------
// void clamp_if(int *array, int length)                        (AArch64)
// In:    x0 = array, w1 = length
// Uses:  x2 = pointer to the current element
//        x1 = one-past-the-end pointer (overwrites the length argument)
//        w0 = element just loaded (overwrites the array argument)
// The loop is scalar, and the store is reached only for negative elements.
//-----------------------------------------------------------------------
clamp_if:
        cmp     w1, 0
        ble     .Lif_done               // length <= 0: nothing to do
        mov     x2, x0                  // x2 = &array[0]
        add     x1, x0, w1, sxtw 2      // x1 = array + length * 4
        b       .Lif_load
.Lif_zero:
        str     wzr, [x2]               // *x2 = 0
.Lif_advance:
        add     x2, x2, 4               // next element
        cmp     x2, x1
        beq     .Lif_done               // reached the end pointer
.Lif_load:
        ldr     w0, [x2]
        tbz     w0, #31, .Lif_advance   // sign bit clear: skip the store
        b       .Lif_zero
.Lif_done:
        ret
//-----------------------------------------------------------------------
// void clamp_ternary(int *array, int length)                   (AArch64)
// In:    x0 = array, w1 = length
// Shape: a vector loop handles four ints per iteration, then a scalar loop
//        handles the remaining length % 4 elements. Lengths 1..3 skip the
//        vector loop entirely.
// Uses:  x2 = vector cursor, later the scalar index
//        x3 = end of the vector part, later the scalar element
//        v30 = four zero lanes, v31 = four loaded elements
// Every element is stored back unconditionally.
//-----------------------------------------------------------------------
clamp_ternary:
        cmp     w1, 0
        ble     .Ltern_done             // length <= 0: nothing to do
        sub     w2, w1, #1
        cmp     w2, 2
        bls     .Ltern_short            // length <= 3: scalar loop only
        mov     x2, x0                  // x2 = &array[0]
        lsr     w3, w1, 2               // w3 = length / 4
        add     x3, x0, x3, lsl 4       // x3 = array + (length / 4) * 16
        movi    v30.4s, 0               // v30 = {0, 0, 0, 0}
.Ltern_vec_loop:
        ldr     q31, [x2]               // load four ints
        smax    v31.4s, v31.4s, v30.4s  // per lane: max(element, 0)
        str     q31, [x2], 16           // store back, then x2 += 16
        cmp     x2, x3
        bne     .Ltern_vec_loop
        and     w2, w1, -4              // w2 = length rounded down to a multiple of 4
        cmp     w1, w2
        beq     .Ltern_done             // no leftover elements
.Ltern_tail:
        sxtw    x2, w2                  // x2 = first index the scalar loop handles
.Ltern_tail_loop:
        ldr     w3, [x0, x2, lsl 2]     // w3 = array[x2]
        bic     w3, w3, w3, asr #31     // w3 &= ~(w3 >> 31): negative -> 0
        str     w3, [x0, x2, lsl 2]     // unconditional store
        add     x2, x2, 1
        cmp     w1, w2
        bgt     .Ltern_tail_loop        // continue while length > index
.Ltern_done:
        ret
.Ltern_short:
        mov     w2, 0                   // scalar loop starts at index 0
        b       .Ltern_tail
//-----------------------------------------------------------------------
// void clamp_unconditional(int *array, int length)             (AArch64)
// In:    x0 = array, w1 = length
// Shape: a vector loop handles four ints per iteration, then a scalar loop
//        handles the remaining length % 4 elements. Lengths 1..3 skip the
//        vector loop entirely.
// Uses:  x2 = vector cursor, later the scalar index
//        x3 = end of the vector part, later the scalar element
//        w4 = scalar sign mask
//        v31 = four loaded elements, v30 = per-lane "is negative" mask
// Every element is stored back unconditionally.
//-----------------------------------------------------------------------
clamp_unconditional:
        cmp     w1, 0
        ble     .Lunc_done              // length <= 0: nothing to do
        sub     w2, w1, #1
        cmp     w2, 2
        bls     .Lunc_short             // length <= 3: scalar loop only
        mov     x2, x0                  // x2 = &array[0]
        lsr     w3, w1, 2               // w3 = length / 4
        add     x3, x0, x3, lsl 4       // x3 = array + (length / 4) * 16
.Lunc_vec_loop:
        ldr     q31, [x2]               // load four ints
        cmlt    v30.4s, v31.4s, #0      // lane mask: all ones where element < 0
        bic     v31.16b, v31.16b, v30.16b // clear the negative lanes
        str     q31, [x2], 16           // store back, then x2 += 16
        cmp     x2, x3
        bne     .Lunc_vec_loop
        and     w2, w1, -4              // w2 = length rounded down to a multiple of 4
        cmp     w1, w2
        beq     .Lunc_done              // no leftover elements
.Lunc_tail:
        sxtw    x2, w2                  // x2 = first index the scalar loop handles
.Lunc_tail_loop:
        ldr     w3, [x0, x2, lsl 2]     // w3 = array[x2]
        asr     w4, w3, 31              // w4 = all ones if negative, else 0
        bic     w3, w3, w4              // w3 &= ~w4
        str     w3, [x0, x2, lsl 2]     // unconditional store
        add     x2, x2, 1
        cmp     w1, w2
        bgt     .Lunc_tail_loop         // continue while length > index
.Lunc_done:
        ret
.Lunc_short:
        mov     w2, 0                   // scalar loop starts at index 0
        b       .Lunc_tail
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compiled with: -O1 -ftree-vectorize
#-----------------------------------------------------------------------
# void clamp_if(int *array, int length)               (x86-64, AT&T syntax)
# In:    rdi = array, esi = length
# Uses:  rax = pointer to the current element
#        rdx = one-past-the-end pointer
#        rsi = length sign-extended to 64 bits
# The loop is scalar, and the store is reached only for negative elements.
#-----------------------------------------------------------------------
clamp_if:
        testl   %esi, %esi
        jle     .Lif_done               # length <= 0: nothing to do
        movq    %rdi, %rax              # rax = &array[0]
        movslq  %esi, %rsi              # rsi = (long)length
        leaq    (%rdi,%rsi,4), %rdx     # rdx = array + length * 4
        jmp     .Lif_test
.Lif_zero:
        movl    $0, (%rax)              # *rax = 0
.Lif_advance:
        addq    $4, %rax                # next element
        cmpq    %rdx, %rax
        je      .Lif_done               # reached the end pointer
.Lif_test:
        cmpl    $0, (%rax)
        jns     .Lif_advance            # element >= 0: skip the store
        jmp     .Lif_zero
.Lif_done:
        ret
#-----------------------------------------------------------------------
# void clamp_ternary(int *array, int length)          (x86-64, AT&T syntax)
# In:    rdi = array, esi = length
# Shape: a vector loop handles four ints per iteration, then a scalar loop
#        handles the remaining length % 4 elements. Lengths 1..3 skip the
#        vector loop entirely.
# Uses:  rax = vector cursor, later the scalar index
#        rdx = end of the vector part, later the scalar element
#        ecx = zero source for cmovs
#        xmm2 = four zero lanes, xmm0 = loaded elements, xmm1 = "> 0" mask
# Every element is stored back unconditionally.
#-----------------------------------------------------------------------
clamp_ternary:
        testl   %esi, %esi
        jle     .Ltern_done             # length <= 0: nothing to do
        leal    -1(%rsi), %eax
        cmpl    $2, %eax
        jbe     .Ltern_short            # length <= 3: scalar loop only
        movq    %rdi, %rax              # rax = &array[0]
        movl    %esi, %edx
        shrl    $2, %edx                # edx = length / 4
        movl    %edx, %edx              # zero-extend edx into rdx
        salq    $4, %rdx                # rdx = (length / 4) * 16 bytes
        addq    %rdi, %rdx              # rdx = end of the vector part
        pxor    %xmm2, %xmm2            # xmm2 = {0, 0, 0, 0}
.Ltern_vec_loop:
        movdqu  (%rax), %xmm0           # load four ints
        movdqa  %xmm0, %xmm1
        pcmpgtd %xmm2, %xmm1            # lane mask: all ones where element > 0
        pand    %xmm1, %xmm0            # keep positive lanes, zero the rest
        movups  %xmm0, (%rax)           # unconditional store
        addq    $16, %rax
        cmpq    %rdx, %rax
        jne     .Ltern_vec_loop
        movl    %esi, %eax
        andl    $-4, %eax               # eax = length rounded down to a multiple of 4
        testb   $3, %sil
        je      .Ltern_done_vec         # length % 4 == 0: no leftover elements
.Ltern_tail:
        cltq                            # rax = (long)eax, the first scalar index
.Ltern_tail_loop:
        movl    (%rdi,%rax,4), %edx     # edx = array[rax]
        testl   %edx, %edx
        movl    $0, %ecx                # mov leaves the flags from testl intact
        cmovs   %ecx, %edx              # negative -> 0
        movl    %edx, (%rdi,%rax,4)     # unconditional store
        addq    $1, %rax
        cmpl    %eax, %esi
        jg      .Ltern_tail_loop        # continue while length > index
.Ltern_done:
        ret
.Ltern_done_vec:
        ret
.Ltern_short:
        movl    $0, %eax                # scalar loop starts at index 0
        jmp     .Ltern_tail
#-----------------------------------------------------------------------
# void clamp_unconditional(int *array, int length)    (x86-64, AT&T syntax)
# In:    rdi = array, esi = length
# Shape: a vector loop handles four ints per iteration. The 1..3 leftover
#        elements are handled without a loop: one 64-bit step covers two
#        ints if at least two remain, then one scalar step covers a final
#        odd element. Lengths 1..3 skip the vector loop entirely.
# Uses:  rcx = saved array base (reused as the element in the last step)
#        rax = vector cursor, later the index of the last scalar element
#        rdx = end of the vector part, later the address being updated
#        esi = remaining element count once the tail is entered
#        xmm1 = loaded elements, xmm0 = sign mask / result
# Every element is stored back unconditionally.
#-----------------------------------------------------------------------
clamp_unconditional:
        movq    %rdi, %rcx              # rcx = array base
        testl   %esi, %esi
        jle     .Lunc_done              # length <= 0: nothing to do
        leal    -1(%rsi), %eax
        cmpl    $2, %eax
        jbe     .Lunc_short             # length <= 3: tail only
        movq    %rdi, %rax              # rax = &array[0]
        movl    %esi, %edx
        shrl    $2, %edx                # edx = length / 4
        movl    %edx, %edx              # zero-extend edx into rdx
        salq    $4, %rdx                # rdx = (length / 4) * 16 bytes
        addq    %rdi, %rdx              # rdx = end of the vector part
.Lunc_vec_loop:
        movdqu  (%rax), %xmm1           # load four ints
        movdqa  %xmm1, %xmm0
        psrad   $31, %xmm0              # per lane: all ones if negative, else 0
        pandn   %xmm1, %xmm0            # xmm0 = ~mask & elements
        movups  %xmm0, (%rax)           # unconditional store
        addq    $16, %rax
        cmpq    %rax, %rdx
        jne     .Lunc_vec_loop
        movl    %esi, %eax
        andl    $-4, %eax               # eax = length rounded down to a multiple of 4
        movl    %eax, %edx              # edx = same index
        cmpl    %eax, %esi
        je      .Lunc_done_vec          # no leftover elements
.Lunc_tail:
        subl    %edx, %esi              # esi = elements still to process
        cmpl    $1, %esi
        je      .Lunc_last              # exactly one left: scalar step only
        movl    %edx, %edx              # zero-extend edx into rdx
        leaq    (%rcx,%rdx,4), %rdx     # rdx = &array[edx]
        movq    (%rdx), %xmm1           # load two ints (64 bits)
        movdqa  %xmm1, %xmm0
        psrad   $31, %xmm0              # per lane: all ones if negative, else 0
        pandn   %xmm1, %xmm0            # xmm0 = ~mask & elements
        movq    %xmm0, (%rdx)           # store the two ints back
        testb   $1, %sil
        je      .Lunc_done              # remaining count was even: finished
        andl    $-2, %esi               # esi = count covered by the 64-bit step
        addl    %esi, %eax              # eax = index of the final element
.Lunc_last:
        cltq                            # rax = (long)eax
        leaq    (%rcx,%rax,4), %rdx     # rdx = &array[rax]
        movl    (%rdx), %ecx            # ecx = element
        movl    %ecx, %eax
        sarl    $31, %eax               # eax = all ones if negative, else 0
        notl    %eax
        andl    %ecx, %eax              # eax = element & ~mask
        movl    %eax, (%rdx)            # unconditional store
.Lunc_done:
        ret
.Lunc_done_vec:
        ret
.Lunc_short:
        movl    $0, %edx                # tail starts at index 0
        movl    $0, %eax
        jmp     .Lunc_tail
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//-----------------------------------------------------------------------
// void clamp_if(int *array, int length)                        (AArch64)
// In:    x0 = array, w1 = length
// Uses:  x2 = pointer to the current element
//        x1 = one-past-the-end pointer (overwrites the length argument)
//        w0 = element just loaded (overwrites the array argument)
// The store is reached only for negative elements.
//-----------------------------------------------------------------------
clamp_if:
        cmp     w1, 0
        ble     .Lif_done               // length <= 0: nothing to do
        mov     x2, x0                  // x2 = &array[0]
        add     x1, x0, w1, sxtw 2      // x1 = array + length * 4
        b       .Lif_load
.Lif_zero:
        str     wzr, [x2]               // *x2 = 0
.Lif_advance:
        add     x2, x2, 4               // next element
        cmp     x2, x1
        beq     .Lif_done               // reached the end pointer
.Lif_load:
        ldr     w0, [x2]
        tbz     w0, #31, .Lif_advance   // sign bit clear: skip the store
        b       .Lif_zero
.Lif_done:
        ret
//-----------------------------------------------------------------------
// void clamp_ternary(int *array, int length)                   (AArch64)
// In:    x0 = array, w1 = length
// Uses:  x2 = pointer to the current element
//        x1 = one-past-the-end pointer (overwrites the length argument)
//        w0 = element being clamped (overwrites the array argument)
// The loop body has no branch, and every element is stored back.
//-----------------------------------------------------------------------
clamp_ternary:
        cmp     w1, 0
        ble     .Ltern_done             // length <= 0: nothing to do
        mov     x2, x0                  // x2 = &array[0]
        add     x1, x0, w1, sxtw 2      // x1 = array + length * 4
.Ltern_loop:
        ldr     w0, [x2]
        bic     w0, w0, w0, asr #31     // w0 &= ~(w0 >> 31): negative -> 0
        str     w0, [x2], 4             // unconditional store, then x2 += 4
        cmp     x2, x1
        bne     .Ltern_loop
.Ltern_done:
        ret
//-----------------------------------------------------------------------
// void clamp_unconditional(int *array, int length)             (AArch64)
// In:    x0 = array, w1 = length
// Uses:  x2 = pointer to the current element
//        x3 = one-past-the-end pointer
//        w0 = element being clamped (overwrites the array argument)
//        w1 = sign mask (overwrites the length argument)
// The loop body has no branch, and every element is stored back.
//-----------------------------------------------------------------------
clamp_unconditional:
        cmp     w1, 0
        ble     .Lunc_done              // length <= 0: nothing to do
        mov     x2, x0                  // x2 = &array[0]
        add     x3, x0, w1, sxtw 2      // x3 = array + length * 4
.Lunc_loop:
        ldr     w0, [x2]
        asr     w1, w0, 31              // w1 = all ones if negative, else 0
        bic     w0, w0, w1              // w0 &= ~w1
        str     w0, [x2], 4             // unconditional store, then x2 += 4
        cmp     x2, x3
        bne     .Lunc_loop
.Lunc_done:
        ret
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-----------------------------------------------------------------------
# void clamp_if(int *array, int length)               (x86-64, AT&T syntax)
# In:    rdi = array, esi = length
# Uses:  eax = loop index i
#        rdx = &array[i]
# The store is reached only for negative elements.
#-----------------------------------------------------------------------
clamp_if:
        movl    $0, %eax                # i = 0
        jmp     .Lif_check
.Lif_zero:
        movl    $0, (%rdx)              # array[i] = 0
.Lif_advance:
        addl    $1, %eax                # i++
.Lif_check:
        cmpl    %esi, %eax
        jge     .Lif_done               # i >= length: finished
        movslq  %eax, %rdx
        leaq    (%rdi,%rdx,4), %rdx     # rdx = &array[i]
        cmpl    $0, (%rdx)
        jns     .Lif_advance            # element >= 0: skip the store
        jmp     .Lif_zero
.Lif_done:
        ret
#-----------------------------------------------------------------------
# void clamp_ternary(int *array, int length)          (x86-64, AT&T syntax)
# In:    rdi = array, esi = length
# Uses:  edx = loop index i
#        rcx = &array[i]
#        eax = element being clamped
#        r8d = zero source for cmovs
# The loop body has no branch, and every element is stored back.
#-----------------------------------------------------------------------
clamp_ternary:
        movl    $0, %edx                # i = 0
        jmp     .Ltern_check
.Ltern_body:
        movslq  %edx, %rax
        leaq    (%rdi,%rax,4), %rcx     # rcx = &array[i]
        movl    (%rcx), %eax            # eax = array[i]
        testl   %eax, %eax
        movl    $0, %r8d                # mov leaves the flags from testl intact
        cmovs   %r8d, %eax              # negative -> 0
        movl    %eax, (%rcx)            # unconditional store
        addl    $1, %edx                # i++
.Ltern_check:
        cmpl    %esi, %edx
        jl      .Ltern_body             # continue while i < length
        ret
#-----------------------------------------------------------------------
# void clamp_unconditional(int *array, int length)    (x86-64, AT&T syntax)
# In:    rdi = array, esi = length
# Uses:  edx = loop index i
#        r8  = &array[i]
#        eax = element being clamped
#        ecx = inverted sign mask
# The loop body has no branch, and every element is stored back.
#-----------------------------------------------------------------------
clamp_unconditional:
        movl    $0, %edx                # i = 0
        jmp     .Lunc_check
.Lunc_body:
        movslq  %edx, %rax
        leaq    (%rdi,%rax,4), %r8      # r8 = &array[i]
        movl    (%r8), %eax             # eax = array[i]
        movl    %eax, %ecx
        sarl    $31, %ecx               # ecx = all ones if negative, else 0
        notl    %ecx                    # ecx = 0 if negative, else all ones
        andl    %ecx, %eax              # negative -> 0, otherwise unchanged
        movl    %eax, (%r8)             # unconditional store
        addl    $1, %edx                # i++
.Lunc_check:
        cmpl    %esi, %edx
        jl      .Lunc_body              # continue while i < length
        ret
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment