Skip to content

Instantly share code, notes, and snippets.

@DocBohn
Created May 6, 2025 03:07
Show Gist options
  • Save DocBohn/f6ad567c7642443db7ef5e5c1bff4c98 to your computer and use it in GitHub Desktop.
Save DocBohn/f6ad567c7642443db7ef5e5c1bff4c98 to your computer and use it in GitHub Desktop.
Code demonstrating that some "branchless" transformations cannot be made by the compiler
// The ternary operator version and the bit manipulation version are *not* equivalent to the "if" version
// since they write to memory regardless of array[i]'s value, whereas the "if" version only writes when array[i] is negative
// Replaces every negative element of array[0..length) with 0.
// The store is guarded: elements that are already non-negative are only read, never written.
void clamp_if(int *array, int length) {
    int idx = 0;
    while (idx < length) {
        int *slot = &array[idx];
        idx++;
        if (*slot >= 0) {
            continue;   // non-negative: leave memory untouched
        }
        *slot = 0;
    }
}
// Using a conditional move instruction instead of branching for the ternary operator
// is easy for the compiler since the value for each condition is already available
// Replaces every negative element of array[0..length) with 0.
// Every element is written back, whether or not its value changed.
void clamp_ternary(int *array, int length) {
    int idx = 0;
    while (idx < length) {
        const int value = array[idx];
        array[idx] = (value >= 0) ? value : 0;
        idx++;
    }
}
// Replaces every negative element of array[0..length) with 0 using bit manipulation only.
// Every element is written back, whether or not its value changed.
void clamp_unconditional(int *array, int length) {
    int idx = 0;
    while (idx < length) {
        const int value = array[idx];
        // All ones when value is negative, zero otherwise
        // (relies on an arithmetic right shift of a 32-bit int).
        const int sign_mask = value >> 31;
        array[idx] = value & ~sign_mask;
        idx++;
    }
}
// -O1 -ftree-vectorize
// clamp_if -- AArch64 output for: void clamp_if(int *array, int length)
// In:      x0 = array, w1 = length
// Scratch: x2 = cursor, x1 = end pointer, w0 = current element
// The store is reached only through a branch, so non-negative elements are never written.
clamp_if:
// length <= 0: nothing to do
cmp w1, 0
ble .L1
// x2 = &array[0]; x1 = array + (sign-extended length) * 4, i.e. one past the last element
mov x2, x0
add x1, x0, w1, sxtw 2
b .L4
.L6:
// negative element: overwrite it with zero
str wzr, [x2]
.L3:
// step to the next element; return once the cursor reaches the end pointer
add x2, x2, 4
cmp x2, x1
beq .L1
.L4:
// load the element; bit 31 clear means non-negative, so skip the store
ldr w0, [x2]
tbz w0, #31, .L3
b .L6
.L1:
ret
// clamp_ternary -- AArch64 output for: void clamp_ternary(int *array, int length)
// In:      x0 = array, w1 = length
// Scratch: x2, x3, v30, v31
// Shape:   a 4-lane vector loop followed by a scalar loop for the leftover elements.
// Every element is stored back unconditionally; there is no data-dependent branch.
clamp_ternary:
// length <= 0: nothing to do
cmp w1, 0
ble .L7
// (length - 1) <= 2 unsigned, i.e. fewer than 4 elements: go straight to the scalar loop
sub w2, w1, #1
cmp w2, 2
bls .L14
// x2 = cursor; x3 = array + (length / 4) * 16 = end of the vectorized portion
mov x2, x0
lsr w3, w1, 2
add x3, x0, x3, lsl 4
// v30 = four zero lanes
movi v30.4s, 0
.L10:
// load 4 ints, take the signed per-lane max with 0, store and post-increment by 16 bytes
ldr q31, [x2]
smax v31.4s, v31.4s, v30.4s
str q31, [x2], 16
cmp x2, x3
bne .L10
// w2 = length rounded down to a multiple of 4; if equal to length there is no remainder
and w2, w1, -4
cmp w1, w2
beq .L7
.L9:
// x2 = index of the first element the scalar loop must handle
sxtw x2, w2
.L13:
// scalar step: w3 = w3 & ~(w3 >> 31), which clears the value when it is negative
ldr w3, [x0, x2, lsl 2]
bic w3, w3, w3, asr #31
str w3, [x0, x2, lsl 2]
// next index; loop while length > index
add x2, x2, 1
cmp w1, w2
bgt .L13
.L7:
ret
.L14:
// short-array entry: start the scalar loop at index 0
mov w2, 0
b .L9
// clamp_unconditional -- AArch64 output for: void clamp_unconditional(int *array, int length)
// In:      x0 = array, w1 = length
// Scratch: x2, x3, w4, v30, v31
// Shape:   a 4-lane vector loop followed by a scalar loop for the leftover elements.
// Every element is stored back unconditionally; there is no data-dependent branch.
clamp_unconditional:
// length <= 0: nothing to do
cmp w1, 0
ble .L17
// (length - 1) <= 2 unsigned, i.e. fewer than 4 elements: go straight to the scalar loop
sub w2, w1, #1
cmp w2, 2
bls .L24
// x2 = cursor; x3 = array + (length / 4) * 16 = end of the vectorized portion
mov x2, x0
lsr w3, w1, 2
add x3, x0, x3, lsl 4
.L20:
// load 4 ints; v30 = per-lane mask that is all ones where the lane is < 0
ldr q31, [x2]
cmlt v30.4s, v31.4s, #0
// v31 = v31 & ~v30: negative lanes become zero, the rest are unchanged
bic v31.16b, v31.16b, v30.16b
// store and post-increment the cursor by 16 bytes
str q31, [x2], 16
cmp x2, x3
bne .L20
// w2 = length rounded down to a multiple of 4; if equal to length there is no remainder
and w2, w1, -4
cmp w1, w2
beq .L17
.L19:
// x2 = index of the first element the scalar loop must handle
sxtw x2, w2
.L23:
// scalar step: w4 = sign mask (value >> 31, arithmetic), w3 = value & ~mask
ldr w3, [x0, x2, lsl 2]
asr w4, w3, 31
bic w3, w3, w4
str w3, [x0, x2, lsl 2]
// next index; loop while length > index
add x2, x2, 1
cmp w1, w2
bgt .L23
.L17:
ret
.L24:
// short-array entry: start the scalar loop at index 0
mov w2, 0
b .L19
# -O1 -ftree-vectorize
# clamp_if -- x86-64 (AT&T syntax) output for: void clamp_if(int *array, int length)
# In:      %rdi = array, %esi = length
# Scratch: %rax = cursor, %rdx = end pointer, %rsi, flags
# The store is reached only through a branch, so non-negative elements are never written.
clamp_if:
# length <= 0: nothing to do
testl %esi, %esi
jle .L1
# %rax = &array[0]; %rdx = array + (sign-extended length) * 4, i.e. one past the last element
movq %rdi, %rax
movslq %esi, %rsi
leaq (%rdi,%rsi,4), %rdx
jmp .L4
.L6:
# negative element: overwrite it with zero
movl $0, (%rax)
.L3:
# step to the next element; return once the cursor reaches the end pointer
addq $4, %rax
cmpq %rdx, %rax
je .L1
.L4:
# compare the element with 0; sign flag clear means non-negative, so skip the store
cmpl $0, (%rax)
jns .L3
jmp .L6
.L1:
ret
# clamp_ternary -- x86-64 (AT&T syntax) output for: void clamp_ternary(int *array, int length)
# In:      %rdi = array, %esi = length
# Scratch: %rax, %rcx, %rdx, %xmm0-%xmm2, flags
# Shape:   a 4-lane SSE loop followed by a scalar cmov loop for the leftover elements.
# Every element is stored back unconditionally; there is no data-dependent branch.
clamp_ternary:
# length <= 0: nothing to do
testl %esi, %esi
jle .L7
# (length - 1) <= 2 unsigned, i.e. fewer than 4 elements: go straight to the scalar loop
leal -1(%rsi), %eax
cmpl $2, %eax
jbe .L14
# %rax = cursor; %rdx = array + (length / 4) * 16 = end of the vectorized portion
movq %rdi, %rax
movl %esi, %edx
shrl $2, %edx
movl %edx, %edx
salq $4, %rdx
addq %rdi, %rdx
# %xmm2 = four zero lanes
pxor %xmm2, %xmm2
.L10:
# load 4 ints (unaligned) and keep a copy to build the mask from
movdqu (%rax), %xmm0
movdqa %xmm0, %xmm1
# %xmm1 = per-lane mask that is all ones where the element is > 0
pcmpgtd %xmm2, %xmm1
# keep positive lanes, zero everything else (a zero lane stays zero)
pand %xmm1, %xmm0
movups %xmm0, (%rax)
addq $16, %rax
cmpq %rdx, %rax
jne .L10
# %eax = length rounded down to a multiple of 4; low two bits of length clear means no remainder
movl %esi, %eax
andl $-4, %eax
testb $3, %sil
je .L17
.L9:
# sign-extend the starting index into %rax
cltq
.L13:
# scalar step: load the element, replace it with 0 if the sign flag is set, store it back
movl (%rdi,%rax,4), %edx
testl %edx, %edx
movl $0, %ecx
cmovs %ecx, %edx
movl %edx, (%rdi,%rax,4)
# next index; loop while length > index
addq $1, %rax
cmpl %eax, %esi
jg .L13
.L7:
ret
.L17:
ret
.L14:
# short-array entry: start the scalar loop at index 0
movl $0, %eax
jmp .L9
# clamp_unconditional -- x86-64 (AT&T syntax) output for: void clamp_unconditional(int *array, int length)
# In:      %rdi = array, %esi = length
# Scratch: %rax, %rcx, %rdx, %rsi, %xmm0, %xmm1, flags
# Shape:   a 4-lane SSE loop, then a 2-element (64-bit) step and/or a single scalar step
#          for the 1 to 3 leftover elements.
# Every element is stored back unconditionally; there is no data-dependent branch.
clamp_unconditional:
# keep the array base in %rcx for the tail code
movq %rdi, %rcx
# length <= 0: nothing to do
testl %esi, %esi
jle .L18
# (length - 1) <= 2 unsigned, i.e. fewer than 4 elements: skip the 4-lane loop
leal -1(%rsi), %eax
cmpl $2, %eax
jbe .L26
# %rax = cursor; %rdx = array + (length / 4) * 16 = end of the vectorized portion
movq %rdi, %rax
movl %esi, %edx
shrl $2, %edx
movl %edx, %edx
salq $4, %rdx
addq %rdi, %rdx
.L21:
# load 4 ints (unaligned); %xmm0 = per-lane sign mask (arithmetic shift right by 31)
movdqu (%rax), %xmm1
movdqa %xmm1, %xmm0
psrad $31, %xmm0
# %xmm0 = ~mask & value: negative lanes become zero, the rest are unchanged
pandn %xmm1, %xmm0
movups %xmm0, (%rax)
addq $16, %rax
cmpq %rax, %rdx
jne .L21
# %eax = %edx = length rounded down to a multiple of 4; if equal to length there is no remainder
movl %esi, %eax
andl $-4, %eax
movl %eax, %edx
cmpl %eax, %esi
je .L28
.L20:
# %esi = number of leftover elements; exactly one left goes to the scalar step
subl %edx, %esi
cmpl $1, %esi
je .L24
# %rdx = address of the first leftover element
movl %edx, %edx
leaq (%rcx,%rdx,4), %rdx
# process two ints at once through the low 64 bits of the vector registers
movq (%rdx), %xmm1
movdqa %xmm1, %xmm0
psrad $31, %xmm0
pandn %xmm1, %xmm0
movq %xmm0, (%rdx)
# an even leftover count means everything has been handled
testb $1, %sil
je .L18
# otherwise advance the index past the pair just processed
andl $-2, %esi
addl %esi, %eax
.L24:
# single scalar step: %rdx = &array[index]
cltq
leaq (%rcx,%rax,4), %rdx
# %eax = ~(value >> 31) & value
movl (%rdx), %ecx
movl %ecx, %eax
sarl $31, %eax
notl %eax
andl %ecx, %eax
movl %eax, (%rdx)
.L18:
ret
.L28:
ret
.L26:
# short-array entry: leftover handling starts at index 0
movl $0, %edx
movl $0, %eax
jmp .L20
// clamp_if -- AArch64 output for: void clamp_if(int *array, int length)
// In:      x0 = array, w1 = length
// Scratch: x2 = cursor, x1 = end pointer, w0 = current element
// The store is reached only through a branch, so non-negative elements are never written.
clamp_if:
// length <= 0: nothing to do
cmp w1, 0
ble .L1
// x2 = &array[0]; x1 = array + (sign-extended length) * 4, i.e. one past the last element
mov x2, x0
add x1, x0, w1, sxtw 2
b .L4
.L6:
// negative element: overwrite it with zero
str wzr, [x2]
.L3:
// step to the next element; return once the cursor reaches the end pointer
add x2, x2, 4
cmp x2, x1
beq .L1
.L4:
// load the element; bit 31 clear means non-negative, so skip the store
ldr w0, [x2]
tbz w0, #31, .L3
b .L6
.L1:
ret
// clamp_ternary -- scalar AArch64 output for: void clamp_ternary(int *array, int length)
// In:      x0 = array, w1 = length
// Scratch: x2 = cursor, x1 = end pointer, w0 = current element
// Every element is stored back unconditionally; the only branches are the loop controls.
clamp_ternary:
// length <= 0: nothing to do
cmp w1, 0
ble .L7
// x2 = &array[0]; x1 = array + (sign-extended length) * 4, i.e. one past the last element
mov x2, x0
add x1, x0, w1, sxtw 2
.L9:
// w0 = w0 & ~(w0 >> 31), which clears the value when it is negative
ldr w0, [x2]
bic w0, w0, w0, asr #31
// store the result and post-increment the cursor by 4 bytes
str w0, [x2], 4
cmp x2, x1
bne .L9
.L7:
ret
// clamp_unconditional -- scalar AArch64 output for: void clamp_unconditional(int *array, int length)
// In:      x0 = array, w1 = length
// Scratch: x2 = cursor, x3 = end pointer, w0 = current element, w1 = sign mask (reused after setup)
// Every element is stored back unconditionally; the only branches are the loop controls.
clamp_unconditional:
// length <= 0: nothing to do
cmp w1, 0
ble .L11
// x2 = &array[0]; x3 = array + (sign-extended length) * 4, i.e. one past the last element
mov x2, x0
add x3, x0, w1, sxtw 2
.L13:
// w1 = sign mask (value >> 31, arithmetic); w0 = value & ~mask
ldr w0, [x2]
asr w1, w0, 31
bic w0, w0, w1
// store the result and post-increment the cursor by 4 bytes
str w0, [x2], 4
cmp x2, x3
bne .L13
.L11:
ret
# clamp_if -- x86-64 (AT&T syntax) output for: void clamp_if(int *array, int length)
# In:      %rdi = array, %esi = length
# Scratch: %eax = index i, %rdx = &array[i], flags
# The store is reached only through a branch, so non-negative elements are never written.
clamp_if:
# i = 0, then jump to the loop test
movl $0, %eax
jmp .L2
.L6:
# negative element: overwrite it with zero
movl $0, (%rdx)
.L3:
# i++
addl $1, %eax
.L2:
# i >= length (signed): done
cmpl %esi, %eax
jge .L5
# %rdx = &array[i]
movslq %eax, %rdx
leaq (%rdi,%rdx,4), %rdx
# compare the element with 0; sign flag clear means non-negative, so skip the store
cmpl $0, (%rdx)
jns .L3
jmp .L6
.L5:
ret
# clamp_ternary -- scalar x86-64 (AT&T syntax) output for: void clamp_ternary(int *array, int length)
# In:      %rdi = array, %esi = length
# Scratch: %edx = index i, %rcx = &array[i], %eax = element, %r8d = 0, flags
# Every element is stored back unconditionally; the only branches are the loop controls.
clamp_ternary:
# i = 0, then jump to the loop test
movl $0, %edx
jmp .L8
.L9:
# %rcx = &array[i]; %eax = array[i]
movslq %edx, %rax
leaq (%rdi,%rax,4), %rcx
movl (%rcx), %eax
# if the element is negative (sign flag set), replace it with 0 via a conditional move
testl %eax, %eax
movl $0, %r8d
cmovs %r8d, %eax
movl %eax, (%rcx)
# i++
addl $1, %edx
.L8:
# loop while i < length (signed)
cmpl %esi, %edx
jl .L9
ret
# clamp_unconditional -- scalar x86-64 (AT&T syntax) output for: void clamp_unconditional(int *array, int length)
# In:      %rdi = array, %esi = length
# Scratch: %edx = index i, %r8 = &array[i], %eax = element, %ecx = mask, flags
# Every element is stored back unconditionally; the only branches are the loop controls.
clamp_unconditional:
# i = 0, then jump to the loop test
movl $0, %edx
jmp .L11
.L12:
# %r8 = &array[i]; %eax = array[i]
movslq %edx, %rax
leaq (%rdi,%rax,4), %r8
movl (%r8), %eax
# %ecx = ~(value >> 31): zero when the value is negative, all ones otherwise
movl %eax, %ecx
sarl $31, %ecx
notl %ecx
# value & mask, stored back to array[i]
andl %ecx, %eax
movl %eax, (%r8)
# i++
addl $1, %edx
.L11:
# loop while i < length (signed)
cmpl %esi, %edx
jl .L12
ret
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment