Skip to content

Instantly share code, notes, and snippets.

@DocBohn
Created May 5, 2025 21:47
Show Gist options
  • Save DocBohn/9bfb3c3e7305aa80c0793e6c44ca38c3 to your computer and use it in GitHub Desktop.
Save DocBohn/9bfb3c3e7305aa80c0793e6c44ca38c3 to your computer and use it in GitHub Desktop.
Code demonstrating that the compiler can eliminate branches to make vectorization possible
// Hoisting the load of levels[i] out of each conditional path (or out of each arm of the conditional expression) was necessary for vectorization.
// Duplicate loads are the impediment to vectorization, not the if/else statements.
// Applies feedback to every element using an if/else statement:
// levels[i] is raised by `feedback` when deviations[i] is negative and
// lowered by `feedback` otherwise. The element is read once, before the
// branch, so both paths share a single load. Does nothing when length <= 0.
void loop_feedback_if(int *restrict levels, int *restrict deviations, int length, int feedback) {
    for (int index = 0; index < length; ++index) {
        // One load feeding both branches.
        const int current = levels[index];
        if (deviations[index] >= 0) {
            levels[index] = current - feedback;
        } else {
            levels[index] = current + feedback;
        }
    }
}
// Applies feedback to every element using a conditional expression:
// levels[i] becomes levels[i] + feedback when deviations[i] is negative,
// levels[i] - feedback otherwise. The element is read once, ahead of the
// conditional expression. Does nothing when length <= 0.
void loop_feedback_ternary(int *restrict levels, int *restrict deviations, int length, int feedback) {
    for (int index = 0; index < length; ++index) {
        // One load feeding both arms of the conditional expression.
        const int current = levels[index];
        const int below_target = deviations[index] < 0;
        levels[index] = below_target ? current + feedback : current - feedback;
    }
}
// Branch-free variant: derives a +1 / -1 multiplier from the sign of
// deviations[i] and adds multiplier * feedback to levels[i].
// Does nothing when length <= 0.
void loop_feedback_unconditional(int *restrict levels, int *restrict deviations, int length, int feedback) {
    for (int index = 0; index < length; ++index) {
        // Shift the sign bit across the whole word: -1 for a negative deviation,
        // 0 otherwise. This relies on >> of a negative int being an arithmetic
        // shift, which C leaves implementation-defined.
        int sign_mask = deviations[index] >> (8 * sizeof(int) - 1);
        // -1 - 2*(-1) = +1 for negative deviations; -1 - 2*0 = -1 otherwise.
        int direction = -1 - 2 * sign_mask;
        levels[index] += direction * feedback;
    }
}
// AArch64 assembly, compiled with -O1 -ftree-vectorize
// loop_feedback_if (AArch64): code generated for the if/else version of the loop.
// Register roles as used below (argument order follows the C prototype):
//   x0 = levels, x1 = deviations, w2 = length, w3 = feedback
//   v29 = feedback copied into four 32-bit lanes
//   x4  = byte offset in the vector loop, later the element index in the scalar loop
//   x5  = byte offset at which the vector loop stops
loop_feedback_if:
// length <= 0: return immediately.
cmp w2, 0
ble .L1
// Unsigned (length - 1) <= 2, i.e. length is 1..3: skip the 4-lane loop.
sub w4, w2, #1
cmp w4, 2
bls .L11
// Broadcast feedback to all four lanes.
dup v29.4s, w3
// x5 = (length / 4) * 16 = number of bytes handled by the vector loop.
lsr w5, w2, 2
lsl x5, x5, 4
// Start at byte offset 0.
mov x4, 0
// Vector loop: four elements per iteration, no branch on the data.
.L4:
// q30 = four levels, q31 = four deviations.
ldr q30, [x0, x4]
ldr q31, [x1, x4]
// Per-lane mask: all ones where deviation < 0, zero elsewhere.
cmlt v31.4s, v31.4s, #0
// Compute both outcomes: v28 = level + feedback, v30 = level - feedback.
add v28.4s, v30.4s, v29.4s
sub v30.4s, v30.4s, v29.4s
// Bitwise select: take v28 where the mask is set, v30 where it is clear.
bsl v31.16b, v28.16b, v30.16b
// Write the four results back to levels.
str q31, [x0, x4]
// Advance 16 bytes and loop until the stop offset is reached.
add x4, x4, 16
cmp x5, x4
bne .L4
// w4 = length rounded down to a multiple of 4 = elements already done.
// If that equals length there is no remainder.
and w4, w2, -4
cmp w2, w4
beq .L1
// Scalar tail entry: widen the starting index to 64 bits.
.L3:
sxtw x4, w4
// Scalar loop: one element per iteration, still branch-free on the data.
.L9:
// w5 = levels[i], w7 = deviations[i].
ldr w5, [x0, x4, lsl 2]
ldr w7, [x1, x4, lsl 2]
// w6 = level - feedback, w5 = level + feedback.
sub w6, w5, w3
add w5, w3, w5
// Keep the sum if deviation < 0, otherwise take the difference.
cmp w7, 0
csel w5, w5, w6, lt
str w5, [x0, x4, lsl 2]
// Next element; continue while length > index.
add x4, x4, 1
cmp w2, w4
bgt .L9
.L1:
ret
// Short-input path: start the scalar loop at index 0.
.L11:
mov w4, 0
b .L3
// loop_feedback_ternary (AArch64): code generated for the conditional-expression
// version of the loop. Apart from label names, the instruction sequence is the
// same as the one listed for loop_feedback_if.
// Register roles as used below (argument order follows the C prototype):
//   x0 = levels, x1 = deviations, w2 = length, w3 = feedback
//   v29 = feedback copied into four 32-bit lanes
//   x4  = byte offset in the vector loop, later the element index in the scalar loop
//   x5  = byte offset at which the vector loop stops
loop_feedback_ternary:
// length <= 0: return immediately.
cmp w2, 0
ble .L14
// Unsigned (length - 1) <= 2, i.e. length is 1..3: skip the 4-lane loop.
sub w4, w2, #1
cmp w4, 2
bls .L24
// Broadcast feedback to all four lanes.
dup v29.4s, w3
// x5 = (length / 4) * 16 = number of bytes handled by the vector loop.
lsr w5, w2, 2
lsl x5, x5, 4
// Start at byte offset 0.
mov x4, 0
// Vector loop: four elements per iteration.
.L17:
// q30 = four levels, q31 = four deviations.
ldr q30, [x0, x4]
ldr q31, [x1, x4]
// Per-lane mask: all ones where deviation < 0, zero elsewhere.
cmlt v31.4s, v31.4s, #0
// Compute both outcomes: v28 = level + feedback, v30 = level - feedback.
add v28.4s, v30.4s, v29.4s
sub v30.4s, v30.4s, v29.4s
// Bitwise select: take v28 where the mask is set, v30 where it is clear.
bsl v31.16b, v28.16b, v30.16b
// Write the four results back to levels.
str q31, [x0, x4]
// Advance 16 bytes and loop until the stop offset is reached.
add x4, x4, 16
cmp x5, x4
bne .L17
// w4 = length rounded down to a multiple of 4; done if nothing is left over.
and w4, w2, -4
cmp w2, w4
beq .L14
// Scalar tail entry: widen the starting index to 64 bits.
.L16:
sxtw x4, w4
// Scalar loop: one element per iteration, using a conditional select.
.L22:
// w5 = levels[i], w7 = deviations[i].
ldr w5, [x0, x4, lsl 2]
ldr w7, [x1, x4, lsl 2]
// w6 = level - feedback, w5 = level + feedback.
sub w6, w5, w3
add w5, w3, w5
// Keep the sum if deviation < 0, otherwise take the difference.
cmp w7, 0
csel w5, w5, w6, lt
str w5, [x0, x4, lsl 2]
// Next element; continue while length > index.
add x4, x4, 1
cmp w2, w4
bgt .L22
.L14:
ret
// Short-input path: start the scalar loop at index 0.
.L24:
mov w4, 0
b .L16
// loop_feedback_unconditional (AArch64): code generated for the branch-free
// version that multiplies feedback by +1 or -1.
// Register roles as used below (argument order follows the C prototype):
//   x0 = levels, x1 = deviations, w2 = length, w3 = feedback
//   v29 = feedback copied into four 32-bit lanes
//   x4  = byte offset in the vector loop (scratch in the scalar loop)
//   x5  = vector-loop stop offset, later the element index in the scalar loop
loop_feedback_unconditional:
// length <= 0: return immediately.
cmp w2, 0
ble .L27
// Unsigned (length - 1) <= 2, i.e. length is 1..3: skip the 4-lane loop.
sub w4, w2, #1
cmp w4, 2
bls .L34
// Broadcast feedback to all four lanes.
dup v29.4s, w3
// x5 = (length / 4) * 16 = number of bytes handled by the vector loop.
lsr w5, w2, 2
lsl x5, x5, 4
// Start at byte offset 0.
mov x4, 0
// Vector loop: four elements per iteration.
.L30:
// q31 = four deviations.
ldr q31, [x1, x4]
// Lanes become -1 where deviation < 0, 0 elsewhere.
cmlt v31.4s, v31.4s, #0
// Double (-2 or 0), then invert all bits: +1 for negative deviations, -1 otherwise.
add v31.4s, v31.4s, v31.4s
not v31.16b, v31.16b
// q30 = four levels; multiply-accumulate adds multiplier * feedback to them.
ldr q30, [x0, x4]
mla v30.4s, v31.4s, v29.4s
str q30, [x0, x4]
// Advance 16 bytes and loop until the stop offset is reached.
add x4, x4, 16
cmp x4, x5
bne .L30
// w5 = length rounded down to a multiple of 4; done if nothing is left over.
and w5, w2, -4
cmp w2, w5
beq .L27
// Scalar tail entry: widen the starting index to 64 bits.
.L29:
sxtw x5, w5
// Scalar loop: one element per iteration.
.L33:
// w4 = deviations[i]; arithmetic shift gives -1 if negative, 0 otherwise.
ldr w4, [x1, x5, lsl 2]
asr w4, w4, 31
// w4 = ~(w4 << 1): +1 for negative deviations, -1 otherwise.
mvn w4, w4, lsl 1
// w4 = multiplier * feedback + levels[i], stored back to levels[i].
ldr w6, [x0, x5, lsl 2]
madd w4, w4, w3, w6
str w4, [x0, x5, lsl 2]
// Next element; continue while length > index.
add x5, x5, 1
cmp w2, w5
bgt .L33
.L27:
ret
// Short-input path: start the scalar loop at index 0.
.L34:
mov w5, 0
b .L29
# x86-64 assembly (AT&T syntax), compiled with -O1 -ftree-vectorize
# loop_feedback_if (x86-64, AT&T syntax): code generated for the if/else version.
# Register roles as used below (argument order follows the C prototype):
#   rdi = levels, rsi = deviations, edx = length, ecx = feedback
#   xmm3 = feedback copied into four 32-bit lanes, xmm4 = all zeros
#   rax  = byte offset in the vector loop, later the element index in the scalar loop
#   r8   = vector-loop stop offset, later the value to store in the scalar loop
loop_feedback_if:
# length <= 0: return.
testl %edx, %edx
jle .L1
# Unsigned (length - 1) <= 2, i.e. length is 1..3: skip the 4-lane loop.
leal -1(%rdx), %eax
cmpl $2, %eax
jbe .L11
# Broadcast feedback to the four lanes of xmm3.
movd %ecx, %xmm6
pshufd $0, %xmm6, %xmm3
# r8 = (length / 4) * 16 = number of bytes handled by the vector loop.
# The movl of r8d onto itself only zero-extends to 64 bits.
movl %edx, %r8d
shrl $2, %r8d
movl %r8d, %r8d
salq $4, %r8
# Byte offset starts at 0; xmm4 = 0 for the sign comparison.
movl $0, %eax
pxor %xmm4, %xmm4
# Vector loop: four elements per iteration, no branch on the data.
.L4:
# xmm1 = four levels.
movdqu (%rdi,%rax), %xmm1
# xmm0 = mask of lanes where 0 > deviation (all ones if negative).
movdqa %xmm4, %xmm0
movdqu (%rsi,%rax), %xmm5
pcmpgtd %xmm5, %xmm0
# xmm2 = level + feedback, xmm1 = level - feedback.
movdqa %xmm1, %xmm2
paddd %xmm3, %xmm2
psubd %xmm3, %xmm1
# Blend by mask: (mask & sum) | (~mask & difference).
pand %xmm0, %xmm2
pandn %xmm1, %xmm0
por %xmm2, %xmm0
# Store the four results back to levels.
movups %xmm0, (%rdi,%rax)
# Advance 16 bytes and loop until the stop offset is reached.
addq $16, %rax
cmpq %rax, %r8
jne .L4
# eax = length rounded down to a multiple of 4; return if length has no remainder.
movl %edx, %eax
andl $-4, %eax
testb $3, %dl
je .L15
# Scalar tail entry: sign-extend the index and jump to the loop body.
.L3:
cltq
jmp .L9
.L15:
ret
# Short-input path: start the scalar loop at index 0.
.L11:
movl $0, %eax
jmp .L3
# Scalar loop bottom: store the chosen value, advance, exit when length <= index.
.L10:
movl %r8d, (%rdi,%rax,4)
addq $1, %rax
cmpl %eax, %edx
jle .L16
# Scalar loop top. Unlike the vector loop, this tail uses a conditional branch.
.L9:
# r9d = levels[i]; r8d = feedback + level (the value for a negative deviation).
movl (%rdi,%rax,4), %r9d
leal (%rcx,%r9), %r8d
# Negative deviation: keep the sum and go store it.
cmpl $0, (%rsi,%rax,4)
js .L10
# Otherwise replace it with level - feedback.
subl %ecx, %r9d
movl %r9d, %r8d
jmp .L10
.L16:
ret
.L1:
ret
# loop_feedback_ternary (x86-64, AT&T syntax): code generated for the
# conditional-expression version. Apart from label names, the instruction
# sequence is the same as the one listed for loop_feedback_if.
# Register roles as used below (argument order follows the C prototype):
#   rdi = levels, rsi = deviations, edx = length, ecx = feedback
#   xmm3 = feedback copied into four 32-bit lanes, xmm4 = all zeros
#   rax  = byte offset in the vector loop, later the element index in the scalar loop
#   r8   = vector-loop stop offset, later the value to store in the scalar loop
loop_feedback_ternary:
# length <= 0: return.
testl %edx, %edx
jle .L17
# Unsigned (length - 1) <= 2, i.e. length is 1..3: skip the 4-lane loop.
leal -1(%rdx), %eax
cmpl $2, %eax
jbe .L27
# Broadcast feedback to the four lanes of xmm3.
movd %ecx, %xmm6
pshufd $0, %xmm6, %xmm3
# r8 = (length / 4) * 16 = number of bytes handled by the vector loop.
# The movl of r8d onto itself only zero-extends to 64 bits.
movl %edx, %r8d
shrl $2, %r8d
movl %r8d, %r8d
salq $4, %r8
# Byte offset starts at 0; xmm4 = 0 for the sign comparison.
movl $0, %eax
pxor %xmm4, %xmm4
# Vector loop: four elements per iteration, no branch on the data.
.L20:
# xmm1 = four levels.
movdqu (%rdi,%rax), %xmm1
# xmm0 = mask of lanes where 0 > deviation (all ones if negative).
movdqa %xmm4, %xmm0
movdqu (%rsi,%rax), %xmm5
pcmpgtd %xmm5, %xmm0
# xmm2 = level + feedback, xmm1 = level - feedback.
movdqa %xmm1, %xmm2
paddd %xmm3, %xmm2
psubd %xmm3, %xmm1
# Blend by mask: (mask & sum) | (~mask & difference).
pand %xmm0, %xmm2
pandn %xmm1, %xmm0
por %xmm2, %xmm0
# Store the four results back to levels.
movups %xmm0, (%rdi,%rax)
# Advance 16 bytes and loop until the stop offset is reached.
addq $16, %rax
cmpq %rax, %r8
jne .L20
# eax = length rounded down to a multiple of 4; return if length has no remainder.
movl %edx, %eax
andl $-4, %eax
testb $3, %dl
je .L31
# Scalar tail entry: sign-extend the index and jump to the loop body.
.L19:
cltq
jmp .L25
.L31:
ret
# Short-input path: start the scalar loop at index 0.
.L27:
movl $0, %eax
jmp .L19
# Scalar loop bottom: store the chosen value, advance, exit when length <= index.
.L26:
movl %r8d, (%rdi,%rax,4)
addq $1, %rax
cmpl %eax, %edx
jle .L32
# Scalar loop top. Unlike the vector loop, this tail uses a conditional branch.
.L25:
# r9d = levels[i]; r8d = feedback + level (the value for a negative deviation).
movl (%rdi,%rax,4), %r9d
leal (%rcx,%r9), %r8d
# Negative deviation: keep the sum and go store it.
cmpl $0, (%rsi,%rax,4)
js .L26
# Otherwise replace it with level - feedback.
subl %ecx, %r9d
movl %r9d, %r8d
jmp .L26
.L32:
ret
.L17:
ret
# loop_feedback_unconditional (x86-64, AT&T syntax): code generated for the
# branch-free version that multiplies feedback by +1 or -1.
# Register roles as used below (argument order follows the C prototype):
#   rdi = levels, rsi = deviations, edx = length on entry, ecx = feedback
#   r8d = copy of length (edx/rdx is reused as stop offset and scalar index)
#   xmm2 = feedback in four 32-bit lanes; xmm3 = the same with each 64-bit half
#          shifted right by 32; xmm4 = all ones
#   rax  = byte offset in the vector loop, scratch in the scalar loop
loop_feedback_unconditional:
# Keep length in r8d because edx is overwritten below.
movl %edx, %r8d
# length <= 0: return.
testl %edx, %edx
jle .L33
# Unsigned (length - 1) <= 2, i.e. length is 1..3: skip the 4-lane loop.
leal -1(%rdx), %eax
cmpl $2, %eax
jbe .L40
# Broadcast feedback to the four lanes of xmm2.
movd %ecx, %xmm5
pshufd $0, %xmm5, %xmm2
# rdx = (length / 4) * 16 = number of bytes handled by the vector loop.
# The movl of edx onto itself only zero-extends to 64 bits.
shrl $2, %edx
movl %edx, %edx
salq $4, %rdx
# Byte offset starts at 0.
movl $0, %eax
# xmm4 = all ones, used below as the operand of a bitwise NOT.
pcmpeqd %xmm4, %xmm4
# xmm3 = feedback lanes 1 and 3 moved into the low dword of each 64-bit half,
# ready to be the second operand of pmuludq.
movdqa %xmm2, %xmm3
psrlq $32, %xmm3
# Vector loop: four elements per iteration.
.L36:
# xmm0 = four deviations -> -1 where negative, 0 elsewhere.
movdqu (%rsi,%rax), %xmm0
psrad $31, %xmm0
# Double (-2 or 0), then XOR with all ones (NOT): +1 for negative, -1 otherwise.
pslld $1, %xmm0
pxor %xmm4, %xmm0
# 4-lane 32-bit multiply assembled from two pmuludq (which multiply only the
# low dword of each 64-bit half): xmm1 = products of lanes 0 and 2 ...
movdqa %xmm0, %xmm1
pmuludq %xmm2, %xmm1
# ... and xmm0 = products of lanes 1 and 3.
psrlq $32, %xmm0
pmuludq %xmm3, %xmm0
# Gather the low 32 bits of each product into the two low lanes, then
# interleave so xmm1 holds the products in lane order 0,1,2,3.
pshufd $8, %xmm1, %xmm1
pshufd $8, %xmm0, %xmm0
punpckldq %xmm0, %xmm1
# levels += multiplier * feedback, four lanes at once.
movdqu (%rdi,%rax), %xmm0
paddd %xmm1, %xmm0
movups %xmm0, (%rdi,%rax)
# Advance 16 bytes and loop until the stop offset is reached.
addq $16, %rax
cmpq %rdx, %rax
jne .L36
# edx = length rounded down to a multiple of 4; return if length has no remainder.
movl %r8d, %edx
andl $-4, %edx
testb $3, %r8b
je .L43
# Scalar tail entry: sign-extend the starting index.
.L35:
movslq %edx, %rdx
# Scalar loop: one element per iteration, no branch on the data.
.L39:
# eax = deviations[i] -> -1 if negative, 0 otherwise.
movl (%rsi,%rdx,4), %eax
sarl $31, %eax
# Double and invert: +1 for negative deviations, -1 otherwise.
addl %eax, %eax
notl %eax
# levels[i] += multiplier * feedback.
imull %ecx, %eax
addl %eax, (%rdi,%rdx,4)
# Next element; continue while length > index.
addq $1, %rdx
cmpl %edx, %r8d
jg .L39
.L33:
ret
.L43:
ret
# Short-input path: start the scalar loop at index 0.
.L40:
movl $0, %edx
jmp .L35
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment