Created
May 5, 2025 21:47
-
-
Save DocBohn/9bfb3c3e7305aa80c0793e6c44ca38c3 to your computer and use it in GitHub Desktop.
Code demonstrating that the compiler can eliminate branches to make vectorization possible
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Hoisting the load of levels[i] out of each conditional path (or out of each half of the
// conditional expression) into a single local was necessary for vectorization.
// Duplicate loads, not if/else statements, are the impediment to vectorization.
// Adjusts every entry of `levels` by `feedback`, using an if/else on the sign of the
// matching entry of `deviations`:
//   deviations[i] <  0  ->  levels[i] += feedback
//   deviations[i] >= 0  ->  levels[i] -= feedback
//
// levels     - array of `length` ints, updated in place
// deviations - array of `length` ints, only read; must not alias `levels` (restrict)
// length     - number of elements to process; nothing is done when length <= 0
// feedback   - amount added or subtracted
void loop_feedback_if(int *restrict levels, int *restrict deviations, int length, int feedback) {
    for (int i = 0; i < length; i++) {
        // Load levels[i] once, before the branch, so both paths share a single load.
        int level = levels[i];
        if (deviations[i] < 0) {
            levels[i] = level + feedback;
        } else {
            levels[i] = level - feedback;
        }
    }
}
// Same computation as the if/else form, expressed with a conditional expression:
//   deviations[i] <  0  ->  levels[i] += feedback
//   deviations[i] >= 0  ->  levels[i] -= feedback
//
// levels     - array of `length` ints, updated in place
// deviations - array of `length` ints, only read; must not alias `levels` (restrict)
// length     - number of elements to process; nothing is done when length <= 0
// feedback   - amount added or subtracted
void loop_feedback_ternary(int *restrict levels, int *restrict deviations, int length, int feedback) {
    for (int i = 0; i < length; i++) {
        // Load levels[i] once so both halves of the conditional share a single load.
        int level = levels[i];
        levels[i] = deviations[i] < 0 ? level + feedback : level - feedback;
    }
}
// Branch-free form of the same update: derives a +1 / -1 multiplier from the sign bit of
// deviations[i] and adds multiplier * feedback to levels[i].
//   deviations[i] <  0  ->  levels[i] += feedback
//   deviations[i] >= 0  ->  levels[i] -= feedback
//
// levels     - array of `length` ints, updated in place
// deviations - array of `length` ints, only read; must not alias `levels` (restrict)
// length     - number of elements to process; nothing is done when length <= 0
// feedback   - amount added or subtracted
//
// NOTE: right-shifting a negative int is implementation-defined in C. This code relies on
// the shift being arithmetic (sign-propagating), so the shifted value is -1 for negative
// inputs and 0 otherwise. `8 * sizeof(int) - 1` is the sign-bit position assuming 8-bit bytes.
void loop_feedback_unconditional(int *restrict levels, int *restrict deviations, int length, int feedback) {
    for (int i = 0; i < length; i++) {
        // sign is -1 or 0, so -1 - 2 * sign is +1 for negative deviations and -1 otherwise.
        int add_subtract = -1 - 2 * (deviations[i] >> (8 * sizeof(int) - 1));
        levels[i] = levels[i] + add_subtract * feedback;
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// AArch64 assembly for the three functions, compiled with -O1 -ftree-vectorize
//-----------------------------------------------------------------------
// void loop_feedback_if(int *restrict levels, int *restrict deviations,
//                       int length, int feedback)
// Target: AArch64, AAPCS64 argument registers
// In:     x0 = levels, x1 = deviations, w2 = length, w3 = feedback
// Effect: levels[i] += feedback where deviations[i] < 0, else levels[i] -= feedback
// Uses:   x4-x7, v28-v31, flags (leaf function, no stack)
// Shape:  4-lane NEON loop over the first (length & -4) elements, then a
//         scalar csel loop for the remainder. Neither loop contains a
//         data-dependent branch.
//-----------------------------------------------------------------------
        .globl  loop_feedback_if
loop_feedback_if:
        cmp     w2, 0
        ble     .L1                     // length <= 0: nothing to do
        sub     w4, w2, #1
        cmp     w4, 2
        bls     .L11                    // length <= 3: too short for one vector, scalar only
        dup     v29.4s, w3              // v29 = feedback broadcast to 4 lanes
        lsr     w5, w2, 2
        lsl     x5, x5, 4               // x5 = (length / 4) * 16 = byte length of the vector part
        mov     x4, 0                   // x4 = byte offset into both arrays
.L4:
        ldr     q30, [x0, x4]           // v30 = 4 levels
        ldr     q31, [x1, x4]           // v31 = 4 deviations
        cmlt    v31.4s, v31.4s, #0      // v31 = per-lane mask, all ones where deviation < 0
        add     v28.4s, v30.4s, v29.4s  // v28 = level + feedback
        sub     v30.4s, v30.4s, v29.4s  // v30 = level - feedback
        bsl     v31.16b, v28.16b, v30.16b // mask ? v28 : v30 (this replaces the if/else)
        str     q31, [x0, x4]
        add     x4, x4, 16
        cmp     x5, x4
        bne     .L4
        and     w4, w2, -4              // w4 = number of elements handled by the vector loop
        cmp     w2, w4
        beq     .L1                     // no remainder
.L3:
        sxtw    x4, w4                  // x4 = first scalar element index
.L9:
        ldr     w5, [x0, x4, lsl 2]     // w5 = levels[i]
        ldr     w7, [x1, x4, lsl 2]     // w7 = deviations[i]
        sub     w6, w5, w3              // w6 = level - feedback
        add     w5, w3, w5              // w5 = level + feedback
        cmp     w7, 0
        csel    w5, w5, w6, lt          // keep the sum if deviation < 0, else the difference
        str     w5, [x0, x4, lsl 2]
        add     x4, x4, 1
        cmp     w2, w4
        bgt     .L9
.L1:
        ret
.L11:
        mov     w4, 0                   // scalar loop starts at element 0
        b       .L3
//-----------------------------------------------------------------------
// void loop_feedback_ternary(int *restrict levels, int *restrict deviations,
//                            int length, int feedback)
// Target: AArch64, AAPCS64 argument registers
// In:     x0 = levels, x1 = deviations, w2 = length, w3 = feedback
// Effect: levels[i] += feedback where deviations[i] < 0, else levels[i] -= feedback
// Uses:   x4-x7, v28-v31, flags (leaf function, no stack)
// Shape:  4-lane NEON loop over the first (length & -4) elements, then a
//         scalar csel loop for the remainder.
//-----------------------------------------------------------------------
        .globl  loop_feedback_ternary
loop_feedback_ternary:
        cmp     w2, 0
        ble     .L14                    // length <= 0: nothing to do
        sub     w4, w2, #1
        cmp     w4, 2
        bls     .L24                    // length <= 3: scalar only
        dup     v29.4s, w3              // v29 = feedback broadcast to 4 lanes
        lsr     w5, w2, 2
        lsl     x5, x5, 4               // x5 = (length / 4) * 16 = byte length of the vector part
        mov     x4, 0                   // x4 = byte offset into both arrays
.L17:
        ldr     q30, [x0, x4]           // v30 = 4 levels
        ldr     q31, [x1, x4]           // v31 = 4 deviations
        cmlt    v31.4s, v31.4s, #0      // v31 = per-lane mask, all ones where deviation < 0
        add     v28.4s, v30.4s, v29.4s  // v28 = level + feedback
        sub     v30.4s, v30.4s, v29.4s  // v30 = level - feedback
        bsl     v31.16b, v28.16b, v30.16b // mask ? v28 : v30
        str     q31, [x0, x4]
        add     x4, x4, 16
        cmp     x5, x4
        bne     .L17
        and     w4, w2, -4              // w4 = number of elements handled by the vector loop
        cmp     w2, w4
        beq     .L14                    // no remainder
.L16:
        sxtw    x4, w4                  // x4 = first scalar element index
.L22:
        ldr     w5, [x0, x4, lsl 2]     // w5 = levels[i]
        ldr     w7, [x1, x4, lsl 2]     // w7 = deviations[i]
        sub     w6, w5, w3              // w6 = level - feedback
        add     w5, w3, w5              // w5 = level + feedback
        cmp     w7, 0
        csel    w5, w5, w6, lt          // keep the sum if deviation < 0, else the difference
        str     w5, [x0, x4, lsl 2]
        add     x4, x4, 1
        cmp     w2, w4
        bgt     .L22
.L14:
        ret
.L24:
        mov     w4, 0                   // scalar loop starts at element 0
        b       .L16
//-----------------------------------------------------------------------
// void loop_feedback_unconditional(int *restrict levels, int *restrict deviations,
//                                  int length, int feedback)
// Target: AArch64, AAPCS64 argument registers
// In:     x0 = levels, x1 = deviations, w2 = length, w3 = feedback
// Effect: levels[i] += m * feedback, with m = +1 where deviations[i] < 0 and -1 otherwise
// Uses:   x4-x6, v29-v31, flags (leaf function, no stack)
// Shape:  4-lane NEON loop, then a scalar remainder loop. Both build the
//         +1 / -1 multiplier arithmetically and use a multiply-accumulate.
// Note:   unlike the if/ternary listings, here x4 is the running byte offset
//         and x5 holds the loop bound, then the scalar element index.
//-----------------------------------------------------------------------
        .globl  loop_feedback_unconditional
loop_feedback_unconditional:
        cmp     w2, 0
        ble     .L27                    // length <= 0: nothing to do
        sub     w4, w2, #1
        cmp     w4, 2
        bls     .L34                    // length <= 3: scalar only
        dup     v29.4s, w3              // v29 = feedback broadcast to 4 lanes
        lsr     w5, w2, 2
        lsl     x5, x5, 4               // x5 = (length / 4) * 16 = byte length of the vector part
        mov     x4, 0                   // x4 = byte offset into both arrays
.L30:
        ldr     q31, [x1, x4]           // v31 = 4 deviations
        cmlt    v31.4s, v31.4s, #0      // s = -1 where deviation < 0, else 0
        add     v31.4s, v31.4s, v31.4s  // 2 * s
        not     v31.16b, v31.16b        // ~(2 * s) = -1 - 2 * s  ->  +1 or -1
        ldr     q30, [x0, x4]           // v30 = 4 levels
        mla     v30.4s, v31.4s, v29.4s  // levels += multiplier * feedback
        str     q30, [x0, x4]
        add     x4, x4, 16
        cmp     x4, x5
        bne     .L30
        and     w5, w2, -4              // w5 = number of elements handled by the vector loop
        cmp     w2, w5
        beq     .L27                    // no remainder
.L29:
        sxtw    x5, w5                  // x5 = first scalar element index
.L33:
        ldr     w4, [x1, x5, lsl 2]     // w4 = deviations[i]
        asr     w4, w4, 31              // s = -1 if negative, else 0
        mvn     w4, w4, lsl 1           // ~(s << 1) = -1 - 2 * s  ->  +1 or -1
        ldr     w6, [x0, x5, lsl 2]     // w6 = levels[i]
        madd    w4, w4, w3, w6          // w4 = multiplier * feedback + levels[i]
        str     w4, [x0, x5, lsl 2]
        add     x5, x5, 1
        cmp     w2, w5
        bgt     .L33
.L27:
        ret
.L34:
        mov     w5, 0                   // scalar loop starts at element 0
        b       .L29
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# x86-64 assembly (AT&T syntax) for the three functions, compiled with -O1 -ftree-vectorize
#-----------------------------------------------------------------------
# void loop_feedback_if(int *restrict levels, int *restrict deviations,
#                       int length, int feedback)
# Target: x86-64, AT&T syntax, System V AMD64 argument registers
# In:     rdi = levels, rsi = deviations, edx = length, ecx = feedback
# Effect: levels[i] += feedback where deviations[i] < 0, else levels[i] -= feedback
# Uses:   rax, r8, r9, xmm0-xmm6, flags (leaf function, no stack)
# Shape:  4-lane SSE2 loop that blends with pand/pandn/por, then a scalar
#         remainder loop. The scalar loop still branches on the sign (js).
#-----------------------------------------------------------------------
        .globl  loop_feedback_if
loop_feedback_if:
        testl   %edx, %edx
        jle     .L1                     # length <= 0: nothing to do
        leal    -1(%rdx), %eax
        cmpl    $2, %eax
        jbe     .L11                    # length <= 3: scalar only
        movd    %ecx, %xmm6
        pshufd  $0, %xmm6, %xmm3        # xmm3 = feedback broadcast to 4 lanes
        movl    %edx, %r8d
        shrl    $2, %r8d
        movl    %r8d, %r8d
        salq    $4, %r8                 # r8 = (length / 4) * 16 = byte length of the vector part
        movl    $0, %eax                # rax = byte offset into both arrays
        pxor    %xmm4, %xmm4            # xmm4 = 0, used as the compare operand
.L4:
        movdqu  (%rdi,%rax), %xmm1      # xmm1 = 4 levels
        movdqa  %xmm4, %xmm0
        movdqu  (%rsi,%rax), %xmm5      # xmm5 = 4 deviations
        pcmpgtd %xmm5, %xmm0            # xmm0 = mask, all ones where 0 > deviation
        movdqa  %xmm1, %xmm2
        paddd   %xmm3, %xmm2            # xmm2 = level + feedback
        psubd   %xmm3, %xmm1            # xmm1 = level - feedback
        pand    %xmm0, %xmm2            # keep the sums where the mask is set
        pandn   %xmm1, %xmm0            # keep the differences where the mask is clear
        por     %xmm2, %xmm0            # merge the two halves of the blend
        movups  %xmm0, (%rdi,%rax)
        addq    $16, %rax
        cmpq    %rax, %r8
        jne     .L4
        movl    %edx, %eax
        andl    $-4, %eax               # eax = number of elements handled by the vector loop
        testb   $3, %dl
        je      .L15                    # length is a multiple of 4: no remainder
.L3:
        cltq                            # rax = first scalar element index
        jmp     .L9
.L15:
        ret
.L11:
        movl    $0, %eax                # scalar loop starts at element 0
        jmp     .L3
.L10:
        movl    %r8d, (%rdi,%rax,4)     # levels[i] = selected result
        addq    $1, %rax
        cmpl    %eax, %edx
        jle     .L16                    # i >= length: done
.L9:
        movl    (%rdi,%rax,4), %r9d     # r9d = levels[i]
        leal    (%rcx,%r9), %r8d        # r8d = level + feedback
        cmpl    $0, (%rsi,%rax,4)
        js      .L10                    # deviation < 0: store the sum
        subl    %ecx, %r9d              # otherwise compute level - feedback
        movl    %r9d, %r8d
        jmp     .L10
.L16:
        ret
.L1:
        ret
#-----------------------------------------------------------------------
# void loop_feedback_ternary(int *restrict levels, int *restrict deviations,
#                            int length, int feedback)
# Target: x86-64, AT&T syntax, System V AMD64 argument registers
# In:     rdi = levels, rsi = deviations, edx = length, ecx = feedback
# Effect: levels[i] += feedback where deviations[i] < 0, else levels[i] -= feedback
# Uses:   rax, r8, r9, xmm0-xmm6, flags (leaf function, no stack)
# Shape:  4-lane SSE2 loop that blends with pand/pandn/por, then a scalar
#         remainder loop that branches on the sign (js).
#-----------------------------------------------------------------------
        .globl  loop_feedback_ternary
loop_feedback_ternary:
        testl   %edx, %edx
        jle     .L17                    # length <= 0: nothing to do
        leal    -1(%rdx), %eax
        cmpl    $2, %eax
        jbe     .L27                    # length <= 3: scalar only
        movd    %ecx, %xmm6
        pshufd  $0, %xmm6, %xmm3        # xmm3 = feedback broadcast to 4 lanes
        movl    %edx, %r8d
        shrl    $2, %r8d
        movl    %r8d, %r8d
        salq    $4, %r8                 # r8 = (length / 4) * 16 = byte length of the vector part
        movl    $0, %eax                # rax = byte offset into both arrays
        pxor    %xmm4, %xmm4            # xmm4 = 0, used as the compare operand
.L20:
        movdqu  (%rdi,%rax), %xmm1      # xmm1 = 4 levels
        movdqa  %xmm4, %xmm0
        movdqu  (%rsi,%rax), %xmm5      # xmm5 = 4 deviations
        pcmpgtd %xmm5, %xmm0            # xmm0 = mask, all ones where 0 > deviation
        movdqa  %xmm1, %xmm2
        paddd   %xmm3, %xmm2            # xmm2 = level + feedback
        psubd   %xmm3, %xmm1            # xmm1 = level - feedback
        pand    %xmm0, %xmm2            # keep the sums where the mask is set
        pandn   %xmm1, %xmm0            # keep the differences where the mask is clear
        por     %xmm2, %xmm0            # merge the two halves of the blend
        movups  %xmm0, (%rdi,%rax)
        addq    $16, %rax
        cmpq    %rax, %r8
        jne     .L20
        movl    %edx, %eax
        andl    $-4, %eax               # eax = number of elements handled by the vector loop
        testb   $3, %dl
        je      .L31                    # length is a multiple of 4: no remainder
.L19:
        cltq                            # rax = first scalar element index
        jmp     .L25
.L31:
        ret
.L27:
        movl    $0, %eax                # scalar loop starts at element 0
        jmp     .L19
.L26:
        movl    %r8d, (%rdi,%rax,4)     # levels[i] = selected result
        addq    $1, %rax
        cmpl    %eax, %edx
        jle     .L32                    # i >= length: done
.L25:
        movl    (%rdi,%rax,4), %r9d     # r9d = levels[i]
        leal    (%rcx,%r9), %r8d        # r8d = level + feedback
        cmpl    $0, (%rsi,%rax,4)
        js      .L26                    # deviation < 0: store the sum
        subl    %ecx, %r9d              # otherwise compute level - feedback
        movl    %r9d, %r8d
        jmp     .L26
.L32:
        ret
.L17:
        ret
#-----------------------------------------------------------------------
# void loop_feedback_unconditional(int *restrict levels, int *restrict deviations,
#                                  int length, int feedback)
# Target: x86-64, AT&T syntax, System V AMD64 argument registers
# In:     rdi = levels, rsi = deviations, edx = length, ecx = feedback
# Effect: levels[i] += m * feedback, with m = +1 where deviations[i] < 0 and -1 otherwise
# Uses:   rax, rdx, r8, xmm0-xmm5, flags (leaf function, no stack)
# Shape:  4-lane SSE2 loop, then a scalar remainder loop. The listing uses only
#         pmuludq (32x32->64 on even lanes), so the 4-lane 32-bit multiply is
#         assembled from two pmuludq plus shuffles (pmulld is SSE4.1).
#-----------------------------------------------------------------------
        .globl  loop_feedback_unconditional
loop_feedback_unconditional:
        movl    %edx, %r8d              # keep length in r8d; rdx is reused as the loop bound
        testl   %edx, %edx
        jle     .L33                    # length <= 0: nothing to do
        leal    -1(%rdx), %eax
        cmpl    $2, %eax
        jbe     .L40                    # length <= 3: scalar only
        movd    %ecx, %xmm5
        pshufd  $0, %xmm5, %xmm2        # xmm2 = feedback broadcast to 4 lanes
        shrl    $2, %edx
        movl    %edx, %edx
        salq    $4, %rdx                # rdx = (length / 4) * 16 = byte length of the vector part
        movl    $0, %eax                # rax = byte offset into both arrays
        pcmpeqd %xmm4, %xmm4            # xmm4 = all ones, used to form a bitwise NOT via pxor
        movdqa  %xmm2, %xmm3
        psrlq   $32, %xmm3              # xmm3 = feedback in the low dword of each qword (odd-lane operand)
.L36:
        movdqu  (%rsi,%rax), %xmm0      # xmm0 = 4 deviations
        psrad   $31, %xmm0              # s = -1 where deviation < 0, else 0
        pslld   $1, %xmm0               # 2 * s
        pxor    %xmm4, %xmm0            # ~(2 * s) = -1 - 2 * s  ->  +1 or -1
        movdqa  %xmm0, %xmm1
        pmuludq %xmm2, %xmm1            # 64-bit products of lanes 0 and 2
        psrlq   $32, %xmm0              # move lanes 1 and 3 into the even positions
        pmuludq %xmm3, %xmm0            # 64-bit products of lanes 1 and 3
        pshufd  $8, %xmm1, %xmm1        # low 32 bits of each even product -> dwords 0,1
        pshufd  $8, %xmm0, %xmm0        # low 32 bits of each odd product  -> dwords 0,1
        punpckldq       %xmm0, %xmm1    # interleave back into lane order 0,1,2,3
        movdqu  (%rdi,%rax), %xmm0      # xmm0 = 4 levels
        paddd   %xmm1, %xmm0            # levels += multiplier * feedback
        movups  %xmm0, (%rdi,%rax)
        addq    $16, %rax
        cmpq    %rdx, %rax
        jne     .L36
        movl    %r8d, %edx
        andl    $-4, %edx               # edx = number of elements handled by the vector loop
        testb   $3, %r8b
        je      .L43                    # length is a multiple of 4: no remainder
.L35:
        movslq  %edx, %rdx              # rdx = first scalar element index
.L39:
        movl    (%rsi,%rdx,4), %eax     # eax = deviations[i]
        sarl    $31, %eax               # s = -1 if negative, else 0
        addl    %eax, %eax              # 2 * s
        notl    %eax                    # -1 - 2 * s  ->  +1 or -1
        imull   %ecx, %eax              # multiplier * feedback
        addl    %eax, (%rdi,%rdx,4)     # levels[i] += multiplier * feedback
        addq    $1, %rdx
        cmpl    %edx, %r8d
        jg      .L39
.L33:
        ret
.L43:
        ret
.L40:
        movl    $0, %edx                # scalar loop starts at element 0
        jmp     .L35
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment