Created
October 29, 2015 23:51
-
-
Save jeapostrophe/d54d3a6a871e5127a6ed to your computer and use it in GitHub Desktop.
Vectorizing structure reads, writes, etc examples
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; External declaration of the function called after each experiment in @main.
; NOTE(review): this signature (no arguments, i32 result) does not match C's
; exit(int); presumably it only serves as a call the optimizer cannot see
; through, separating the experiments -- confirm before running the binary.
declare i32 @exit() nounwind

; Layout of %athing (32 bytes, no padding) and the 0-based lane that covers
; each field when the struct is viewed as a 256-bit vector:
;
;   field          bytes    <8 x float>   <16 x i16>   <32 x i8>
;   float #0..#5    0-23    0-5           0-11          0-23
;   i16   #0, #1   24-27    6             12, 13       24-27
;   i8    #0..#3   28-31    7             14, 15       28-31
;
; insertelement/extractelement indices are 0-based, so use the lanes above
; directly (the i16 fields are lanes 12 and 13, not 13 and 14).
%athing = type { float, float, float, float, float, float, i16, i16, i8, i8, i8, i8 }

; Two instances defined in another module: experiments read @two and write @one.
@one = external global %athing
@two = external global %athing
; @main is a scratch pad of independent codegen experiments on %athing.
; Each experiment is followed by a "RESULT" note recording what the original
; author observed in the generated x86-64 assembly.
;
; Lane numbering (0-based) when the 32-byte struct is viewed as a vector:
;   <8 x float>  lanes 0-5   = the six floats, lane 6 = both i16s, lane 7 = the four i8s
;   <16 x i16>   lanes 12-13 = the two i16 fields (bytes 24-27)
;   <32 x i8>    lanes 28-31 = the four i8 fields
;
; The <16 x i16> inserts below use lanes 12/13. Earlier revisions used 13/14,
; which left the first short untouched and wrote the second value over the
; first two bytes; the accompanying assembly listing (vpinsrw $5/$6 on the
; upper half) was generated from those earlier indices.
;
; NOTE(review): the <8 x float> loads and stores carry no explicit `align`, so
; they assume the vector type's ABI alignment, while @one/@two are declared
; without any alignment. Confirm their definitions are 32-byte aligned, or add
; `align 4` to these accesses.
define i32 @main() {
  ; --- Experiment 1: build a `one` value with a plain aggregate store ---
  store %athing { float 5.0, float 6.0,
                  float 1.0, float 2.0,
                  float 3.0, float 4.0,
                  i16 25, i16 15,
                  i8 1,
                  i8 2, i8 3, i8 4 }, %athing* @one
  call i32 @exit()
  ; RESULT: Very slow: one scalar memory store per field (12 stores)

  ; --- Experiment 2: copy the whole struct from @two to @one ---
  %a = load %athing* @two
  store %athing %a, %athing* @one
  call i32 @exit()
  ; RESULT: field-by-field copy: 6 scalar loads through vector registers plus
  ; 6 integer loads, then 12 memory stores

  ; --- Experiment 3: see how much stack a sprite (%athing) takes ---
  ; The alloca is never initialised; only the frame layout is of interest.
  %gp = alloca %athing
  %g = load %athing* %gp
  store %athing %g, %athing* @one
  ; RESULT: %rsp is moved by 40 bytes, but the struct starts at %rsp+8
  ; (for alignment, per the original note)

  ; --- Experiment 4: copy the struct as one wide vector ---
  ; %two_vector and %one_vector are reused by every experiment below.
  %two_vector = bitcast %athing* @two to <8 x float>*
  %b = load <8 x float>* %two_vector
  %one_vector = bitcast %athing* @one to <8 x float>*
  store <8 x float> %b, <8 x float>* %one_vector
  call i32 @exit()
  ; RESULT: 1 vector load and 1 vector store

  ; --- Experiment 5: wide vector copy, changing one of the floats ---
  %cv_orig = load <8 x float>* %two_vector
  %c_dx_orig = extractelement <8 x float> %cv_orig, i32 0
  %c_dx_change = fadd float %c_dx_orig, 1.0
  %cv_change = insertelement <8 x float> %cv_orig, float %c_dx_change, i32 0
  store <8 x float> %cv_change, <8 x float>* %one_vector
  call i32 @exit()
  ; RESULT: 1 vector load, 1 vector load of the constant, an addition, then a single vector store

  ; --- Experiment 6: wide vector copy, also changing a short and a byte ---
  %dv_orig = load <8 x float>* %two_vector
  ;; change the float (float field #0)
  %d_dx_orig = extractelement <8 x float> %dv_orig, i32 0
  %d_dx_change = fadd float %d_dx_orig, 1.0
  %dv_change0 = insertelement <8 x float> %dv_orig, float %d_dx_change, i32 0
  ;; change the short: i16 lane 13 is the second i16 field (bytes 26-27)
  %dv_change0_s = bitcast <8 x float> %dv_change0 to <16 x i16>
  %dv_change1_s = insertelement <16 x i16> %dv_change0_s, i16 5, i32 13
  %dv_change1 = bitcast <16 x i16> %dv_change1_s to <8 x float>
  ;; change a byte: i8 lane 29 is the second i8 field
  %dv_change1_b = bitcast <8 x float> %dv_change1 to <32 x i8>
  %dv_change2_b = insertelement <32 x i8> %dv_change1_b, i8 66, i32 29
  %dv_change2 = bitcast <32 x i8> %dv_change2_b to <8 x float>
  store <8 x float> %dv_change2, <8 x float>* %one_vector
  call i32 @exit()
  ; RESULT: virtually equivalent to the above, but with some more vector manipulations

  ; --- Experiment 7: build a `one` value using float-lane vector ops ---
  ; The two shorts are packed into float lane 6 and the four bytes into lane 7.
  %es_f = bitcast <2 x i16> <i16 25, i16 15> to float
  %ev_fs = insertelement <8 x float> <float 5.0, float 6.0, float 1.0, float 2.0, float 3.0, float 4.0, float 0.0, float 0.0 >, float %es_f, i32 6
  %eb_f = bitcast <4 x i8> <i8 1, i8 2, i8 3, i8 4> to float
  %ev_fsb = insertelement <8 x float> %ev_fs, float %eb_f, i32 7
  store <8 x float> %ev_fsb, <8 x float>* %one_vector
  call i32 @exit()
  ; RESULT: lots of complicated constants loaded from memory before a single write

  ; --- Experiment 8: build a `one` value using bitcasting vector ops ---
  %fv_fs_pre = bitcast <8 x float> <float 5.0, float 6.0, float 1.0, float 2.0, float 3.0, float 4.0, float 0.0, float 0.0 > to <16 x i16>
  ;; the two i16 fields live in lanes 12 and 13
  %fv_fs_post0 = insertelement <16 x i16> %fv_fs_pre, i16 25, i32 12
  %fv_fs_post1 = insertelement <16 x i16> %fv_fs_post0, i16 15, i32 13
  %fv_fsb_pre = bitcast <16 x i16> %fv_fs_post1 to <32 x i8>
  ;; the four i8 fields live in lanes 28-31
  %fv_fsb_post0 = insertelement <32 x i8> %fv_fsb_pre, i8 1, i32 28
  %fv_fsb_post1 = insertelement <32 x i8> %fv_fsb_post0, i8 2, i32 29
  %fv_fsb_post2 = insertelement <32 x i8> %fv_fsb_post1, i8 3, i32 30
  %fv_fsb_post3 = insertelement <32 x i8> %fv_fsb_post2, i8 4, i32 31
  %fv_fsb = bitcast <32 x i8> %fv_fsb_post3 to <8 x float>
  store <8 x float> %fv_fsb, <8 x float>* %one_vector
  call i32 @exit()
  ; RESULT (observed with the earlier lane numbers): one constant vector load
  ; for the floats, then a trivial sequence of inserts, then a single write

  ret i32 0
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Constant pools referenced by _main (compiler-generated, AT&T syntax, Mach-O).
## Only the layout and comments are tidied here; every directive and value is
## unchanged.
        .section        __TEXT,__text,regular,pure_instructions
        .macosx_version_min 14, 5

## 4-byte float literals, loaded with vmovss in _main.
        .section        __TEXT,__literal4,4byte_literals
        .align  2
LCPI0_0:
        .long   1065353216              ## float 1 (addend for the vaddss experiments)
LCPI0_3:
        .long   1082130432              ## float 4
LCPI0_4:
        .long   1077936128              ## float 3

## 16-byte literals: values plus the vpshufb masks that pack them.
        .section        __TEXT,__literal16,16byte_literals
        .align  4
LCPI0_1:                                ## the two shorts, one per 64-bit lane
        .quad   25                      ## 0x19
        .quad   15                      ## 0xf
LCPI0_2:                                ## vpshufb mask for LCPI0_1: pick bytes 0,1,8,9
        .byte   0                       ## 0x0
        .byte   1                       ## 0x1
        .byte   8                       ## 0x8
        .byte   9                       ## 0x9
        .byte   128                     ## 0x80 (high bit set: result byte is zeroed)
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
LCPI0_6:                                ## the four bytes, one per 32-bit lane
        .long   1                       ## 0x1
        .long   2                       ## 0x2
        .long   3                       ## 0x3
        .long   4                       ## 0x4
LCPI0_7:                                ## vpshufb mask for LCPI0_6: pick bytes 0,4,8,12
        .byte   0                       ## 0x0
        .byte   4                       ## 0x4
        .byte   8                       ## 0x8
        .byte   12                      ## 0xc
        .byte   128                     ## 0x80 (high bit set: result byte is zeroed)
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80
        .byte   128                     ## 0x80

## 32-byte vector constant holding the six float fields followed by two zero
## lanes (the slots the shorts and bytes are inserted into). Loaded into a
## ymm register with vmovaps in _main.
        .section        __TEXT,__const
        .align  5
LCPI0_5:
        .long   1084227584              ## float 5.000000e+00
        .long   1086324736              ## float 6.000000e+00
        .long   1065353216              ## float 1.000000e+00
        .long   1073741824              ## float 2.000000e+00
        .long   1077936128              ## float 3.000000e+00
        .long   1082130432              ## float 4.000000e+00
        .long   0                       ## float 0.000000e+00
        .long   0                       ## float 0.000000e+00
##----------------------------------------------------------------------
## _main -- compiler-generated code for a series of struct-copy experiments
## Syntax: AT&T (op src, dst), Mach-O / x86-64 with AVX.
##
## Register roles (both callee-saved, pushed in the prologue):
##   %rbx = address of _one (loaded from the GOT once, live to the end)
##   %r14 = address of _two (loaded from the GOT before the second experiment)
## Stack: 2 pushes + 40 bytes of locals (CFA offset 64).
##   (%rsp), 4(%rsp)   4-byte scratch slots used by the vpshufb/vinsertps code
##   8(%rsp)-39(%rsp)  the 32-byte struct-sized local
## Struct byte offsets used throughout: floats at 0-20, shorts at 24 and 26,
## bytes at 28-31.
## NOTE: every vmovaps with a %ymm operand requires its memory operand to be
## 32-byte aligned.
## Returns 0 in %eax.
##----------------------------------------------------------------------
        .section        __TEXT,__text,regular,pure_instructions
        .globl  _main
        .align  4, 0x90
_main:                                  ## @main
        .cfi_startproc
## BB#0:
        pushq   %r14
Ltmp0:
        .cfi_def_cfa_offset 16
        pushq   %rbx
Ltmp1:
        .cfi_def_cfa_offset 24
        subq    $40, %rsp
Ltmp2:
        .cfi_def_cfa_offset 64
Ltmp3:
        .cfi_offset %rbx, -24
Ltmp4:
        .cfi_offset %r14, -16

## --- Experiment 1: store constants into _one, one field at a time (12 stores)
        movq    _one@GOTPCREL(%rip), %rbx       ## rbx = &one
        movb    $4, 31(%rbx)
        movb    $3, 30(%rbx)
        movb    $2, 29(%rbx)
        movb    $1, 28(%rbx)
        movw    $15, 26(%rbx)
        movw    $25, 24(%rbx)
        movl    $1082130432, 20(%rbx)   ## imm = 0x40800000
        movl    $1077936128, 16(%rbx)   ## imm = 0x40400000
        movl    $1073741824, 12(%rbx)   ## imm = 0x40000000
        movl    $1065353216, 8(%rbx)    ## imm = 0x3F800000
        movl    $1086324736, 4(%rbx)    ## imm = 0x40C00000
        movl    $1084227584, (%rbx)     ## imm = 0x40A00000
        callq   _exit

## --- Experiment 2: copy _two to _one field by field (12 loads, 12 stores)
        movq    _two@GOTPCREL(%rip), %r14       ## r14 = &two
        vmovss  (%r14), %xmm0
        vmovss  4(%r14), %xmm1
        vmovss  8(%r14), %xmm2
        vmovss  12(%r14), %xmm3
        vmovss  16(%r14), %xmm4
        vmovss  20(%r14), %xmm5
        movw    24(%r14), %si
        movw    26(%r14), %di
        movb    28(%r14), %r8b
        movb    29(%r14), %al
        movb    30(%r14), %cl
        movb    31(%r14), %dl
        movb    %dl, 31(%rbx)
        movb    %cl, 30(%rbx)
        movb    %al, 29(%rbx)
        movb    %r8b, 28(%rbx)
        movw    %di, 26(%rbx)
        movw    %si, 24(%rbx)
        vmovss  %xmm5, 20(%rbx)
        vmovss  %xmm4, 16(%rbx)
        vmovss  %xmm3, 12(%rbx)
        vmovss  %xmm2, 8(%rbx)
        vmovss  %xmm1, 4(%rbx)
        vmovss  %xmm0, (%rbx)
        callq   _exit

## --- Experiment 3: copy the stack local at 8(%rsp) to _one, field by field.
## Nothing has written the local before these loads.
        vmovss  8(%rsp), %xmm0
        vmovss  12(%rsp), %xmm1
        vmovss  16(%rsp), %xmm2
        vmovss  20(%rsp), %xmm3
        vmovss  24(%rsp), %xmm4
        vmovss  28(%rsp), %xmm5
        movw    32(%rsp), %si
        movw    34(%rsp), %di
        movb    36(%rsp), %r8b
        movb    37(%rsp), %al
        movb    38(%rsp), %cl
        movb    39(%rsp), %dl
        movb    %dl, 31(%rbx)
        movb    %cl, 30(%rbx)
        movb    %al, 29(%rbx)
        movb    %r8b, 28(%rbx)
        movw    %di, 26(%rbx)
        movw    %si, 24(%rbx)
        vmovss  %xmm5, 20(%rbx)
        vmovss  %xmm4, 16(%rbx)
        vmovss  %xmm3, 12(%rbx)
        vmovss  %xmm2, 8(%rbx)
        vmovss  %xmm1, 4(%rbx)
        vmovss  %xmm0, (%rbx)

## --- Experiment 4: copy _two to _one as one 32-byte vector
        vmovaps (%r14), %ymm0
        vmovaps %ymm0, (%rbx)
        vzeroupper                      ## clear upper ymm state before the call
        callq   _exit

## --- Experiment 5: 32-byte copy with 1.0 added to the first float
        vmovaps (%r14), %ymm0
        vmovss  LCPI0_0(%rip), %xmm1
        vaddss  %xmm1, %xmm0, %xmm1     ## xmm1 = low half with float 0 incremented
        vinsertf128     $0, %xmm1, %ymm0, %ymm0 ## put the low half back
        vmovaps %ymm0, (%rbx)
        vzeroupper
        callq   _exit

## --- Experiment 6: as above, plus one 16-bit and one 8-bit insert
        vmovaps (%r14), %ymm0
        vmovss  LCPI0_0(%rip), %xmm1
        vaddss  %xmm1, %xmm0, %xmm1
        vinsertf128     $0, %xmm1, %ymm0, %ymm1 ## ymm1 = struct with float 0 incremented
        vextractf128    $1, %ymm0, %xmm0        ## xmm0 = struct bytes 16-31
        movl    $5, %eax
        vpinsrw $6, %eax, %xmm0, %xmm0  ## word 6 of the upper half = struct bytes 28-29
        vinsertf128     $1, %xmm0, %ymm1, %ymm1
        movl    $66, %eax
        vpinsrb $13, %eax, %xmm0, %xmm0 ## byte 13 of the upper half = struct byte 29
        vinsertf128     $1, %xmm0, %ymm1, %ymm0
        vmovaps %ymm0, (%rbx)
        vzeroupper
        callq   _exit

## --- Experiment 7: build the value from constants via float-lane inserts
        vmovdqa LCPI0_1(%rip), %xmm0
        vpshufb LCPI0_2(%rip), %xmm0, %xmm0     ## pack the two shorts into 4 bytes
        vmovd   %xmm0, (%rsp)           ## spill packed shorts to scratch slot
        vmovss  LCPI0_3(%rip), %xmm0
        vmovss  LCPI0_4(%rip), %xmm1
        vunpcklps       %xmm0, %xmm1, %xmm0 ## xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
        vmovq   %xmm0, %xmm0            ## keep {3.0, 4.0}, zero the upper 64 bits
        vinsertps       $32, (%rsp), %xmm0, %xmm0       ## lane 2 <- packed shorts
        vmovaps LCPI0_5(%rip), %ymm1
        vinsertf128     $1, %xmm0, %ymm1, %ymm1
        vmovdqa LCPI0_6(%rip), %xmm2
        vpshufb LCPI0_7(%rip), %xmm2, %xmm2     ## pack the four bytes into 4 bytes
        vmovd   %xmm2, 4(%rsp)          ## spill packed bytes to scratch slot
        vinsertps       $48, 4(%rsp), %xmm0, %xmm0      ## lane 3 <- packed bytes
        vinsertf128     $1, %xmm0, %ymm1, %ymm0
        vmovaps %ymm0, (%rbx)
        vzeroupper
        callq   _exit

## --- Experiment 8: build the value via 16-bit and 8-bit inserts.
## As generated, the word inserts hit struct bytes 26-27 and 28-29, so bytes
## 24-25 keep the zero from LCPI0_5 and the 15 written to bytes 28-29 is
## overwritten by the byte inserts that follow.
        vmovaps LCPI0_5(%rip), %ymm1
        vextractf128    $1, %ymm1, %xmm0        ## xmm0 = struct bytes 16-31
        movl    $25, %eax
        vpinsrw $5, %eax, %xmm0, %xmm0  ## struct bytes 26-27
        vinsertf128     $1, %xmm0, %ymm1, %ymm1
        movl    $15, %eax
        vpinsrw $6, %eax, %xmm0, %xmm0  ## struct bytes 28-29
        vinsertf128     $1, %xmm0, %ymm1, %ymm1
        movl    $1, %eax
        vpinsrb $12, %eax, %xmm0, %xmm0 ## struct byte 28
        vinsertf128     $1, %xmm0, %ymm1, %ymm1
        movl    $2, %eax
        vpinsrb $13, %eax, %xmm0, %xmm0 ## struct byte 29
        vinsertf128     $1, %xmm0, %ymm1, %ymm1
        movl    $3, %eax
        vpinsrb $14, %eax, %xmm0, %xmm0 ## struct byte 30
        vinsertf128     $1, %xmm0, %ymm1, %ymm1
        movl    $4, %eax
        vpinsrb $15, %eax, %xmm0, %xmm0 ## struct byte 31
        vinsertf128     $1, %xmm0, %ymm1, %ymm0
        vmovaps %ymm0, (%rbx)
        vzeroupper
        callq   _exit

## --- Epilogue: return 0, release locals, restore callee-saved registers
        xorl    %eax, %eax
        addq    $40, %rsp
        popq    %rbx
        popq    %r14
        retq
        .cfi_endproc
        .subsections_via_symbols
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment