jeapostrophe · October 29, 2015 23:51
diff --git a/test.ll b/test.ll
 ; External declaration of exit(). @main calls it between experiments;
 ; presumably it only serves as an opaque call separating them - TODO confirm.
 declare i32 @exit() nounwind

 ; %athing occupies 32 bytes: six floats (bytes 0-23), two i16 (bytes 24-27)
 ; and four i8 (bytes 28-31).
 ; The table below gives, for every field, the 1-based position at which the
 ; field ends when the struct is viewed as a vector of that element type.
 ; Subtract 1 to obtain the 0-based lane expected by insertelement and
 ; extractelement (e.g. the two i16 fields are <16 x i16> lanes 12 and 13).
 ; float =            1,     2,     3,     4,     5,     6,        7,              8
 ;   i16 =            2,     4,     6,     8,    10,    12,  13,  14,     15,     16
 ;    i8 =            4,     8,    12,    16,    20,    24,  26,  28, 29, 30, 31, 32
 %athing = type { float, float, float, float, float, float, i16, i16, i8, i8, i8, i8 }

 ; Both globals are defined in another module; only their type is known here.
 @one = external global %athing
 @two = external global %athing

 ; @main: a series of independent code-generation experiments. Each one writes
 ; an %athing to @one in a different way; most are followed by a call to @exit.
 ; The "RESULT" notes record what the generated x86-64 code in test.s looked
 ; like. Always returns 0.
 ;
 ; Vector lane indices are 0-based. Viewed as <16 x i16>, the two i16 fields
 ; are lanes 12 and 13 (bytes 24-27); viewed as <32 x i8>, the four i8 fields
 ; are lanes 28-31. Earlier revisions used i16 lanes 13/14, which spilled into
 ; the i8 fields; test.s must be regenerated to reflect the corrected lanes.
 ;
 ; NOTE(review): the <8 x float> loads and stores carry no explicit alignment,
 ; so they use the vector type's default alignment (test.s shows the aligned
 ; vmovaps form). Confirm that the definitions of @one and @two are aligned
 ; accordingly, or add an explicit `align` to each access.
 define i32 @main() {
  ; Build the target value of @one with a single aggregate store
  store %athing { float 5.0, float 6.0,
         float 1.0, float 2.0,
         float 3.0, float 4.0,
         i16 25, i16 15,
         i8 1,
         i8 2, i8 3, i8 4 }, %athing* @one
  call i32 @exit()
  ; RESULT: Very slow with 12 memory stores (one per field)

  ; Copy @two into @one as an aggregate
  %a = load %athing* @two
  store %athing %a, %athing* @one
  call i32 @exit()
  ; RESULT: 6 vector loads, then a lot of parsing, then 12 memory stores, some as vectors

  ; See what the size of a sprite is (the alloca is deliberately left
  ; uninitialised; only the stack layout is of interest)
  %gp = alloca %athing
  %g = load %athing* %gp
  store %athing %g, %athing* @one
  ; RESULT: %rsp has 40 subtracted from it, but the struct starts at %rsp+8 for alignment

  ; Try to make it use a wide vector op (exchange)
  %two_vector = bitcast %athing* @two to <8 x float>*
  %b = load <8 x float>* %two_vector
  %one_vector = bitcast %athing* @one to <8 x float>*
  store <8 x float> %b, <8 x float>* %one_vector
  call i32 @exit()
  ; RESULT: 1 vector load and 1 vector store

  ; Try to make it use a wide vector op, changing one of the floats
  %cv_orig = load <8 x float>* %two_vector
  %c_dx_orig = extractelement <8 x float> %cv_orig, i32 0
  %c_dx_change = fadd float %c_dx_orig, 1.0
  %cv_change = insertelement <8 x float> %cv_orig, float %c_dx_change, i32 0
  store <8 x float> %cv_change, <8 x float>* %one_vector
  call i32 @exit()
  ; RESULT: 1 vector load, 1 vector load of the constant, an addition, then a single vector store

  ; Try to use a vector op to change the shorts and bytes
  %dv_orig = load <8 x float>* %two_vector
  ;; change the float
  %d_dx_orig = extractelement <8 x float> %dv_orig, i32 0
  %d_dx_change = fadd float %d_dx_orig, 1.0
  %dv_change0 = insertelement <8 x float> %dv_orig, float %d_dx_change, i32 0
  ;; change the short: i16 lane 13 = bytes 26-27, the second i16 field
  ;; (lane 14 would cover the first two i8 fields instead of a short)
  %dv_change0_s = bitcast <8 x float> %dv_change0 to <16 x i16>
  %dv_change1_s = insertelement <16 x i16> %dv_change0_s, i16 5, i32 13
  %dv_change1 = bitcast <16 x i16> %dv_change1_s to <8 x float>
  ;; change a byte: i8 lane 29 = the second i8 field
  %dv_change1_b = bitcast <8 x float> %dv_change1 to <32 x i8>
  %dv_change2_b = insertelement <32 x i8> %dv_change1_b, i8 66, i32 29
  %dv_change2 = bitcast <32 x i8> %dv_change2_b to <8 x float>
  store <8 x float> %dv_change2, <8 x float>* %one_vector
  call i32 @exit()
  ; RESULT: virtually equivalent to the above, but with some more vector manipulations

  ; Build the target value of @one using vector ops: the two shorts and the
  ; four bytes are each packed into one float-sized lane (lanes 6 and 7)
  %es_f = bitcast <2 x i16> <i16 25, i16 15> to float
  %ev_fs = insertelement <8 x float> <float 5.0, float 6.0, float 1.0, float 2.0, float 3.0, float 4.0, float 0.0, float 0.0 >, float %es_f, i32 6
  %eb_f = bitcast <4 x i8> <i8 1, i8 2, i8 3, i8 4> to float
  %ev_fsb = insertelement <8 x float> %ev_fs, float %eb_f, i32 7
  store <8 x float> %ev_fsb, <8 x float>* %one_vector
  call i32 @exit()
  ; RESULT: lots of complicated constants loaded from memory before a single write

  ; Build the target value of @one using bitcasting vector ops
  %fv_fs_pre = bitcast <8 x float> <float 5.0, float 6.0, float 1.0, float 2.0, float 3.0, float 4.0, float 0.0, float 0.0 > to <16 x i16>
  ;; the i16 fields are lanes 12 and 13 (bytes 24-27)
  %fv_fs_post0 = insertelement <16 x i16> %fv_fs_pre, i16 25, i8 12
  %fv_fs_post1 = insertelement <16 x i16> %fv_fs_post0, i16 15, i8 13
  %fv_fsb_pre = bitcast <16 x i16> %fv_fs_post1 to <32 x i8>
  ;; the i8 fields are lanes 28-31
  %fv_fsb_post0 = insertelement <32 x i8> %fv_fsb_pre, i8 1, i8 28
  %fv_fsb_post1 = insertelement <32 x i8> %fv_fsb_post0, i8 2, i8 29
  %fv_fsb_post2 = insertelement <32 x i8> %fv_fsb_post1, i8 3, i8 30
  %fv_fsb_post3 = insertelement <32 x i8> %fv_fsb_post2, i8 4, i8 31
  %fv_fsb = bitcast <32 x i8> %fv_fsb_post3 to <8 x float>
  store <8 x float> %fv_fsb, <8 x float>* %one_vector
  call i32 @exit()
  ; RESULT: one constant for the floats, then a trivial sequence of inserts, then a single write

  ret i32 0
 }
diff --git a/test.s b/test.s
 ## Constant pool for _main (AT&T syntax, Mach-O sections).
 	.section	__TEXT,__text,regular,pure_instructions
 	.macosx_version_min 14, 5
 	.section	__TEXT,__literal4,4byte_literals
 	.align	2
 ## Scalar float constants, stored as their IEEE-754 single-precision bit patterns.
 ## LCPI0_0 (1.0) is the addend used by the two vaddss instructions in _main.
 LCPI0_0:
 	.long	1065353216              ## float 1
 ## LCPI0_3 (4.0) and LCPI0_4 (3.0) are interleaved by vunpcklps in _main.
 LCPI0_3:
 	.long	1082130432              ## float 4
 LCPI0_4:
 	.long	1077936128              ## float 3
 	.section	__TEXT,__literal16,16byte_literals
 	.align	4
 ## The values 25 and 15, one per 64-bit element.
 LCPI0_1:
 	.quad	25                      ## 0x19
 	.quad	15                      ## 0xf
 ## vpshufb mask applied to LCPI0_1: selects source bytes 0,1 and 8,9 (the low
 ## 16 bits of each quad) into the low 4 bytes; mask bytes with the high bit
 ## set (0x80) produce zero.
 LCPI0_2:
 	.byte	0                       ## 0x0
 	.byte	1                       ## 0x1
 	.byte	8                       ## 0x8
 	.byte	9                       ## 0x9
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 ## The values 1, 2, 3, 4, one per 32-bit element.
 LCPI0_6:
 	.long	1                       ## 0x1
 	.long	2                       ## 0x2
 	.long	3                       ## 0x3
 	.long	4                       ## 0x4
 ## vpshufb mask applied to LCPI0_6: selects source bytes 0,4,8,12 (the low byte
 ## of each dword) into the low 4 bytes and zeroes the rest.
 LCPI0_7:
 	.byte	0                       ## 0x0
 	.byte	4                       ## 0x4
 	.byte	8                       ## 0x8
 	.byte	12                      ## 0xc
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.byte	128                     ## 0x80
 	.section	__TEXT,__const
 	.align	5
 ## 32-byte-aligned vector of eight floats {5, 6, 1, 2, 3, 4, 0, 0}; the two
 ## trailing zero elements are filled in by the insert sequences in _main.
 LCPI0_5:
 	.long	1084227584              ## float 5.000000e+00
 	.long	1086324736              ## float 6.000000e+00
 	.long	1065353216              ## float 1.000000e+00
 	.long	1073741824              ## float 2.000000e+00
 	.long	1077936128              ## float 3.000000e+00
 	.long	1082130432              ## float 4.000000e+00
 	.long	0                       ## float 0.000000e+00
 	.long	0                       ## float 0.000000e+00
 	.section	__TEXT,__text,regular,pure_instructions
 	.globl	_main
 	.align	4, 0x90
 ##----------------------------------------------------------------------
 ## _main: generated code for @main in test.ll (AT&T syntax, x86-64, AVX).
 ## Registers: %rbx = address of _one, %r14 = address of _two; both are
 ##            saved in the prologue and restored in the epilogue, and keep
 ##            their values across the calls to _exit.
 ## Stack:     40 bytes of locals. 8(%rsp)..39(%rsp) is a 32-byte struct
 ##            slot; (%rsp) and 4(%rsp) are 4-byte scratch slots.
 ##            Two pushes + 40 bytes + return address = 64, matching the
 ##            final .cfi_def_cfa_offset.
 ## Returns:   0 in %eax.
 ##----------------------------------------------------------------------
 _main:                                  ## @main
 	.cfi_startproc
 ## BB#0:
 	pushq	%r14
 Ltmp0:
 	.cfi_def_cfa_offset 16
 	pushq	%rbx
 Ltmp1:
 	.cfi_def_cfa_offset 24
 	subq	$40, %rsp
 Ltmp2:
 	.cfi_def_cfa_offset 64
 Ltmp3:
 	.cfi_offset %rbx, -24
 Ltmp4:
 	.cfi_offset %r14, -16
 	## Experiment 1: fill _one with 12 immediate stores, highest offset first
 	## (4 bytes, 2 words, then 6 dwords holding float bit patterns).
 	movq	_one@GOTPCREL(%rip), %rbx
 	movb	$4, 31(%rbx)
 	movb	$3, 30(%rbx)
 	movb	$2, 29(%rbx)
 	movb	$1, 28(%rbx)
 	movw	$15, 26(%rbx)
 	movw	$25, 24(%rbx)
 	movl	$1082130432, 20(%rbx)   ## imm = 0x40800000
 	movl	$1077936128, 16(%rbx)   ## imm = 0x40400000
 	movl	$1073741824, 12(%rbx)   ## imm = 0x40000000
 	movl	$1065353216, 8(%rbx)    ## imm = 0x3F800000
 	movl	$1086324736, 4(%rbx)    ## imm = 0x40C00000
 	movl	$1084227584, (%rbx)     ## imm = 0x40A00000
 	callq	_exit
 	## Experiment 2: copy _two to _one field by field:
 	## 12 scalar loads (6 vmovss, 2 movw, 4 movb) followed by 12 scalar stores.
 	movq	_two@GOTPCREL(%rip), %r14
 	vmovss	(%r14), %xmm0
 	vmovss	4(%r14), %xmm1
 	vmovss	8(%r14), %xmm2
 	vmovss	12(%r14), %xmm3
 	vmovss	16(%r14), %xmm4
 	vmovss	20(%r14), %xmm5
 	movw	24(%r14), %si
 	movw	26(%r14), %di
 	movb	28(%r14), %r8b
 	movb	29(%r14), %al
 	movb	30(%r14), %cl
 	movb	31(%r14), %dl
 	movb	%dl, 31(%rbx)
 	movb	%cl, 30(%rbx)
 	movb	%al, 29(%rbx)
 	movb	%r8b, 28(%rbx)
 	movw	%di, 26(%rbx)
 	movw	%si, 24(%rbx)
 	vmovss	%xmm5, 20(%rbx)
 	vmovss	%xmm4, 16(%rbx)
 	vmovss	%xmm3, 12(%rbx)
 	vmovss	%xmm2, 8(%rbx)
 	vmovss	%xmm1, 4(%rbx)
 	vmovss	%xmm0, (%rbx)
 	callq	_exit
 	## Experiment 3: copy the stack struct slot 8(%rsp)..39(%rsp) to _one field
 	## by field. Nothing in this function writes that slot before it is read.
 	## No call follows, so the next experiment's store overwrites _one.
 	vmovss	8(%rsp), %xmm0
 	vmovss	12(%rsp), %xmm1
 	vmovss	16(%rsp), %xmm2
 	vmovss	20(%rsp), %xmm3
 	vmovss	24(%rsp), %xmm4
 	vmovss	28(%rsp), %xmm5
 	movw	32(%rsp), %si
 	movw	34(%rsp), %di
 	movb	36(%rsp), %r8b
 	movb	37(%rsp), %al
 	movb	38(%rsp), %cl
 	movb	39(%rsp), %dl
 	movb	%dl, 31(%rbx)
 	movb	%cl, 30(%rbx)
 	movb	%al, 29(%rbx)
 	movb	%r8b, 28(%rbx)
 	movw	%di, 26(%rbx)
 	movw	%si, 24(%rbx)
 	vmovss	%xmm5, 20(%rbx)
 	vmovss	%xmm4, 16(%rbx)
 	vmovss	%xmm3, 12(%rbx)
 	vmovss	%xmm2, 8(%rbx)
 	vmovss	%xmm1, 4(%rbx)
 	vmovss	%xmm0, (%rbx)
 	## Experiment 4: copy _two to _one with one 256-bit load and one 256-bit store.
 	## NOTE(review): vmovaps is the aligned form, so this assumes _two and _one
 	## are 32-byte aligned; their definitions are not visible here - confirm.
 	vmovaps	(%r14), %ymm0
 	vmovaps	%ymm0, (%rbx)
 	vzeroupper                      ## clear upper YMM halves before calling out
 	callq	_exit
 	## Experiment 5: as experiment 4, but add 1.0 to float element 0 first.
 	vmovaps	(%r14), %ymm0
 	vmovss	LCPI0_0(%rip), %xmm1
 	vaddss	%xmm1, %xmm0, %xmm1     ## xmm1 = {xmm0[0] + 1.0, xmm0[1..3]}
 	vinsertf128	$0, %xmm1, %ymm0, %ymm0 ## replace the low 128 bits
 	vmovaps	%ymm0, (%rbx)
 	vzeroupper
 	callq	_exit
 	## Experiment 6: element 0 += 1.0, then patch one word and one byte in the
 	## upper 128 bits. Offsets within the 32-byte value: word 6 of the upper
 	## half = bytes 28-29, byte 13 of the upper half = byte 29, so the byte
 	## insert replaces the high byte of the word written just before it.
 	vmovaps	(%r14), %ymm0
 	vmovss	LCPI0_0(%rip), %xmm1
 	vaddss	%xmm1, %xmm0, %xmm1
 	vinsertf128	$0, %xmm1, %ymm0, %ymm1
 	vextractf128	$1, %ymm0, %xmm0 ## xmm0 = upper half (bytes 16-31)
 	movl	$5, %eax
 	vpinsrw	$6, %eax, %xmm0, %xmm0
 	vinsertf128	$1, %xmm0, %ymm1, %ymm1
 	movl	$66, %eax
 	vpinsrb	$13, %eax, %xmm0, %xmm0
 	vinsertf128	$1, %xmm0, %ymm1, %ymm0
 	vmovaps	%ymm0, (%rbx)
 	vzeroupper
 	callq	_exit
 	## Experiment 7: assemble the constant struct in a register.
 	## Pack {25, 15} into one dword via vpshufb and spill it to (%rsp).
 	vmovdqa	LCPI0_1(%rip), %xmm0
 	vpshufb	LCPI0_2(%rip), %xmm0, %xmm0
 	vmovd	%xmm0, (%rsp)
 	## Rebuild the upper half as {3.0, 4.0, packed shorts, packed bytes}.
 	vmovss	LCPI0_3(%rip), %xmm0
 	vmovss	LCPI0_4(%rip), %xmm1
 	vunpcklps	%xmm0, %xmm1, %xmm0 ## xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 	vmovq	%xmm0, %xmm0            ## zero the upper 64 bits
 	vinsertps	$32, (%rsp), %xmm0, %xmm0 ## element 2 = packed shorts
 	vmovaps	LCPI0_5(%rip), %ymm1
 	vinsertf128	$1, %xmm0, %ymm1, %ymm1
 	## Pack {1, 2, 3, 4} into one dword via vpshufb and spill it to 4(%rsp).
 	vmovdqa	LCPI0_6(%rip), %xmm2
 	vpshufb	LCPI0_7(%rip), %xmm2, %xmm2
 	vmovd	%xmm2, 4(%rsp)
 	vinsertps	$48, 4(%rsp), %xmm0, %xmm0 ## element 3 = packed bytes
 	vinsertf128	$1, %xmm0, %ymm1, %ymm0
 	vmovaps	%ymm0, (%rbx)
 	vzeroupper
 	callq	_exit
 	## Experiment 8: start from LCPI0_5 and insert words/bytes into the upper
 	## half one at a time. Only the last vinsertf128 (into %ymm0) feeds the
 	## store; the earlier ones into %ymm1 are each superseded by the next.
 	## NOTE(review): offsets within the 32-byte value are word 5 -> bytes 26-27
 	## and word 6 -> bytes 28-29, while bytes 12..15 -> bytes 28..31. The byte
 	## inserts therefore overwrite the word holding 15, and bytes 24-25 keep
 	## the zero from LCPI0_5. This differs from what experiment 1 stores
 	## (25 at offset 24, 15 at offset 26).
 	vmovaps	LCPI0_5(%rip), %ymm1
 	vextractf128	$1, %ymm1, %xmm0
 	movl	$25, %eax
 	vpinsrw	$5, %eax, %xmm0, %xmm0
 	vinsertf128	$1, %xmm0, %ymm1, %ymm1
 	movl	$15, %eax
 	vpinsrw	$6, %eax, %xmm0, %xmm0
 	vinsertf128	$1, %xmm0, %ymm1, %ymm1
 	movl	$1, %eax
 	vpinsrb	$12, %eax, %xmm0, %xmm0
 	vinsertf128	$1, %xmm0, %ymm1, %ymm1
 	movl	$2, %eax
 	vpinsrb	$13, %eax, %xmm0, %xmm0
 	vinsertf128	$1, %xmm0, %ymm1, %ymm1
 	movl	$3, %eax
 	vpinsrb	$14, %eax, %xmm0, %xmm0
 	vinsertf128	$1, %xmm0, %ymm1, %ymm1
 	movl	$4, %eax
 	vpinsrb	$15, %eax, %xmm0, %xmm0
 	vinsertf128	$1, %xmm0, %ymm1, %ymm0
 	vmovaps	%ymm0, (%rbx)
 	vzeroupper
 	callq	_exit
 	## Epilogue: return 0, release the locals, restore the saved registers.
 	xorl	%eax, %eax
 	addq	$40, %rsp
 	popq	%rbx
 	popq	%r14
 	retq
 	.cfi_endproc


 .subsections_via_symbols
	; External declaration of exit(). @main calls it between experiments;
	; presumably it only serves as an opaque call separating them - TODO confirm.
	declare i32 @exit() nounwind

	; %athing occupies 32 bytes: six floats (bytes 0-23), two i16 (bytes 24-27)
	; and four i8 (bytes 28-31).
	; The table below gives, for every field, the 1-based position at which the
	; field ends when the struct is viewed as a vector of that element type.
	; Subtract 1 to obtain the 0-based lane expected by insertelement and
	; extractelement (e.g. the two i16 fields are <16 x i16> lanes 12 and 13).
	; float = 1, 2, 3, 4, 5, 6, 7, 8
	; i16 = 2, 4, 6, 8, 10, 12, 13, 14, 15, 16
	; i8 = 4, 8, 12, 16, 20, 24, 26, 28, 29, 30, 31, 32
	%athing = type { float, float, float, float, float, float, i16, i16, i8, i8, i8, i8 }

	; Both globals are defined in another module; only their type is known here.
	@one = external global %athing
	@two = external global %athing

	; @main: a series of independent code-generation experiments. Each one writes
	; an %athing to @one in a different way; most are followed by a call to @exit.
	; The "RESULT" notes record what the generated x86-64 code looked like.
	; Always returns 0.
	;
	; Vector lane indices are 0-based. Viewed as <16 x i16>, the two i16 fields
	; are lanes 12 and 13 (bytes 24-27); viewed as <32 x i8>, the four i8 fields
	; are lanes 28-31. Earlier revisions used i16 lanes 13/14, which spilled into
	; the i8 fields; the assembly listing must be regenerated to reflect the
	; corrected lanes.
	;
	; NOTE(review): the <8 x float> loads and stores carry no explicit alignment,
	; so they use the vector type's default alignment (the listing shows the
	; aligned vmovaps form). Confirm that the definitions of @one and @two are
	; aligned accordingly, or add an explicit `align` to each access.
	define i32 @main() {
	; Build the target value of @one with a single aggregate store
	store %athing { float 5.0, float 6.0,
	float 1.0, float 2.0,
	float 3.0, float 4.0,
	i16 25, i16 15,
	i8 1,
	i8 2, i8 3, i8 4 }, %athing* @one
	call i32 @exit()
	; RESULT: Very slow with 12 memory stores (one per field)

	; Copy @two into @one as an aggregate
	%a = load %athing* @two
	store %athing %a, %athing* @one
	call i32 @exit()
	; RESULT: 6 vector loads, then a lot of parsing, then 12 memory stores, some as vectors

	; See what the size of a sprite is (the alloca is deliberately left
	; uninitialised; only the stack layout is of interest)
	%gp = alloca %athing
	%g = load %athing* %gp
	store %athing %g, %athing* @one
	; RESULT: %rsp has 40 subtracted from it, but the struct starts at %rsp+8 for alignment

	; Try to make it use a wide vector op (exchange)
	%two_vector = bitcast %athing* @two to <8 x float>*
	%b = load <8 x float>* %two_vector
	%one_vector = bitcast %athing* @one to <8 x float>*
	store <8 x float> %b, <8 x float>* %one_vector
	call i32 @exit()
	; RESULT: 1 vector load and 1 vector store

	; Try to make it use a wide vector op, changing one of the floats
	%cv_orig = load <8 x float>* %two_vector
	%c_dx_orig = extractelement <8 x float> %cv_orig, i32 0
	%c_dx_change = fadd float %c_dx_orig, 1.0
	%cv_change = insertelement <8 x float> %cv_orig, float %c_dx_change, i32 0
	store <8 x float> %cv_change, <8 x float>* %one_vector
	call i32 @exit()
	; RESULT: 1 vector load, 1 vector load of the constant, an addition, then a single vector store

	; Try to use a vector op to change the shorts and bytes
	%dv_orig = load <8 x float>* %two_vector
	;; change the float
	%d_dx_orig = extractelement <8 x float> %dv_orig, i32 0
	%d_dx_change = fadd float %d_dx_orig, 1.0
	%dv_change0 = insertelement <8 x float> %dv_orig, float %d_dx_change, i32 0
	;; change the short: i16 lane 13 = bytes 26-27, the second i16 field
	;; (lane 14 would cover the first two i8 fields instead of a short)
	%dv_change0_s = bitcast <8 x float> %dv_change0 to <16 x i16>
	%dv_change1_s = insertelement <16 x i16> %dv_change0_s, i16 5, i32 13
	%dv_change1 = bitcast <16 x i16> %dv_change1_s to <8 x float>
	;; change a byte: i8 lane 29 = the second i8 field
	%dv_change1_b = bitcast <8 x float> %dv_change1 to <32 x i8>
	%dv_change2_b = insertelement <32 x i8> %dv_change1_b, i8 66, i32 29
	%dv_change2 = bitcast <32 x i8> %dv_change2_b to <8 x float>
	store <8 x float> %dv_change2, <8 x float>* %one_vector
	call i32 @exit()
	; RESULT: virtually equivalent to the above, but with some more vector manipulations

	; Build the target value of @one using vector ops: the two shorts and the
	; four bytes are each packed into one float-sized lane (lanes 6 and 7)
	%es_f = bitcast <2 x i16> <i16 25, i16 15> to float
	%ev_fs = insertelement <8 x float> <float 5.0, float 6.0, float 1.0, float 2.0, float 3.0, float 4.0, float 0.0, float 0.0 >, float %es_f, i32 6
	%eb_f = bitcast <4 x i8> <i8 1, i8 2, i8 3, i8 4> to float
	%ev_fsb = insertelement <8 x float> %ev_fs, float %eb_f, i32 7
	store <8 x float> %ev_fsb, <8 x float>* %one_vector
	call i32 @exit()
	; RESULT: lots of complicated constants loaded from memory before a single write

	; Build the target value of @one using bitcasting vector ops
	%fv_fs_pre = bitcast <8 x float> <float 5.0, float 6.0, float 1.0, float 2.0, float 3.0, float 4.0, float 0.0, float 0.0 > to <16 x i16>
	;; the i16 fields are lanes 12 and 13 (bytes 24-27)
	%fv_fs_post0 = insertelement <16 x i16> %fv_fs_pre, i16 25, i8 12
	%fv_fs_post1 = insertelement <16 x i16> %fv_fs_post0, i16 15, i8 13
	%fv_fsb_pre = bitcast <16 x i16> %fv_fs_post1 to <32 x i8>
	;; the i8 fields are lanes 28-31
	%fv_fsb_post0 = insertelement <32 x i8> %fv_fsb_pre, i8 1, i8 28
	%fv_fsb_post1 = insertelement <32 x i8> %fv_fsb_post0, i8 2, i8 29
	%fv_fsb_post2 = insertelement <32 x i8> %fv_fsb_post1, i8 3, i8 30
	%fv_fsb_post3 = insertelement <32 x i8> %fv_fsb_post2, i8 4, i8 31
	%fv_fsb = bitcast <32 x i8> %fv_fsb_post3 to <8 x float>
	store <8 x float> %fv_fsb, <8 x float>* %one_vector
	call i32 @exit()
	; RESULT: one constant for the floats, then a trivial sequence of inserts, then a single write

	ret i32 0
	}
	## Constant pool for _main (AT&T syntax, Mach-O sections).
	.section __TEXT,__text,regular,pure_instructions
	.macosx_version_min 14, 5
	.section __TEXT,__literal4,4byte_literals
	.align 2
	## Scalar float constants, stored as their IEEE-754 single-precision bit patterns.
	## LCPI0_0 (1.0) is the addend used by the two vaddss instructions in _main.
	LCPI0_0:
	.long 1065353216 ## float 1
	## LCPI0_3 (4.0) and LCPI0_4 (3.0) are interleaved by vunpcklps in _main.
	LCPI0_3:
	.long 1082130432 ## float 4
	LCPI0_4:
	.long 1077936128 ## float 3
	.section __TEXT,__literal16,16byte_literals
	.align 4
	## The values 25 and 15, one per 64-bit element.
	LCPI0_1:
	.quad 25 ## 0x19
	.quad 15 ## 0xf
	## vpshufb mask applied to LCPI0_1: selects source bytes 0,1 and 8,9 (the low
	## 16 bits of each quad) into the low 4 bytes; mask bytes with the high bit
	## set (0x80) produce zero.
	LCPI0_2:
	.byte 0 ## 0x0
	.byte 1 ## 0x1
	.byte 8 ## 0x8
	.byte 9 ## 0x9
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	## The values 1, 2, 3, 4, one per 32-bit element.
	LCPI0_6:
	.long 1 ## 0x1
	.long 2 ## 0x2
	.long 3 ## 0x3
	.long 4 ## 0x4
	## vpshufb mask applied to LCPI0_6: selects source bytes 0,4,8,12 (the low byte
	## of each dword) into the low 4 bytes and zeroes the rest.
	LCPI0_7:
	.byte 0 ## 0x0
	.byte 4 ## 0x4
	.byte 8 ## 0x8
	.byte 12 ## 0xc
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.byte 128 ## 0x80
	.section __TEXT,__const
	.align 5
	## 32-byte-aligned vector of eight floats {5, 6, 1, 2, 3, 4, 0, 0}; the two
	## trailing zero elements are filled in by the insert sequences in _main.
	LCPI0_5:
	.long 1084227584 ## float 5.000000e+00
	.long 1086324736 ## float 6.000000e+00
	.long 1065353216 ## float 1.000000e+00
	.long 1073741824 ## float 2.000000e+00
	.long 1077936128 ## float 3.000000e+00
	.long 1082130432 ## float 4.000000e+00
	.long 0 ## float 0.000000e+00
	.long 0 ## float 0.000000e+00
	.section __TEXT,__text,regular,pure_instructions
	.globl _main
	.align 4, 0x90
	##----------------------------------------------------------------------
	## _main: generated code for @main (AT&T syntax, x86-64, AVX).
	## Registers: %rbx = address of _one, %r14 = address of _two; both are
	##            saved in the prologue and restored in the epilogue, and keep
	##            their values across the calls to _exit.
	## Stack:     40 bytes of locals. 8(%rsp)..39(%rsp) is a 32-byte struct
	##            slot; (%rsp) and 4(%rsp) are 4-byte scratch slots.
	##            Two pushes + 40 bytes + return address = 64, matching the
	##            final .cfi_def_cfa_offset.
	## Returns:   0 in %eax.
	##----------------------------------------------------------------------
	_main: ## @main
	.cfi_startproc
	## BB#0:
	pushq %r14
	Ltmp0:
	.cfi_def_cfa_offset 16
	pushq %rbx
	Ltmp1:
	.cfi_def_cfa_offset 24
	subq $40, %rsp
	Ltmp2:
	.cfi_def_cfa_offset 64
	Ltmp3:
	.cfi_offset %rbx, -24
	Ltmp4:
	.cfi_offset %r14, -16
	## Experiment 1: fill _one with 12 immediate stores, highest offset first
	## (4 bytes, 2 words, then 6 dwords holding float bit patterns).
	movq _one@GOTPCREL(%rip), %rbx
	movb $4, 31(%rbx)
	movb $3, 30(%rbx)
	movb $2, 29(%rbx)
	movb $1, 28(%rbx)
	movw $15, 26(%rbx)
	movw $25, 24(%rbx)
	movl $1082130432, 20(%rbx) ## imm = 0x40800000
	movl $1077936128, 16(%rbx) ## imm = 0x40400000
	movl $1073741824, 12(%rbx) ## imm = 0x40000000
	movl $1065353216, 8(%rbx) ## imm = 0x3F800000
	movl $1086324736, 4(%rbx) ## imm = 0x40C00000
	movl $1084227584, (%rbx) ## imm = 0x40A00000
	callq _exit
	## Experiment 2: copy _two to _one field by field:
	## 12 scalar loads (6 vmovss, 2 movw, 4 movb) followed by 12 scalar stores.
	movq _two@GOTPCREL(%rip), %r14
	vmovss (%r14), %xmm0
	vmovss 4(%r14), %xmm1
	vmovss 8(%r14), %xmm2
	vmovss 12(%r14), %xmm3
	vmovss 16(%r14), %xmm4
	vmovss 20(%r14), %xmm5
	movw 24(%r14), %si
	movw 26(%r14), %di
	movb 28(%r14), %r8b
	movb 29(%r14), %al
	movb 30(%r14), %cl
	movb 31(%r14), %dl
	movb %dl, 31(%rbx)
	movb %cl, 30(%rbx)
	movb %al, 29(%rbx)
	movb %r8b, 28(%rbx)
	movw %di, 26(%rbx)
	movw %si, 24(%rbx)
	vmovss %xmm5, 20(%rbx)
	vmovss %xmm4, 16(%rbx)
	vmovss %xmm3, 12(%rbx)
	vmovss %xmm2, 8(%rbx)
	vmovss %xmm1, 4(%rbx)
	vmovss %xmm0, (%rbx)
	callq _exit
	## Experiment 3: copy the stack struct slot 8(%rsp)..39(%rsp) to _one field
	## by field. Nothing in this function writes that slot before it is read.
	## No call follows, so the next experiment's store overwrites _one.
	vmovss 8(%rsp), %xmm0
	vmovss 12(%rsp), %xmm1
	vmovss 16(%rsp), %xmm2
	vmovss 20(%rsp), %xmm3
	vmovss 24(%rsp), %xmm4
	vmovss 28(%rsp), %xmm5
	movw 32(%rsp), %si
	movw 34(%rsp), %di
	movb 36(%rsp), %r8b
	movb 37(%rsp), %al
	movb 38(%rsp), %cl
	movb 39(%rsp), %dl
	movb %dl, 31(%rbx)
	movb %cl, 30(%rbx)
	movb %al, 29(%rbx)
	movb %r8b, 28(%rbx)
	movw %di, 26(%rbx)
	movw %si, 24(%rbx)
	vmovss %xmm5, 20(%rbx)
	vmovss %xmm4, 16(%rbx)
	vmovss %xmm3, 12(%rbx)
	vmovss %xmm2, 8(%rbx)
	vmovss %xmm1, 4(%rbx)
	vmovss %xmm0, (%rbx)
	## Experiment 4: copy _two to _one with one 256-bit load and one 256-bit store.
	## NOTE(review): vmovaps is the aligned form, so this assumes _two and _one
	## are 32-byte aligned; their definitions are not visible here - confirm.
	vmovaps (%r14), %ymm0
	vmovaps %ymm0, (%rbx)
	## Clear the upper YMM halves before calling out.
	vzeroupper
	callq _exit
	## Experiment 5: as experiment 4, but add 1.0 to float element 0 first.
	## After vaddss, xmm1 = {xmm0[0] + 1.0, xmm0[1..3]}; vinsertf128 $0 then
	## replaces the low 128 bits of ymm0 with it.
	vmovaps (%r14), %ymm0
	vmovss LCPI0_0(%rip), %xmm1
	vaddss %xmm1, %xmm0, %xmm1
	vinsertf128 $0, %xmm1, %ymm0, %ymm0
	vmovaps %ymm0, (%rbx)
	vzeroupper
	callq _exit
	## Experiment 6: element 0 += 1.0, then patch one word and one byte in the
	## upper 128 bits (extracted into xmm0). Offsets within the 32-byte value:
	## word 6 of the upper half = bytes 28-29, byte 13 of the upper half =
	## byte 29, so the byte insert replaces the high byte of the word written
	## just before it.
	vmovaps (%r14), %ymm0
	vmovss LCPI0_0(%rip), %xmm1
	vaddss %xmm1, %xmm0, %xmm1
	vinsertf128 $0, %xmm1, %ymm0, %ymm1
	vextractf128 $1, %ymm0, %xmm0
	movl $5, %eax
	vpinsrw $6, %eax, %xmm0, %xmm0
	vinsertf128 $1, %xmm0, %ymm1, %ymm1
	movl $66, %eax
	vpinsrb $13, %eax, %xmm0, %xmm0
	vinsertf128 $1, %xmm0, %ymm1, %ymm0
	vmovaps %ymm0, (%rbx)
	vzeroupper
	callq _exit
	## Experiment 7: assemble the constant struct in a register.
	## Pack {25, 15} into one dword via vpshufb and spill it to (%rsp).
	vmovdqa LCPI0_1(%rip), %xmm0
	vpshufb LCPI0_2(%rip), %xmm0, %xmm0
	vmovd %xmm0, (%rsp)
	## Rebuild the upper half as {3.0, 4.0, packed shorts, packed bytes}:
	## vmovq zeroes the upper 64 bits, vinsertps $32 writes element 2.
	vmovss LCPI0_3(%rip), %xmm0
	vmovss LCPI0_4(%rip), %xmm1
	vunpcklps %xmm0, %xmm1, %xmm0 ## xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
	vmovq %xmm0, %xmm0
	vinsertps $32, (%rsp), %xmm0, %xmm0
	vmovaps LCPI0_5(%rip), %ymm1
	vinsertf128 $1, %xmm0, %ymm1, %ymm1
	## Pack {1, 2, 3, 4} into one dword via vpshufb, spill it to 4(%rsp), and
	## write it to element 3 with vinsertps $48.
	vmovdqa LCPI0_6(%rip), %xmm2
	vpshufb LCPI0_7(%rip), %xmm2, %xmm2
	vmovd %xmm2, 4(%rsp)
	vinsertps $48, 4(%rsp), %xmm0, %xmm0
	vinsertf128 $1, %xmm0, %ymm1, %ymm0
	vmovaps %ymm0, (%rbx)
	vzeroupper
	callq _exit
	## Experiment 8: start from LCPI0_5 and insert words/bytes into the upper
	## half one at a time. Only the last vinsertf128 (into %ymm0) feeds the
	## store; the earlier ones into %ymm1 are each superseded by the next.
	## NOTE(review): offsets within the 32-byte value are word 5 -> bytes 26-27
	## and word 6 -> bytes 28-29, while bytes 12..15 -> bytes 28..31. The byte
	## inserts therefore overwrite the word holding 15, and bytes 24-25 keep
	## the zero from LCPI0_5. This differs from what experiment 1 stores
	## (25 at offset 24, 15 at offset 26).
	vmovaps LCPI0_5(%rip), %ymm1
	vextractf128 $1, %ymm1, %xmm0
	movl $25, %eax
	vpinsrw $5, %eax, %xmm0, %xmm0
	vinsertf128 $1, %xmm0, %ymm1, %ymm1
	movl $15, %eax
	vpinsrw $6, %eax, %xmm0, %xmm0
	vinsertf128 $1, %xmm0, %ymm1, %ymm1
	movl $1, %eax
	vpinsrb $12, %eax, %xmm0, %xmm0
	vinsertf128 $1, %xmm0, %ymm1, %ymm1
	movl $2, %eax
	vpinsrb $13, %eax, %xmm0, %xmm0
	vinsertf128 $1, %xmm0, %ymm1, %ymm1
	movl $3, %eax
	vpinsrb $14, %eax, %xmm0, %xmm0
	vinsertf128 $1, %xmm0, %ymm1, %ymm1
	movl $4, %eax
	vpinsrb $15, %eax, %xmm0, %xmm0
	vinsertf128 $1, %xmm0, %ymm1, %ymm0
	vmovaps %ymm0, (%rbx)
	vzeroupper
	callq _exit
	## Epilogue: return 0, release the locals, restore the saved registers.
	xorl %eax, %eax
	addq $40, %rsp
	popq %rbx
	popq %r14
	retq
	.cfi_endproc


	.subsections_via_symbols