Skip to content

Instantly share code, notes, and snippets.

@Narsil
Created August 1, 2023 16:54
Show Gist options
  • Save Narsil/225fc9af3bf771655c2dda93346e166b to your computer and use it in GitHub Desktop.
Save Narsil/225fc9af3bf771655c2dda93346e166b to your computer and use it in GitHub Desktop.
; Forces the lazily-initialised GEMM static.
; Loads the word at LAZY+8 with acquire semantics; when it is not 3 the
; function calls std::sys_common::once::queue::Once::call on LAZY+8, passing a
; pointer (in x2) to a small on-stack chain that leads back to LAZY's address.
; Frame (48 bytes): [sp,#8] = &LAZY, [sp,#16] = &[sp,#8], [sp,#24] = &[sp,#16],
; [sp,#32] = saved x29/x30.
.globl <gemm_f16::gemm::f16::GEMM as lazy_static::LazyStatic>::initialize
.p2align 2
<gemm_f16::gemm::f16::GEMM as lazy_static::LazyStatic>::initialize:
Lfunc_begin18:
.cfi_startproc
sub sp, sp, #48
.cfi_def_cfa_offset 48
stp x29, x30, [sp, #32]
add x29, sp, #32
.cfi_def_cfa w29, 16
.cfi_offset w30, -8
.cfi_offset w29, -16
Lloh101:
adrp x9, <gemm_f16::gemm::f16::GEMM as core::ops::deref::Deref>::deref::__stability::LAZY@PAGE
Lloh102:
add x9, x9, <gemm_f16::gemm::f16::GEMM as core::ops::deref::Deref>::deref::__stability::LAZY@PAGEOFF
str x9, [sp, #8]
; Acquire-load the state word that lives 8 bytes into LAZY.
add x8, x9, #8
ldapr x8, [x8]
cmp x8, #3
b.eq Linit18_done
; Slow path: state != 3, hand the work to Once::call.
add x9, sp, #8
add x8, sp, #16
str x9, [sp, #16]
; [x29,#-8] is [sp,#24]; x2 below points at this slot.
str x8, [sp, #24]
mov w1, wzr
sub x2, x29, #8
Lloh105:
adrp x3, l___unnamed_3@PAGE
Lloh106:
add x3, x3, l___unnamed_3@PAGEOFF
Lloh107:
adrp x4, l___unnamed_25@PAGE
Lloh108:
add x4, x4, l___unnamed_25@PAGEOFF
Lloh103:
adrp x0, <gemm_f16::gemm::f16::GEMM as core::ops::deref::Deref>::deref::__stability::LAZY@PAGE+8
Lloh104:
add x0, x0, <gemm_f16::gemm::f16::GEMM as core::ops::deref::Deref>::deref::__stability::LAZY@PAGEOFF+8
bl std::sys_common::once::queue::Once::call
Linit18_done:
; Common epilogue for both paths.
.cfi_def_cfa wsp, 48
ldp x29, x30, [sp, #32]
add sp, sp, #48
.cfi_def_cfa_offset 0
.cfi_restore w30
.cfi_restore w29
ret
.loh AdrpAdd Lloh101, Lloh102
.loh AdrpAdd Lloh105, Lloh106
.loh AdrpAdd Lloh107, Lloh108
.loh AdrpAdd Lloh103, Lloh104
Lfunc_end18:
.cfi_endproc
.p2align 2
; gemm_f16::microkernel::neon::f16::x1x1 (compiler-generated AArch64 NEON, fp16).
; Keeps one 8-lane f16 accumulator (v0). Each of the x2 steps loads 16 bytes
; from the x4 stream, broadcasts one halfword from the x5 stream and does
; v0 += vec * broadcast. The result is then combined with memory at x3.
; Registers as used below:
;   x0, x1   inner / outer trip counts of the scalar write-back loops; the
;            vector write-back needs x0 == 8 and x1 == 1
;   x2       number of accumulation steps
;   x3       destination pointer
;   x4, x5   the two source streams
;   x6, x7   destination strides in halfwords (outer / inner); x7 must be 1
;            for the vector write-back
; Caller stack (offsets valid after the 16-byte frame is allocated):
;   [sp,#16], [sp,#24]   per-step strides in halfwords for the x4 / x5 streams
;   [sp,#40], [sp,#42]   f16 factors for the old destination / the accumulator
;   [sp,#44]             mode byte: 2 = scale old dst then add, 1 = add to dst,
;                        anything else = overwrite dst
; NOTE(review): these presumably map to m, n, k, dst, lhs, rhs, dst_cs, dst_rs,
; lhs_cs, rhs_rs, alpha, beta, alpha_status of the gemm microkernel ABI — confirm.
; Fix: the listing referenced LBB19_6 without defining it; the label is
; restored at the first instruction after the odd-step tail.
gemm_f16::microkernel::neon::f16::x1x1:
Lfunc_begin19:
.cfi_startproc
sub sp, sp, #16
.cfi_def_cfa_offset 16
movi.2d v0, #0000000000000000
cmp x2, #2
b.hs LBB19_2
; Fewer than two steps: only the odd tail can run.
mov x8, x5
b LBB19_4
LBB19_2:
; x9 = x2 / 2 loop trips; x10 / x11 = byte strides of the x4 / x5 streams,
; x12 = two x5 steps in bytes.
ldp x10, x8, [sp, #16]
lsr x9, x2, #1
lsl x10, x10, #1
lsl x11, x8, #1
lsl x12, x11, #1
mov x8, x5
LBB19_3:
; Main loop, two accumulation steps per trip.
ld1r.8h { v1 }, [x8], x12
ldr q2, [x4]
; InlineAsm Start
fmla.8h v0, v2, v1
; InlineAsm End
add x13, x5, x11
ld1r.8h { v1 }, [x13]
ldr q2, [x4, x10]
; InlineAsm Start
fmla.8h v0, v2, v1
; InlineAsm End
add x4, x4, x10, lsl #1
mov x5, x8
subs x9, x9, #1
b.ne LBB19_3
LBB19_4:
; Odd step count (bit 0 of x2 set): one more step, otherwise skip it.
tbz w2, #0, LBB19_6
ld1r.8h { v1 }, [x8]
ldr q2, [x4]
; InlineAsm Start
fmla.8h v0, v2, v1
; InlineAsm End
LBB19_6:
; Write-back: w8 = accumulator factor, w9 = old-destination factor, w10 = mode.
ldrh w8, [sp, #42]
ldrh w9, [sp, #40]
ldrb w10, [sp, #44]
cmp x0, #8
b.ne LBB19_12
cmp x1, #1
b.ne LBB19_12
cmp x7, #1
b.ne LBB19_12
; Vector write-back: one full 8-lane store at x3.
dup.8h v1, w8
and w8, w10, #0xff
cmp w8, #1
b.eq LBB19_20
cmp w8, #2
b.ne LBB19_21
; Mode 2: dst = w9 * dst + w8 * acc.
dup.8h v2, w9
ldr q3, [x3]
; InlineAsm Start
fmul.8h v4, v2, v3
; InlineAsm End
; InlineAsm Start
fmul.8h v2, v1, v0
; InlineAsm End
; InlineAsm Start
fadd.8h v0, v4, v2
; InlineAsm End
str q0, [x3]
b LBB19_35
LBB19_12:
; Scalar write-back: spill the accumulator and walk it one halfword at a time.
str q0, [sp]
and w10, w10, #0xff
cmp w10, #2
b.eq LBB19_23
cmp w10, #1
b.ne LBB19_29
; Mode 1, scalar: dst += w8 * acc.
cbz x1, LBB19_35
cbz x0, LBB19_35
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB19_17:
; Outer loop: x1 trips, 16 spilled bytes per trip, dst advances by x6 * 2.
mov x13, x0
mov x14, x3
mov x15, x12
LBB19_18:
; Inner loop: x0 trips, dst advances by x7 * 2.
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x14]
; InlineAsm Start
fadd h1, h0, h2
; InlineAsm End
str h1, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB19_18
add x9, x9, #1
add x12, x12, #16
add x3, x3, x10
cmp x9, x1
b.ne LBB19_17
b LBB19_35
LBB19_20:
; Mode 1, vector: dst += w8 * acc.
ldr q2, [x3]
; InlineAsm Start
fmla.8h v2, v1, v0
; InlineAsm End
b LBB19_22
LBB19_21:
; Any other mode, vector: dst = w8 * acc.
; InlineAsm Start
fmul.8h v2, v1, v0
; InlineAsm End
LBB19_22:
str q2, [x3]
b LBB19_35
LBB19_23:
; Mode 2, scalar: dst = w9 * dst + w8 * acc.
cbz x1, LBB19_35
cbz x0, LBB19_35
mov x10, #0
lsl x11, x6, #1
lsl x12, x7, #1
mov x13, sp
LBB19_26:
mov x14, x0
mov x15, x3
mov x16, x13
LBB19_27:
ldr h0, [x15]
fmov s1, w9
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x16], #2
fmov s1, w8
; InlineAsm Start
fmul h3, h1, h0
; InlineAsm End
; InlineAsm Start
fadd h0, h2, h3
; InlineAsm End
str h0, [x15]
add x15, x15, x12
subs x14, x14, #1
b.ne LBB19_27
add x10, x10, #1
add x13, x13, #16
add x3, x3, x11
cmp x10, x1
b.ne LBB19_26
b LBB19_35
LBB19_29:
; Any other mode, scalar: dst = w8 * acc.
cbz x1, LBB19_35
cbz x0, LBB19_35
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB19_32:
mov x13, x0
mov x14, x3
mov x15, x12
LBB19_33:
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
str h2, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB19_33
add x9, x9, #1
add x12, x12, #16
add x3, x3, x10
cmp x9, x1
b.ne LBB19_32
LBB19_35:
; Common exit: release the 16-byte frame.
add sp, sp, #16
.cfi_def_cfa_offset 0
ret
Lfunc_end19:
.cfi_endproc
.p2align 2
; gemm_f16::microkernel::neon::f16::x1x2 (compiler-generated AArch64 NEON, fp16).
; Two 8-lane f16 accumulators: v1 (stored at x3) and v0 (stored at x3 + x6*2).
; Each of the x2 steps loads 16 bytes from the x4 stream and multiplies them by
; two halfwords broadcast from the x5 stream, [x5] and [x5 + 2*[sp,#48]].
; Registers as used below:
;   x0, x1   inner / outer trip counts of the scalar write-back loops; the
;            vector write-back needs x0 == 8 and x1 == 2
;   x2       number of accumulation steps
;   x3       destination pointer
;   x4, x5   the two source streams
;   x6, x7   destination strides in halfwords (outer / inner); x7 must be 1
;            for the vector write-back
; Caller stack (offsets valid after the 32-byte frame is allocated):
;   [sp,#32], [sp,#40]   per-step strides in halfwords for the x4 / x5 streams
;   [sp,#48]             halfword distance between the two broadcast elements
;   [sp,#56], [sp,#58]   f16 factors for the old destination / the accumulators
;   [sp,#60]             mode byte: 2 = scale old dst then add, 1 = add to dst,
;                        anything else = overwrite dst
; NOTE(review): presumably the gemm microkernel ABI (m, n, k, dst, lhs, rhs,
; dst_cs, dst_rs, lhs_cs, rhs_rs, rhs_cs, alpha, beta, alpha_status) — confirm.
; Fix: the listing referenced LBB20_6 without defining it; the label is
; restored at the first instruction after the odd-step tail.
gemm_f16::microkernel::neon::f16::x1x2:
Lfunc_begin20:
.cfi_startproc
sub sp, sp, #32
.cfi_def_cfa_offset 32
ldr x8, [sp, #48]
movi.2d v0, #0000000000000000
; x8 = byte distance between the two broadcast elements of one step.
lsl x8, x8, #1
cmp x2, #2
b.hs LBB20_2
; Fewer than two steps: only the odd tail can run.
movi.2d v1, #0000000000000000
mov x9, x5
b LBB20_4
LBB20_2:
; x10 = x2 / 2 loop trips; x11 / x12 = byte strides of the x4 / x5 streams.
ldp x11, x9, [sp, #32]
lsr x10, x2, #1
lsl x11, x11, #1
lsl x12, x9, #1
add x13, x8, x12
lsl x14, x12, #1
movi.2d v1, #0000000000000000
mov x9, x5
LBB20_3:
; Main loop, two accumulation steps per trip.
ld1r.8h { v2 }, [x9], x14
ldr q3, [x4]
; InlineAsm Start
fmla.8h v1, v3, v2
; InlineAsm End
add x15, x5, x8
ld1r.8h { v2 }, [x15]
; InlineAsm Start
fmla.8h v0, v3, v2
; InlineAsm End
add x15, x5, x12
ld1r.8h { v2 }, [x15]
ldr q3, [x4, x11]
; InlineAsm Start
fmla.8h v1, v3, v2
; InlineAsm End
add x15, x5, x13
ld1r.8h { v2 }, [x15]
; InlineAsm Start
fmla.8h v0, v3, v2
; InlineAsm End
add x4, x4, x11, lsl #1
mov x5, x9
subs x10, x10, #1
b.ne LBB20_3
LBB20_4:
; Odd step count (bit 0 of x2 set): one more step, otherwise skip it.
tbz w2, #0, LBB20_6
ld1r.8h { v2 }, [x9], x8
ldr q3, [x4]
; InlineAsm Start
fmla.8h v1, v3, v2
; InlineAsm End
ld1r.8h { v2 }, [x9]
; InlineAsm Start
fmla.8h v0, v3, v2
; InlineAsm End
LBB20_6:
; Write-back: w8 = accumulator factor, w9 = old-destination factor, w10 = mode.
ldrh w8, [sp, #58]
ldrh w9, [sp, #56]
ldrb w10, [sp, #60]
cmp x0, #8
b.ne LBB20_12
cmp x1, #2
b.ne LBB20_12
cmp x7, #1
b.ne LBB20_12
; Vector write-back: two 8-lane stores, x6 * 2 bytes apart.
dup.8h v2, w8
and w8, w10, #0xff
cmp w8, #1
b.eq LBB20_20
cmp w8, #2
b.ne LBB20_21
; Mode 2: dst = w9 * dst + w8 * acc.
dup.8h v3, w9
ldr q4, [x3]
; InlineAsm Start
fmul.8h v5, v3, v4
; InlineAsm End
; InlineAsm Start
fmul.8h v4, v2, v1
; InlineAsm End
; InlineAsm Start
fadd.8h v1, v5, v4
; InlineAsm End
str q1, [x3]
lsl x8, x6, #1
ldr q1, [x3, x8]
; InlineAsm Start
fmul.8h v4, v3, v1
; InlineAsm End
; InlineAsm Start
fmul.8h v1, v2, v0
; InlineAsm End
; InlineAsm Start
fadd.8h v0, v4, v1
; InlineAsm End
str q0, [x3, x8]
b LBB20_35
LBB20_12:
; Scalar write-back: spill both accumulators (v1 first) and walk them one
; halfword at a time.
stp q1, q0, [sp]
and w10, w10, #0xff
cmp w10, #2
b.eq LBB20_23
cmp w10, #1
b.ne LBB20_29
; Mode 1, scalar: dst += w8 * acc.
cbz x1, LBB20_35
cbz x0, LBB20_35
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB20_17:
; Outer loop: x1 trips, 16 spilled bytes per trip, dst advances by x6 * 2.
mov x13, x0
mov x14, x3
mov x15, x12
LBB20_18:
; Inner loop: x0 trips, dst advances by x7 * 2.
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x14]
; InlineAsm Start
fadd h1, h0, h2
; InlineAsm End
str h1, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB20_18
add x9, x9, #1
add x12, x12, #16
add x3, x3, x10
cmp x9, x1
b.ne LBB20_17
b LBB20_35
LBB20_20:
; Mode 1, vector: dst += w8 * acc.
ldr q3, [x3]
; InlineAsm Start
fmla.8h v3, v2, v1
; InlineAsm End
str q3, [x3]
lsl x8, x6, #1
ldr q1, [x3, x8]
; InlineAsm Start
fmla.8h v1, v2, v0
; InlineAsm End
b LBB20_22
LBB20_21:
; Any other mode, vector: dst = w8 * acc.
; InlineAsm Start
fmul.8h v3, v2, v1
; InlineAsm End
str q3, [x3]
lsl x8, x6, #1
; InlineAsm Start
fmul.8h v1, v2, v0
; InlineAsm End
LBB20_22:
str q1, [x3, x8]
b LBB20_35
LBB20_23:
; Mode 2, scalar: dst = w9 * dst + w8 * acc.
cbz x1, LBB20_35
cbz x0, LBB20_35
mov x10, #0
lsl x11, x6, #1
lsl x12, x7, #1
mov x13, sp
LBB20_26:
mov x14, x0
mov x15, x3
mov x16, x13
LBB20_27:
ldr h0, [x15]
fmov s1, w9
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x16], #2
fmov s1, w8
; InlineAsm Start
fmul h3, h1, h0
; InlineAsm End
; InlineAsm Start
fadd h0, h2, h3
; InlineAsm End
str h0, [x15]
add x15, x15, x12
subs x14, x14, #1
b.ne LBB20_27
add x10, x10, #1
add x13, x13, #16
add x3, x3, x11
cmp x10, x1
b.ne LBB20_26
b LBB20_35
LBB20_29:
; Any other mode, scalar: dst = w8 * acc.
cbz x1, LBB20_35
cbz x0, LBB20_35
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB20_32:
mov x13, x0
mov x14, x3
mov x15, x12
LBB20_33:
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
str h2, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB20_33
add x9, x9, #1
add x12, x12, #16
add x3, x3, x10
cmp x9, x1
b.ne LBB20_32
LBB20_35:
; Common exit: release the 32-byte frame.
add sp, sp, #32
.cfi_def_cfa_offset 0
ret
Lfunc_end20:
.cfi_endproc
.p2align 2
; gemm_f16::microkernel::neon::f16::x1x3 (compiler-generated AArch64 NEON, fp16).
; Three 8-lane f16 accumulators: v2 (stored at x3), v1 (x3 + x6*2) and
; v0 (x3 + x6*4). Each of the x2 steps loads 16 bytes from the x4 stream and
; multiplies them by three halfwords broadcast from the x5 stream, spaced
; [sp,#80] halfwords apart.
; Registers as used below:
;   x0, x1   inner / outer trip counts of the scalar write-back loops; the
;            vector write-back needs x0 == 8 and x1 == 3
;   x2       number of accumulation steps
;   x3       destination pointer
;   x4, x5   the two source streams
;   x6, x7   destination strides in halfwords (outer / inner); x7 must be 1
;            for the vector write-back
; Caller stack (offsets valid after the 64-byte frame is allocated):
;   [sp,#64], [sp,#72]   per-step strides in halfwords for the x4 / x5 streams
;   [sp,#80]             halfword distance between adjacent broadcast elements
;   [sp,#88], [sp,#90]   f16 factors for the old destination / the accumulators
;   [sp,#92]             mode byte: 2 = scale old dst then add, 1 = add to dst,
;                        anything else = overwrite dst
; x19/x20 are saved in the frame because x19 is used as scratch in the main loop.
; NOTE(review): presumably the gemm microkernel ABI (m, n, k, dst, lhs, rhs,
; dst_cs, dst_rs, lhs_cs, rhs_rs, rhs_cs, alpha, beta, alpha_status) — confirm.
; Fix: the listing referenced LBB21_6 without defining it; the label is
; restored at the first instruction after the odd-step tail.
gemm_f16::microkernel::neon::f16::x1x3:
Lfunc_begin21:
.cfi_startproc
sub sp, sp, #64
.cfi_def_cfa_offset 64
stp x20, x19, [sp, #48]
.cfi_offset w19, -8
.cfi_offset w20, -16
ldr x8, [sp, #80]
movi.2d v0, #0000000000000000
; x9 = byte offset of the third broadcast element (2 elements * 2 bytes).
lsl x9, x8, #2
cmp x2, #2
b.hs LBB21_2
; Fewer than two steps: only the odd tail can run.
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
mov x10, x5
b LBB21_4
LBB21_2:
; x11 = x2 / 2 loop trips; x13 / x14 = byte strides of the x4 / x5 streams;
; x12 = byte offset of the second broadcast element.
lsr x11, x2, #1
ldp x13, x10, [sp, #64]
lsl x12, x8, #1
lsl x13, x13, #1
lsl x14, x10, #1
add x15, x9, x14
add x16, x12, x14
lsl x17, x14, #1
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
mov x10, x5
LBB21_3:
; Main loop, two accumulation steps per trip.
ld1r.8h { v3 }, [x10], x17
ldr q4, [x4]
; InlineAsm Start
fmla.8h v2, v4, v3
; InlineAsm End
add x19, x5, x12
ld1r.8h { v3 }, [x19]
; InlineAsm Start
fmla.8h v1, v4, v3
; InlineAsm End
add x19, x5, x9
ld1r.8h { v3 }, [x19]
; InlineAsm Start
fmla.8h v0, v4, v3
; InlineAsm End
add x19, x5, x14
ld1r.8h { v3 }, [x19]
ldr q4, [x4, x13]
; InlineAsm Start
fmla.8h v2, v4, v3
; InlineAsm End
add x19, x5, x16
ld1r.8h { v3 }, [x19]
; InlineAsm Start
fmla.8h v1, v4, v3
; InlineAsm End
add x5, x5, x15
ld1r.8h { v3 }, [x5]
; InlineAsm Start
fmla.8h v0, v4, v3
; InlineAsm End
add x4, x4, x13, lsl #1
mov x5, x10
subs x11, x11, #1
b.ne LBB21_3
LBB21_4:
; Odd step count (bit 0 of x2 set): one more step, otherwise skip it.
tbz w2, #0, LBB21_6
mov x11, x10
ld1r.8h { v3 }, [x11], x9
ldr q4, [x4]
; InlineAsm Start
fmla.8h v2, v4, v3
; InlineAsm End
add x8, x10, x8, lsl #1
ld1r.8h { v3 }, [x8]
; InlineAsm Start
fmla.8h v1, v4, v3
; InlineAsm End
ld1r.8h { v3 }, [x11]
; InlineAsm Start
fmla.8h v0, v4, v3
; InlineAsm End
LBB21_6:
; Write-back: w8 = accumulator factor, w9 = old-destination factor, w10 = mode.
ldrh w8, [sp, #90]
ldrh w9, [sp, #88]
ldrb w10, [sp, #92]
cmp x0, #8
b.ne LBB21_12
cmp x1, #3
b.ne LBB21_12
cmp x7, #1
b.ne LBB21_12
; Vector write-back: three 8-lane stores at x3, x3 + x6*2 and x3 + x6*4.
dup.8h v3, w8
and w8, w10, #0xff
cmp w8, #1
b.eq LBB21_20
cmp w8, #2
b.ne LBB21_21
; Mode 2: dst = w9 * dst + w8 * acc.
dup.8h v4, w9
ldr q5, [x3]
; InlineAsm Start
fmul.8h v6, v4, v5
; InlineAsm End
; InlineAsm Start
fmul.8h v5, v3, v2
; InlineAsm End
; InlineAsm Start
fadd.8h v2, v6, v5
; InlineAsm End
str q2, [x3]
lsl x8, x6, #1
ldr q2, [x3, x8]
; InlineAsm Start
fmul.8h v5, v4, v2
; InlineAsm End
; InlineAsm Start
fmul.8h v2, v3, v1
; InlineAsm End
; InlineAsm Start
fadd.8h v1, v5, v2
; InlineAsm End
str q1, [x3, x8]
lsl x8, x6, #2
ldr q1, [x3, x8]
; InlineAsm Start
fmul.8h v2, v4, v1
; InlineAsm End
; InlineAsm Start
fmul.8h v1, v3, v0
; InlineAsm End
; InlineAsm Start
fadd.8h v0, v2, v1
; InlineAsm End
str q0, [x3, x8]
b LBB21_35
LBB21_12:
; Scalar write-back: spill the accumulators (v2, v1, v0 in that order) and
; walk them one halfword at a time.
stp q2, q1, [sp]
str q0, [sp, #32]
and w10, w10, #0xff
cmp w10, #2
b.eq LBB21_23
cmp w10, #1
b.ne LBB21_29
; Mode 1, scalar: dst += w8 * acc.
cbz x1, LBB21_35
cbz x0, LBB21_35
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB21_17:
; Outer loop: x1 trips, 16 spilled bytes per trip, dst advances by x6 * 2.
mov x13, x0
mov x14, x3
mov x15, x12
LBB21_18:
; Inner loop: x0 trips, dst advances by x7 * 2.
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x14]
; InlineAsm Start
fadd h1, h0, h2
; InlineAsm End
str h1, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB21_18
add x9, x9, #1
add x12, x12, #16
add x3, x3, x10
cmp x9, x1
b.ne LBB21_17
b LBB21_35
LBB21_20:
; Mode 1, vector: dst += w8 * acc.
ldr q4, [x3]
; InlineAsm Start
fmla.8h v4, v3, v2
; InlineAsm End
str q4, [x3]
lsl x8, x6, #1
ldr q2, [x3, x8]
; InlineAsm Start
fmla.8h v2, v3, v1
; InlineAsm End
str q2, [x3, x8]
lsl x8, x6, #2
ldr q1, [x3, x8]
; InlineAsm Start
fmla.8h v1, v3, v0
; InlineAsm End
b LBB21_22
LBB21_21:
; Any other mode, vector: dst = w8 * acc.
; InlineAsm Start
fmul.8h v4, v3, v2
; InlineAsm End
str q4, [x3]
lsl x8, x6, #1
; InlineAsm Start
fmul.8h v2, v3, v1
; InlineAsm End
str q2, [x3, x8]
lsl x8, x6, #2
; InlineAsm Start
fmul.8h v1, v3, v0
; InlineAsm End
LBB21_22:
str q1, [x3, x8]
b LBB21_35
LBB21_23:
; Mode 2, scalar: dst = w9 * dst + w8 * acc.
cbz x1, LBB21_35
cbz x0, LBB21_35
mov x10, #0
lsl x11, x6, #1
lsl x12, x7, #1
mov x13, sp
LBB21_26:
mov x14, x0
mov x15, x3
mov x16, x13
LBB21_27:
ldr h0, [x15]
fmov s1, w9
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x16], #2
fmov s1, w8
; InlineAsm Start
fmul h3, h1, h0
; InlineAsm End
; InlineAsm Start
fadd h0, h2, h3
; InlineAsm End
str h0, [x15]
add x15, x15, x12
subs x14, x14, #1
b.ne LBB21_27
add x10, x10, #1
add x13, x13, #16
add x3, x3, x11
cmp x10, x1
b.ne LBB21_26
b LBB21_35
LBB21_29:
; Any other mode, scalar: dst = w8 * acc.
cbz x1, LBB21_35
cbz x0, LBB21_35
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB21_32:
mov x13, x0
mov x14, x3
mov x15, x12
LBB21_33:
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
str h2, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB21_33
add x9, x9, #1
add x12, x12, #16
add x3, x3, x10
cmp x9, x1
b.ne LBB21_32
LBB21_35:
; Common exit: restore x19/x20 and release the 64-byte frame.
ldp x20, x19, [sp, #48]
add sp, sp, #64
.cfi_def_cfa_offset 0
.cfi_restore w19
.cfi_restore w20
ret
Lfunc_end21:
.cfi_endproc
.p2align 2
; gemm_f16::microkernel::neon::f16::x1x4 (compiler-generated AArch64 NEON, fp16).
; Four 8-lane f16 accumulators: v3 (stored at x3), v2 (x3 + x6*2),
; v1 (x3 + x6*4) and v0 (x3 + x6*6). Each of the x2 steps loads 16 bytes from
; the x4 stream and multiplies them by four halfwords taken from the x5 stream.
; Two accumulation variants exist:
;   - [sp,#112] == 1: the four halfwords are contiguous, so 16 bytes are loaded
;     from x5 and lanes 0..3 are broadcast (LBB22_5 .. LBB22_7);
;   - otherwise: each halfword is loaded with ld1r at a multiple of
;     [sp,#112] * 2 bytes (LBB22_9 .. LBB22_11).
; Registers as used below:
;   x0, x1   inner / outer trip counts of the scalar write-back loops; the
;            vector write-back needs x0 == 8 and x1 == 4
;   x2       number of accumulation steps
;   x3       destination pointer
;   x4, x5   the two source streams
;   x6, x7   destination strides in halfwords (outer / inner); x7 must be 1
;            for the vector write-back
; Caller stack (offsets valid after the 96-byte frame is allocated):
;   [sp,#96], [sp,#104]  per-step strides in halfwords for the x4 / x5 streams
;   [sp,#112]            halfword distance between adjacent broadcast elements
;   [sp,#120], [sp,#122] f16 factors for the old destination / the accumulators
;   [sp,#124]            mode byte: 2 = scale old dst then add, 1 = add to dst,
;                        anything else = overwrite dst
; NOTE(review): presumably the gemm microkernel ABI (m, n, k, dst, lhs, rhs,
; dst_cs, dst_rs, lhs_cs, rhs_rs, rhs_cs, alpha, beta, alpha_status) — confirm.
gemm_f16::microkernel::neon::f16::x1x4:
Lfunc_begin22:
.cfi_startproc
sub sp, sp, #96
.cfi_def_cfa_offset 96
stp x22, x21, [sp, #64]
stp x20, x19, [sp, #80]
.cfi_offset w19, -8
.cfi_offset w20, -16
.cfi_offset w21, -24
.cfi_offset w22, -32
ldp x11, x9, [sp, #104]
ldr x12, [sp, #96]
; x8 = number of two-step loop trips.
lsr x8, x2, #1
movi.2d v0, #0000000000000000
; Pick the contiguous variant when the element distance is exactly 1.
cmp x9, #1
b.ne LBB22_3
cmp x2, #2
b.hs LBB22_5
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
b LBB22_7
LBB22_3:
; Strided variant: x10 = byte distance between adjacent broadcast elements.
lsl x10, x9, #1
cmp x2, #2
b.hs LBB22_9
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
mov x11, x5
b LBB22_11
LBB22_5:
; Contiguous variant setup: x9 / x10 = byte strides of the x4 / x5 streams.
lsl x9, x12, #1
lsl x10, x11, #1
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
LBB22_6:
; Contiguous main loop, two accumulation steps per trip.
ldr q4, [x5]
dup.8h v5, v4[0]
ldr q6, [x4]
; InlineAsm Start
fmla.8h v3, v6, v5
; InlineAsm End
dup.8h v5, v4[1]
; InlineAsm Start
fmla.8h v2, v6, v5
; InlineAsm End
dup.8h v5, v4[2]
; InlineAsm Start
fmla.8h v1, v6, v5
; InlineAsm End
dup.8h v4, v4[3]
; InlineAsm Start
fmla.8h v0, v6, v4
; InlineAsm End
ldr q4, [x5, x10]
dup.8h v5, v4[0]
ldr q6, [x4, x9]
; InlineAsm Start
fmla.8h v3, v6, v5
; InlineAsm End
dup.8h v5, v4[1]
; InlineAsm Start
fmla.8h v2, v6, v5
; InlineAsm End
dup.8h v5, v4[2]
; InlineAsm Start
fmla.8h v1, v6, v5
; InlineAsm End
dup.8h v4, v4[3]
; InlineAsm Start
fmla.8h v0, v6, v4
; InlineAsm End
add x4, x4, x9, lsl #1
add x5, x5, x10, lsl #1
subs x8, x8, #1
b.ne LBB22_6
LBB22_7:
; Contiguous variant, odd step count: one more step when bit 0 of x2 is set.
tbz w2, #0, LBB22_13
ldr q4, [x5]
dup.8h v5, v4[0]
ldr q6, [x4]
; InlineAsm Start
fmla.8h v3, v6, v5
; InlineAsm End
dup.8h v5, v4[1]
; InlineAsm Start
fmla.8h v2, v6, v5
; InlineAsm End
dup.8h v5, v4[2]
; InlineAsm Start
fmla.8h v1, v6, v5
; InlineAsm End
dup.8h v4, v4[3]
; InlineAsm Start
fmla.8h v0, v6, v4
; InlineAsm End
b LBB22_13
LBB22_9:
; Strided variant setup: x12 / x13 = byte strides of the x4 / x5 streams;
; x10, x16, x14 = byte offsets of broadcast elements 1, 2, 3 within a step;
; x19, x17, x15 = the same offsets for the second step of a trip.
lsl x12, x12, #1
lsl x13, x11, #1
add x11, x10, x9
lsl x14, x11, #1
add x15, x14, x13
lsl x16, x9, #2
add x17, x16, x13
add x19, x10, x13
lsl x20, x13, #1
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
mov x11, x5
LBB22_10:
; Strided main loop, two accumulation steps per trip.
ld1r.8h { v4 }, [x11], x20
ldr q5, [x4]
; InlineAsm Start
fmla.8h v3, v5, v4
; InlineAsm End
add x21, x5, x10
ld1r.8h { v4 }, [x21]
; InlineAsm Start
fmla.8h v2, v5, v4
; InlineAsm End
add x21, x5, x16
ld1r.8h { v4 }, [x21]
; InlineAsm Start
fmla.8h v1, v5, v4
; InlineAsm End
add x21, x5, x14
ld1r.8h { v4 }, [x21]
; InlineAsm Start
fmla.8h v0, v5, v4
; InlineAsm End
add x21, x5, x13
ld1r.8h { v4 }, [x21]
ldr q5, [x4, x12]
; InlineAsm Start
fmla.8h v3, v5, v4
; InlineAsm End
add x21, x5, x19
ld1r.8h { v4 }, [x21]
; InlineAsm Start
fmla.8h v2, v5, v4
; InlineAsm End
add x21, x5, x17
ld1r.8h { v4 }, [x21]
; InlineAsm Start
fmla.8h v1, v5, v4
; InlineAsm End
add x5, x5, x15
ld1r.8h { v4 }, [x5]
; InlineAsm Start
fmla.8h v0, v5, v4
; InlineAsm End
add x4, x4, x12, lsl #1
mov x5, x11
subs x8, x8, #1
b.ne LBB22_10
LBB22_11:
; Strided variant, odd step count: one more step when bit 0 of x2 is set.
tbz w2, #0, LBB22_13
add x8, x10, x9
lsl x8, x8, #1
mov x12, x11
ld1r.8h { v4 }, [x12], x8
ldr q5, [x4]
; InlineAsm Start
fmla.8h v3, v5, v4
; InlineAsm End
add x8, x11, x10
ld1r.8h { v4 }, [x8]
; InlineAsm Start
fmla.8h v2, v5, v4
; InlineAsm End
add x8, x11, x9, lsl #2
ld1r.8h { v4 }, [x8]
; InlineAsm Start
fmla.8h v1, v5, v4
; InlineAsm End
ld1r.8h { v4 }, [x12]
; InlineAsm Start
fmla.8h v0, v5, v4
; InlineAsm End
LBB22_13:
; Write-back: w8 = accumulator factor, w9 = old-destination factor, w10 = mode.
ldrh w8, [sp, #122]
ldrh w9, [sp, #120]
ldrb w10, [sp, #124]
cmp x0, #8
b.ne LBB22_19
cmp x1, #4
b.ne LBB22_19
cmp x7, #1
b.ne LBB22_19
; Vector write-back: four 8-lane stores at x3 + {0, 2, 4, 6} * x6 bytes.
dup.8h v4, w8
and w8, w10, #0xff
cmp w8, #1
b.eq LBB22_27
cmp w8, #2
b.ne LBB22_28
; Mode 2: dst = w9 * dst + w8 * acc.
dup.8h v5, w9
ldr q6, [x3]
; InlineAsm Start
fmul.8h v7, v5, v6
; InlineAsm End
; InlineAsm Start
fmul.8h v6, v4, v3
; InlineAsm End
; InlineAsm Start
fadd.8h v3, v7, v6
; InlineAsm End
str q3, [x3]
lsl x8, x6, #1
ldr q3, [x3, x8]
; InlineAsm Start
fmul.8h v6, v5, v3
; InlineAsm End
; InlineAsm Start
fmul.8h v3, v4, v2
; InlineAsm End
; InlineAsm Start
fadd.8h v2, v6, v3
; InlineAsm End
str q2, [x3, x8]
lsl x8, x6, #2
ldr q2, [x3, x8]
; InlineAsm Start
fmul.8h v3, v5, v2
; InlineAsm End
; InlineAsm Start
fmul.8h v2, v4, v1
; InlineAsm End
; InlineAsm Start
fadd.8h v1, v3, v2
; InlineAsm End
str q1, [x3, x8]
mov w8, #6
mul x8, x6, x8
ldr q1, [x3, x8]
; InlineAsm Start
fmul.8h v2, v5, v1
; InlineAsm End
; InlineAsm Start
fmul.8h v1, v4, v0
; InlineAsm End
; InlineAsm Start
fadd.8h v0, v2, v1
; InlineAsm End
str q0, [x3, x8]
b LBB22_42
LBB22_19:
; Scalar write-back: spill the accumulators (v3, v2, v1, v0 in that order) and
; walk them one halfword at a time.
stp q3, q2, [sp]
stp q1, q0, [sp, #32]
and w10, w10, #0xff
cmp w10, #2
b.eq LBB22_30
cmp w10, #1
b.ne LBB22_36
; Mode 1, scalar: dst += w8 * acc.
cbz x1, LBB22_42
cbz x0, LBB22_42
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB22_24:
; Outer loop: x1 trips, 16 spilled bytes per trip, dst advances by x6 * 2.
mov x13, x0
mov x14, x3
mov x15, x12
LBB22_25:
; Inner loop: x0 trips, dst advances by x7 * 2.
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x14]
; InlineAsm Start
fadd h1, h0, h2
; InlineAsm End
str h1, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB22_25
add x9, x9, #1
add x12, x12, #16
add x3, x3, x10
cmp x9, x1
b.ne LBB22_24
b LBB22_42
LBB22_27:
; Mode 1, vector: dst += w8 * acc.
ldr q5, [x3]
; InlineAsm Start
fmla.8h v5, v4, v3
; InlineAsm End
str q5, [x3]
lsl x8, x6, #1
ldr q3, [x3, x8]
; InlineAsm Start
fmla.8h v3, v4, v2
; InlineAsm End
str q3, [x3, x8]
lsl x8, x6, #2
ldr q2, [x3, x8]
; InlineAsm Start
fmla.8h v2, v4, v1
; InlineAsm End
str q2, [x3, x8]
mov w8, #6
mul x8, x6, x8
ldr q1, [x3, x8]
; InlineAsm Start
fmla.8h v1, v4, v0
; InlineAsm End
b LBB22_29
LBB22_28:
; Any other mode, vector: dst = w8 * acc.
; InlineAsm Start
fmul.8h v5, v4, v3
; InlineAsm End
str q5, [x3]
lsl x8, x6, #1
; InlineAsm Start
fmul.8h v3, v4, v2
; InlineAsm End
str q3, [x3, x8]
lsl x8, x6, #2
; InlineAsm Start
fmul.8h v2, v4, v1
; InlineAsm End
str q2, [x3, x8]
mov w8, #6
mul x8, x6, x8
; InlineAsm Start
fmul.8h v1, v4, v0
; InlineAsm End
LBB22_29:
; Shared final store for the two vector paths above.
str q1, [x3, x8]
b LBB22_42
LBB22_30:
; Mode 2, scalar: dst = w9 * dst + w8 * acc.
cbz x1, LBB22_42
cbz x0, LBB22_42
mov x10, #0
lsl x11, x6, #1
lsl x12, x7, #1
mov x13, sp
LBB22_33:
mov x14, x0
mov x15, x3
mov x16, x13
LBB22_34:
ldr h0, [x15]
fmov s1, w9
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x16], #2
fmov s1, w8
; InlineAsm Start
fmul h3, h1, h0
; InlineAsm End
; InlineAsm Start
fadd h0, h2, h3
; InlineAsm End
str h0, [x15]
add x15, x15, x12
subs x14, x14, #1
b.ne LBB22_34
add x10, x10, #1
add x13, x13, #16
add x3, x3, x11
cmp x10, x1
b.ne LBB22_33
b LBB22_42
LBB22_36:
; Any other mode, scalar: dst = w8 * acc.
cbz x1, LBB22_42
cbz x0, LBB22_42
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB22_39:
mov x13, x0
mov x14, x3
mov x15, x12
LBB22_40:
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
str h2, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB22_40
add x9, x9, #1
add x12, x12, #16
add x3, x3, x10
cmp x9, x1
b.ne LBB22_39
LBB22_42:
; Common exit: restore x19..x22 and release the 96-byte frame.
ldp x20, x19, [sp, #80]
ldp x22, x21, [sp, #64]
add sp, sp, #96
.cfi_def_cfa_offset 0
.cfi_restore w19
.cfi_restore w20
.cfi_restore w21
.cfi_restore w22
ret
Lfunc_end22:
.cfi_endproc
.p2align 2
; gemm_f16::microkernel::neon::f16::x1x5 (compiler-generated AArch64 NEON, fp16).
; Five 8-lane f16 accumulators: v4 (stored at x3), v3 (x3 + x6*2),
; v2 (x3 + x6*4), v1 (x3 + x6*6) and v0 (x3 + x6*8). Each of the x2 steps loads
; 16 bytes from the x4 stream and multiplies them by five halfwords broadcast
; from the x5 stream, spaced [sp,#96] halfwords apart.
; Registers as used below:
;   x0, x1   inner / outer trip counts of the scalar write-back loops; the
;            vector write-back needs x0 == 8 and x1 == 5
;   x2       number of accumulation steps
;   x3       destination pointer
;   x4, x5   the two source streams
;   x6, x7   destination strides in halfwords (outer / inner); x7 must be 1
;            for the vector write-back
; Caller stack (offsets valid after the 80-byte frame is allocated):
;   [sp,#80], [sp,#88]   per-step strides in halfwords for the x4 / x5 streams
;   [sp,#96]             halfword distance between adjacent broadcast elements
;   [sp,#104], [sp,#106] f16 factors for the old destination / the accumulators
;   [sp,#108]            mode byte: 2 = scale old dst then add, 1 = add to dst,
;                        anything else = overwrite dst
; NOTE(review): presumably the gemm microkernel ABI (m, n, k, dst, lhs, rhs,
; dst_cs, dst_rs, lhs_cs, rhs_rs, rhs_cs, alpha, beta, alpha_status) — confirm.
; Fix: the listing referenced LBB23_5 without defining it; the label is
; restored at the first instruction of the odd-step tail.
gemm_f16::microkernel::neon::f16::x1x5:
Lfunc_begin23:
.cfi_startproc
sub sp, sp, #80
.cfi_def_cfa_offset 80
ldr x8, [sp, #96]
movi.2d v0, #0000000000000000
cmp x2, #2
b.hs LBB23_2
; Fewer than two steps: run the odd tail if x2 is odd, else go to write-back.
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
movi.2d v4, #0000000000000000
tbnz w2, #0, LBB23_5
b LBB23_6
LBB23_2:
; x9 = x2 / 2 loop trips; x10 = byte distance between broadcast elements;
; x11 = byte stride of the x4 stream; x12 = byte step from the fifth element
; of one step to the first element of the next; x13 = two x5 steps in bytes.
lsr x9, x2, #1
ldp x11, x12, [sp, #80]
lsl x10, x8, #1
lsl x11, x11, #1
lsl x13, x12, #1
sub x12, x13, x8, lsl #3
lsl x13, x13, #1
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
movi.2d v4, #0000000000000000
LBB23_3:
; Main loop, two accumulation steps per trip; x14 holds x5 for the next trip.
mov x14, x5
ld1r.8h { v5 }, [x14], x13
ldr q6, [x4]
; InlineAsm Start
fmla.8h v4, v6, v5
; InlineAsm End
add x5, x5, x10
ld1r.8h { v5 }, [x5], x10
; InlineAsm Start
fmla.8h v3, v6, v5
; InlineAsm End
ld1r.8h { v5 }, [x5], x10
; InlineAsm Start
fmla.8h v2, v6, v5
; InlineAsm End
ld1r.8h { v5 }, [x5], x10
; InlineAsm Start
fmla.8h v1, v6, v5
; InlineAsm End
ld1r.8h { v5 }, [x5], x12
; InlineAsm Start
fmla.8h v0, v6, v5
; InlineAsm End
ld1r.8h { v5 }, [x5], x10
ldr q6, [x4, x11]
; InlineAsm Start
fmla.8h v4, v6, v5
; InlineAsm End
ld1r.8h { v5 }, [x5], x10
; InlineAsm Start
fmla.8h v3, v6, v5
; InlineAsm End
ld1r.8h { v5 }, [x5], x10
; InlineAsm Start
fmla.8h v2, v6, v5
; InlineAsm End
ld1r.8h { v5 }, [x5], x10
; InlineAsm Start
fmla.8h v1, v6, v5
; InlineAsm End
ld1r.8h { v5 }, [x5], x12
; InlineAsm Start
fmla.8h v0, v6, v5
; InlineAsm End
add x4, x4, x11, lsl #1
subs x9, x9, #1
b.ne LBB23_3
mov x5, x14
tbz w2, #0, LBB23_6
LBB23_5:
; Odd step count: one more accumulation step at x5 / x4.
lsl x9, x8, #3
mov x10, x5
ld1r.8h { v5 }, [x10], x9
ldr q6, [x4]
; InlineAsm Start
fmla.8h v4, v6, v5
; InlineAsm End
add x9, x5, x8, lsl #1
ld1r.8h { v5 }, [x9]
; InlineAsm Start
fmla.8h v3, v6, v5
; InlineAsm End
add x9, x5, x8, lsl #2
ld1r.8h { v5 }, [x9]
; InlineAsm Start
fmla.8h v2, v6, v5
; InlineAsm End
mov w9, #6
madd x8, x8, x9, x5
ld1r.8h { v5 }, [x8]
; InlineAsm Start
fmla.8h v1, v6, v5
; InlineAsm End
ld1r.8h { v5 }, [x10]
; InlineAsm Start
fmla.8h v0, v6, v5
; InlineAsm End
LBB23_6:
; Write-back: w8 = accumulator factor, w9 = old-destination factor, w10 = mode.
ldrh w8, [sp, #106]
ldrh w9, [sp, #104]
ldrb w10, [sp, #108]
cmp x0, #8
b.ne LBB23_12
cmp x1, #5
b.ne LBB23_12
cmp x7, #1
b.ne LBB23_12
; Vector write-back: five 8-lane stores at x3 + {0, 2, 4, 6, 8} * x6 bytes.
dup.8h v5, w8
and w8, w10, #0xff
cmp w8, #1
b.eq LBB23_20
cmp w8, #2
b.ne LBB23_21
; Mode 2: dst = w9 * dst + w8 * acc.
dup.8h v6, w9
ldr q7, [x3]
; InlineAsm Start
fmul.8h v16, v6, v7
; InlineAsm End
; InlineAsm Start
fmul.8h v7, v5, v4
; InlineAsm End
; InlineAsm Start
fadd.8h v4, v16, v7
; InlineAsm End
str q4, [x3]
lsl x8, x6, #1
ldr q4, [x3, x8]
; InlineAsm Start
fmul.8h v7, v6, v4
; InlineAsm End
; InlineAsm Start
fmul.8h v4, v5, v3
; InlineAsm End
; InlineAsm Start
fadd.8h v3, v7, v4
; InlineAsm End
str q3, [x3, x8]
lsl x8, x6, #2
ldr q3, [x3, x8]
; InlineAsm Start
fmul.8h v4, v6, v3
; InlineAsm End
; InlineAsm Start
fmul.8h v3, v5, v2
; InlineAsm End
; InlineAsm Start
fadd.8h v2, v4, v3
; InlineAsm End
str q2, [x3, x8]
mov w8, #6
mul x8, x6, x8
ldr q2, [x3, x8]
; InlineAsm Start
fmul.8h v3, v6, v2
; InlineAsm End
; InlineAsm Start
fmul.8h v2, v5, v1
; InlineAsm End
; InlineAsm Start
fadd.8h v1, v3, v2
; InlineAsm End
str q1, [x3, x8]
lsl x8, x6, #3
ldr q1, [x3, x8]
; InlineAsm Start
fmul.8h v2, v6, v1
; InlineAsm End
; InlineAsm Start
fmul.8h v1, v5, v0
; InlineAsm End
; InlineAsm Start
fadd.8h v0, v2, v1
; InlineAsm End
str q0, [x3, x8]
b LBB23_35
LBB23_12:
; Scalar write-back: spill the accumulators (v4 .. v0 in that order) and walk
; them one halfword at a time.
stp q4, q3, [sp]
stp q2, q1, [sp, #32]
and w10, w10, #0xff
str q0, [sp, #64]
cmp w10, #2
b.eq LBB23_23
cmp w10, #1
b.ne LBB23_29
; Mode 1, scalar: dst += w8 * acc.
cbz x1, LBB23_35
cbz x0, LBB23_35
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB23_17:
; Outer loop: x1 trips, 16 spilled bytes per trip, dst advances by x6 * 2.
mov x13, x0
mov x14, x3
mov x15, x12
LBB23_18:
; Inner loop: x0 trips, dst advances by x7 * 2.
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x14]
; InlineAsm Start
fadd h1, h0, h2
; InlineAsm End
str h1, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB23_18
add x9, x9, #1
add x12, x12, #16
add x3, x3, x10
cmp x9, x1
b.ne LBB23_17
b LBB23_35
LBB23_20:
; Mode 1, vector: dst += w8 * acc.
ldr q6, [x3]
; InlineAsm Start
fmla.8h v6, v5, v4
; InlineAsm End
str q6, [x3]
lsl x8, x6, #1
ldr q4, [x3, x8]
; InlineAsm Start
fmla.8h v4, v5, v3
; InlineAsm End
str q4, [x3, x8]
lsl x8, x6, #2
ldr q3, [x3, x8]
; InlineAsm Start
fmla.8h v3, v5, v2
; InlineAsm End
str q3, [x3, x8]
mov w8, #6
mul x8, x6, x8
ldr q2, [x3, x8]
; InlineAsm Start
fmla.8h v2, v5, v1
; InlineAsm End
str q2, [x3, x8]
lsl x8, x6, #3
ldr q1, [x3, x8]
; InlineAsm Start
fmla.8h v1, v5, v0
; InlineAsm End
b LBB23_22
LBB23_21:
; Any other mode, vector: dst = w8 * acc.
; InlineAsm Start
fmul.8h v6, v5, v4
; InlineAsm End
str q6, [x3]
lsl x8, x6, #1
; InlineAsm Start
fmul.8h v4, v5, v3
; InlineAsm End
str q4, [x3, x8]
lsl x8, x6, #2
; InlineAsm Start
fmul.8h v3, v5, v2
; InlineAsm End
str q3, [x3, x8]
mov w8, #6
mul x8, x6, x8
; InlineAsm Start
fmul.8h v2, v5, v1
; InlineAsm End
str q2, [x3, x8]
lsl x8, x6, #3
; InlineAsm Start
fmul.8h v1, v5, v0
; InlineAsm End
LBB23_22:
; Shared final store for the two vector paths above.
str q1, [x3, x8]
b LBB23_35
LBB23_23:
; Mode 2, scalar: dst = w9 * dst + w8 * acc.
cbz x1, LBB23_35
cbz x0, LBB23_35
mov x10, #0
lsl x11, x6, #1
lsl x12, x7, #1
mov x13, sp
LBB23_26:
mov x14, x0
mov x15, x3
mov x16, x13
LBB23_27:
ldr h0, [x15]
fmov s1, w9
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x16], #2
fmov s1, w8
; InlineAsm Start
fmul h3, h1, h0
; InlineAsm End
; InlineAsm Start
fadd h0, h2, h3
; InlineAsm End
str h0, [x15]
add x15, x15, x12
subs x14, x14, #1
b.ne LBB23_27
add x10, x10, #1
add x13, x13, #16
add x3, x3, x11
cmp x10, x1
b.ne LBB23_26
b LBB23_35
LBB23_29:
; Any other mode, scalar: dst = w8 * acc.
cbz x1, LBB23_35
cbz x0, LBB23_35
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB23_32:
mov x13, x0
mov x14, x3
mov x15, x12
LBB23_33:
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
str h2, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB23_33
add x9, x9, #1
add x12, x12, #16
add x3, x3, x10
cmp x9, x1
b.ne LBB23_32
LBB23_35:
; Common exit: release the 80-byte frame.
add sp, sp, #80
.cfi_def_cfa_offset 0
ret
Lfunc_end23:
.cfi_endproc
.p2align 2
; gemm_f16::microkernel::neon::f16::x1x6 (compiler-generated AArch64 NEON, fp16).
; Six 8-lane f16 accumulators: v5 (stored at x3), v4 (x3 + x6*2),
; v3 (x3 + x6*4), v2 (x3 + x6*6), v1 (x3 + x6*8) and v0 (x3 + x6*10). Each of
; the x2 steps loads 16 bytes from the x4 stream and multiplies them by six
; halfwords broadcast from the x5 stream, spaced [sp,#112] halfwords apart.
; Registers as used below:
;   x0, x1   inner / outer trip counts of the scalar write-back loops; the
;            vector write-back needs x0 == 8 and x1 == 6
;   x2       number of accumulation steps
;   x3       destination pointer
;   x4, x5   the two source streams
;   x6, x7   destination strides in halfwords (outer / inner); x7 must be 1
;            for the vector write-back
; Caller stack (offsets valid after the 96-byte frame is allocated):
;   [sp,#96], [sp,#104]  per-step strides in halfwords for the x4 / x5 streams
;   [sp,#112]            halfword distance between adjacent broadcast elements
;   [sp,#120], [sp,#122] f16 factors for the old destination / the accumulators
;   [sp,#124]            mode byte: 2 = scale old dst then add, 1 = add to dst,
;                        anything else = overwrite dst
; NOTE(review): presumably the gemm microkernel ABI (m, n, k, dst, lhs, rhs,
; dst_cs, dst_rs, lhs_cs, rhs_rs, rhs_cs, alpha, beta, alpha_status) — confirm.
; Fix: the listing referenced LBB24_5 without defining it; the label is
; restored at the first instruction of the odd-step tail.
gemm_f16::microkernel::neon::f16::x1x6:
Lfunc_begin24:
.cfi_startproc
sub sp, sp, #96
.cfi_def_cfa_offset 96
ldr x8, [sp, #112]
movi.2d v0, #0000000000000000
cmp x2, #2
b.hs LBB24_2
; Fewer than two steps: run the odd tail if x2 is odd, else go to write-back.
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
movi.2d v4, #0000000000000000
movi.2d v5, #0000000000000000
tbnz w2, #0, LBB24_5
b LBB24_6
LBB24_2:
; x9 = x2 / 2 loop trips; x10 = byte distance between broadcast elements;
; x11 = byte stride of the x4 stream; x12 = byte step from the sixth element
; of one step to the first element of the next; x13 = two x5 steps in bytes.
ldp x11, x12, [sp, #96]
lsr x9, x2, #1
lsl x10, x8, #1
lsl x11, x11, #1
lsl x13, x12, #1
mov w12, #10
msub x12, x8, x12, x13
lsl x13, x13, #1
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
movi.2d v4, #0000000000000000
movi.2d v5, #0000000000000000
LBB24_3:
; Main loop, two accumulation steps per trip; x14 holds x5 for the next trip.
mov x14, x5
ld1r.8h { v6 }, [x14], x13
ldr q7, [x4]
; InlineAsm Start
fmla.8h v5, v7, v6
; InlineAsm End
add x5, x5, x10
ld1r.8h { v6 }, [x5], x10
; InlineAsm Start
fmla.8h v4, v7, v6
; InlineAsm End
ld1r.8h { v6 }, [x5], x10
; InlineAsm Start
fmla.8h v3, v7, v6
; InlineAsm End
ld1r.8h { v6 }, [x5], x10
; InlineAsm Start
fmla.8h v2, v7, v6
; InlineAsm End
ld1r.8h { v6 }, [x5], x10
; InlineAsm Start
fmla.8h v1, v7, v6
; InlineAsm End
ld1r.8h { v6 }, [x5], x12
; InlineAsm Start
fmla.8h v0, v7, v6
; InlineAsm End
ld1r.8h { v6 }, [x5], x10
ldr q7, [x4, x11]
; InlineAsm Start
fmla.8h v5, v7, v6
; InlineAsm End
ld1r.8h { v6 }, [x5], x10
; InlineAsm Start
fmla.8h v4, v7, v6
; InlineAsm End
ld1r.8h { v6 }, [x5], x10
; InlineAsm Start
fmla.8h v3, v7, v6
; InlineAsm End
ld1r.8h { v6 }, [x5], x10
; InlineAsm Start
fmla.8h v2, v7, v6
; InlineAsm End
ld1r.8h { v6 }, [x5], x10
; InlineAsm Start
fmla.8h v1, v7, v6
; InlineAsm End
ld1r.8h { v6 }, [x5], x12
; InlineAsm Start
fmla.8h v0, v7, v6
; InlineAsm End
add x4, x4, x11, lsl #1
subs x9, x9, #1
b.ne LBB24_3
mov x5, x14
tbz w2, #0, LBB24_6
LBB24_5:
; Odd step count: one more accumulation step at x5 / x4.
lsl x9, x8, #2
add x10, x9, x8
lsl x10, x10, #1
mov x11, x5
ld1r.8h { v6 }, [x11], x10
ldr q7, [x4]
; InlineAsm Start
fmla.8h v5, v7, v6
; InlineAsm End
add x10, x5, x8, lsl #1
ld1r.8h { v6 }, [x10]
; InlineAsm Start
fmla.8h v4, v7, v6
; InlineAsm End
add x9, x5, x9
ld1r.8h { v6 }, [x9]
; InlineAsm Start
fmla.8h v3, v7, v6
; InlineAsm End
mov w9, #6
madd x9, x8, x9, x5
ld1r.8h { v6 }, [x9]
; InlineAsm Start
fmla.8h v2, v7, v6
; InlineAsm End
add x8, x5, x8, lsl #3
ld1r.8h { v6 }, [x8]
; InlineAsm Start
fmla.8h v1, v7, v6
; InlineAsm End
ld1r.8h { v6 }, [x11]
; InlineAsm Start
fmla.8h v0, v7, v6
; InlineAsm End
LBB24_6:
; Write-back: w8 = accumulator factor, w9 = old-destination factor, w10 = mode.
ldrh w8, [sp, #122]
ldrh w9, [sp, #120]
ldrb w10, [sp, #124]
cmp x0, #8
b.ne LBB24_12
cmp x1, #6
b.ne LBB24_12
cmp x7, #1
b.ne LBB24_12
; Vector write-back: six 8-lane stores at x3 + {0, 2, 4, 6, 8, 10} * x6 bytes.
dup.8h v6, w8
and w8, w10, #0xff
cmp w8, #1
b.eq LBB24_20
cmp w8, #2
b.ne LBB24_21
; Mode 2: dst = w9 * dst + w8 * acc.
dup.8h v7, w9
ldr q16, [x3]
; InlineAsm Start
fmul.8h v17, v7, v16
; InlineAsm End
; InlineAsm Start
fmul.8h v16, v6, v5
; InlineAsm End
; InlineAsm Start
fadd.8h v5, v17, v16
; InlineAsm End
str q5, [x3]
lsl x8, x6, #1
ldr q5, [x3, x8]
; InlineAsm Start
fmul.8h v16, v7, v5
; InlineAsm End
; InlineAsm Start
fmul.8h v5, v6, v4
; InlineAsm End
; InlineAsm Start
fadd.8h v4, v16, v5
; InlineAsm End
str q4, [x3, x8]
lsl x8, x6, #2
ldr q4, [x3, x8]
; InlineAsm Start
fmul.8h v5, v7, v4
; InlineAsm End
; InlineAsm Start
fmul.8h v4, v6, v3
; InlineAsm End
; InlineAsm Start
fadd.8h v3, v5, v4
; InlineAsm End
str q3, [x3, x8]
mov w8, #6
mul x8, x6, x8
ldr q3, [x3, x8]
; InlineAsm Start
fmul.8h v4, v7, v3
; InlineAsm End
; InlineAsm Start
fmul.8h v3, v6, v2
; InlineAsm End
; InlineAsm Start
fadd.8h v2, v4, v3
; InlineAsm End
str q2, [x3, x8]
lsl x8, x6, #3
ldr q2, [x3, x8]
; InlineAsm Start
fmul.8h v3, v7, v2
; InlineAsm End
; InlineAsm Start
fmul.8h v2, v6, v1
; InlineAsm End
; InlineAsm Start
fadd.8h v1, v3, v2
; InlineAsm End
str q1, [x3, x8]
mov w8, #10
mul x8, x6, x8
ldr q1, [x3, x8]
; InlineAsm Start
fmul.8h v2, v7, v1
; InlineAsm End
; InlineAsm Start
fmul.8h v1, v6, v0
; InlineAsm End
; InlineAsm Start
fadd.8h v0, v2, v1
; InlineAsm End
str q0, [x3, x8]
b LBB24_35
LBB24_12:
; Scalar write-back: spill the accumulators (v5 .. v0 in that order) and walk
; them one halfword at a time.
stp q5, q4, [sp]
stp q3, q2, [sp, #32]
and w10, w10, #0xff
stp q1, q0, [sp, #64]
cmp w10, #2
b.eq LBB24_23
cmp w10, #1
b.ne LBB24_29
; Mode 1, scalar: dst += w8 * acc.
cbz x1, LBB24_35
cbz x0, LBB24_35
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB24_17:
; Outer loop: x1 trips, 16 spilled bytes per trip, dst advances by x6 * 2.
mov x13, x0
mov x14, x3
mov x15, x12
LBB24_18:
; Inner loop: x0 trips, dst advances by x7 * 2.
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x14]
; InlineAsm Start
fadd h1, h0, h2
; InlineAsm End
str h1, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB24_18
add x9, x9, #1
add x12, x12, #16
add x3, x3, x10
cmp x9, x1
b.ne LBB24_17
b LBB24_35
LBB24_20:
; Mode 1, vector: dst += w8 * acc.
ldr q7, [x3]
; InlineAsm Start
fmla.8h v7, v6, v5
; InlineAsm End
str q7, [x3]
lsl x8, x6, #1
ldr q5, [x3, x8]
; InlineAsm Start
fmla.8h v5, v6, v4
; InlineAsm End
str q5, [x3, x8]
lsl x8, x6, #2
ldr q4, [x3, x8]
; InlineAsm Start
fmla.8h v4, v6, v3
; InlineAsm End
str q4, [x3, x8]
mov w8, #6
mul x8, x6, x8
ldr q3, [x3, x8]
; InlineAsm Start
fmla.8h v3, v6, v2
; InlineAsm End
str q3, [x3, x8]
lsl x8, x6, #3
ldr q2, [x3, x8]
; InlineAsm Start
fmla.8h v2, v6, v1
; InlineAsm End
str q2, [x3, x8]
mov w8, #10
mul x8, x6, x8
ldr q1, [x3, x8]
; InlineAsm Start
fmla.8h v1, v6, v0
; InlineAsm End
b LBB24_22
LBB24_21:
; Any other mode, vector: dst = w8 * acc.
; InlineAsm Start
fmul.8h v7, v6, v5
; InlineAsm End
str q7, [x3]
lsl x8, x6, #1
; InlineAsm Start
fmul.8h v5, v6, v4
; InlineAsm End
str q5, [x3, x8]
lsl x8, x6, #2
; InlineAsm Start
fmul.8h v4, v6, v3
; InlineAsm End
str q4, [x3, x8]
mov w8, #6
mul x8, x6, x8
; InlineAsm Start
fmul.8h v3, v6, v2
; InlineAsm End
str q3, [x3, x8]
lsl x8, x6, #3
; InlineAsm Start
fmul.8h v2, v6, v1
; InlineAsm End
str q2, [x3, x8]
mov w8, #10
mul x8, x6, x8
; InlineAsm Start
fmul.8h v1, v6, v0
; InlineAsm End
LBB24_22:
; Shared final store for the two vector paths above.
str q1, [x3, x8]
b LBB24_35
LBB24_23:
; Mode 2, scalar: dst = w9 * dst + w8 * acc.
cbz x1, LBB24_35
cbz x0, LBB24_35
mov x10, #0
lsl x11, x6, #1
lsl x12, x7, #1
mov x13, sp
LBB24_26:
mov x14, x0
mov x15, x3
mov x16, x13
LBB24_27:
ldr h0, [x15]
fmov s1, w9
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x16], #2
fmov s1, w8
; InlineAsm Start
fmul h3, h1, h0
; InlineAsm End
; InlineAsm Start
fadd h0, h2, h3
; InlineAsm End
str h0, [x15]
add x15, x15, x12
subs x14, x14, #1
b.ne LBB24_27
add x10, x10, #1
add x13, x13, #16
add x3, x3, x11
cmp x10, x1
b.ne LBB24_26
b LBB24_35
LBB24_29:
; Any other mode, scalar: dst = w8 * acc.
cbz x1, LBB24_35
cbz x0, LBB24_35
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB24_32:
mov x13, x0
mov x14, x3
mov x15, x12
LBB24_33:
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
str h2, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB24_33
add x9, x9, #1
add x12, x12, #16
add x3, x3, x10
cmp x9, x1
b.ne LBB24_32
LBB24_35:
; Common exit: release the 96-byte frame.
add sp, sp, #96
.cfi_def_cfa_offset 0
ret
Lfunc_end24:
.cfi_endproc
.p2align 2
; ---------------------------------------------------------------------------
; gemm_f16::microkernel::neon::f16::x1x7
; Compiler-generated AArch64 leaf routine (Apple/LLVM syntax). It accumulates
; seven 8-lane f16 vectors (v6 = column 0 ... v0 = column 6) and then writes
; them to the destination.
;
; Register inputs, as used below:
;   x0, x1  tile extents; the vector store path requires x0 == 8 and x1 == 7
;   x2      depth count; the main loop consumes two depth steps per iteration
;   x3      destination pointer
;   x4      pointer to the vector operand (one 16-byte load per depth step)
;   x5      pointer to the broadcast operand (one f16 per column per step)
;   x6, x7  destination strides in 2-byte elements (x6 between columns,
;           x7 between rows); the vector store path requires x7 == 1
; Stack inputs (offsets relative to the caller's sp):
;   +0   stride of x4 per depth step, in 2-byte elements
;   +8   stride of x5 per depth step, in 2-byte elements
;   +16  stride of x5 per column, in 2-byte elements
;   +24  f16 factor applied to the old destination value (mode 2 only)
;   +26  f16 factor applied to the accumulators
;   +28  u8 mode: 1 => dst += f*acc, 2 => dst = g*dst + f*acc,
;        anything else => dst = f*acc
; Scratch: x8-x16, v0-v7, v16-v18, 112 bytes of stack.
; ---------------------------------------------------------------------------
gemm_f16::microkernel::neon::f16::x1x7:
Lfunc_begin25:
.cfi_startproc
sub sp, sp, #112
.cfi_def_cfa_offset 112
ldr x8, [sp, #128]
movi.2d v0, #0000000000000000
; x9 = column stride of the broadcast operand in bytes
lsl x9, x8, #1
cmp x2, #2
b.hs LBB25_2
; Fewer than two depth steps: zero the remaining accumulators and run the
; single-step tail only when x2 is odd (i.e. x2 == 1).
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
movi.2d v4, #0000000000000000
movi.2d v5, #0000000000000000
movi.2d v6, #0000000000000000
tbnz w2, #0, LBB25_5
b LBB25_6
LBB25_2:
ldp x11, x12, [sp, #112]
; x10 = number of two-step iterations
lsr x10, x2, #1
lsl x11, x11, #1
lsl x13, x12, #1
; x12 = one depth step minus six column steps, in bytes: applied after the
; last column so the pointer lands on column 0 of the next depth step
mov w12, #12
msub x12, x8, x12, x13
; x13 = two depth steps in bytes
lsl x13, x13, #1
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
movi.2d v4, #0000000000000000
movi.2d v5, #0000000000000000
movi.2d v6, #0000000000000000
; Main loop, unrolled by two depth steps.
LBB25_3:
; x14 ends up as x5 + two depth steps; it is copied back to x5 after the loop
mov x14, x5
ld1r.8h { v7 }, [x14], x13
ldr q16, [x4]
; InlineAsm Start
fmla.8h v6, v16, v7
; InlineAsm End
add x5, x5, x9
ld1r.8h { v7 }, [x5], x9
; InlineAsm Start
fmla.8h v5, v16, v7
; InlineAsm End
ld1r.8h { v7 }, [x5], x9
; InlineAsm Start
fmla.8h v4, v16, v7
; InlineAsm End
ld1r.8h { v7 }, [x5], x9
; InlineAsm Start
fmla.8h v3, v16, v7
; InlineAsm End
ld1r.8h { v7 }, [x5], x9
; InlineAsm Start
fmla.8h v2, v16, v7
; InlineAsm End
ld1r.8h { v7 }, [x5], x9
; InlineAsm Start
fmla.8h v1, v16, v7
; InlineAsm End
ld1r.8h { v7 }, [x5], x12
; InlineAsm Start
fmla.8h v0, v16, v7
; InlineAsm End
; second depth step of this iteration
ld1r.8h { v7 }, [x5], x9
ldr q16, [x4, x11]
; InlineAsm Start
fmla.8h v6, v16, v7
; InlineAsm End
ld1r.8h { v7 }, [x5], x9
; InlineAsm Start
fmla.8h v5, v16, v7
; InlineAsm End
ld1r.8h { v7 }, [x5], x9
; InlineAsm Start
fmla.8h v4, v16, v7
; InlineAsm End
ld1r.8h { v7 }, [x5], x9
; InlineAsm Start
fmla.8h v3, v16, v7
; InlineAsm End
ld1r.8h { v7 }, [x5], x9
; InlineAsm Start
fmla.8h v2, v16, v7
; InlineAsm End
ld1r.8h { v7 }, [x5], x9
; InlineAsm Start
fmla.8h v1, v16, v7
; InlineAsm End
ld1r.8h { v7 }, [x5], x12
; InlineAsm Start
fmla.8h v0, v16, v7
; InlineAsm End
add x4, x4, x11, lsl #1
subs x10, x10, #1
b.ne LBB25_3
mov x5, x14
tbz w2, #0, LBB25_6
; Odd-depth tail: one more depth step. This label is the target of the tbnz
; in the short-depth path above; it was missing from the listing, which left
; that branch without a destination.
LBB25_5:
; x10 = 12 * column stride = byte offset of column 6
add x10, x9, x8
lsl x10, x10, #2
mov x11, x5
ld1r.8h { v7 }, [x11], x10
ldr q16, [x4]
; InlineAsm Start
fmla.8h v6, v16, v7
; InlineAsm End
add x9, x5, x9
ld1r.8h { v7 }, [x9]
; InlineAsm Start
fmla.8h v5, v16, v7
; InlineAsm End
add x9, x5, x8, lsl #2
ld1r.8h { v7 }, [x9]
; InlineAsm Start
fmla.8h v4, v16, v7
; InlineAsm End
mov w9, #6
madd x9, x8, x9, x5
ld1r.8h { v7 }, [x9]
; InlineAsm Start
fmla.8h v3, v16, v7
; InlineAsm End
add x9, x5, x8, lsl #3
ld1r.8h { v7 }, [x9]
; InlineAsm Start
fmla.8h v2, v16, v7
; InlineAsm End
mov w9, #10
madd x8, x8, x9, x5
ld1r.8h { v7 }, [x8]
; InlineAsm Start
fmla.8h v1, v16, v7
; InlineAsm End
ld1r.8h { v7 }, [x11]
; InlineAsm Start
fmla.8h v0, v16, v7
; InlineAsm End
; Write-back: w8 = accumulator factor, w9 = destination factor, w10 = mode.
LBB25_6:
ldrh w8, [sp, #138]
ldrh w9, [sp, #136]
ldrb w10, [sp, #140]
; The vector path needs a full 8x7 tile with unit row stride.
cmp x0, #8
b.ne LBB25_12
cmp x1, #7
b.ne LBB25_12
cmp x7, #1
b.ne LBB25_12
dup.8h v7, w8
and w8, w10, #0xff
cmp w8, #1
b.eq LBB25_20
cmp w8, #2
b.ne LBB25_21
; Mode 2, vector path: dst = w9*dst + w8*acc, one column per group.
dup.8h v16, w9
ldr q17, [x3]
; InlineAsm Start
fmul.8h v18, v16, v17
; InlineAsm End
; InlineAsm Start
fmul.8h v17, v7, v6
; InlineAsm End
; InlineAsm Start
fadd.8h v6, v18, v17
; InlineAsm End
str q6, [x3]
lsl x8, x6, #1
ldr q6, [x3, x8]
; InlineAsm Start
fmul.8h v17, v16, v6
; InlineAsm End
; InlineAsm Start
fmul.8h v6, v7, v5
; InlineAsm End
; InlineAsm Start
fadd.8h v5, v17, v6
; InlineAsm End
str q5, [x3, x8]
lsl x8, x6, #2
ldr q5, [x3, x8]
; InlineAsm Start
fmul.8h v6, v16, v5
; InlineAsm End
; InlineAsm Start
fmul.8h v5, v7, v4
; InlineAsm End
; InlineAsm Start
fadd.8h v4, v6, v5
; InlineAsm End
str q4, [x3, x8]
mov w8, #6
mul x8, x6, x8
ldr q4, [x3, x8]
; InlineAsm Start
fmul.8h v5, v16, v4
; InlineAsm End
; InlineAsm Start
fmul.8h v4, v7, v3
; InlineAsm End
; InlineAsm Start
fadd.8h v3, v5, v4
; InlineAsm End
str q3, [x3, x8]
lsl x8, x6, #3
ldr q3, [x3, x8]
; InlineAsm Start
fmul.8h v4, v16, v3
; InlineAsm End
; InlineAsm Start
fmul.8h v3, v7, v2
; InlineAsm End
; InlineAsm Start
fadd.8h v2, v4, v3
; InlineAsm End
str q2, [x3, x8]
mov w8, #10
mul x8, x6, x8
ldr q2, [x3, x8]
; InlineAsm Start
fmul.8h v3, v16, v2
; InlineAsm End
; InlineAsm Start
fmul.8h v2, v7, v1
; InlineAsm End
; InlineAsm Start
fadd.8h v1, v3, v2
; InlineAsm End
str q1, [x3, x8]
mov w8, #12
mul x8, x6, x8
ldr q1, [x3, x8]
; InlineAsm Start
fmul.8h v2, v16, v1
; InlineAsm End
; InlineAsm Start
fmul.8h v1, v7, v0
; InlineAsm End
; InlineAsm Start
fadd.8h v0, v2, v1
; InlineAsm End
str q0, [x3, x8]
b LBB25_35
; Scalar path: spill the accumulators (column j at sp + 16*j) and write the
; x0-by-x1 tile one f16 at a time using both destination strides.
LBB25_12:
stp q6, q5, [sp]
stp q4, q3, [sp, #32]
stp q2, q1, [sp, #64]
and w10, w10, #0xff
str q0, [sp, #96]
cmp w10, #2
b.eq LBB25_23
cmp w10, #1
b.ne LBB25_29
; Mode 1, scalar path: dst += w8*acc.
cbz x1, LBB25_35
cbz x0, LBB25_35
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB25_17:
mov x13, x0
mov x14, x3
mov x15, x12
LBB25_18:
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x14]
; InlineAsm Start
fadd h1, h0, h2
; InlineAsm End
str h1, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB25_18
add x9, x9, #1
add x12, x12, #16
add x3, x3, x10
cmp x9, x1
b.ne LBB25_17
b LBB25_35
; Mode 1, vector path: dst += w8*acc via fmla into the loaded column.
LBB25_20:
ldr q16, [x3]
; InlineAsm Start
fmla.8h v16, v7, v6
; InlineAsm End
str q16, [x3]
lsl x8, x6, #1
ldr q6, [x3, x8]
; InlineAsm Start
fmla.8h v6, v7, v5
; InlineAsm End
str q6, [x3, x8]
lsl x8, x6, #2
ldr q5, [x3, x8]
; InlineAsm Start
fmla.8h v5, v7, v4
; InlineAsm End
str q5, [x3, x8]
mov w8, #6
mul x8, x6, x8
ldr q4, [x3, x8]
; InlineAsm Start
fmla.8h v4, v7, v3
; InlineAsm End
str q4, [x3, x8]
lsl x8, x6, #3
ldr q3, [x3, x8]
; InlineAsm Start
fmla.8h v3, v7, v2
; InlineAsm End
str q3, [x3, x8]
mov w8, #10
mul x8, x6, x8
ldr q2, [x3, x8]
; InlineAsm Start
fmla.8h v2, v7, v1
; InlineAsm End
str q2, [x3, x8]
mov w8, #12
mul x8, x6, x8
ldr q1, [x3, x8]
; InlineAsm Start
fmla.8h v1, v7, v0
; InlineAsm End
b LBB25_22
; Other modes, vector path: dst = w8*acc (destination is not read).
LBB25_21:
; InlineAsm Start
fmul.8h v16, v7, v6
; InlineAsm End
str q16, [x3]
lsl x8, x6, #1
; InlineAsm Start
fmul.8h v6, v7, v5
; InlineAsm End
str q6, [x3, x8]
lsl x8, x6, #2
; InlineAsm Start
fmul.8h v5, v7, v4
; InlineAsm End
str q5, [x3, x8]
mov w8, #6
mul x8, x6, x8
; InlineAsm Start
fmul.8h v4, v7, v3
; InlineAsm End
str q4, [x3, x8]
lsl x8, x6, #3
; InlineAsm Start
fmul.8h v3, v7, v2
; InlineAsm End
str q3, [x3, x8]
mov w8, #10
mul x8, x6, x8
; InlineAsm Start
fmul.8h v2, v7, v1
; InlineAsm End
str q2, [x3, x8]
mov w8, #12
mul x8, x6, x8
; InlineAsm Start
fmul.8h v1, v7, v0
; InlineAsm End
; Shared store of the last column for the two vector paths above.
LBB25_22:
str q1, [x3, x8]
b LBB25_35
; Mode 2, scalar path: dst = w9*dst + w8*acc.
LBB25_23:
cbz x1, LBB25_35
cbz x0, LBB25_35
mov x10, #0
lsl x11, x6, #1
lsl x12, x7, #1
mov x13, sp
LBB25_26:
mov x14, x0
mov x15, x3
mov x16, x13
LBB25_27:
ldr h0, [x15]
fmov s1, w9
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x16], #2
fmov s1, w8
; InlineAsm Start
fmul h3, h1, h0
; InlineAsm End
; InlineAsm Start
fadd h0, h2, h3
; InlineAsm End
str h0, [x15]
add x15, x15, x12
subs x14, x14, #1
b.ne LBB25_27
add x10, x10, #1
add x13, x13, #16
add x3, x3, x11
cmp x10, x1
b.ne LBB25_26
b LBB25_35
; Other modes, scalar path: dst = w8*acc.
LBB25_29:
cbz x1, LBB25_35
cbz x0, LBB25_35
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB25_32:
mov x13, x0
mov x14, x3
mov x15, x12
LBB25_33:
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
str h2, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB25_33
add x9, x9, #1
add x12, x12, #16
add x3, x3, x10
cmp x9, x1
b.ne LBB25_32
; Common exit.
LBB25_35:
add sp, sp, #112
.cfi_def_cfa_offset 0
ret
Lfunc_end25:
.cfi_endproc
.p2align 2
; ---------------------------------------------------------------------------
; gemm_f16::microkernel::neon::f16::x1x8
; Compiler-generated AArch64 leaf routine (Apple/LLVM syntax). It accumulates
; eight 8-lane f16 vectors (v7 = column 0 ... v0 = column 7) and then writes
; them to the destination.
;
; Register inputs, as used below:
;   x0, x1  tile extents; the vector store path requires x0 == 8 and x1 == 8
;   x2      depth count; the main loops consume two depth steps per iteration
;   x3      destination pointer
;   x4      pointer to the vector operand (one 16-byte load per depth step)
;   x5      pointer to the broadcast operand (one f16 per column per step)
;   x6, x7  destination strides in 2-byte elements (x6 between columns,
;           x7 between rows); the vector store path requires x7 == 1
; Stack inputs (offsets relative to the caller's sp):
;   +0   stride of x4 per depth step, in 2-byte elements
;   +8   stride of x5 per depth step, in 2-byte elements
;   +16  stride of x5 per column, in 2-byte elements; a value of 1 selects
;        the contiguous path that loads the eight column values with q loads
;   +24  f16 factor applied to the old destination value (mode 2 only)
;   +26  f16 factor applied to the accumulators
;   +28  u8 mode: 1 => dst += f*acc, 2 => dst = g*dst + f*acc,
;        anything else => dst = f*acc
; Scratch: x8-x16, v0-v7, v16-v19, 128 bytes of stack.
; ---------------------------------------------------------------------------
gemm_f16::microkernel::neon::f16::x1x8:
Lfunc_begin26:
.cfi_startproc
sub sp, sp, #128
.cfi_def_cfa_offset 128
ldp x12, x9, [sp, #136]
ldr x11, [sp, #128]
; x8 = number of two-step iterations
lsr x8, x2, #1
movi.2d v0, #0000000000000000
; Column stride 1 => contiguous broadcast operand.
cmp x9, #1
b.ne LBB26_3
cmp x2, #2
b.hs LBB26_5
; Contiguous layout, fewer than two depth steps.
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
movi.2d v4, #0000000000000000
movi.2d v5, #0000000000000000
movi.2d v6, #0000000000000000
movi.2d v7, #0000000000000000
tbnz w2, #0, LBB26_8
b LBB26_13
LBB26_3:
; Strided layout: x10 = column stride in bytes.
lsl x10, x9, #1
cmp x2, #2
b.hs LBB26_9
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
movi.2d v4, #0000000000000000
movi.2d v5, #0000000000000000
movi.2d v6, #0000000000000000
movi.2d v7, #0000000000000000
tbnz w2, #0, LBB26_12
b LBB26_13
; Contiguous layout, main loop setup: x9/x10 = depth strides of x4/x5 in bytes.
LBB26_5:
lsl x9, x11, #1
movi.2d v1, #0000000000000000
lsl x10, x12, #1
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
movi.2d v4, #0000000000000000
movi.2d v5, #0000000000000000
movi.2d v6, #0000000000000000
movi.2d v7, #0000000000000000
LBB26_6:
mov x11, x5
ldr q16, [x5]
dup.8h v17, v16[0]
ldr q18, [x4]
; InlineAsm Start
fmla.8h v7, v18, v17
; InlineAsm End
dup.8h v17, v16[1]
; InlineAsm Start
fmla.8h v6, v18, v17
; InlineAsm End
dup.8h v17, v16[2]
; InlineAsm Start
fmla.8h v5, v18, v17
; InlineAsm End
dup.8h v16, v16[3]
; InlineAsm Start
fmla.8h v4, v18, v16
; InlineAsm End
; NOTE(review): this 16-byte load covers bytes 8..23 from x5 although only
; lanes 0-3 (bytes 8..15) are used below - confirm the buffer has 8 bytes of
; readable slack after the last depth step.
ldur q16, [x5, #8]
dup.8h v17, v16[0]
; InlineAsm Start
fmla.8h v3, v18, v17
; InlineAsm End
dup.8h v17, v16[1]
; InlineAsm Start
fmla.8h v2, v18, v17
; InlineAsm End
dup.8h v17, v16[2]
; InlineAsm Start
fmla.8h v1, v18, v17
; InlineAsm End
dup.8h v16, v16[3]
; InlineAsm Start
fmla.8h v0, v18, v16
; InlineAsm End
; second depth step of this iteration
add x12, x5, x10
add x5, x12, x10
ldr q16, [x12]
dup.8h v17, v16[0]
ldr q18, [x4, x9]
; InlineAsm Start
fmla.8h v7, v18, v17
; InlineAsm End
dup.8h v17, v16[1]
; InlineAsm Start
fmla.8h v6, v18, v17
; InlineAsm End
dup.8h v17, v16[2]
; InlineAsm Start
fmla.8h v5, v18, v17
; InlineAsm End
dup.8h v16, v16[3]
; InlineAsm Start
fmla.8h v4, v18, v16
; InlineAsm End
ldur q16, [x12, #8]
dup.8h v17, v16[0]
; InlineAsm Start
fmla.8h v3, v18, v17
; InlineAsm End
dup.8h v17, v16[1]
; InlineAsm Start
fmla.8h v2, v18, v17
; InlineAsm End
dup.8h v17, v16[2]
; InlineAsm Start
fmla.8h v1, v18, v17
; InlineAsm End
dup.8h v16, v16[3]
; InlineAsm Start
fmla.8h v0, v18, v16
; InlineAsm End
add x4, x4, x9, lsl #1
subs x8, x8, #1
b.ne LBB26_6
add x5, x11, x10, lsl #1
tbz w2, #0, LBB26_13
; Contiguous layout, odd-depth tail. This label is the target of the tbnz in
; the contiguous short-depth path above; it was missing from the listing.
LBB26_8:
ldr q16, [x5]
dup.8h v17, v16[0]
ldr q18, [x4]
; InlineAsm Start
fmla.8h v7, v18, v17
; InlineAsm End
dup.8h v17, v16[1]
dup.8h v19, v16[2]
; InlineAsm Start
fmla.8h v6, v18, v17
; InlineAsm End
; InlineAsm Start
fmla.8h v5, v18, v19
; InlineAsm End
dup.8h v16, v16[3]
; InlineAsm Start
fmla.8h v4, v18, v16
; InlineAsm End
ldur q16, [x5, #8]
dup.8h v17, v16[0]
; InlineAsm Start
fmla.8h v3, v18, v17
; InlineAsm End
dup.8h v17, v16[1]
; InlineAsm Start
fmla.8h v2, v18, v17
; InlineAsm End
dup.8h v17, v16[2]
; InlineAsm Start
fmla.8h v1, v18, v17
; InlineAsm End
dup.8h v16, v16[3]
; InlineAsm Start
fmla.8h v0, v18, v16
; InlineAsm End
b LBB26_13
; Strided layout, main loop setup.
LBB26_9:
lsl x11, x11, #1
lsl x13, x12, #1
; x12 = one depth step minus seven column steps, in bytes: applied after the
; last column so the pointer lands on column 0 of the next depth step
mov w12, #14
movi.2d v1, #0000000000000000
msub x12, x9, x12, x13
; x13 = two depth steps in bytes
lsl x13, x13, #1
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
movi.2d v4, #0000000000000000
movi.2d v5, #0000000000000000
movi.2d v6, #0000000000000000
movi.2d v7, #0000000000000000
LBB26_10:
; x14 ends up as x5 + two depth steps; it is copied back to x5 after the loop
mov x14, x5
ld1r.8h { v16 }, [x14], x13
ldr q17, [x4]
; InlineAsm Start
fmla.8h v7, v17, v16
; InlineAsm End
add x5, x5, x10
ld1r.8h { v16 }, [x5], x10
; InlineAsm Start
fmla.8h v6, v17, v16
; InlineAsm End
ld1r.8h { v16 }, [x5], x10
; InlineAsm Start
fmla.8h v5, v17, v16
; InlineAsm End
ld1r.8h { v16 }, [x5], x10
; InlineAsm Start
fmla.8h v4, v17, v16
; InlineAsm End
ld1r.8h { v16 }, [x5], x10
; InlineAsm Start
fmla.8h v3, v17, v16
; InlineAsm End
ld1r.8h { v16 }, [x5], x10
; InlineAsm Start
fmla.8h v2, v17, v16
; InlineAsm End
ld1r.8h { v16 }, [x5], x10
; InlineAsm Start
fmla.8h v1, v17, v16
; InlineAsm End
ld1r.8h { v16 }, [x5], x12
; InlineAsm Start
fmla.8h v0, v17, v16
; InlineAsm End
; second depth step of this iteration
ld1r.8h { v16 }, [x5], x10
ldr q17, [x4, x11]
; InlineAsm Start
fmla.8h v7, v17, v16
; InlineAsm End
ld1r.8h { v16 }, [x5], x10
; InlineAsm Start
fmla.8h v6, v17, v16
; InlineAsm End
ld1r.8h { v16 }, [x5], x10
; InlineAsm Start
fmla.8h v5, v17, v16
; InlineAsm End
ld1r.8h { v16 }, [x5], x10
; InlineAsm Start
fmla.8h v4, v17, v16
; InlineAsm End
ld1r.8h { v16 }, [x5], x10
; InlineAsm Start
fmla.8h v3, v17, v16
; InlineAsm End
ld1r.8h { v16 }, [x5], x10
; InlineAsm Start
fmla.8h v2, v17, v16
; InlineAsm End
ld1r.8h { v16 }, [x5], x10
; InlineAsm Start
fmla.8h v1, v17, v16
; InlineAsm End
ld1r.8h { v16 }, [x5], x12
; InlineAsm Start
fmla.8h v0, v17, v16
; InlineAsm End
add x4, x4, x11, lsl #1
subs x8, x8, #1
b.ne LBB26_10
mov x5, x14
tbz w2, #0, LBB26_13
; Strided layout, odd-depth tail. This label is the target of the tbnz in the
; strided short-depth path above; it was missing from the listing.
LBB26_12:
; x8 = 16*x9 - 2*x9 = byte offset of column 7
lsl x8, x9, #4
sub x8, x8, x10
mov x11, x5
ld1r.8h { v16 }, [x11], x8
ldr q17, [x4]
; InlineAsm Start
fmla.8h v7, v17, v16
; InlineAsm End
add x8, x5, x10
ld1r.8h { v16 }, [x8]
; InlineAsm Start
fmla.8h v6, v17, v16
; InlineAsm End
add x8, x5, x9, lsl #2
ld1r.8h { v16 }, [x8]
; InlineAsm Start
fmla.8h v5, v17, v16
; InlineAsm End
mov w8, #6
madd x8, x9, x8, x5
ld1r.8h { v16 }, [x8]
; InlineAsm Start
fmla.8h v4, v17, v16
; InlineAsm End
add x8, x5, x9, lsl #3
ld1r.8h { v16 }, [x8]
; InlineAsm Start
fmla.8h v3, v17, v16
; InlineAsm End
mov w8, #10
madd x8, x9, x8, x5
ld1r.8h { v16 }, [x8]
; InlineAsm Start
fmla.8h v2, v17, v16
; InlineAsm End
mov w8, #12
madd x8, x9, x8, x5
ld1r.8h { v16 }, [x8]
; InlineAsm Start
fmla.8h v1, v17, v16
; InlineAsm End
ld1r.8h { v16 }, [x11]
; InlineAsm Start
fmla.8h v0, v17, v16
; InlineAsm End
; Write-back: w8 = accumulator factor, w9 = destination factor, w10 = mode.
LBB26_13:
ldrh w8, [sp, #154]
ldrh w9, [sp, #152]
ldrb w10, [sp, #156]
; The vector path needs a full 8x8 tile with unit row stride.
cmp x0, #8
b.ne LBB26_19
cmp x1, #8
b.ne LBB26_19
cmp x7, #1
b.ne LBB26_19
dup.8h v16, w8
and w8, w10, #0xff
cmp w8, #1
b.eq LBB26_27
cmp w8, #2
b.ne LBB26_28
; Mode 2, vector path: dst = w9*dst + w8*acc, one column per group.
dup.8h v17, w9
ldr q18, [x3]
; InlineAsm Start
fmul.8h v19, v17, v18
; InlineAsm End
; InlineAsm Start
fmul.8h v18, v16, v7
; InlineAsm End
; InlineAsm Start
fadd.8h v7, v19, v18
; InlineAsm End
str q7, [x3]
lsl x8, x6, #1
ldr q7, [x3, x8]
; InlineAsm Start
fmul.8h v18, v17, v7
; InlineAsm End
; InlineAsm Start
fmul.8h v7, v16, v6
; InlineAsm End
; InlineAsm Start
fadd.8h v6, v18, v7
; InlineAsm End
str q6, [x3, x8]
lsl x8, x6, #2
ldr q6, [x3, x8]
; InlineAsm Start
fmul.8h v7, v17, v6
; InlineAsm End
; InlineAsm Start
fmul.8h v6, v16, v5
; InlineAsm End
; InlineAsm Start
fadd.8h v5, v7, v6
; InlineAsm End
str q5, [x3, x8]
mov w8, #6
mul x8, x6, x8
ldr q5, [x3, x8]
; InlineAsm Start
fmul.8h v6, v17, v5
; InlineAsm End
; InlineAsm Start
fmul.8h v5, v16, v4
; InlineAsm End
; InlineAsm Start
fadd.8h v4, v6, v5
; InlineAsm End
str q4, [x3, x8]
lsl x8, x6, #3
ldr q4, [x3, x8]
; InlineAsm Start
fmul.8h v5, v17, v4
; InlineAsm End
; InlineAsm Start
fmul.8h v4, v16, v3
; InlineAsm End
; InlineAsm Start
fadd.8h v3, v5, v4
; InlineAsm End
str q3, [x3, x8]
mov w8, #10
mul x8, x6, x8
ldr q3, [x3, x8]
; InlineAsm Start
fmul.8h v4, v17, v3
; InlineAsm End
; InlineAsm Start
fmul.8h v3, v16, v2
; InlineAsm End
; InlineAsm Start
fadd.8h v2, v4, v3
; InlineAsm End
str q2, [x3, x8]
mov w8, #12
mul x8, x6, x8
ldr q2, [x3, x8]
; InlineAsm Start
fmul.8h v3, v17, v2
; InlineAsm End
; InlineAsm Start
fmul.8h v2, v16, v1
; InlineAsm End
; InlineAsm Start
fadd.8h v1, v3, v2
; InlineAsm End
str q1, [x3, x8]
mov w8, #14
mul x8, x6, x8
ldr q1, [x3, x8]
; InlineAsm Start
fmul.8h v2, v17, v1
; InlineAsm End
; InlineAsm Start
fmul.8h v1, v16, v0
; InlineAsm End
; InlineAsm Start
fadd.8h v0, v2, v1
; InlineAsm End
str q0, [x3, x8]
b LBB26_42
; Scalar path: spill the accumulators (column j at sp + 16*j) and write the
; x0-by-x1 tile one f16 at a time using both destination strides.
LBB26_19:
stp q7, q6, [sp]
stp q5, q4, [sp, #32]
stp q3, q2, [sp, #64]
and w10, w10, #0xff
stp q1, q0, [sp, #96]
cmp w10, #2
b.eq LBB26_30
cmp w10, #1
b.ne LBB26_36
; Mode 1, scalar path: dst += w8*acc.
cbz x1, LBB26_42
cbz x0, LBB26_42
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB26_24:
mov x13, x0
mov x14, x3
mov x15, x12
LBB26_25:
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x14]
; InlineAsm Start
fadd h1, h0, h2
; InlineAsm End
str h1, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB26_25
add x9, x9, #1
add x12, x12, #16
add x3, x3, x10
cmp x9, x1
b.ne LBB26_24
b LBB26_42
; Mode 1, vector path: dst += w8*acc via fmla into the loaded column.
LBB26_27:
ldr q17, [x3]
; InlineAsm Start
fmla.8h v17, v16, v7
; InlineAsm End
str q17, [x3]
lsl x8, x6, #1
ldr q7, [x3, x8]
; InlineAsm Start
fmla.8h v7, v16, v6
; InlineAsm End
str q7, [x3, x8]
lsl x8, x6, #2
ldr q6, [x3, x8]
; InlineAsm Start
fmla.8h v6, v16, v5
; InlineAsm End
str q6, [x3, x8]
mov w8, #6
mul x8, x6, x8
ldr q5, [x3, x8]
; InlineAsm Start
fmla.8h v5, v16, v4
; InlineAsm End
str q5, [x3, x8]
lsl x8, x6, #3
ldr q4, [x3, x8]
; InlineAsm Start
fmla.8h v4, v16, v3
; InlineAsm End
str q4, [x3, x8]
mov w8, #10
mul x8, x6, x8
ldr q3, [x3, x8]
; InlineAsm Start
fmla.8h v3, v16, v2
; InlineAsm End
str q3, [x3, x8]
mov w8, #12
mul x8, x6, x8
ldr q2, [x3, x8]
; InlineAsm Start
fmla.8h v2, v16, v1
; InlineAsm End
str q2, [x3, x8]
mov w8, #14
mul x8, x6, x8
ldr q1, [x3, x8]
; InlineAsm Start
fmla.8h v1, v16, v0
; InlineAsm End
b LBB26_29
; Other modes, vector path: dst = w8*acc (destination is not read).
LBB26_28:
; InlineAsm Start
fmul.8h v17, v16, v7
; InlineAsm End
str q17, [x3]
lsl x8, x6, #1
; InlineAsm Start
fmul.8h v7, v16, v6
; InlineAsm End
str q7, [x3, x8]
lsl x8, x6, #2
; InlineAsm Start
fmul.8h v6, v16, v5
; InlineAsm End
str q6, [x3, x8]
mov w8, #6
mul x8, x6, x8
; InlineAsm Start
fmul.8h v5, v16, v4
; InlineAsm End
str q5, [x3, x8]
lsl x8, x6, #3
; InlineAsm Start
fmul.8h v4, v16, v3
; InlineAsm End
str q4, [x3, x8]
mov w8, #10
mul x8, x6, x8
; InlineAsm Start
fmul.8h v3, v16, v2
; InlineAsm End
str q3, [x3, x8]
mov w8, #12
mul x8, x6, x8
; InlineAsm Start
fmul.8h v2, v16, v1
; InlineAsm End
str q2, [x3, x8]
mov w8, #14
mul x8, x6, x8
; InlineAsm Start
fmul.8h v1, v16, v0
; InlineAsm End
; Shared store of the last column for the two vector paths above.
LBB26_29:
str q1, [x3, x8]
b LBB26_42
; Mode 2, scalar path: dst = w9*dst + w8*acc.
LBB26_30:
cbz x1, LBB26_42
cbz x0, LBB26_42
mov x10, #0
lsl x11, x6, #1
lsl x12, x7, #1
mov x13, sp
LBB26_33:
mov x14, x0
mov x15, x3
mov x16, x13
LBB26_34:
ldr h0, [x15]
fmov s1, w9
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x16], #2
fmov s1, w8
; InlineAsm Start
fmul h3, h1, h0
; InlineAsm End
; InlineAsm Start
fadd h0, h2, h3
; InlineAsm End
str h0, [x15]
add x15, x15, x12
subs x14, x14, #1
b.ne LBB26_34
add x10, x10, #1
add x13, x13, #16
add x3, x3, x11
cmp x10, x1
b.ne LBB26_33
b LBB26_42
; Other modes, scalar path: dst = w8*acc.
LBB26_36:
cbz x1, LBB26_42
cbz x0, LBB26_42
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB26_39:
mov x13, x0
mov x14, x3
mov x15, x12
LBB26_40:
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
str h2, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB26_40
add x9, x9, #1
add x12, x12, #16
add x3, x3, x10
cmp x9, x1
b.ne LBB26_39
; Common exit.
LBB26_42:
add sp, sp, #128
.cfi_def_cfa_offset 0
ret
Lfunc_end26:
.cfi_endproc
.p2align 2
; ---------------------------------------------------------------------------
; gemm_f16::microkernel::neon::f16::x2x1
; Compiler-generated AArch64 leaf routine (Apple/LLVM syntax). It accumulates
; one column held as two 8-lane f16 vectors (v1 = lanes 0-7, v0 = lanes 8-15)
; and then writes it to the destination.
;
; Register inputs, as used below:
;   x0, x1  tile extents; the vector store path requires x0 == 16 and x1 == 1
;   x2      depth count; the main loop consumes two depth steps per iteration
;   x3      destination pointer
;   x4      pointer to the vector operand (one 32-byte ldp per depth step)
;   x5      pointer to the broadcast operand (one f16 per depth step)
;   x6, x7  destination strides in 2-byte elements (x6 between columns,
;           x7 between rows); the vector store path requires x7 == 1
; Stack inputs (offsets relative to the caller's sp):
;   +0   stride of x4 per depth step, in 2-byte elements
;   +8   stride of x5 per depth step, in 2-byte elements
;   +24  f16 factor applied to the old destination value (mode 2 only)
;   +26  f16 factor applied to the accumulators
;   +28  u8 mode: 1 => dst += f*acc, 2 => dst = g*dst + f*acc,
;        anything else => dst = f*acc
; Scratch: x8-x16, v0-v6, 32 bytes of stack.
; ---------------------------------------------------------------------------
gemm_f16::microkernel::neon::f16::x2x1:
Lfunc_begin27:
.cfi_startproc
sub sp, sp, #32
.cfi_def_cfa_offset 32
movi.2d v0, #0000000000000000
cmp x2, #2
b.hs LBB27_2
; Fewer than two depth steps: run the single-step tail only when x2 is odd.
movi.2d v1, #0000000000000000
tbnz w2, #0, LBB27_5
b LBB27_6
LBB27_2:
ldp x9, x10, [sp, #32]
; x8 = number of two-step iterations
lsr x8, x2, #1
lsl x9, x9, #1
lsl x10, x10, #1
; x11 = two depth steps of the broadcast operand in bytes
lsl x11, x10, #1
movi.2d v1, #0000000000000000
mov x12, x5
; Main loop, unrolled by two depth steps.
LBB27_3:
mov x13, x4
ld1r.8h { v2 }, [x12], x11
ldp q3, q4, [x4]
; InlineAsm Start
fmla.8h v1, v3, v2
; InlineAsm End
; InlineAsm Start
fmla.8h v0, v4, v2
; InlineAsm End
; second depth step of this iteration
add x14, x5, x10
ld1r.8h { v2 }, [x14]
add x14, x4, x9
add x4, x14, x9
ldp q3, q4, [x14]
; InlineAsm Start
fmla.8h v1, v3, v2
; InlineAsm End
; InlineAsm Start
fmla.8h v0, v4, v2
; InlineAsm End
mov x5, x12
subs x8, x8, #1
b.ne LBB27_3
add x4, x13, x9, lsl #1
tbz w2, #0, LBB27_6
; Odd-depth tail: one more depth step. This label is the target of the tbnz
; in the short-depth path above; it was missing from the listing, which left
; that branch without a destination.
LBB27_5:
ld1r.8h { v2 }, [x5]
ldp q3, q4, [x4]
; InlineAsm Start
fmla.8h v1, v3, v2
; InlineAsm End
; InlineAsm Start
fmla.8h v0, v4, v2
; InlineAsm End
; Write-back: w8 = accumulator factor, w9 = destination factor, w10 = mode.
LBB27_6:
ldrh w8, [sp, #58]
ldrh w9, [sp, #56]
ldrb w10, [sp, #60]
; The vector path needs a full 16x1 tile with unit row stride.
cmp x0, #16
b.ne LBB27_12
cmp x1, #1
b.ne LBB27_12
cmp x7, #1
b.ne LBB27_12
dup.8h v2, w8
and w8, w10, #0xff
cmp w8, #1
b.eq LBB27_20
cmp w8, #2
b.ne LBB27_21
; Mode 2, vector path: dst = w9*dst + w8*acc.
dup.8h v3, w9
ldp q4, q5, [x3]
; InlineAsm Start
fmul.8h v6, v3, v4
; InlineAsm End
; InlineAsm Start
fmul.8h v4, v2, v1
; InlineAsm End
; InlineAsm Start
fadd.8h v1, v6, v4
; InlineAsm End
; InlineAsm Start
fmul.8h v4, v3, v5
; InlineAsm End
; InlineAsm Start
fmul.8h v3, v2, v0
; InlineAsm End
; InlineAsm Start
fadd.8h v0, v4, v3
; InlineAsm End
stp q1, q0, [x3]
b LBB27_35
; Scalar path: spill the accumulators (column j at sp + 32*j) and write the
; x0-by-x1 tile one f16 at a time using both destination strides.
LBB27_12:
stp q1, q0, [sp]
and w10, w10, #0xff
cmp w10, #2
b.eq LBB27_23
cmp w10, #1
b.ne LBB27_29
; Mode 1, scalar path: dst += w8*acc.
cbz x1, LBB27_35
cbz x0, LBB27_35
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB27_17:
mov x13, x0
mov x14, x3
mov x15, x12
LBB27_18:
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x14]
; InlineAsm Start
fadd h1, h0, h2
; InlineAsm End
str h1, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB27_18
add x9, x9, #1
add x12, x12, #32
add x3, x3, x10
cmp x9, x1
b.ne LBB27_17
b LBB27_35
; Mode 1, vector path: dst += w8*acc via fmla into the loaded halves.
LBB27_20:
ldr q3, [x3]
; InlineAsm Start
fmla.8h v3, v2, v1
; InlineAsm End
str q3, [x3]
ldr q1, [x3, #16]
; InlineAsm Start
fmla.8h v1, v2, v0
; InlineAsm End
b LBB27_22
; Other modes, vector path: dst = w8*acc (destination is not read).
LBB27_21:
; InlineAsm Start
fmul.8h v3, v2, v1
; InlineAsm End
; InlineAsm Start
fmul.8h v1, v2, v0
; InlineAsm End
str q3, [x3]
; Shared store of the upper half for the two vector paths above.
LBB27_22:
str q1, [x3, #16]
b LBB27_35
; Mode 2, scalar path: dst = w9*dst + w8*acc.
LBB27_23:
cbz x1, LBB27_35
cbz x0, LBB27_35
mov x10, #0
lsl x11, x6, #1
lsl x12, x7, #1
mov x13, sp
LBB27_26:
mov x14, x0
mov x15, x3
mov x16, x13
LBB27_27:
ldr h0, [x15]
fmov s1, w9
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x16], #2
fmov s1, w8
; InlineAsm Start
fmul h3, h1, h0
; InlineAsm End
; InlineAsm Start
fadd h0, h2, h3
; InlineAsm End
str h0, [x15]
add x15, x15, x12
subs x14, x14, #1
b.ne LBB27_27
add x10, x10, #1
add x13, x13, #32
add x3, x3, x11
cmp x10, x1
b.ne LBB27_26
b LBB27_35
; Other modes, scalar path: dst = w8*acc.
LBB27_29:
cbz x1, LBB27_35
cbz x0, LBB27_35
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB27_32:
mov x13, x0
mov x14, x3
mov x15, x12
LBB27_33:
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
str h2, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB27_33
add x9, x9, #1
add x12, x12, #32
add x3, x3, x10
cmp x9, x1
b.ne LBB27_32
; Common exit.
LBB27_35:
add sp, sp, #32
.cfi_def_cfa_offset 0
ret
Lfunc_end27:
.cfi_endproc
.p2align 2
; ---------------------------------------------------------------------------
; gemm_f16::microkernel::neon::f16::x2x2
; Compiler-generated AArch64 leaf routine (Apple/LLVM syntax). It accumulates
; two columns, each held as two 8-lane f16 vectors (column 0 = v3/v2,
; column 1 = v1/v0), and then writes them to the destination.
;
; Register inputs, as used below:
;   x0, x1  tile extents; the vector store path requires x0 == 16 and x1 == 2
;   x2      depth count; the main loop consumes two depth steps per iteration
;   x3      destination pointer
;   x4      pointer to the vector operand (one 32-byte ldp per depth step)
;   x5      pointer to the broadcast operand (one f16 per column per step)
;   x6, x7  destination strides in 2-byte elements (x6 between columns,
;           x7 between rows); the vector store path requires x7 == 1
; Stack inputs (offsets relative to the caller's sp):
;   +0   stride of x4 per depth step, in 2-byte elements
;   +8   stride of x5 per depth step, in 2-byte elements
;   +16  stride of x5 per column, in 2-byte elements
;   +24  f16 factor applied to the old destination value (mode 2 only)
;   +26  f16 factor applied to the accumulators
;   +28  u8 mode: 1 => dst += f*acc, 2 => dst = g*dst + f*acc,
;        anything else => dst = f*acc
; Scratch: x8-x16, v0-v7, v16, 64 bytes of stack.
; ---------------------------------------------------------------------------
gemm_f16::microkernel::neon::f16::x2x2:
Lfunc_begin28:
.cfi_startproc
sub sp, sp, #64
.cfi_def_cfa_offset 64
ldr x8, [sp, #80]
movi.2d v0, #0000000000000000
; x8 = column stride of the broadcast operand in bytes
lsl x8, x8, #1
cmp x2, #2
b.hs LBB28_2
; Fewer than two depth steps: run the single-step tail only when x2 is odd.
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
tbnz w2, #0, LBB28_5
b LBB28_6
LBB28_2:
ldp x10, x11, [sp, #64]
; x9 = number of two-step iterations
lsr x9, x2, #1
lsl x10, x10, #1
lsl x11, x11, #1
; x12 = byte offset of (next depth step, column 1); x13 = two depth steps
add x12, x8, x11
lsl x13, x11, #1
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
mov x14, x5
; Main loop, unrolled by two depth steps.
LBB28_3:
mov x15, x4
ld1r.8h { v4 }, [x14], x13
ldp q5, q6, [x4]
; InlineAsm Start
fmla.8h v3, v5, v4
; InlineAsm End
; InlineAsm Start
fmla.8h v2, v6, v4
; InlineAsm End
add x16, x5, x8
ld1r.8h { v4 }, [x16]
; InlineAsm Start
fmla.8h v1, v5, v4
; InlineAsm End
; InlineAsm Start
fmla.8h v0, v6, v4
; InlineAsm End
; second depth step of this iteration
add x16, x5, x11
ld1r.8h { v4 }, [x16]
add x16, x4, x10
add x4, x16, x10
ldp q5, q6, [x16]
; InlineAsm Start
fmla.8h v3, v5, v4
; InlineAsm End
; InlineAsm Start
fmla.8h v2, v6, v4
; InlineAsm End
add x16, x5, x12
ld1r.8h { v4 }, [x16]
; InlineAsm Start
fmla.8h v1, v5, v4
; InlineAsm End
; InlineAsm Start
fmla.8h v0, v6, v4
; InlineAsm End
mov x5, x14
subs x9, x9, #1
b.ne LBB28_3
add x4, x15, x10, lsl #1
tbz w2, #0, LBB28_6
; Odd-depth tail: one more depth step. This label is the target of the tbnz
; in the short-depth path above; it was missing from the listing, which left
; that branch without a destination.
LBB28_5:
ld1r.8h { v4 }, [x5], x8
ldp q5, q6, [x4]
; InlineAsm Start
fmla.8h v3, v5, v4
; InlineAsm End
; InlineAsm Start
fmla.8h v2, v6, v4
; InlineAsm End
ld1r.8h { v4 }, [x5]
; InlineAsm Start
fmla.8h v1, v5, v4
; InlineAsm End
; InlineAsm Start
fmla.8h v0, v6, v4
; InlineAsm End
; Write-back: w8 = accumulator factor, w9 = destination factor, w10 = mode.
LBB28_6:
ldrh w8, [sp, #90]
ldrh w9, [sp, #88]
ldrb w10, [sp, #92]
; The vector path needs a full 16x2 tile with unit row stride.
cmp x0, #16
b.ne LBB28_12
cmp x1, #2
b.ne LBB28_12
cmp x7, #1
b.ne LBB28_12
dup.8h v4, w8
and w8, w10, #0xff
cmp w8, #1
b.eq LBB28_20
cmp w8, #2
b.ne LBB28_21
; Mode 2, vector path: dst = w9*dst + w8*acc, one column per group.
dup.8h v5, w9
ldp q6, q7, [x3]
; InlineAsm Start
fmul.8h v16, v5, v6
; InlineAsm End
; InlineAsm Start
fmul.8h v6, v4, v3
; InlineAsm End
; InlineAsm Start
fadd.8h v3, v16, v6
; InlineAsm End
; InlineAsm Start
fmul.8h v6, v5, v7
; InlineAsm End
; InlineAsm Start
fmul.8h v7, v4, v2
; InlineAsm End
; InlineAsm Start
fadd.8h v2, v6, v7
; InlineAsm End
stp q3, q2, [x3]
add x8, x3, x6, lsl #1
ldp q2, q3, [x8]
; InlineAsm Start
fmul.8h v6, v5, v2
; InlineAsm End
; InlineAsm Start
fmul.8h v2, v4, v1
; InlineAsm End
; InlineAsm Start
fadd.8h v1, v6, v2
; InlineAsm End
; InlineAsm Start
fmul.8h v2, v5, v3
; InlineAsm End
; InlineAsm Start
fmul.8h v3, v4, v0
; InlineAsm End
; InlineAsm Start
fadd.8h v0, v2, v3
; InlineAsm End
stp q1, q0, [x8]
b LBB28_35
; Scalar path: spill the accumulators (column j at sp + 32*j) and write the
; x0-by-x1 tile one f16 at a time using both destination strides.
LBB28_12:
stp q3, q2, [sp]
stp q1, q0, [sp, #32]
and w10, w10, #0xff
cmp w10, #2
b.eq LBB28_23
cmp w10, #1
b.ne LBB28_29
; Mode 1, scalar path: dst += w8*acc.
cbz x1, LBB28_35
cbz x0, LBB28_35
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB28_17:
mov x13, x0
mov x14, x3
mov x15, x12
LBB28_18:
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x14]
; InlineAsm Start
fadd h1, h0, h2
; InlineAsm End
str h1, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB28_18
add x9, x9, #1
add x12, x12, #32
add x3, x3, x10
cmp x9, x1
b.ne LBB28_17
b LBB28_35
; Mode 1, vector path: dst += w8*acc via fmla into the loaded halves.
LBB28_20:
ldr q5, [x3]
; InlineAsm Start
fmla.8h v5, v4, v3
; InlineAsm End
ldr q3, [x3, #16]
; InlineAsm Start
fmla.8h v3, v4, v2
; InlineAsm End
stp q5, q3, [x3]
add x8, x3, x6, lsl #1
ldr q2, [x8]
; InlineAsm Start
fmla.8h v2, v4, v1
; InlineAsm End
str q2, [x8]
ldr q1, [x8, #16]
; InlineAsm Start
fmla.8h v1, v4, v0
; InlineAsm End
b LBB28_22
; Other modes, vector path: dst = w8*acc (destination is not read).
LBB28_21:
; InlineAsm Start
fmul.8h v5, v4, v3
; InlineAsm End
; InlineAsm Start
fmul.8h v3, v4, v2
; InlineAsm End
stp q5, q3, [x3]
add x8, x3, x6, lsl #1
; InlineAsm Start
fmul.8h v2, v4, v1
; InlineAsm End
; InlineAsm Start
fmul.8h v1, v4, v0
; InlineAsm End
str q2, [x8]
; Shared store of the last half-column for the two vector paths above.
LBB28_22:
str q1, [x8, #16]
b LBB28_35
; Mode 2, scalar path: dst = w9*dst + w8*acc.
LBB28_23:
cbz x1, LBB28_35
cbz x0, LBB28_35
mov x10, #0
lsl x11, x6, #1
lsl x12, x7, #1
mov x13, sp
LBB28_26:
mov x14, x0
mov x15, x3
mov x16, x13
LBB28_27:
ldr h0, [x15]
fmov s1, w9
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x16], #2
fmov s1, w8
; InlineAsm Start
fmul h3, h1, h0
; InlineAsm End
; InlineAsm Start
fadd h0, h2, h3
; InlineAsm End
str h0, [x15]
add x15, x15, x12
subs x14, x14, #1
b.ne LBB28_27
add x10, x10, #1
add x13, x13, #32
add x3, x3, x11
cmp x10, x1
b.ne LBB28_26
b LBB28_35
; Other modes, scalar path: dst = w8*acc.
LBB28_29:
cbz x1, LBB28_35
cbz x0, LBB28_35
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB28_32:
mov x13, x0
mov x14, x3
mov x15, x12
LBB28_33:
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
str h2, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB28_33
add x9, x9, #1
add x12, x12, #32
add x3, x3, x10
cmp x9, x1
b.ne LBB28_32
; Common exit.
LBB28_35:
add sp, sp, #64
.cfi_def_cfa_offset 0
ret
Lfunc_end28:
.cfi_endproc
.p2align 2
; ---------------------------------------------------------------------------
; gemm_f16::microkernel::neon::f16::x2x3
; Compiler-generated AArch64 leaf routine (Apple/LLVM syntax). It accumulates
; three columns, each held as two 8-lane f16 vectors (column 0 = v5/v4,
; column 1 = v3/v2, column 2 = v1/v0), and then writes them to the
; destination.
;
; Register inputs, as used below:
;   x0, x1  tile extents; the vector store path requires x0 == 16 and x1 == 3
;   x2      depth count; the main loop consumes two depth steps per iteration
;   x3      destination pointer
;   x4      pointer to the vector operand (one 32-byte ldp per depth step)
;   x5      pointer to the broadcast operand (one f16 per column per step)
;   x6, x7  destination strides in 2-byte elements (x6 between columns,
;           x7 between rows); the vector store path requires x7 == 1
; Stack inputs (offsets relative to the caller's sp):
;   +0   stride of x4 per depth step, in 2-byte elements
;   +8   stride of x5 per depth step, in 2-byte elements
;   +16  stride of x5 per column, in 2-byte elements
;   +24  f16 factor applied to the old destination value (mode 2 only)
;   +26  f16 factor applied to the accumulators
;   +28  u8 mode: 1 => dst += f*acc, 2 => dst = g*dst + f*acc,
;        anything else => dst = f*acc
; Scratch: x8-x17, v0-v7, v16-v18, 112 bytes of stack. x19/x20 are used by
; the main loop and are saved at [sp, #96] and restored before returning.
; ---------------------------------------------------------------------------
gemm_f16::microkernel::neon::f16::x2x3:
Lfunc_begin29:
.cfi_startproc
sub sp, sp, #112
.cfi_def_cfa_offset 112
stp x20, x19, [sp, #96]
.cfi_offset w19, -8
.cfi_offset w20, -16
ldr x8, [sp, #128]
movi.2d v0, #0000000000000000
; x9 = two column strides of the broadcast operand in bytes (column 2)
lsl x9, x8, #2
cmp x2, #2
b.hs LBB29_2
; Fewer than two depth steps: run the single-step tail only when x2 is odd.
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
movi.2d v4, #0000000000000000
movi.2d v5, #0000000000000000
tbnz w2, #0, LBB29_5
b LBB29_6
LBB29_2:
ldp x12, x13, [sp, #112]
; x10 = number of two-step iterations
lsr x10, x2, #1
; x11 = one column stride in bytes (column 1)
lsl x11, x8, #1
lsl x12, x12, #1
lsl x13, x13, #1
; x14/x15 = byte offsets of columns 2/1 in the next depth step;
; x16 = two depth steps
add x14, x9, x13
add x15, x11, x13
lsl x16, x13, #1
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
movi.2d v4, #0000000000000000
movi.2d v5, #0000000000000000
mov x17, x5
; Main loop, unrolled by two depth steps. x4 doubles as an address scratch
; register here, so the vector operand base is kept in x19.
LBB29_3:
mov x19, x4
ld1r.8h { v6 }, [x17], x16
ldp q7, q16, [x4]
; InlineAsm Start
fmla.8h v5, v7, v6
; InlineAsm End
; InlineAsm Start
fmla.8h v4, v16, v6
; InlineAsm End
add x4, x5, x11
ld1r.8h { v6 }, [x4]
; InlineAsm Start
fmla.8h v3, v7, v6
; InlineAsm End
; InlineAsm Start
fmla.8h v2, v16, v6
; InlineAsm End
add x4, x5, x9
ld1r.8h { v6 }, [x4]
; InlineAsm Start
fmla.8h v1, v7, v6
; InlineAsm End
; InlineAsm Start
fmla.8h v0, v16, v6
; InlineAsm End
; second depth step of this iteration
add x4, x5, x13
ld1r.8h { v6 }, [x4]
add x20, x19, x12
add x4, x20, x12
ldp q7, q16, [x20]
; InlineAsm Start
fmla.8h v5, v7, v6
; InlineAsm End
; InlineAsm Start
fmla.8h v4, v16, v6
; InlineAsm End
add x20, x5, x15
ld1r.8h { v6 }, [x20]
; InlineAsm Start
fmla.8h v3, v7, v6
; InlineAsm End
; InlineAsm Start
fmla.8h v2, v16, v6
; InlineAsm End
add x5, x5, x14
ld1r.8h { v6 }, [x5]
; InlineAsm Start
fmla.8h v1, v7, v6
; InlineAsm End
; InlineAsm Start
fmla.8h v0, v16, v6
; InlineAsm End
mov x5, x17
subs x10, x10, #1
b.ne LBB29_3
add x4, x19, x12, lsl #1
tbz w2, #0, LBB29_6
; Odd-depth tail: one more depth step. This label is the target of the tbnz
; in the short-depth path above; it was missing from the listing, which left
; that branch without a destination.
LBB29_5:
mov x10, x5
ld1r.8h { v6 }, [x10], x9
ldp q7, q16, [x4]
; InlineAsm Start
fmla.8h v5, v7, v6
; InlineAsm End
; InlineAsm Start
fmla.8h v4, v16, v6
; InlineAsm End
add x8, x5, x8, lsl #1
ld1r.8h { v6 }, [x8]
; InlineAsm Start
fmla.8h v3, v7, v6
; InlineAsm End
; InlineAsm Start
fmla.8h v2, v16, v6
; InlineAsm End
ld1r.8h { v6 }, [x10]
; InlineAsm Start
fmla.8h v1, v7, v6
; InlineAsm End
; InlineAsm Start
fmla.8h v0, v16, v6
; InlineAsm End
; Write-back: w8 = accumulator factor, w9 = destination factor, w10 = mode.
LBB29_6:
ldrh w8, [sp, #138]
ldrh w9, [sp, #136]
ldrb w10, [sp, #140]
; The vector path needs a full 16x3 tile with unit row stride.
cmp x0, #16
b.ne LBB29_12
cmp x1, #3
b.ne LBB29_12
cmp x7, #1
b.ne LBB29_12
dup.8h v6, w8
and w8, w10, #0xff
cmp w8, #1
b.eq LBB29_20
cmp w8, #2
b.ne LBB29_21
; Mode 2, vector path: dst = w9*dst + w8*acc, one column per group.
dup.8h v7, w9
ldp q16, q17, [x3]
; InlineAsm Start
fmul.8h v18, v7, v16
; InlineAsm End
; InlineAsm Start
fmul.8h v16, v6, v5
; InlineAsm End
; InlineAsm Start
fadd.8h v5, v18, v16
; InlineAsm End
; InlineAsm Start
fmul.8h v16, v7, v17
; InlineAsm End
; InlineAsm Start
fmul.8h v17, v6, v4
; InlineAsm End
; InlineAsm Start
fadd.8h v4, v16, v17
; InlineAsm End
stp q5, q4, [x3]
add x8, x3, x6, lsl #1
ldp q4, q5, [x8]
; InlineAsm Start
fmul.8h v16, v7, v4
; InlineAsm End
; InlineAsm Start
fmul.8h v4, v6, v3
; InlineAsm End
; InlineAsm Start
fadd.8h v3, v16, v4
; InlineAsm End
; InlineAsm Start
fmul.8h v4, v7, v5
; InlineAsm End
; InlineAsm Start
fmul.8h v5, v6, v2
; InlineAsm End
; InlineAsm Start
fadd.8h v2, v4, v5
; InlineAsm End
stp q3, q2, [x8]
add x8, x3, x6, lsl #2
ldp q2, q3, [x8]
; InlineAsm Start
fmul.8h v4, v7, v2
; InlineAsm End
; InlineAsm Start
fmul.8h v2, v6, v1
; InlineAsm End
; InlineAsm Start
fadd.8h v1, v4, v2
; InlineAsm End
; InlineAsm Start
fmul.8h v2, v7, v3
; InlineAsm End
; InlineAsm Start
fmul.8h v3, v6, v0
; InlineAsm End
; InlineAsm Start
fadd.8h v0, v2, v3
; InlineAsm End
stp q1, q0, [x8]
b LBB29_35
; Scalar path: spill the accumulators (column j at sp + 32*j) and write the
; x0-by-x1 tile one f16 at a time using both destination strides.
LBB29_12:
stp q5, q4, [sp]
stp q3, q2, [sp, #32]
and w10, w10, #0xff
stp q1, q0, [sp, #64]
cmp w10, #2
b.eq LBB29_23
cmp w10, #1
b.ne LBB29_29
; Mode 1, scalar path: dst += w8*acc.
cbz x1, LBB29_35
cbz x0, LBB29_35
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB29_17:
mov x13, x0
mov x14, x3
mov x15, x12
LBB29_18:
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x14]
; InlineAsm Start
fadd h1, h0, h2
; InlineAsm End
str h1, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB29_18
add x9, x9, #1
add x12, x12, #32
add x3, x3, x10
cmp x9, x1
b.ne LBB29_17
b LBB29_35
; Mode 1, vector path: dst += w8*acc via fmla into the loaded halves.
LBB29_20:
ldr q7, [x3]
; InlineAsm Start
fmla.8h v7, v6, v5
; InlineAsm End
ldr q5, [x3, #16]
; InlineAsm Start
fmla.8h v5, v6, v4
; InlineAsm End
stp q7, q5, [x3]
add x8, x3, x6, lsl #1
ldr q4, [x8]
; InlineAsm Start
fmla.8h v4, v6, v3
; InlineAsm End
ldr q3, [x8, #16]
; InlineAsm Start
fmla.8h v3, v6, v2
; InlineAsm End
stp q4, q3, [x8]
add x8, x3, x6, lsl #2
ldr q2, [x8]
; InlineAsm Start
fmla.8h v2, v6, v1
; InlineAsm End
str q2, [x8]
ldr q1, [x8, #16]
; InlineAsm Start
fmla.8h v1, v6, v0
; InlineAsm End
b LBB29_22
; Other modes, vector path: dst = w8*acc (destination is not read).
LBB29_21:
; InlineAsm Start
fmul.8h v7, v6, v5
; InlineAsm End
; InlineAsm Start
fmul.8h v5, v6, v4
; InlineAsm End
stp q7, q5, [x3]
add x8, x3, x6, lsl #1
; InlineAsm Start
fmul.8h v4, v6, v3
; InlineAsm End
; InlineAsm Start
fmul.8h v3, v6, v2
; InlineAsm End
stp q4, q3, [x8]
add x8, x3, x6, lsl #2
; InlineAsm Start
fmul.8h v2, v6, v1
; InlineAsm End
; InlineAsm Start
fmul.8h v1, v6, v0
; InlineAsm End
str q2, [x8]
; Shared store of the last half-column for the two vector paths above.
LBB29_22:
str q1, [x8, #16]
b LBB29_35
; Mode 2, scalar path: dst = w9*dst + w8*acc.
LBB29_23:
cbz x1, LBB29_35
cbz x0, LBB29_35
mov x10, #0
lsl x11, x6, #1
lsl x12, x7, #1
mov x13, sp
LBB29_26:
mov x14, x0
mov x15, x3
mov x16, x13
LBB29_27:
ldr h0, [x15]
fmov s1, w9
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x16], #2
fmov s1, w8
; InlineAsm Start
fmul h3, h1, h0
; InlineAsm End
; InlineAsm Start
fadd h0, h2, h3
; InlineAsm End
str h0, [x15]
add x15, x15, x12
subs x14, x14, #1
b.ne LBB29_27
add x10, x10, #1
add x13, x13, #32
add x3, x3, x11
cmp x10, x1
b.ne LBB29_26
b LBB29_35
; Other modes, scalar path: dst = w8*acc.
LBB29_29:
cbz x1, LBB29_35
cbz x0, LBB29_35
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB29_32:
mov x13, x0
mov x14, x3
mov x15, x12
LBB29_33:
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
str h2, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB29_33
add x9, x9, #1
add x12, x12, #32
add x3, x3, x10
cmp x9, x1
b.ne LBB29_32
; Common exit: restore the callee-saved pair and release the frame.
LBB29_35:
ldp x20, x19, [sp, #96]
add sp, sp, #112
.cfi_def_cfa_offset 0
.cfi_restore w19
.cfi_restore w20
ret
Lfunc_end29:
.cfi_endproc
; ---------------------------------------------------------------------------
; gemm_f16::microkernel::neon::f16::x2x4
;
; Compiler-generated half-precision NEON micro-kernel. It accumulates a tile
; of 16 rows x 4 columns (two 8-lane .8h vectors per column) over x2 steps,
; then merges that tile into the destination.
;
; Arguments as they are used by the code below. The short names are the
; reviewer's reading of the code, not taken from the Rust source -- TODO
; confirm against the crate. "entry" is the value of sp on entry (sp + 160
; after the prologue).
;   x0             row count; the vector write-back needs x0 == 16
;   x1             column count; the vector write-back needs x1 == 4
;   x2             number of accumulation steps
;   x3             destination pointer
;   x4             left operand: 16 contiguous halfwords are read per step
;   x5             right operand: 4 halfwords are read and broadcast per step
;   x6             destination column stride, in halfword elements
;   x7             destination row stride, in halfword elements
;                  (the vector write-back needs x7 == 1)
;   [entry, #0]    advance of x4 per step, in halfword elements
;   [entry, #8]    advance of x5 per step, in halfword elements
;   [entry, #16]   spacing of the 4 right-operand values, in halfword elements
;   [entry, #24]   dst_scale: f16 factor for the existing destination (mode 2)
;   [entry, #26]   acc_scale: f16 factor for the accumulated tile
;   [entry, #28]   mode byte: 2    => dst = dst_scale*dst + acc_scale*acc
;                             1    => dst = dst + acc_scale*acc
;                             else => dst = acc_scale*acc (dst is not read)
;
; Accumulators (low rows : high rows of one column):
;   v7:v6 column 0, v5:v4 column 1, v3:v2 column 2, v1:v0 column 3
;
; Saves and restores x19-x22; makes no calls.
; ---------------------------------------------------------------------------
.p2align 2
gemm_f16::microkernel::neon::f16::x2x4:
Lfunc_begin30:
.cfi_startproc
; Frame is 160 bytes: [sp, #0..#127] is scratch for the eight accumulator
; vectors (scalar write-back only), [sp, #128..#159] holds x19-x22.
sub sp, sp, #160
.cfi_def_cfa_offset 160
stp x22, x21, [sp, #128]
stp x20, x19, [sp, #144]
.cfi_offset w19, -8
.cfi_offset w20, -16
.cfi_offset w21, -24
.cfi_offset w22, -32
; x12 = [entry, #8], x9 = [entry, #16], x11 = [entry, #0]
ldp x12, x9, [sp, #168]
ldr x11, [sp, #160]
; x8 = number of two-step loop iterations (steps / 2)
lsr x8, x2, #1
movi.2d v0, #0000000000000000
; Right-operand spacing of exactly 1 element selects the contiguous variant.
cmp x9, #1
b.ne LBB30_3
; Contiguous variant: with at least two steps, enter the unrolled loop.
cmp x2, #2
b.hs LBB30_5
; Fewer than two steps: clear the rest of the tile, then run the single
; leftover step if bit 0 of the step count is set.
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
movi.2d v4, #0000000000000000
movi.2d v5, #0000000000000000
movi.2d v6, #0000000000000000
movi.2d v7, #0000000000000000
tbnz w2, #0, LBB30_8
b LBB30_13
LBB30_3:
; Strided variant: x10 = right-operand spacing in bytes.
lsl x10, x9, #1
cmp x2, #2
b.hs LBB30_9
; Fewer than two steps: clear the rest of the tile, then run the single
; leftover step if bit 0 of the step count is set.
movi.2d v1, #0000000000000000
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
movi.2d v4, #0000000000000000
movi.2d v5, #0000000000000000
movi.2d v6, #0000000000000000
movi.2d v7, #0000000000000000
tbnz w2, #0, LBB30_12
b LBB30_13
LBB30_5:
; Contiguous variant setup: x9 = x4 advance in bytes, x10 = x5 advance in
; bytes; clear the rest of the tile.
lsl x9, x11, #1
movi.2d v1, #0000000000000000
lsl x10, x12, #1
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
movi.2d v4, #0000000000000000
movi.2d v5, #0000000000000000
movi.2d v6, #0000000000000000
movi.2d v7, #0000000000000000
LBB30_6:
; Two steps per iteration. x11 remembers x4 at the start of the iteration.
; First step: v18:v19 = 16 left values; lanes 0-3 of v16 are the four right
; values (the other lanes of the 16-byte load are not used).
mov x11, x4
ldr q16, [x5]
dup.8h v17, v16[0]
ldp q18, q19, [x4]
; InlineAsm Start
fmla.8h v7, v18, v17
; InlineAsm End
; InlineAsm Start
fmla.8h v6, v19, v17
; InlineAsm End
dup.8h v17, v16[1]
; InlineAsm Start
fmla.8h v5, v18, v17
; InlineAsm End
; InlineAsm Start
fmla.8h v4, v19, v17
; InlineAsm End
dup.8h v17, v16[2]
; InlineAsm Start
fmla.8h v3, v18, v17
; InlineAsm End
; InlineAsm Start
fmla.8h v2, v19, v17
; InlineAsm End
dup.8h v16, v16[3]
; InlineAsm Start
fmla.8h v1, v18, v16
; InlineAsm End
; InlineAsm Start
fmla.8h v0, v19, v16
; InlineAsm End
; Second step: left values at x4 + x9, right values at x5 + x10.
; x4 is advanced by two steps for the next iteration.
add x12, x4, x9
add x4, x12, x9
ldp q16, q17, [x12]
ldr q18, [x5, x10]
dup.8h v19, v18[0]
; InlineAsm Start
fmla.8h v7, v16, v19
; InlineAsm End
; InlineAsm Start
fmla.8h v6, v17, v19
; InlineAsm End
dup.8h v19, v18[1]
; InlineAsm Start
fmla.8h v5, v16, v19
; InlineAsm End
; InlineAsm Start
fmla.8h v4, v17, v19
; InlineAsm End
dup.8h v19, v18[2]
; InlineAsm Start
fmla.8h v3, v16, v19
; InlineAsm End
; InlineAsm Start
fmla.8h v2, v17, v19
; InlineAsm End
dup.8h v18, v18[3]
; InlineAsm Start
fmla.8h v1, v16, v18
; InlineAsm End
; InlineAsm Start
fmla.8h v0, v17, v18
; InlineAsm End
; x5 advances by two steps; loop until the pair counter reaches zero.
add x5, x5, x10, lsl #1
subs x8, x8, #1
b.ne LBB30_6
; Loop exit: x4 = start of last iteration + two steps (the same value x4
; already holds). Skip the tail when the step count is even.
add x4, x11, x9, lsl #1
tbz w2, #0, LBB30_13
; Label restored: it is the target of the tbnz in the "fewer than two steps"
; block above but was missing from the listing.
LBB30_8:
; Odd-step tail of the contiguous variant: one more step at x4 / x5.
ldr q16, [x5]
dup.8h v17, v16[0]
ldp q18, q19, [x4]
; InlineAsm Start
fmla.8h v7, v18, v17
; InlineAsm End
; InlineAsm Start
fmla.8h v6, v19, v17
; InlineAsm End
dup.8h v17, v16[1]
; InlineAsm Start
fmla.8h v5, v18, v17
; InlineAsm End
; InlineAsm Start
fmla.8h v4, v19, v17
; InlineAsm End
dup.8h v17, v16[2]
; InlineAsm Start
fmla.8h v3, v18, v17
; InlineAsm End
; InlineAsm Start
fmla.8h v2, v19, v17
; InlineAsm End
dup.8h v16, v16[3]
; InlineAsm Start
fmla.8h v1, v18, v16
; InlineAsm End
; InlineAsm Start
fmla.8h v0, v19, v16
; InlineAsm End
b LBB30_13
LBB30_9:
; Strided variant setup. All offsets below are in bytes (s = x9, the
; right-operand spacing in elements):
;   x11 = x4 advance per step        x12 = x5 advance per step
;   x10 = 2*s  (value 1)             x15 = 4*s  (value 2)
;   x13 = 6*s  (value 3)             x17 = x10 + x12 (value 1, second step)
;   x16 = x15 + x12 (value 2, 2nd)   x14 = x13 + x12 (value 3, second step)
;   x19 = 2 * x12 (x5 advance for two steps)
;   x20 = running right-operand pointer, starts equal to x5
lsl x11, x11, #1
lsl x12, x12, #1
add x13, x10, x9
lsl x13, x13, #1
movi.2d v1, #0000000000000000
add x14, x13, x12
lsl x15, x9, #2
add x16, x15, x12
add x17, x10, x12
lsl x19, x12, #1
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
movi.2d v4, #0000000000000000
movi.2d v5, #0000000000000000
movi.2d v6, #0000000000000000
movi.2d v7, #0000000000000000
mov x20, x5
LBB30_10:
; Two steps per iteration. x21 remembers x4 at the start of the iteration.
; First step: broadcast value 0 from x20 (equal to x5 here) and post-advance
; x20 by two steps; v17:v18 = 16 left values.
mov x21, x4
ld1r.8h { v16 }, [x20], x19
ldp q17, q18, [x4]
; InlineAsm Start
fmla.8h v7, v17, v16
; InlineAsm End
; InlineAsm Start
fmla.8h v6, v18, v16
; InlineAsm End
; x4 is reused as a temporary address from here on; the left pointer is
; rebuilt from x21 below.
add x4, x5, x10
ld1r.8h { v16 }, [x4]
; InlineAsm Start
fmla.8h v5, v17, v16
; InlineAsm End
; InlineAsm Start
fmla.8h v4, v18, v16
; InlineAsm End
add x4, x5, x15
ld1r.8h { v16 }, [x4]
; InlineAsm Start
fmla.8h v3, v17, v16
; InlineAsm End
; InlineAsm Start
fmla.8h v2, v18, v16
; InlineAsm End
add x4, x5, x13
ld1r.8h { v16 }, [x4]
; InlineAsm Start
fmla.8h v1, v17, v16
; InlineAsm End
; InlineAsm Start
fmla.8h v0, v18, v16
; InlineAsm End
; Second step: value 0 at x5 + x12; left values at x21 + x11.
; x4 becomes the left pointer for the next iteration (x21 + 2*x11).
add x4, x5, x12
ld1r.8h { v16 }, [x4]
add x22, x21, x11
add x4, x22, x11
ldp q17, q18, [x22]
; InlineAsm Start
fmla.8h v7, v17, v16
; InlineAsm End
; InlineAsm Start
fmla.8h v6, v18, v16
; InlineAsm End
add x22, x5, x17
ld1r.8h { v16 }, [x22]
; InlineAsm Start
fmla.8h v5, v17, v16
; InlineAsm End
; InlineAsm Start
fmla.8h v4, v18, v16
; InlineAsm End
add x22, x5, x16
ld1r.8h { v16 }, [x22]
; InlineAsm Start
fmla.8h v3, v17, v16
; InlineAsm End
; InlineAsm Start
fmla.8h v2, v18, v16
; InlineAsm End
add x5, x5, x14
ld1r.8h { v16 }, [x5]
; InlineAsm Start
fmla.8h v1, v17, v16
; InlineAsm End
; InlineAsm Start
fmla.8h v0, v18, v16
; InlineAsm End
; x5 takes the value already advanced by two steps through x20.
mov x5, x20
subs x8, x8, #1
b.ne LBB30_10
; Loop exit: x4 = start of last iteration + two steps (the same value x4
; already holds). Skip the tail when the step count is even.
add x4, x21, x11, lsl #1
tbz w2, #0, LBB30_13
; Label restored: it is the target of the tbnz in block LBB30_3 but was
; missing from the listing.
LBB30_12:
; Odd-step tail of the strided variant. x8 = 6*s bytes (offset of value 3);
; x11 walks from value 0 to value 3 through the post-indexed load.
add x8, x10, x9
lsl x8, x8, #1
mov x11, x5
ld1r.8h { v16 }, [x11], x8
ldp q17, q18, [x4]
; InlineAsm Start
fmla.8h v7, v17, v16
; InlineAsm End
; InlineAsm Start
fmla.8h v6, v18, v16
; InlineAsm End
add x8, x5, x10
ld1r.8h { v16 }, [x8]
; InlineAsm Start
fmla.8h v5, v17, v16
; InlineAsm End
; InlineAsm Start
fmla.8h v4, v18, v16
; InlineAsm End
add x8, x5, x9, lsl #2
ld1r.8h { v16 }, [x8]
; InlineAsm Start
fmla.8h v3, v17, v16
; InlineAsm End
; InlineAsm Start
fmla.8h v2, v18, v16
; InlineAsm End
ld1r.8h { v16 }, [x11]
; InlineAsm Start
fmla.8h v1, v17, v16
; InlineAsm End
; InlineAsm Start
fmla.8h v0, v18, v16
; InlineAsm End
LBB30_13:
; Write-back. w8 = acc_scale bits, w9 = dst_scale bits, w10 = mode byte.
ldrh w8, [sp, #186]
ldrh w9, [sp, #184]
ldrb w10, [sp, #188]
; The vector write-back requires a full 16 x 4 tile with unit row stride;
; anything else goes through the scalar loops at LBB30_19.
cmp x0, #16
b.ne LBB30_19
cmp x1, #4
b.ne LBB30_19
cmp x7, #1
b.ne LBB30_19
; v16 = acc_scale in every lane; dispatch on the mode byte.
dup.8h v16, w8
and w8, w10, #0xff
cmp w8, #1
b.eq LBB30_27
cmp w8, #2
b.ne LBB30_28
; Mode 2, vector: dst = dst_scale*dst + acc_scale*acc. v17 = dst_scale.
; Column 0 at x3.
dup.8h v17, w9
ldp q18, q19, [x3]
; InlineAsm Start
fmul.8h v20, v17, v18
; InlineAsm End
; InlineAsm Start
fmul.8h v18, v16, v7
; InlineAsm End
; InlineAsm Start
fadd.8h v7, v20, v18
; InlineAsm End
; InlineAsm Start
fmul.8h v18, v17, v19
; InlineAsm End
; InlineAsm Start
fmul.8h v19, v16, v6
; InlineAsm End
; InlineAsm Start
fadd.8h v6, v18, v19
; InlineAsm End
stp q7, q6, [x3]
; Column 1 at x3 + 2*x6 bytes.
add x8, x3, x6, lsl #1
ldp q6, q7, [x8]
; InlineAsm Start
fmul.8h v18, v17, v6
; InlineAsm End
; InlineAsm Start
fmul.8h v6, v16, v5
; InlineAsm End
; InlineAsm Start
fadd.8h v5, v18, v6
; InlineAsm End
; InlineAsm Start
fmul.8h v6, v17, v7
; InlineAsm End
; InlineAsm Start
fmul.8h v7, v16, v4
; InlineAsm End
; InlineAsm Start
fadd.8h v4, v6, v7
; InlineAsm End
stp q5, q4, [x8]
; Column 2 at x3 + 4*x6 bytes.
add x8, x3, x6, lsl #2
ldp q4, q5, [x8]
; InlineAsm Start
fmul.8h v6, v17, v4
; InlineAsm End
; InlineAsm Start
fmul.8h v4, v16, v3
; InlineAsm End
; InlineAsm Start
fadd.8h v3, v6, v4
; InlineAsm End
; InlineAsm Start
fmul.8h v4, v17, v5
; InlineAsm End
; InlineAsm Start
fmul.8h v5, v16, v2
; InlineAsm End
; InlineAsm Start
fadd.8h v2, v4, v5
; InlineAsm End
stp q3, q2, [x8]
; Column 3 at x3 + 6*x6 bytes.
mov w8, #6
madd x8, x6, x8, x3
ldp q2, q3, [x8]
; InlineAsm Start
fmul.8h v4, v17, v2
; InlineAsm End
; InlineAsm Start
fmul.8h v2, v16, v1
; InlineAsm End
; InlineAsm Start
fadd.8h v1, v4, v2
; InlineAsm End
; InlineAsm Start
fmul.8h v2, v17, v3
; InlineAsm End
; InlineAsm Start
fmul.8h v3, v16, v0
; InlineAsm End
; InlineAsm Start
fadd.8h v0, v2, v3
; InlineAsm End
stp q1, q0, [x8]
b LBB30_42
LBB30_19:
; Scalar write-back. Spill the tile to the scratch area, one 32-byte slot
; (16 halfwords) per column, then dispatch on the mode byte.
; NOTE(review): the loops below read x0 halfwords from each of x1 slots, so
; they rely on x0 <= 16 and x1 <= 4; that is not checked here -- confirm
; the caller guarantees it.
stp q7, q6, [sp]
stp q5, q4, [sp, #32]
stp q3, q2, [sp, #64]
and w10, w10, #0xff
stp q1, q0, [sp, #96]
cmp w10, #2
b.eq LBB30_30
cmp w10, #1
b.ne LBB30_36
; Mode 1, scalar: dst = dst + acc_scale*acc. Nothing to do for an empty tile.
cbz x1, LBB30_42
cbz x0, LBB30_42
; x9 = column index, x10 = column stride in bytes, x11 = row stride in
; bytes, x12 = scratch slot of the current column.
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB30_24:
; Per column: x13 = rows remaining, x14 = dst cursor, x15 = scratch cursor.
mov x13, x0
mov x14, x3
mov x15, x12
LBB30_25:
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x14]
; InlineAsm Start
fadd h1, h0, h2
; InlineAsm End
str h1, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB30_25
; Next column: next scratch slot, dst advanced by one column stride.
add x9, x9, #1
add x12, x12, #32
add x3, x3, x10
cmp x9, x1
b.ne LBB30_24
b LBB30_42
LBB30_27:
; Mode 1, vector: dst += acc_scale*acc, as a fused multiply-add into the
; loaded destination. Columns at x3, x3 + 2*x6, x3 + 4*x6, x3 + 6*x6 bytes.
ldr q17, [x3]
; InlineAsm Start
fmla.8h v17, v16, v7
; InlineAsm End
ldr q7, [x3, #16]
; InlineAsm Start
fmla.8h v7, v16, v6
; InlineAsm End
stp q17, q7, [x3]
add x8, x3, x6, lsl #1
ldr q6, [x8]
; InlineAsm Start
fmla.8h v6, v16, v5
; InlineAsm End
ldr q5, [x8, #16]
; InlineAsm Start
fmla.8h v5, v16, v4
; InlineAsm End
stp q6, q5, [x8]
add x8, x3, x6, lsl #2
ldr q4, [x8]
; InlineAsm Start
fmla.8h v4, v16, v3
; InlineAsm End
ldr q3, [x8, #16]
; InlineAsm Start
fmla.8h v3, v16, v2
; InlineAsm End
stp q4, q3, [x8]
mov w8, #6
madd x8, x6, x8, x3
ldr q2, [x8]
; InlineAsm Start
fmla.8h v2, v16, v1
; InlineAsm End
str q2, [x8]
ldr q1, [x8, #16]
; InlineAsm Start
fmla.8h v1, v16, v0
; InlineAsm End
; The high half of column 3 is stored by the shared tail at LBB30_29.
b LBB30_29
LBB30_28:
; Any other mode, vector: dst = acc_scale*acc; the destination is not read.
; InlineAsm Start
fmul.8h v17, v16, v7
; InlineAsm End
; InlineAsm Start
fmul.8h v7, v16, v6
; InlineAsm End
stp q17, q7, [x3]
add x8, x3, x6, lsl #1
; InlineAsm Start
fmul.8h v6, v16, v5
; InlineAsm End
; InlineAsm Start
fmul.8h v5, v16, v4
; InlineAsm End
stp q6, q5, [x8]
add x8, x3, x6, lsl #2
; InlineAsm Start
fmul.8h v4, v16, v3
; InlineAsm End
; InlineAsm Start
fmul.8h v3, v16, v2
; InlineAsm End
stp q4, q3, [x8]
mov w8, #6
madd x8, x6, x8, x3
; InlineAsm Start
fmul.8h v2, v16, v1
; InlineAsm End
; InlineAsm Start
fmul.8h v1, v16, v0
; InlineAsm End
str q2, [x8]
LBB30_29:
; Shared tail of LBB30_27 / LBB30_28: store the high half of column 3.
str q1, [x8, #16]
b LBB30_42
LBB30_30:
; Mode 2, scalar: dst = dst_scale*dst + acc_scale*acc.
cbz x1, LBB30_42
cbz x0, LBB30_42
; x10 = column index, x11 = column stride in bytes, x12 = row stride in
; bytes, x13 = scratch slot of the current column.
mov x10, #0
lsl x11, x6, #1
lsl x12, x7, #1
mov x13, sp
LBB30_33:
; Per column: x14 = rows remaining, x15 = dst cursor, x16 = scratch cursor.
mov x14, x0
mov x15, x3
mov x16, x13
LBB30_34:
ldr h0, [x15]
fmov s1, w9
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
ldr h0, [x16], #2
fmov s1, w8
; InlineAsm Start
fmul h3, h1, h0
; InlineAsm End
; InlineAsm Start
fadd h0, h2, h3
; InlineAsm End
str h0, [x15]
add x15, x15, x12
subs x14, x14, #1
b.ne LBB30_34
add x10, x10, #1
add x13, x13, #32
add x3, x3, x11
cmp x10, x1
b.ne LBB30_33
b LBB30_42
LBB30_36:
; Any other mode, scalar: dst = acc_scale*acc; the destination is not read.
cbz x1, LBB30_42
cbz x0, LBB30_42
; x9 = column index, x10 = column stride in bytes, x11 = row stride in
; bytes, x12 = scratch slot of the current column.
mov x9, #0
lsl x10, x6, #1
lsl x11, x7, #1
mov x12, sp
LBB30_39:
; Per column: x13 = rows remaining, x14 = dst cursor, x15 = scratch cursor.
mov x13, x0
mov x14, x3
mov x15, x12
LBB30_40:
ldr h0, [x15], #2
fmov s1, w8
; InlineAsm Start
fmul h2, h1, h0
; InlineAsm End
str h2, [x14]
add x14, x14, x11
subs x13, x13, #1
b.ne LBB30_40
add x9, x9, #1
add x12, x12, #32
add x3, x3, x10
cmp x9, x1
b.ne LBB30_39
LBB30_42:
; Epilogue: restore x19-x22 and release the frame.
ldp x20, x19, [sp, #144]
ldp x22, x21, [sp, #128]
add sp, sp, #160
.cfi_def_cfa_offset 0
.cfi_restore w19
.cfi_restore w20
.cfi_restore w21
.cfi_restore w22
ret
; Terminator restored: the listing stopped at `ret`, leaving the
; .cfi_startproc above unclosed.
Lfunc_end30:
.cfi_endproc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment