Skip to content

Instantly share code, notes, and snippets.

@farnoy
Last active June 17, 2025 16:05
Show Gist options
  • Save farnoy/58be6004632b02537b834b620e3a3654 to your computer and use it in GitHub Desktop.
Save farnoy/58be6004632b02537b834b620e3a3654 to your computer and use it in GitHub Desktop.
amdgcn DPP `shuffle_xor(val, 1)` implementation in mojo
# import random
from memory import UnsafePointer
from compile.reflection import get_type_name
from sys.intrinsics import _type_is_eq
from sys.info import _current_target, is_triple
from gpu import *
from gpu.warp import *
from gpu.host import DeviceContext
from sys import has_accelerator, is_nvidia_gpu, llvm_intrinsic, is_amd_gpu
from layout import Layout, LayoutTensor
from layout.tensor_builder import LayoutTensorBuild as tb
fn _amdgcn_dpp[
dtype: DType,
width: Int, //,
dpp_ctrl: UInt32,
row_mask: UInt32 = 0b1111,
bank_mask: UInt32 = 0b1111,
](old: SIMD[dtype, width], src: SIMD[dtype, width]) -> SIMD[dtype, width]:
# constrained[is_amd_gpu()]()
constrained[
dtype.bitwidth() in (32, 64), "Can only use DPP with 32/64-bit dtypes"
]()
bound_ctrl = False
return llvm_intrinsic["llvm.amdgcn.update.dpp", SIMD[dtype, width]](
old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl
)
fn amdgcn_row_mirror[
dtype: DType, width: Int, //
](old: SIMD[dtype, width], src: SIMD[dtype, width]) -> SIMD[dtype, width]:
return _amdgcn_dpp[dpp_ctrl=0x140](old, src)
fn amdgcn_row_shift_left[
dtype: DType, width: Int, //, offset: Int
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
constrained[
offset > 0 and offset < 16, "Can only shift row by up to 15 positions"
]()
return _amdgcn_dpp[dpp_ctrl = 0x100 + offset](old, src)
fn amdgcn_row_rotate_left[
dtype: DType, width: Int, //, offset: Int
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
constrained[
offset > 0 and offset < 16, "Can only rotate row by up to 15 positions"
]()
return _amdgcn_dpp[dpp_ctrl = 0x110 + offset](old, src)
fn amdgcn_shift_left[
dtype: DType, width: Int, //
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
return _amdgcn_dpp[dpp_ctrl=0x130](old, src)
fn amdgcn_rotate_left[
dtype: DType, width: Int, //
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
return _amdgcn_dpp[dpp_ctrl=0x134](old, src)
fn amdgcn_row_read_lane[
dtype: DType, width: Int, //, offset: Int
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
constrained[
offset >= 0 and offset < 16, "Can only broadcast within each row (0-15)"
]()
return _amdgcn_dpp[dpp_ctrl = 0x150 + offset](old, src)
fn amdgcn_quad_perm[
dtype: DType, width: Int, //, perm: Int
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
constrained[
perm >= 0 and perm <= 0xFF, "DPP_QUAD_PERM must be between 0 and 0xFF"
]()
return _amdgcn_dpp[dpp_ctrl = 0x0 + perm](old, src)
fn amdgcn_quad_shuffle_xor[
dtype: DType, width: Int, //, mask: Int
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
constrained[
mask >= 0 and mask <= 3, "Quad shuffle mask must be between 0 and 3"
]()
fn calculate_bitmask(xor: Int) -> Int:
# calculate lane indices for a quad
mask = 0
for i in range(4):
mask |= (i ^ xor) << 2 * i
return mask
alias bitmask = calculate_bitmask(mask)
return amdgcn_quad_perm[bitmask](src, old)
from layout import Layout, IntTuple
from layout.layout import coalesce, print_layout
from layout.layout_tensor import LayoutTensor
from layout.tensor_builder import LayoutTensorBuild as tb
from algorithm.functional import tile, elementwise
from utils import StaticTuple, IndexList
from gpu import *
from gpu.host import DeviceContext
from gpu.host.info import _get_info_from_target, Vendor
from sys.info import _accelerator_arch
from benchmark import *
from gpu.warp import *
from time import sleep
from amdgcn_kernels import *
alias out_layout = Layout.row_major(32)
alias perdivisor = 3
alias divisor = 4
alias warps = 8
alias loops = 320
@export
fn kernel(output: LayoutTensor[mut=False, DType.uint32, out_layout]):
val = UInt32(thread_idx.x)
@parameter
for i in range(loops):
@parameter
if i % divisor < perdivisor:
val = amdgcn_quad_shuffle_xor[1](val)
@parameter
if i % divisor >= perdivisor:
val = shuffle_xor(val, 1)
if lane_id() == 0:
output[block_idx.x * warps + warp_id(), 1] = val
from layout import Layout, IntTuple
from layout.layout import coalesce, print_layout
from layout.layout_tensor import LayoutTensor
from layout.tensor_builder import LayoutTensorBuild as tb
from algorithm.functional import tile, elementwise
from utils import StaticTuple, IndexList
from gpu import *
from gpu.host import DeviceContext
from gpu.host.info import _get_info_from_target, Vendor
from sys.info import _accelerator_arch
from benchmark import *
from gpu.warp import *
from time import sleep
from amdgcn_kernels import *
def main():
var bench = Bench(BenchConfig(max_iters=8000, min_runtime_secs=0.1, min_warmuptime_secs=0.0))
alias layout = Layout.row_major(10240, 640)
buf = List[Scalar[DType.int32]](length=UInt(layout.size()), fill=0)
x = LayoutTensor[mut=True, DType.int32, layout, MutableAnyOrigin](buf.unsafe_ptr())
@parameter
@always_inline
fn gpu_launch[blocks: Int, warps: Int, loops: Int, *, new_method_percent: Int](mut b: Bencher) raises:
ctx = DeviceContext()
alias out_layout = Layout.row_major(blocks * warps, 1)
out = ctx.enqueue_create_buffer[DType.uint32](out_layout.size()).enqueue_fill(0)
out_tensor = LayoutTensor[mut=False, DType.uint32, out_layout](out.unsafe_ptr())
alias target = _get_info_from_target[_accelerator_arch()]()
alias percent = new_method_percent if target.vendor == Vendor.AMD_GPU and Bool(target.compute >= 9.0) else 0
fn kernel(output: LayoutTensor[mut=False, DType.uint32, out_layout]):
val = UInt32(thread_idx.x)
@parameter
for i in range(loops):
@parameter
if i % 100 < percent:
val = amdgcn_quad_shuffle_xor[1](val)
@parameter
if i % 100 >= percent:
val = shuffle_xor(val, 1)
if lane_id() == 0:
output[block_idx.x * warps + warp_id(), 1] = val
fun = ctx.compile_function[kernel]()
@parameter
@always_inline
fn launch(ctx: DeviceContext) raises:
ctx.enqueue_function(fun, out_tensor, grid_dim=blocks, block_dim=warps * WARP_SIZE)
b.iter_custom[launch](ctx)
alias BLOCKS = List[Int](1024, 10240)
alias SHUFFLES = List[Int](100, 1000)
alias PERCENTS = List[Int](0, 30, 60, 100)
@parameter
for block_ix in range(len(BLOCKS)):
alias blocks = BLOCKS[block_ix]
@parameter
for shuffles_ix in range(len(SHUFFLES)):
alias shuffles = SHUFFLES[shuffles_ix]
@parameter
for percent_ix in range(len(PERCENTS)):
alias percent = PERCENTS[percent_ix]
bench.bench_function[gpu_launch[blocks, 8, shuffles, new_method_percent=percent]](
BenchId("gpu launch", String("{}x8 x {} shuffles ({}% using new method)").format(blocks, shuffles, percent)),
List(ThroughputMeasure(BenchMetric.elements, blocks*8*WARP_SIZE*shuffles))
)
print(bench)

MI300X

name                                                        met (ms)              iters throughput (GElems/s)
gpu launch/1024x8 x 100 shuffles (0% using new method)      0.010453643           10000 5015.361630390477     
gpu launch/1024x8 x 100 shuffles (30% using new method)     0.0079553482          10000 6590.384063893019     
gpu launch/1024x8 x 100 shuffles (60% using new method)     0.0055413536          10000 9461.370593639793     
gpu launch/1024x8 x 100 shuffles (100% using new method)    0.0043070808          10000 12172.699430203398    
gpu launch/1024x8 x 1000 shuffles (0% using new method)     0.094118019           1000   5570.5379859301975    
gpu launch/1024x8 x 1000 shuffles (30% using new method)    0.06710963870246085   1788   7812.409813804808     
gpu launch/1024x8 x 1000 shuffles (60% using new method)    0.04129958278829604   2905   12694.752939455333    
gpu launch/1024x8 x 1000 shuffles (100% using new method)   0.032438968369829686 3699   16162.289565522113    
gpu launch/10240x8 x 100 shuffles (0% using new method)     0.08054313355704698   1490   6509.406535923489     
gpu launch/10240x8 x 100 shuffles (30% using new method)    0.05715032586946165   2099   9173.840954075014     
gpu launch/10240x8 x 100 shuffles (60% using new method)    0.03391186065573771   3538   15460.313585338272    
gpu launch/10240x8 x 100 shuffles (100% using new method)   0.028408811982003315 4223   18455.118796665312    
gpu launch/10240x8 x 1000 shuffles (0% using new method)    0.7921750860927153    151    6618.334875765558     
gpu launch/10240x8 x 1000 shuffles (30% using new method)   0.5608276901408451    213    9348.468508541928     
gpu launch/10240x8 x 1000 shuffles (60% using new method)   0.32620536512261583   367    16072.329153842331    
gpu launch/10240x8 x 1000 shuffles (100% using new method) 0.2676903504464286    448    19585.614465580933    

RX 6900 XT

name                                                        met (ms)              iters throughput (GElems/s)
gpu launch/1024x8 x 100 shuffles (0% using new method)      0.016253166147174414 7379   1612.8795929744024    
gpu launch/1024x8 x 100 shuffles (30% using new method)     0.012098112639293243 9961   2166.8173194931846    
gpu launch/1024x8 x 100 shuffles (60% using new method)     0.0090560127          10000 2894.6955871649784    
gpu launch/1024x8 x 100 shuffles (100% using new method)    0.01261612555251526   9502   2077.8486937912307    
gpu launch/1024x8 x 1000 shuffles (0% using new method)     0.14161556906729633   847    1851.0959051079196    
gpu launch/1024x8 x 1000 shuffles (30% using new method)    0.10176524299999999   1000   2575.9679068422215    
gpu launch/1024x8 x 1000 shuffles (60% using new method)    0.07156855217650566   1677   3662.8378251035233    
gpu launch/1024x8 x 1000 shuffles (100% using new method)   0.10833399199999999   1000   2419.776056992343     
gpu launch/10240x8 x 100 shuffles (0% using new method)     0.13085703707742638   917    2003.2854621711544    
gpu launch/10240x8 x 100 shuffles (30% using new method)    0.092669609           1000   2828.802266771192     
gpu launch/10240x8 x 100 shuffles (60% using new method)    0.06808482463110102   1762   3850.255933247321     
gpu launch/10240x8 x 100 shuffles (100% using new method)   0.11006811500000001   1000   2381.652488552203     
gpu launch/10240x8 x 1000 shuffles (0% using new method)    1.2215475714285715    98     2145.9991090926464    
gpu launch/10240x8 x 1000 shuffles (30% using new method)   0.8607480647482014    139    3045.536908371513     
gpu launch/10240x8 x 1000 shuffles (60% using new method)   0.6099580612244898    196    4297.738101431865     
gpu launch/10240x8 x 1000 shuffles (100% using new method) 1.01526641            100    2582.0217966238047    
.amdgcn_target "amdgcn-amd-amdhsa--gfx942"
.amdhsa_code_object_version 6
.text
.globl kernel
.p2align 8
.type kernel,@function
kernel:
kernel$local:
.type kernel$local,@function
v_mov_b32_dpp v1, v0 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v1, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
v_mbcnt_lo_u32_b32 v1, -1, 0
v_mbcnt_hi_u32_b32 v1, -1, v1
v_and_b32_e32 v4, 64, v1
v_xor_b32_e32 v2, 1, v1
v_add_u32_e32 v4, 64, v4
v_cmp_lt_u32_e32 vcc, v2, v4
s_nop 1
v_cndmask_b32_e32 v2, v1, v2, vcc
v_lshlrev_b32_e32 v2, 2, v2
ds_bpermute_b32 v3, v2, v3
v_cmp_eq_u32_e32 vcc, 0, v1
s_waitcnt lgkmcnt(0)
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v3, v2, v3
s_waitcnt lgkmcnt(0)
s_nop 0
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
s_nop 1
v_mov_b32_dpp v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1
ds_bpermute_b32 v2, v2, v3
s_and_saveexec_b64 s[4:5], vcc
s_cbranch_execz .LBB0_2
s_load_dword s6, s[0:1], 0xc
s_load_dwordx2 s[4:5], s[0:1], 0x0
s_ashr_i32 s3, s2, 31
v_lshrrev_b32_e32 v0, 6, v0
v_mov_b32_e32 v1, 0
s_waitcnt lgkmcnt(0)
s_ashr_i32 s0, s6, 31
v_lshl_add_u64 v[0:1], s[2:3], 3, v[0:1]
v_mul_lo_u32 v3, v1, s6
v_mul_lo_u32 v4, v0, s0
v_mad_u64_u32 v[0:1], s[0:1], v0, s6, 0
v_add3_u32 v1, v1, v4, v3
v_lshl_add_u64 v[0:1], v[0:1], 2, s[4:5]
global_store_dword v[0:1], v2, off
.LBB0_2:
s_endpgm
.section .rodata,"a",@progbits
.p2align 6, 0x0
.amdhsa_kernel kernel
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 24
.amdhsa_user_sgpr_count 2
.amdhsa_user_sgpr_dispatch_ptr 0
.amdhsa_user_sgpr_queue_ptr 0
.amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_user_sgpr_dispatch_id 0
.amdhsa_user_sgpr_kernarg_preload_length 0
.amdhsa_user_sgpr_kernarg_preload_offset 0
.amdhsa_user_sgpr_private_segment_size 0
.amdhsa_uses_dynamic_stack 0
.amdhsa_enable_private_segment 0
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 0
.amdhsa_system_sgpr_workgroup_id_z 0
.amdhsa_system_sgpr_workgroup_info 0
.amdhsa_system_vgpr_workitem_id 0
.amdhsa_next_free_vgpr 5
.amdhsa_next_free_sgpr 7
.amdhsa_accum_offset 8
.amdhsa_reserve_vcc 1
.amdhsa_float_round_mode_32 0
.amdhsa_float_round_mode_16_64 0
.amdhsa_float_denorm_mode_32 3
.amdhsa_float_denorm_mode_16_64 3
.amdhsa_dx10_clamp 1
.amdhsa_ieee_mode 1
.amdhsa_fp16_overflow 0
.amdhsa_tg_split 0
.amdhsa_exception_fp_ieee_invalid_op 0
.amdhsa_exception_fp_denorm_src 0
.amdhsa_exception_fp_ieee_div_zero 0
.amdhsa_exception_fp_ieee_overflow 0
.amdhsa_exception_fp_ieee_underflow 0
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.Lfunc_end0:
.size kernel, .Lfunc_end0-kernel
.size kernel$local, .Lfunc_end0-kernel
.set kernel.num_vgpr, 5
.set kernel.num_agpr, 0
.set kernel.numbered_sgpr, 7
.set kernel.private_seg_size, 0
.set kernel.uses_vcc, 1
.set kernel.uses_flat_scratch, 0
.set kernel.has_dyn_sized_stack, 0
.set kernel.has_recursion, 0
.set kernel.has_indirect_call, 0
.p2alignl 6, 3212836864
.fill 256, 4, 3212836864
.section .AMDGPU.gpr_maximums,"",@progbits
.set amdgpu.max_num_vgpr, 0
.set amdgpu.max_num_agpr, 0
.set amdgpu.max_num_sgpr, 0
.text
.section ".note.GNU-stack","",@progbits
.amdgpu_metadata
---
amdhsa.kernels:
- .agpr_count: 0
.args:
- .offset: 0
.size: 24
.value_kind: by_value
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 24
.max_flat_workgroup_size: 1024
.name: kernel
.private_segment_fixed_size: 0
.sgpr_count: 13
.sgpr_spill_count: 0
.symbol: kernel.kd
.uses_dynamic_stack: false
.vgpr_count: 5
.vgpr_spill_count: 0
.wavefront_size: 64
amdhsa.target: amdgcn-amd-amdhsa--gfx942
amdhsa.version:
- 1
- 2
...
.end_amdgpu_metadata
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment