
/******************************************/
/* Begin Kernel                           */
/******************************************/
/* Kernel descriptor: target, symbol visibility/alignment and the resource and */
/* user-SGPR layout that the prologue code further down depends on.            */
.amdgcn_target "amdgcn-amd-amdhsa--gfx950"
.text
.protected Custom_Cijk_Alik_Bljk_F8B8BS_BH_Bias_HA_S_SAB_SAV_NTD_SK3_UserArgs_MT256x256x128_MI16x16x1_shortname0_gfx950
.globl Custom_Cijk_Alik_Bljk_F8B8BS_BH_Bias_HA_S_SAB_SAV_NTD_SK3_UserArgs_MT256x256x128_MI16x16x1_shortname0_gfx950
.p2align 8
.type Custom_Cijk_Alik_Bljk_F8B8BS_BH_Bias_HA_S_SAB_SAV_NTD_SK3_UserArgs_MT256x256x128_MI16x16x1_shortname0_gfx950,@function
.section .rodata,#alloc
.p2align 6
.amdhsa_kernel Custom_Cijk_Alik_Bljk_F8B8BS_BH_Bias_HA_S_SAB_SAV_NTD_SK3_UserArgs_MT256x256x128_MI16x16x1_shortname0_gfx950
  .amdhsa_user_sgpr_kernarg_segment_ptr 1 // kernarg base pointer; the code reads it as s[sgprKernArgAddress:sgprKernArgAddress+1] (s[0:1])
  .amdhsa_accum_offset 256 // accvgpr offset
  .amdhsa_next_free_vgpr 512 // vgprs
  .amdhsa_next_free_sgpr 102 // sgprs
  .amdhsa_group_segment_fixed_size 135168 // lds bytes (0x21000; the same value is written to m0 as the LDS clamp)
  .amdhsa_private_segment_fixed_size 0
  .amdhsa_system_sgpr_workgroup_id_x 1 // copied from s13 into s[sgprWorkGroup0] at label_common_kernel_entry
  .amdhsa_system_sgpr_workgroup_id_y 1 // copied from s14
  .amdhsa_system_sgpr_workgroup_id_z 1 // copied from s15
  .amdhsa_system_vgpr_workitem_id 0 // only v0 is consumed (saved to v[vgprSerial])
  .amdhsa_float_denorm_mode_32 3
  .amdhsa_float_denorm_mode_16_64 3
  .amdhsa_user_sgpr_count 13 // s0..s12 are user SGPRs; the workgroup ids follow in s13..s15
  .amdhsa_user_sgpr_kernarg_preload_length 11 // preloaded kernarg dwords; the preload entry reads them from s2..s12
  .amdhsa_user_sgpr_kernarg_preload_offset 0
.end_amdhsa_kernel
.text
/* Num VGPR   =249 */
/* Num AccVGPR=256 */
/* Num SGPR   =105 */

/******************************************/
/* Optimizations and Config:              */
/******************************************/
/* ThreadTile= 32 x 8 */
/* SubGroup= 8 x 32 */
/* VectorWidthA=8 */
/* VectorWidthB=8 */
/* GlobalReadVectorWidthA=16, GlobalReadVectorWidthB=16 */
/* DirectToLdsA=True */
/* DirectToLdsB=True */
/* UseSgprForGRO=1 */
.amdgpu_metadata
---
custom.config:
  InternalSupportParams:
    KernArgsVersion: 2
  ProblemType:
      OperationType: GEMM
      DataType: f8b8
      DestDataType: b
      ComputeDataType: s
      HighPrecisionAccumulate: True
      TransposeA: 1
      TransposeB: 0
      UseBeta: True
      Batched: True
      UseBias: 1
      BiasDataTypeList: [0, 7]
      UseScaleAlphaVec: 1
      UseScaleAB: Scalar
      Activation: true
      ActivationType: hipblaslt_all
      ActivationFuncCall: true
  MIBlock: [16, 16, 128, 1, 1, 1]
  MatrixInstruction: [16, 16, 128, 1]
  WavefrontSize: 64
  WorkGroupMapping: 16
  WorkGroupMappingXCC: 2
  WorkGroupMappingXCCGroup: -1
  StaggerU: 0
  EnableMatrixInstruction: True
  MIWaveGroup: [2, 2]
  MIWaveTile: [8, 8]
  MIInputPerThread: 32
  MIInputPerThreadA: 32
  MIInputPerThreadB: 32
  DepthU: 128
  DirectToLds: 1
  LocalReadVectorWidth: 16
  GlobalReadVectorWidthA: 16
  GlobalReadVectorWidthB: 16
  GlobalSplitU: 0
  GlobalSplitUAlgorithm: MultipleBuffer
  GlobalSplitUCoalesced: false
  GlobalSplitUWorkGroupMappingRoundRobin: false
  PrefetchGlobalRead: 2
  PrefetchLocalRead: 1
  StreamK: 3
  StreamKAtomic: 0
  StreamKXCCMapping: 0
  TransposeLDS: 1
amdhsa.version:
  - 1
  - 1
amdhsa.kernels:
  - .name: Custom_Cijk_Alik_Bljk_F8B8BS_BH_Bias_HA_S_SAB_SAV_NTD_SK3_UserArgs_MT256x256x128_MI16x16x1_shortname0_gfx950
    .symbol: 'Custom_Cijk_Alik_Bljk_F8B8BS_BH_Bias_HA_S_SAB_SAV_NTD_SK3_UserArgs_MT256x256x128_MI16x16x1_shortname0_gfx950.kd'
    .language:                   OpenCL C
    .language_version:
      - 2
      - 0
    .args:
      - .name:            Gemm info
        .size:            4
        .offset:          0
        .value_kind:      by_value
        .value_type:      u32
      - .name:            kernel info0
        .size:            4
        .offset:          4
        .value_kind:      by_value
        .value_type:      u32
      - .name:            kernel info1
        .size:            4
        .offset:          8
        .value_kind:      by_value
        .value_type:      u32
      - .name:            numWG
        .size:            4
        .offset:          12
        .value_kind:      by_value
        .value_type:      u32
      - .name:            SizesFree0
        .size:            4
        .offset:          16
        .value_kind:      by_value
        .value_type:      u32
      - .name:            SizesFree1
        .size:            4
        .offset:          20
        .value_kind:      by_value
        .value_type:      u32
      - .name:            SizesFree2
        .size:            4
        .offset:          24
        .value_kind:      by_value
        .value_type:      u32
      - .name:            SizesSum0
        .size:            4
        .offset:          28
        .value_kind:      by_value
        .value_type:      u32
      - .name:            D
        .size:            8
        .offset:          32
        .value_kind:      global_buffer
        .value_type:      bf16
        .address_space:   generic
      - .name:            C
        .size:            8
        .offset:          40
        .value_kind:      global_buffer
        .value_type:      bf16
        .address_space:   generic
      - .name:            A
        .size:            8
        .offset:          48
        .value_kind:      global_buffer
        .value_type:      fp8
        .address_space:   generic
      - .name:            B
        .size:            8
        .offset:          56
        .value_kind:      global_buffer
        .value_type:      fp8
        .address_space:   generic
      - .name:            AddressWS
        .size:            8
        .offset:          64
        .value_kind:      global_buffer
        .value_type:      f32
        .address_space:   generic
      - .name:            AddressFlags
        .size:            8
        .offset:          72
        .value_kind:      global_buffer
        .value_type:      bf16
        .address_space:   generic
      - .name:            strideD0
        .size:            4
        .offset:          80
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideD1
        .size:            4
        .offset:          84
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideC0
        .size:            4
        .offset:          88
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideC1
        .size:            4
        .offset:          92
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideA0
        .size:            4
        .offset:          96
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideA1
        .size:            4
        .offset:          100
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideB0
        .size:            4
        .offset:          104
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideB1
        .size:            4
        .offset:          108
        .value_kind:      by_value
        .value_type:      u32
      - .name:            alpha
        .size:            4
        .offset:          112
        .value_kind:      by_value
        .value_type:      f32
      - .name:            beta
        .size:            4
        .offset:          116
        .value_kind:      by_value
        .value_type:      f32
      - .name:            ItersPerTile
        .size:            4
        .offset:          120
        .value_kind:      by_value
        .value_type:      u32
      - .name:            TotalIters
        .size:            4
        .offset:          124
        .value_kind:      by_value
        .value_type:      u32
      - .name:            SKItersPerWG
        .size:            4
        .offset:          128
        .value_kind:      by_value
        .value_type:      u32
      - .name:            skGridAndTiles
        .size:            4
        .offset:          132
        .value_kind:      by_value
        .value_type:      u32
      - .name:            skExtraIters
        .size:            4
        .offset:          136
        .value_kind:      by_value
        .value_type:      u32
      - .name:            AddressScaleA
        .size:            8
        .offset:          140
        .value_kind:      global_buffer
        .value_type:      f32
        .address_space:   generic
      - .name:            AddressScaleB
        .size:            8
        .offset:          148
        .value_kind:      global_buffer
        .value_type:      f32
        .address_space:   generic
      - .name:            AddressScaleAlphaVec
        .size:            8
        .offset:          156
        .value_kind:      global_buffer
        .value_type:      f32
        .address_space:   generic
      - .name:            bias
        .size:            8
        .offset:          164
        .value_kind:      global_buffer
        .value_type:      void
        .address_space:   generic
      - .name:            biasType
        .size:            4
        .offset:          172
        .value_kind:      by_value
        .value_type:      u32
      - .name:            StrideBias
        .size:            4
        .offset:          176
        .value_kind:      by_value
        .value_type:      u32
      - .name:            activationAlpha
        .size:            4
        .offset:          180
        .value_kind:      by_value
        .value_type:      f32
      - .name:            activationBeta
        .size:            4
        .offset:          184
        .value_kind:      by_value
        .value_type:      f32
      - .name:            activationType
        .size:            4
        .offset:          188
        .value_kind:      by_value
        .value_type:      u32
    .group_segment_fixed_size:   135168
    .kernarg_segment_align:      8
    .kernarg_segment_size:       192
    .max_flat_workgroup_size:    256
    .private_segment_fixed_size: 0
    .sgpr_count:                 102
    .sgpr_spill_count:           0
    .vgpr_count:                 249
    .vgpr_spill_count:           0
    .wavefront_size:             64
...
.end_amdgpu_metadata
Custom_Cijk_Alik_Bljk_F8B8BS_BH_Bias_HA_S_SAB_SAV_NTD_SK3_UserArgs_MT256x256x128_MI16x16x1_shortname0_gfx950:
/* V_MAGIC_DIV: multiply-and-shift division helper.                              */
/*   v[vgprDstIdx+0] = (mul_hi(dividend, magicNumber) + dividend*magicA) >> magicShift */
/*   v[vgprDstIdx+1] is overwritten as scratch (it receives the mul_hi result).  */
/* NOTE(review): dividend must not alias v[vgprDstIdx+1], which is written first. */
.macro V_MAGIC_DIV vgprDstIdx:req, dividend:req, magicNumber:req, magicShift:req, magicA:req
    v_mul_hi_u32 v[\vgprDstIdx+1], \dividend, \magicNumber // scratch = high 32 bits of dividend * magicNumber
    v_mul_lo_u32 v[\vgprDstIdx+0], \dividend, \magicA // dst = low 32 bits of dividend * magicA
    v_add_u32 v[\vgprDstIdx+0], v[\vgprDstIdx+0], v[\vgprDstIdx+1] // dst += scratch
    v_lshrrev_b32 v[\vgprDstIdx+0], \magicShift, v[\vgprDstIdx+0] // dst >>= magicShift
.endm

/******************************************/
/* VGPR Assignments                       */
/******************************************/
/* ValuC range: [0-0), serializedStore enabled */
/* Symbolic VGPR indices. vgprValuC shares index 0 with vgprGlobalReadOffsetA;   */
/* the ValuC range is documented as empty ([0-0)), so nothing overlaps.          */
.set vgprValuC, 0
/* ValuA/B   Xn=PLR buffer idx,  In=InnerUnroll idx */
.set vgprBase, 4 // first VGPR of the ValuA/ValuB region
.set vgprGlobalReadOffsetA, 0
.set vgprGlobalReadOffsetB, 1
.set vgprLocalReadAddrA, 2
.set vgprLocalReadAddrB, 3
.set vgprLocalReadSwapAddrA, 132
.set vgprLocalReadSwapAddrB, 133
.set vgprSerial, 134 // receives v0 (thread serial id) at label_common_kernel_entry

/******************************************/
/* VGPR Macro Assignments                 */
/******************************************/
/* ValuA starts at vgprBase (v4); ValuB starts 64 registers later (v68). */
.set vgprValuA_X0_I0_BASE, vgprBase+0
.set vgprValuB_X0_I0_BASE, vgprBase+64
.set vgprValuA_X0_I0, vgprValuA_X0_I0_BASE+0
.set vgprValuB_X0_I0, vgprValuB_X0_I0_BASE+0

/******************************************/
/* SGPR Assignments                       */
/******************************************/
/* Symbolic SGPR indices. s[20:50] (SizesFree .. skExtraIters) follow the kernarg */
/* layout starting at SizesFree0 and are filled by the "Load Kernel Args"         */
/* s_load sequences; the remaining entries are working state.                     */
.set sgprKernArgAddress, 0 // s[0:1]: kernarg base pointer
.set sgprWorkGroup0, 2 // s2..s4: workgroup ids, copied from s13..s15 at label_common_kernel_entry
.set sgprWorkGroup1, 3
.set sgprWorkGroup2, 4
.set sgprArgType, 5 // top two bits of the first kernarg dword
.set sgprStaggerU, 6 // upper 16 bits of the second kernarg dword
.set sgprWGM, 7 // third kernarg dword (also carries WGMXCC and CU_Count in its upper bits)
.set sgprLoopCounterL, 8
.set sgprOrigLoopCounter, 9
.set sgprSrdD, 12 // s[12:15]
.set sgprSrdC, 16 // s[16:19]
.set sgprNumWorkGroups0, 10 // ceil(SizesFree0 / MT0)
.set sgprNumWorkGroups1, 11 // ceil(SizesFree1 / MT1)
.set sgprSizesFree, 20 // s20..s22
.set sgprSizesSum, 23
.set sgprAddressD, 24 // 64-bit pointers occupy two SGPRs each
.set sgprAddressC, 26
.set sgprAddressA, 28
.set sgprAddressB, 30
.set sgprAddressWS, 32
.set sgprAddressFlags, 34
.set sgprStridesD, 36 // two strides per tensor
.set sgprStridesC, 38
.set sgprStridesA, 40
.set sgprStridesB, 42
.set sgprAlpha, 44
.set sgprBeta, 45
.set sgprItersPerTile, 46
.set sgprTotalIters, 47
.set sgprSKItersPerWG, 48
.set sgprskGridAndTiles, 49 // low 16 bits are read as skTiles
.set sgprskExtraIters, 50
.set sgprLocalWriteAddrA, 51
.set sgprLocalWriteAddrB, 52
.set sgprSwapA, 53
.set sgprSwapB, 54
.set sgprStreamKIdx, 55 // original WorkGroup0 value
.set sgprStreamKIter, 56 // first iteration owned by this workgroup
.set sgprStreamKIterEnd, 57 // end of the iteration range owned by this workgroup
.set sgprStreamKLocalStart, 58
.set sgprStreamKLocalEnd, 59
.set sgprSrdWS, 60 // s[60:63]

/* Size Assignments */
/* I, J, K alias the three SizesFree dwords; L aliases SizesSum. */
.set sgprSizeI, sgprSizesFree+0
.set sgprSizeJ, sgprSizesFree+1
.set sgprSizeK, sgprSizesFree+2
.set sgprSizeL, sgprSizesSum+0

/* Stride Assignments */
/* const* strides are compile-time 1; the others alias the loaded stride SGPRs. */
.set constStrideD0I, 1
.set sgprStrideD1J, sgprStridesD+0
.set sgprStrideDK, sgprStridesD+1
.set constStrideC0I, 1
.set sgprStrideC1J, sgprStridesC+0
.set sgprStrideCK, sgprStridesC+1
.set constStrideAL, 1
.set sgprStrideA0I, sgprStridesA+0
.set sgprStrideAK, sgprStridesA+1
.set constStrideBL, 1
.set sgprStrideB1J, sgprStridesB+0
.set sgprStrideBK, sgprStridesB+1

/* Macro-tile dimensions and unroll depth (match MT256x256x128 in the kernel name). */
.set MT0, 256
.set MT1, 256
.set DepthU, 128
/* Bytes per element for A/B (and for their global reads), with log2 forms. */
.set BpeA, 1
.set BpeALog2, 0
.set BpeB, 1
.set BpeBLog2, 0
.set BpeAGR, 1
.set BpeAGRLog2, 0
.set BpeBGR, 1
.set BpeBGRLog2, 0
/* Number of elements to shift-left SRD */
/* Same value (16) as the pre-pad subtracted from AddressA/B and the 0x10 added */
/* back in GLOBAL_OFFSET_A/B.                                                    */
.set SrdShiftLeftA, 16
.set SrdShiftLeftB, 16
/* 2GB limit - set offsets to -1 to exceed this and clamp */
.set BufferLimit, 0xffffffff
.set BufferOOB, 0x80000000

/******************************************/
/* Bits 127:96 of SRD.                    */
/* hex: 0x20000                           */
/* dst_sel_x (3b): 0                      */
/* dst_sel_y (3b): 0                      */
/* dst_sel_z (3b): 0                      */
/* dst_sel_w (3b): 0                      */
/* num_format (3b): 0                     */
/* data_format (4b): 4                    */
/* user_vm_enable (1b): 0                 */
/* user_vm_mode (1b): 0                   */
/* index_stride (2b): 0                   */
/* add_tid_enable (1b): 0                 */
/* _unusedA (3b): 0                       */
/* nv (1b): 0                             */
/* _unusedB (2b): 0                       */
/* type (2b): 0                           */
/******************************************/
/* With the field widths listed above, data_format starts at bit 15, so 4 << 15 = 0x20000. */
.set Srd127_96, 0x20000

/* Global Offset A */
/* v[vgprAddr] = v[vgprOffsetL] + s[sgprStrideA0I] * v[vgprOffset0I] + 0x10.     */
/* Clobbers v[vgprTmp] and vcc (carry-out of the add; the carry is not consumed). */
/* The 0x10 matches the 16 subtracted from AddressA as pointer-shift pre-pad.     */
.macro GLOBAL_OFFSET_A vgprAddr:req, vgprOffsetL:req, vgprOffset0I:req, vgprTmp:req
    v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideA0I], v[\vgprOffset0I] // tmp = StrideA0I * offset0I (low 32 bits)
    v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // addr = offsetL + tmp
    v_add_u32 v[\vgprAddr+0], 0x10, v[\vgprAddr+0]     // add prepad for pointer shift
                                                       // offset *= bytes/element (multiplier is 1, do nothing)
.endm

/* Global Offset B */
/* v[vgprAddr] = v[vgprOffsetL] + s[sgprStrideB1J] * v[vgprOffset1J] + 0x10.     */
/* Clobbers v[vgprTmp] and vcc (carry-out of the add; the carry is not consumed). */
/* The 0x10 matches the 16 subtracted from AddressB as pointer-shift pre-pad.     */
.macro GLOBAL_OFFSET_B vgprAddr:req, vgprOffsetL:req, vgprOffset1J:req, vgprTmp:req
    v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideB1J], v[\vgprOffset1J] // tmp = StrideB1J * offset1J (low 32 bits)
    v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // addr = offsetL + tmp
    v_add_u32 v[\vgprAddr+0], 0x10, v[\vgprAddr+0]     // add prepad for pointer shift
                                                       // offset *= bytes/element (multiplier is 1, do nothing)
.endm

/******************************************/
/* Allocate Resources                     */
/******************************************/

/* Entry point without kernarg preload: fetch the four common dwords from the    */
/* kernarg segment, decode the argument type, then either load the per-kernel    */
/* arguments directly (type 0) or replace the kernarg pointer with the pointer   */
/* stored at kernarg offset 16 (any other type).                                 */
/* Temporaries: s64 = gemm count, s65 = arg type, s66 = packed kernel args,      */
/* s67 = number of workgroups.                                                   */

/* Load num of Gemms */
s_load_dword s64, s[sgprKernArgAddress:sgprKernArgAddress+1], 0

/* Load packed kernel args (StaggerU/GSU) */
s_load_dword s66, s[sgprKernArgAddress:sgprKernArgAddress+1], 4

/* Load WGM data */
s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 8

/* Load num of WGs */
s_load_dword s67, s[sgprKernArgAddress:sgprKernArgAddress+1], 12
s_waitcnt lgkmcnt(0)                               // load args
s_lshr_b32 s65, s64, 0x1e                          // Get arg type (bits 31:30); must run before s64 is masked
s_and_b32 s64, 0x3fffffff, s64                     // Get nums of gemm (bits 29:0)
s_cmp_eq_u32 s65, 0                                // Is kernel args
s_cbranch_scc0 label_HBMArgs
s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args
s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0

/* Load Kernel Args */
/* Offsets below are relative to the shifted pointer, i.e. to SizesFree0. */
s_load_dwordx16 s[20:35], s[sgprKernArgAddress:sgprKernArgAddress+1], 0 // sizes, D/C/A/B/WS/Flags addresses
s_load_dwordx8 s[36:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 64 // strides D/C/A/B
s_load_dwordx4 s[44:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 96 // alpha, beta, ItersPerTile, TotalIters
s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // SKItersPerWG, skGridAndTiles
s_load_dword s50, s[sgprKernArgAddress:sgprKernArgAddress+1], 120 // skExtraIters
s_waitcnt lgkmcnt(0)                               // wait for all kernel args loaded above
s_branch label_LoadArgsEnd
label_HBMArgs:

/* Load address of kernel arguments */
s_load_dwordx2 s[sgprKernArgAddress:sgprKernArgAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 16
s_waitcnt lgkmcnt(0)                               // wait for args to load
label_LoadArgsEnd:
s_branch label_common_kernel_entry                 // skip the padding and the preload entry

/* pad 33 snops to satisfy 0x100 code size for Preload Backward Compatibility Prologue */
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
/* Entry point with kernarg preload: the first 11 kernarg dwords are taken from  */
/* s2..s12 instead of being loaded. s2..s5 hold the four common dwords; s6..s12  */
/* hold the dwords that belong in s20..s26 (or, for arg type != 0, s[6:7] holds  */
/* the address of the real argument buffer).                                     */
label_Preload_Offset_Start:
s_and_b32 s64, 0x3fffffff, s2                      // Get nums of gemm
s_lshr_b32 s65, s2, 0x1e                           // Get arg type
s_mov_b32 s66, s3                                  // Preload internal args
s_cmp_eq_u32 s65, 0                                // Is kernel args
s_cbranch_scc0 label_Preload_HBMArgs
s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args
s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0

/* Load Kernel Args */
/* Only the dwords that were not preloaded are fetched (s27 onward). There is no  */
/* s_waitcnt here; on this path (arg type 0) the next wait is the                 */
/* "s_waitcnt lgkmcnt(0)" after the NumWorkGroups computation.                    */
s_load_dword s27, s[sgprKernArgAddress:sgprKernArgAddress+1], 28 // 28
s_load_dwordx16 s[28:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 32 // 32
s_load_dwordx4 s[44:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 96 // 96
s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112
s_load_dword s50, s[sgprKernArgAddress:sgprKernArgAddress+1], 120 // 120
s_mov_b64 s[20:21], s[6:7]                         // move preload data to correct sgpr
s_mov_b64 s[22:23], s[8:9]                         // move preload data to correct sgpr
s_mov_b64 s[24:25], s[10:11]                       // move preload data to correct sgpr
s_mov_b32 s26, s12                                 // move preload data to correct sgpr
s_branch label_Preload_LoadArgsEnd
label_Preload_HBMArgs:
s_mov_b64 s[sgprKernArgAddress:sgprKernArgAddress+1], s[6:7] // Load address of kernel arguments
label_Preload_LoadArgsEnd:
/* s4/s5 are consumed here, after s6/s7 have been copied out above, because      */
/* sgprWGM is s7 and would otherwise overwrite preloaded data.                    */
s_mov_b32 s[sgprWGM], s4                           // Preload internal args2
s_mov_b32 s67, s5                                  // Load num of WGs
/* Common continuation of both entry points. Expects s64 = gemm count,           */
/* s65 = arg type, s66 = packed kernel args, s67 = number of workgroups,         */
/* s[sgprWGM] = WGM dword, s13..s15 = workgroup ids, v0 = work-item id.          */
label_common_kernel_entry:  /// for both preload/non-preload common code
s_mov_b32 s[sgprWorkGroup0+0], s13                 // restore workgroup id
s_mov_b32 s[sgprWorkGroup0+1], s14                 // restore workgroup id
s_mov_b32 s[sgprWorkGroup0+2], s15                 // restore workgroup id
s_and_b32 s[sgprStaggerU], s66, 0xffff0000         // Restore StaggerU related vars
s_lshr_b32 s[sgprStaggerU], s[sgprStaggerU], 0x10  // StaggerU = upper 16 bits of s66
s_mov_b32 s[sgprArgType], s65                      // keep arg type in its permanent SGPR
s_mov_b32 m0, 0x21000                              // LDS clamp at 135168 bytes
v_mov_b32 v[vgprSerial], v0                        // thread serial id

/* remap workgroup to XCCs */
/* Rewrites s[sgprWorkGroup0] in place. Inputs: s[sgprWGM] (WGMXCC from bit 16,   */
/* CU_Count from bit 22) and s67 (number of workgroups).                          */
/* Temporaries: s69..s73, v4, v5; exec is modified and restored to -1.            */
/* Integer divisions are done in float (rcp * dividend) and then corrected by     */
/* one step in either direction using exec masking.                               */
s_lshr_b32 s72, s[sgprWGM], 0x10                   // Get WGMXCC
s_ff1_i32_b32 s72, s72                             // Get log(WGMXCC)
s_lshr_b32 s73, s[sgprWGM], 0x16                   // Get CU_Count
/* remap WGs if WGMXCC > 1 ( log(WGMXCC) > 0 ) */
s_cmp_gt_i32 s72, 0
s_cbranch_scc0 label_skip_WGMXCC
/* only remap WGs in the range */
s_lshr_b32 s69, s67, s72
s_lshl_b32 s69, s69, s72                           // s69 = numWGs rounded down to a multiple of WGMXCC
s_cmp_ge_u32 s[sgprWorkGroup0], s69
s_cbranch_scc1 label_skip_WGMXCC                   // workgroups in the tail keep their id
s_cmp_eq_u32 s73, 0                                // CU_Count == 0 ?
s_cbranch_scc0 label_XCCG_nonzero
/* CU_Count == 0: WorkGroup0 = wg / WGMXCC + (wg % WGMXCC) * (numWGs / WGMXCC) */
s_lshr_b32 s69, s[sgprWorkGroup0], s72             // wg / WGMXCC
s_bfm_b32 s70, s72, 0                              // mask = WGMXCC - 1
s_and_b32 s70, s[sgprWorkGroup0], s70              // wg % WGMXCC
s_lshr_b32 s71, s67, s72                           // numWGs / WGMXCC
s_mul_i32 s70, s70, s71
s_add_u32 s[sgprWorkGroup0], s69, s70
s_branch label_skip_WGMXCC
label_XCCG_nonzero:
/* temp0 = (wg//CU_Count)*CU_Count */
v_cvt_f32_u32 v4, s73                              // v4 = float(CU_Count)
v_rcp_iflag_f32 v4, v4                             // v4 = 1 / CU_Count
v_cvt_f32_u32 v5, s[sgprWorkGroup0]                // v5 = float(wg)
v_mul_f32 v4, v4, v5                               // v4 = wg / CU_Count (float estimate)
v_cvt_u32_f32 v4, v4                               // v4 = quotient estimate
v_mul_u32_u24 v5, v4, s73                          // v5 = quotient * CU_Count
v_sub_u32 v5, s[sgprWorkGroup0], v5                // v5 = remainder estimate
v_cmpx_eq_u32 exec, v5, s73                        // lanes where remainder == CU_Count (quotient one too small)
v_add_u32 v4, 1, v4                                // quotient + 1
v_mov_b32 v5, 0                                    // remainder = 0
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v5, s73                        // overflow happened in remainder
v_sub_u32 v4, v4, 1                                // quotient - 1
v_mul_u32_u24 v5, v4, s73                          // re-calculate remainder
v_sub_u32 v5, s[sgprWorkGroup0], v5                // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s69, v4                        // quotient
v_readfirstlane_b32 s70, v5                        // remainder
s_mul_i32 s69, s69, s73
/* temp1 = (wg%CU_Count)//WGMXCC */
s_lshr_b32 s70, s70, s72
/* temp0 = temp0 + temp1 */
s_add_u32 s69, s69, s70
/* temp1 = (wg%WGMXCC) * ((WGs - (WGs//CU_Count) * CU_Count) if (wg > (WGs//CU_Count) * CU_Count) else CU_Count)//WGMXCC */
v_cvt_f32_u32 v4, s73                              // v4 = float(CU_Count)
v_rcp_iflag_f32 v4, v4                             // v4 = 1 / CU_Count
v_cvt_f32_u32 v5, s67                              // v5 = float(WGs)
v_mul_f32 v4, v4, v5                               // v4 = WGs / CU_Count (float estimate)
v_cvt_u32_f32 v4, v4                               // v4 = quotient estimate
v_mul_u32_u24 v5, v4, s73                          // v5 = quotient * CU_Count
v_sub_u32 v5, s67, v5                              // v5 = remainder estimate
v_cmpx_eq_u32 exec, v5, s73                        // lanes where remainder == CU_Count (quotient one too small)
v_add_u32 v4, 1, v4                                // quotient + 1
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v5, s73                        // overflow happened in remainder
v_sub_u32 v4, v4, 1                                // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s70, v4                        // quotient
s_mul_i32 s70, s70, s73                            // s70 = (WGs//CU_Count) * CU_Count
s_sub_u32 s71, s67, s70                            // s71 = WGs - s70
s_cmp_gt_u32 s[sgprWorkGroup0], s70
s_cselect_b32 s70, s71, s73                        // wg > s70 ? s71 : CU_Count
s_lshr_b32 s70, s70, s72                           // ... // WGMXCC
s_bfm_b32 s71, s72, 0                              // mask = WGMXCC - 1
s_and_b32 s71, s[sgprWorkGroup0], s71              // wg % WGMXCC
s_mul_i32 s70, s70, s71
/* WorkGroup0 = temp0 + temp1 */
s_add_u32 s[sgprWorkGroup0], s69, s70
label_skip_WGMXCC:  /// skip WGMXCC if no enough WGs to remap
/* Arg type 0 (single GEMM): compute NumWorkGroups0/1 = ceil(SizesFree0/1 / MT0/1). */
/* Any other arg type goes to label_MultiGemm. Temporaries: v4..v7, vcc.            */
s_cmp_eq_u32 s65, 0
s_cbranch_scc0 label_MultiGemm
/* init: add vgpr [4...136) to pool */
/* init: add vgpr [0...0) to pool */
/* init: add agpr [0...256) to pool */
v_mov_b32 v6, MT0                                  // v6 = MT0 (divisor)
v_mov_b32 v5, s[sgprSizesFree+0]                   // v5 = Free0 size (dividend)
v_cvt_f32_u32 v4, v6                               // v4 = float(divisor)
v_rcp_iflag_f32 v4, v4                             // v4 = 1 / divisor
v_cvt_f32_u32 v7, v5                               // v7 = float(dividend)
v_mul_f32 v4, v4, v7                               // v4 = dividend / divisor (float)
v_cvt_u32_f32 v4, v4                               // v4 = quotient
v_mul_u32_u24 v7, v4, v6                           // v7 = quotient * divisor
v_sub_u32 v7, v5, v7                               // v7 = remainder
v_cmp_ne_u32 vcc, v7, 0                            // vcc = (remainder != 0)
v_addc_co_u32 v4, vcc, v4, 0, vcc                  // ceil: quotient += 1 where remainder != 0
v_mov_b32 v6, MT1                                  // v6 = MT1 (divisor)
v_mov_b32 v5, s[sgprSizesFree+1]                   // v5 = Free1 size (dividend)
v_readfirstlane_b32 s[sgprNumWorkGroups0], v4      // set back to numWorkGroup0
v_cvt_f32_u32 v4, v6                               // v4 = float(divisor)
v_rcp_iflag_f32 v4, v4                             // v4 = 1 / divisor
v_cvt_f32_u32 v7, v5                               // v7 = float(dividend)
v_mul_f32 v4, v4, v7                               // v4 = dividend / divisor (float)
v_cvt_u32_f32 v4, v4                               // v4 = quotient
v_mul_u32_u24 v7, v4, v6                           // v7 = quotient * divisor
v_sub_u32 v7, v5, v7                               // v7 = remainder
v_cmp_ne_u32 vcc, v7, 0                            // vcc = (remainder != 0)
v_addc_co_u32 v4, vcc, v4, 0, vcc                  // ceil: quotient += 1 where remainder != 0
s_nop 0                                            // 1 wait states
v_readfirstlane_b32 s[sgprNumWorkGroups1], v4      // set back to numWorkGroup1
s_waitcnt lgkmcnt(0)                               // wait for 80/0 bytes of kern args
s_branch label_MultiGemmEnd
/* Grouped GEMM setup: choose the argument-record layout from the arg type.      */
/*   s11      = byte stride between consecutive per-gemm records (176 or 216)    */
/*   s72      = byte offset of the first record's sizes (4 * gemm count, or 0)   */
/*   s[66:67] = base pointer used for the size loads                             */
/* s10/s11 are sgprNumWorkGroups0/1, used as scratch until they are recomputed.  */
label_MultiGemm:

/* Check if custom structure pointer is null */
s_cmp_eq_u32 s[sgprArgType], 2                     // ArgType == 2 ?
s_cbranch_scc1 label_IsExternalValid               // branch if ArgType == 2
s_mov_b32 s11, 176
s_mul_i32 s72, s64, 4                              // records start after one dword per gemm
s_mov_b64 s[66:67], s[sgprKernArgAddress:sgprKernArgAddress+1]
s_branch label_IsExternalValidEnd
label_IsExternalValid:
s_mov_b32 s11, 216
s_mov_b32 s72, 0
s_mov_b64 s[66:67], s[sgprKernArgAddress:sgprKernArgAddress+1]
label_IsExternalValidEnd:

/* Grouped Gemm:: prefetch 1 arg load */
/* Scan the gemms in order, accumulating their tile counts, until the running    */
/* total exceeds WorkGroup0. On exit:                                            */
/*   s10 = 1-based index of the selected gemm, s70 = its tile count,             */
/*   s73 = running total including it, s[20:23] = its four size dwords.          */
/* Tile count = ceil(s20/256) * ceil(s21/256) * s22. The s_and_b32 result (s68)  */
/* is scratch: only its SCC side effect is consumed by the following s_addc_u32, */
/* which rounds the quotient up when the remainder is non-zero.                  */
s_mov_b32 s10, 1
s_mov_b32 s73, 0
s_load_dwordx4 s[20:23], s[66:67], s72
s_cmpk_eq_u32 s64, 1                               // if gemm_count is 1?
s_cbranch_scc1 label_wgTable_noLoadLoop

/* Grouped Gemm:: accumulate numTiles for each gemm */
/* Grouped Gemm:: loop start */
label_Loop_GemmCount:
s_waitcnt lgkmcnt(0)
s_lshr_b32 s70, s20, 8                             // s70 = s20 / 256
s_and_b32 s68, 255, s20                            // s68 = s20 % 256
s_addc_u32 s70, s70, 0                             // round up if remainder != 0
s_lshr_b32 s71, s21, 8                             // s71 = s21 / 256
s_and_b32 s68, 255, s21                            // s68 = s21 % 256
s_addc_u32 s71, s71, 0                             // round up if remainder != 0
s_mul_i32 s70, s70, s71
s_mul_i32 s70, s70, s22                            // tiles of this gemm
s_add_u32 s73, s73, s70                            // running total
s_cmp_lt_u32 s[sgprWorkGroup0], s73
s_cbranch_scc1 label_FOUND
s_add_u32 s72, s72, s11                            // advance to the next record
s_load_dwordx4 s[20:23], s[66:67], s72             // prefetch next sizes (waited at loop top / noLoadLoop)
s_add_u32 s10, s10, 1
s_cmp_lt_u32 s10, s64
s_cbranch_scc1 label_Loop_GemmCount

/* Grouped Gemm:: noLoadLoop */
/* Last gemm (or the only one): same tile computation, no further prefetch. */
label_wgTable_noLoadLoop:
s_waitcnt lgkmcnt(0)
s_lshr_b32 s70, s20, 8                             // s70 = s20 / 256
s_and_b32 s68, 255, s20                            // s68 = s20 % 256
s_addc_u32 s70, s70, 0                             // round up if remainder != 0
s_lshr_b32 s71, s21, 8                             // s71 = s21 / 256
s_and_b32 s68, 255, s21                            // s68 = s21 % 256
s_addc_u32 s71, s71, 0                             // round up if remainder != 0
s_mul_i32 s70, s70, s71
s_mul_i32 s70, s70, s22
s_add_u32 s73, s73, s70

/* Grouped Gemm:: gemmIndex found */
label_FOUND:
s_sub_u32 s67, s10, 1                              // s67 = 0-based gemm index
s_sub_u32 s66, s73, s70                            // s66 = tiles belonging to earlier gemms
s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s66 // WorkGroup0 becomes relative to the selected gemm
/* Check if custom structure pointer is null */
/* Load the selected gemm's arguments (s67 = 0-based gemm index, s64 = gemm count). */
/* s[20:23] already hold its sizes, so loading starts at record offset 16 (s24).    */
/* The loads are waited by the "s_waitcnt lgkmcnt(0)" after the NumWorkGroups code. */
s_cmp_eq_u32 s[sgprArgType], 2                     // ArgType == 2 ?
s_cbranch_scc1 label_LoadExternalStruct            // branch if ArgType == 2

/* Grouped Gemm: offset argument address to gemm */
/* Grouped Gemm: offset address from wg_table_start to args_start */
s_lshl2_add_u32 s[sgprKernArgAddress], s64, s[sgprKernArgAddress] // += 4 * gemm count
s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0
/* Grouped Gemm: offset address from args_start to gemm_start */
s_mul_i32 s67, s67, 176                            // same record stride as s11 in the scan above
s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s67
s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0

/* Load Kernel Args */
s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 16 // 16
s_load_dwordx8 s[40:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80
s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112
s_load_dword s50, s[sgprKernArgAddress:sgprKernArgAddress+1], 120 // 120
s_branch label_LoadExternalStructEnd
/* ArgType == 2: load the selected gemm's arguments from the external structure. */
/* s67 = 0-based gemm index on entry. s[20:23] already hold the sizes, so        */
/* loading starts at record offset 16 (s24). The final Beta load is waited by    */
/* the "s_waitcnt lgkmcnt(0)" after the NumWorkGroups code.                      */
label_LoadExternalStruct:
/* Grouped Gemm: offset address from args_start to gemm_start */
s_mul_i32 s67, s67, 216                            // same record stride as s11 in the ArgType == 2 scan
s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s67
s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0
s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 16 // 16
s_load_dwordx8 s[40:47], s[sgprKernArgAddress:sgprKernArgAddress+1], 80 // 80
s_load_dwordx2 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1], 112 // 112
// Read Beta
/* The dwordx8 load above also writes s45 (sgprBeta). Scalar memory loads may    */
/* return out of order, so drain them before issuing the Beta load; otherwise    */
/* the earlier load could land last and leave the wrong value in s45.            */
s_waitcnt lgkmcnt(0)                               // order the two writers of s45
s_load_dword s45, s[sgprKernArgAddress:sgprKernArgAddress+1], 132 // 132
label_LoadExternalStructEnd:
/* Grouped GEMM: compute NumWorkGroups0/1 = ceil(SizesFree0/1 / MT0/1) for the   */
/* selected gemm, wait for its argument loads, and end the wave early when       */
/* SizeJ == 0. Temporaries: v4..v7, vcc.                                         */
/* init: add vgpr [4...136) to pool */
/* init: add vgpr [0...0) to pool */
/* init: add agpr [0...256) to pool */
v_mov_b32 v6, MT0                                  // v6 = MT0 (divisor)
v_mov_b32 v5, s[sgprSizesFree+0]                   // v5 = Free0 size (dividend)
v_cvt_f32_u32 v4, v6                               // v4 = float(divisor)
v_rcp_iflag_f32 v4, v4                             // v4 = 1 / divisor
v_cvt_f32_u32 v7, v5                               // v7 = float(dividend)
v_mul_f32 v4, v4, v7                               // v4 = dividend / divisor (float)
v_cvt_u32_f32 v4, v4                               // v4 = quotient
v_mul_u32_u24 v7, v4, v6                           // v7 = quotient * divisor
v_sub_u32 v7, v5, v7                               // v7 = remainder
v_cmp_ne_u32 vcc, v7, 0                            // vcc = (remainder != 0)
v_addc_co_u32 v4, vcc, v4, 0, vcc                  // ceil: quotient += 1 where remainder != 0
v_mov_b32 v6, MT1                                  // v6 = MT1 (divisor)
v_mov_b32 v5, s[sgprSizesFree+1]                   // v5 = Free1 size (dividend)
v_readfirstlane_b32 s[sgprNumWorkGroups0], v4      // set back to numWorkGroup0
v_cvt_f32_u32 v4, v6                               // v4 = float(divisor)
v_rcp_iflag_f32 v4, v4                             // v4 = 1 / divisor
v_cvt_f32_u32 v7, v5                               // v7 = float(dividend)
v_mul_f32 v4, v4, v7                               // v4 = dividend / divisor (float)
v_cvt_u32_f32 v4, v4                               // v4 = quotient
v_mul_u32_u24 v7, v4, v6                           // v7 = quotient * divisor
v_sub_u32 v7, v5, v7                               // v7 = remainder
v_cmp_ne_u32 vcc, v7, 0                            // vcc = (remainder != 0)
v_addc_co_u32 v4, vcc, v4, 0, vcc                  // ceil: quotient += 1 where remainder != 0
s_nop 0                                            // 1 wait states
v_readfirstlane_b32 s[sgprNumWorkGroups1], v4      // set back to numWorkGroup1
s_waitcnt lgkmcnt(0)                               // wait for 80/0 bytes of kern args

/* Early stop if N(SizeFreeJ) == 0 */
s_cmp_eq_u32 s[sgprSizeJ], 0
s_cbranch_scc0 label_NoEarlyStop_N0                // SizeJ != 0: continue
label_EarlyStop_if_N_is_0:
s_endpgm
label_NoEarlyStop_N0:

/* Join point of the single-GEMM and grouped-GEMM paths. Defines the SGPR        */
/* indices used from here on, pre-pads the A/B base pointers, and zeroes the     */
/* summation size when Alpha == 0.                                               */
label_MultiGemmEnd:
.set sgprSrdA, 64
.set sgprSrdB, 68
.set sgprShadowLimitA, 72
.set sgprShadowLimitB, 74
.set sgprStaggerUIter, 76
.set sgprWrapUA, 77
.set sgprWrapUB, 79
.set sgprGlobalReadIncsA, 81
.set sgprGlobalReadIncsB, 82
.set sgprScalarGlobalReadOffsetA, 83
.set sgprScalarGlobalReadOffsetB, 90
/* 64-bit subtract of 16 from each base address; GLOBAL_OFFSET_A/B add 0x10 back. */
s_sub_u32 s[sgprAddressA+0], s[sgprAddressA+0], 16 // pre-pad to make room for possible pointer shift
s_subb_u32 s[sgprAddressA+1], s[sgprAddressA+1], 0 // pre-pad to make room for possible pointer shift
s_sub_u32 s[sgprAddressB+0], s[sgprAddressB+0], 16 // pre-pad to make room for possible pointer shift
s_subb_u32 s[sgprAddressB+1], s[sgprAddressB+1], 0 // pre-pad to make room for possible pointer shift

/* Short circuit condition if Alpha == 0, then sumDims=0 */
v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0                // s[Alpha] == 0.0f ?
s_cbranch_vccz label_AlphaNonZero                  // branch if s[Alpha] != 0
s_mov_b32 s[sgprSizesSum+0], 0                     // Set summation dim=0 if Alpha == 0
label_AlphaNonZero:
/* StreamK init: derive this workgroup's iteration range                         */
/* [StreamKIter, StreamKIterEnd).                                                */
/*   - If skTiles * ItersPerTile < TotalIters, the data-parallel defaults are    */
/*     kept: start = StreamKIdx * ItersPerTile, end = TotalIters.                */
/*   - Otherwise workgroups with StreamKIdx < skExtraIters take SKItersPerWG + 1 */
/*     iterations each, the others take SKItersPerWG after an offset of          */
/*     skExtraIters, and the end is capped at the total number of SK iterations. */
/* If the start is not below TotalIters, a long branch goes to label_KernelEnd.  */
/* Temporaries: s97..s100.                                                       */
s_mov_b32 s[sgprStreamKIdx], s[sgprWorkGroup0]     // Save original StreamK index
s_mul_i32 s[sgprStreamKIter], s[sgprStreamKIdx], s[sgprItersPerTile] // DP starting iteration (case: DP work to do)
s_mov_b32 s[sgprStreamKIterEnd], s[sgprTotalIters] // DP ending iteration (case: only DP work to do)
s_and_b32 s97, s[sgprskGridAndTiles], 0xffff       // Get skTiles
s_mul_i32 s97, s97, s[sgprItersPerTile]            // Total SK iters
s_cmp_lt_u32 s97, s[sgprTotalIters]                // Check if there are DP tiles to do
s_cbranch_scc1 label_SK_InitDone                   // Done init
s_mul_i32 s[sgprStreamKIter], s[sgprStreamKIdx], s[sgprSKItersPerWG] // StreamK starting iteration (case: after extra iters)
s_add_u32 s[sgprStreamKIter], s[sgprStreamKIter], s[sgprskExtraIters] // Add extra iters
s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIter], s[sgprSKItersPerWG] // StreamK ending iteration (case: after extra iters)
s_add_u32 s98, s[sgprSKItersPerWG], 1              // Spread out extra iterations
s_mul_i32 s97, s[sgprStreamKIdx], s98              // StreamK starting iteration (case: before extra iters)
s_add_u32 s98, s97, s98                            // StreamK ending iteration (case: before extra iters)
s_cmp_lt_u32 s[sgprStreamKIdx], s[sgprskExtraIters] // Check if lane gets an extra iteration
s_cselect_b32 s[sgprStreamKIter], s97, s[sgprStreamKIter] // Set start iter
s_cselect_b32 s[sgprStreamKIterEnd], s98, s[sgprStreamKIterEnd] // Set end iter
s_and_b32 s97, s[sgprskGridAndTiles], 0xffff       // Get skTiles
s_mul_i32 s97, s97, s[sgprItersPerTile]            // Total SK iters
s_min_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s97 // Cap ending iter at total SK iters
label_SK_InitDone:
s_cmp_lt_u32 s[sgprStreamKIter], s[sgprTotalIters] // Make sure there's work to do
s_cbranch_scc1 label_NoBranch_T8JHFHKM7BO5OHXW     // Only branch on scc0
/* Long branch via the PC: used instead of a direct s_cbranch to label_KernelEnd. */
s_getpc_b64 s[98:99]                               // addr of next instr
s_add_i32 s100, label_KernelEnd, 4                 // target branch offset
s_add_u32 s98, s98, s100                           // add target branch offset
s_addc_u32 s99, s99, 0                             // add high and carry
s_setpc_b64 s[98:99]                               // branch to label_KernelEnd
label_NoBranch_T8JHFHKM7BO5OHXW:

/******************************************/
/* Persistent Loop Start                  */
/******************************************/
/* Top of the persistent loop. NOTE(review): the branch back to this label is
 * outside this excerpt; presumably each pass sets up and processes one tile. */
label_PersistentLoopStart:

// Use sgprScalarGlobalReadOffsetA/B sgprs
// The kernel descriptor allocates only s0..s101 (.amdhsa_next_free_sgpr 102), so
// the three extra scalar temporaries used below are not real s102..s104: the
// symbols sgpr102..sgpr104 resolve to s84..s86 and are always written as
// s[sgpr10x]. Per the comment above, those registers are shared with
// sgprScalarGlobalReadOffsetA/B, so they are only usable as scratch until the
// "global read addresses: final offsets" code rewrites them later in setupNewTile.
// NOTE(review): the numeric value of sgprScalarGlobalReadOffsetA is defined
// outside this excerpt -- confirm that it really covers s84..s86.
.set sgpr102, 84
.set sgpr103, 85
.set sgpr104, 86
  
/******************************************/
/* Begin setupNewTile                     */
/******************************************/

/* global read addresses: work-group */
/* graWorkGroup mapping */

/* localReadResetOffsets */
/* handled internally */
/* vgprLocalReadSwapAddrA is built (in the "declare addresses" code of
 * setupNewTile) as addr XOR (addr + 67584), i.e. the XOR of the two LDS buffer
 * offsets. XOR-ing it with the current address therefore gives the offset in
 * the other buffer, and the smaller of the two is the first buffer.
 * v4 is scratch. */
v_xor_b32 v4, v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // Get other lds buffer offset value
v_min_i32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], v4 // Set LRA to first buffer offset

/* localReadResetOffsets */
/* handled internally */
/* Same reset for the B operand. */
v_xor_b32 v4, v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // Get other lds buffer offset value
v_min_i32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], v4 // Set LRA to first buffer offset
/* StreamK calculate tile idx and map to WG */
/* Step 1: tile index = StreamKIter / ItersPerTile, using a float-reciprocal
 * estimate followed by an integer fix-up:
 *   q = u32(f32(StreamKIter) * rcp(f32(ItersPerTile)))
 *   r = StreamKIter - q * ItersPerTile
 *   r == ItersPerTile            -> q += 1
 *   r >  ItersPerTile (unsigned) -> q -= 1   (r wrapped around: q was too big)
 * All inputs are scalar, so every lane holds the same value and
 * v_readfirstlane_b32 moves the result back to an SGPR.
 * v_mul_u32_u24 multiplies only the low 24 bits of each operand, so q and
 * ItersPerTile are assumed to fit in 24 bits.
 * Scratch: v4, v5, s98..s101, s[sgpr102..sgpr104]. exec is restored to -1. */
v_cvt_f32_u32 v4, s[sgprItersPerTile]              // v4 = f32(ItersPerTile)
v_rcp_iflag_f32 v4, v4                             // v4 = approx. 1 / ItersPerTile
v_cvt_f32_u32 v5, s[sgprStreamKIter]               // v5 = f32(StreamKIter)
v_mul_f32 v4, v4, v5                               // v4 = approx. StreamKIter / ItersPerTile
v_cvt_u32_f32 v4, v4                               // v4 = quotient estimate q
v_mul_u32_u24 v5, v4, s[sgprItersPerTile]          // v5 = q * ItersPerTile (24-bit multiply)
v_sub_u32 v5, s[sgprStreamKIter], v5               // v5 = remainder r = StreamKIter - q*ItersPerTile
v_cmpx_eq_u32 exec, v5, s[sgprItersPerTile]        // keep only lanes where r == ItersPerTile (q one too small)
v_add_u32 v4, 1, v4                                // q += 1 in those lanes
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v5, s[sgprItersPerTile]        // overflow happened in remainder
v_sub_u32 v4, v4, 1                                // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s98, v4                        // quotient
/* Step 2: position inside the tile. s98 = tile index, s99 = first iteration of
 * that tile, s100 = one past its last iteration. */
s_mul_i32 s99, s98, s[sgprItersPerTile]            // Tile start iteration
s_add_u32 s100, s99, s[sgprItersPerTile]           // Tile end iteration
s_sub_u32 s[sgprStreamKLocalStart], s[sgprStreamKIter], s99 // Local iteration start
s_min_u32 s[sgprStreamKLocalEnd], s[sgprStreamKIterEnd], s100 // 1. (Local) iteration end (SK tile)
s_sub_u32 s[sgprStreamKLocalEnd], s[sgprStreamKLocalEnd], s99 // 2. Local iteration end (SK tile)
/* Step 3: compute in s99 the iteration this workgroup resumes at; it is stored
 * to StreamKIter at label_SK_UpdateDone.
 *   s101 = TotalIters - skTiles*ItersPerTile  (iteration where the SK section begins)
 *   - next DP step still below s101: s99 = StreamKIter + skGrid*ItersPerTile
 *   - already inside the SK section: s99 = end of the current tile (s100)
 *   - crossing from DP into SK:      recompute this workgroup's SK [start, end)
 *                                    range and shift it by s101
 * sgprskGridAndTiles packs skTiles in bits 15:0 and skGrid in bits 31:16. */
s_and_b32 s101, s[sgprskGridAndTiles], 0xffff      // Get skTiles
s_mul_i32 s101, s101, s[sgprItersPerTile]          // Total SK iters
s_sub_u32 s101, s[sgprTotalIters], s101            // Offset to first SK tile
s_lshr_b32 s99, s[sgprskGridAndTiles], 0x10        // Get skGrid
s_mul_i32 s99, s99, s[sgprItersPerTile]            // DP iterations shift
s_add_u32 s99, s99, s[sgprStreamKIter]             // Add DP shift
s_cmp_lt_u32 s99, s101                             // Check if still in DP section
s_cbranch_scc1 label_SK_UpdateDone                 // Done update
s_mov_b32 s99, s100                                // SK iterations shift
s_cmp_le_u32 s101, s[sgprStreamKIter]              // Check if continuing in SK section
s_cbranch_scc1 label_SK_UpdateDone                 // Done update
/* DP -> SK transition. Workgroups with StreamKIdx < skExtraIters take
 * SKItersPerWG+1 iterations starting at StreamKIdx*(SKItersPerWG+1); the others
 * take SKItersPerWG iterations starting at StreamKIdx*SKItersPerWG + skExtraIters.
 * Both candidates are computed and one s_cmp selects between them; the two
 * s_cselect_b32 instructions rely on SCC from that compare. */
s_mul_i32 s[sgprStreamKIter], s[sgprStreamKIdx], s[sgprSKItersPerWG] // StreamK starting iteration (case: after extra iters)
s_add_u32 s[sgprStreamKIter], s[sgprStreamKIter], s[sgprskExtraIters] // Add extra iters
s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIter], s[sgprSKItersPerWG] // StreamK ending iteration (case: after extra iters)
s_add_u32 s[sgpr103], s[sgprSKItersPerWG], 1             // Spread out extra iterations
s_mul_i32 s[sgpr102], s[sgprStreamKIdx], s[sgpr103]            // StreamK starting iteration (case: before extra iters)
s_add_u32 s[sgpr103], s[sgpr102], s[sgpr103]                         // StreamK ending iteration (case: before extra iters)
s_cmp_lt_u32 s[sgprStreamKIdx], s[sgprskExtraIters] // Check if lane gets an extra iteration
s_cselect_b32 s[sgprStreamKIter], s[sgpr102], s[sgprStreamKIter] // Set start iter
s_cselect_b32 s[sgprStreamKIterEnd], s[sgpr103], s[sgprStreamKIterEnd] // Set end iter
s_add_u32 s99, s[sgprStreamKIter], s101            // Offset to start of SK section
s_add_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s101 // Offset to start of SK section
s_min_u32 s[sgprStreamKIterEnd], s[sgprStreamKIterEnd], s[sgprTotalIters] // Cap ending iter at TotalIters
// NOTE(review): at this point StreamKIter still holds the SK-relative start (the
// s101 offset was only added into s99), yet it is compared with TotalIters --
// confirm this is the intended bound for the "no work" exit.
s_cmp_lt_u32 s[sgprStreamKIter], s[sgprTotalIters] // Make sure there's work to do
s_cbranch_scc1 label_NoBranch_S4FDBQ587JJL6NOU     // Only branch on scc0
/* PC-relative long jump to label_KernelEnd (getpc / add offset / setpc), using
 * the aliased scratch registers s[sgpr102..sgpr104]. */
s_getpc_b64 s[sgpr102:sgpr103]                             // addr of next instr
s_add_i32 s[sgpr104], label_KernelEnd, 4                 // target branch offset
s_add_u32 s[sgpr102], s[sgpr102], s[sgpr104]                         // add target branch offset
s_addc_u32 s[sgpr103], s[sgpr103], 0                           // add high and carry
s_setpc_b64 s[sgpr102:sgpr103]                             // branch to label_KernelEnd
label_NoBranch_S4FDBQ587JJL6NOU:
label_SK_UpdateDone:
s_mov_b32 s[sgprStreamKIter], s99                  // Store current iteration
/* Map StreamK tile index to wg0/1/2 */
/* Splits the linear tile index in s98 into three workgroup coordinates with two
 * reciprocal-estimate divisions (same scheme as the tile-index division above):
 *   WorkGroup2 = s98  / (NumWorkGroups0*NumWorkGroups1)
 *   s100       = s98  % (NumWorkGroups0*NumWorkGroups1)
 *   WorkGroup1 = s100 / NumWorkGroups0
 *   WorkGroup0 = s100 % NumWorkGroups0
 * Because the remainder is also consumed here, each fix-up repairs v5 as well:
 * it is zeroed when the quotient is bumped up, and recomputed when the quotient
 * is decremented. v_mul_u32_u24 uses only the low 24 bits of its operands.
 * Scratch: v4, v5, s99, s100. exec is restored to -1 after each masked step. */
s_mul_i32 s99, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] // Total tiles
v_cvt_f32_u32 v4, s99                              // v4 = f32(nWG0*nWG1)
v_rcp_iflag_f32 v4, v4                             // v4 = approx. 1 / (nWG0*nWG1)
v_cvt_f32_u32 v5, s98                              // v5 = f32(tile index)
v_mul_f32 v4, v4, v5                               // v4 = approx. tile index / (nWG0*nWG1)
v_cvt_u32_f32 v4, v4                               // v4 = quotient estimate q
v_mul_u32_u24 v5, v4, s99                          // v5 = q * (nWG0*nWG1) (24-bit multiply)
v_sub_u32 v5, s98, v5                              // v5 = remainder r
v_cmpx_eq_u32 exec, v5, s99                        // keep only lanes where r == divisor (q one too small)
v_add_u32 v4, 1, v4                                // q += 1 in those lanes
v_mov_b32 v5, 0                                    // ... and the remainder becomes 0
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v5, s99                        // overflow happened in remainder
v_sub_u32 v4, v4, 1                                // quotient - 1
v_mul_u32_u24 v5, v4, s99                          // re-calculate remainder
v_sub_u32 v5, s98, v5                              // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s[sgprWorkGroup2], v4          // quotient
v_readfirstlane_b32 s100, v5                       // remainder
/* Second division: s100 by NumWorkGroups0. */
v_cvt_f32_u32 v4, s[sgprNumWorkGroups0]            // v4 = f32(nWG0)
v_rcp_iflag_f32 v4, v4                             // v4 = approx. 1 / nWG0
v_cvt_f32_u32 v5, s100                             // v5 = f32(s100)
v_mul_f32 v4, v4, v5                               // v4 = approx. s100 / nWG0
v_cvt_u32_f32 v4, v4                               // v4 = quotient estimate q
v_mul_u32_u24 v5, v4, s[sgprNumWorkGroups0]        // v5 = q * nWG0 (24-bit multiply)
v_sub_u32 v5, s100, v5                             // v5 = remainder r
v_cmpx_eq_u32 exec, v5, s[sgprNumWorkGroups0]      // keep only lanes where r == nWG0 (q one too small)
v_add_u32 v4, 1, v4                                // q += 1 in those lanes
v_mov_b32 v5, 0                                    // ... and the remainder becomes 0
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v5, s[sgprNumWorkGroups0]      // overflow happened in remainder
v_sub_u32 v4, v4, 1                                // quotient - 1
v_mul_u32_u24 v5, v4, s[sgprNumWorkGroups0]        // re-calculate remainder
v_sub_u32 v5, s100, v5                             // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s[sgprWorkGroup1], v4          // quotient
v_readfirstlane_b32 s[sgprWorkGroup0], v5          // remainder

/* alpha == 0 shortcut.
 *   alpha != 0                          -> fall through to label_SKAlphaCheck
 *   alpha == 0, StreamKLocalStart != 0  -> long jump to label_GW_End
 *   alpha == 0, StreamKLocalStart == 0  -> StreamKLocalEnd = ItersPerTile, then continue
 * NOTE(review): label_GW_End and the consumers of StreamKLocalEnd are outside
 * this excerpt; presumably the last case lets the workgroup that starts the tile
 * write the output without running the partial-tile iterations -- confirm.
 * Scratch for the long jump: s[sgpr102..sgpr104]. */
v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0                // s[Alpha] == 0.0f ?
s_cbranch_vccz label_SKAlphaCheck                  // branch if s[Alpha] != 0
s_cmp_eq_u32 s[sgprStreamKLocalStart], 0           // does wg start tile?
s_cbranch_scc1 label_NoBranch_UR8VN3A1SJCPC6PO     // Only branch on scc0
s_getpc_b64 s[sgpr102:sgpr103]                             // addr of next instr
s_add_i32 s[sgpr104], label_GW_End, 4                    // target branch offset
s_add_u32 s[sgpr102], s[sgpr102], s[sgpr104]                         // add target branch offset
s_addc_u32 s[sgpr103], s[sgpr103], 0                           // add high and carry
s_setpc_b64 s[sgpr102:sgpr103]                             // branch to label_GW_End
label_NoBranch_UR8VN3A1SJCPC6PO:
s_mov_b32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // Skip iterations
label_SKAlphaCheck:
/* Workgroup remapping controlled by WGM (signed value in the low 16 bits of sgprWGM).
 *   WGM >  1 : label_WGMPositive (blocks are formed along the WorkGroup1 axis)
 *   WGM 0, 1 : no remapping, jump straight to label_WGM
 *   WGM <  0 : code below, blocks of |WGM| formed along the WorkGroup0 axis:
 *       blockId       = wg0 / |WGM|                                  (s97)
 *       wgSerial      = wg1 + (wg0 % |WGM|) * NumWorkGroups1         (s100)
 *       numFullBlocks = NumWorkGroups0 / |WGM|                       (s98)
 *       lastWidth     = NumWorkGroups0 % |WGM|, or |WGM| if that is 0 (s99)
 *       width         = blockId >= numFullBlocks ? lastWidth : |WGM| (s98)
 *       WorkGroup1    = wgSerial / width
 *       WorkGroup0    = wgSerial % width + blockId * |WGM|
 * Every division uses the float-reciprocal estimate plus +1/-1 fix-up under a
 * v_cmpx exec mask; v_mul_u32_u24 uses only the low 24 bits of its operands.
 * Scratch: v4, v5, s97..s101. exec is restored to -1 after each masked step. */
s_sext_i32_i16 s[sgprWGM], s[sgprWGM]              // Restore WGM (sign-extend bits 15:0)
s_cmp_gt_i32 s[sgprWGM], 1                         // WGM > 1 ?
s_cbranch_scc1 label_WGMPositive                   // branch if WGM > 1
s_cmp_ge_i32 s[sgprWGM], 0                         // WGM >= 0 ?
s_cbranch_scc1 label_WGM                           // branch if WGM >= 0
s_abs_i32 s101, s[sgprWGM]                         // abs(WGM)
/* blockId = WorkGroup0 / |WGM| */
v_cvt_f32_u32 v4, s101                             // WGM
v_rcp_iflag_f32 v4, v4                             // WGM
v_cvt_f32_u32 v5, s[sgprWorkGroup0]                // WGM
v_mul_f32 v4, v4, v5                               // WGM
v_cvt_u32_f32 v4, v4                               // WGM
v_mul_u32_u24 v5, v4, s101                         // WGM
v_sub_u32 v5, s[sgprWorkGroup0], v5                // WGM
v_cmpx_eq_u32 exec, v5, s101                       // WGM
v_add_u32 v4, 1, v4                                // WGM
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v5, s101                       // overflow happened in remainder
v_sub_u32 v4, v4, 1                                // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s97, v4                        // quotient
s_mul_i32 s100, s97, s101                          // quotient * non-magic divisor
s_sub_u32 s100, s[sgprWorkGroup0], s100            // WorkGroup0=remainder
s_mul_i32 s100, s100, s[sgprNumWorkGroups1]        // (wg0 % WGM)*NumWorkGroups1
s_add_u32 s100, s100, s[sgprWorkGroup1]            // wgSerial = wg1 + (wg0 % WGM)*NumWorkGroups1
/* numFullBlocks = NumWorkGroups0 / |WGM| */
v_cvt_f32_u32 v4, s101                             // WGM
v_rcp_iflag_f32 v4, v4                             // WGM
v_cvt_f32_u32 v5, s[sgprNumWorkGroups0]            // WGM
v_mul_f32 v4, v4, v5                               // WGM
v_cvt_u32_f32 v4, v4                               // WGM
v_mul_u32_u24 v5, v4, s101                         // WGM
v_sub_u32 v5, s[sgprNumWorkGroups0], v5            // WGM
v_cmpx_eq_u32 exec, v5, s101                       // WGM
v_add_u32 v4, 1, v4                                // WGM
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v5, s101                       // overflow happened in remainder
v_sub_u32 v4, v4, 1                                // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s98, v4                        // quotient
s_mul_i32 s99, s101, s98                           // quotient * non-magic divisor
s_sub_u32 s99, s[sgprNumWorkGroups0], s99          // NumWorkGroups0=remainder
s_cmp_eq_u32 s99, 0                                // remainder == 0 ?
s_cmov_b32 s99, s101                               // remainder = WGM if remainder == 0
s_cmp_ge_u32 s97, s98                              // blockId >= numFullBlocks ?
s_cselect_b32 s98, s99, s101                       // s98 = width of this block: lastWidth if in the trailing block, else |WGM|
/* WorkGroup1 = wgSerial / width, WorkGroup0 = wgSerial % width */
v_cvt_f32_u32 v4, s98                              // s[sgprWorkGroup1] = s100 / s98
v_rcp_iflag_f32 v4, v4                             // s[sgprWorkGroup1] = s100 / s98
v_cvt_f32_u32 v5, s100                             // s[sgprWorkGroup1] = s100 / s98
v_mul_f32 v4, v4, v5                               // s[sgprWorkGroup1] = s100 / s98
v_cvt_u32_f32 v4, v4                               // s[sgprWorkGroup1] = s100 / s98
v_mul_u32_u24 v5, v4, s98                          // s[sgprWorkGroup1] = s100 / s98
v_sub_u32 v5, s100, v5                             // s[sgprWorkGroup1] = s100 / s98
v_cmpx_eq_u32 exec, v5, s98                        // s[sgprWorkGroup1] = s100 / s98
v_add_u32 v4, 1, v4                                // s[sgprWorkGroup1] = s100 / s98
v_mov_b32 v5, 0                                    // s[sgprWorkGroup0] = s100 % s98
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v5, s98                        // overflow happened in remainder
v_sub_u32 v4, v4, 1                                // quotient - 1
v_mul_u32_u24 v5, v4, s98                          // re-calculate remainder
v_sub_u32 v5, s100, v5                             // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s[sgprWorkGroup1], v4          // quotient
v_readfirstlane_b32 s[sgprWorkGroup0], v5          // remainder (overwritten by the scalar recomputation below)
s_mul_i32 s[sgprWorkGroup0], s[sgprWorkGroup1], s98 // quotient * non-magic divisor
s_sub_u32 s[sgprWorkGroup0], s100, s[sgprWorkGroup0] // WorkGroup0=remainder
s_mul_i32 s97, s97, s101                           // blockId * WGM
s_add_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s97 // wg0 += blockId * WGM
s_branch label_WGM
/* WGM > 1: remap (WorkGroup0, WorkGroup1) so that blocks of WGM rows along the
 * WorkGroup1 axis are walked together:
 *       blockId       = wg1 / WGM                                   (s97)
 *       wgSerial      = wg0 + (wg1 % WGM) * NumWorkGroups0          (s100)
 *       numFullBlocks = NumWorkGroups1 / WGM                        (s98)
 *       lastWidth     = NumWorkGroups1 % WGM, or WGM if that is 0   (s99)
 *       width         = blockId >= numFullBlocks ? lastWidth : WGM  (s98)
 *       WorkGroup0    = wgSerial / width
 *       WorkGroup1    = wgSerial % width + blockId * WGM
 * Every division uses the float-reciprocal estimate plus +1/-1 fix-up under a
 * v_cmpx exec mask; v_mul_u32_u24 uses only the low 24 bits of its operands.
 * Scratch: v4, v5, s97..s101. exec is restored to -1 after each masked step. */
label_WGMPositive:
s_mov_b32 s101, s[sgprWGM]                         // WGM
/* blockId = WorkGroup1 / WGM */
v_cvt_f32_u32 v4, s101                             // WGM
v_rcp_iflag_f32 v4, v4                             // WGM
v_cvt_f32_u32 v5, s[sgprWorkGroup1]                // WGM
v_mul_f32 v4, v4, v5                               // WGM
v_cvt_u32_f32 v4, v4                               // WGM
v_mul_u32_u24 v5, v4, s101                         // WGM
v_sub_u32 v5, s[sgprWorkGroup1], v5                // WGM
v_cmpx_eq_u32 exec, v5, s101                       // WGM
v_add_u32 v4, 1, v4                                // WGM
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v5, s101                       // overflow happened in remainder
v_sub_u32 v4, v4, 1                                // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s97, v4                        // quotient
s_mul_i32 s100, s97, s101                          // quotient * non-magic divisor
s_sub_u32 s100, s[sgprWorkGroup1], s100            // WorkGroup1=remainder
s_mul_i32 s100, s100, s[sgprNumWorkGroups0]        // (wg1 % WGM)*NumWorkGroups0
s_add_u32 s100, s100, s[sgprWorkGroup0]            // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0
/* numFullBlocks = NumWorkGroups1 / WGM */
v_cvt_f32_u32 v4, s101                             // WGM
v_rcp_iflag_f32 v4, v4                             // WGM
v_cvt_f32_u32 v5, s[sgprNumWorkGroups1]            // WGM
v_mul_f32 v4, v4, v5                               // WGM
v_cvt_u32_f32 v4, v4                               // WGM
v_mul_u32_u24 v5, v4, s101                         // WGM
v_sub_u32 v5, s[sgprNumWorkGroups1], v5            // WGM
v_cmpx_eq_u32 exec, v5, s101                       // WGM
v_add_u32 v4, 1, v4                                // WGM
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v5, s101                       // overflow happened in remainder
v_sub_u32 v4, v4, 1                                // quotient - 1
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s98, v4                        // quotient
s_mul_i32 s99, s101, s98                           // quotient * non-magic divisor
s_sub_u32 s99, s[sgprNumWorkGroups1], s99          // NumWorkGroups1=remainder
s_cmp_eq_u32 s99, 0                                // remainder == 0 ?
s_cmov_b32 s99, s101                               // remainder = WGM if remainder == 0
s_cmp_ge_u32 s97, s98                              // blockId >= numFullBlocks ?
s_cselect_b32 s98, s99, s101                       // s98 = width of this block: lastWidth if in the trailing block, else WGM
/* WorkGroup0 = wgSerial / width, WorkGroup1 = wgSerial % width */
v_cvt_f32_u32 v4, s98                              // s[sgprWorkGroup0] = s100 / s98
v_rcp_iflag_f32 v4, v4                             // s[sgprWorkGroup0] = s100 / s98
v_cvt_f32_u32 v5, s100                             // s[sgprWorkGroup0] = s100 / s98
v_mul_f32 v4, v4, v5                               // s[sgprWorkGroup0] = s100 / s98
v_cvt_u32_f32 v4, v4                               // s[sgprWorkGroup0] = s100 / s98
v_mul_u32_u24 v5, v4, s98                          // s[sgprWorkGroup0] = s100 / s98
v_sub_u32 v5, s100, v5                             // s[sgprWorkGroup0] = s100 / s98
v_cmpx_eq_u32 exec, v5, s98                        // s[sgprWorkGroup0] = s100 / s98
v_add_u32 v4, 1, v4                                // s[sgprWorkGroup0] = s100 / s98
v_mov_b32 v5, 0                                    // s[sgprWorkGroup1] = s100 % s98
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v5, s98                        // overflow happened in remainder
v_sub_u32 v4, v4, 1                                // quotient - 1
v_mul_u32_u24 v5, v4, s98                          // re-calculate remainder
v_sub_u32 v5, s100, v5                             // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s[sgprWorkGroup0], v4          // quotient
v_readfirstlane_b32 s[sgprWorkGroup1], v5          // remainder (overwritten by the scalar recomputation below)
s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s98 // quotient * non-magic divisor
s_sub_u32 s[sgprWorkGroup1], s100, s[sgprWorkGroup1] // WorkGroup1=remainder
s_mul_i32 s97, s97, s101                           // blockId * WGM
s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s97 // wg1 += blockId * WGM
/* Join point for all three WGM cases: WorkGroup0/WorkGroup1 are final here. */
label_WGM:

/******************************************/
/* Local Read Addresses                   */
/******************************************/

/* local read addresses: tile assignments a/b */
/* Per-thread LDS read offset before LSU offset and padding. With tid = vgprSerial:
 *   lr0I: v4 = (tid & 15) * 1024 + ((tid & 63) >> 4) * 16 + ((tid >> 6) & 1) * 16384
 *   lr1J: v5 = (tid & 15) * 1024 + ((tid & 63) >> 4) * 16 + ((tid >> 7) & 1) * 16384
 * The two differ only in which tid bit selects the 16384-byte wave offset
 * (bit 6 for A, bit 7 for B). v5..v8 are used as temporaries along the way. */
/* lr0I */
v_and_b32 v5, 63, v[vgprSerial]                    // 0. thread id in wave: wtid = tid % wavelength(64)
v_and_b32 v4, 15, v5                               // 1. N offset: nIdx = wtid % MI_N(16)
v_lshlrev_b32 v4, 7, v4                            // 1. N offset: nOffset = nIdx * nStride(128)
/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */
v_lshlrev_b32 v4, 3, v4                            // 4. apply VectorWidth: bnOffset = bnOffset * vw(8)
v_lshrrev_b32 v5, 4, v5                            // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1))
v_lshl_add_u32 v4, v5, 4, v4                       // 5. K offset: lrKOffset = kIdx * mStride(16); 6. offset in wave: lrOffset = bnOffset + lrKOffset
v_lshrrev_b32 v8, 6, v[vgprSerial]                 // 7. wave id: wid = tid / dividedForWaveId(64)
v_and_b32 v8, 1, v8                                // 7. wave index in this dimension: wtid0 = wid % 2
v_lshl_add_u32 v4, v8, 14, v4                      // 7. wave offset in M dimen: wOffset = wtid0 * W0Stride(16384); 7. final local read offset: flrOffset = lrOffset + WOffset
/* lr1J */
v_and_b32 v6, 63, v[vgprSerial]                    // 0. thread id in wave: wtid = tid % wavelength(64)
v_and_b32 v5, 15, v6                               // 1. N offset: nIdx = wtid % MI_N(16)
v_lshlrev_b32 v5, 7, v5                            // 1. N offset: nOffset = nIdx * nStride(128)
/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */
v_lshlrev_b32 v5, 3, v5                            // 4. apply VectorWidth: bnOffset = bnOffset * vw(8)
v_lshrrev_b32 v6, 4, v6                            // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1))
v_lshl_add_u32 v5, v6, 4, v5                       // 5. K offset: lrKOffset = kIdx * mStride(16); 6. offset in wave: lrOffset = bnOffset + lrKOffset
v_lshrrev_b32 v7, 7, v[vgprSerial]                 // 7. wid = tid / dividedForWaveId(128)
v_and_b32 v7, 1, v7                                // 7. wave index in this dimension: wtid0 = wid % 2
v_lshl_add_u32 v5, v7, 14, v5                      // 7. wave offset: wOffset = wtid0 * W0Stride(16384); 7. final local read offset: flrOffset = lrOffset + WOffset

/* local read addresses: final offsets a */
/* Adds the LSU term (tid / 256) * 128 to the tile offset in v4 (A) or v5 (B), then
 * applies LDS padding: addr += (addr >> 10) << 5, i.e. 32 extra bytes for every
 * 1024 bytes of unpadded offset. s97 = 128 is set once and reused for B. */
v_lshrrev_b32 v6, 6, v[vgprSerial]                 // 6 = Serial / 64
v_lshrrev_b32 v6, 2, v6                            // LSU offset: Get LSU wave_id
s_mov_b32 s97, 128                                 // LSU offset: stride = lsuStride(128) when umlds==True
v_mul_lo_u32 v6, s97, v6                           // LSU offset: lsuoffset = wave_id*lsuStride*(MT0+PAD)
v_add_u32 v[vgprLocalReadAddrA], v6, v4            // Final Offset: offset = (lro0+lsuoffset)*bpeDS(1)
v_lshrrev_b32 v7, 10, v[vgprLocalReadAddrA]        // Final Offset: padding 32 per block 1024
v_lshl_add_u32 v[vgprLocalReadAddrA], v7, 5, v[vgprLocalReadAddrA] // Final Offset: padding 32 per block 1024

/* local read addresses: final offsets b */
v_lshrrev_b32 v4, 6, v[vgprSerial]                 // 4 = Serial / 64
v_lshrrev_b32 v4, 2, v4                            // LSU offset: Get LSU wave_id
                                                   // LSU offset: stride = lsuStride(128) when umlds==True (dup assign opt.)
v_mul_lo_u32 v4, s97, v4                           // LSU offset: lsuoffset = wave_id*lsuStride*(MT1+PAD)
v_add_u32 v[vgprLocalReadAddrB], v4, v5            // Final Offset: offset = (lro1+lsuoffset)*bpeDS(1)
v_lshrrev_b32 v6, 10, v[vgprLocalReadAddrB]        // Final Offset: padding 32 per block 1024
v_lshl_add_u32 v[vgprLocalReadAddrB], v6, 5, v[vgprLocalReadAddrB] // Final Offset: padding 32 per block 1024

/* local read addresses: declare addresses a */
/* N/A */

/* local read addresses: declare addresses b */
/* B is placed 0x8400 (33792) bytes after A inside an LDS buffer. The second
 * buffer starts 67584 (= 2 * 33792) bytes after the first; two such buffers make
 * up the 135168-byte group segment declared in the kernel descriptor.
 * The Swap registers hold addr XOR (addr + 67584), so a single v_xor_b32 with the
 * current address switches between the two buffers. The carry written to vcc by
 * v_add_co_u32 is not consumed in this section. */
v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, 0x8400, v[vgprLocalReadAddrB+0] //  += LdsOffsetB (lower)
v_add_u32 v[vgprLocalReadSwapAddrA], 67584, v[vgprLocalReadAddrA] // Calculate starting lds addr of second buffer
v_xor_b32 v[vgprLocalReadSwapAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // xor both lds buffer offsets to enable swapping
v_add_u32 v[vgprLocalReadSwapAddrB], 67584, v[vgprLocalReadAddrB] // Calculate starting lds addr of second buffer
v_xor_b32 v[vgprLocalReadSwapAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // xor both lds buffer offsets to enable swapping

/******************************************/
/* Local Write Addresses                  */
/******************************************/
/* Splits the thread id into a tile coordinate (tid / 8) and an unroll coordinate
 * ((tid % 8) * 16) for A (v4, v5, copy in v8) and B (v6, v7, copy in v9), then
 * derives the LDS write base address for each operand:
 *   addr  = unroll + tile * 0x80;  addr += (addr >> 10) << 5   (32-byte pad per 1024)
 *   B additionally adds 0x8400 (LDS offset of B).
 * The base is copied to an SGPR with v_readfirstlane_b32, which takes the value
 * of the first active lane, and Swap = addr XOR (addr + 67584) is kept alongside
 * so that one XOR toggles between the two LDS buffers.
 * v4..v7 stay live: their register numbers are passed to the GLOBAL_OFFSET_A/B
 * macros further down (NOTE(review): macro bodies are outside this excerpt). */
/* LVCA = 8 */
/* v5 = A-unroll = serial%LVCA */
v_lshrrev_b32 v4, 3, v[vgprSerial]                 // 4 = Serial / 8
v_and_b32 v5, 7, v[vgprSerial]                     // 5 = Serial % 8
/* unroll *= glvw */
v_lshlrev_b32 v5, 4, v5                            // v5 = v5 * 16
v_mov_b32 v8, v5                                   // copy for GlobalSplitU
/* LVCB = 8 */
/* v7 = B-unroll = serial%LVCB */
v_lshrrev_b32 v6, 3, v[vgprSerial]                 // 6 = Serial / 8
v_and_b32 v7, 7, v[vgprSerial]                     // 7 = Serial % 8
/* unroll *= glvw */
v_lshlrev_b32 v7, 4, v7                            // v7 = v7 * 16
v_mov_b32 v9, v7                                   // copy for GlobalSplitU
/* lwaUnrollAssignmentA = v8 */
/* lwaUnrollAssignmentB = v9 */

/* local write addresses: first offset a */
v_mul_u32_u24 v10, 0x80, v4                        // lwAL**(DepthU_Compute + PAD)
v_add_u32 v10, v8, v10                             // lwFOA = (lwAA + lwAL*(DepthU+PAD))*bpeDS(1)
v_lshrrev_b32 v12, 10, v10                         // padding 32 per block 1024
v_lshl_add_u32 v10, v12, 5, v10                    // padding 32 per block 1024
s_nop 0                                            // 1 wait states required before reading vgpr by lane
v_readfirstlane_b32 s[sgprLocalWriteAddrA], v10    // Copy lds write address VGPR to SGPR
s_nop 0                                            // 1 wait states
s_add_u32 s[sgprSwapA], s[sgprLocalWriteAddrA], 67584 // Calculate starting lds addr of second buffer
s_xor_b32 s[sgprSwapA], s[sgprSwapA], s[sgprLocalWriteAddrA] // xor both lds buffer offsets to enable swapping

/* local write addresses: first offset b */
v_mul_u32_u24 v10, 0x80, v6                        // lwBL**(DepthU_Compute + PAD)
v_add_u32 v10, v9, v10                             // lwFOB = (lwBB + lwBL*(DepthU+PAD))*bpeDS(1)
v_lshrrev_b32 v12, 10, v10                         // padding 32 per block 1024
v_lshl_add_u32 v10, v12, 5, v10                    // padding 32 per block 1024
v_add_co_u32 v10, vcc, 0x8400, v10                 // lwFOB = lwB1J + lwBL*MT1J + LDS_OFFSET_B=33792
s_nop 0                                            // 1 wait states required before reading vgpr by lane
v_readfirstlane_b32 s[sgprLocalWriteAddrB], v10    // Copy lds write address VGPR to SGPR
s_nop 0                                            // 1 wait states
s_add_u32 s[sgprSwapB], s[sgprLocalWriteAddrB], 67584 // Calculate starting lds addr of second buffer
s_xor_b32 s[sgprSwapB], s[sgprSwapB], s[sgprLocalWriteAddrB] // xor both lds buffer offsets to enable swapping

/* global read addresses: tile offset assignment a */
/* graTileAssignmentA = v4 */

/* global read addresses: tile offset assignment b */
/* graTileAssignmentB = v6 */

/* global read addresses: unroll assignment a */
/* v5 */

/* global read addresses: unroll assignment b */
/* v7 */

/* global read addresses: other free assignments */
/* s[sgprWorkGroup2] */

/* global read addresses: tile offsets a */

/* global read addresses: tile offsets b */

/* global read addresses: unroll offsets a */

/* global read addresses: unroll offsets b */

/* global read addresses: final offsets a */
/* GLOBAL_OFFSET_A receives the destination VGPR symbol plus three register
 * numbers (5, 4, 10). NOTE(review): the macro is defined outside this excerpt;
 * presumably these are the unroll VGPR (v5), the tile VGPR (v4) and a scratch
 * VGPR (v10) -- confirm against the macro definition.
 * The scalar offsets that follow are StrideA0I * 32 * (k + 1) for k = 0..6,
 * i.e. the distance from the first load to each of the seven further loads,
 * which are 32 tile rows apart. Element size is 1 byte, so no scaling is applied.
 * These stores also overwrite any scratch values left in the SGPRs that
 * sgpr102..sgpr104 alias, if those indeed overlap this array. */
GLOBAL_OFFSET_A vgprGlobalReadOffsetA+0,  5,  4, 10 // gROA_0_0_0_0
s_mul_i32 s[sgprScalarGlobalReadOffsetA+0], s[sgprStrideA0I], 32 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)
s_mul_i32 s[sgprScalarGlobalReadOffsetA+1], s[sgprStrideA0I], 64 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)
s_mul_i32 s[sgprScalarGlobalReadOffsetA+2], s[sgprStrideA0I], 96 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)
s_mul_i32 s[sgprScalarGlobalReadOffsetA+3], s[sgprStrideA0I], 128 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)
s_mul_i32 s[sgprScalarGlobalReadOffsetA+4], s[sgprStrideA0I], 160 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)
s_mul_i32 s[sgprScalarGlobalReadOffsetA+5], s[sgprStrideA0I], 192 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)
s_mul_i32 s[sgprScalarGlobalReadOffsetA+6], s[sgprStrideA0I], 224 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)

/* global read addresses: final offsets b */
/* Same layout for B: macro arguments 7, 6, 10 and offsets StrideB1J * 32 * (k + 1). */
GLOBAL_OFFSET_B vgprGlobalReadOffsetB+0,  7,  6, 10 // gROB_0_0_0_0
s_mul_i32 s[sgprScalarGlobalReadOffsetB+0], s[sgprStrideB1J], 32 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)
s_mul_i32 s[sgprScalarGlobalReadOffsetB+1], s[sgprStrideB1J], 64 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)
s_mul_i32 s[sgprScalarGlobalReadOffsetB+2], s[sgprStrideB1J], 96 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)
s_mul_i32 s[sgprScalarGlobalReadOffsetB+3], s[sgprStrideB1J], 128 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)
s_mul_i32 s[sgprScalarGlobalReadOffsetB+4], s[sgprStrideB1J], 160 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)
s_mul_i32 s[sgprScalarGlobalReadOffsetB+5], s[sgprStrideB1J], 192 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)
s_mul_i32 s[sgprScalarGlobalReadOffsetB+6], s[sgprStrideB1J], 224 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)

/* global read addresses: addresses a */
/* max read offset = size[n] * stride[n-1] */
/* Builds the buffer resource descriptor (SrdA[0..3]) for operand A.
 *   tileStart (s101:s100) = lo32(WorkGroup0 * 256) * StrideA0I
 *                         + (StreamKLocalStart * DepthU) * constStrideAL
 *   ShadowLimitA (64-bit) = 1 + constStrideAL*(SizeL-1) + StrideA0I*(SizeI-1)
 *                         - tileStart + 16
 *   SrdA+2 = low half of ShadowLimitA if its high half is 0, else BufferLimit
 *   SrdA[1:0] = AddressA + tileStart + StrideAK * WorkGroup2
 *   SrdA+3 = Srd127_96
 * The WorkGroup2 (StrideAK) term is added to the base only after the limit has
 * been derived, so the limit is measured without it.
 * Element size is 1 byte, so element counts are used directly as byte counts.
 * Scratch: s98..s101. */
// NOTE(review): the s101 result of the next instruction is overwritten two
// instructions later before being read, so only the low 32 bits of
// WorkGroup0 * 256 reach the stride multiply.
s_mul_hi_u32 s101, s[sgprWorkGroup0], 256          // WorkGroup[01] * MT
s_mul_i32 s100, s[sgprWorkGroup0], 256             // WorkGroup[01] * MT
s_mul_hi_u32 s101, s100, s[sgprStrideA0I]          // tlu=0, scaled tile-offset by stride
s_mul_i32 s100, s100, s[sgprStrideA0I]             // tlu=0, scaled tile-offset by stride
s_mul_i32 s98, s[sgprStreamKLocalStart], DepthU    // StreamK tile start offset
s_mul_hi_u32 s99, s98, constStrideAL               // StreamK tile start offset
s_mul_i32 s98, s98, constStrideAL                  // StreamK tile start offset
s_add_u32 s100, s100, s98                          // accum GsuOffset term to tilestart
s_addc_u32 s101, s101, s99                         // accum GsuOffset term to tilestart
s_mov_b64 s[sgprShadowLimitA+0:sgprShadowLimitA+0+1], 1 // Init tensor size
s_sub_u32 s98, s[sgprSizeL], 1                     // (size-1)
s_mul_hi_u32 s99, constStrideAL, s98               // stride x (size-1)
s_mul_i32 s98, constStrideAL, s98                  // stride x (size-1)
s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // sum tensor size
s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // sum tensor size
s_sub_u32 s98, s[sgprSizeI], 1                     // (size-1)
s_mul_hi_u32 s99, s[sgprStrideA0I], s98            // stride x (size-1)
s_mul_i32 s98, s[sgprStrideA0I], s98               // stride x (size-1)
s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // sum tensor size
s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // sum tensor size
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s100 // sub tileStart
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s101 // sub tileStart
                                                   // Set limit to use bytes (byte is 1, do nothing)
s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 16 // extend limit for pre-pad
s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 (uses SCC from the compare above)
s_mul_hi_u32 s99, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG
s_mul_i32 s98, s[sgprStrideAK], s[sgprWorkGroup2]  // Stride*WG
s_add_u32 s100, s100, s98                          // accum wg term to tilestart
s_addc_u32 s101, s101, s99                         // accum wg term to tilestart
                                                   // tileStart *= BPE (multiplier is 1, do nothing)
s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s100   // SRD base = Address+ tileStart0
s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s101  // SRD base = Address+ tileStart1
s_mov_b32 s[sgprSrdA+3], Srd127_96                 // Set bits 127_96 in SRD

/* global read addresses: addresses b */
/* max read offset = size[n] * stride[n-1] */
/* Builds the buffer resource descriptor (SRD) words for B.                    */
/*   s[100:101]               : 64-bit tile-start offset being accumulated     */
/*   s[98:99]                 : scratch for 64-bit products                    */
/*   s[sgprShadowLimitB:+1]   : 64-bit remaining size measured from tile start */
s_mul_hi_u32 s101, s[sgprWorkGroup1], 256          // WorkGroup[01] * MT (hi) -- overwritten by the s_mul_hi_u32 two lines below
s_mul_i32 s100, s[sgprWorkGroup1], 256             // WorkGroup[01] * MT (lo)
s_mul_hi_u32 s101, s100, s[sgprStrideB1J]          // tlu=0, scaled tile-offset by stride (hi)
s_mul_i32 s100, s100, s[sgprStrideB1J]             // tlu=0, scaled tile-offset by stride (lo)
s_mul_i32 s98, s[sgprStreamKLocalStart], DepthU    // StreamK tile start offset: localStart * DepthU
s_mul_hi_u32 s99, s98, constStrideBL               // StreamK tile start offset: * constStrideBL (hi)
s_mul_i32 s98, s98, constStrideBL                  // StreamK tile start offset: * constStrideBL (lo)
s_add_u32 s100, s100, s98                          // accum StreamK start offset into tilestart (lo)
s_addc_u32 s101, s101, s99                         // accum StreamK start offset into tilestart (hi + carry)
s_mov_b64 s[sgprShadowLimitB+0:sgprShadowLimitB+0+1], 1 // Init tensor size
s_sub_u32 s98, s[sgprSizeL], 1                     // (size-1)
s_mul_hi_u32 s99, constStrideBL, s98               // stride x (size-1)
s_mul_i32 s98, constStrideBL, s98                  // stride x (size-1)
s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // sum tensor size
s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // sum tensor size
s_sub_u32 s98, s[sgprSizeJ], 1                     // (size-1)
s_mul_hi_u32 s99, s[sgprStrideB1J], s98            // stride x (size-1)
s_mul_i32 s98, s[sgprStrideB1J], s98               // stride x (size-1)
s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // sum tensor size
s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // sum tensor size
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s100 // sub tileStart
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s101 // sub tileStart
                                                   // Set limit to use bytes (byte is 1, do nothing)
s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], 16 // extend limit for pre-pad
s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], 0 // extend limit for pre-pad
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32? (high dword of the shadow limit is zero)
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // SRD word 2 = low dword of shadow limit if within 2^32, else BufferLimit
s_mul_hi_u32 s99, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG
s_mul_i32 s98, s[sgprStrideBK], s[sgprWorkGroup2]  // Stride*WG
s_add_u32 s100, s100, s98                          // accum wg term to tilestart (added after the limit was computed above)
s_addc_u32 s101, s101, s99                         // accum wg term to tilestart
                                                   // tileStart *= BPE (multiplier is 1, do nothing)
s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s100   // SRD base = Address+ tileStart0
s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s101  // SRD base = Address+ tileStart1
s_mov_b32 s[sgprSrdB+3], Srd127_96                 // Set bits 127_96 in SRD

/* global read addresses: increments a */
s_mov_b32 s[sgprGlobalReadIncsA+0], DepthU*BpeAGR  // incrA (unrollIdx)

/* global read addresses: increments b */
s_mov_b32 s[sgprGlobalReadIncsB+0], DepthU*BpeBGR  // incrB (unrollIdx)

/* declare loop num iterations */
/* LoopCounterL = StreamKLocalEnd - StreamKLocalStart, minus one when this WG  */
/* owns the final iteration of a tile whose summation size is not a multiple   */
/* of 128 (s98 holds that 0/1 adjustment).                                     */
/* The Alpha == 0 override is applied AFTER the tail adjustment: zeroing the   */
/* counter first and then subtracting 1 would wrap the unsigned counter to     */
/* 0xFFFFFFFF for a WG that owns the tail iteration.                           */
/* Scratch: s98, s99, vcc.                                                     */
s_sub_u32 s[sgprLoopCounterL], s[sgprStreamKLocalEnd], s[sgprStreamKLocalStart] // StreamK loop counter = localEnd - localStart
s_and_b32 s99, 127, s[sgprSizesSum+0]              // s99 = s[sgprSizesSum+0] % 128
s_cmp_eq_u32 s99, 0                                // remainder == 0 ?
s_cselect_b32 s98, 0, 1                            // s98 = 1 if size uses tail loop, else 0
s_cmp_eq_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // Check if WG processes final iteration of tile
s_cselect_b32 s98, s98, 0                          // keep s98 only if this WG runs the tail loop
s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], s98 // Adjust loop counter for tail loop
v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0                // s[Alpha] == 0.0f ?
s_cbranch_vccz label_SKAlphaCheck2                 // branch if s[Alpha] != 0
s_mov_b32 s[sgprLoopCounterL], 0                   // Alpha == 0: skip all unrolled iterations
label_SKAlphaCheck2:
s_mov_b32 s[sgprOrigLoopCounter], s[sgprLoopCounterL] // copy loop counter
/* StaggerU setup.                                                             */
/* s[sgprStaggerU] is unpacked into three fields:                              */
/*   bits  7:0  -> stagger value (kept in s[sgprStaggerU], copied to s98)      */
/*   bits 12:8  -> s100, used below as the shift amount ("StaggerUStride")     */
/*   bits 15:13 -> s101, selector compared against 0x0/0x2000/.../0x8000       */
/* Result: s[sgprStaggerUIter]. Scratch: s98, s99, s100, s101.                 */
s_and_b32 s100, s[sgprStaggerU], 0x1f00            // isolate bits 12:8
s_lshr_b32 s100, s100, 0x8                         // s100 = shift amount
s_and_b32 s101, s[sgprStaggerU], 0xe000            // s101 = bits 15:13 (mapping selector, left in place)
s_and_b32 s[sgprStaggerU], s[sgprStaggerU], 0xff   // keep only bits 7:0
s_mov_b32 s98, s[sgprStaggerU]                     // init staggerU
/* Halve s98 until OrigLoopCounter >= (s98 << s100). s98 == 0 always exits.    */
label_beginStaggerUIter:
s_lshl_b32 s99, s98, s100                          // shift by StaggerUStride
s_cmp_ge_u32 s[sgprOrigLoopCounter], s99           // loopCount >= current shift Count
s_cbranch_scc1 label_endStaggerUIter               // jump to end
s_lshr_b32 s98, s98, 1                             // step down to smaller stagger
s_branch label_beginStaggerUIter                   // jump to begin
label_endStaggerUIter:
s_sub_u32 s99, s98, 1                              // staggerU mask
s_cmp_ge_u32 s98, 1                                // if current staggerU >= 1
s_cselect_b32 s[sgprStaggerUIter], s99, 0          // set Mask (0 when s98 == 0)
/* Select the value (s98) that is ANDed with the mask below.                   */
/* As written, each compare branches to the NEXT case on a match (scc1) and    */
/* falls into its own body on a mismatch. Consequently only selector == 0      */
/* reaches label_StaggerUMapping_1 (where the 0x2000 compare fails and         */
/* WorkGroup1 is chosen); every non-zero selector takes WorkGroup0, and the    */
/* bodies after label_StaggerUMapping_2/3/4 are not reachable.                 */
/* NOTE(review): confirm against the generator whether s_cbranch_scc0 was      */
/* intended for these branches.                                                */
s_cmp_eq_u32 s101, 0x0                             // selector == 0 ?
s_cbranch_scc1 label_StaggerUMapping_1             // taken when selector == 0
s_mov_b32 s98, s[sgprWorkGroup0]                   // s98 = WorkGroup0
s_branch label_staggerInputEnd
label_StaggerUMapping_1:
s_cmp_eq_u32 s101, 0x2000                          // selector == 0x2000 ?
s_cbranch_scc1 label_StaggerUMapping_2             // taken when selector == 0x2000
s_mov_b32 s98, s[sgprWorkGroup1]                   // s98 = WorkGroup1
s_branch label_staggerInputEnd
label_StaggerUMapping_2:
s_cmp_eq_u32 s101, 0x4000                          // selector == 0x4000 ?
s_cbranch_scc1 label_StaggerUMapping_3             // taken when selector == 0x4000
s_mov_b32 s98, -0x1                                // s98 = all ones (AND below keeps the mask unchanged)
s_branch label_staggerInputEnd
label_StaggerUMapping_3:
s_cmp_eq_u32 s101, 0x6000                          // selector == 0x6000 ?
s_cbranch_scc1 label_StaggerUMapping_4             // taken when selector == 0x6000
s_mul_i32 s99, s[sgprNumWorkGroups0], s[sgprWorkGroup1] // s99 = NumWorkGroups0 * WorkGroup1
s_add_u32 s98, s98, s99                            // s98 += s99; s98 still holds the value left by the search loop above -- TODO confirm intended
s_add_u32 s98, s98, s[sgprWorkGroup0]              // s98 += WorkGroup0
s_branch label_staggerInputEnd
label_StaggerUMapping_4:
s_cmp_eq_u32 s101, 0x8000                          // selector == 0x8000 ?
s_cbranch_scc1 label_staggerInputEnd               // taken when selector == 0x8000 (s98 left as is)
s_mov_b32 s98, -0x1                                // s98 = all ones
s_branch label_staggerInputEnd
label_staggerInputEnd:
s_and_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s98 // Compute actual stagger start for this tile
s_lshl_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s100 // shift by StaggerUStride
s_cmp_gt_u32 s[sgprStreamKLocalStart], 0           // does wg start tile? (SCC=1 when localStart > 0)
s_cmov_b32 s[sgprStaggerUIter], 0                  // set stagger=0 for partial tiles
s_cmp_lt_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // does wg finish tile? (SCC=1 when localEnd < ItersPerTile)
s_cmov_b32 s[sgprStaggerUIter], 0                  // set stagger=0 for partial tiles

/* SRDs += (StaggerUIter) * GlobalReadIncsA+0 */
/* s[98:99] = StaggerUIter * GlobalReadIncsA (signed 64-bit product): added to  */
/* the SRD base and subtracted from the shadow limit.                           */
/* s[sgprWrapUA:+1] = GlobalReadIncsA - LoopCounterL * GlobalReadIncsA, the     */
/* 64-bit increment selected by the "global read inc" code when the wrap        */
/* iteration is reached.                                                        */
s_mul_hi_i32 s99, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] //  stagger byte offset
s_mul_i32 s98, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] //  stagger byte offset
s_mul_hi_i32 s[sgprWrapUA+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop
s_mul_i32 s[sgprWrapUA+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop
s_sub_u32 s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0], s[sgprWrapUA+0] // remove one iteration
s_subb_u32 s[sgprWrapUA+1], 0, s[sgprWrapUA+1]     // remove one iteration
s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc)
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc)
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32

/* SRDs += (StaggerUIter) * GlobalReadIncsB+0 */
/* Same sequence as for A above, using the B registers.                         */
s_mul_hi_i32 s99, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] //  stagger byte offset
s_mul_i32 s98, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] //  stagger byte offset
s_mul_hi_i32 s[sgprWrapUB+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop
s_mul_i32 s[sgprWrapUB+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop
s_sub_u32 s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0], s[sgprWrapUB+0] // remove one iteration
s_subb_u32 s[sgprWrapUB+1], 0, s[sgprWrapUB+1]     // remove one iteration
s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc)
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc)
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
s_add_u32 s[sgprStaggerUIter], s[sgprStaggerUIter], 2 // StaggerUIter += 2 (generator text said "Subtract (PGR-1)", but this is an add); StaggerUIter now contains target iteration to wrap
/* local read addresses: init pointers a */

/* localReadInitPointers */
/* local read addresses: init pointers b */

/* localReadInitPointers */

/* prefetch: global -> local */
/* First prefetch. Skipped entirely when LoopCounterL == 0. Each buffer load    */
/* carries the "lds" modifier, with m0 holding the LDS write address; m0 is     */
/* advanced by 4224 between loads. All eight loads per matrix use the same      */
/* per-lane offset VGPR and differ only in the scalar offset operand.           */
s_cmp_eq_u32 s[sgprLoopCounterL], 0                // at last iteration?
s_cbranch_scc1 label_ShadowInitStart               // skip to ShadowInitStart iter b/c numIter==0

s_mov_b32 m0, s[sgprLocalWriteAddrA]               // m0 <- LDS write address
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> LDS 0_0_0_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0, lds // G -> LDS 0_0_1_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0, lds // G -> LDS 0_0_2_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0, lds // G -> LDS 0_0_3_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0, lds // G -> LDS 0_0_4_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0, lds // G -> LDS 0_0_5_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0, lds // G -> LDS 0_0_6_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0, lds // G -> LDS 0_0_7_0

s_mov_b32 m0, s[sgprLocalWriteAddrB]               // m0 <- LDS write address
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> LDS 0_0_0_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0, lds // G -> LDS 0_0_1_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0, lds // G -> LDS 0_0_2_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0, lds // G -> LDS 0_0_3_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0, lds // G -> LDS 0_0_4_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0, lds // G -> LDS 0_0_5_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0, lds // G -> LDS 0_0_6_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> LDS 0_0_7_0

/* global read inc A loopL */
/* Advance SRD A for the next read: the increment is WrapUA when               */
/* StaggerUIter == LoopCounterL + 1, otherwise GlobalReadIncsA. The same        */
/* amount is subtracted from the shadow limit and SRD word 2 is refreshed.      */
s_add_u32 s100, s[sgprLoopCounterL], 1             // remove pf(1)
s_cmp_eq_u32 s[sgprStaggerUIter], s100             // Is this wrapIter? (pf)
s_cselect_b32 s98, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ?
s_cselect_b32 s99, s[sgprWrapUA+1], 0              // incUpper <- ?
s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc)
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc)
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32

/* global read inc B loopL */
/* Same sequence as for A above, using the B registers.                         */
s_add_u32 s100, s[sgprLoopCounterL], 1             // remove pf(1)
s_cmp_eq_u32 s[sgprStaggerUIter], s100             // Is this wrapIter? (pf)
s_cselect_b32 s98, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ?
s_cselect_b32 s99, s[sgprWrapUB+1], 0              // incUpper <- ?
s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc)
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc)
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32

/******************************************/
/* End setupNewTile                       */
/******************************************/
/* Initialise the post-loop SRDs for C and D:                                   */
/*   base   = Address + 2 * (MT1 * WorkGroup1 * Stride1J + WorkGroup2 * StrideK) */
/*   word 2 = BufferOOB, word 3 = Srd127_96                                     */
/* Scratch: s[98:99] (64-bit products), s100 (MT1 * WorkGroup1).                */
label_ShadowInitStart:
s_mov_b64 s[sgprSrdD+0:sgprSrdD+0+1], s[sgprAddressD+0:sgprAddressD+0+1] // init SRD base address (rewritten below with the tile offset added)
s_mov_b32 s[sgprSrdD+2], BufferOOB
s_mov_b32 s[sgprSrdD+3], Srd127_96                 // Set bits 127_96 in post-loop SRD

s_mov_b64 s[sgprSrdC+0:sgprSrdC+0+1], s[sgprAddressC+0:sgprAddressC+0+1] // init SRD base address (rewritten below with the tile offset added)
s_mov_b32 s[sgprSrdC+2], BufferOOB
s_mov_b32 s[sgprSrdC+3], Srd127_96                 // Set bits 127_96 in post-loop SRD


s_mul_i32 s100, MT1, s[sgprWorkGroup1]             // <- wg1*MT1
s_mul_hi_u32 s99, s100, s[sgprStrideC1J]           // ScaleC s100 by Stride
s_mul_i32 s98, s100, s[sgprStrideC1J]              // ScaleC s100 by Stride
s_lshl_b64 s[98:99], s[98:99], 1                   // scale by bpe (<<1, i.e. x2)
s_add_u32 s[sgprSrdC+0], s[sgprAddressC+0], s98    // add lo to SRD
s_addc_u32 s[sgprSrdC+1], s[sgprAddressC+1], s99   // add hi to SRD
s_mul_hi_u32 s99, s100, s[sgprStrideD1J]           // ScaleD s100 by Stride
s_mul_i32 s98, s100, s[sgprStrideD1J]              // ScaleD s100 by Stride
s_lshl_b64 s[98:99], s[98:99], 1                   // scale by bpe (<<1, i.e. x2)
s_add_u32 s[sgprSrdD+0], s[sgprAddressD+0], s98    // add lo to SRD
s_addc_u32 s[sgprSrdD+1], s[sgprAddressD+1], s99   // add hi to SRD

s_mul_hi_u32 s99, s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride
s_mul_i32 s98, s[sgprWorkGroup2], s[sgprStrideCK]  // ScaleC s[sgprWorkGroup2] by Stride
s_lshl_b64 s[98:99], s[98:99], 1                   // scale by bpe (<<1, i.e. x2)
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s98        // add lo to SRD
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s99       // add hi to SRD
s_mul_hi_u32 s99, s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride
s_mul_i32 s98, s[sgprWorkGroup2], s[sgprStrideDK]  // ScaleD s[sgprWorkGroup2] by Stride
s_lshl_b64 s[98:99], s[98:99], 1                   // scale by bpe (<<1, i.e. x2)
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s98        // add lo to SRD
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s99       // add hi to SRD


/* initC: remove ValuC vgpr buffer [0...0) from pool */

/* initC: remove acc vgpr buffer [0...256) from pool */

/* initC: remove ValuA/B vgpr buffer [4...132) from pool */

/* initC: write 0 to every accumulator register acc0..acc255, in ascending     */
/* order. Emitted with an assembler repeat block; initCAccIdx is an            */
/* assembly-time counter only and produces no code of its own.                 */
.set initCAccIdx, 0
.rept 256
v_accvgpr_write acc[initCAccIdx], 0                // initC
.set initCAccIdx, initCAccIdx+1
.endr
s_cmp_eq_u32 s[sgprLoopCounterL], 0                // at last iteration? (SCC=1 when LoopCounterL == 0)

/* after InitC, skip to end of prefetch last iter if numIter==0 */
/* The jump to label_PrefetchGlobalLastIterEnd is made by computing the target */
/* address in s[98:99] (PC from s_getpc_b64 plus an offset) and loading it     */
/* with s_setpc_b64, rather than with a direct s_cbranch. Scratch: s[98:100].  */
s_cbranch_scc0 label_NoBranch_8S4L1KCK9VFC7AQU     // Only branch on scc1: LoopCounterL != 0 skips the jump sequence
s_getpc_b64 s[98:99]                               // addr of next instr
s_add_i32 s100, label_PrefetchGlobalLastIterEnd, 4 // target branch offset
s_add_u32 s98, s98, s100                           // add target branch offset
s_addc_u32 s99, s99, 0                             // add high and carry
s_setpc_b64 s[98:99]                               // branch to label_PrefetchGlobalLastIterEnd
label_NoBranch_8S4L1KCK9VFC7AQU:
s_waitcnt vmcnt(0)                                 // wait for global read
s_barrier                                          // For stream-k / persistent loop

/* Second prefetch (PGR=2).
 * 1. Toggle the A/B LDS write addresses by XOR with SwapA / SwapB.
 * 2. If LoopCounterL == 1, jump to label_skipPGR2 (no loads, no second toggle).
 * 3. Otherwise issue 8 loads for A and 8 loads for B. Each is a
 *    buffer_load_dwordx4 with the lds modifier; M0 holds the LDS write address
 *    and is advanced by 4224 bytes between consecutive loads.
 * 4. Toggle the A/B LDS write addresses again (XOR with the same SwapA / SwapB,
 *    i.e. back to the value they had on entry to this section). */

/* local write a: no instructions are emitted here */

/* local write b: no instructions are emitted here */

/* local write swap a */
s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR

/* local write swap b */
s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // swap Red Blk SGPR
s_cmp_eq_u32 s[sgprLoopCounterL], 0x1              // PGR=2 but only 1 loop: SCC = 1 when LoopCounterL == 1
s_cbranch_scc1 label_skipPGR2                      // PGR=2 but only 1 loop: skip the second prefetch

s_mov_b32 m0, s[sgprLocalWriteAddrA]               // m0 <- LDS write address (A)
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> LDS 0_0_0_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0, lds // G -> LDS 0_0_1_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0, lds // G -> LDS 0_0_2_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0, lds // G -> LDS 0_0_3_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0, lds // G -> LDS 0_0_4_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0, lds // G -> LDS 0_0_5_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0, lds // G -> LDS 0_0_6_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0, lds // G -> LDS 0_0_7_0

s_mov_b32 m0, s[sgprLocalWriteAddrB]               // m0 <- LDS write address (B)
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> LDS 0_0_0_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0, lds // G -> LDS 0_0_1_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0, lds // G -> LDS 0_0_2_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0, lds // G -> LDS 0_0_3_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0, lds // G -> LDS 0_0_4_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0, lds // G -> LDS 0_0_5_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0, lds // G -> LDS 0_0_6_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> LDS 0_0_7_0

/* local write swap a */
s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR (second toggle)

/* local write swap b */
s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // swap Red Blk SGPR (second toggle)
label_skipPGR2:

s_barrier                                          // workgroup barrier before the first local reads

// LR A0, B0
// Initial local reads: 8 x ds_read_b128 each for A and B fill
// ValuA_X0_I0+0..+31 and ValuB_X0_I0+0..+31 from LDS byte offsets 0..448 (step 64).

ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+20:vgprValuA_X0_I0+20+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+28:vgprValuA_X0_I0+28+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=1 oIdx=0 buffer=0 iui=0

ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=1 oIdx=0 buffer=0 iui=0

s_waitcnt lgkmcnt(0)                               // wait for the local reads above before the first MFMA consumes them
  
/******************************************/
/* Unrolled Loop(s) - Begin               */
/******************************************/
/* Loop entry dispatch on LoopCounterL:
 *   == 1 -> label_toPGR1
 *   <= 2 -> label_LoopEndL (the unrolled loop body is not entered)
 *   else -> fall through into label_LoopBeginL */
label_openLoopL:
s_cmp_eq_u32 s[sgprLoopCounterL], 0x1              // SCC = 1 when LoopCounterL == 1
s_cbranch_scc1 label_toPGR1                        // PGR=2 but only 1 loop, toPGR1
s_cmp_le_u32 s[sgprLoopCounterL], 0x2              // SCC = 1 when LoopCounterL <= 2 (unsigned compare)
s_cbranch_scc1 label_LoopEndL                      // do not enter LoopL
label_LoopBeginL:

/******************************************/
/* Unrolled Loop 1/1 - Begin              */
/******************************************/

// B0 A0
// 16 MFMAs combining ValuB_X0_I0+{0,8,16,24} with ValuA_X0_I0+{0,8,16,24}.
// Accumulator base = 4 * mfmaIndex, where
// mfmaIndex = 8 * (ValuB offset / 8) + (ValuA offset / 8).
// Interleaved with the MFMAs in this section:
//   - local reads of ValuA_X0_I0+32..+63 (LDS offsets 512..960)
//   - SRD / ShadowLimit update for A, SRD base update for B
//   - the first five buffer_load ... lds loads of A (0_0_0_0 .. 0_0_4_0)
//   - local reads of ValuB_X0_I0+32..+55 (LDS offsets 512..832)
// NOTE(review): cbsz:1 / blgp:0 presumably select the 8-bit element formats of
// the two MFMA sources - confirm against the CDNA4 ISA guide.

/*  mfmaIndex:0  */
v_mfma_f32_16x16x128_f8f6f4 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[0:3] cbsz:1 blgp:0 // left value = acc[0+0:3+0]
ds_read_b128 v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+36:vgprValuA_X0_I0+36+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:1  */
v_mfma_f32_16x16x128_f8f6f4 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[4:7] cbsz:1 blgp:0 // left value = acc[4+0:7+0]
ds_read_b128 v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], v[vgprLocalReadAddrA] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+44:vgprValuA_X0_I0+44+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:2  */
v_mfma_f32_16x16x128_f8f6f4 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[8:11] cbsz:1 blgp:0 // left value = acc[8+0:11+0]
ds_read_b128 v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], v[vgprLocalReadAddrA] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+52:vgprValuA_X0_I0+52+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:3  */
v_mfma_f32_16x16x128_f8f6f4 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[12:15] cbsz:1 blgp:0 // left value = acc[12+0:15+0]
ds_read_b128 v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], v[vgprLocalReadAddrA] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+60:vgprValuA_X0_I0+60+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:8  */
v_mfma_f32_16x16x128_f8f6f4 acc[32:35], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[32:35] cbsz:1 blgp:0 // left value = acc[32+0:35+0]
/* global read inc A loopL */
/* s98:s99 = 64-bit increment for A: WrapUA on the wrap iteration, else GlobalReadIncsA+0 (upper dword 0) */
s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter?
s_cselect_b32 s98, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ?
s_cselect_b32 s99, s[sgprWrapUA+1], 0              // incUpper <- ?
s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99       // gra SRD += inc(upper)

/*  mfmaIndex:9  */
v_mfma_f32_16x16x128_f8f6f4 acc[36:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[36:39] cbsz:1 blgp:0 // left value = acc[36+0:39+0]
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc (lower)
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc (upper, with borrow)
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
s_mov_b32 m0, s[sgprLocalWriteAddrA]               // m0 <- LDS write address (A)
s_waitcnt lgkmcnt(0)                               // wait for the local reads of ValuA_X0_I0+32..+63 issued above

/*  mfmaIndex:10  */
v_mfma_f32_16x16x128_f8f6f4 acc[40:43], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[40:43] cbsz:1 blgp:0 // left value = acc[40+0:43+0]
s_barrier                                          // NOTE(review): presumably all waves must finish their A local reads before the A loads below write LDS - confirm


/*  mfmaIndex:11  */
v_mfma_f32_16x16x128_f8f6f4 acc[44:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[44:47] cbsz:1 blgp:0 // left value = acc[44+0:47+0]
/* global read inc B loopL */
/* s98:s99 are overwritten with the increment for B; the matching ShadowLimitB update follows mfmaIndex:6 */
s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter?
s_cselect_b32 s98, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ?
s_cselect_b32 s99, s[sgprWrapUB+1], 0              // incUpper <- ?
s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99       // gra SRD += inc(upper)

/*  mfmaIndex:16  */
v_mfma_f32_16x16x128_f8f6f4 acc[64:67], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[64:67] cbsz:1 blgp:0 // left value = acc[64+0:67+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> LDS 0_0_0_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line

/*  mfmaIndex:17  */
v_mfma_f32_16x16x128_f8f6f4 acc[68:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[68:71] cbsz:1 blgp:0 // left value = acc[68+0:71+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0, lds // G -> LDS 0_0_1_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line

/*  mfmaIndex:18  */
v_mfma_f32_16x16x128_f8f6f4 acc[72:75], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[72:75] cbsz:1 blgp:0 // left value = acc[72+0:75+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0, lds // G -> LDS 0_0_2_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line

  /*  mfmaIndex:19  */
v_mfma_f32_16x16x128_f8f6f4 acc[76:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[76:79] cbsz:1 blgp:0 // left value = acc[76+0:79+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0, lds // G -> LDS 0_0_3_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line

/*  mfmaIndex:24  */
v_mfma_f32_16x16x128_f8f6f4 acc[96:99], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[96:99] cbsz:1 blgp:0 // left value = acc[96+0:99+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0, lds // G -> LDS 0_0_4_0 (m0 is advanced for the next A load after mfmaIndex:7)

/*  mfmaIndex:25  */
v_mfma_f32_16x16x128_f8f6f4 acc[100:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[100:103] cbsz:1 blgp:0 // left value = acc[100+0:103+0]
ds_read_b128 v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+36:vgprValuB_X0_I0+36+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:26  */
v_mfma_f32_16x16x128_f8f6f4 acc[104:107], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[104:107] cbsz:1 blgp:0 // left value = acc[104+0:107+0]
ds_read_b128 v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+44:vgprValuB_X0_I0+44+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:27  */
v_mfma_f32_16x16x128_f8f6f4 acc[108:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[108:111] cbsz:1 blgp:0 // left value = acc[108+0:111+0]
ds_read_b128 v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+52:vgprValuB_X0_I0+52+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=1 oIdx=0 buffer=0 iui=0

// B0 A1
// 16 MFMAs combining ValuB_X0_I0+{0,8,16,24} with ValuA_X0_I0+{32,40,48,56}.
// Accumulator base = 4 * mfmaIndex, as in the other sections.
// Interleaved with the MFMAs in this section:
//   - the last two local reads of B (ValuB_X0_I0+56..+63)
//   - toggling of the A/B local read addresses
//   - ShadowLimit / SRD limit update for B
//   - the remaining three A loads (0_0_5_0 .. 0_0_7_0) and the first two B loads

/*  mfmaIndex:4  */
v_mfma_f32_16x16x128_f8f6f4 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[16:19] cbsz:1 blgp:0 // left value = acc[16+0:19+0]
ds_read_b128 v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+60:vgprValuB_X0_I0+60+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:5  */
v_mfma_f32_16x16x128_f8f6f4 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[20:23] cbsz:1 blgp:0 // left value = acc[20+0:23+0]
/* local read swap offsets a */
v_xor_b32 v[vgprLocalReadAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // swap Red Blk
/* local read swap offsets b */
v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // swap Red Blk

/*  mfmaIndex:6  */
v_mfma_f32_16x16x128_f8f6f4 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[24:27] cbsz:1 blgp:0 // left value = acc[24+0:27+0]
/* s98:s99 still hold the B increment selected after mfmaIndex:11 */
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc (lower)
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc (upper, with borrow)
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32

  /*  mfmaIndex:7  */
v_mfma_f32_16x16x128_f8f6f4 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[28:31] cbsz:1 blgp:0 // left value = acc[28+0:31+0]
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line (advance that follows A load 0_0_4_0)

/*  mfmaIndex:12  */
v_mfma_f32_16x16x128_f8f6f4 acc[48:51], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[48:51] cbsz:1 blgp:0 // left value = acc[48+0:51+0]
s_waitcnt lgkmcnt(0)                               // wait for the local reads of ValuB_X0_I0+32..+63

/*  mfmaIndex:13  */
v_mfma_f32_16x16x128_f8f6f4 acc[52:55], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[52:55] cbsz:1 blgp:0 // left value = acc[52+0:55+0]
s_barrier                                          // NOTE(review): presumably all waves must finish their B local reads before the B loads below write LDS - confirm

/*  mfmaIndex:14  */
v_mfma_f32_16x16x128_f8f6f4 acc[56:59], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[56:59] cbsz:1 blgp:0 // left value = acc[56+0:59+0]
/*  mfmaIndex:15  */
v_mfma_f32_16x16x128_f8f6f4 acc[60:63], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[60:63] cbsz:1 blgp:0 // left value = acc[60+0:63+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0, lds // G -> LDS 0_0_5_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line

/*  mfmaIndex:20  */
v_mfma_f32_16x16x128_f8f6f4 acc[80:83], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[80:83] cbsz:1 blgp:0 // left value = acc[80+0:83+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0, lds // G -> LDS 0_0_6_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line

/*  mfmaIndex:21  */
v_mfma_f32_16x16x128_f8f6f4 acc[84:87], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[84:87] cbsz:1 blgp:0 // left value = acc[84+0:87+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0, lds // G -> LDS 0_0_7_0 (last A load of this iteration)
s_mov_b32 m0, s[sgprLocalWriteAddrB]               // m0 <- LDS write address (B)

/*  mfmaIndex:22  */
v_mfma_f32_16x16x128_f8f6f4 acc[88:91], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[88:91] cbsz:1 blgp:0 // left value = acc[88+0:91+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> LDS 0_0_0_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line

/*  mfmaIndex:23  */
v_mfma_f32_16x16x128_f8f6f4 acc[92:95], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[92:95] cbsz:1 blgp:0 // left value = acc[92+0:95+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0, lds // G -> LDS 0_0_1_0

/*  mfmaIndex:28  */
v_mfma_f32_16x16x128_f8f6f4 acc[112:115], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[112:115] cbsz:1 blgp:0 // left value = acc[112+0:115+0]
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line

/*  mfmaIndex:29  */
v_mfma_f32_16x16x128_f8f6f4 acc[116:119], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[116:119] cbsz:1 blgp:0 // left value = acc[116+0:119+0]
/*  mfmaIndex:30  */
v_mfma_f32_16x16x128_f8f6f4 acc[120:123], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[120:123] cbsz:1 blgp:0 // left value = acc[120+0:123+0]
/*  mfmaIndex:31  */
v_mfma_f32_16x16x128_f8f6f4 acc[124:127], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[124:127] cbsz:1 blgp:0 // left value = acc[124+0:127+0]


// B1 A0
// 16 MFMAs combining ValuB_X0_I0+{32,40,48,56} with ValuA_X0_I0+{0,8,16,24}.
// Accumulator base = 4 * mfmaIndex, as in the other sections.
// B loads 0_0_2_0 .. 0_0_6_0 are interleaved with the MFMAs; the section ends
// with s_waitcnt vmcnt(15) followed by s_barrier.

/*  mfmaIndex:32  */
v_mfma_f32_16x16x128_f8f6f4 acc[128:131], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[128:131] cbsz:1 blgp:0 // left value = acc[128+0:131+0]
/*  mfmaIndex:33  */
v_mfma_f32_16x16x128_f8f6f4 acc[132:135], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[132:135] cbsz:1 blgp:0 // left value = acc[132+0:135+0]
/*  mfmaIndex:34  */
v_mfma_f32_16x16x128_f8f6f4 acc[136:139], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[136:139] cbsz:1 blgp:0 // left value = acc[136+0:139+0]
/*  mfmaIndex:35  */
v_mfma_f32_16x16x128_f8f6f4 acc[140:143], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[140:143] cbsz:1 blgp:0 // left value = acc[140+0:143+0]

/*  mfmaIndex:40  */
v_mfma_f32_16x16x128_f8f6f4 acc[160:163], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[160:163] cbsz:1 blgp:0 // left value = acc[160+0:163+0]
/*  mfmaIndex:41  */
v_mfma_f32_16x16x128_f8f6f4 acc[164:167], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[164:167] cbsz:1 blgp:0 // left value = acc[164+0:167+0]
/*  mfmaIndex:42  */
v_mfma_f32_16x16x128_f8f6f4 acc[168:171], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[168:171] cbsz:1 blgp:0 // left value = acc[168+0:171+0]
/*  mfmaIndex:43  */
v_mfma_f32_16x16x128_f8f6f4 acc[172:175], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[172:175] cbsz:1 blgp:0 // left value = acc[172+0:175+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0, lds // G -> LDS 0_0_2_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line

/*  mfmaIndex:48  */
v_mfma_f32_16x16x128_f8f6f4 acc[192:195], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[192:195] cbsz:1 blgp:0 // left value = acc[192+0:195+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0, lds // G -> LDS 0_0_3_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line

/*  mfmaIndex:49  */
v_mfma_f32_16x16x128_f8f6f4 acc[196:199], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[196:199] cbsz:1 blgp:0 // left value = acc[196+0:199+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0, lds // G -> LDS 0_0_4_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line

/*  mfmaIndex:50  */
v_mfma_f32_16x16x128_f8f6f4 acc[200:203], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[200:203] cbsz:1 blgp:0 // left value = acc[200+0:203+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0, lds // G -> LDS 0_0_5_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line

  /*  mfmaIndex:51  */
v_mfma_f32_16x16x128_f8f6f4 acc[204:207], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[204:207] cbsz:1 blgp:0 // left value = acc[204+0:207+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0, lds // G -> LDS 0_0_6_0

/*  mfmaIndex:56  */
v_mfma_f32_16x16x128_f8f6f4 acc[224:227], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[224:227] cbsz:1 blgp:0 // left value = acc[224+0:227+0]
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
  /*  mfmaIndex:57  */
v_mfma_f32_16x16x128_f8f6f4 acc[228:231], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[228:231] cbsz:1 blgp:0 // left value = acc[228+0:231+0]

  /*  mfmaIndex:58  */
v_mfma_f32_16x16x128_f8f6f4 acc[232:235], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[232:235] cbsz:1 blgp:0 // left value = acc[232+0:235+0]
s_waitcnt vmcnt(15)                                // 15 loads (8 A + 7 B) issued since label_LoopBeginL may stay in flight; older loads must be done
  /*  mfmaIndex:59  */
v_mfma_f32_16x16x128_f8f6f4 acc[236:239], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[236:239] cbsz:1 blgp:0 // left value = acc[236+0:239+0]
s_barrier                                          // NOTE(review): presumably makes the earlier global -> LDS data visible to all waves before the local reads that follow - confirm

// B1 A1 + LR A0 B0


/*  mfmaIndex:36  */
v_mfma_f32_16x16x128_f8f6f4 acc[144:147], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[144:147] cbsz:1 blgp:0 // left value = acc[144+0:147+0]
ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=1 oIdx=0 buffer=0 iui=0

  /*  mfmaIndex:37  */
v_mfma_f32_16x16x128_f8f6f4 acc[148:151], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[148:151] cbsz:1 blgp:0 // left value = acc[148+0:151+0]
ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:38  */
v_mfma_f32_16x16x128_f8f6f4 acc[152:155], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[152:155] cbsz:1 blgp:0 // left value = acc[152+0:155+0]
ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+20:vgprValuA_X0_I0+20+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:39  */
v_mfma_f32_16x16x128_f8f6f4 acc[156:159], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[156:159] cbsz:1 blgp:0 // left value = acc[156+0:159+0]
ds_read_b128 v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+28:vgprValuA_X0_I0+28+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:44  */
v_mfma_f32_16x16x128_f8f6f4 acc[176:179], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[176:179] cbsz:1 blgp:0 // left value = acc[176+0:179+0]
ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:45  */
v_mfma_f32_16x16x128_f8f6f4 acc[180:183], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[180:183] cbsz:1 blgp:0 // left value = acc[180+0:183+0]
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> Reg 0_0_7_0

/*  mfmaIndex:46  */
v_mfma_f32_16x16x128_f8f6f4 acc[184:187], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[184:187] cbsz:1 blgp:0 // left value = acc[184+0:187+0]
ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:47  */
v_mfma_f32_16x16x128_f8f6f4 acc[188:191], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[188:191] cbsz:1 blgp:0 // left value = acc[188+0:191+0]
ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:52  */
v_mfma_f32_16x16x128_f8f6f4 acc[208:211], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[208:211] cbsz:1 blgp:0 // left value = acc[208+0:211+0]
ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:53  */
v_mfma_f32_16x16x128_f8f6f4 acc[212:215], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[212:215] cbsz:1 blgp:0 // left value = acc[212+0:215+0]
/* local write swap offsets a */
s_xor_b32 s[sgprLocalWriteAddrA], s[sgprSwapA], s[sgprLocalWriteAddrA] // swap Red Blk SGPR
/* local write swap offsets b */
s_xor_b32 s[sgprLocalWriteAddrB], s[sgprSwapB], s[sgprLocalWriteAddrB] // swap Red Blk SGPR


  /*  mfmaIndex:54  */
v_mfma_f32_16x16x128_f8f6f4 acc[216:219], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[216:219] cbsz:1 blgp:0 // left value = acc[216+0:219+0]
/*  mfmaIndex:55  */
v_mfma_f32_16x16x128_f8f6f4 acc[220:223], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[220:223] cbsz:1 blgp:0 // left value = acc[220+0:223+0]

/*  mfmaIndex:60  */
v_mfma_f32_16x16x128_f8f6f4 acc[240:243], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[240:243] cbsz:1 blgp:0 // left value = acc[240+0:243+0]
s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], 1 // dec counterL
s_cmp_eq_i32 s[sgprLoopCounterL], 0x2              // counterL==2
/*  mfmaIndex:61  */
v_mfma_f32_16x16x128_f8f6f4 acc[244:247], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[244:247] cbsz:1 blgp:0 // left value = acc[244+0:247+0]
s_waitcnt lgkmcnt(0)
/*  mfmaIndex:62  */
v_mfma_f32_16x16x128_f8f6f4 acc[248:251], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[248:251] cbsz:1 blgp:0 // left value = acc[248+0:251+0]

/*  mfmaIndex:63  */
v_mfma_f32_16x16x128_f8f6f4 acc[252:255], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[252:255] cbsz:1 blgp:0 // left value = acc[252+0:255+0]

/******************************************/
/* Unrolled Loop - End                    */
/******************************************/

/* closeLoop loopL finalLoop=1 tailLoop=0 */
s_cbranch_scc0 label_LoopBeginL                    // restart LoopL
label_LoopEndL:

/* Before NLL: Check VGPR.checkin for INT8 LW */

/******************************************/
/* Ord. NoGlobalLoadLoop - Begin          */
/******************************************/
/*
 * First MFMA group of this section (mfmaIndex 0-3, 8-11, 16-19, 24-27).
 * Each v_mfma_f32_16x16x128_f8f6f4 takes an 8-VGPR slice of ValuB as its
 * first source and an 8-VGPR slice of ValuA as its second source, and
 * accumulates in place into a 4-register acc range.  This group pairs
 * ValuB+0/+8/+16/+24 with ValuA+0/+8/+16/+24 and updates acc[0:15],
 * acc[32:47], acc[64:79] and acc[96:111].
 * Interleaved with the MFMAs:
 *   - 16 ds_read_b128 that fill ValuA+32..+63 and ValuB+32..+63 from LDS
 *     offsets 512..960 (64-byte steps),
 *   - the XOR swap of both local-read base addresses,
 *   - scalar bookkeeping of the A and B buffer descriptors.
 * No buffer_load is issued anywhere in this group.
 * NOTE(review): cbsz:1 / blgp:0 select the 8-bit element formats of the two
 * sources; the exact encoding is not derivable from this file - confirm
 * against the ISA guide.
 */
/* Block until at most 16 of this wave's vector-memory operations are still outstanding. */
s_waitcnt vmcnt(16)                                // 10wait for global read

/* iter 0 (reset local read pointers iteration)  (swap and reset local write pointers iteration)  (swap local read pointers iteration)  */
/*  grEndMfmaIndex:6, lwStartMfmaIndex:37, lwEndMfmaIndex:63  */
/*  numMfmaForLR:36, syncPlrMfmaIndex:0  */
/*  mfmaIndex:0  */
v_mfma_f32_16x16x128_f8f6f4 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[0:3] cbsz:1 blgp:0 // left value = acc[0+0:3+0]
/* Workgroup barrier after the vmcnt wait and before the first ds_read below. */
/* NOTE(review): the kernel header lists DirectToLdsA/B=True, so this presumably makes every wave's LDS-destined loads visible before LDS is read - confirm. */
s_barrier

/* A operand, second half: 8 x ds_read_b128 -> ValuA+32..+63, LDS offsets 512..960. */
/*  mfmaIndex:1  */
v_mfma_f32_16x16x128_f8f6f4 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[4:7] cbsz:1 blgp:0 // left value = acc[4+0:7+0]
ds_read_b128 v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+36:vgprValuA_X0_I0+36+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:2  */
v_mfma_f32_16x16x128_f8f6f4 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[8:11] cbsz:1 blgp:0 // left value = acc[8+0:11+0]
ds_read_b128 v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], v[vgprLocalReadAddrA] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+44:vgprValuA_X0_I0+44+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:3  */
v_mfma_f32_16x16x128_f8f6f4 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[12:15] cbsz:1 blgp:0 // left value = acc[12+0:15+0]
ds_read_b128 v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], v[vgprLocalReadAddrA] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+52:vgprValuA_X0_I0+52+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:8  */
v_mfma_f32_16x16x128_f8f6f4 acc[32:35], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[32:35] cbsz:1 blgp:0 // left value = acc[32+0:35+0]
ds_read_b128 v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], v[vgprLocalReadAddrA] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+60:vgprValuA_X0_I0+60+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=1 oIdx=0 buffer=0 iui=0
/* B operand, second half: 8 x ds_read_b128 -> ValuB+32..+63, LDS offsets 512..960. */
/*  mfmaIndex:9  */
v_mfma_f32_16x16x128_f8f6f4 acc[36:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[36:39] cbsz:1 blgp:0 // left value = acc[36+0:39+0]
ds_read_b128 v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+36:vgprValuB_X0_I0+36+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:10  */
v_mfma_f32_16x16x128_f8f6f4 acc[40:43], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[40:43] cbsz:1 blgp:0 // left value = acc[40+0:43+0]
ds_read_b128 v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+44:vgprValuB_X0_I0+44+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:11  */
v_mfma_f32_16x16x128_f8f6f4 acc[44:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[44:47] cbsz:1 blgp:0 // left value = acc[44+0:47+0]
ds_read_b128 v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+52:vgprValuB_X0_I0+52+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:16  */
v_mfma_f32_16x16x128_f8f6f4 acc[64:67], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[64:67] cbsz:1 blgp:0 // left value = acc[64+0:67+0]
ds_read_b128 v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+60:vgprValuB_X0_I0+60+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=1 oIdx=0 buffer=0 iui=0

/* All 16 reads of this group have been issued.  Toggle both read base addresses with their swap masks; */
/* the ds_reads in the later "B1 A1" group of this section use the toggled addresses. */
/*  mfmaIndex:17  */
v_mfma_f32_16x16x128_f8f6f4 acc[68:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[68:71] cbsz:1 blgp:0 // left value = acc[68+0:71+0]
/* local read swap offsets a */
v_xor_b32 v[vgprLocalReadAddrA], v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // swap Red Blk
/* local read swap offsets b */
v_xor_b32 v[vgprLocalReadAddrB], v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // swap Red Blk

/*
 * Descriptor bookkeeping for A (s98/s99 are scratch, reused for B below):
 *   s_cmp sets SCC = (LoopCounterL == StaggerUIter); both s_cselect consume it,
 *   picking the 64-bit wrap increment when SCC=1, else GlobalReadIncsA with a
 *   zero upper word.  s_add_u32/s_addc_u32 then add that 64-bit value to SrdA
 *   words 0 and 1; the carry travels through SCC, so these two must stay
 *   adjacent and in order.
 */
/*  mfmaIndex:18  */
v_mfma_f32_16x16x128_f8f6f4 acc[72:75], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[72:75] cbsz:1 blgp:0 // left value = acc[72+0:75+0]
/* global read inc A loopL */
s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter?
s_cselect_b32 s98, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ?
s_cselect_b32 s99, s[sgprWrapUA+1], 0              // incUpper <- ?
s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99       // gra SRD += inc(upper)

/* Subtract the same 64-bit increment from ShadowLimitA (borrow via SCC), then set SrdA word 2 */
/* to the low limit word if the upper limit word is zero, otherwise to BufferLimit. */
/*  mfmaIndex:19  */
v_mfma_f32_16x16x128_f8f6f4 acc[76:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[76:79] cbsz:1 blgp:0 // left value = acc[76+0:79+0]
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc)
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc)
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32

/* Same descriptor bookkeeping for B: select increment, 64-bit add into SrdB words 0 and 1. */
/*  mfmaIndex:24  */
v_mfma_f32_16x16x128_f8f6f4 acc[96:99], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[96:99] cbsz:1 blgp:0 // left value = acc[96+0:99+0]
/* global read inc B loopL */
s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter?
s_cselect_b32 s98, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ?
s_cselect_b32 s99, s[sgprWrapUB+1], 0              // incUpper <- ?
s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99       // gra SRD += inc(upper)

/* Shrink ShadowLimitB by the increment and refresh SrdB word 2 the same way as for A. */
/*  mfmaIndex:25  */
v_mfma_f32_16x16x128_f8f6f4 acc[100:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[100:103] cbsz:1 blgp:0 // left value = acc[100+0:103+0]
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc)
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc)
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32

/*  mfmaIndex:26  */
v_mfma_f32_16x16x128_f8f6f4 acc[104:107], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[104:107] cbsz:1 blgp:0 // left value = acc[104+0:107+0]


/*  mfmaIndex:27  */
v_mfma_f32_16x16x128_f8f6f4 acc[108:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[108:111] cbsz:1 blgp:0 // left value = acc[108+0:111+0]
/* Drain every outstanding LDS read: the MFMAs that follow consume ValuA+32.. (and later ValuB+32..), which the ds_reads above are filling. */
s_waitcnt lgkmcnt(0)

// B0 A1
/*
 * Second MFMA group of the NoGlobalLoadLoop section: mfmaIndex 4-7, 12-15,
 * 20-23 and 28-31.  It pairs the first-half B slices (ValuB+0/+8/+16/+24)
 * with the second-half A slices (ValuA+32/+40/+48/+56) and accumulates in
 * place into acc[16:31], acc[48:63], acc[80:95] and acc[112:127].
 * The group is pure compute: it contains no LDS, global-memory, scalar or
 * synchronisation instructions.  Its ValuA+32.. inputs are filled by the
 * ds_reads of the preceding group, which ends in s_waitcnt lgkmcnt(0).
 */

/*  mfmaIndex:4  */
v_mfma_f32_16x16x128_f8f6f4 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[16:19] cbsz:1 blgp:0 // left value = acc[16+0:19+0]

/*  mfmaIndex:5  */
v_mfma_f32_16x16x128_f8f6f4 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[20:23] cbsz:1 blgp:0 // left value = acc[20+0:23+0]


/*  mfmaIndex:6  */
v_mfma_f32_16x16x128_f8f6f4 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[24:27] cbsz:1 blgp:0 // left value = acc[24+0:27+0]
/*  mfmaIndex:7  */
v_mfma_f32_16x16x128_f8f6f4 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[28:31] cbsz:1 blgp:0 // left value = acc[28+0:31+0]


/*  mfmaIndex:12  */
v_mfma_f32_16x16x128_f8f6f4 acc[48:51], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[48:51] cbsz:1 blgp:0 // left value = acc[48+0:51+0]


/*  mfmaIndex:13  */
v_mfma_f32_16x16x128_f8f6f4 acc[52:55], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[52:55] cbsz:1 blgp:0 // left value = acc[52+0:55+0]


/*  mfmaIndex:14  */
v_mfma_f32_16x16x128_f8f6f4 acc[56:59], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[56:59] cbsz:1 blgp:0 // left value = acc[56+0:59+0]
/*  mfmaIndex:15  */
v_mfma_f32_16x16x128_f8f6f4 acc[60:63], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[60:63] cbsz:1 blgp:0 // left value = acc[60+0:63+0]

/*  mfmaIndex:20  */
v_mfma_f32_16x16x128_f8f6f4 acc[80:83], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[80:83] cbsz:1 blgp:0 // left value = acc[80+0:83+0]

/*  mfmaIndex:21  */
v_mfma_f32_16x16x128_f8f6f4 acc[84:87], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[84:87] cbsz:1 blgp:0 // left value = acc[84+0:87+0]

/*  mfmaIndex:22  */
v_mfma_f32_16x16x128_f8f6f4 acc[88:91], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[88:91] cbsz:1 blgp:0 // left value = acc[88+0:91+0]

/*  mfmaIndex:23  */
v_mfma_f32_16x16x128_f8f6f4 acc[92:95], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[92:95] cbsz:1 blgp:0 // left value = acc[92+0:95+0]

/*  mfmaIndex:28  */
v_mfma_f32_16x16x128_f8f6f4 acc[112:115], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[112:115] cbsz:1 blgp:0 // left value = acc[112+0:115+0]
/*  mfmaIndex:29  */
v_mfma_f32_16x16x128_f8f6f4 acc[116:119], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[116:119] cbsz:1 blgp:0 // left value = acc[116+0:119+0]
/*  mfmaIndex:30  */
v_mfma_f32_16x16x128_f8f6f4 acc[120:123], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[120:123] cbsz:1 blgp:0 // left value = acc[120+0:123+0]
/*  mfmaIndex:31  */
v_mfma_f32_16x16x128_f8f6f4 acc[124:127], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[124:127] cbsz:1 blgp:0 // left value = acc[124+0:127+0]


// B1 A0
/*
 * Third MFMA group of the NoGlobalLoadLoop section: mfmaIndex 32-35, 40-43,
 * 48-51 and 56-59.  It pairs the second-half B slices (ValuB+32/+40/+48/+56)
 * with the first-half A slices (ValuA+0/+8/+16/+24) and accumulates in place
 * into acc[128:143], acc[160:175], acc[192:207] and acc[224:239].
 * No LDS or global-memory instruction is issued here.  The group ends with
 * s_waitcnt vmcnt(0) followed by s_barrier, placed around the final MFMA.
 */

/*  mfmaIndex:32  */
v_mfma_f32_16x16x128_f8f6f4 acc[128:131], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[128:131] cbsz:1 blgp:0 // left value = acc[128+0:131+0]
/*  mfmaIndex:33  */
v_mfma_f32_16x16x128_f8f6f4 acc[132:135], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[132:135] cbsz:1 blgp:0 // left value = acc[132+0:135+0]
/*  mfmaIndex:34  */
v_mfma_f32_16x16x128_f8f6f4 acc[136:139], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[136:139] cbsz:1 blgp:0 // left value = acc[136+0:139+0]
/*  mfmaIndex:35  */
v_mfma_f32_16x16x128_f8f6f4 acc[140:143], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[140:143] cbsz:1 blgp:0 // left value = acc[140+0:143+0]

/*  mfmaIndex:40  */
v_mfma_f32_16x16x128_f8f6f4 acc[160:163], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[160:163] cbsz:1 blgp:0 // left value = acc[160+0:163+0]
/*  mfmaIndex:41  */
v_mfma_f32_16x16x128_f8f6f4 acc[164:167], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[164:167] cbsz:1 blgp:0 // left value = acc[164+0:167+0]
/*  mfmaIndex:42  */
v_mfma_f32_16x16x128_f8f6f4 acc[168:171], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[168:171] cbsz:1 blgp:0 // left value = acc[168+0:171+0]
/*  mfmaIndex:43  */
v_mfma_f32_16x16x128_f8f6f4 acc[172:175], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[172:175] cbsz:1 blgp:0 // left value = acc[172+0:175+0]

/*  mfmaIndex:48  */
v_mfma_f32_16x16x128_f8f6f4 acc[192:195], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[192:195] cbsz:1 blgp:0 // left value = acc[192+0:195+0]

/*  mfmaIndex:49  */
v_mfma_f32_16x16x128_f8f6f4 acc[196:199], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[196:199] cbsz:1 blgp:0 // left value = acc[196+0:199+0]

/*  mfmaIndex:50  */
v_mfma_f32_16x16x128_f8f6f4 acc[200:203], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[200:203] cbsz:1 blgp:0 // left value = acc[200+0:203+0]

  /*  mfmaIndex:51  */
v_mfma_f32_16x16x128_f8f6f4 acc[204:207], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[204:207] cbsz:1 blgp:0 // left value = acc[204+0:207+0]

/*  mfmaIndex:56  */
v_mfma_f32_16x16x128_f8f6f4 acc[224:227], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[224:227] cbsz:1 blgp:0 // left value = acc[224+0:227+0]

/*  mfmaIndex:57  */
v_mfma_f32_16x16x128_f8f6f4 acc[228:231], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[228:231] cbsz:1 blgp:0 // left value = acc[228+0:231+0]

/*  mfmaIndex:58  */
v_mfma_f32_16x16x128_f8f6f4 acc[232:235], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[232:235] cbsz:1 blgp:0 // left value = acc[232+0:235+0]
/* Wait until this wave has no vector-memory operation left in flight. */
s_waitcnt vmcnt(0)                                // 10wait for global read
/*  mfmaIndex:59  */
v_mfma_f32_16x16x128_f8f6f4 acc[236:239], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[236:239] cbsz:1 blgp:0 // left value = acc[236+0:239+0]
/* Workgroup barrier before the next group starts reading LDS again. */
/* NOTE(review): the kernel header lists DirectToLdsA/B=True, so vmcnt(0) + barrier presumably guarantees that every wave's LDS-destined loads have landed - confirm. */
s_barrier

// B1 A1
/*
 * Final MFMA group of the NoGlobalLoadLoop section: mfmaIndex 36-39, 44-47,
 * 52-55 and 60-63.  It pairs the second-half B slices (ValuB+32/+40/+48/+56)
 * with the second-half A slices (ValuA+32/+40/+48/+56) and accumulates in
 * place into acc[144:159], acc[176:191], acc[208:223] and acc[240:255].
 * Interleaved with the first eight MFMAs are 16 ds_read_b128 that refill
 * ValuA+0..+31 and ValuB+0..+31 from LDS offsets 0..448 (64-byte steps).
 * No MFMA in this group reads those first-half registers, so the refill
 * does not disturb the operands being consumed here.
 */


/* A operand, first half: 8 x ds_read_b128 -> ValuA+0..+31. */
/*  mfmaIndex:36  */
v_mfma_f32_16x16x128_f8f6f4 acc[144:147], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[144:147] cbsz:1 blgp:0 // left value = acc[144+0:147+0]
ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=1 oIdx=0 buffer=0 iui=0

  /*  mfmaIndex:37  */
v_mfma_f32_16x16x128_f8f6f4 acc[148:151], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[148:151] cbsz:1 blgp:0 // left value = acc[148+0:151+0]
ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:38  */
v_mfma_f32_16x16x128_f8f6f4 acc[152:155], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[152:155] cbsz:1 blgp:0 // left value = acc[152+0:155+0]
ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+20:vgprValuA_X0_I0+20+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:39  */
v_mfma_f32_16x16x128_f8f6f4 acc[156:159], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[156:159] cbsz:1 blgp:0 // left value = acc[156+0:159+0]
ds_read_b128 v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+28:vgprValuA_X0_I0+28+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=1 oIdx=0 buffer=0 iui=0

/* B operand, first half: 8 x ds_read_b128 -> ValuB+0..+31. */
/*  mfmaIndex:44  */
v_mfma_f32_16x16x128_f8f6f4 acc[176:179], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[176:179] cbsz:1 blgp:0 // left value = acc[176+0:179+0]
ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:45  */
v_mfma_f32_16x16x128_f8f6f4 acc[180:183], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[180:183] cbsz:1 blgp:0 // left value = acc[180+0:183+0]
ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:46  */
v_mfma_f32_16x16x128_f8f6f4 acc[184:187], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[184:187] cbsz:1 blgp:0 // left value = acc[184+0:187+0]
ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:47  */
v_mfma_f32_16x16x128_f8f6f4 acc[188:191], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[188:191] cbsz:1 blgp:0 // left value = acc[188+0:191+0]
ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=1 oIdx=0 buffer=0 iui=0

/* The remaining eight MFMAs carry no interleaved memory instructions. */
/*  mfmaIndex:52  */
v_mfma_f32_16x16x128_f8f6f4 acc[208:211], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[208:211] cbsz:1 blgp:0 // left value = acc[208+0:211+0]


/*  mfmaIndex:53  */
v_mfma_f32_16x16x128_f8f6f4 acc[212:215], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[212:215] cbsz:1 blgp:0 // left value = acc[212+0:215+0]

  /*  mfmaIndex:54  */
v_mfma_f32_16x16x128_f8f6f4 acc[216:219], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[216:219] cbsz:1 blgp:0 // left value = acc[216+0:219+0]
/*  mfmaIndex:55  */
v_mfma_f32_16x16x128_f8f6f4 acc[220:223], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[220:223] cbsz:1 blgp:0 // left value = acc[220+0:223+0]

/*  mfmaIndex:60  */
v_mfma_f32_16x16x128_f8f6f4 acc[240:243], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[240:243] cbsz:1 blgp:0 // left value = acc[240+0:243+0]

/*  mfmaIndex:61  */
v_mfma_f32_16x16x128_f8f6f4 acc[244:247], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[244:247] cbsz:1 blgp:0 // left value = acc[244+0:247+0]
/* Drain all LDS reads issued above: the first MFMA after label_toPGR1 consumes ValuA+0.. and ValuB+0.., which were refilled in this group. */
s_waitcnt lgkmcnt(0)
/*  mfmaIndex:62  */
v_mfma_f32_16x16x128_f8f6f4 acc[248:251], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[248:251] cbsz:1 blgp:0 // left value = acc[248+0:251+0]

/*  mfmaIndex:63  */
v_mfma_f32_16x16x128_f8f6f4 acc[252:255], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[252:255] cbsz:1 blgp:0 // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=16 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=16 */
label_toPGR1:

/******************************************/
/* Ord. NoLoadLoop - Begin                */
/******************************************/
s_waitcnt vmcnt(0)                                 // 10wait for global read

/*  mfmaIndex:0  */
v_mfma_f32_16x16x128_f8f6f4 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[0:3] cbsz:1 blgp:0 // left value = acc[0+0:3+0]
s_barrier

/*  mfmaIndex:1  */
v_mfma_f32_16x16x128_f8f6f4 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[4:7] cbsz:1 blgp:0 // left value = acc[4+0:7+0]
ds_read_b128 v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+36:vgprValuA_X0_I0+36+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:2  */
v_mfma_f32_16x16x128_f8f6f4 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[8:11] cbsz:1 blgp:0 // left value = acc[8+0:11+0]
ds_read_b128 v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], v[vgprLocalReadAddrA] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+44:vgprValuA_X0_I0+44+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:3  */
v_mfma_f32_16x16x128_f8f6f4 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[12:15] cbsz:1 blgp:0 // left value = acc[12+0:15+0]
ds_read_b128 v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], v[vgprLocalReadAddrA] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+52:vgprValuA_X0_I0+52+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:8  */
v_mfma_f32_16x16x128_f8f6f4 acc[32:35], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[32:35] cbsz:1 blgp:0 // left value = acc[32+0:35+0]
ds_read_b128 v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], v[vgprLocalReadAddrA] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+60:vgprValuA_X0_I0+60+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=1 oIdx=0 buffer=0 iui=0
/*  mfmaIndex:9  */
v_mfma_f32_16x16x128_f8f6f4 acc[36:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[36:39] cbsz:1 blgp:0 // left value = acc[36+0:39+0]
ds_read_b128 v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+36:vgprValuB_X0_I0+36+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:10  */
v_mfma_f32_16x16x128_f8f6f4 acc[40:43], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[40:43] cbsz:1 blgp:0 // left value = acc[40+0:43+0]
ds_read_b128 v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+44:vgprValuB_X0_I0+44+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:11  */
v_mfma_f32_16x16x128_f8f6f4 acc[44:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[44:47] cbsz:1 blgp:0 // left value = acc[44+0:47+0]
ds_read_b128 v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+52:vgprValuB_X0_I0+52+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:16  */
v_mfma_f32_16x16x128_f8f6f4 acc[64:67], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[64:67] cbsz:1 blgp:0 // left value = acc[64+0:67+0]
ds_read_b128 v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+60:vgprValuB_X0_I0+60+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=1 oIdx=0 buffer=0 iui=0

/*  mfmaIndex:17  */
v_mfma_f32_16x16x128_f8f6f4 acc[68:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[68:71] cbsz:1 blgp:0 // left value = acc[68+0:71+0]

/*  mfmaIndex:18  */
v_mfma_f32_16x16x128_f8f6f4 acc[72:75], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[72:75] cbsz:1 blgp:0 // left value = acc[72+0:75+0]

  /*  mfmaIndex:19  */
v_mfma_f32_16x16x128_f8f6f4 acc[76:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[76:79] cbsz:1 blgp:0 // left value = acc[76+0:79+0]

/*  mfmaIndex:24  */
v_mfma_f32_16x16x128_f8f6f4 acc[96:99], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[96:99] cbsz:1 blgp:0 // left value = acc[96+0:99+0]

/*  mfmaIndex:25  */
v_mfma_f32_16x16x128_f8f6f4 acc[100:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[100:103] cbsz:1 blgp:0 // left value = acc[100+0:103+0]


/*  mfmaIndex:26  */
v_mfma_f32_16x16x128_f8f6f4 acc[104:107], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[104:107] cbsz:1 blgp:0 // left value = acc[104+0:107+0]

/*  mfmaIndex:27  */
v_mfma_f32_16x16x128_f8f6f4 acc[108:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[108:111] cbsz:1 blgp:0 // left value = acc[108+0:111+0]
s_waitcnt lgkmcnt(0)

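// B0 A1: B operands ValuB+0..+31 (first half) x A operands ValuA+32..+63 (second half) -> acc[16:31], acc[48:63], acc[80:95], acc[112:127]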

/*  mfmaIndex:4  */
v_mfma_f32_16x16x128_f8f6f4 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[16:19] cbsz:1 blgp:0 // left value = acc[16+0:19+0]

/*  mfmaIndex:5  */
v_mfma_f32_16x16x128_f8f6f4 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[20:23] cbsz:1 blgp:0 // left value = acc[20+0:23+0]

/*  mfmaIndex:6  */
v_mfma_f32_16x16x128_f8f6f4 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[24:27] cbsz:1 blgp:0 // left value = acc[24+0:27+0]

/*  mfmaIndex:7  */
v_mfma_f32_16x16x128_f8f6f4 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[28:31] cbsz:1 blgp:0 // left value = acc[28+0:31+0]

/*  mfmaIndex:12  */
v_mfma_f32_16x16x128_f8f6f4 acc[48:51], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[48:51] cbsz:1 blgp:0 // left value = acc[48+0:51+0]

/*  mfmaIndex:13  */
v_mfma_f32_16x16x128_f8f6f4 acc[52:55], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[52:55] cbsz:1 blgp:0 // left value = acc[52+0:55+0]

/*  mfmaIndex:14  */
v_mfma_f32_16x16x128_f8f6f4 acc[56:59], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[56:59] cbsz:1 blgp:0 // left value = acc[56+0:59+0]
/*  mfmaIndex:15  */
v_mfma_f32_16x16x128_f8f6f4 acc[60:63], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[60:63] cbsz:1 blgp:0 // left value = acc[60+0:63+0]

/*  mfmaIndex:20  */
v_mfma_f32_16x16x128_f8f6f4 acc[80:83], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[80:83] cbsz:1 blgp:0 // left value = acc[80+0:83+0]

/*  mfmaIndex:21  */
v_mfma_f32_16x16x128_f8f6f4 acc[84:87], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[84:87] cbsz:1 blgp:0 // left value = acc[84+0:87+0]

/*  mfmaIndex:22  */
v_mfma_f32_16x16x128_f8f6f4 acc[88:91], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[88:91] cbsz:1 blgp:0 // left value = acc[88+0:91+0]

/*  mfmaIndex:23  */
v_mfma_f32_16x16x128_f8f6f4 acc[92:95], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[92:95] cbsz:1 blgp:0 // left value = acc[92+0:95+0]

/*  mfmaIndex:28  */
v_mfma_f32_16x16x128_f8f6f4 acc[112:115], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[112:115] cbsz:1 blgp:0 // left value = acc[112+0:115+0]
/*  mfmaIndex:29  */
v_mfma_f32_16x16x128_f8f6f4 acc[116:119], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[116:119] cbsz:1 blgp:0 // left value = acc[116+0:119+0]
/*  mfmaIndex:30  */
v_mfma_f32_16x16x128_f8f6f4 acc[120:123], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[120:123] cbsz:1 blgp:0 // left value = acc[120+0:123+0]
/*  mfmaIndex:31  */
v_mfma_f32_16x16x128_f8f6f4 acc[124:127], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[124:127] cbsz:1 blgp:0 // left value = acc[124+0:127+0]


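// B1 A0: B operands ValuB+32..+63 (second half) x A operands ValuA+0..+31 (first half) -> acc[128:143], acc[160:175], acc[192:207], acc[224:239]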

/*  mfmaIndex:32  */
v_mfma_f32_16x16x128_f8f6f4 acc[128:131], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[128:131] cbsz:1 blgp:0 // left value = acc[128+0:131+0]
/*  mfmaIndex:33  */
v_mfma_f32_16x16x128_f8f6f4 acc[132:135], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[132:135] cbsz:1 blgp:0 // left value = acc[132+0:135+0]
/*  mfmaIndex:34  */
v_mfma_f32_16x16x128_f8f6f4 acc[136:139], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[136:139] cbsz:1 blgp:0 // left value = acc[136+0:139+0]
/*  mfmaIndex:35  */
v_mfma_f32_16x16x128_f8f6f4 acc[140:143], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[140:143] cbsz:1 blgp:0 // left value = acc[140+0:143+0]

/*  mfmaIndex:40  */
v_mfma_f32_16x16x128_f8f6f4 acc[160:163], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[160:163] cbsz:1 blgp:0 // left value = acc[160+0:163+0]
/*  mfmaIndex:41  */
v_mfma_f32_16x16x128_f8f6f4 acc[164:167], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[164:167] cbsz:1 blgp:0 // left value = acc[164+0:167+0]
/*  mfmaIndex:42  */
v_mfma_f32_16x16x128_f8f6f4 acc[168:171], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[168:171] cbsz:1 blgp:0 // left value = acc[168+0:171+0]
/*  mfmaIndex:43  */
v_mfma_f32_16x16x128_f8f6f4 acc[172:175], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[172:175] cbsz:1 blgp:0 // left value = acc[172+0:175+0]

/*  mfmaIndex:48  */
v_mfma_f32_16x16x128_f8f6f4 acc[192:195], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[192:195] cbsz:1 blgp:0 // left value = acc[192+0:195+0]

/*  mfmaIndex:49  */
v_mfma_f32_16x16x128_f8f6f4 acc[196:199], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[196:199] cbsz:1 blgp:0 // left value = acc[196+0:199+0]

/*  mfmaIndex:50  */
v_mfma_f32_16x16x128_f8f6f4 acc[200:203], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[200:203] cbsz:1 blgp:0 // left value = acc[200+0:203+0]

  /*  mfmaIndex:51  */
v_mfma_f32_16x16x128_f8f6f4 acc[204:207], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[204:207] cbsz:1 blgp:0 // left value = acc[204+0:207+0]

/*  mfmaIndex:56  */
v_mfma_f32_16x16x128_f8f6f4 acc[224:227], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[224:227] cbsz:1 blgp:0 // left value = acc[224+0:227+0]

/*  mfmaIndex:57  */
v_mfma_f32_16x16x128_f8f6f4 acc[228:231], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[228:231] cbsz:1 blgp:0 // left value = acc[228+0:231+0]

/*  mfmaIndex:58  */
v_mfma_f32_16x16x128_f8f6f4 acc[232:235], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[232:235] cbsz:1 blgp:0 // left value = acc[232+0:235+0]

/*  mfmaIndex:59  */
v_mfma_f32_16x16x128_f8f6f4 acc[236:239], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[236:239] cbsz:1 blgp:0 // left value = acc[236+0:239+0]


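// B1 A1: B operands ValuB+32..+63 (second half) x A operands ValuA+32..+63 (second half) -> acc[144:159], acc[176:191], acc[208:223], acc[240:255]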


/*  mfmaIndex:36  */
v_mfma_f32_16x16x128_f8f6f4 acc[144:147], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[144:147] cbsz:1 blgp:0 // left value = acc[144+0:147+0]

  /*  mfmaIndex:37  */
v_mfma_f32_16x16x128_f8f6f4 acc[148:151], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[148:151] cbsz:1 blgp:0 // left value = acc[148+0:151+0]

/*  mfmaIndex:38  */
v_mfma_f32_16x16x128_f8f6f4 acc[152:155], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[152:155] cbsz:1 blgp:0 // left value = acc[152+0:155+0]

/*  mfmaIndex:39  */
v_mfma_f32_16x16x128_f8f6f4 acc[156:159], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[156:159] cbsz:1 blgp:0 // left value = acc[156+0:159+0]

/*  mfmaIndex:44  */
v_mfma_f32_16x16x128_f8f6f4 acc[176:179], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[176:179] cbsz:1 blgp:0 // left value = acc[176+0:179+0]

/*  mfmaIndex:45  */
v_mfma_f32_16x16x128_f8f6f4 acc[180:183], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[180:183] cbsz:1 blgp:0 // left value = acc[180+0:183+0]

/*  mfmaIndex:46  */
v_mfma_f32_16x16x128_f8f6f4 acc[184:187], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[184:187] cbsz:1 blgp:0 // left value = acc[184+0:187+0]

/*  mfmaIndex:47  */
v_mfma_f32_16x16x128_f8f6f4 acc[188:191], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[188:191] cbsz:1 blgp:0 // left value = acc[188+0:191+0]

/*  mfmaIndex:52  */
v_mfma_f32_16x16x128_f8f6f4 acc[208:211], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[208:211] cbsz:1 blgp:0 // left value = acc[208+0:211+0]


/*  mfmaIndex:53  */
v_mfma_f32_16x16x128_f8f6f4 acc[212:215], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[212:215] cbsz:1 blgp:0 // left value = acc[212+0:215+0]

  /*  mfmaIndex:54  */
v_mfma_f32_16x16x128_f8f6f4 acc[216:219], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[216:219] cbsz:1 blgp:0 // left value = acc[216+0:219+0]
/*  mfmaIndex:55  */
v_mfma_f32_16x16x128_f8f6f4 acc[220:223], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[220:223] cbsz:1 blgp:0 // left value = acc[220+0:223+0]

/*  mfmaIndex:60  */
v_mfma_f32_16x16x128_f8f6f4 acc[240:243], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[240:243] cbsz:1 blgp:0 // left value = acc[240+0:243+0]

/*  mfmaIndex:61  */
v_mfma_f32_16x16x128_f8f6f4 acc[244:247], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[244:247] cbsz:1 blgp:0 // left value = acc[244+0:247+0]

/*  mfmaIndex:62  */
v_mfma_f32_16x16x128_f8f6f4 acc[248:251], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[248:251] cbsz:1 blgp:0 // left value = acc[248+0:251+0]

/*  mfmaIndex:63  */
v_mfma_f32_16x16x128_f8f6f4 acc[252:255], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[252:255] cbsz:1 blgp:0 // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=16 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=16 */
label_toPGR1end_OrdNLL:
label_PrefetchGlobalLastIterEnd:

/* Tail: add ValuA/B vgpr buffer [4...132) to pool */

/* Tail: add address/G2L vgpr [132...132) to pool */

/******************************************/
/* Tail Loop                              */
/******************************************/

/* local write reset offsets a */
/* XOR with the swap mask gives the offset of the other LDS buffer; the unsigned */
/* min of the current and other offsets then selects the lower of the two. */
s_xor_b32 s97, s[sgprSwapA], s[sgprLocalWriteAddrA] // s97 = other lds buffer offset value (current ^ swap mask)
s_min_u32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s97 // Set LWA to first (lower) buffer offset

/* local write reset offsets b (same sequence as for a; s97 is reused as scratch) */
s_xor_b32 s97, s[sgprSwapB], s[sgprLocalWriteAddrB] // s97 = other lds buffer offset value (current ^ swap mask)
s_min_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s97 // Set LWB to first (lower) buffer offset
/* Check out VGPR (numG2LA,numG2LB,numG2LMetadata) = (32,32,0) */
/* These symbols re-map registers from the ValuA/B range [4...132) that was */
/* returned to the pool just above, for use by the tail loop. */
.set vgprG2LA_BASE, 4 // G2LA occupies 32 VGPRs starting at v4 (v4..v35)
.set vgprG2LA, vgprG2LA_BASE+0
.set vgprG2LB_BASE, 36 // G2LB occupies 32 VGPRs starting at v36 (v36..v67)
.set vgprG2LB, vgprG2LB_BASE+0
/* Check out VGPR (numLWA,numLWB) = (1,1) */
.set vgprLocalWriteAddrA, 68 // one VGPR for the A local-write address
.set vgprLocalWriteAddrB, 69 // one VGPR for the B local-write address

// numIterL = LOCAL_SPLITU * min(sizeL % LOCAL_DEPTHU, DEPTHU / LOCAL_SPLITU)
// Computes the tail-loop trip count and skips the tail loop entirely when it is zero.
s_and_b32 s[sgprLoopCounterL], 127, s[sgprSizesSum+0] // s[sgprLoopCounterL] = s[sgprSizesSum+0] % 128
s_cmp_lt_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // Check if WG processes final iteration of tile (SCC=1 when LocalEnd < ItersPerTile)
s_cmov_b32 s[sgprLoopCounterL], 0                  // This WG not completing tile: force the counter to 0 when SCC=1
s_cmp_eq_u32 s[sgprLoopCounterL], 0                // numIterL == 0
s_mov_b32 s[sgprOrigLoopCounter], 0                // repurpose to count each localRead increment (s_mov leaves SCC from the compare intact)
s_cbranch_scc1 label_SkipTailLoopL                 // skip to end of tail loop b/c numIter==0

/* remove stagger offsets for tail loop */
/* A side: forms the signed 64-bit value S = (3 - StaggerUIter) * GlobalReadIncsA in s[98:99], */
/* subtracts WrapUA, then adds the result to the SRD base and subtracts it from the shadow limit. */
/* s98/s99 are scratch. The multiply instructions used are unsigned for the high half, so a */
/* negative multiplier is handled by multiplying its absolute value and negating the 64-bit product. */
s_sub_i32 s98, 3, s[sgprStaggerUIter]              // s98 = 3 - StaggerUIter (may be negative)
s_cmp_ge_i32 s98, 0                                // SCC = (s98 >= 0), signed
s_cbranch_scc0 label_Negative_J5DQFVGFWLXU2DUR     // negative multiplier -> abs/negate path
s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsA+0]    // start offset S in bytes (upper 32 bits)
s_mul_i32 s98, s98, s[sgprGlobalReadIncsA+0]       // start offset S in bytes (lower 32 bits)
s_branch label_MultiplyDone_DLSAQLEVYLOBCPNL
label_Negative_J5DQFVGFWLXU2DUR:
s_abs_i32 s98, s98                                 // multiply the magnitude, then negate below
s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsA+0]    // start offset S in bytes (upper 32 bits of |S|)
s_mul_i32 s98, s98, s[sgprGlobalReadIncsA+0]       // start offset S in bytes (lower 32 bits of |S|)
s_xor_b32 s98, s98, 0xffffffff                     // 64-bit two's-complement negate: invert low word
s_xor_b32 s99, s99, 0xffffffff                     // invert high word
s_add_u32 s98, s98, 0x1                            // +1 on the low word, carry out in SCC
s_addc_u32 s99, s99, 0                             // propagate the carry into the high word
label_MultiplyDone_DLSAQLEVYLOBCPNL:
s_sub_u32 s98, s98, s[sgprWrapUA]                  // S - WrapU (lower, borrow out in SCC)
s_subb_u32 s99, s99, s[sgprWrapUA+1]               // S - WrapU (upper, with borrow)
s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s98        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s99       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s98 // limit -= inc (lower)
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s99 // limit -= inc (upper)
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32, else use BufferLimit
/* B side of the stagger-offset removal: forms the signed 64-bit value */
/* S = (3 - StaggerUIter) * GlobalReadIncsB in s[98:99], subtracts WrapUB, then adds the */
/* result to the SRD base and subtracts it from the shadow limit. s98/s99 are scratch. */
s_sub_i32 s98, 3, s[sgprStaggerUIter]              // s98 = 3 - StaggerUIter (may be negative)
s_cmp_ge_i32 s98, 0                                // SCC = (s98 >= 0), signed
s_cbranch_scc0 label_Negative_LQI6BOBE0EY8XIP1     // negative multiplier -> abs/negate path
s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsB+0]    // start offset S in bytes (upper 32 bits)
s_mul_i32 s98, s98, s[sgprGlobalReadIncsB+0]       // start offset S in bytes (lower 32 bits)
s_branch label_MultiplyDone_9N1QELR2XL4Z0HRB
label_Negative_LQI6BOBE0EY8XIP1:
s_abs_i32 s98, s98                                 // multiply the magnitude, then negate below
s_mul_hi_u32 s99, s98, s[sgprGlobalReadIncsB+0]    // start offset S in bytes (upper 32 bits of |S|)
s_mul_i32 s98, s98, s[sgprGlobalReadIncsB+0]       // start offset S in bytes (lower 32 bits of |S|)
s_xor_b32 s98, s98, 0xffffffff                     // 64-bit two's-complement negate: invert low word
s_xor_b32 s99, s99, 0xffffffff                     // invert high word
s_add_u32 s98, s98, 0x1                            // +1 on the low word, carry out in SCC
s_addc_u32 s99, s99, 0                             // propagate the carry into the high word
label_MultiplyDone_9N1QELR2XL4Z0HRB:
s_sub_u32 s98, s98, s[sgprWrapUB]                  // S - WrapU (lower, borrow out in SCC)
s_subb_u32 s99, s99, s[sgprWrapUB+1]               // S - WrapU (upper, with borrow)
s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s98        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s99       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s98 // limit -= inc (lower)
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s99 // limit -= inc (upper)
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32, else use BufferLimit

// Check if K multiple of 4
// Selects the tail-load path: when the low two bits of s[sgprSizesSum] are zero, execution
// falls through to label_tailloop_dtl (dwordx4 loads with the lds modifier); otherwise it
// branches to label_tailloop_non_dtl (per-byte loads).
s_and_b32 s98, s[sgprSizesSum], 3                  // s98 = SizesSum & 3
s_cmp_eq_u32 s98, 0                                // SCC = (remainder == 0)
s_cbranch_scc0 label_tailloop_non_dtl              // not a multiple of 4 -> byte-wise tail read

label_tailloop_dtl:
/* Tail global read, direct-to-LDS path: 8 dwordx4 loads for A, then 8 for B. */
/* Each load carries the lds modifier and has no VGPR destination; m0 holds the LDS write */
/* address and is advanced by 4224 bytes between loads. */
/* NOTE(review): 4224 is presumably the padded LDS stride of one load row (4096 + 128) - confirm. */
/* The first load of each matrix uses a scalar offset of 0; the remaining seven use */
/* s[sgprScalarGlobalReadOffsetX+0..6]. */

s_mov_b32 m0, s[sgprLocalWriteAddrA]               // m0 <- LDS write address
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds // G -> LDS 0_0_0_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0, lds // G -> LDS 0_0_1_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0, lds // G -> LDS 0_0_2_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0, lds // G -> LDS 0_0_3_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0, lds // G -> LDS 0_0_4_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0, lds // G -> LDS 0_0_5_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0, lds // G -> LDS 0_0_6_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0, lds // G -> LDS 0_0_7_0

s_mov_b32 m0, s[sgprLocalWriteAddrB]               // m0 <- LDS write address
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0, lds // G -> LDS 0_0_0_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0, lds // G -> LDS 0_0_1_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0, lds // G -> LDS 0_0_2_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0, lds // G -> LDS 0_0_3_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0, lds // G -> LDS 0_0_4_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0, lds // G -> LDS 0_0_5_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0, lds // G -> LDS 0_0_6_0
s_add_u32 m0, m0, 4224                             // Move LDS write address to next line
buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0, lds // G -> LDS 0_0_7_0
s_waitcnt vmcnt(0) // wait for all 16 outstanding buffer loads before continuing
  
s_branch label_tail_loop_load_done // skip the non-DTL (byte-wise) tail read below
/* NONDTL Tail loop*/
label_tailloop_non_dtl:

/* ShiftSeqA dst, reg1, reg2, reg3 */
/* Packs three temporary VGPRs into v[vgprG2LA+dst]: */
/*   v[vgprG2LA+dst] |= (v[reg1] << 8) | v[reg2] | (v[reg3] << 8) */
/* dst  : register index relative to vgprG2LA (accumulating destination) */
/* reg1 : absolute VGPR number; shifted left by 8 in place (clobbered) */
/* reg2 : absolute VGPR number; OR-ed in unshifted (not modified) */
/* reg3 : absolute VGPR number; shifted left by 8 in place (clobbered) */
/* Presumably used after the per-byte tail loads, where reg1 is loaded with ubyte_d16 and */
/* reg2/reg3 with ubyte_d16_hi, giving bytes 1, 2 and 3 of the dword - call sites are outside */
/* this view, TODO confirm. */
/* NOTE(review): the ORs merge whole 32-bit registers, so any bits outside the loaded byte */
/* lanes of reg1..reg3 must already be zero - confirm the temporaries are cleared beforehand. */
.macro ShiftSeqA dst, reg1, reg2, reg3
v_lshlrev_b32 v[\reg1], 0x8, v[\reg1]                        // shift left by 8 bits (in place)
v_or_b32 v[vgprG2LA+\dst], v[vgprG2LA+\dst], v[\reg1]     // pack the shifted first temporary into dest
v_or_b32 v[vgprG2LA+\dst], v[vgprG2LA+\dst], v[\reg2]     // pack the unshifted second temporary into dest
v_lshlrev_b32 v[\reg3], 0x8, v[\reg3]                        // shift left by 8 bits (in place)
v_or_b32 v[vgprG2LA+\dst], v[vgprG2LA+\dst], v[\reg3]     // pack the shifted third temporary into dest
.endm

/* ShiftSeqB dst, reg1, reg2, reg3 */
/* Same packing sequence as ShiftSeqA, but the destination is v[vgprG2LB+dst]: */
/*   v[vgprG2LB+dst] |= (v[reg1] << 8) | v[reg2] | (v[reg3] << 8) */
/* dst  : register index relative to vgprG2LB (accumulating destination) */
/* reg1 : absolute VGPR number; shifted left by 8 in place (clobbered) */
/* reg2 : absolute VGPR number; OR-ed in unshifted (not modified) */
/* reg3 : absolute VGPR number; shifted left by 8 in place (clobbered) */
/* NOTE(review): the ORs merge whole 32-bit registers, so any bits outside the intended byte */
/* lanes of reg1..reg3 must already be zero - confirm against the B-side tail loads, which */
/* are outside this view. */
.macro ShiftSeqB dst, reg1, reg2, reg3
v_lshlrev_b32 v[\reg1], 0x8, v[\reg1]                        // shift left by 8 bits (in place)
v_or_b32 v[vgprG2LB+\dst], v[vgprG2LB+\dst], v[\reg1]     // pack the shifted first temporary into dest
v_or_b32 v[vgprG2LB+\dst], v[vgprG2LB+\dst], v[\reg2]     // pack the unshifted second temporary into dest
v_lshlrev_b32 v[\reg3], 0x8, v[\reg3]                        // shift left by 8 bits (in place)
v_or_b32 v[vgprG2LB+\dst], v[vgprG2LB+\dst], v[\reg3]     // pack the shifted third temporary into dest
.endm

/* Tail global read A */
buffer_load_ubyte_d16 v[vgprG2LA+0+0], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0 // load one buffer value
buffer_load_ubyte_d16 v70, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:1 // load one buffer value
buffer_load_ubyte_d16_hi v71, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:2 // load one buffer value
buffer_load_ubyte_d16_hi v72, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:3 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+0+1], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:4 // load one buffer value
buffer_load_ubyte_d16 v73, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:5 // load one buffer value
buffer_load_ubyte_d16_hi v74, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:6 // load one buffer value
buffer_load_ubyte_d16_hi v75, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:7 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+0+2], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:8 // load one buffer value
buffer_load_ubyte_d16 v76, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:9 // load one buffer value
buffer_load_ubyte_d16_hi v77, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:10 // load one buffer value
buffer_load_ubyte_d16_hi v78, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:11 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:12 // load one buffer value
buffer_load_ubyte_d16 v79, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:13 // load one buffer value
buffer_load_ubyte_d16_hi v80, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:14 // load one buffer value
buffer_load_ubyte_d16_hi v81, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:15 // load one buffer value

buffer_load_ubyte_d16 v[vgprG2LA+4+0], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0 // load one buffer value
buffer_load_ubyte_d16 v82, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:1 // load one buffer value
buffer_load_ubyte_d16_hi v83, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:2 // load one buffer value
buffer_load_ubyte_d16_hi v84, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:3 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+4+1], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:4 // load one buffer value
buffer_load_ubyte_d16 v85, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:5 // load one buffer value
buffer_load_ubyte_d16_hi v86, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:6 // load one buffer value
buffer_load_ubyte_d16_hi v87, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:7 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+4+2], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:8 // load one buffer value
buffer_load_ubyte_d16 v88, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:9 // load one buffer value
buffer_load_ubyte_d16_hi v89, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:10 // load one buffer value
buffer_load_ubyte_d16_hi v90, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:11 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+4+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:12 // load one buffer value
buffer_load_ubyte_d16 v91, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:13 // load one buffer value
buffer_load_ubyte_d16_hi v92, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:14 // load one buffer value
buffer_load_ubyte_d16_hi v93, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:15 // load one buffer value

buffer_load_ubyte_d16 v[vgprG2LA+8+0], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0 // load one buffer value
buffer_load_ubyte_d16 v94, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:1 // load one buffer value
buffer_load_ubyte_d16_hi v95, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:2 // load one buffer value
buffer_load_ubyte_d16_hi v96, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:3 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+8+1], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:4 // load one buffer value
buffer_load_ubyte_d16 v97, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:5 // load one buffer value
buffer_load_ubyte_d16_hi v98, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:6 // load one buffer value
buffer_load_ubyte_d16_hi v99, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:7 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+8+2], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:8 // load one buffer value
buffer_load_ubyte_d16 v100, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:9 // load one buffer value
buffer_load_ubyte_d16_hi v101, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:10 // load one buffer value
buffer_load_ubyte_d16_hi v102, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:11 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+8+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:12 // load one buffer value
buffer_load_ubyte_d16 v103, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:13 // load one buffer value
buffer_load_ubyte_d16_hi v104, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:14 // load one buffer value
buffer_load_ubyte_d16_hi v105, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:15 // load one buffer value

/* Tail global read A (continued): five groups of 16 single-byte buffer loads.
 * Every load in a group uses the same VGPR offset (vgprGlobalReadOffsetA+0), the same
 * resource descriptor (sgprSrdA) and the same scalar offset; only the immediate byte
 * offset (0..15) and the destination differ.
 * Within each run of four bytes: byte 0 goes to a v[vgprG2LA+...] register, bytes 1-3 go
 * to numbered scratch VGPRs, which are the same numbers later passed to ShiftSeqA.
 * Per the ISA, the _d16 form writes the low 16 bits of the destination and the _d16_hi
 * form writes the high 16 bits.
 */
/* soffset = ScalarGlobalReadOffsetA+2 -> v[vgprG2LA+12 .. +15], scratch v106-v117 */
buffer_load_ubyte_d16 v[vgprG2LA+12+0], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0 // load one buffer value
buffer_load_ubyte_d16 v106, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:1 // load one buffer value
buffer_load_ubyte_d16_hi v107, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:2 // load one buffer value
buffer_load_ubyte_d16_hi v108, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:3 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+12+1], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:4 // load one buffer value
buffer_load_ubyte_d16 v109, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:5 // load one buffer value
buffer_load_ubyte_d16_hi v110, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:6 // load one buffer value
buffer_load_ubyte_d16_hi v111, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:7 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+12+2], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:8 // load one buffer value
buffer_load_ubyte_d16 v112, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:9 // load one buffer value
buffer_load_ubyte_d16_hi v113, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:10 // load one buffer value
buffer_load_ubyte_d16_hi v114, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:11 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+12+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:12 // load one buffer value
buffer_load_ubyte_d16 v115, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:13 // load one buffer value
buffer_load_ubyte_d16_hi v116, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:14 // load one buffer value
buffer_load_ubyte_d16_hi v117, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:15 // load one buffer value

/* soffset = ScalarGlobalReadOffsetA+3 -> v[vgprG2LA+16 .. +19], scratch v118-v129 */
buffer_load_ubyte_d16 v[vgprG2LA+16+0], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0 // load one buffer value
buffer_load_ubyte_d16 v118, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:1 // load one buffer value
buffer_load_ubyte_d16_hi v119, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:2 // load one buffer value
buffer_load_ubyte_d16_hi v120, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:3 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+16+1], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:4 // load one buffer value
buffer_load_ubyte_d16 v121, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:5 // load one buffer value
buffer_load_ubyte_d16_hi v122, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:6 // load one buffer value
buffer_load_ubyte_d16_hi v123, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:7 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+16+2], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:8 // load one buffer value
buffer_load_ubyte_d16 v124, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:9 // load one buffer value
buffer_load_ubyte_d16_hi v125, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:10 // load one buffer value
buffer_load_ubyte_d16_hi v126, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:11 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+16+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:12 // load one buffer value
buffer_load_ubyte_d16 v127, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:13 // load one buffer value
buffer_load_ubyte_d16_hi v128, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:14 // load one buffer value
buffer_load_ubyte_d16_hi v129, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:15 // load one buffer value

/* soffset = ScalarGlobalReadOffsetA+4 -> v[vgprG2LA+20 .. +23], scratch v136-v147 (v130-v135 are not used as scratch here) */
buffer_load_ubyte_d16 v[vgprG2LA+20+0], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0 // load one buffer value
buffer_load_ubyte_d16 v136, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:1 // load one buffer value
buffer_load_ubyte_d16_hi v137, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:2 // load one buffer value
buffer_load_ubyte_d16_hi v138, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:3 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+20+1], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:4 // load one buffer value
buffer_load_ubyte_d16 v139, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:5 // load one buffer value
buffer_load_ubyte_d16_hi v140, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:6 // load one buffer value
buffer_load_ubyte_d16_hi v141, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:7 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+20+2], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:8 // load one buffer value
buffer_load_ubyte_d16 v142, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:9 // load one buffer value
buffer_load_ubyte_d16_hi v143, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:10 // load one buffer value
buffer_load_ubyte_d16_hi v144, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:11 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+20+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:12 // load one buffer value
buffer_load_ubyte_d16 v145, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:13 // load one buffer value
buffer_load_ubyte_d16_hi v146, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:14 // load one buffer value
buffer_load_ubyte_d16_hi v147, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:15 // load one buffer value

/* soffset = ScalarGlobalReadOffsetA+5 -> v[vgprG2LA+24 .. +27], scratch v148-v159 */
buffer_load_ubyte_d16 v[vgprG2LA+24+0], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0 // load one buffer value
buffer_load_ubyte_d16 v148, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:1 // load one buffer value
buffer_load_ubyte_d16_hi v149, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:2 // load one buffer value
buffer_load_ubyte_d16_hi v150, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:3 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+24+1], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:4 // load one buffer value
buffer_load_ubyte_d16 v151, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:5 // load one buffer value
buffer_load_ubyte_d16_hi v152, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:6 // load one buffer value
buffer_load_ubyte_d16_hi v153, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:7 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+24+2], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:8 // load one buffer value
buffer_load_ubyte_d16 v154, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:9 // load one buffer value
buffer_load_ubyte_d16_hi v155, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:10 // load one buffer value
buffer_load_ubyte_d16_hi v156, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:11 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+24+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:12 // load one buffer value
buffer_load_ubyte_d16 v157, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:13 // load one buffer value
buffer_load_ubyte_d16_hi v158, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:14 // load one buffer value
buffer_load_ubyte_d16_hi v159, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:15 // load one buffer value

/* soffset = ScalarGlobalReadOffsetA+6 -> v[vgprG2LA+28 .. +31], scratch v160-v171 */
buffer_load_ubyte_d16 v[vgprG2LA+28+0], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0 // load one buffer value
buffer_load_ubyte_d16 v160, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:1 // load one buffer value
buffer_load_ubyte_d16_hi v161, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:2 // load one buffer value
buffer_load_ubyte_d16_hi v162, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:3 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+28+1], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:4 // load one buffer value
buffer_load_ubyte_d16 v163, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:5 // load one buffer value
buffer_load_ubyte_d16_hi v164, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:6 // load one buffer value
buffer_load_ubyte_d16_hi v165, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:7 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+28+2], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:8 // load one buffer value
buffer_load_ubyte_d16 v166, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:9 // load one buffer value
buffer_load_ubyte_d16_hi v167, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:10 // load one buffer value
buffer_load_ubyte_d16_hi v168, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:11 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LA+28+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:12 // load one buffer value
buffer_load_ubyte_d16 v169, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:13 // load one buffer value
buffer_load_ubyte_d16_hi v170, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:14 // load one buffer value
buffer_load_ubyte_d16_hi v171, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:15 // load one buffer value

/* Drain the tail-read-A byte loads and post-process them, one group of four dwords at a time.
 * s_waitcnt vmcnt(N) stalls until at most N vector-memory operations are still outstanding.
 * ShiftSeqA is a macro defined outside this excerpt. For the load groups visible in this file,
 * its first argument equals the v[vgprG2LA+idx] index that received byte 0 and the other three
 * are the scratch VGPR numbers that received bytes 1-3 of the same dword.
 * NOTE(review): presumably the macro shifts/ORs those three bytes into v[vgprG2LA+idx] - confirm
 * against the macro definition.
 * NOTE(review): the threshold arithmetic below assumes 8 groups x 16 = 128 loads were issued
 * (the first groups are above this excerpt) and that loads return in issue order.
 */
s_waitcnt vmcnt(63) // 63 is the largest encodable vmcnt; <= 63 outstanding of 128 means >= 65 done, i.e. groups 0-3
ShiftSeqA 0, 70, 71, 72
ShiftSeqA 1, 73, 74, 75
ShiftSeqA 2, 76, 77, 78
ShiftSeqA 3, 79, 80, 81

s_waitcnt vmcnt(63) // same threshold again; NOTE(review): looks redundant unless ShiftSeqA issues vector-memory ops - confirm
ShiftSeqA 4, 82, 83, 84
ShiftSeqA 5, 85, 86, 87
ShiftSeqA 6, 88, 89, 90
ShiftSeqA 7, 91, 92, 93

s_waitcnt vmcnt(63) // same threshold again (see note above)
ShiftSeqA 8, 94, 95, 96
ShiftSeqA 9, 97, 98, 99
ShiftSeqA 10, 100, 101, 102
ShiftSeqA 11, 103, 104, 105

s_waitcnt vmcnt(63) // same threshold again (see note above)
ShiftSeqA 12, 106, 107, 108
ShiftSeqA 13, 109, 110, 111
ShiftSeqA 14, 112, 113, 114
ShiftSeqA 15, 115, 116, 117

s_waitcnt vmcnt(48) // <= 48 outstanding: first 80 loads done, so the group feeding G2LA+16..19 has landed
ShiftSeqA 16, 118, 119, 120
ShiftSeqA 17, 121, 122, 123
ShiftSeqA 18, 124, 125, 126
ShiftSeqA 19, 127, 128, 129

s_waitcnt vmcnt(32) // <= 32 outstanding: first 96 loads done (G2LA+20..23, scratch v136-v147)
ShiftSeqA 20, 136, 137, 138
ShiftSeqA 21, 139, 140, 141
ShiftSeqA 22, 142, 143, 144
ShiftSeqA 23, 145, 146, 147

s_waitcnt vmcnt(16) // <= 16 outstanding: first 112 loads done (G2LA+24..27, scratch v148-v159)
ShiftSeqA 24, 148, 149, 150
ShiftSeqA 25, 151, 152, 153
ShiftSeqA 26, 154, 155, 156
ShiftSeqA 27, 157, 158, 159

s_waitcnt vmcnt(0) // everything has landed; scratch v70-v171 may be reused after these last four calls
ShiftSeqA 28, 160, 161, 162
ShiftSeqA 29, 163, 164, 165
ShiftSeqA 30, 166, 167, 168
ShiftSeqA 31, 169, 170, 171


/* Tail global read B
 * Issues 8 groups x 16 = 128 single-byte buffer loads. Every load uses the same VGPR offset
 * (vgprGlobalReadOffsetB+0) and resource descriptor (sgprSrdB); a group is selected by its
 * scalar offset (literal 0 for the first group, then ScalarGlobalReadOffsetB+0 .. +6) and the
 * byte within the group by the immediate offset 0..15.
 * Within each run of four bytes: byte 0 goes to a v[vgprG2LB+...] register, bytes 1-3 go to
 * numbered scratch VGPRs (v70-v129, v136-v171), the same numbers later passed to ShiftSeqB.
 * Per the ISA, the _d16 form writes the low 16 bits of the destination and the _d16_hi form
 * writes the high 16 bits.
 * The issue order matters: the s_waitcnt vmcnt thresholds that follow count these loads.
 */
/* soffset = 0 -> v[vgprG2LB+0 .. +3], scratch v70-v81 */
buffer_load_ubyte_d16 v[vgprG2LB+0+0], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0 // load one buffer value
buffer_load_ubyte_d16 v70, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:1 // load one buffer value
buffer_load_ubyte_d16_hi v71, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:2 // load one buffer value
buffer_load_ubyte_d16_hi v72, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:3 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+0+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:4 // load one buffer value
buffer_load_ubyte_d16 v73, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:5 // load one buffer value
buffer_load_ubyte_d16_hi v74, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:6 // load one buffer value
buffer_load_ubyte_d16_hi v75, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:7 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+0+2], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:8 // load one buffer value
buffer_load_ubyte_d16 v76, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:9 // load one buffer value
buffer_load_ubyte_d16_hi v77, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:10 // load one buffer value
buffer_load_ubyte_d16_hi v78, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:11 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+0+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:12 // load one buffer value
buffer_load_ubyte_d16 v79, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:13 // load one buffer value
buffer_load_ubyte_d16_hi v80, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:14 // load one buffer value
buffer_load_ubyte_d16_hi v81, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:15 // load one buffer value

/* soffset = ScalarGlobalReadOffsetB+0 -> v[vgprG2LB+4 .. +7], scratch v82-v93 */
buffer_load_ubyte_d16 v[vgprG2LB+4+0], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0 // load one buffer value
buffer_load_ubyte_d16 v82, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:1 // load one buffer value
buffer_load_ubyte_d16_hi v83, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:2 // load one buffer value
buffer_load_ubyte_d16_hi v84, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:3 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+4+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:4 // load one buffer value
buffer_load_ubyte_d16 v85, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:5 // load one buffer value
buffer_load_ubyte_d16_hi v86, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:6 // load one buffer value
buffer_load_ubyte_d16_hi v87, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:7 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+4+2], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:8 // load one buffer value
buffer_load_ubyte_d16 v88, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:9 // load one buffer value
buffer_load_ubyte_d16_hi v89, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:10 // load one buffer value
buffer_load_ubyte_d16_hi v90, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:11 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+4+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:12 // load one buffer value
buffer_load_ubyte_d16 v91, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:13 // load one buffer value
buffer_load_ubyte_d16_hi v92, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:14 // load one buffer value
buffer_load_ubyte_d16_hi v93, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:15 // load one buffer value

/* soffset = ScalarGlobalReadOffsetB+1 -> v[vgprG2LB+8 .. +11], scratch v94-v105 */
buffer_load_ubyte_d16 v[vgprG2LB+8+0], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0 // load one buffer value
buffer_load_ubyte_d16 v94, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:1 // load one buffer value
buffer_load_ubyte_d16_hi v95, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:2 // load one buffer value
buffer_load_ubyte_d16_hi v96, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:3 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+8+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:4 // load one buffer value
buffer_load_ubyte_d16 v97, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:5 // load one buffer value
buffer_load_ubyte_d16_hi v98, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:6 // load one buffer value
buffer_load_ubyte_d16_hi v99, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:7 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+8+2], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:8 // load one buffer value
buffer_load_ubyte_d16 v100, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:9 // load one buffer value
buffer_load_ubyte_d16_hi v101, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:10 // load one buffer value
buffer_load_ubyte_d16_hi v102, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:11 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+8+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:12 // load one buffer value
buffer_load_ubyte_d16 v103, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:13 // load one buffer value
buffer_load_ubyte_d16_hi v104, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:14 // load one buffer value
buffer_load_ubyte_d16_hi v105, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:15 // load one buffer value

/* soffset = ScalarGlobalReadOffsetB+2 -> v[vgprG2LB+12 .. +15], scratch v106-v117 */
buffer_load_ubyte_d16 v[vgprG2LB+12+0], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0 // load one buffer value
buffer_load_ubyte_d16 v106, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:1 // load one buffer value
buffer_load_ubyte_d16_hi v107, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:2 // load one buffer value
buffer_load_ubyte_d16_hi v108, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:3 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+12+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:4 // load one buffer value
buffer_load_ubyte_d16 v109, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:5 // load one buffer value
buffer_load_ubyte_d16_hi v110, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:6 // load one buffer value
buffer_load_ubyte_d16_hi v111, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:7 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+12+2], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:8 // load one buffer value
buffer_load_ubyte_d16 v112, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:9 // load one buffer value
buffer_load_ubyte_d16_hi v113, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:10 // load one buffer value
buffer_load_ubyte_d16_hi v114, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:11 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+12+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:12 // load one buffer value
buffer_load_ubyte_d16 v115, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:13 // load one buffer value
buffer_load_ubyte_d16_hi v116, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:14 // load one buffer value
buffer_load_ubyte_d16_hi v117, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:15 // load one buffer value

/* soffset = ScalarGlobalReadOffsetB+3 -> v[vgprG2LB+16 .. +19], scratch v118-v129 */
buffer_load_ubyte_d16 v[vgprG2LB+16+0], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0 // load one buffer value
buffer_load_ubyte_d16 v118, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:1 // load one buffer value
buffer_load_ubyte_d16_hi v119, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:2 // load one buffer value
buffer_load_ubyte_d16_hi v120, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:3 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+16+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:4 // load one buffer value
buffer_load_ubyte_d16 v121, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:5 // load one buffer value
buffer_load_ubyte_d16_hi v122, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:6 // load one buffer value
buffer_load_ubyte_d16_hi v123, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:7 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+16+2], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:8 // load one buffer value
buffer_load_ubyte_d16 v124, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:9 // load one buffer value
buffer_load_ubyte_d16_hi v125, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:10 // load one buffer value
buffer_load_ubyte_d16_hi v126, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:11 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+16+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:12 // load one buffer value
buffer_load_ubyte_d16 v127, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:13 // load one buffer value
buffer_load_ubyte_d16_hi v128, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:14 // load one buffer value
buffer_load_ubyte_d16_hi v129, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:15 // load one buffer value

/* soffset = ScalarGlobalReadOffsetB+4 -> v[vgprG2LB+20 .. +23], scratch v136-v147 (v130-v135 are not used as scratch here) */
buffer_load_ubyte_d16 v[vgprG2LB+20+0], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0 // load one buffer value
buffer_load_ubyte_d16 v136, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:1 // load one buffer value
buffer_load_ubyte_d16_hi v137, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:2 // load one buffer value
buffer_load_ubyte_d16_hi v138, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:3 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+20+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:4 // load one buffer value
buffer_load_ubyte_d16 v139, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:5 // load one buffer value
buffer_load_ubyte_d16_hi v140, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:6 // load one buffer value
buffer_load_ubyte_d16_hi v141, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:7 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+20+2], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:8 // load one buffer value
buffer_load_ubyte_d16 v142, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:9 // load one buffer value
buffer_load_ubyte_d16_hi v143, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:10 // load one buffer value
buffer_load_ubyte_d16_hi v144, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:11 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+20+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:12 // load one buffer value
buffer_load_ubyte_d16 v145, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:13 // load one buffer value
buffer_load_ubyte_d16_hi v146, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:14 // load one buffer value
buffer_load_ubyte_d16_hi v147, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:15 // load one buffer value

/* soffset = ScalarGlobalReadOffsetB+5 -> v[vgprG2LB+24 .. +27], scratch v148-v159 */
buffer_load_ubyte_d16 v[vgprG2LB+24+0], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0 // load one buffer value
buffer_load_ubyte_d16 v148, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:1 // load one buffer value
buffer_load_ubyte_d16_hi v149, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:2 // load one buffer value
buffer_load_ubyte_d16_hi v150, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:3 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+24+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:4 // load one buffer value
buffer_load_ubyte_d16 v151, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:5 // load one buffer value
buffer_load_ubyte_d16_hi v152, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:6 // load one buffer value
buffer_load_ubyte_d16_hi v153, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:7 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+24+2], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:8 // load one buffer value
buffer_load_ubyte_d16 v154, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:9 // load one buffer value
buffer_load_ubyte_d16_hi v155, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:10 // load one buffer value
buffer_load_ubyte_d16_hi v156, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:11 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+24+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:12 // load one buffer value
buffer_load_ubyte_d16 v157, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:13 // load one buffer value
buffer_load_ubyte_d16_hi v158, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:14 // load one buffer value
buffer_load_ubyte_d16_hi v159, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:15 // load one buffer value

/* soffset = ScalarGlobalReadOffsetB+6 -> v[vgprG2LB+28 .. +31], scratch v160-v171 */
buffer_load_ubyte_d16 v[vgprG2LB+28+0], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0 // load one buffer value
buffer_load_ubyte_d16 v160, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:1 // load one buffer value
buffer_load_ubyte_d16_hi v161, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:2 // load one buffer value
buffer_load_ubyte_d16_hi v162, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:3 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+28+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:4 // load one buffer value
buffer_load_ubyte_d16 v163, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:5 // load one buffer value
buffer_load_ubyte_d16_hi v164, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:6 // load one buffer value
buffer_load_ubyte_d16_hi v165, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:7 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+28+2], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:8 // load one buffer value
buffer_load_ubyte_d16 v166, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:9 // load one buffer value
buffer_load_ubyte_d16_hi v167, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:10 // load one buffer value
buffer_load_ubyte_d16_hi v168, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:11 // load one buffer value
buffer_load_ubyte_d16 v[vgprG2LB+28+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:12 // load one buffer value
buffer_load_ubyte_d16 v169, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:13 // load one buffer value
buffer_load_ubyte_d16_hi v170, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:14 // load one buffer value
buffer_load_ubyte_d16_hi v171, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:15 // load one buffer value

/* Merge the byte-wise B tail loads into the vgprG2LB dwords.
 * Every ShiftSeqB invocation names one vgprG2LB dword index (0..31) followed by
 * the three temporary VGPRs that received the other bytes of that dword.
 * The macro itself is defined outside this excerpt; presumably it packs the
 * three temp bytes into the dword - confirm against the macro definition.
 * Temps run v70..v129 for dwords 0..19 and v136..v171 for dwords 20..31
 * (v130..v135 are not used as temps here).
 *
 * Each batch of four dwords is preceded by an s_waitcnt vmcnt(N) with
 * N = 63,63,63,63,48,32,16,0. Assuming every batch was fed by 16 loads like
 * the visible batch for dwords 28..31, vmcnt(16*k) leaves only the k newest
 * batches in flight. NOTE(review): 63 is presumably the largest encodable
 * vmcnt, used wherever 16*k would exceed it (a stricter wait than strictly
 * needed, never a weaker one) - confirm against the ISA guide. */
s_waitcnt vmcnt(63)
ShiftSeqB 0, 70, 71, 72
ShiftSeqB 1, 73, 74, 75
ShiftSeqB 2, 76, 77, 78
ShiftSeqB 3, 79, 80, 81

s_waitcnt vmcnt(63)
ShiftSeqB 4, 82, 83, 84
ShiftSeqB 5, 85, 86, 87
ShiftSeqB 6, 88, 89, 90
ShiftSeqB 7, 91, 92, 93

s_waitcnt vmcnt(63)
ShiftSeqB 8, 94, 95, 96
ShiftSeqB 9, 97, 98, 99
ShiftSeqB 10, 100, 101, 102
ShiftSeqB 11, 103, 104, 105

s_waitcnt vmcnt(63)
ShiftSeqB 12, 106, 107, 108
ShiftSeqB 13, 109, 110, 111
ShiftSeqB 14, 112, 113, 114
ShiftSeqB 15, 115, 116, 117

s_waitcnt vmcnt(48)                                // at most 48 loads still outstanding
ShiftSeqB 16, 118, 119, 120
ShiftSeqB 17, 121, 122, 123
ShiftSeqB 18, 124, 125, 126
ShiftSeqB 19, 127, 128, 129

s_waitcnt vmcnt(32)                                // at most 32 loads still outstanding
ShiftSeqB 20, 136, 137, 138
ShiftSeqB 21, 139, 140, 141
ShiftSeqB 22, 142, 143, 144
ShiftSeqB 23, 145, 146, 147

s_waitcnt vmcnt(16)                                // at most 16 loads still outstanding
ShiftSeqB 24, 148, 149, 150
ShiftSeqB 25, 151, 152, 153
ShiftSeqB 26, 154, 155, 156
ShiftSeqB 27, 157, 158, 159

s_waitcnt vmcnt(0)                                 // every vector-memory load has returned
ShiftSeqB 28, 160, 161, 162
ShiftSeqB 29, 163, 164, 165
ShiftSeqB 30, 166, 167, 168
ShiftSeqB 31, 169, 170, 171

/* Set local write offsets for A to be same as DTL 16B load */
/* addr = s[sgprLocalWriteAddrA] + (Serial % 64) * 16: one 16-byte slot per lane of the wave */
v_and_b32 v[vgprLocalWriteAddrA], 63, v[vgprSerial] // Serial % wavesize
v_lshlrev_b32 v[vgprLocalWriteAddrA], 0x4, v[vgprLocalWriteAddrA] // lane * 16 bytes (matches the b128 writes below)
v_add_u32 v[vgprLocalWriteAddrA], s[sgprLocalWriteAddrA], v[vgprLocalWriteAddrA] // add the scalar base for A
/* Set local write offsets for B to be same as DTL 16B load */
/* same formula as A, using the scalar base s[sgprLocalWriteAddrB] */
v_and_b32 v[vgprLocalWriteAddrB], 63, v[vgprSerial] // Serial % wavesize
v_lshlrev_b32 v[vgprLocalWriteAddrB], 0x4, v[vgprLocalWriteAddrB] // lane * 16 bytes
v_add_u32 v[vgprLocalWriteAddrB], s[sgprLocalWriteAddrB], v[vgprLocalWriteAddrB] // add the scalar base for B

/* local write a */
/* Eight 16-byte LDS stores per lane: vgprG2LA dwords 0..31 in groups of four,
 * at byte offsets 0, 4224, ..., 29568 (stride 4224 = LSPA per the per-line notes). */
ds_write_b128 v[vgprLocalWriteAddrA], v[vgprG2LA+0:vgprG2LA+0+3] offset:0 // lwoA_0_0_0_0 = (0*LSCA)*(MT0I+PAD) + (0*LSPA) = 0
ds_write_b128 v[vgprLocalWriteAddrA], v[vgprG2LA+4:vgprG2LA+4+3] offset:4224 // lwoA_0_0_1_0 = (0*LSCA)*(MT0I+PAD) + (1*LSPA) = 4224
ds_write_b128 v[vgprLocalWriteAddrA], v[vgprG2LA+8:vgprG2LA+8+3] offset:8448 // lwoA_0_0_2_0 = (0*LSCA)*(MT0I+PAD) + (2*LSPA) = 8448
ds_write_b128 v[vgprLocalWriteAddrA], v[vgprG2LA+12:vgprG2LA+12+3] offset:12672 // lwoA_0_0_3_0 = (0*LSCA)*(MT0I+PAD) + (3*LSPA) = 12672
ds_write_b128 v[vgprLocalWriteAddrA], v[vgprG2LA+16:vgprG2LA+16+3] offset:16896 // lwoA_0_0_4_0 = (0*LSCA)*(MT0I+PAD) + (4*LSPA) = 16896
ds_write_b128 v[vgprLocalWriteAddrA], v[vgprG2LA+20:vgprG2LA+20+3] offset:21120 // lwoA_0_0_5_0 = (0*LSCA)*(MT0I+PAD) + (5*LSPA) = 21120
ds_write_b128 v[vgprLocalWriteAddrA], v[vgprG2LA+24:vgprG2LA+24+3] offset:25344 // lwoA_0_0_6_0 = (0*LSCA)*(MT0I+PAD) + (6*LSPA) = 25344
ds_write_b128 v[vgprLocalWriteAddrA], v[vgprG2LA+28:vgprG2LA+28+3] offset:29568 // lwoA_0_0_7_0 = (0*LSCA)*(MT0I+PAD) + (7*LSPA) = 29568

/* local write b */
/* Same layout as A: vgprG2LB dwords 0..31, eight 16-byte stores at stride 4224,
 * addressed from vgprLocalWriteAddrB. */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+0:vgprG2LB+0+3] offset:0 // lwoB_0_0_0_0 = (0*LSCB)*(MT1J+PAD) + (0*LSPB) = 0
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+4:vgprG2LB+4+3] offset:4224 // lwoB_0_0_1_0 = (0*LSCB)*(MT1J+PAD) + (1*LSPB) = 4224
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+8:vgprG2LB+8+3] offset:8448 // lwoB_0_0_2_0 = (0*LSCB)*(MT1J+PAD) + (2*LSPB) = 8448
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+12:vgprG2LB+12+3] offset:12672 // lwoB_0_0_3_0 = (0*LSCB)*(MT1J+PAD) + (3*LSPB) = 12672
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+16:vgprG2LB+16+3] offset:16896 // lwoB_0_0_4_0 = (0*LSCB)*(MT1J+PAD) + (4*LSPB) = 16896
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+20:vgprG2LB+20+3] offset:21120 // lwoB_0_0_5_0 = (0*LSCB)*(MT1J+PAD) + (5*LSPB) = 21120
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+24:vgprG2LB+24+3] offset:25344 // lwoB_0_0_6_0 = (0*LSCB)*(MT1J+PAD) + (6*LSPB) = 25344
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+28:vgprG2LB+28+3] offset:29568 // lwoB_0_0_7_0 = (0*LSCB)*(MT1J+PAD) + (7*LSPB) = 29568

/* Recalc local read offsets */
/* No recalculation is emitted here; only the wait for the 16 LDS stores above. */
s_waitcnt lgkmcnt(0)                               // wait for local write

/* The fall-through path reaches this label after its lgkmcnt(0) wait above;
 * any branches that target it lie outside this excerpt. */
label_tail_loop_load_done:

// Skip force waitcnt0
s_barrier                                          // workgroup barrier before the tail-loop LDS reads below
/* Retire the global-to-local staging symbols and the local-write address symbols
 * (set to UNDEF, a symbol defined elsewhere in the file), then map the MAC operand
 * symbols: A operands start at v4 and B operands at v68, i.e. 64 VGPRs apart,
 * matching the 16 x b128 local reads per operand issued in the tail loop. */
.set vgprG2LA_BASE, UNDEF
.set vgprG2LA, UNDEF
.set vgprG2LB_BASE, UNDEF
.set vgprG2LB, UNDEF
.set vgprLocalWriteAddrA, UNDEF
.set vgprLocalWriteAddrB, UNDEF
.set vgprValuA_X0_I0_BASE, 4
.set vgprValuA_X0_I0, vgprValuA_X0_I0_BASE+0
.set vgprValuB_X0_I0_BASE, 68
.set vgprValuB_X0_I0, vgprValuB_X0_I0_BASE+0

/* Tail: local read reset offsets a */

/* localReadResetOffsets */
/* handled internally */
/* v135 (scratch) = LocalReadAddrA XOR LocalReadSwapAddrA; the signed minimum of the
 * current address and that XOR-ed value is then kept as the new LocalReadAddrA. */
v_xor_b32 v135, v[vgprLocalReadSwapAddrA], v[vgprLocalReadAddrA] // Get other lds buffer offset value
v_min_i32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], v135 // Set LRA to first buffer offset

/* Tail: local read reset offsets b */

/* localReadResetOffsets */
/* handled internally */
/* Same XOR + signed-min selection for B; v135 is reused as scratch. */
v_xor_b32 v135, v[vgprLocalReadSwapAddrB], v[vgprLocalReadAddrB] // Get other lds buffer offset value
v_min_i32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], v135 // Set LRA to first buffer offset

/* Tail: local read init pointers a */

/* localReadInitPointers */
/* (no instructions emitted for A) */

/* Tail: local read init pointers b */

/* localReadInitPointers */
/* (no instructions emitted for B) */
/* tail loop: macs */
/* Loop head (the back-edge that targets this label is outside this excerpt).
 * Each pass starts by loading 64 dwords of A and 64 dwords of B from LDS:
 * sixteen ds_read_b128 per operand at byte offsets 0, 64, ..., 960 from the
 * per-lane read address. The loads are only issued here; the lgkmcnt(0) wait
 * comes after the read-address increment that follows. */
label_TailLoopBeginL:

/* local read a */
ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprLocalReadAddrA] offset:64 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprLocalReadAddrA] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprLocalReadAddrA] offset:192 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+16:vgprValuA_X0_I0+16+3], v[vgprLocalReadAddrA] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+20:vgprValuA_X0_I0+20+3], v[vgprLocalReadAddrA] offset:320 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+24:vgprValuA_X0_I0+24+3], v[vgprLocalReadAddrA] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+28:vgprValuA_X0_I0+28+3], v[vgprLocalReadAddrA] offset:448 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+32:vgprValuA_X0_I0+32+3], v[vgprLocalReadAddrA] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+36:vgprValuA_X0_I0+36+3], v[vgprLocalReadAddrA] offset:576 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+40:vgprValuA_X0_I0+40+3], v[vgprLocalReadAddrA] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+44:vgprValuA_X0_I0+44+3], v[vgprLocalReadAddrA] offset:704 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+48:vgprValuA_X0_I0+48+3], v[vgprLocalReadAddrA] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+52:vgprValuA_X0_I0+52+3], v[vgprLocalReadAddrA] offset:832 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+56:vgprValuA_X0_I0+56+3], v[vgprLocalReadAddrA] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuA_X0_I0+60:vgprValuA_X0_I0+60+3], v[vgprLocalReadAddrA] offset:960 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=1 oIdx=0 buffer=0 iui=0

/* local read b */
ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+36:vgprValuB_X0_I0+36+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+44:vgprValuB_X0_I0+44+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+52:vgprValuB_X0_I0+52+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=1 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+60:vgprValuB_X0_I0+60+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=1 oIdx=0 buffer=0 iui=0

/* local read inc a */
/* Advance both LDS read addresses by 0x80 (128) bytes. s97 holds the increment and
 * is shared by A and B; the carry-out written to vcc is not consumed in this excerpt. */
s_mov_b32 s97, 0x80                                // inc
v_add_co_u32 v[vgprLocalReadAddrA+0], vcc, s97, v[vgprLocalReadAddrA+0] // lrA += 128 (bpeDS)

/* local read inc b */
                                                   // inc (dup assign opt.)
v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, s97, v[vgprLocalReadAddrB+0] // lrB += 128 (bpeDS)
s_waitcnt lgkmcnt(0)                               // wait for local read (the 32 ds_read_b128 issued above)
/* Tail K masking, operand A.
 * Clears every A operand dword whose K index is at or beyond s[sgprLoopCounterL].
 * The 64 A dwords are handled as eight register groups (base +0, +8, ..., +56) of
 * eight dwords each. Per-lane K base = ((Serial % 64) / 16) * 16, kept in v135.
 * Dword pairs inside a group are tested against K base + 0, + 8, + 64 and + 72.
 * The .irp loops expand to one v_cndmask_b32 per register group, in the same
 * order as a fully unrolled sequence.
 * Scratch: v135, v136, s[98:99]. */
v_and_b32 v135, 63, v[vgprSerial]                  // lane id inside the wave (Serial % 64)
v_lshrrev_b32 v135, 4, v135                        // v135 = lane / 16
v_lshlrev_b32 v135, 4, v135                        // v135 = (lane / 16) * 16 = K base
v_add_u32 v136, v135, 0                            // K index for dwords 0-1: K base + 0
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K index >= LoopCounterL
.irp grp, 0, 8, 16, 24, 32, 40, 48, 56
v_cndmask_b32 v[vgprValuA_X0_I0+\grp+0], v[vgprValuA_X0_I0+\grp+0], 0, s[98:99] // dword 0 -> 0 where masked
.endr
.irp grp, 0, 8, 16, 24, 32, 40, 48, 56
v_cndmask_b32 v[vgprValuA_X0_I0+\grp+1], v[vgprValuA_X0_I0+\grp+1], 0, s[98:99] // dword 1 -> 0 where masked
.endr
v_add_u32 v136, v136, 8                            // K index for dwords 2-3: K base + 8
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K index >= LoopCounterL
.irp grp, 0, 8, 16, 24, 32, 40, 48, 56
v_cndmask_b32 v[vgprValuA_X0_I0+\grp+2], v[vgprValuA_X0_I0+\grp+2], 0, s[98:99] // dword 2 -> 0 where masked
.endr
.irp grp, 0, 8, 16, 24, 32, 40, 48, 56
v_cndmask_b32 v[vgprValuA_X0_I0+\grp+3], v[vgprValuA_X0_I0+\grp+3], 0, s[98:99] // dword 3 -> 0 where masked
.endr
v_add_u32 v136, v136, 56                           // K index for dwords 4-5: K base + 64
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K index >= LoopCounterL
.irp grp, 0, 8, 16, 24, 32, 40, 48, 56
v_cndmask_b32 v[vgprValuA_X0_I0+\grp+4], v[vgprValuA_X0_I0+\grp+4], 0, s[98:99] // dword 4 -> 0 where masked
.endr
.irp grp, 0, 8, 16, 24, 32, 40, 48, 56
v_cndmask_b32 v[vgprValuA_X0_I0+\grp+5], v[vgprValuA_X0_I0+\grp+5], 0, s[98:99] // dword 5 -> 0 where masked
.endr
v_add_u32 v136, v136, 8                            // K index for dwords 6-7: K base + 72
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K index >= LoopCounterL
.irp grp, 0, 8, 16, 24, 32, 40, 48, 56
v_cndmask_b32 v[vgprValuA_X0_I0+\grp+6], v[vgprValuA_X0_I0+\grp+6], 0, s[98:99] // dword 6 -> 0 where masked
.endr
.irp grp, 0, 8, 16, 24, 32, 40, 48, 56
v_cndmask_b32 v[vgprValuA_X0_I0+\grp+7], v[vgprValuA_X0_I0+\grp+7], 0, s[98:99] // dword 7 -> 0 where masked
.endr
/* Tail K masking, operand B.
 * Clears every B operand dword whose K index is at or beyond s[sgprLoopCounterL].
 * The 64 B dwords are handled as eight register groups (base +0, +8, ..., +56) of
 * eight dwords each. Per-lane K base = ((Serial % 64) / 16) * 16, recomputed into
 * v135 (later code in this file reads v135 again).
 * Dword pairs inside a group are tested against K base + 0, + 8, + 64 and + 72.
 * The .irp loops expand to one v_cndmask_b32 per register group, in the same
 * order as a fully unrolled sequence.
 * Scratch: v135, v136, s[98:99]. */
v_and_b32 v135, 63, v[vgprSerial]                  // lane id inside the wave (Serial % 64)
v_lshrrev_b32 v135, 4, v135                        // v135 = lane / 16
v_lshlrev_b32 v135, 4, v135                        // v135 = (lane / 16) * 16 = K base
v_add_u32 v136, v135, 0                            // K index for dwords 0-1: K base + 0
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K index >= LoopCounterL
.irp grp, 0, 8, 16, 24, 32, 40, 48, 56
v_cndmask_b32 v[vgprValuB_X0_I0+\grp+0], v[vgprValuB_X0_I0+\grp+0], 0, s[98:99] // dword 0 -> 0 where masked
.endr
.irp grp, 0, 8, 16, 24, 32, 40, 48, 56
v_cndmask_b32 v[vgprValuB_X0_I0+\grp+1], v[vgprValuB_X0_I0+\grp+1], 0, s[98:99] // dword 1 -> 0 where masked
.endr
v_add_u32 v136, v136, 8                            // K index for dwords 2-3: K base + 8
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K index >= LoopCounterL
.irp grp, 0, 8, 16, 24, 32, 40, 48, 56
v_cndmask_b32 v[vgprValuB_X0_I0+\grp+2], v[vgprValuB_X0_I0+\grp+2], 0, s[98:99] // dword 2 -> 0 where masked
.endr
.irp grp, 0, 8, 16, 24, 32, 40, 48, 56
v_cndmask_b32 v[vgprValuB_X0_I0+\grp+3], v[vgprValuB_X0_I0+\grp+3], 0, s[98:99] // dword 3 -> 0 where masked
.endr
v_add_u32 v136, v136, 56                           // K index for dwords 4-5: K base + 64
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K index >= LoopCounterL
.irp grp, 0, 8, 16, 24, 32, 40, 48, 56
v_cndmask_b32 v[vgprValuB_X0_I0+\grp+4], v[vgprValuB_X0_I0+\grp+4], 0, s[98:99] // dword 4 -> 0 where masked
.endr
.irp grp, 0, 8, 16, 24, 32, 40, 48, 56
v_cndmask_b32 v[vgprValuB_X0_I0+\grp+5], v[vgprValuB_X0_I0+\grp+5], 0, s[98:99] // dword 5 -> 0 where masked
.endr
v_add_u32 v136, v136, 8                            // K index for dwords 6-7: K base + 72
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K index >= LoopCounterL
.irp grp, 0, 8, 16, 24, 32, 40, 48, 56
v_cndmask_b32 v[vgprValuB_X0_I0+\grp+6], v[vgprValuB_X0_I0+\grp+6], 0, s[98:99] // dword 6 -> 0 where masked
.endr
.irp grp, 0, 8, 16, 24, 32, 40, 48, 56
v_cndmask_b32 v[vgprValuB_X0_I0+\grp+7], v[vgprValuB_X0_I0+\grp+7], 0, s[98:99] // dword 7 -> 0 where masked
.endr
/* Edge-thread fix-up for A, register group vgprValuA_X0_I0+0 .. +7.
 * s97 = (32 - (LoopCounterL & 31)) * 8 is the shift count used by every
 * v_lshlrev_b64 of this fix-up (it replaces the 0x80 read increment held in s97).
 * NOTE(review): the *8 presumably turns a byte count into a bit count - confirm.
 * For each 64-bit dword pair a left-shifted copy is built in v138..v145, and the
 * copy replaces the original in lanes where the pair's K bound (K base + 8, + 16,
 * + 72, + 80) is >= LoopCounterL.
 * v135 must still hold the per-lane K base computed by the preceding masking code.
 * One compare now serves both dwords of a pair: v136 and s[sgprLoopCounterL] do not
 * change between the two selects, so re-issuing the compare produced the same mask.
 * Scratch: s97, s[98:99], v136, v138..v145. */
s_and_b32 s97, s[sgprLoopCounterL], 31             // s97 = LoopCounterL & 31
s_sub_u32 s97, 32, s97                             // s97 = 32 - (LoopCounterL & 31)
s_lshl_b32 s97, s97, 3                             // s97 *= 8 -> shift count
v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+0+0+0+0:vgprValuA_X0_I0+0+0+0+0+1] // shifted copy of dwords 0-1
v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+0+0+0+2:vgprValuA_X0_I0+0+0+0+2+1] // shifted copy of dwords 2-3
v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+0+0+0+4:vgprValuA_X0_I0+0+0+0+4+1] // shifted copy of dwords 4-5
v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+0+0+0+6:vgprValuA_X0_I0+0+0+0+6+1] // shifted copy of dwords 6-7
v_add_u32 v136, v135, 8                            // K bound for dwords 0-1: K base + 8
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K bound >= LoopCounterL
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], v138, s[98:99] // take shifted dword 0 where masked
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], v139, s[98:99] // same mask for dword 1
v_add_u32 v136, v136, 8                            // K bound for dwords 2-3: K base + 16
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K bound >= LoopCounterL
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+2], v[vgprValuA_X0_I0+0+0+0+2], v140, s[98:99] // take shifted dword 2 where masked
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+3], v[vgprValuA_X0_I0+0+0+0+3], v141, s[98:99] // same mask for dword 3
v_add_u32 v136, v136, 56                           // K bound for dwords 4-5: K base + 72
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K bound >= LoopCounterL
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+4], v[vgprValuA_X0_I0+0+0+0+4], v142, s[98:99] // take shifted dword 4 where masked
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+5], v[vgprValuA_X0_I0+0+0+0+5], v143, s[98:99] // same mask for dword 5
v_add_u32 v136, v136, 8                            // K bound for dwords 6-7: K base + 80
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K bound >= LoopCounterL
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+6], v[vgprValuA_X0_I0+0+0+0+6], v144, s[98:99] // take shifted dword 6 where masked
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+0+0+0+7], v145, s[98:99] // same mask for dword 7
/* Edge-thread fix-up for A, register group vgprValuA_X0_I0+8 .. +15.
 * Uses the shift count already held in s97 and the per-lane K base in v135.
 * For each 64-bit dword pair a left-shifted copy is built in v138..v145, and the
 * copy replaces the original in lanes where the pair's K bound (K base + 8, + 16,
 * + 72, + 80) is >= LoopCounterL.
 * One compare serves both dwords of a pair: v136 and s[sgprLoopCounterL] do not
 * change between the two selects, so re-issuing the compare produced the same mask.
 * Scratch: s[98:99], v136, v138..v145. */
v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+8+0+0+0:vgprValuA_X0_I0+8+0+0+0+1] // shifted copy of dwords 0-1
v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+8+0+0+2:vgprValuA_X0_I0+8+0+0+2+1] // shifted copy of dwords 2-3
v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+8+0+0+4:vgprValuA_X0_I0+8+0+0+4+1] // shifted copy of dwords 4-5
v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+8+0+0+6:vgprValuA_X0_I0+8+0+0+6+1] // shifted copy of dwords 6-7
v_add_u32 v136, v135, 8                            // K bound for dwords 0-1: K base + 8
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K bound >= LoopCounterL
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+0], v[vgprValuA_X0_I0+8+0+0+0], v138, s[98:99] // take shifted dword 0 where masked
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0+1], v139, s[98:99] // same mask for dword 1
v_add_u32 v136, v136, 8                            // K bound for dwords 2-3: K base + 16
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K bound >= LoopCounterL
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+2], v[vgprValuA_X0_I0+8+0+0+2], v140, s[98:99] // take shifted dword 2 where masked
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+3], v[vgprValuA_X0_I0+8+0+0+3], v141, s[98:99] // same mask for dword 3
v_add_u32 v136, v136, 56                           // K bound for dwords 4-5: K base + 72
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K bound >= LoopCounterL
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+4], v[vgprValuA_X0_I0+8+0+0+4], v142, s[98:99] // take shifted dword 4 where masked
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+5], v[vgprValuA_X0_I0+8+0+0+5], v143, s[98:99] // same mask for dword 5
v_add_u32 v136, v136, 8                            // K bound for dwords 6-7: K base + 80
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K bound >= LoopCounterL
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+6], v[vgprValuA_X0_I0+8+0+0+6], v144, s[98:99] // take shifted dword 6 where masked
v_cndmask_b32 v[vgprValuA_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+8+0+0+7], v145, s[98:99] // same mask for dword 7
/* Edge-thread fix-up for A, register group vgprValuA_X0_I0+16 .. +23.
 * Uses the shift count already held in s97 and the per-lane K base in v135.
 * For each 64-bit dword pair a left-shifted copy is built in v138..v145, and the
 * copy replaces the original in lanes where the pair's K bound (K base + 8, + 16,
 * + 72, + 80) is >= LoopCounterL.
 * One compare serves both dwords of a pair: v136 and s[sgprLoopCounterL] do not
 * change between the two selects, so re-issuing the compare produced the same mask.
 * Scratch: s[98:99], v136, v138..v145. */
v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+16+0+0+0:vgprValuA_X0_I0+16+0+0+0+1] // shifted copy of dwords 0-1
v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+16+0+0+2:vgprValuA_X0_I0+16+0+0+2+1] // shifted copy of dwords 2-3
v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+16+0+0+4:vgprValuA_X0_I0+16+0+0+4+1] // shifted copy of dwords 4-5
v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+16+0+0+6:vgprValuA_X0_I0+16+0+0+6+1] // shifted copy of dwords 6-7
v_add_u32 v136, v135, 8                            // K bound for dwords 0-1: K base + 8
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K bound >= LoopCounterL
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+0], v[vgprValuA_X0_I0+16+0+0+0], v138, s[98:99] // take shifted dword 0 where masked
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+16+0+0+1], v139, s[98:99] // same mask for dword 1
v_add_u32 v136, v136, 8                            // K bound for dwords 2-3: K base + 16
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K bound >= LoopCounterL
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+2], v[vgprValuA_X0_I0+16+0+0+2], v140, s[98:99] // take shifted dword 2 where masked
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+3], v[vgprValuA_X0_I0+16+0+0+3], v141, s[98:99] // same mask for dword 3
v_add_u32 v136, v136, 56                           // K bound for dwords 4-5: K base + 72
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K bound >= LoopCounterL
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+4], v[vgprValuA_X0_I0+16+0+0+4], v142, s[98:99] // take shifted dword 4 where masked
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+5], v[vgprValuA_X0_I0+16+0+0+5], v143, s[98:99] // same mask for dword 5
v_add_u32 v136, v136, 8                            // K bound for dwords 6-7: K base + 80
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // lane mask: K bound >= LoopCounterL
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+6], v[vgprValuA_X0_I0+16+0+0+6], v144, s[98:99] // take shifted dword 6 where masked
v_cndmask_b32 v[vgprValuA_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+16+0+0+7], v145, s[98:99] // same mask for dword 7
/* Tail-loop K masking for v[vgprValuA_X0_I0+24 .. vgprValuA_X0_I0+31] (8 dwords).
 * 1) v[138:145] = those 8 dwords shifted left by s97 bits, done as four 64-bit
 *    pairs (s97 is set before this point; its value is not visible here).
 * 2) Per dword pair, v136 = v135 + {8, 16, 72, 80}. Lanes where
 *    v136 >= s[sgprLoopCounterL] take the shifted copy; the other lanes keep
 *    the original dword (v_cndmask_b32 picks src1 where the mask bit is set).
 * NOTE(review): the compare is emitted once per dword, so the second v_cmp of
 * each pair recomputes the same mask in s[98:99] -- confirm before removing.
 */
v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+24+0+0+0:vgprValuA_X0_I0+24+0+0+0+1]
v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+24+0+0+2:vgprValuA_X0_I0+24+0+0+2+1]
v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+24+0+0+4:vgprValuA_X0_I0+24+0+0+4+1]
v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+24+0+0+6:vgprValuA_X0_I0+24+0+0+6+1]
v_add_u32 v136, v135, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+0], v[vgprValuA_X0_I0+24+0+0+0], v138, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+24+0+0+1], v139, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+2], v[vgprValuA_X0_I0+24+0+0+2], v140, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+3], v[vgprValuA_X0_I0+24+0+0+3], v141, s[98:99]
v_add_u32 v136, v136, 56                           // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+4], v[vgprValuA_X0_I0+24+0+0+4], v142, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+5], v[vgprValuA_X0_I0+24+0+0+5], v143, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+6], v[vgprValuA_X0_I0+24+0+0+6], v144, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+24+0+0+7], v145, s[98:99]
/* Tail-loop K masking for v[vgprValuA_X0_I0+32 .. vgprValuA_X0_I0+39] (8 dwords).
 * 1) v[138:145] = those 8 dwords shifted left by s97 bits, done as four 64-bit
 *    pairs (s97 is set before this point; its value is not visible here).
 * 2) Per dword pair, v136 = v135 + {8, 16, 72, 80}. Lanes where
 *    v136 >= s[sgprLoopCounterL] take the shifted copy; the other lanes keep
 *    the original dword (v_cndmask_b32 picks src1 where the mask bit is set).
 * NOTE(review): the compare is emitted once per dword, so the second v_cmp of
 * each pair recomputes the same mask in s[98:99] -- confirm before removing.
 */
v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+32+0+0+0:vgprValuA_X0_I0+32+0+0+0+1]
v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+32+0+0+2:vgprValuA_X0_I0+32+0+0+2+1]
v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+32+0+0+4:vgprValuA_X0_I0+32+0+0+4+1]
v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+32+0+0+6:vgprValuA_X0_I0+32+0+0+6+1]
v_add_u32 v136, v135, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+0], v[vgprValuA_X0_I0+32+0+0+0], v138, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+32+0+0+1], v139, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+2], v[vgprValuA_X0_I0+32+0+0+2], v140, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+3], v[vgprValuA_X0_I0+32+0+0+3], v141, s[98:99]
v_add_u32 v136, v136, 56                           // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+4], v[vgprValuA_X0_I0+32+0+0+4], v142, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+5], v[vgprValuA_X0_I0+32+0+0+5], v143, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+6], v[vgprValuA_X0_I0+32+0+0+6], v144, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+32+0+0+7], v145, s[98:99]
/* Tail-loop K masking for v[vgprValuA_X0_I0+40 .. vgprValuA_X0_I0+47] (8 dwords).
 * 1) v[138:145] = those 8 dwords shifted left by s97 bits, done as four 64-bit
 *    pairs (s97 is set before this point; its value is not visible here).
 * 2) Per dword pair, v136 = v135 + {8, 16, 72, 80}. Lanes where
 *    v136 >= s[sgprLoopCounterL] take the shifted copy; the other lanes keep
 *    the original dword (v_cndmask_b32 picks src1 where the mask bit is set).
 * NOTE(review): the compare is emitted once per dword, so the second v_cmp of
 * each pair recomputes the same mask in s[98:99] -- confirm before removing.
 */
v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+40+0+0+0:vgprValuA_X0_I0+40+0+0+0+1]
v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+40+0+0+2:vgprValuA_X0_I0+40+0+0+2+1]
v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+40+0+0+4:vgprValuA_X0_I0+40+0+0+4+1]
v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+40+0+0+6:vgprValuA_X0_I0+40+0+0+6+1]
v_add_u32 v136, v135, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+0], v[vgprValuA_X0_I0+40+0+0+0], v138, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+40+0+0+1], v139, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+2], v[vgprValuA_X0_I0+40+0+0+2], v140, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+3], v[vgprValuA_X0_I0+40+0+0+3], v141, s[98:99]
v_add_u32 v136, v136, 56                           // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+4], v[vgprValuA_X0_I0+40+0+0+4], v142, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+5], v[vgprValuA_X0_I0+40+0+0+5], v143, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+6], v[vgprValuA_X0_I0+40+0+0+6], v144, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+40+0+0+7], v145, s[98:99]
/* Tail-loop K masking for v[vgprValuA_X0_I0+48 .. vgprValuA_X0_I0+55] (8 dwords).
 * 1) v[138:145] = those 8 dwords shifted left by s97 bits, done as four 64-bit
 *    pairs (s97 is set before this point; its value is not visible here).
 * 2) Per dword pair, v136 = v135 + {8, 16, 72, 80}. Lanes where
 *    v136 >= s[sgprLoopCounterL] take the shifted copy; the other lanes keep
 *    the original dword (v_cndmask_b32 picks src1 where the mask bit is set).
 * NOTE(review): the compare is emitted once per dword, so the second v_cmp of
 * each pair recomputes the same mask in s[98:99] -- confirm before removing.
 */
v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+48+0+0+0:vgprValuA_X0_I0+48+0+0+0+1]
v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+48+0+0+2:vgprValuA_X0_I0+48+0+0+2+1]
v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+48+0+0+4:vgprValuA_X0_I0+48+0+0+4+1]
v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+48+0+0+6:vgprValuA_X0_I0+48+0+0+6+1]
v_add_u32 v136, v135, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+0], v[vgprValuA_X0_I0+48+0+0+0], v138, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+48+0+0+1], v139, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+2], v[vgprValuA_X0_I0+48+0+0+2], v140, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+3], v[vgprValuA_X0_I0+48+0+0+3], v141, s[98:99]
v_add_u32 v136, v136, 56                           // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+4], v[vgprValuA_X0_I0+48+0+0+4], v142, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+5], v[vgprValuA_X0_I0+48+0+0+5], v143, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+6], v[vgprValuA_X0_I0+48+0+0+6], v144, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+48+0+0+7], v145, s[98:99]
/* Tail-loop K masking for v[vgprValuA_X0_I0+56 .. vgprValuA_X0_I0+63] (8 dwords).
 * 1) v[138:145] = those 8 dwords shifted left by s97 bits, done as four 64-bit
 *    pairs (s97 is set before this point; its value is not visible here).
 * 2) Per dword pair, v136 = v135 + {8, 16, 72, 80}. Lanes where
 *    v136 >= s[sgprLoopCounterL] take the shifted copy; the other lanes keep
 *    the original dword (v_cndmask_b32 picks src1 where the mask bit is set).
 * NOTE(review): the compare is emitted once per dword, so the second v_cmp of
 * each pair recomputes the same mask in s[98:99] -- confirm before removing.
 */
v_lshlrev_b64 v[138:139], s97, v[vgprValuA_X0_I0+56+0+0+0:vgprValuA_X0_I0+56+0+0+0+1]
v_lshlrev_b64 v[140:141], s97, v[vgprValuA_X0_I0+56+0+0+2:vgprValuA_X0_I0+56+0+0+2+1]
v_lshlrev_b64 v[142:143], s97, v[vgprValuA_X0_I0+56+0+0+4:vgprValuA_X0_I0+56+0+0+4+1]
v_lshlrev_b64 v[144:145], s97, v[vgprValuA_X0_I0+56+0+0+6:vgprValuA_X0_I0+56+0+0+6+1]
v_add_u32 v136, v135, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+0], v[vgprValuA_X0_I0+56+0+0+0], v138, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+56+0+0+1], v139, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+2], v[vgprValuA_X0_I0+56+0+0+2], v140, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+3], v[vgprValuA_X0_I0+56+0+0+3], v141, s[98:99]
v_add_u32 v136, v136, 56                           // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+4], v[vgprValuA_X0_I0+56+0+0+4], v142, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+5], v[vgprValuA_X0_I0+56+0+0+5], v143, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+6], v[vgprValuA_X0_I0+56+0+0+6], v144, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+56+0+0+7], v145, s[98:99]
/* Tail-loop K masking for v[vgprValuB_X0_I0+0 .. vgprValuB_X0_I0+7] (8 dwords).
 * 1) v[138:145] = those 8 dwords shifted left by s97 bits, done as four 64-bit
 *    pairs (s97 is set before this point; its value is not visible here).
 * 2) Per dword pair, v136 = v135 + {8, 16, 72, 80}. Lanes where
 *    v136 >= s[sgprLoopCounterL] take the shifted copy; the other lanes keep
 *    the original dword (v_cndmask_b32 picks src1 where the mask bit is set).
 * NOTE(review): the compare is emitted once per dword, so the second v_cmp of
 * each pair recomputes the same mask in s[98:99] -- confirm before removing.
 */
v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+0+0+0+0:vgprValuB_X0_I0+0+0+0+0+1]
v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+0+0+0+2:vgprValuB_X0_I0+0+0+0+2+1]
v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+0+0+0+4:vgprValuB_X0_I0+0+0+0+4+1]
v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+0+0+0+6:vgprValuB_X0_I0+0+0+0+6+1]
v_add_u32 v136, v135, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], v138, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], v139, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+2], v[vgprValuB_X0_I0+0+0+0+2], v140, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+3], v[vgprValuB_X0_I0+0+0+0+3], v141, s[98:99]
v_add_u32 v136, v136, 56                           // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+4], v[vgprValuB_X0_I0+0+0+0+4], v142, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+5], v[vgprValuB_X0_I0+0+0+0+5], v143, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+6], v[vgprValuB_X0_I0+0+0+0+6], v144, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+7], v[vgprValuB_X0_I0+0+0+0+7], v145, s[98:99]
/* Tail-loop K masking for v[vgprValuB_X0_I0+8 .. vgprValuB_X0_I0+15] (8 dwords).
 * 1) v[138:145] = those 8 dwords shifted left by s97 bits, done as four 64-bit
 *    pairs (s97 is set before this point; its value is not visible here).
 * 2) Per dword pair, v136 = v135 + {8, 16, 72, 80}. Lanes where
 *    v136 >= s[sgprLoopCounterL] take the shifted copy; the other lanes keep
 *    the original dword (v_cndmask_b32 picks src1 where the mask bit is set).
 * NOTE(review): the compare is emitted once per dword, so the second v_cmp of
 * each pair recomputes the same mask in s[98:99] -- confirm before removing.
 */
v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+8+0+0+0:vgprValuB_X0_I0+8+0+0+0+1]
v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+8+0+0+2:vgprValuB_X0_I0+8+0+0+2+1]
v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+8+0+0+4:vgprValuB_X0_I0+8+0+0+4+1]
v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+8+0+0+6:vgprValuB_X0_I0+8+0+0+6+1]
v_add_u32 v136, v135, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], v138, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], v139, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+2], v[vgprValuB_X0_I0+8+0+0+2], v140, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+3], v[vgprValuB_X0_I0+8+0+0+3], v141, s[98:99]
v_add_u32 v136, v136, 56                           // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+4], v[vgprValuB_X0_I0+8+0+0+4], v142, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+5], v[vgprValuB_X0_I0+8+0+0+5], v143, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+6], v[vgprValuB_X0_I0+8+0+0+6], v144, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+7], v[vgprValuB_X0_I0+8+0+0+7], v145, s[98:99]
/* Tail-loop K masking for v[vgprValuB_X0_I0+16 .. vgprValuB_X0_I0+23] (8 dwords).
 * 1) v[138:145] = those 8 dwords shifted left by s97 bits, done as four 64-bit
 *    pairs (s97 is set before this point; its value is not visible here).
 * 2) Per dword pair, v136 = v135 + {8, 16, 72, 80}. Lanes where
 *    v136 >= s[sgprLoopCounterL] take the shifted copy; the other lanes keep
 *    the original dword (v_cndmask_b32 picks src1 where the mask bit is set).
 * NOTE(review): the compare is emitted once per dword, so the second v_cmp of
 * each pair recomputes the same mask in s[98:99] -- confirm before removing.
 */
v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+16+0+0+0:vgprValuB_X0_I0+16+0+0+0+1]
v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+16+0+0+2:vgprValuB_X0_I0+16+0+0+2+1]
v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+16+0+0+4:vgprValuB_X0_I0+16+0+0+4+1]
v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+16+0+0+6:vgprValuB_X0_I0+16+0+0+6+1]
v_add_u32 v136, v135, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], v138, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], v139, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+2], v[vgprValuB_X0_I0+16+0+0+2], v140, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+3], v[vgprValuB_X0_I0+16+0+0+3], v141, s[98:99]
v_add_u32 v136, v136, 56                           // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+4], v[vgprValuB_X0_I0+16+0+0+4], v142, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+5], v[vgprValuB_X0_I0+16+0+0+5], v143, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+6], v[vgprValuB_X0_I0+16+0+0+6], v144, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+7], v[vgprValuB_X0_I0+16+0+0+7], v145, s[98:99]
/* Tail-loop K masking for v[vgprValuB_X0_I0+24 .. vgprValuB_X0_I0+31] (8 dwords).
 * 1) v[138:145] = those 8 dwords shifted left by s97 bits, done as four 64-bit
 *    pairs (s97 is set before this point; its value is not visible here).
 * 2) Per dword pair, v136 = v135 + {8, 16, 72, 80}. Lanes where
 *    v136 >= s[sgprLoopCounterL] take the shifted copy; the other lanes keep
 *    the original dword (v_cndmask_b32 picks src1 where the mask bit is set).
 * NOTE(review): the compare is emitted once per dword, so the second v_cmp of
 * each pair recomputes the same mask in s[98:99] -- confirm before removing.
 */
v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+24+0+0+0:vgprValuB_X0_I0+24+0+0+0+1]
v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+24+0+0+2:vgprValuB_X0_I0+24+0+0+2+1]
v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+24+0+0+4:vgprValuB_X0_I0+24+0+0+4+1]
v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+24+0+0+6:vgprValuB_X0_I0+24+0+0+6+1]
v_add_u32 v136, v135, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], v138, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], v139, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+2], v[vgprValuB_X0_I0+24+0+0+2], v140, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+3], v[vgprValuB_X0_I0+24+0+0+3], v141, s[98:99]
v_add_u32 v136, v136, 56                           // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+4], v[vgprValuB_X0_I0+24+0+0+4], v142, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+5], v[vgprValuB_X0_I0+24+0+0+5], v143, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+6], v[vgprValuB_X0_I0+24+0+0+6], v144, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+7], v[vgprValuB_X0_I0+24+0+0+7], v145, s[98:99]
/* Tail-loop K masking for v[vgprValuB_X0_I0+32 .. vgprValuB_X0_I0+39] (8 dwords).
 * 1) v[138:145] = those 8 dwords shifted left by s97 bits, done as four 64-bit
 *    pairs (s97 is set before this point; its value is not visible here).
 * 2) Per dword pair, v136 = v135 + {8, 16, 72, 80}. Lanes where
 *    v136 >= s[sgprLoopCounterL] take the shifted copy; the other lanes keep
 *    the original dword (v_cndmask_b32 picks src1 where the mask bit is set).
 * NOTE(review): the compare is emitted once per dword, so the second v_cmp of
 * each pair recomputes the same mask in s[98:99] -- confirm before removing.
 */
v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+32+0+0+0:vgprValuB_X0_I0+32+0+0+0+1]
v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+32+0+0+2:vgprValuB_X0_I0+32+0+0+2+1]
v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+32+0+0+4:vgprValuB_X0_I0+32+0+0+4+1]
v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+32+0+0+6:vgprValuB_X0_I0+32+0+0+6+1]
v_add_u32 v136, v135, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+0], v[vgprValuB_X0_I0+32+0+0+0], v138, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+1], v[vgprValuB_X0_I0+32+0+0+1], v139, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+2], v[vgprValuB_X0_I0+32+0+0+2], v140, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+3], v[vgprValuB_X0_I0+32+0+0+3], v141, s[98:99]
v_add_u32 v136, v136, 56                           // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+4], v[vgprValuB_X0_I0+32+0+0+4], v142, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+5], v[vgprValuB_X0_I0+32+0+0+5], v143, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+6], v[vgprValuB_X0_I0+32+0+0+6], v144, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+32+0+0+7], v[vgprValuB_X0_I0+32+0+0+7], v145, s[98:99]
/* Tail-loop K masking for v[vgprValuB_X0_I0+40 .. vgprValuB_X0_I0+47] (8 dwords).
 * 1) v[138:145] = those 8 dwords shifted left by s97 bits, done as four 64-bit
 *    pairs (s97 is set before this point; its value is not visible here).
 * 2) Per dword pair, v136 = v135 + {8, 16, 72, 80}. Lanes where
 *    v136 >= s[sgprLoopCounterL] take the shifted copy; the other lanes keep
 *    the original dword (v_cndmask_b32 picks src1 where the mask bit is set).
 * NOTE(review): the compare is emitted once per dword, so the second v_cmp of
 * each pair recomputes the same mask in s[98:99] -- confirm before removing.
 */
v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+40+0+0+0:vgprValuB_X0_I0+40+0+0+0+1]
v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+40+0+0+2:vgprValuB_X0_I0+40+0+0+2+1]
v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+40+0+0+4:vgprValuB_X0_I0+40+0+0+4+1]
v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+40+0+0+6:vgprValuB_X0_I0+40+0+0+6+1]
v_add_u32 v136, v135, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+0], v[vgprValuB_X0_I0+40+0+0+0], v138, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+1], v[vgprValuB_X0_I0+40+0+0+1], v139, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+2], v[vgprValuB_X0_I0+40+0+0+2], v140, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+3], v[vgprValuB_X0_I0+40+0+0+3], v141, s[98:99]
v_add_u32 v136, v136, 56                           // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+4], v[vgprValuB_X0_I0+40+0+0+4], v142, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+5], v[vgprValuB_X0_I0+40+0+0+5], v143, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+6], v[vgprValuB_X0_I0+40+0+0+6], v144, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+40+0+0+7], v[vgprValuB_X0_I0+40+0+0+7], v145, s[98:99]
/* Tail-loop K masking for v[vgprValuB_X0_I0+48 .. vgprValuB_X0_I0+55] (8 dwords).
 * 1) v[138:145] = those 8 dwords shifted left by s97 bits, done as four 64-bit
 *    pairs (s97 is set before this point; its value is not visible here).
 * 2) Per dword pair, v136 = v135 + {8, 16, 72, 80}. Lanes where
 *    v136 >= s[sgprLoopCounterL] take the shifted copy; the other lanes keep
 *    the original dword (v_cndmask_b32 picks src1 where the mask bit is set).
 * NOTE(review): the compare is emitted once per dword, so the second v_cmp of
 * each pair recomputes the same mask in s[98:99] -- confirm before removing.
 */
v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+48+0+0+0:vgprValuB_X0_I0+48+0+0+0+1]
v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+48+0+0+2:vgprValuB_X0_I0+48+0+0+2+1]
v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+48+0+0+4:vgprValuB_X0_I0+48+0+0+4+1]
v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+48+0+0+6:vgprValuB_X0_I0+48+0+0+6+1]
v_add_u32 v136, v135, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+0], v[vgprValuB_X0_I0+48+0+0+0], v138, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+1], v[vgprValuB_X0_I0+48+0+0+1], v139, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+2], v[vgprValuB_X0_I0+48+0+0+2], v140, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+3], v[vgprValuB_X0_I0+48+0+0+3], v141, s[98:99]
v_add_u32 v136, v136, 56                           // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+4], v[vgprValuB_X0_I0+48+0+0+4], v142, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+5], v[vgprValuB_X0_I0+48+0+0+5], v143, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+6], v[vgprValuB_X0_I0+48+0+0+6], v144, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+48+0+0+7], v[vgprValuB_X0_I0+48+0+0+7], v145, s[98:99]
/* Tail-loop K masking for v[vgprValuB_X0_I0+56 .. vgprValuB_X0_I0+63] (8 dwords).
 * 1) v[138:145] = those 8 dwords shifted left by s97 bits, done as four 64-bit
 *    pairs (s97 is set before this point; its value is not visible here).
 * 2) Per dword pair, v136 = v135 + {8, 16, 72, 80}. Lanes where
 *    v136 >= s[sgprLoopCounterL] take the shifted copy; the other lanes keep
 *    the original dword (v_cndmask_b32 picks src1 where the mask bit is set).
 * NOTE(review): the compare is emitted once per dword, so the second v_cmp of
 * each pair recomputes the same mask in s[98:99] -- confirm before removing.
 */
v_lshlrev_b64 v[138:139], s97, v[vgprValuB_X0_I0+56+0+0+0:vgprValuB_X0_I0+56+0+0+0+1]
v_lshlrev_b64 v[140:141], s97, v[vgprValuB_X0_I0+56+0+0+2:vgprValuB_X0_I0+56+0+0+2+1]
v_lshlrev_b64 v[142:143], s97, v[vgprValuB_X0_I0+56+0+0+4:vgprValuB_X0_I0+56+0+0+4+1]
v_lshlrev_b64 v[144:145], s97, v[vgprValuB_X0_I0+56+0+0+6:vgprValuB_X0_I0+56+0+0+6+1]
v_add_u32 v136, v135, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+0], v[vgprValuB_X0_I0+56+0+0+0], v138, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+1], v[vgprValuB_X0_I0+56+0+0+1], v139, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+2], v[vgprValuB_X0_I0+56+0+0+2], v140, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+3], v[vgprValuB_X0_I0+56+0+0+3], v141, s[98:99]
v_add_u32 v136, v136, 56                           // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+4], v[vgprValuB_X0_I0+56+0+0+4], v142, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+5], v[vgprValuB_X0_I0+56+0+0+5], v143, s[98:99]
v_add_u32 v136, v136, 8                            // add part of K
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+6], v[vgprValuB_X0_I0+56+0+0+6], v144, s[98:99]
v_cmp_ge_i32 s[98:99], v136, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuB_X0_I0+56+0+0+7], v[vgprValuB_X0_I0+56+0+0+7], v145, s[98:99]
/* MFMA accumulation over the Valu operand registers.
 * s_nop 1 is issued before the first MFMA (presumably to cover the wait states
 * between the VALU writes to the Valu registers above and the MFMA reads --
 * TODO confirm against the gfx950 hazard table).
 * Pattern of the MFMAs below, with a = 0..7 varying fastest:
 *   acc[4*(8*b+a) : 4*(8*b+a)+3] +=
 *       v[vgprValuB_X0_I0+8*b : +7] (first source) x v[vgprValuA_X0_I0+8*a : +7] (second source)
 * so each 8-VGPR ValuB block is reused against every 8-VGPR ValuA block.
 * cbsz:1 / blgp:0 are the per-source format selectors of the f8f6f4 MFMA
 * (presumably BF8 for the first source and FP8 for the second, matching
 * DataType f8b8 -- confirm against the ISA guide).
 */
s_nop 1
v_mfma_f32_16x16x128_f8f6f4 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[0:3] cbsz:1 blgp:0 // left value = acc[0+0:3+0]
v_mfma_f32_16x16x128_f8f6f4 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[4:7] cbsz:1 blgp:0 // left value = acc[4+0:7+0]
v_mfma_f32_16x16x128_f8f6f4 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[8:11] cbsz:1 blgp:0 // left value = acc[8+0:11+0]
v_mfma_f32_16x16x128_f8f6f4 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[12:15] cbsz:1 blgp:0 // left value = acc[12+0:15+0]
v_mfma_f32_16x16x128_f8f6f4 acc[16:19], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[16:19] cbsz:1 blgp:0 // left value = acc[16+0:19+0]
v_mfma_f32_16x16x128_f8f6f4 acc[20:23], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[20:23] cbsz:1 blgp:0 // left value = acc[20+0:23+0]
v_mfma_f32_16x16x128_f8f6f4 acc[24:27], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[24:27] cbsz:1 blgp:0 // left value = acc[24+0:27+0]
v_mfma_f32_16x16x128_f8f6f4 acc[28:31], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[28:31] cbsz:1 blgp:0 // left value = acc[28+0:31+0]
v_mfma_f32_16x16x128_f8f6f4 acc[32:35], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[32:35] cbsz:1 blgp:0 // left value = acc[32+0:35+0]
v_mfma_f32_16x16x128_f8f6f4 acc[36:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[36:39] cbsz:1 blgp:0 // left value = acc[36+0:39+0]
v_mfma_f32_16x16x128_f8f6f4 acc[40:43], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[40:43] cbsz:1 blgp:0 // left value = acc[40+0:43+0]
v_mfma_f32_16x16x128_f8f6f4 acc[44:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[44:47] cbsz:1 blgp:0 // left value = acc[44+0:47+0]
v_mfma_f32_16x16x128_f8f6f4 acc[48:51], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[48:51] cbsz:1 blgp:0 // left value = acc[48+0:51+0]
v_mfma_f32_16x16x128_f8f6f4 acc[52:55], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[52:55] cbsz:1 blgp:0 // left value = acc[52+0:55+0]
v_mfma_f32_16x16x128_f8f6f4 acc[56:59], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[56:59] cbsz:1 blgp:0 // left value = acc[56+0:59+0]
v_mfma_f32_16x16x128_f8f6f4 acc[60:63], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[60:63] cbsz:1 blgp:0 // left value = acc[60+0:63+0]
v_mfma_f32_16x16x128_f8f6f4 acc[64:67], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[64:67] cbsz:1 blgp:0 // left value = acc[64+0:67+0]
v_mfma_f32_16x16x128_f8f6f4 acc[68:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[68:71] cbsz:1 blgp:0 // left value = acc[68+0:71+0]
v_mfma_f32_16x16x128_f8f6f4 acc[72:75], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[72:75] cbsz:1 blgp:0 // left value = acc[72+0:75+0]
v_mfma_f32_16x16x128_f8f6f4 acc[76:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[76:79] cbsz:1 blgp:0 // left value = acc[76+0:79+0]
v_mfma_f32_16x16x128_f8f6f4 acc[80:83], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[80:83] cbsz:1 blgp:0 // left value = acc[80+0:83+0]
v_mfma_f32_16x16x128_f8f6f4 acc[84:87], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[84:87] cbsz:1 blgp:0 // left value = acc[84+0:87+0]
v_mfma_f32_16x16x128_f8f6f4 acc[88:91], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[88:91] cbsz:1 blgp:0 // left value = acc[88+0:91+0]
v_mfma_f32_16x16x128_f8f6f4 acc[92:95], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[92:95] cbsz:1 blgp:0 // left value = acc[92+0:95+0]
v_mfma_f32_16x16x128_f8f6f4 acc[96:99], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[96:99] cbsz:1 blgp:0 // left value = acc[96+0:99+0]
v_mfma_f32_16x16x128_f8f6f4 acc[100:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[100:103] cbsz:1 blgp:0 // left value = acc[100+0:103+0]
v_mfma_f32_16x16x128_f8f6f4 acc[104:107], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[104:107] cbsz:1 blgp:0 // left value = acc[104+0:107+0]
v_mfma_f32_16x16x128_f8f6f4 acc[108:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[108:111] cbsz:1 blgp:0 // left value = acc[108+0:111+0]
v_mfma_f32_16x16x128_f8f6f4 acc[112:115], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[112:115] cbsz:1 blgp:0 // left value = acc[112+0:115+0]
v_mfma_f32_16x16x128_f8f6f4 acc[116:119], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[116:119] cbsz:1 blgp:0 // left value = acc[116+0:119+0]
v_mfma_f32_16x16x128_f8f6f4 acc[120:123], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[120:123] cbsz:1 blgp:0 // left value = acc[120+0:123+0]
v_mfma_f32_16x16x128_f8f6f4 acc[124:127], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[124:127] cbsz:1 blgp:0 // left value = acc[124+0:127+0]
v_mfma_f32_16x16x128_f8f6f4 acc[128:131], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[128:131] cbsz:1 blgp:0 // left value = acc[128+0:131+0]
v_mfma_f32_16x16x128_f8f6f4 acc[132:135], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[132:135] cbsz:1 blgp:0 // left value = acc[132+0:135+0]
v_mfma_f32_16x16x128_f8f6f4 acc[136:139], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[136:139] cbsz:1 blgp:0 // left value = acc[136+0:139+0]
v_mfma_f32_16x16x128_f8f6f4 acc[140:143], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[140:143] cbsz:1 blgp:0 // left value = acc[140+0:143+0]
v_mfma_f32_16x16x128_f8f6f4 acc[144:147], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[144:147] cbsz:1 blgp:0 // left value = acc[144+0:147+0]
v_mfma_f32_16x16x128_f8f6f4 acc[148:151], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[148:151] cbsz:1 blgp:0 // left value = acc[148+0:151+0]
v_mfma_f32_16x16x128_f8f6f4 acc[152:155], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[152:155] cbsz:1 blgp:0 // left value = acc[152+0:155+0]
v_mfma_f32_16x16x128_f8f6f4 acc[156:159], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[156:159] cbsz:1 blgp:0 // left value = acc[156+0:159+0]
v_mfma_f32_16x16x128_f8f6f4 acc[160:163], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[160:163] cbsz:1 blgp:0 // left value = acc[160+0:163+0]
v_mfma_f32_16x16x128_f8f6f4 acc[164:167], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[164:167] cbsz:1 blgp:0 // left value = acc[164+0:167+0]
v_mfma_f32_16x16x128_f8f6f4 acc[168:171], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[168:171] cbsz:1 blgp:0 // left value = acc[168+0:171+0]
v_mfma_f32_16x16x128_f8f6f4 acc[172:175], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[172:175] cbsz:1 blgp:0 // left value = acc[172+0:175+0]
v_mfma_f32_16x16x128_f8f6f4 acc[176:179], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[176:179] cbsz:1 blgp:0 // left value = acc[176+0:179+0]
v_mfma_f32_16x16x128_f8f6f4 acc[180:183], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[180:183] cbsz:1 blgp:0 // left value = acc[180+0:183+0]
v_mfma_f32_16x16x128_f8f6f4 acc[184:187], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[184:187] cbsz:1 blgp:0 // left value = acc[184+0:187+0]
v_mfma_f32_16x16x128_f8f6f4 acc[188:191], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[188:191] cbsz:1 blgp:0 // left value = acc[188+0:191+0]
v_mfma_f32_16x16x128_f8f6f4 acc[192:195], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[192:195] cbsz:1 blgp:0 // left value = acc[192+0:195+0]
v_mfma_f32_16x16x128_f8f6f4 acc[196:199], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[196:199] cbsz:1 blgp:0 // left value = acc[196+0:199+0]
v_mfma_f32_16x16x128_f8f6f4 acc[200:203], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[200:203] cbsz:1 blgp:0 // left value = acc[200+0:203+0]
v_mfma_f32_16x16x128_f8f6f4 acc[204:207], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[204:207] cbsz:1 blgp:0 // left value = acc[204+0:207+0]
v_mfma_f32_16x16x128_f8f6f4 acc[208:211], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[208:211] cbsz:1 blgp:0 // left value = acc[208+0:211+0]
v_mfma_f32_16x16x128_f8f6f4 acc[212:215], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[212:215] cbsz:1 blgp:0 // left value = acc[212+0:215+0]
v_mfma_f32_16x16x128_f8f6f4 acc[216:219], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[216:219] cbsz:1 blgp:0 // left value = acc[216+0:219+0]
v_mfma_f32_16x16x128_f8f6f4 acc[220:223], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[220:223] cbsz:1 blgp:0 // left value = acc[220+0:223+0]
v_mfma_f32_16x16x128_f8f6f4 acc[224:227], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+7], acc[224:227] cbsz:1 blgp:0 // left value = acc[224+0:227+0]
v_mfma_f32_16x16x128_f8f6f4 acc[228:231], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+7], acc[228:231] cbsz:1 blgp:0 // left value = acc[228+0:231+0]
v_mfma_f32_16x16x128_f8f6f4 acc[232:235], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+16+0+0:vgprValuA_X0_I0+16+0+0+7], acc[232:235] cbsz:1 blgp:0 // left value = acc[232+0:235+0]
v_mfma_f32_16x16x128_f8f6f4 acc[236:239], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+24+0+0:vgprValuA_X0_I0+24+0+0+7], acc[236:239] cbsz:1 blgp:0 // left value = acc[236+0:239+0]
v_mfma_f32_16x16x128_f8f6f4 acc[240:243], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+32+0+0:vgprValuA_X0_I0+32+0+0+7], acc[240:243] cbsz:1 blgp:0 // left value = acc[240+0:243+0]
v_mfma_f32_16x16x128_f8f6f4 acc[244:247], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+40+0+0:vgprValuA_X0_I0+40+0+0+7], acc[244:247] cbsz:1 blgp:0 // left value = acc[244+0:247+0]
v_mfma_f32_16x16x128_f8f6f4 acc[248:251], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+48+0+0:vgprValuA_X0_I0+48+0+0+7], acc[248:251] cbsz:1 blgp:0 // left value = acc[248+0:251+0]
v_mfma_f32_16x16x128_f8f6f4 acc[252:255], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+7], v[vgprValuA_X0_I0+56+0+0:vgprValuA_X0_I0+56+0+0+7], acc[252:255] cbsz:1 blgp:0 // left value = acc[252+0:255+0]

/* closeLoop loopL finalLoop=1 tailLoop=1 */
/* Tail-loop back edge: every pass takes 0x80 off the remaining count        */
/* (LoopCounterL), adds the same 0x80 to the running total                   */
/* (OrigLoopCounter), and loops while the remaining count is still positive. */
s_sub_i32 s[sgprLoopCounterL], s[sgprLoopCounterL], 0x80 // dec counterL (tailLoop)
s_add_u32 s[sgprOrigLoopCounter], s[sgprOrigLoopCounter], 0x80 // inc counterL: total consumed by the tail loop so far
s_cmp_le_i32 s[sgprLoopCounterL], 0x0              // SCC = (counterL <= 0), signed compare
s_cbranch_scc0 label_TailLoopBeginL                // SCC == 0: work remains, restart LoopL
label_TailLoopEndL:
/* Subtract OrigLoopCounter (times a scale factor of 1) from both LDS read   */
/* addresses. NOTE(review): presumably this undoes the per-iteration         */
/* local-read advance applied inside the tail-loop body, which is outside    */
/* this view - confirm against the loop body.                                */
s_mov_b32 s97, 1                                   // tailloop lds offset scale for A
s_mul_i32 s97, s[sgprOrigLoopCounter], s97         // s97 = OrigLoopCounter * scale
v_sub_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s97 // remove lro damage from the A read address
s_mov_b32 s97, 1                                   // tailloop lds offset scale for B
s_mul_i32 s97, s[sgprOrigLoopCounter], s97         // s97 = OrigLoopCounter * scale
v_sub_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s97 // remove lro damage from the B read address
label_SkipTailLoopL:
/* End of the summation phase: the register names used by the loops are     */
/* released by rebinding each symbol to UNDEF (a symbol defined outside     */
/* this view). NOTE(review): presumably this makes any later use of a       */
/* released name conspicuous - confirm how UNDEF is defined.                */
/* These are assembler directives only; no instructions are emitted.        */
.set vgprValuA_X0_I0_BASE, UNDEF
.set vgprValuA_X0_I0, UNDEF
.set vgprValuB_X0_I0_BASE, UNDEF
.set vgprValuB_X0_I0, UNDEF

/* Tail: add MISC Vgpr [0...4) to pool */
label_Summation_End_DZOUDPYJU2HHRCOQ:
/* Loop-control, input-SRD and global-read SGPR names are released here;    */
/* the epilogue below re-assigns SGPR numbers from 64 upward.               */
.set sgprLoopCounterL, UNDEF
.set sgprOrigLoopCounter, UNDEF
.set sgprSrdA, UNDEF
.set sgprSrdB, UNDEF
.set sgprShadowLimitA, UNDEF
.set sgprShadowLimitB, UNDEF
.set sgprStaggerUIter, UNDEF
.set sgprWrapUA, UNDEF
.set sgprWrapUB, UNDEF
.set sgprGlobalReadIncsA, UNDEF
.set sgprGlobalReadIncsB, UNDEF
.set sgprScalarGlobalReadOffsetA, UNDEF
.set sgprScalarGlobalReadOffsetB, UNDEF
/* load store sgprs */
/* Epilogue argument layout in SGPRs. 64-bit addresses take two registers:  */
/*   s[64:65] ScaleA addr      s[66:67] ScaleB addr                          */
/*   s[68:69] ScaleAlphaVec    s[70:71] Bias addr                            */
/*   s72 BiasType  s73 BiasStride  s74/s75 activation alpha/beta             */
/*   s76 ActivationType                                                      */
.set sgprAddressScaleA, 64
.set sgprAddressScaleB, 66
.set sgprAddressScaleAlphaVec, 68
.set sgprAddressBias, 70
.set sgprBiasType, 72
.set sgprBiasStride, 73
.set sgpractivationAlpha, 74
.set sgpractivationBeta, 75
.set sgprActivationType, 76
/* Select the kernarg layout by ArgType: ArgType == 2 takes the alternate   */
/* offsets at label_LoadExternalEpilogueStruct, any other value reads the   */
/* 13 dwords contiguously from offset 124. Both paths fill s[64:76].        */
s_cmp_eq_u32 s[sgprArgType], 2                     // ArgType == 2 ?
s_cbranch_scc1 label_LoadExternalEpilogueStruct    // branch if ArgType == 2
s_load_dwordx8 s[64:71], s[sgprKernArgAddress:sgprKernArgAddress+1], 124 // offset 124: ScaleA, ScaleB, ScaleAlphaVec, Bias addresses
s_load_dwordx4 s[72:75], s[sgprKernArgAddress:sgprKernArgAddress+1], 156 // offset 156: BiasType, BiasStride, activationAlpha, activationBeta
s_load_dword s76, s[sgprKernArgAddress:sgprKernArgAddress+1], 172 // offset 172: ActivationType
s_branch label_LoadExternalEpilogueStructEnd
label_LoadExternalEpilogueStruct:
s_load_dwordx4 s[64:67], s[sgprKernArgAddress:sgprKernArgAddress+1], 148 // offset 148: ScaleA, ScaleB addresses
s_load_dwordx4 s[68:71], s[sgprKernArgAddress:sgprKernArgAddress+1], 180 // offset 180: ScaleAlphaVec, Bias addresses
s_load_dwordx2 s[72:73], s[sgprKernArgAddress:sgprKernArgAddress+1], 196 // offset 196: BiasType, BiasStride
s_load_dwordx2 s[74:75], s[sgprKernArgAddress:sgprKernArgAddress+1], 220 // offset 220: activationAlpha, activationBeta
s_load_dword s76, s[sgprKernArgAddress:sgprKernArgAddress+1], 228 // offset 228: ActivationType
label_LoadExternalEpilogueStructEnd:
/* The loads above are still in flight here; they are waited on by the      */
/* s_waitcnt lgkmcnt(0) at the start of the global-write section.           */
/* Four-register buffer descriptors for the two per-column vectors.         */
.set sgprSrdScaleAlphaVec, 80
.set sgprSrdBias, 84

/* Mapping of Acc register -> C Vgpr register */

/* not-LocalSplitU: global write indices */
/* computeStoreVgprs */
/* Derives this work-item's output coordinates from its serial id:          */
/*   v0 = coordinate 0 (tile-local, then + 256 * WorkGroup0)                */
/*   v1 = coordinate 1 (tile-local, then + 256 * WorkGroup1)                */
/*   v2 = tile-local coordinate 1 * StrideC1J                               */
/*   v3 = tile-local coordinate 1 * StrideD1J                               */
/* v2/v3 are taken before the workgroup offset is added to v1.              */
/* v4, v5 and s8 are scratch.                                               */
v_lshrrev_b32 v4, 6, v[vgprSerial]                 // v4 = Serial / 64 (wave index)
v_lshrrev_b32 v5, 1, v4                            // v5 = wave index / 2 (wave position along dim 1)
v_mul_lo_u32 v5, 0x10, v5                          // wave coordination offset 1 = 16 * v5
v_and_b32 v1, 63, v[vgprSerial]                    // v1 = v[vgprSerial] % 64 (lane within wave)
v_lshrrev_b32 v1, 4, v1                            // v1 = lane / 16
v_lshlrev_b32 v1, 2, v1                            // thread0 * continuous_output: v1 *= 4
v_add_lshl_u32 v1, v5, v1, 3                       // coordination 1 = (v5 + v1) << 3
v_mul_lo_u32 v2, v1, s[sgprStrideC1J]              //  offset 1 into C = coord1 * StrideC1J
v_mul_lo_u32 v3, v1, s[sgprStrideD1J]              //  offset 1 into D = coord1 * StrideD1J
v_and_b32 v0, 1, v4                                // v0 = v4 % 2 (wave position along dim 0)
v_mul_lo_u32 v0, 0x10, v0                          // wave coordination offset 0 = 16 * v0
v_and_b32 v5, 15, v[vgprSerial]                    // v5 = v[vgprSerial] % 16
v_add_lshl_u32 v0, v5, v0, 3                       // coordination 0 = (v5 + v0) << 3
s_mul_i32 s8, 256, s[sgprWorkGroup0]               // wgp0 * MT0
v_add_u32 v0, s8, v0                               // coord 0 = tile-local coord 0 + wgp0 * MT0
s_mul_i32 s8, 256, s[sgprWorkGroup1]               // wgp1 * MT1
v_add_u32 v1, s8, v1                               // coord 1 = tile-local coord 1 + wgp1 * MT1

/* not-LocalSplitU: global write */

/******************************************/
/* Global Write Elements                  */
/******************************************/
/* Prepares the epilogue inputs:                                            */
/*   s8 / s9 = scalar scaleA / scaleB, left at 1.0 when the address is null */
/*   SrdScaleAlphaVec, SrdBias = buffer descriptors whose size field        */
/*   (word 2) is 0 when the corresponding address is null.                  */
/* The two scalar loads into s8/s9 are not waited on in this section.       */
s_waitcnt lgkmcnt(0)                               // wait for 52 bytes (13 dwords) of kern args.
s_mov_b32 s8, 1.0                                  // init as 1
s_cmp_eq_u64 s[sgprAddressScaleA:sgprAddressScaleA+1], 0 // s[AddressScaleA] == 0 ?
s_cbranch_scc1 label_ScaleAValid                   // null address: keep s8 = 1.0 and skip the load
s_load_dword s8, s[sgprAddressScaleA:sgprAddressScaleA+1], 0 // load scaleA
label_ScaleAValid:
s_mov_b32 s9, 1.0                                  // init as 1
s_cmp_eq_u64 s[sgprAddressScaleB:sgprAddressScaleB+1], 0 // s[AddressScaleB] == 0 ?
s_cbranch_scc1 label_ScaleBValid                   // null address: keep s9 = 1.0 and skip the load
s_load_dword s9, s[sgprAddressScaleB:sgprAddressScaleB+1], 0 // load scaleB
label_ScaleBValid:
s_mov_b64 s[sgprSrdScaleAlphaVec+0:sgprSrdScaleAlphaVec+0+1], s[sgprAddressScaleAlphaVec+0:sgprAddressScaleAlphaVec+0+1] // init SRD base address
s_mov_b32 s[sgprSrdScaleAlphaVec+3], Srd127_96     // Set bits 127_96 in post-loop SRD
s_cmp_eq_u64 s[sgprAddressScaleAlphaVec:sgprAddressScaleAlphaVec+1], 0 // s[AddressScaleAlphaVec] == 0 ?
s_cbranch_scc0 label_ScaleAlphaVecAddrValid        // branch if s[AddressScaleAlphaVec] != 0
s_mov_b32 s[sgprSrdScaleAlphaVec+2], 0             // null address: size field = 0
s_branch label_ScaleAlphaVecAddrValid_End
label_ScaleAlphaVecAddrValid:
s_mov_b32 s[sgprSrdScaleAlphaVec+2], s[sgprSizeI]  // valid address: SizeI elements
label_ScaleAlphaVecAddrValid_End:

s_mul_i32 s[sgprSrdScaleAlphaVec+2], 0x4, s[sgprSrdScaleAlphaVec+2] // ScaleAlphaVec scaled by BPE
/* Bias element count: BiasStride * (WorkGroup2 + 1), falling back to SizeI */
/* when that product is zero. It is scaled to bytes later, in the           */
/* per-BiasType load path.                                                  */
s_add_u32 s77, s[sgprWorkGroup2], 0x1
s_mul_i32 s77, s[sgprBiasStride], s77              // stride * (wg+1)
s_cmp_eq_u32 s77, 0                                // bias stride = 0?
s_cselect_b32 s77, s[sgprSizeI], s77               // SCC ? SizeI : stride * (wg+1)
s_mov_b64 s[sgprSrdBias+0:sgprSrdBias+0+1], s[sgprAddressBias+0:sgprAddressBias+0+1] // init SRD base address
s_mov_b32 s[sgprSrdBias+3], Srd127_96              // Set bits 127_96 in post-loop SRD
s_cmp_eq_u64 s[sgprAddressBias:sgprAddressBias+1], 0 // s[AddressBias] == 0 ?
s_cbranch_scc0 label_BiasAddrValid                 // branch if s[AddressBias] != 0
s_mov_b32 s[sgprSrdBias+2], 0                      // null address: size field = 0
s_branch label_BiasAddrValid_End
label_BiasAddrValid:
s_mov_b32 s[sgprSrdBias+2], s77                    // valid address: element count computed above
label_BiasAddrValid_End:

label_Load_Biasf32_0:
/* BiasType == 0 path (4-byte bias elements). Each work-item loads one bias */
/* value and one ScaleAlphaVec value for column (wgp0 * 256 + Serial) and   */
/* writes them to LDS at Serial * 4 (bias) and Serial * 4 + 1024            */
/* (scaleAlpha). Any other BiasType is passed on to the next type check.    */
s_cmpk_lg_u32 s[sgprBiasType], 0                   // BiasType != 0
s_cbranch_scc1 label_Load_Biasbf16_0               // Branch if true

/******************************************/
/* Read vector to LDS                     */
/******************************************/
s_mul_i32 s77, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_add_u32 v8, s77, v[vgprSerial]                   // coord 0 = wgp0 * MT0 + thread offset
s_mul_i32 s[sgprSrdBias+2], 0x4, s[sgprSrdBias+2]  // bias SRD size scaled by BPE (4 bytes)
s_mul_i32 s77, s[sgprBiasStride], s[sgprWorkGroup2] // Stride * WG
v_add_u32 v6, s77, v8                              // coord 0 = wgp0 * MT0 + thread offset + Stride * WG
v_lshlrev_b32 v6, 0x2, v6                          // Global bias address scaled by BPE
v_lshlrev_b32 v7, 0x2, v8                          // Global scaleAlpha address scaled by BPE
/* NOTE(review): the coord-1 value written to v8 by the next two            */
/* instructions is overwritten by the v_lshlrev_b32 into v8 below before    */
/* anything reads it, so it has no effect within this block.                */
s_mul_i32 s77, 256, s[sgprWorkGroup1]              // wgp1 * MT1
v_add_u32 v8, s77, v[vgprSerial]                   // coord 1 = wgp1 * MT1 + thread offset
buffer_load_dword v4, v6, s[sgprSrdBias:sgprSrdBias+3], 0 offen offset:0 // Load Bias
buffer_load_dword v5, v7, s[sgprSrdScaleAlphaVec:sgprSrdScaleAlphaVec+3], 0 offen offset:0 // Load ScaleAlphaVec
v_lshlrev_b32 v8, 0x2, v[vgprSerial]               // Local address scaled by BPE
s_barrier                                          // workgroup barrier before the LDS writes below; it does not itself wait for the loads
s_waitcnt vmcnt(1)                                 // at most one load outstanding: the bias load (issued first) has returned
ds_write_b32 v8, v4 offset:0                       // store bias
v_cmp_gt_u32 s[sgprAddressScaleAlphaVec:sgprAddressScaleAlphaVec+1], s[sgprSrdScaleAlphaVec+2], 0 // mask = (ScaleAlphaVec SRD size > 0); reuses the address SGPR pair
s_waitcnt vmcnt(0)                                 // wait for the ScaleAlphaVec load
v_cndmask_b32 v5, 1.0, v5, s[sgprAddressScaleAlphaVec:sgprAddressScaleAlphaVec+1] // keep the loaded value where mask is set, else use 1.0
ds_write_b32 v8, v5 offset:1024                    // store scaleAlpha
s_branch label_Load_Bias_End                       // Branch to load bias end
label_Load_Biasbf16_0:
/* BiasType == 7 path (2-byte bf16 bias elements). Each work-item loads one */
/* bf16 bias value, widens it to f32, and loads one ScaleAlphaVec value for */
/* column (wgp0 * 256 + Serial). Both are written to LDS at Serial * 4      */
/* (bias) and Serial * 4 + 1024 (scaleAlpha). Any other BiasType skips the  */
/* vector staging entirely.                                                 */
s_cmpk_lg_u32 s[sgprBiasType], 7                   // BiasType != 7
s_cbranch_scc1 label_Load_Bias_End                 // Branch if true

/******************************************/
/* Read vector to LDS                     */
/******************************************/
s_mul_i32 s77, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_add_u32 v8, s77, v[vgprSerial]                   // coord 0 = wgp0 * MT0 + thread offset
s_mul_i32 s[sgprSrdBias+2], 0x2, s[sgprSrdBias+2]  // bias SRD size scaled by BPE (2 bytes)
s_mul_i32 s77, s[sgprBiasStride], s[sgprWorkGroup2] // Stride * WG
v_add_u32 v6, s77, v8                              // coord 0 = wgp0 * MT0 + thread offset + Stride * WG
v_lshlrev_b32 v6, 0x1, v6                          // Global bias address scaled by BPE
v_lshlrev_b32 v7, 0x2, v8                          // Global scaleAlpha address scaled by BPE
/* NOTE(review): the coord-1 value written to v8 by the next two            */
/* instructions is overwritten by the v_lshlrev_b32 into v8 below before    */
/* anything reads it. It is kept as-is because s77 may be read by code      */
/* outside this view.                                                       */
s_mul_i32 s77, 256, s[sgprWorkGroup1]              // wgp1 * MT1
v_add_u32 v8, s77, v[vgprSerial]                   // coord 1 = wgp1 * MT1 + thread offset
buffer_load_short_d16 v4, v6, s[sgprSrdBias:sgprSrdBias+3], 0 offen offset:0 // Load Bias
buffer_load_dword v5, v7, s[sgprSrdScaleAlphaVec:sgprSrdScaleAlphaVec+3], 0 offen offset:0 // Load ScaleAlphaVec
v_lshlrev_b32 v8, 0x2, v[vgprSerial]               // Local address scaled by BPE
s_barrier                                          // workgroup barrier before the LDS writes below; it does not itself wait for the loads
s_waitcnt vmcnt(1)                                 // at most one load outstanding: the bias load (issued first) has returned
v_cvt_f32_bf16 v4, v4 src0_sel:WORD_0              // cvt bf16 to f32
ds_write_b32 v8, v4 offset:0                       // store bias
v_cmp_gt_u32 s[sgprAddressScaleAlphaVec:sgprAddressScaleAlphaVec+1], s[sgprSrdScaleAlphaVec+2], 0 // mask = (ScaleAlphaVec SRD size > 0); reuses the address SGPR pair
s_waitcnt vmcnt(0)                                 // wait for the ScaleAlphaVec load
v_cndmask_b32 v5, 1.0, v5, s[sgprAddressScaleAlphaVec:sgprAddressScaleAlphaVec+1] // keep the loaded value where mask is set, else use 1.0
ds_write_b32 v8, v5 offset:1024                    // store scaleAlpha
/* Falls through: label_Load_Bias_End is the next instruction, so no branch */
/* is needed on this last path.                                             */
label_Load_Bias_End:
/* The scale addresses and the ScaleAlphaVec descriptor names are released. */
/* Then scaleA (s8) and scaleB (s9) are folded into alpha:                  */
/*   s64   = original alpha (saved)                                         */
/*   Alpha = alpha * s8 * s9                                                */
.set sgprAddressScaleA, UNDEF
.set sgprAddressScaleB, UNDEF
.set sgprAddressScaleAlphaVec, UNDEF
.set sgprSrdScaleAlphaVec, UNDEF
v_mov_b32 v4, s[sgprAlpha]
s_waitcnt lgkmcnt(0)                               // wait for scaleAB load
v_mul_f32 v4, v4, s8                               // v4 = alpha * scaleA
v_mul_f32 v4, v4, s9                               // v4 = alpha * scaleA * scaleB
s_nop 0                                            // 1 wait states
s_mov_b32 s64, s[sgprAlpha]                        // Save alpha value
v_readfirstlane_b32 s[sgprAlpha], v4               // Update Alpha
/* Stream-K dispatch: a workgroup whose StreamKLocalStart is non-zero takes */
/* the long jump to label_SK_Partials; one that starts at 0 falls through   */
/* to label_NoBranch.                                                       */
s_cmp_eq_u32 s[sgprStreamKLocalStart], 0           // does wg start tile?
s_cbranch_scc1 label_NoBranch_QWMA7J3AUDGL0X23     // Only branch on scc0
/* Long branch built from the PC, for a target that may be out of range of  */
/* a 16-bit branch offset.                                                  */
s_getpc_b64 s[88:89]                               // addr of next instr
s_add_i32 s90, label_SK_Partials, 4                // target branch offset
s_add_u32 s88, s88, s90                            // add target branch offset
s_addc_u32 s89, s89, 0                             // add high and carry
s_setpc_b64 s[88:89]                               // branch to label_SK_Partials
label_NoBranch_QWMA7J3AUDGL0X23:
/* This workgroup started the tile. If it also finished it, go straight to  */
/* the regular store. Otherwise s65 = StreamKIdx + 1 names the partial tile */
/* to read next, and quotient/remainder of StreamKIterEnd / ItersPerTile    */
/* are computed with a float reciprocal plus two integer corrections.       */
s_cmp_eq_u32 s[sgprStreamKLocalEnd], s[sgprItersPerTile] // does wg finish tile?
s_cbranch_scc1 label_SK_Store                      // Branch if started and finished tile, go to regular store code
s_add_u32 s65, s[sgprStreamKIdx], 1                // input partial tile index
v_cvt_f32_u32 v17, s[sgprItersPerTile]             // v17 = float(ItersPerTile)
v_rcp_iflag_f32 v17, v17                           // v17 = 1.0 / ItersPerTile (approximate)
v_cvt_f32_u32 v18, s[sgprStreamKIterEnd]           // v18 = float(StreamKIterEnd)
v_mul_f32 v17, v17, v18                            // v17 = StreamKIterEnd / ItersPerTile (approximate)
v_cvt_u32_f32 v17, v17                             // v17 = quotient estimate as an integer
v_mul_u32_u24 v18, v17, s[sgprItersPerTile]        // v18 = quotient * ItersPerTile (24-bit multiply; assumes both fit in 24 bits - TODO confirm)
v_sub_u32 v18, s[sgprStreamKIterEnd], v18          // v18 = remainder estimate
v_cmpx_eq_u32 exec, v18, s[sgprItersPerTile]       // exec = lanes where remainder == ItersPerTile (quotient one too small)
v_add_u32 v17, 1, v17                              // those lanes: quotient + 1
v_mov_b32 v18, 0                                   // those lanes: remainder = 0
s_mov_b64 exec, -1                                 // Reset exec
v_cmpx_gt_u32 exec, v18, s[sgprItersPerTile]       // overflow happened in remainder
v_sub_u32 v17, v17, 1                              // quotient - 1
v_mul_u32_u24 v18, v17, s[sgprItersPerTile]        // re-calculate remainder
v_sub_u32 v18, s[sgprStreamKIterEnd], v18          // re-calculate remainder
s_mov_b64 exec, -1                                 // Reset exec
v_readfirstlane_b32 s77, v17                       // quotient (NOTE(review): s77 is rewritten at label_SK_Fixup just below before any visible read)
v_readfirstlane_b32 s68, v18                       // remainder
label_SK_Fixup:
/* Spin until the flag for partial tile s65 reads 1. The flag is re-read    */
/* with glc on every pass. After a workgroup barrier, the wave whose first  */
/* active lane has Serial == 0 writes 0 back to the flag.                   */
s_lshl_b32 s77, s65, 2                             // flag offset based on CTA index
s_load_dword s79, s[sgprAddressFlags:sgprAddressFlags+1], s77 glc // get flag
s_waitcnt lgkmcnt(0)                               // wait for flag load
s_cmp_eq_u32 s79, 1                                // check if ready
s_cbranch_scc0 label_SK_Fixup                      // if flag not set, wait and check again
s_barrier                                          // wait for all waves of this workgroup to see the flag before it is reset
v_readfirstlane_b32 s79, v[vgprSerial]             // Wave 0 updates flags
s_cmp_eq_u32 s79, 0                                // Check for wave 0
s_cbranch_scc0 label_SK_SkipFlagReset              // Skip flag reset
s_store_dword s79, s[sgprAddressFlags:sgprAddressFlags+1], s77 glc // reset flag (s79 is 0 on this path)
label_SK_SkipFlagReset:
label_Fixup_E0:

/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=13 */
/* Build the workspace buffer descriptor and advance its base to the        */
/* partial tile selected by s65. 0x40000 bytes = 256 * 256 * 4, one         */
/* 256x256 tile of 4-byte values (MT256x256 in the kernel name).            */
s_mov_b64 s[sgprSrdWS+0:sgprSrdWS+0+1], s[sgprAddressWS+0:sgprAddressWS+0+1] // init SRD base address
s_mov_b32 s[sgprSrdWS+2], BufferOOB
s_mov_b32 s[sgprSrdWS+3], Srd127_96                // Set bits 127_96 in post-loop SRD

s_mul_i32 s78, 0x40000, s65                        // Offset to correct partials tile (32-bit product)
s_add_u32 s[sgprSrdWS+0], s[sgprSrdWS+0], s78      // add lo to SRD
s_addc_u32 s[sgprSrdWS+1], s[sgprSrdWS+1], 0       // add hi to SRD
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */

/******************************************/
/* Fixup Batch #0 (d1,d0,vc1,vc0) =       */
/*      (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Each work-item reads 13 elements of 8 dwords (two dwordx4 loads each)    */
/* into v[136:239]. Its byte offset is Serial * 32, and the scalar offset   */
/* s78 steps by 8192 bytes between elements. NOTE(review): 8192 = 256 * 32, */
/* consistent with 256 work-items of 32 bytes each - confirm the            */
/* workgroup size.                                                          */
v_lshlrev_b32 v18, 5, v[vgprSerial]                // v18 = v[vgprSerial] * 32
s_mov_b32 s78, 0                                   // Init sgpr offset
buffer_load_dwordx4 v[136:139], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[140:143], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[144:147], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[148:151], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[152:155], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[156:159], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[160:163], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[164:167], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[168:171], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[172:175], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[176:179], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[180:183], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[184:187], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[188:191], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[192:195], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[196:199], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[200:203], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[204:207], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[208:211], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[212:215], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[216:219], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[220:223], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[224:227], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[228:231], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[232:235], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[236:239], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
/* Copy 104 accumulator values into v[vgprValuC+24 .. vgprValuC+127].       */
/* Destination i (0..103, the number in each trailing comment) takes        */
/* acc[4 * (i % 64) + i / 64]: first register 0 of each of the 64           */
/* four-register MFMA results (acc0, acc4, ..., acc252), then register 1 of */
/* the first 40 results (acc1, acc5, ..., acc157).                          */
v_accvgpr_read_b32 v[vgprValuC+24], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+25], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+26], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+27], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+28], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+29], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+30], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+31], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+32], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+33], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+34], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+35], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+36], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+37], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+38], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+39], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+40], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+41], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+42], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+43], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+44], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+45], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+46], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+47], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+48], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+49], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+50], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+51], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+52], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+53], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+54], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+55], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+56], acc128         // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+57], acc132         // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+58], acc136         // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+59], acc140         // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+60], acc144         // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+61], acc148         // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+62], acc152         // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+63], acc156         // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+64], acc160         // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+65], acc164         // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+66], acc168         // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+67], acc172         // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+68], acc176         // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+69], acc180         // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+70], acc184         // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+71], acc188         // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+72], acc192         // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+73], acc196         // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+74], acc200         // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+75], acc204         // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+76], acc208         // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+77], acc212         // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+78], acc216         // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+79], acc220         // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+80], acc224         // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+81], acc228         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+82], acc232         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+83], acc236         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+84], acc240         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+85], acc244         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+86], acc248         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+87], acc252         // copy acc to vreg[63]
v_accvgpr_read_b32 v[vgprValuC+88], acc1           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+89], acc5           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+90], acc9           // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+91], acc13          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+92], acc17          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+93], acc21          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+94], acc25          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+95], acc29          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+96], acc33          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+97], acc37          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+98], acc41          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+99], acc45          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+100], acc49         // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+101], acc53         // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+102], acc57         // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+103], acc61         // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+104], acc65         // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+105], acc69         // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+106], acc73         // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+107], acc77         // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+108], acc81         // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+109], acc85         // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+110], acc89         // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+111], acc93         // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+112], acc97         // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+113], acc101        // copy acc to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+114], acc105        // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+115], acc109        // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+116], acc113        // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+117], acc117        // copy acc to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+118], acc121        // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+119], acc125        // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+120], acc129        // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+121], acc133        // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+122], acc137        // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+123], acc141        // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+124], acc145        // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+125], acc149        // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+126], acc153        // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+127], acc157        // copy acc to vreg[103]
s_nop 1                                            // 2 wait states required before reading vgpr

/*
 * Accumulate stage of the current fixup batch.
 *
 * Adds the 104 fp32 values held in v136..v239 element-wise into
 * v[vgprValuC+24 .. vgprValuC+127] (destination is also the first source).
 * The work is split into 13 groups of 8 v_add_f32; each group is preceded by
 * an s_waitcnt vmcnt(k) with k counting down 12 -> 0, so a group only runs
 * once the vector-memory counter has dropped to k or fewer outstanding
 * operations. The instructions that fill v136..v239 for this batch are issued
 * before this point (presumably the WS buffer loads of this batch -- confirm
 * against the preceding code).
 *
 * NOTE(review): v14/v15/v16 are loaded with bf16 packing/rounding constants
 * below, but no instruction in this accumulate stage names them as an
 * operand; the "apply mask ... issue writes" banner is the generator's
 * generic section title.
 */
/* apply mask, calc new C and issue writes */
v_mov_b32 v14, 0xffff0000                          // mask for packing two bfloat16 elements into 32 bits
v_mov_b32 v15, 0x7fff0000                          // fp32 NaN
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16

/* group 0: v136..v143 -> ValuC+24..31 */
s_waitcnt vmcnt(12)                                // wait C (interleaved) 12 = 13 - 0 + 0 - 1
v_add_f32 v[vgprValuC+24], v[vgprValuC+24], v136   // accum partials
v_add_f32 v[vgprValuC+25], v[vgprValuC+25], v137   // accum partials
v_add_f32 v[vgprValuC+26], v[vgprValuC+26], v138   // accum partials
v_add_f32 v[vgprValuC+27], v[vgprValuC+27], v139   // accum partials
v_add_f32 v[vgprValuC+28], v[vgprValuC+28], v140   // accum partials
v_add_f32 v[vgprValuC+29], v[vgprValuC+29], v141   // accum partials
v_add_f32 v[vgprValuC+30], v[vgprValuC+30], v142   // accum partials
v_add_f32 v[vgprValuC+31], v[vgprValuC+31], v143   // accum partials

/* group 1: v144..v151 -> ValuC+32..39 */
s_waitcnt vmcnt(11)                                // wait C (interleaved) 11 = 13 - 1 + 0 - 1
v_add_f32 v[vgprValuC+32], v[vgprValuC+32], v144   // accum partials
v_add_f32 v[vgprValuC+33], v[vgprValuC+33], v145   // accum partials
v_add_f32 v[vgprValuC+34], v[vgprValuC+34], v146   // accum partials
v_add_f32 v[vgprValuC+35], v[vgprValuC+35], v147   // accum partials
v_add_f32 v[vgprValuC+36], v[vgprValuC+36], v148   // accum partials
v_add_f32 v[vgprValuC+37], v[vgprValuC+37], v149   // accum partials
v_add_f32 v[vgprValuC+38], v[vgprValuC+38], v150   // accum partials
v_add_f32 v[vgprValuC+39], v[vgprValuC+39], v151   // accum partials

/* group 2: v152..v159 -> ValuC+40..47 */
s_waitcnt vmcnt(10)                                // wait C (interleaved) 10 = 13 - 2 + 0 - 1
v_add_f32 v[vgprValuC+40], v[vgprValuC+40], v152   // accum partials
v_add_f32 v[vgprValuC+41], v[vgprValuC+41], v153   // accum partials
v_add_f32 v[vgprValuC+42], v[vgprValuC+42], v154   // accum partials
v_add_f32 v[vgprValuC+43], v[vgprValuC+43], v155   // accum partials
v_add_f32 v[vgprValuC+44], v[vgprValuC+44], v156   // accum partials
v_add_f32 v[vgprValuC+45], v[vgprValuC+45], v157   // accum partials
v_add_f32 v[vgprValuC+46], v[vgprValuC+46], v158   // accum partials
v_add_f32 v[vgprValuC+47], v[vgprValuC+47], v159   // accum partials

/* group 3: v160..v167 -> ValuC+48..55 */
s_waitcnt vmcnt(9)                                 // wait C (interleaved) 9 = 13 - 3 + 0 - 1
v_add_f32 v[vgprValuC+48], v[vgprValuC+48], v160   // accum partials
v_add_f32 v[vgprValuC+49], v[vgprValuC+49], v161   // accum partials
v_add_f32 v[vgprValuC+50], v[vgprValuC+50], v162   // accum partials
v_add_f32 v[vgprValuC+51], v[vgprValuC+51], v163   // accum partials
v_add_f32 v[vgprValuC+52], v[vgprValuC+52], v164   // accum partials
v_add_f32 v[vgprValuC+53], v[vgprValuC+53], v165   // accum partials
v_add_f32 v[vgprValuC+54], v[vgprValuC+54], v166   // accum partials
v_add_f32 v[vgprValuC+55], v[vgprValuC+55], v167   // accum partials

/* group 4: v168..v175 -> ValuC+56..63 */
s_waitcnt vmcnt(8)                                 // wait C (interleaved) 8 = 13 - 4 + 0 - 1
v_add_f32 v[vgprValuC+56], v[vgprValuC+56], v168   // accum partials
v_add_f32 v[vgprValuC+57], v[vgprValuC+57], v169   // accum partials
v_add_f32 v[vgprValuC+58], v[vgprValuC+58], v170   // accum partials
v_add_f32 v[vgprValuC+59], v[vgprValuC+59], v171   // accum partials
v_add_f32 v[vgprValuC+60], v[vgprValuC+60], v172   // accum partials
v_add_f32 v[vgprValuC+61], v[vgprValuC+61], v173   // accum partials
v_add_f32 v[vgprValuC+62], v[vgprValuC+62], v174   // accum partials
v_add_f32 v[vgprValuC+63], v[vgprValuC+63], v175   // accum partials

/* group 5: v176..v183 -> ValuC+64..71 */
s_waitcnt vmcnt(7)                                 // wait C (interleaved) 7 = 13 - 5 + 0 - 1
v_add_f32 v[vgprValuC+64], v[vgprValuC+64], v176   // accum partials
v_add_f32 v[vgprValuC+65], v[vgprValuC+65], v177   // accum partials
v_add_f32 v[vgprValuC+66], v[vgprValuC+66], v178   // accum partials
v_add_f32 v[vgprValuC+67], v[vgprValuC+67], v179   // accum partials
v_add_f32 v[vgprValuC+68], v[vgprValuC+68], v180   // accum partials
v_add_f32 v[vgprValuC+69], v[vgprValuC+69], v181   // accum partials
v_add_f32 v[vgprValuC+70], v[vgprValuC+70], v182   // accum partials
v_add_f32 v[vgprValuC+71], v[vgprValuC+71], v183   // accum partials

/* group 6: v184..v191 -> ValuC+72..79 */
s_waitcnt vmcnt(6)                                 // wait C (interleaved) 6 = 13 - 6 + 0 - 1
v_add_f32 v[vgprValuC+72], v[vgprValuC+72], v184   // accum partials
v_add_f32 v[vgprValuC+73], v[vgprValuC+73], v185   // accum partials
v_add_f32 v[vgprValuC+74], v[vgprValuC+74], v186   // accum partials
v_add_f32 v[vgprValuC+75], v[vgprValuC+75], v187   // accum partials
v_add_f32 v[vgprValuC+76], v[vgprValuC+76], v188   // accum partials
v_add_f32 v[vgprValuC+77], v[vgprValuC+77], v189   // accum partials
v_add_f32 v[vgprValuC+78], v[vgprValuC+78], v190   // accum partials
v_add_f32 v[vgprValuC+79], v[vgprValuC+79], v191   // accum partials

/* group 7: v192..v199 -> ValuC+80..87 */
s_waitcnt vmcnt(5)                                 // wait C (interleaved) 5 = 13 - 7 + 0 - 1
v_add_f32 v[vgprValuC+80], v[vgprValuC+80], v192   // accum partials
v_add_f32 v[vgprValuC+81], v[vgprValuC+81], v193   // accum partials
v_add_f32 v[vgprValuC+82], v[vgprValuC+82], v194   // accum partials
v_add_f32 v[vgprValuC+83], v[vgprValuC+83], v195   // accum partials
v_add_f32 v[vgprValuC+84], v[vgprValuC+84], v196   // accum partials
v_add_f32 v[vgprValuC+85], v[vgprValuC+85], v197   // accum partials
v_add_f32 v[vgprValuC+86], v[vgprValuC+86], v198   // accum partials
v_add_f32 v[vgprValuC+87], v[vgprValuC+87], v199   // accum partials

/* group 8: v200..v207 -> ValuC+88..95 */
s_waitcnt vmcnt(4)                                 // wait C (interleaved) 4 = 13 - 8 + 0 - 1
v_add_f32 v[vgprValuC+88], v[vgprValuC+88], v200   // accum partials
v_add_f32 v[vgprValuC+89], v[vgprValuC+89], v201   // accum partials
v_add_f32 v[vgprValuC+90], v[vgprValuC+90], v202   // accum partials
v_add_f32 v[vgprValuC+91], v[vgprValuC+91], v203   // accum partials
v_add_f32 v[vgprValuC+92], v[vgprValuC+92], v204   // accum partials
v_add_f32 v[vgprValuC+93], v[vgprValuC+93], v205   // accum partials
v_add_f32 v[vgprValuC+94], v[vgprValuC+94], v206   // accum partials
v_add_f32 v[vgprValuC+95], v[vgprValuC+95], v207   // accum partials

/* group 9: v208..v215 -> ValuC+96..103 */
s_waitcnt vmcnt(3)                                 // wait C (interleaved) 3 = 13 - 9 + 0 - 1
v_add_f32 v[vgprValuC+96], v[vgprValuC+96], v208   // accum partials
v_add_f32 v[vgprValuC+97], v[vgprValuC+97], v209   // accum partials
v_add_f32 v[vgprValuC+98], v[vgprValuC+98], v210   // accum partials
v_add_f32 v[vgprValuC+99], v[vgprValuC+99], v211   // accum partials
v_add_f32 v[vgprValuC+100], v[vgprValuC+100], v212 // accum partials
v_add_f32 v[vgprValuC+101], v[vgprValuC+101], v213 // accum partials
v_add_f32 v[vgprValuC+102], v[vgprValuC+102], v214 // accum partials
v_add_f32 v[vgprValuC+103], v[vgprValuC+103], v215 // accum partials

/* group 10: v216..v223 -> ValuC+104..111 */
s_waitcnt vmcnt(2)                                 // wait C (interleaved) 2 = 13 - 10 + 0 - 1
v_add_f32 v[vgprValuC+104], v[vgprValuC+104], v216 // accum partials
v_add_f32 v[vgprValuC+105], v[vgprValuC+105], v217 // accum partials
v_add_f32 v[vgprValuC+106], v[vgprValuC+106], v218 // accum partials
v_add_f32 v[vgprValuC+107], v[vgprValuC+107], v219 // accum partials
v_add_f32 v[vgprValuC+108], v[vgprValuC+108], v220 // accum partials
v_add_f32 v[vgprValuC+109], v[vgprValuC+109], v221 // accum partials
v_add_f32 v[vgprValuC+110], v[vgprValuC+110], v222 // accum partials
v_add_f32 v[vgprValuC+111], v[vgprValuC+111], v223 // accum partials

/* group 11: v224..v231 -> ValuC+112..119 */
s_waitcnt vmcnt(1)                                 // wait C (interleaved) 1 = 13 - 11 + 0 - 1
v_add_f32 v[vgprValuC+112], v[vgprValuC+112], v224 // accum partials
v_add_f32 v[vgprValuC+113], v[vgprValuC+113], v225 // accum partials
v_add_f32 v[vgprValuC+114], v[vgprValuC+114], v226 // accum partials
v_add_f32 v[vgprValuC+115], v[vgprValuC+115], v227 // accum partials
v_add_f32 v[vgprValuC+116], v[vgprValuC+116], v228 // accum partials
v_add_f32 v[vgprValuC+117], v[vgprValuC+117], v229 // accum partials
v_add_f32 v[vgprValuC+118], v[vgprValuC+118], v230 // accum partials
v_add_f32 v[vgprValuC+119], v[vgprValuC+119], v231 // accum partials

/* group 12 (last): v232..v239 -> ValuC+120..127; vmcnt(0) drains all outstanding vector-memory ops */
s_waitcnt vmcnt(0)                                 // wait C (interleaved) 0 = 13 - 12 + 0 - 1
v_add_f32 v[vgprValuC+120], v[vgprValuC+120], v232 // accum partials
v_add_f32 v[vgprValuC+121], v[vgprValuC+121], v233 // accum partials
v_add_f32 v[vgprValuC+122], v[vgprValuC+122], v234 // accum partials
v_add_f32 v[vgprValuC+123], v[vgprValuC+123], v235 // accum partials
v_add_f32 v[vgprValuC+124], v[vgprValuC+124], v236 // accum partials
v_add_f32 v[vgprValuC+125], v[vgprValuC+125], v237 // accum partials
v_add_f32 v[vgprValuC+126], v[vgprValuC+126], v238 // accum partials
v_add_f32 v[vgprValuC+127], v[vgprValuC+127], v239 // accum partials
/*
 * Write-back stage of the current fixup batch.
 *
 * Copies the 104 values in v[vgprValuC+24 .. vgprValuC+127] into accumulation
 * registers, one v_accvgpr_write_b32 per value. The destination acc index
 * advances in steps of 4:
 *   vreg[0..63]   -> acc0, acc4, ..., acc252
 *   vreg[64..103] -> acc1, acc5, ..., acc157
 * The vreg[n] numbers in the per-line comments are the generator's running
 * element index for this tile; vreg[n] lives in v[vgprValuC+24+n] here.
 * The two trailing s_nop instructions are hazard padding before the next
 * batch starts (see their individual comments).
 */
v_accvgpr_write_b32 acc0, v[vgprValuC+24]          // copy vreg[0] to acc
v_accvgpr_write_b32 acc4, v[vgprValuC+25]          // copy vreg[1] to acc
v_accvgpr_write_b32 acc8, v[vgprValuC+26]          // copy vreg[2] to acc
v_accvgpr_write_b32 acc12, v[vgprValuC+27]         // copy vreg[3] to acc
v_accvgpr_write_b32 acc16, v[vgprValuC+28]         // copy vreg[4] to acc
v_accvgpr_write_b32 acc20, v[vgprValuC+29]         // copy vreg[5] to acc
v_accvgpr_write_b32 acc24, v[vgprValuC+30]         // copy vreg[6] to acc
v_accvgpr_write_b32 acc28, v[vgprValuC+31]         // copy vreg[7] to acc
v_accvgpr_write_b32 acc32, v[vgprValuC+32]         // copy vreg[8] to acc
v_accvgpr_write_b32 acc36, v[vgprValuC+33]         // copy vreg[9] to acc
v_accvgpr_write_b32 acc40, v[vgprValuC+34]         // copy vreg[10] to acc
v_accvgpr_write_b32 acc44, v[vgprValuC+35]         // copy vreg[11] to acc
v_accvgpr_write_b32 acc48, v[vgprValuC+36]         // copy vreg[12] to acc
v_accvgpr_write_b32 acc52, v[vgprValuC+37]         // copy vreg[13] to acc
v_accvgpr_write_b32 acc56, v[vgprValuC+38]         // copy vreg[14] to acc
v_accvgpr_write_b32 acc60, v[vgprValuC+39]         // copy vreg[15] to acc
v_accvgpr_write_b32 acc64, v[vgprValuC+40]         // copy vreg[16] to acc
v_accvgpr_write_b32 acc68, v[vgprValuC+41]         // copy vreg[17] to acc
v_accvgpr_write_b32 acc72, v[vgprValuC+42]         // copy vreg[18] to acc
v_accvgpr_write_b32 acc76, v[vgprValuC+43]         // copy vreg[19] to acc
v_accvgpr_write_b32 acc80, v[vgprValuC+44]         // copy vreg[20] to acc
v_accvgpr_write_b32 acc84, v[vgprValuC+45]         // copy vreg[21] to acc
v_accvgpr_write_b32 acc88, v[vgprValuC+46]         // copy vreg[22] to acc
v_accvgpr_write_b32 acc92, v[vgprValuC+47]         // copy vreg[23] to acc
v_accvgpr_write_b32 acc96, v[vgprValuC+48]         // copy vreg[24] to acc
v_accvgpr_write_b32 acc100, v[vgprValuC+49]        // copy vreg[25] to acc
v_accvgpr_write_b32 acc104, v[vgprValuC+50]        // copy vreg[26] to acc
v_accvgpr_write_b32 acc108, v[vgprValuC+51]        // copy vreg[27] to acc
v_accvgpr_write_b32 acc112, v[vgprValuC+52]        // copy vreg[28] to acc
v_accvgpr_write_b32 acc116, v[vgprValuC+53]        // copy vreg[29] to acc
v_accvgpr_write_b32 acc120, v[vgprValuC+54]        // copy vreg[30] to acc
v_accvgpr_write_b32 acc124, v[vgprValuC+55]        // copy vreg[31] to acc
v_accvgpr_write_b32 acc128, v[vgprValuC+56]        // copy vreg[32] to acc
v_accvgpr_write_b32 acc132, v[vgprValuC+57]        // copy vreg[33] to acc
v_accvgpr_write_b32 acc136, v[vgprValuC+58]        // copy vreg[34] to acc
v_accvgpr_write_b32 acc140, v[vgprValuC+59]        // copy vreg[35] to acc
v_accvgpr_write_b32 acc144, v[vgprValuC+60]        // copy vreg[36] to acc
v_accvgpr_write_b32 acc148, v[vgprValuC+61]        // copy vreg[37] to acc
v_accvgpr_write_b32 acc152, v[vgprValuC+62]        // copy vreg[38] to acc
v_accvgpr_write_b32 acc156, v[vgprValuC+63]        // copy vreg[39] to acc
v_accvgpr_write_b32 acc160, v[vgprValuC+64]        // copy vreg[40] to acc
v_accvgpr_write_b32 acc164, v[vgprValuC+65]        // copy vreg[41] to acc
v_accvgpr_write_b32 acc168, v[vgprValuC+66]        // copy vreg[42] to acc
v_accvgpr_write_b32 acc172, v[vgprValuC+67]        // copy vreg[43] to acc
v_accvgpr_write_b32 acc176, v[vgprValuC+68]        // copy vreg[44] to acc
v_accvgpr_write_b32 acc180, v[vgprValuC+69]        // copy vreg[45] to acc
v_accvgpr_write_b32 acc184, v[vgprValuC+70]        // copy vreg[46] to acc
v_accvgpr_write_b32 acc188, v[vgprValuC+71]        // copy vreg[47] to acc
v_accvgpr_write_b32 acc192, v[vgprValuC+72]        // copy vreg[48] to acc
v_accvgpr_write_b32 acc196, v[vgprValuC+73]        // copy vreg[49] to acc
v_accvgpr_write_b32 acc200, v[vgprValuC+74]        // copy vreg[50] to acc
v_accvgpr_write_b32 acc204, v[vgprValuC+75]        // copy vreg[51] to acc
v_accvgpr_write_b32 acc208, v[vgprValuC+76]        // copy vreg[52] to acc
v_accvgpr_write_b32 acc212, v[vgprValuC+77]        // copy vreg[53] to acc
v_accvgpr_write_b32 acc216, v[vgprValuC+78]        // copy vreg[54] to acc
v_accvgpr_write_b32 acc220, v[vgprValuC+79]        // copy vreg[55] to acc
v_accvgpr_write_b32 acc224, v[vgprValuC+80]        // copy vreg[56] to acc
v_accvgpr_write_b32 acc228, v[vgprValuC+81]        // copy vreg[57] to acc
v_accvgpr_write_b32 acc232, v[vgprValuC+82]        // copy vreg[58] to acc
v_accvgpr_write_b32 acc236, v[vgprValuC+83]        // copy vreg[59] to acc
v_accvgpr_write_b32 acc240, v[vgprValuC+84]        // copy vreg[60] to acc
v_accvgpr_write_b32 acc244, v[vgprValuC+85]        // copy vreg[61] to acc
v_accvgpr_write_b32 acc248, v[vgprValuC+86]        // copy vreg[62] to acc
v_accvgpr_write_b32 acc252, v[vgprValuC+87]        // copy vreg[63] to acc
/* acc index wraps from the 4k series (acc0..acc252) to the 4k+1 series (acc1..acc157) */
v_accvgpr_write_b32 acc1, v[vgprValuC+88]          // copy vreg[64] to acc
v_accvgpr_write_b32 acc5, v[vgprValuC+89]          // copy vreg[65] to acc
v_accvgpr_write_b32 acc9, v[vgprValuC+90]          // copy vreg[66] to acc
v_accvgpr_write_b32 acc13, v[vgprValuC+91]         // copy vreg[67] to acc
v_accvgpr_write_b32 acc17, v[vgprValuC+92]         // copy vreg[68] to acc
v_accvgpr_write_b32 acc21, v[vgprValuC+93]         // copy vreg[69] to acc
v_accvgpr_write_b32 acc25, v[vgprValuC+94]         // copy vreg[70] to acc
v_accvgpr_write_b32 acc29, v[vgprValuC+95]         // copy vreg[71] to acc
v_accvgpr_write_b32 acc33, v[vgprValuC+96]         // copy vreg[72] to acc
v_accvgpr_write_b32 acc37, v[vgprValuC+97]         // copy vreg[73] to acc
v_accvgpr_write_b32 acc41, v[vgprValuC+98]         // copy vreg[74] to acc
v_accvgpr_write_b32 acc45, v[vgprValuC+99]         // copy vreg[75] to acc
v_accvgpr_write_b32 acc49, v[vgprValuC+100]        // copy vreg[76] to acc
v_accvgpr_write_b32 acc53, v[vgprValuC+101]        // copy vreg[77] to acc
v_accvgpr_write_b32 acc57, v[vgprValuC+102]        // copy vreg[78] to acc
v_accvgpr_write_b32 acc61, v[vgprValuC+103]        // copy vreg[79] to acc
v_accvgpr_write_b32 acc65, v[vgprValuC+104]        // copy vreg[80] to acc
v_accvgpr_write_b32 acc69, v[vgprValuC+105]        // copy vreg[81] to acc
v_accvgpr_write_b32 acc73, v[vgprValuC+106]        // copy vreg[82] to acc
v_accvgpr_write_b32 acc77, v[vgprValuC+107]        // copy vreg[83] to acc
v_accvgpr_write_b32 acc81, v[vgprValuC+108]        // copy vreg[84] to acc
v_accvgpr_write_b32 acc85, v[vgprValuC+109]        // copy vreg[85] to acc
v_accvgpr_write_b32 acc89, v[vgprValuC+110]        // copy vreg[86] to acc
v_accvgpr_write_b32 acc93, v[vgprValuC+111]        // copy vreg[87] to acc
v_accvgpr_write_b32 acc97, v[vgprValuC+112]        // copy vreg[88] to acc
v_accvgpr_write_b32 acc101, v[vgprValuC+113]       // copy vreg[89] to acc
v_accvgpr_write_b32 acc105, v[vgprValuC+114]       // copy vreg[90] to acc
v_accvgpr_write_b32 acc109, v[vgprValuC+115]       // copy vreg[91] to acc
v_accvgpr_write_b32 acc113, v[vgprValuC+116]       // copy vreg[92] to acc
v_accvgpr_write_b32 acc117, v[vgprValuC+117]       // copy vreg[93] to acc
v_accvgpr_write_b32 acc121, v[vgprValuC+118]       // copy vreg[94] to acc
v_accvgpr_write_b32 acc125, v[vgprValuC+119]       // copy vreg[95] to acc
v_accvgpr_write_b32 acc129, v[vgprValuC+120]       // copy vreg[96] to acc
v_accvgpr_write_b32 acc133, v[vgprValuC+121]       // copy vreg[97] to acc
v_accvgpr_write_b32 acc137, v[vgprValuC+122]       // copy vreg[98] to acc
v_accvgpr_write_b32 acc141, v[vgprValuC+123]       // copy vreg[99] to acc
v_accvgpr_write_b32 acc145, v[vgprValuC+124]       // copy vreg[100] to acc
v_accvgpr_write_b32 acc149, v[vgprValuC+125]       // copy vreg[101] to acc
v_accvgpr_write_b32 acc153, v[vgprValuC+126]       // copy vreg[102] to acc
v_accvgpr_write_b32 acc157, v[vgprValuC+127]       // copy vreg[103] to acc
/* hazard padding: s_nop N idles for N+1 wait states (2 + 1 = 3 in total here) */
s_nop 1                                            // 2 wait states required before reading vgpr
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */

/******************************************/
/* Fixup Batch #1 (d1,d0,vc1,vc0) =       */
/*      (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8); (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8); (0,0,24,0:vw8); (0,0,25,0:vw8) */
/******************************************/

/*
 * Load stage of fixup batch #1.
 *
 * Issues 26 buffer_load_dwordx4 instructions (13 pairs) through the buffer
 * resource descriptor s[sgprSrdWS:sgprSrdWS+3], filling v136..v239 (104
 * dwords). For each of the 13 elements listed in the batch header:
 *   - s78 (the scalar offset operand of the loads) is advanced by 8192;
 *   - two 16-byte loads at offset:0 and offset:16 fetch 8 consecutive dwords,
 *     matching the vw8 width in the header.
 * v18 supplies the per-lane offset (offen). No s_waitcnt is issued here; the
 * loads are left in flight and consumed later under vmcnt waits.
 * NOTE(review): "WS" is presumably the StreamK workspace holding partial
 * tiles -- confirm against the generator/host code.
 */
/* calc coords, apply mask, and issue loads (if necessary) */
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[136:139], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[140:143], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[144:147], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[148:151], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[152:155], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[156:159], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[160:163], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[164:167], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[168:171], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[172:175], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[176:179], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[180:183], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[184:187], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[188:191], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[192:195], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[196:199], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[200:203], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[204:207], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[208:211], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[212:215], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[216:219], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[220:223], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[224:227], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[228:231], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[232:235], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS
buffer_load_dwordx4 v[236:239], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS
/*
 * Read stage of fixup batch #1.
 *
 * Copies 104 accumulation registers into v[vgprValuC+24 .. vgprValuC+127],
 * one v_accvgpr_read_b32 per value, so they can be combined with the values
 * being loaded into v136..v239. The source acc index advances in steps of 4
 * and moves to the next series after passing acc25x:
 *   vreg[104..127] <- acc161, acc165, ..., acc253   (24 values)
 *   vreg[128..191] <- acc2,   acc6,   ..., acc254   (64 values)
 *   vreg[192..207] <- acc3,   acc7,   ..., acc63    (16 values)
 * The vreg[n] numbers in the per-line comments are the generator's running
 * element index for this tile; in this batch vreg[n] is placed in
 * v[vgprValuC+24+(n-104)].
 */
v_accvgpr_read_b32 v[vgprValuC+24], acc161         // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+25], acc165         // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+26], acc169         // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+27], acc173         // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+28], acc177         // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+29], acc181         // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+30], acc185         // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+31], acc189         // copy acc to vreg[111]
v_accvgpr_read_b32 v[vgprValuC+32], acc193         // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+33], acc197         // copy acc to vreg[113]
v_accvgpr_read_b32 v[vgprValuC+34], acc201         // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+35], acc205         // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+36], acc209         // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+37], acc213         // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+38], acc217         // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+39], acc221         // copy acc to vreg[119]
v_accvgpr_read_b32 v[vgprValuC+40], acc225         // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+41], acc229         // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+42], acc233         // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+43], acc237         // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+44], acc241         // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+45], acc245         // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+46], acc249         // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+47], acc253         // copy acc to vreg[127]
/* series 4k+2: acc2..acc254 */
v_accvgpr_read_b32 v[vgprValuC+48], acc2           // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+49], acc6           // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+50], acc10          // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+51], acc14          // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+52], acc18          // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+53], acc22          // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+54], acc26          // copy acc to vreg[134]
v_accvgpr_read_b32 v[vgprValuC+55], acc30          // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+56], acc34          // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+57], acc38          // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+58], acc42          // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+59], acc46          // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+60], acc50          // copy acc to vreg[140]
v_accvgpr_read_b32 v[vgprValuC+61], acc54          // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+62], acc58          // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+63], acc62          // copy acc to vreg[143]
v_accvgpr_read_b32 v[vgprValuC+64], acc66          // copy acc to vreg[144]
v_accvgpr_read_b32 v[vgprValuC+65], acc70          // copy acc to vreg[145]
v_accvgpr_read_b32 v[vgprValuC+66], acc74          // copy acc to vreg[146]
v_accvgpr_read_b32 v[vgprValuC+67], acc78          // copy acc to vreg[147]
v_accvgpr_read_b32 v[vgprValuC+68], acc82          // copy acc to vreg[148]
v_accvgpr_read_b32 v[vgprValuC+69], acc86          // copy acc to vreg[149]
v_accvgpr_read_b32 v[vgprValuC+70], acc90          // copy acc to vreg[150]
v_accvgpr_read_b32 v[vgprValuC+71], acc94          // copy acc to vreg[151]
v_accvgpr_read_b32 v[vgprValuC+72], acc98          // copy acc to vreg[152]
v_accvgpr_read_b32 v[vgprValuC+73], acc102         // copy acc to vreg[153]
v_accvgpr_read_b32 v[vgprValuC+74], acc106         // copy acc to vreg[154]
v_accvgpr_read_b32 v[vgprValuC+75], acc110         // copy acc to vreg[155]
v_accvgpr_read_b32 v[vgprValuC+76], acc114         // copy acc to vreg[156]
v_accvgpr_read_b32 v[vgprValuC+77], acc118         // copy acc to vreg[157]
v_accvgpr_read_b32 v[vgprValuC+78], acc122         // copy acc to vreg[158]
v_accvgpr_read_b32 v[vgprValuC+79], acc126         // copy acc to vreg[159]
v_accvgpr_read_b32 v[vgprValuC+80], acc130         // copy acc to vreg[160]
v_accvgpr_read_b32 v[vgprValuC+81], acc134         // copy acc to vreg[161]
v_accvgpr_read_b32 v[vgprValuC+82], acc138         // copy acc to vreg[162]
v_accvgpr_read_b32 v[vgprValuC+83], acc142         // copy acc to vreg[163]
v_accvgpr_read_b32 v[vgprValuC+84], acc146         // copy acc to vreg[164]
v_accvgpr_read_b32 v[vgprValuC+85], acc150         // copy acc to vreg[165]
v_accvgpr_read_b32 v[vgprValuC+86], acc154         // copy acc to vreg[166]
v_accvgpr_read_b32 v[vgprValuC+87], acc158         // copy acc to vreg[167]
v_accvgpr_read_b32 v[vgprValuC+88], acc162         // copy acc to vreg[168]
v_accvgpr_read_b32 v[vgprValuC+89], acc166         // copy acc to vreg[169]
v_accvgpr_read_b32 v[vgprValuC+90], acc170         // copy acc to vreg[170]
v_accvgpr_read_b32 v[vgprValuC+91], acc174         // copy acc to vreg[171]
v_accvgpr_read_b32 v[vgprValuC+92], acc178         // copy acc to vreg[172]
v_accvgpr_read_b32 v[vgprValuC+93], acc182         // copy acc to vreg[173]
v_accvgpr_read_b32 v[vgprValuC+94], acc186         // copy acc to vreg[174]
v_accvgpr_read_b32 v[vgprValuC+95], acc190         // copy acc to vreg[175]
v_accvgpr_read_b32 v[vgprValuC+96], acc194         // copy acc to vreg[176]
v_accvgpr_read_b32 v[vgprValuC+97], acc198         // copy acc to vreg[177]
v_accvgpr_read_b32 v[vgprValuC+98], acc202         // copy acc to vreg[178]
v_accvgpr_read_b32 v[vgprValuC+99], acc206         // copy acc to vreg[179]
v_accvgpr_read_b32 v[vgprValuC+100], acc210        // copy acc to vreg[180]
v_accvgpr_read_b32 v[vgprValuC+101], acc214        // copy acc to vreg[181]
v_accvgpr_read_b32 v[vgprValuC+102], acc218        // copy acc to vreg[182]
v_accvgpr_read_b32 v[vgprValuC+103], acc222        // copy acc to vreg[183]
v_accvgpr_read_b32 v[vgprValuC+104], acc226        // copy acc to vreg[184]
v_accvgpr_read_b32 v[vgprValuC+105], acc230        // copy acc to vreg[185]
v_accvgpr_read_b32 v[vgprValuC+106], acc234        // copy acc to vreg[186]
v_accvgpr_read_b32 v[vgprValuC+107], acc238        // copy acc to vreg[187]
v_accvgpr_read_b32 v[vgprValuC+108], acc242        // copy acc to vreg[188]
v_accvgpr_read_b32 v[vgprValuC+109], acc246        // copy acc to vreg[189]
v_accvgpr_read_b32 v[vgprValuC+110], acc250        // copy acc to vreg[190]
v_accvgpr_read_b32 v[vgprValuC+111], acc254        // copy acc to vreg[191]
/* series 4k+3: acc3..acc63 */
v_accvgpr_read_b32 v[vgprValuC+112], acc3          // copy acc to vreg[192]
v_accvgpr_read_b32 v[vgprValuC+113], acc7          // copy acc to vreg[193]
v_accvgpr_read_b32 v[vgprValuC+114], acc11         // copy acc to vreg[194]
v_accvgpr_read_b32 v[vgprValuC+115], acc15         // copy acc to vreg[195]
v_accvgpr_read_b32 v[vgprValuC+116], acc19         // copy acc to vreg[196]
v_accvgpr_read_b32 v[vgprValuC+117], acc23         // copy acc to vreg[197]
v_accvgpr_read_b32 v[vgprValuC+118], acc27         // copy acc to vreg[198]
v_accvgpr_read_b32 v[vgprValuC+119], acc31         // copy acc to vreg[199]
v_accvgpr_read_b32 v[vgprValuC+120], acc35         // copy acc to vreg[200]
v_accvgpr_read_b32 v[vgprValuC+121], acc39         // copy acc to vreg[201]
v_accvgpr_read_b32 v[vgprValuC+122], acc43         // copy acc to vreg[202]
v_accvgpr_read_b32 v[vgprValuC+123], acc47         // copy acc to vreg[203]
v_accvgpr_read_b32 v[vgprValuC+124], acc51         // copy acc to vreg[204]
v_accvgpr_read_b32 v[vgprValuC+125], acc55         // copy acc to vreg[205]
v_accvgpr_read_b32 v[vgprValuC+126], acc59         // copy acc to vreg[206]
v_accvgpr_read_b32 v[vgprValuC+127], acc63         // copy acc to vreg[207]
/* hazard padding: s_nop 1 idles for 2 wait states before the copied VGPRs are consumed */
s_nop 1                                            // 2 wait states required before reading vgpr

/* apply mask, calc new C and issue writes */
v_mov_b32 v14, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v15, 0x7fff0000                          // fp32 Nan
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16

s_waitcnt vmcnt(12)                                // wait C (interleaved) 12 = 13 - 0 + 0 - 1
v_add_f32 v[vgprValuC+24], v[vgprValuC+24], v136   // accum partials
v_add_f32 v[vgprValuC+25], v[vgprValuC+25], v137   // accum partials
v_add_f32 v[vgprValuC+26], v[vgprValuC+26], v138   // accum partials
v_add_f32 v[vgprValuC+27], v[vgprValuC+27], v139   // accum partials
v_add_f32 v[vgprValuC+28], v[vgprValuC+28], v140   // accum partials
v_add_f32 v[vgprValuC+29], v[vgprValuC+29], v141   // accum partials
v_add_f32 v[vgprValuC+30], v[vgprValuC+30], v142   // accum partials
v_add_f32 v[vgprValuC+31], v[vgprValuC+31], v143   // accum partials

s_waitcnt vmcnt(11)                                // wait C (interleaved) 11 = 13 - 1 + 0 - 1
v_add_f32 v[vgprValuC+32], v[vgprValuC+32], v144   // accum partials
v_add_f32 v[vgprValuC+33], v[vgprValuC+33], v145   // accum partials
v_add_f32 v[vgprValuC+34], v[vgprValuC+34], v146   // accum partials
v_add_f32 v[vgprValuC+35], v[vgprValuC+35], v147   // accum partials
v_add_f32 v[vgprValuC+36], v[vgprValuC+36], v148   // accum partials
v_add_f32 v[vgprValuC+37], v[vgprValuC+37], v149   // accum partials
v_add_f32 v[vgprValuC+38], v[vgprValuC+38], v150   // accum partials
v_add_f32 v[vgprValuC+39], v[vgprValuC+39], v151   // accum partials

s_waitcnt vmcnt(10)                                // wait C (interleaved) 10 = 13 - 2 + 0 - 1
v_add_f32 v[vgprValuC+40], v[vgprValuC+40], v152   // accum partials
v_add_f32 v[vgprValuC+41], v[vgprValuC+41], v153   // accum partials
v_add_f32 v[vgprValuC+42], v[vgprValuC+42], v154   // accum partials
v_add_f32 v[vgprValuC+43], v[vgprValuC+43], v155   // accum partials
v_add_f32 v[vgprValuC+44], v[vgprValuC+44], v156   // accum partials
v_add_f32 v[vgprValuC+45], v[vgprValuC+45], v157   // accum partials
v_add_f32 v[vgprValuC+46], v[vgprValuC+46], v158   // accum partials
v_add_f32 v[vgprValuC+47], v[vgprValuC+47], v159   // accum partials

s_waitcnt vmcnt(9)                                 // wait C (interleaved) 9 = 13 - 3 + 0 - 1
v_add_f32 v[vgprValuC+48], v[vgprValuC+48], v160   // accum partials
v_add_f32 v[vgprValuC+49], v[vgprValuC+49], v161   // accum partials
v_add_f32 v[vgprValuC+50], v[vgprValuC+50], v162   // accum partials
v_add_f32 v[vgprValuC+51], v[vgprValuC+51], v163   // accum partials
v_add_f32 v[vgprValuC+52], v[vgprValuC+52], v164   // accum partials
v_add_f32 v[vgprValuC+53], v[vgprValuC+53], v165   // accum partials
v_add_f32 v[vgprValuC+54], v[vgprValuC+54], v166   // accum partials
v_add_f32 v[vgprValuC+55], v[vgprValuC+55], v167   // accum partials

s_waitcnt vmcnt(8)                                 // wait C (interleaved) 8 = 13 - 4 + 0 - 1
v_add_f32 v[vgprValuC+56], v[vgprValuC+56], v168   // accum partials
v_add_f32 v[vgprValuC+57], v[vgprValuC+57], v169   // accum partials
v_add_f32 v[vgprValuC+58], v[vgprValuC+58], v170   // accum partials
v_add_f32 v[vgprValuC+59], v[vgprValuC+59], v171   // accum partials
v_add_f32 v[vgprValuC+60], v[vgprValuC+60], v172   // accum partials
v_add_f32 v[vgprValuC+61], v[vgprValuC+61], v173   // accum partials
v_add_f32 v[vgprValuC+62], v[vgprValuC+62], v174   // accum partials
v_add_f32 v[vgprValuC+63], v[vgprValuC+63], v175   // accum partials

s_waitcnt vmcnt(7)                                 // wait C (interleaved) 7 = 13 - 5 + 0 - 1
v_add_f32 v[vgprValuC+64], v[vgprValuC+64], v176   // accum partials
v_add_f32 v[vgprValuC+65], v[vgprValuC+65], v177   // accum partials
v_add_f32 v[vgprValuC+66], v[vgprValuC+66], v178   // accum partials
v_add_f32 v[vgprValuC+67], v[vgprValuC+67], v179   // accum partials
v_add_f32 v[vgprValuC+68], v[vgprValuC+68], v180   // accum partials
v_add_f32 v[vgprValuC+69], v[vgprValuC+69], v181   // accum partials
v_add_f32 v[vgprValuC+70], v[vgprValuC+70], v182   // accum partials
v_add_f32 v[vgprValuC+71], v[vgprValuC+71], v183   // accum partials

s_waitcnt vmcnt(6)                                 // wait C (interleaved) 6 = 13 - 6 + 0 - 1
v_add_f32 v[vgprValuC+72], v[vgprValuC+72], v184   // accum partials
v_add_f32 v[vgprValuC+73], v[vgprValuC+73], v185   // accum partials
v_add_f32 v[vgprValuC+74], v[vgprValuC+74], v186   // accum partials
v_add_f32 v[vgprValuC+75], v[vgprValuC+75], v187   // accum partials
v_add_f32 v[vgprValuC+76], v[vgprValuC+76], v188   // accum partials
v_add_f32 v[vgprValuC+77], v[vgprValuC+77], v189   // accum partials
v_add_f32 v[vgprValuC+78], v[vgprValuC+78], v190   // accum partials
v_add_f32 v[vgprValuC+79], v[vgprValuC+79], v191   // accum partials

s_waitcnt vmcnt(5)                                 // wait C (interleaved) 5 = 13 - 7 + 0 - 1
v_add_f32 v[vgprValuC+80], v[vgprValuC+80], v192   // accum partials
v_add_f32 v[vgprValuC+81], v[vgprValuC+81], v193   // accum partials
v_add_f32 v[vgprValuC+82], v[vgprValuC+82], v194   // accum partials
v_add_f32 v[vgprValuC+83], v[vgprValuC+83], v195   // accum partials
v_add_f32 v[vgprValuC+84], v[vgprValuC+84], v196   // accum partials
v_add_f32 v[vgprValuC+85], v[vgprValuC+85], v197   // accum partials
v_add_f32 v[vgprValuC+86], v[vgprValuC+86], v198   // accum partials
v_add_f32 v[vgprValuC+87], v[vgprValuC+87], v199   // accum partials

s_waitcnt vmcnt(4)                                 // wait C (interleaved) 4 = 13 - 8 + 0 - 1
v_add_f32 v[vgprValuC+88], v[vgprValuC+88], v200   // accum partials
v_add_f32 v[vgprValuC+89], v[vgprValuC+89], v201   // accum partials
v_add_f32 v[vgprValuC+90], v[vgprValuC+90], v202   // accum partials
v_add_f32 v[vgprValuC+91], v[vgprValuC+91], v203   // accum partials
v_add_f32 v[vgprValuC+92], v[vgprValuC+92], v204   // accum partials
v_add_f32 v[vgprValuC+93], v[vgprValuC+93], v205   // accum partials
v_add_f32 v[vgprValuC+94], v[vgprValuC+94], v206   // accum partials
v_add_f32 v[vgprValuC+95], v[vgprValuC+95], v207   // accum partials

s_waitcnt vmcnt(3)                                 // wait C (interleaved) 3 = 13 - 9 + 0 - 1
v_add_f32 v[vgprValuC+96], v[vgprValuC+96], v208   // accum partials
v_add_f32 v[vgprValuC+97], v[vgprValuC+97], v209   // accum partials
v_add_f32 v[vgprValuC+98], v[vgprValuC+98], v210   // accum partials
v_add_f32 v[vgprValuC+99], v[vgprValuC+99], v211   // accum partials
v_add_f32 v[vgprValuC+100], v[vgprValuC+100], v212 // accum partials
v_add_f32 v[vgprValuC+101], v[vgprValuC+101], v213 // accum partials
v_add_f32 v[vgprValuC+102], v[vgprValuC+102], v214 // accum partials
v_add_f32 v[vgprValuC+103], v[vgprValuC+103], v215 // accum partials

s_waitcnt vmcnt(2)                                 // wait C (interleaved) 2 = 13 - 10 + 0 - 1
v_add_f32 v[vgprValuC+104], v[vgprValuC+104], v216 // accum partials
v_add_f32 v[vgprValuC+105], v[vgprValuC+105], v217 // accum partials
v_add_f32 v[vgprValuC+106], v[vgprValuC+106], v218 // accum partials
v_add_f32 v[vgprValuC+107], v[vgprValuC+107], v219 // accum partials
v_add_f32 v[vgprValuC+108], v[vgprValuC+108], v220 // accum partials
v_add_f32 v[vgprValuC+109], v[vgprValuC+109], v221 // accum partials
v_add_f32 v[vgprValuC+110], v[vgprValuC+110], v222 // accum partials
v_add_f32 v[vgprValuC+111], v[vgprValuC+111], v223 // accum partials

s_waitcnt vmcnt(1)                                 // wait C (interleaved) 1 = 13 - 11 + 0 - 1
v_add_f32 v[vgprValuC+112], v[vgprValuC+112], v224 // accum partials
v_add_f32 v[vgprValuC+113], v[vgprValuC+113], v225 // accum partials
v_add_f32 v[vgprValuC+114], v[vgprValuC+114], v226 // accum partials
v_add_f32 v[vgprValuC+115], v[vgprValuC+115], v227 // accum partials
v_add_f32 v[vgprValuC+116], v[vgprValuC+116], v228 // accum partials
v_add_f32 v[vgprValuC+117], v[vgprValuC+117], v229 // accum partials
v_add_f32 v[vgprValuC+118], v[vgprValuC+118], v230 // accum partials
v_add_f32 v[vgprValuC+119], v[vgprValuC+119], v231 // accum partials

s_waitcnt vmcnt(0)                                 // wait C (interleaved) 0 = 13 - 12 + 0 - 1
v_add_f32 v[vgprValuC+120], v[vgprValuC+120], v232 // accum partials
v_add_f32 v[vgprValuC+121], v[vgprValuC+121], v233 // accum partials
v_add_f32 v[vgprValuC+122], v[vgprValuC+122], v234 // accum partials
v_add_f32 v[vgprValuC+123], v[vgprValuC+123], v235 // accum partials
v_add_f32 v[vgprValuC+124], v[vgprValuC+124], v236 // accum partials
v_add_f32 v[vgprValuC+125], v[vgprValuC+125], v237 // accum partials
v_add_f32 v[vgprValuC+126], v[vgprValuC+126], v238 // accum partials
v_add_f32 v[vgprValuC+127], v[vgprValuC+127], v239 // accum partials
v_accvgpr_write_b32 acc161, v[vgprValuC+24]        // copy vreg[104] to acc
v_accvgpr_write_b32 acc165, v[vgprValuC+25]        // copy vreg[105] to acc
v_accvgpr_write_b32 acc169, v[vgprValuC+26]        // copy vreg[106] to acc
v_accvgpr_write_b32 acc173, v[vgprValuC+27]        // copy vreg[107] to acc
v_accvgpr_write_b32 acc177, v[vgprValuC+28]        // copy vreg[108] to acc
v_accvgpr_write_b32 acc181, v[vgprValuC+29]        // copy vreg[109] to acc
v_accvgpr_write_b32 acc185, v[vgprValuC+30]        // copy vreg[110] to acc
v_accvgpr_write_b32 acc189, v[vgprValuC+31]        // copy vreg[111] to acc
v_accvgpr_write_b32 acc193, v[vgprValuC+32]        // copy vreg[112] to acc
v_accvgpr_write_b32 acc197, v[vgprValuC+33]        // copy vreg[113] to acc
v_accvgpr_write_b32 acc201, v[vgprValuC+34]        // copy vreg[114] to acc
v_accvgpr_write_b32 acc205, v[vgprValuC+35]        // copy vreg[115] to acc
v_accvgpr_write_b32 acc209, v[vgprValuC+36]        // copy vreg[116] to acc
v_accvgpr_write_b32 acc213, v[vgprValuC+37]        // copy vreg[117] to acc
v_accvgpr_write_b32 acc217, v[vgprValuC+38]        // copy vreg[118] to acc
v_accvgpr_write_b32 acc221, v[vgprValuC+39]        // copy vreg[119] to acc
v_accvgpr_write_b32 acc225, v[vgprValuC+40]        // copy vreg[120] to acc
v_accvgpr_write_b32 acc229, v[vgprValuC+41]        // copy vreg[121] to acc
v_accvgpr_write_b32 acc233, v[vgprValuC+42]        // copy vreg[122] to acc
v_accvgpr_write_b32 acc237, v[vgprValuC+43]        // copy vreg[123] to acc
v_accvgpr_write_b32 acc241, v[vgprValuC+44]        // copy vreg[124] to acc
v_accvgpr_write_b32 acc245, v[vgprValuC+45]        // copy vreg[125] to acc
v_accvgpr_write_b32 acc249, v[vgprValuC+46]        // copy vreg[126] to acc
v_accvgpr_write_b32 acc253, v[vgprValuC+47]        // copy vreg[127] to acc
v_accvgpr_write_b32 acc2, v[vgprValuC+48]          // copy vreg[128] to acc
v_accvgpr_write_b32 acc6, v[vgprValuC+49]          // copy vreg[129] to acc
v_accvgpr_write_b32 acc10, v[vgprValuC+50]         // copy vreg[130] to acc
v_accvgpr_write_b32 acc14, v[vgprValuC+51]         // copy vreg[131] to acc
v_accvgpr_write_b32 acc18, v[vgprValuC+52]         // copy vreg[132] to acc
v_accvgpr_write_b32 acc22, v[vgprValuC+53]         // copy vreg[133] to acc
v_accvgpr_write_b32 acc26, v[vgprValuC+54]         // copy vreg[134] to acc
v_accvgpr_write_b32 acc30, v[vgprValuC+55]         // copy vreg[135] to acc
v_accvgpr_write_b32 acc34, v[vgprValuC+56]         // copy vreg[136] to acc
v_accvgpr_write_b32 acc38, v[vgprValuC+57]         // copy vreg[137] to acc
v_accvgpr_write_b32 acc42, v[vgprValuC+58]         // copy vreg[138] to acc
v_accvgpr_write_b32 acc46, v[vgprValuC+59]         // copy vreg[139] to acc
v_accvgpr_write_b32 acc50, v[vgprValuC+60]         // copy vreg[140] to acc
v_accvgpr_write_b32 acc54, v[vgprValuC+61]         // copy vreg[141] to acc
v_accvgpr_write_b32 acc58, v[vgprValuC+62]         // copy vreg[142] to acc
v_accvgpr_write_b32 acc62, v[vgprValuC+63]         // copy vreg[143] to acc
v_accvgpr_write_b32 acc66, v[vgprValuC+64]         // copy vreg[144] to acc
v_accvgpr_write_b32 acc70, v[vgprValuC+65]         // copy vreg[145] to acc
v_accvgpr_write_b32 acc74, v[vgprValuC+66]         // copy vreg[146] to acc
v_accvgpr_write_b32 acc78, v[vgprValuC+67]         // copy vreg[147] to acc
v_accvgpr_write_b32 acc82, v[vgprValuC+68]         // copy vreg[148] to acc
v_accvgpr_write_b32 acc86, v[vgprValuC+69]         // copy vreg[149] to acc
v_accvgpr_write_b32 acc90, v[vgprValuC+70]         // copy vreg[150] to acc
v_accvgpr_write_b32 acc94, v[vgprValuC+71]         // copy vreg[151] to acc
v_accvgpr_write_b32 acc98, v[vgprValuC+72]         // copy vreg[152] to acc
v_accvgpr_write_b32 acc102, v[vgprValuC+73]        // copy vreg[153] to acc
v_accvgpr_write_b32 acc106, v[vgprValuC+74]        // copy vreg[154] to acc
v_accvgpr_write_b32 acc110, v[vgprValuC+75]        // copy vreg[155] to acc
v_accvgpr_write_b32 acc114, v[vgprValuC+76]        // copy vreg[156] to acc
v_accvgpr_write_b32 acc118, v[vgprValuC+77]        // copy vreg[157] to acc
v_accvgpr_write_b32 acc122, v[vgprValuC+78]        // copy vreg[158] to acc
v_accvgpr_write_b32 acc126, v[vgprValuC+79]        // copy vreg[159] to acc
v_accvgpr_write_b32 acc130, v[vgprValuC+80]        // copy vreg[160] to acc
v_accvgpr_write_b32 acc134, v[vgprValuC+81]        // copy vreg[161] to acc
v_accvgpr_write_b32 acc138, v[vgprValuC+82]        // copy vreg[162] to acc
v_accvgpr_write_b32 acc142, v[vgprValuC+83]        // copy vreg[163] to acc
v_accvgpr_write_b32 acc146, v[vgprValuC+84]        // copy vreg[164] to acc
v_accvgpr_write_b32 acc150, v[vgprValuC+85]        // copy vreg[165] to acc
v_accvgpr_write_b32 acc154, v[vgprValuC+86]        // copy vreg[166] to acc
v_accvgpr_write_b32 acc158, v[vgprValuC+87]        // copy vreg[167] to acc
v_accvgpr_write_b32 acc162, v[vgprValuC+88]        // copy vreg[168] to acc
v_accvgpr_write_b32 acc166, v[vgprValuC+89]        // copy vreg[169] to acc
v_accvgpr_write_b32 acc170, v[vgprValuC+90]        // copy vreg[170] to acc
v_accvgpr_write_b32 acc174, v[vgprValuC+91]        // copy vreg[171] to acc
v_accvgpr_write_b32 acc178, v[vgprValuC+92]        // copy vreg[172] to acc
v_accvgpr_write_b32 acc182, v[vgprValuC+93]        // copy vreg[173] to acc
v_accvgpr_write_b32 acc186, v[vgprValuC+94]        // copy vreg[174] to acc
v_accvgpr_write_b32 acc190, v[vgprValuC+95]        // copy vreg[175] to acc
v_accvgpr_write_b32 acc194, v[vgprValuC+96]        // copy vreg[176] to acc
v_accvgpr_write_b32 acc198, v[vgprValuC+97]        // copy vreg[177] to acc
v_accvgpr_write_b32 acc202, v[vgprValuC+98]        // copy vreg[178] to acc
v_accvgpr_write_b32 acc206, v[vgprValuC+99]        // copy vreg[179] to acc
v_accvgpr_write_b32 acc210, v[vgprValuC+100]       // copy vreg[180] to acc
v_accvgpr_write_b32 acc214, v[vgprValuC+101]       // copy vreg[181] to acc
v_accvgpr_write_b32 acc218, v[vgprValuC+102]       // copy vreg[182] to acc
v_accvgpr_write_b32 acc222, v[vgprValuC+103]       // copy vreg[183] to acc
v_accvgpr_write_b32 acc226, v[vgprValuC+104]       // copy vreg[184] to acc
v_accvgpr_write_b32 acc230, v[vgprValuC+105]       // copy vreg[185] to acc
v_accvgpr_write_b32 acc234, v[vgprValuC+106]       // copy vreg[186] to acc
v_accvgpr_write_b32 acc238, v[vgprValuC+107]       // copy vreg[187] to acc
v_accvgpr_write_b32 acc242, v[vgprValuC+108]       // copy vreg[188] to acc
v_accvgpr_write_b32 acc246, v[vgprValuC+109]       // copy vreg[189] to acc
v_accvgpr_write_b32 acc250, v[vgprValuC+110]       // copy vreg[190] to acc
v_accvgpr_write_b32 acc254, v[vgprValuC+111]       // copy vreg[191] to acc
v_accvgpr_write_b32 acc3, v[vgprValuC+112]         // copy vreg[192] to acc
v_accvgpr_write_b32 acc7, v[vgprValuC+113]         // copy vreg[193] to acc
v_accvgpr_write_b32 acc11, v[vgprValuC+114]        // copy vreg[194] to acc
v_accvgpr_write_b32 acc15, v[vgprValuC+115]        // copy vreg[195] to acc
v_accvgpr_write_b32 acc19, v[vgprValuC+116]        // copy vreg[196] to acc
v_accvgpr_write_b32 acc23, v[vgprValuC+117]        // copy vreg[197] to acc
v_accvgpr_write_b32 acc27, v[vgprValuC+118]        // copy vreg[198] to acc
v_accvgpr_write_b32 acc31, v[vgprValuC+119]        // copy vreg[199] to acc
v_accvgpr_write_b32 acc35, v[vgprValuC+120]        // copy vreg[200] to acc
v_accvgpr_write_b32 acc39, v[vgprValuC+121]        // copy vreg[201] to acc
v_accvgpr_write_b32 acc43, v[vgprValuC+122]        // copy vreg[202] to acc
v_accvgpr_write_b32 acc47, v[vgprValuC+123]        // copy vreg[203] to acc
v_accvgpr_write_b32 acc51, v[vgprValuC+124]        // copy vreg[204] to acc
v_accvgpr_write_b32 acc55, v[vgprValuC+125]        // copy vreg[205] to acc
v_accvgpr_write_b32 acc59, v[vgprValuC+126]        // copy vreg[206] to acc
v_accvgpr_write_b32 acc63, v[vgprValuC+127]        // copy vreg[207] to acc
s_nop 1                                            // 2 wait states required before reading vgpr
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */

/******************************************/
/* Fixup Batch #2 (d1,d0,vc1,vc0) =       */
/*      (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
// Issues the workspace (WS) loads for the six vw8 elements listed in the header.
// Per element: s78 is advanced by 8192, then two buffer_load_dwordx4
// (offset:0 and offset:16) fetch 8 consecutive dwords through the sgprSrdWS
// descriptor at per-lane offset v18. Destinations are v[72:119], eight VGPRs
// per element, so 12 load instructions are issued by this section.
// NOTE(review): 8192 is presumably the byte stride between consecutive vc1
// rows of the partial tile in the workspace -- confirm against the generator.
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[72:75], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS (element 0, dwords 0-3)
buffer_load_dwordx4 v[76:79], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS (element 0, dwords 4-7)
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[80:83], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS (element 1, dwords 0-3)
buffer_load_dwordx4 v[84:87], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS (element 1, dwords 4-7)
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[88:91], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS (element 2, dwords 0-3)
buffer_load_dwordx4 v[92:95], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS (element 2, dwords 4-7)
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[96:99], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS (element 3, dwords 0-3)
buffer_load_dwordx4 v[100:103], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS (element 3, dwords 4-7)
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[104:107], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS (element 4, dwords 0-3)
buffer_load_dwordx4 v[108:111], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS (element 4, dwords 4-7)
s_add_u32 s78, s78, 8192                           // Inc sgpr offset
buffer_load_dwordx4 v[112:115], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:0 // load WS (element 5, dwords 0-3)
buffer_load_dwordx4 v[116:119], v18, s[sgprSrdWS:sgprSrdWS+3], s78 offen offset:16 // load WS (element 5, dwords 4-7)
// Stage this batch's running sums in ordinary VGPRs: copy acc67, acc71, ...,
// acc255 (every 4th accumulator, 48 registers in total) into
// v[vgprValuC+24 .. vgprValuC+71]. The v_add_f32 sequence that follows in the
// file adds the loaded workspace partials to these VGPRs.
v_accvgpr_read_b32 v[vgprValuC+24], acc67          // copy acc to vreg[208]
v_accvgpr_read_b32 v[vgprValuC+25], acc71          // copy acc to vreg[209]
v_accvgpr_read_b32 v[vgprValuC+26], acc75          // copy acc to vreg[210]
v_accvgpr_read_b32 v[vgprValuC+27], acc79          // copy acc to vreg[211]
v_accvgpr_read_b32 v[vgprValuC+28], acc83          // copy acc to vreg[212]
v_accvgpr_read_b32 v[vgprValuC+29], acc87          // copy acc to vreg[213]
v_accvgpr_read_b32 v[vgprValuC+30], acc91          // copy acc to vreg[214]
v_accvgpr_read_b32 v[vgprValuC+31], acc95          // copy acc to vreg[215]
v_accvgpr_read_b32 v[vgprValuC+32], acc99          // copy acc to vreg[216]
v_accvgpr_read_b32 v[vgprValuC+33], acc103         // copy acc to vreg[217]
v_accvgpr_read_b32 v[vgprValuC+34], acc107         // copy acc to vreg[218]
v_accvgpr_read_b32 v[vgprValuC+35], acc111         // copy acc to vreg[219]
v_accvgpr_read_b32 v[vgprValuC+36], acc115         // copy acc to vreg[220]
v_accvgpr_read_b32 v[vgprValuC+37], acc119         // copy acc to vreg[221]
v_accvgpr_read_b32 v[vgprValuC+38], acc123         // copy acc to vreg[222]
v_accvgpr_read_b32 v[vgprValuC+39], acc127         // copy acc to vreg[223]
v_accvgpr_read_b32 v[vgprValuC+40], acc131         // copy acc to vreg[224]
v_accvgpr_read_b32 v[vgprValuC+41], acc135         // copy acc to vreg[225]
v_accvgpr_read_b32 v[vgprValuC+42], acc139         // copy acc to vreg[226]
v_accvgpr_read_b32 v[vgprValuC+43], acc143         // copy acc to vreg[227]
v_accvgpr_read_b32 v[vgprValuC+44], acc147         // copy acc to vreg[228]
v_accvgpr_read_b32 v[vgprValuC+45], acc151         // copy acc to vreg[229]
v_accvgpr_read_b32 v[vgprValuC+46], acc155         // copy acc to vreg[230]
v_accvgpr_read_b32 v[vgprValuC+47], acc159         // copy acc to vreg[231]
v_accvgpr_read_b32 v[vgprValuC+48], acc163         // copy acc to vreg[232]
v_accvgpr_read_b32 v[vgprValuC+49], acc167         // copy acc to vreg[233]
v_accvgpr_read_b32 v[vgprValuC+50], acc171         // copy acc to vreg[234]
v_accvgpr_read_b32 v[vgprValuC+51], acc175         // copy acc to vreg[235]
v_accvgpr_read_b32 v[vgprValuC+52], acc179         // copy acc to vreg[236]
v_accvgpr_read_b32 v[vgprValuC+53], acc183         // copy acc to vreg[237]
v_accvgpr_read_b32 v[vgprValuC+54], acc187         // copy acc to vreg[238]
v_accvgpr_read_b32 v[vgprValuC+55], acc191         // copy acc to vreg[239]
v_accvgpr_read_b32 v[vgprValuC+56], acc195         // copy acc to vreg[240]
v_accvgpr_read_b32 v[vgprValuC+57], acc199         // copy acc to vreg[241]
v_accvgpr_read_b32 v[vgprValuC+58], acc203         // copy acc to vreg[242]
v_accvgpr_read_b32 v[vgprValuC+59], acc207         // copy acc to vreg[243]
v_accvgpr_read_b32 v[vgprValuC+60], acc211         // copy acc to vreg[244]
v_accvgpr_read_b32 v[vgprValuC+61], acc215         // copy acc to vreg[245]
v_accvgpr_read_b32 v[vgprValuC+62], acc219         // copy acc to vreg[246]
v_accvgpr_read_b32 v[vgprValuC+63], acc223         // copy acc to vreg[247]
v_accvgpr_read_b32 v[vgprValuC+64], acc227         // copy acc to vreg[248]
v_accvgpr_read_b32 v[vgprValuC+65], acc231         // copy acc to vreg[249]
v_accvgpr_read_b32 v[vgprValuC+66], acc235         // copy acc to vreg[250]
v_accvgpr_read_b32 v[vgprValuC+67], acc239         // copy acc to vreg[251]
v_accvgpr_read_b32 v[vgprValuC+68], acc243         // copy acc to vreg[252]
v_accvgpr_read_b32 v[vgprValuC+69], acc247         // copy acc to vreg[253]
v_accvgpr_read_b32 v[vgprValuC+70], acc251         // copy acc to vreg[254]
v_accvgpr_read_b32 v[vgprValuC+71], acc255         // copy acc to vreg[255]
s_nop 1                                            // 2 wait states required before reading vgpr

/* apply mask, calc new C and issue writes */
// v14-v16 receive the bf16 conversion constants exactly as before; the adds
// in this section do not reference them.
v_mov_b32 v14, 0xffff0000                          // mask for packing two bfloat16 elements into 32 bits
v_mov_b32 v15, 0x7fff0000                          // fp32 NaN
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16

// Accumulate the workspace partials (v72-v119) into the running sums held in
// v[vgprValuC+24 .. vgprValuC+71], one vw8 element (8 floats) per group.
//
// Wait counts: every element of this batch was fetched by TWO
// buffer_load_dwordx4 instructions, so 12 loads are in flight on entry.
// vmcnt counts outstanding instructions (not elements) and these loads
// retire in issue order, so element i is fully present once at most
// 12 - 2*(i+1) loads remain outstanding. The previous per-element counts
// (5,4,3,2,1,0) were safe but drained 7 loads before the first add could
// start; the counts below wait only for the data each group actually reads.
// Loads issued earlier than this batch (if any) retire first, so they can
// only make these waits more conservative, never less.

s_waitcnt vmcnt(10)                                // wait WS element 0 (v72-v79): 10 = 12 - 2*(0+1)
v_add_f32 v[vgprValuC+24], v[vgprValuC+24], v72    // accum partials
v_add_f32 v[vgprValuC+25], v[vgprValuC+25], v73    // accum partials
v_add_f32 v[vgprValuC+26], v[vgprValuC+26], v74    // accum partials
v_add_f32 v[vgprValuC+27], v[vgprValuC+27], v75    // accum partials
v_add_f32 v[vgprValuC+28], v[vgprValuC+28], v76    // accum partials
v_add_f32 v[vgprValuC+29], v[vgprValuC+29], v77    // accum partials
v_add_f32 v[vgprValuC+30], v[vgprValuC+30], v78    // accum partials
v_add_f32 v[vgprValuC+31], v[vgprValuC+31], v79    // accum partials

s_waitcnt vmcnt(8)                                 // wait WS element 1 (v80-v87): 8 = 12 - 2*(1+1)
v_add_f32 v[vgprValuC+32], v[vgprValuC+32], v80    // accum partials
v_add_f32 v[vgprValuC+33], v[vgprValuC+33], v81    // accum partials
v_add_f32 v[vgprValuC+34], v[vgprValuC+34], v82    // accum partials
v_add_f32 v[vgprValuC+35], v[vgprValuC+35], v83    // accum partials
v_add_f32 v[vgprValuC+36], v[vgprValuC+36], v84    // accum partials
v_add_f32 v[vgprValuC+37], v[vgprValuC+37], v85    // accum partials
v_add_f32 v[vgprValuC+38], v[vgprValuC+38], v86    // accum partials
v_add_f32 v[vgprValuC+39], v[vgprValuC+39], v87    // accum partials

s_waitcnt vmcnt(6)                                 // wait WS element 2 (v88-v95): 6 = 12 - 2*(2+1)
v_add_f32 v[vgprValuC+40], v[vgprValuC+40], v88    // accum partials
v_add_f32 v[vgprValuC+41], v[vgprValuC+41], v89    // accum partials
v_add_f32 v[vgprValuC+42], v[vgprValuC+42], v90    // accum partials
v_add_f32 v[vgprValuC+43], v[vgprValuC+43], v91    // accum partials
v_add_f32 v[vgprValuC+44], v[vgprValuC+44], v92    // accum partials
v_add_f32 v[vgprValuC+45], v[vgprValuC+45], v93    // accum partials
v_add_f32 v[vgprValuC+46], v[vgprValuC+46], v94    // accum partials
v_add_f32 v[vgprValuC+47], v[vgprValuC+47], v95    // accum partials

s_waitcnt vmcnt(4)                                 // wait WS element 3 (v96-v103): 4 = 12 - 2*(3+1)
v_add_f32 v[vgprValuC+48], v[vgprValuC+48], v96    // accum partials
v_add_f32 v[vgprValuC+49], v[vgprValuC+49], v97    // accum partials
v_add_f32 v[vgprValuC+50], v[vgprValuC+50], v98    // accum partials
v_add_f32 v[vgprValuC+51], v[vgprValuC+51], v99    // accum partials
v_add_f32 v[vgprValuC+52], v[vgprValuC+52], v100   // accum partials
v_add_f32 v[vgprValuC+53], v[vgprValuC+53], v101   // accum partials
v_add_f32 v[vgprValuC+54], v[vgprValuC+54], v102   // accum partials
v_add_f32 v[vgprValuC+55], v[vgprValuC+55], v103   // accum partials

s_waitcnt vmcnt(2)                                 // wait WS element 4 (v104-v111): 2 = 12 - 2*(4+1)
v_add_f32 v[vgprValuC+56], v[vgprValuC+56], v104   // accum partials
v_add_f32 v[vgprValuC+57], v[vgprValuC+57], v105   // accum partials
v_add_f32 v[vgprValuC+58], v[vgprValuC+58], v106   // accum partials
v_add_f32 v[vgprValuC+59], v[vgprValuC+59], v107   // accum partials
v_add_f32 v[vgprValuC+60], v[vgprValuC+60], v108   // accum partials
v_add_f32 v[vgprValuC+61], v[vgprValuC+61], v109   // accum partials
v_add_f32 v[vgprValuC+62], v[vgprValuC+62], v110   // accum partials
v_add_f32 v[vgprValuC+63], v[vgprValuC+63], v111   // accum partials

s_waitcnt vmcnt(0)                                 // wait WS element 5 (v112-v119): 0 = 12 - 2*(5+1)
v_add_f32 v[vgprValuC+64], v[vgprValuC+64], v112   // accum partials
v_add_f32 v[vgprValuC+65], v[vgprValuC+65], v113   // accum partials
v_add_f32 v[vgprValuC+66], v[vgprValuC+66], v114   // accum partials
v_add_f32 v[vgprValuC+67], v[vgprValuC+67], v115   // accum partials
v_add_f32 v[vgprValuC+68], v[vgprValuC+68], v116   // accum partials
v_add_f32 v[vgprValuC+69], v[vgprValuC+69], v117   // accum partials
v_add_f32 v[vgprValuC+70], v[vgprValuC+70], v118   // accum partials
v_add_f32 v[vgprValuC+71], v[vgprValuC+71], v119   // accum partials
// Write the 48 updated sums from v[vgprValuC+24 .. vgprValuC+71] back into
// acc67, acc71, ..., acc255 (every 4th accumulator) -- the same accumulators
// this batch read its running sums from.
v_accvgpr_write_b32 acc67, v[vgprValuC+24]         // copy vreg[208] to acc
v_accvgpr_write_b32 acc71, v[vgprValuC+25]         // copy vreg[209] to acc
v_accvgpr_write_b32 acc75, v[vgprValuC+26]         // copy vreg[210] to acc
v_accvgpr_write_b32 acc79, v[vgprValuC+27]         // copy vreg[211] to acc
v_accvgpr_write_b32 acc83, v[vgprValuC+28]         // copy vreg[212] to acc
v_accvgpr_write_b32 acc87, v[vgprValuC+29]         // copy vreg[213] to acc
v_accvgpr_write_b32 acc91, v[vgprValuC+30]         // copy vreg[214] to acc
v_accvgpr_write_b32 acc95, v[vgprValuC+31]         // copy vreg[215] to acc
v_accvgpr_write_b32 acc99, v[vgprValuC+32]         // copy vreg[216] to acc
v_accvgpr_write_b32 acc103, v[vgprValuC+33]        // copy vreg[217] to acc
v_accvgpr_write_b32 acc107, v[vgprValuC+34]        // copy vreg[218] to acc
v_accvgpr_write_b32 acc111, v[vgprValuC+35]        // copy vreg[219] to acc
v_accvgpr_write_b32 acc115, v[vgprValuC+36]        // copy vreg[220] to acc
v_accvgpr_write_b32 acc119, v[vgprValuC+37]        // copy vreg[221] to acc
v_accvgpr_write_b32 acc123, v[vgprValuC+38]        // copy vreg[222] to acc
v_accvgpr_write_b32 acc127, v[vgprValuC+39]        // copy vreg[223] to acc
v_accvgpr_write_b32 acc131, v[vgprValuC+40]        // copy vreg[224] to acc
v_accvgpr_write_b32 acc135, v[vgprValuC+41]        // copy vreg[225] to acc
v_accvgpr_write_b32 acc139, v[vgprValuC+42]        // copy vreg[226] to acc
v_accvgpr_write_b32 acc143, v[vgprValuC+43]        // copy vreg[227] to acc
v_accvgpr_write_b32 acc147, v[vgprValuC+44]        // copy vreg[228] to acc
v_accvgpr_write_b32 acc151, v[vgprValuC+45]        // copy vreg[229] to acc
v_accvgpr_write_b32 acc155, v[vgprValuC+46]        // copy vreg[230] to acc
v_accvgpr_write_b32 acc159, v[vgprValuC+47]        // copy vreg[231] to acc
v_accvgpr_write_b32 acc163, v[vgprValuC+48]        // copy vreg[232] to acc
v_accvgpr_write_b32 acc167, v[vgprValuC+49]        // copy vreg[233] to acc
v_accvgpr_write_b32 acc171, v[vgprValuC+50]        // copy vreg[234] to acc
v_accvgpr_write_b32 acc175, v[vgprValuC+51]        // copy vreg[235] to acc
v_accvgpr_write_b32 acc179, v[vgprValuC+52]        // copy vreg[236] to acc
v_accvgpr_write_b32 acc183, v[vgprValuC+53]        // copy vreg[237] to acc
v_accvgpr_write_b32 acc187, v[vgprValuC+54]        // copy vreg[238] to acc
v_accvgpr_write_b32 acc191, v[vgprValuC+55]        // copy vreg[239] to acc
v_accvgpr_write_b32 acc195, v[vgprValuC+56]        // copy vreg[240] to acc
v_accvgpr_write_b32 acc199, v[vgprValuC+57]        // copy vreg[241] to acc
v_accvgpr_write_b32 acc203, v[vgprValuC+58]        // copy vreg[242] to acc
v_accvgpr_write_b32 acc207, v[vgprValuC+59]        // copy vreg[243] to acc
v_accvgpr_write_b32 acc211, v[vgprValuC+60]        // copy vreg[244] to acc
v_accvgpr_write_b32 acc215, v[vgprValuC+61]        // copy vreg[245] to acc
v_accvgpr_write_b32 acc219, v[vgprValuC+62]        // copy vreg[246] to acc
v_accvgpr_write_b32 acc223, v[vgprValuC+63]        // copy vreg[247] to acc
v_accvgpr_write_b32 acc227, v[vgprValuC+64]        // copy vreg[248] to acc
v_accvgpr_write_b32 acc231, v[vgprValuC+65]        // copy vreg[249] to acc
v_accvgpr_write_b32 acc235, v[vgprValuC+66]        // copy vreg[250] to acc
v_accvgpr_write_b32 acc239, v[vgprValuC+67]        // copy vreg[251] to acc
v_accvgpr_write_b32 acc243, v[vgprValuC+68]        // copy vreg[252] to acc
v_accvgpr_write_b32 acc247, v[vgprValuC+69]        // copy vreg[253] to acc
v_accvgpr_write_b32 acc251, v[vgprValuC+70]        // copy vreg[254] to acc
v_accvgpr_write_b32 acc255, v[vgprValuC+71]        // copy vreg[255] to acc
s_nop 1                                            // 2 wait states required before reading vgpr
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
// Fixup loop tail: advance to the next partial tile and loop while work remains.
//   s69  = (s65 < skExtraIters) ? SKItersPerWG + 1 : SKItersPerWG
//   s68 += s69
//   s65 += 1
//   if (s68 < ItersPerTile) goto label_SK_Fixup
// The skExtraIters compare uses s65 BEFORE it is incremented. SCC is rewritten
// by the two adds after the select, so the final s_cmp_lt_u32 must stay
// immediately before the branch.
s_add_u32 s69, s[sgprSKItersPerWG], 1              // Add extra iter
s_cmp_lt_u32 s65, s[sgprskExtraIters]              // Check if next WG had an extra iteration
s_cselect_b32 s69, s69, s[sgprSKItersPerWG]        // Select correct number of iterations for next WG
s_add_u32 s68, s68, s69                            // next partial tile iteration
s_add_u32 s65, s65, 1                              // next partial tile index
s_cmp_lt_u32 s68, s[sgprItersPerTile]              // done loading partial tiles?
s_cbranch_scc1 label_SK_Fixup                      // Branch to continue fixup loop
// Store-path entry, reached by fall-through once the fixup loop exits.
// 1) Beta != 0 -> label_GW_Beta.
// 2) Otherwise compute rMT0 = SizeI % 256 when WorkGroup0 >= NumWorkGroups0 - 1,
//    else 0. A non-zero rMT0 takes the long jump to label_GW_B0_E1_M; zero
//    continues at the dimension-J check that follows.
label_SK_Store:
s_cmpk_eq_u32 s[sgprBeta], 0                       // Beta == 0
s_cbranch_scc0 label_GW_Beta                       // Branch if Beta is not zero

s_and_b32 s78, 255, s[sgprSizeI]                   // s78 = s[sgprSizeI] % 256
s_add_u32 s79, -0x1, s[sgprNumWorkGroups0]         // s79 = nwg0 - 1
s_cmp_ge_u32 s[sgprWorkGroup0], s79                // wg0 >= nwg0-1 ?
s_cselect_b32 s78, s78, 0                          // set rMT0
s_cmpk_gt_u32 s78, 0                               // rMT0 > 0
s_cbranch_scc0 label_NoBranch_0MXDW6EW9K7ZNG8F     // Only branch on scc1
// jump if edges required
// Long jump via PC arithmetic: s[78:79] (reused, rMT0 is no longer needed)
// takes the PC, the 32-bit offset in s80 is added with carry into the high
// word, and s_setpc_b64 transfers control.
s_getpc_b64 s[78:79]                               // addr of next instr
s_add_i32 s80, label_GW_B0_E1_M, 4                 // target branch offset
s_add_u32 s78, s78, s80                            // add target branch offset
s_addc_u32 s79, s79, 0                             // add high and carry
s_setpc_b64 s[78:79]                               // branch to label_GW_B0_E1_M
// Dimension-J edge check (reached when rMT0 == 0).
// rMT1 = SizeJ % 256 when WorkGroup1 >= NumWorkGroups1 - 1, else 0. A non-zero
// rMT1 takes the long jump to label_GW_B0_E1_N; zero falls through to the
// no-edge path.
label_NoBranch_0MXDW6EW9K7ZNG8F:
s_and_b32 s78, 255, s[sgprSizeJ]                   // s78 = s[sgprSizeJ] % 256
s_add_u32 s79, -0x1, s[sgprNumWorkGroups1]         // s79 = nwg1 - 1
s_cmp_ge_u32 s[sgprWorkGroup1], s79                // wg1 >= nwg1-1
s_cselect_b32 s78, s78, 0                          // set rMT1
s_cmpk_gt_u32 s78, 0                               // rMT1 > 0
s_cbranch_scc0 label_NoBranch_IXPKU979JKZCQDH3     // Only branch on scc1
// jump if edges required
// Same PC-arithmetic long jump as the dimension-I check: s[78:79] = PC,
// plus the offset in s80 (carry propagated), then s_setpc_b64.
s_getpc_b64 s[78:79]                               // addr of next instr
s_add_i32 s80, label_GW_B0_E1_N, 4                 // target branch offset
s_add_u32 s78, s78, s80                            // add target branch offset
s_addc_u32 s79, s79, 0                             // add high and carry
s_setpc_b64 s[78:79]                               // branch to label_GW_B0_E1_N
label_NoBranch_IXPKU979JKZCQDH3:
/* label_GW_B0_E0: Beta == 0, no-edge write path -- select the activation routine.
 * s[sgprActivationType] is compared against 3 (Gelu), 5 (Relu), 10 (Silu) and
 * 12 (Clamp); any other value falls through to the "None" stub.
 * Each stub computes the address of the matching label_Activation_*_VW8 routine
 * into s[8:9] (s65 is scratch for the offset) and then joins at
 * label_ActivationSetPCAddrEnd_5. The code after the join calls the routine
 * through s[8:9] with s_swappc_b64. */
label_GW_B0_E0:
s_cmpk_eq_u32 s[sgprActivationType], 3             // activationType == 3
s_cbranch_scc1 label_To_Activation_Gelu_VW8_beta_0_edge_0 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 5             // activationType == 5
s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_0_edge_0 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 10            // activationType == 10
s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_0_edge_0 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 12            // activationType == 12
s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_0_edge_0 // Branch if true
label_To_Activation_None_VW8_beta_0_edge_0:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_None_VW8, 4        // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_5
label_To_Activation_Gelu_VW8_beta_0_edge_0:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_Gelu_VW8, 4        // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_5
label_To_Activation_Relu_VW8_beta_0_edge_0:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_Relu_VW8, 4        // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_5
label_To_Activation_Silu_VW8_beta_0_edge_0:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_Silu_VW8, 4        // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_5
label_To_Activation_Clamp_VW8_beta_0_edge_0:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_Clamp_VW8, 4       // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_5            // NOTE(review): target is the very next label, so this branch is redundant but harmless
label_ActivationSetPCAddrEnd_5:

/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=8 */
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8) */
/******************************************/

/* Batch setup:
 *   v19      = (v0 - 256*WorkGroup0) * 4, the byte offset into LDS used for the
 *              Bias / scaleAlpha reads (v0 is presumably the global coord0 of
 *              this lane -- TODO confirm against the code that sets it).
 *   v[88:95]   <- 8 Bias dwords       (LDS offsets 0 and 16)
 *   v[96:103]  <- 8 scaleAlpha dwords (LDS offsets 1024 and 1040)
 *   v17      = (v3 + v0) << 1, the byte offset used by every buffer_store in
 *              this batch.
 * The s_waitcnt + s_barrier pair precedes the first ds_read so that earlier LDS
 * writes (issued outside this view) are complete before the reads. */
/* calc coords, apply mask, and issue loads (if necessary) */
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
s_mul_i32 s68, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v19, v0, s68                             // v19 = v0 - wgp0*MT0 (offset within the tile)
v_lshlrev_b32 v19, 0x2, v19                        // Bias address scaled by BPE
s_waitcnt lgkmcnt(0)                               // Wait for LDS write
s_barrier                                          // LDS write barrier
ds_read_b128 v[88:91], v19 offset:0                // load Bias
ds_read_b128 v[92:95], v19 offset:16               // load Bias
ds_read_b128 v[96:99], v19 offset:1024             // load scaleAlpha
ds_read_b128 v[100:103], v19 offset:1040           // load scaleAlpha
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
v_add_lshl_u32 v17, v3, v0, 0x1                    // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=0, coord0Vgpr=0
/* Copy the 64 accumulator values of batch #0 into ordinary VGPRs.
 * Source registers are acc0, acc4, ..., acc252 (stride 4); destinations are the
 * consecutive registers v[vgprValuC+24] .. v[vgprValuC+87]. Each group of 8
 * destinations is one stored row (8 rows x 8 values). */
v_accvgpr_read_b32 v[vgprValuC+24], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+25], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+26], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+27], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+28], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+29], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+30], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+31], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+32], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+33], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+34], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+35], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+36], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+37], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+38], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+39], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+40], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+41], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+42], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+43], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+44], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+45], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+46], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+47], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+48], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+49], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+50], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+51], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+52], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+53], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+54], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+55], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+56], acc128         // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+57], acc132         // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+58], acc136         // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+59], acc140         // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+60], acc144         // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+61], acc148         // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+62], acc152         // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+63], acc156         // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+64], acc160         // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+65], acc164         // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+66], acc168         // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+67], acc172         // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+68], acc176         // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+69], acc180         // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+70], acc184         // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+71], acc188         // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+72], acc192         // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+73], acc196         // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+74], acc200         // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+75], acc204         // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+76], acc208         // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+77], acc212         // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+78], acc216         // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+79], acc220         // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+80], acc224         // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+81], acc228         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+82], acc232         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+83], acc236         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+84], acc240         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+85], acc244         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+86], acc248         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+87], acc252         // copy acc to vreg[63]

/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0)] */
/* Multiply all 64 batch values in v[vgprValuC+24 .. vgprValuC+87] by alpha, two
 * floats per v_pk_mul_f32. op_sel_hi:[0,1,1] clears the high-select for the SGPR
 * operand, so both packed lanes should use s[sgprAlpha] itself rather than
 * s[sgprAlpha+1] -- NOTE(review): confirm against the ISA op_sel_hi definition. */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk)

/* apply mask, calc new C and issue writes */
/* Batch #0, row 0. Per-row pipeline on v[vgprValuC+24 .. +31]:
 *   1. multiply by the per-column scaleAlpha vector in v[96:103]
 *   2. add the Bias vector v[88:95], writing the sums to v[4:11]
 *   3. call the activation routine whose address is in s[8:9]
 *      (s_swappc_b64 leaves the return address in s[66:67]); the values are
 *      passed in v[4:11] and read back from v[4:11]
 *   4. copy v[4:11] to v[24:31], pack pairs to bf16 in v[24:27]
 *   5. store 16 bytes to D at byte offset v17 (no SRD advance for the first row)
 * NOTE(review): steps 4-5 use literal v24.. names while steps 1-2 use
 * v[vgprValuC+24..]; this is only consistent if vgprValuC == 0 -- confirm.
 * NOTE(review): v14/v15/v16 are set here but are not referenced by the visible
 * store sequence, which converts with v_cvt_pk_bf16_f32; they may be consumed by
 * code outside this view. */
v_mov_b32 v14, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v15, 0x7fff0000                          // fp32 Nan
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16

s_waitcnt lgkmcnt(0)                               // wait for all 4 outstanding ds_reads: 2 (bias) + 2 (scaleAlphaVec)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[96:97], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[98:99], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], v[100:101], v[vgprValuC+28:vgprValuC+28+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], v[102:103], v[vgprValuC+30:vgprValuC+30+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call activation routine; return address saved in s[66:67]
v_mov_b64 v[24:25], v[4:5]                         // copy activation result back to the row registers
v_mov_b64 v[26:27], v[6:7]
v_mov_b64 v[28:29], v[8:9]
v_mov_b64 v[30:31], v[10:11]
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Batch #0, row 1: scale v[vgprValuC+32..39] by scaleAlpha v[96:103], add Bias
 * v[88:95] into v[4:11], call the activation routine at s[8:9], copy the result
 * to v[32:39], pack to bf16 in v[32:35], advance the D SRD base by one row
 * (StrideD1J * 2 bytes) and store 16 bytes at offset v17. */
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], v[98:99], v[vgprValuC+34:vgprValuC+34+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[100:101], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[102:103], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call activation routine; return address saved in s[66:67]
v_mov_b64 v[32:33], v[4:5]                         // copy activation result back to the row registers
v_mov_b64 v[34:35], v[6:7]
v_mov_b64 v[36:37], v[8:9]
v_mov_b64 v[38:39], v[10:11]
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Batch #0, row 2: scale v[vgprValuC+40..47] by scaleAlpha v[96:103], add Bias
 * v[88:95] into v[4:11], call the activation routine at s[8:9], copy the result
 * to v[40:47], pack to bf16 in v[40:43], advance the D SRD base by one row
 * (StrideD1J * 2 bytes) and store 16 bytes at offset v17. */
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[98:99], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], v[100:101], v[vgprValuC+44:vgprValuC+44+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], v[102:103], v[vgprValuC+46:vgprValuC+46+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call activation routine; return address saved in s[66:67]
v_mov_b64 v[40:41], v[4:5]                         // copy activation result back to the row registers
v_mov_b64 v[42:43], v[6:7]
v_mov_b64 v[44:45], v[8:9]
v_mov_b64 v[46:47], v[10:11]
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Batch #0, row 3: scale v[vgprValuC+48..55] by scaleAlpha v[96:103], add Bias
 * v[88:95] into v[4:11], call the activation routine at s[8:9], copy the result
 * to v[48:55], pack to bf16 in v[48:51], advance the D SRD base by one row
 * (StrideD1J * 2 bytes) and store 16 bytes at offset v17. */
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], v[98:99], v[vgprValuC+50:vgprValuC+50+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], v[100:101], v[vgprValuC+52:vgprValuC+52+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], v[102:103], v[vgprValuC+54:vgprValuC+54+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call activation routine; return address saved in s[66:67]
v_mov_b64 v[48:49], v[4:5]                         // copy activation result back to the row registers
v_mov_b64 v[50:51], v[6:7]
v_mov_b64 v[52:53], v[8:9]
v_mov_b64 v[54:55], v[10:11]
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Batch #0, row 4: scale v[vgprValuC+56..63] by scaleAlpha v[96:103], add Bias
 * v[88:95] into v[4:11], call the activation routine at s[8:9], copy the result
 * to v[56:63], pack to bf16 in v[56:59], advance the D SRD base by one row
 * (StrideD1J * 2 bytes) and store 16 bytes at offset v17. */
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], v[98:99], v[vgprValuC+58:vgprValuC+58+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], v[100:101], v[vgprValuC+60:vgprValuC+60+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], v[102:103], v[vgprValuC+62:vgprValuC+62+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call activation routine; return address saved in s[66:67]
v_mov_b64 v[56:57], v[4:5]                         // copy activation result back to the row registers
v_mov_b64 v[58:59], v[6:7]
v_mov_b64 v[60:61], v[8:9]
v_mov_b64 v[62:63], v[10:11]
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Batch #0, row 5: scale v[vgprValuC+64..71] by scaleAlpha v[96:103], add Bias
 * v[88:95] into v[4:11], call the activation routine at s[8:9], copy the result
 * to v[64:71], pack to bf16 in v[64:67], advance the D SRD base by one row
 * (StrideD1J * 2 bytes) and store 16 bytes at offset v17. */
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], v[98:99], v[vgprValuC+66:vgprValuC+66+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], v[100:101], v[vgprValuC+68:vgprValuC+68+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], v[102:103], v[vgprValuC+70:vgprValuC+70+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call activation routine; return address saved in s[66:67]
v_mov_b64 v[64:65], v[4:5]                         // copy activation result back to the row registers
v_mov_b64 v[66:67], v[6:7]
v_mov_b64 v[68:69], v[8:9]
v_mov_b64 v[70:71], v[10:11]
v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Batch #0, row 6: scale v[vgprValuC+72..79] by scaleAlpha v[96:103], add Bias
 * v[88:95] into v[4:11], call the activation routine at s[8:9], copy the result
 * to v[72:79], pack to bf16 in v[72:75], advance the D SRD base by one row
 * (StrideD1J * 2 bytes) and store 16 bytes at offset v17. */
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], v[98:99], v[vgprValuC+74:vgprValuC+74+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], v[100:101], v[vgprValuC+76:vgprValuC+76+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], v[102:103], v[vgprValuC+78:vgprValuC+78+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call activation routine; return address saved in s[66:67]
v_mov_b64 v[72:73], v[4:5]                         // copy activation result back to the row registers
v_mov_b64 v[74:75], v[6:7]
v_mov_b64 v[76:77], v[8:9]
v_mov_b64 v[78:79], v[10:11]
v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Batch #0, row 7 (last row of the batch): scale v[vgprValuC+80..87] by
 * scaleAlpha v[96:103], add Bias v[88:95] into v[4:11], call the activation
 * routine at s[8:9], copy the result to v[80:87], pack to bf16 in v[80:83],
 * advance the D SRD base by one row (StrideD1J * 2 bytes) and store 16 bytes at
 * offset v17. The trailing s_nop covers the wait state needed before the next
 * batch overwrites VGPRs still held by this dwordx4 store. */
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], v[98:99], v[vgprValuC+82:vgprValuC+82+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], v[100:101], v[vgprValuC+84:vgprValuC+84+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], v[102:103], v[vgprValuC+86:vgprValuC+86+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call activation routine; return address saved in s[66:67]
v_mov_b64 v[80:81], v[4:5]                         // copy activation result back to the row registers
v_mov_b64 v[82:83], v[6:7]
v_mov_b64 v[84:85], v[8:9]
v_mov_b64 v[86:87], v[10:11]
v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[80:83], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #1 (d1,d0,vc1,vc0) = */
/*    (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
ds_read_b128 v[88:91], v19 offset:0                // load Bias
ds_read_b128 v[92:95], v19 offset:16               // load Bias
ds_read_b128 v[96:99], v19 offset:1024             // load scaleAlpha
ds_read_b128 v[100:103], v19 offset:1040           // load scaleAlpha
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
/* (d1,vc1,d0,vc0)=(0,12,0,0) */
/* (d1,vc1,d0,vc0)=(0,13,0,0) */
/* (d1,vc1,d0,vc0)=(0,14,0,0) */
/* (d1,vc1,d0,vc0)=(0,15,0,0) */
v_accvgpr_read_b32 v[vgprValuC+24], acc1           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+25], acc5           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+26], acc9           // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+27], acc13          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+28], acc17          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+29], acc21          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+30], acc25          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+31], acc29          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+32], acc33          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+33], acc37          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+34], acc41          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+35], acc45          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+36], acc49          // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+37], acc53          // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+38], acc57          // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+39], acc61          // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+40], acc65          // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+41], acc69          // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+42], acc73          // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+43], acc77          // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+44], acc81          // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+45], acc85          // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+46], acc89          // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+47], acc93          // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+48], acc97          // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+49], acc101         // copy acc to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+50], acc105         // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+51], acc109         // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+52], acc113         // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+53], acc117         // copy acc to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+54], acc121         // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+55], acc125         // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+56], acc129         // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+57], acc133         // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+58], acc137         // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+59], acc141         // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+60], acc145         // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+61], acc149         // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+62], acc153         // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+63], acc157         // copy acc to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+64], acc161         // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+65], acc165         // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+66], acc169         // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+67], acc173         // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+68], acc177         // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+69], acc181         // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+70], acc185         // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+71], acc189         // copy acc to vreg[111]
v_accvgpr_read_b32 v[vgprValuC+72], acc193         // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+73], acc197         // copy acc to vreg[113]
v_accvgpr_read_b32 v[vgprValuC+74], acc201         // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+75], acc205         // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+76], acc209         // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+77], acc213         // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+78], acc217         // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+79], acc221         // copy acc to vreg[119]
v_accvgpr_read_b32 v[vgprValuC+80], acc225         // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+81], acc229         // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+82], acc233         // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+83], acc237         // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+84], acc241         // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+85], acc245         // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+86], acc249         // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+87], acc253         // copy acc to vreg[127]

/* Batch #1 alpha scaling: multiplies the 64 staged values in ValuC+24..ValuC+87 by alpha, two floats per packed instruction (32 instructions). */
/* op_sel_hi:[0,1,1] clears the high-lane select for src0 only; under the VOP3P operand-select rules that should make both lanes use the low dword s[sgprAlpha] as the multiplier. */
/* rC *= alpha batchElements=[(0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0)] */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk)

/* apply mask, calc new C and issue writes */
/* Start of the Batch #1 write-out: loads three bf16 helper constants, waits for outstanding LDS reads, then finishes and stores the first row (ValuC+24..ValuC+31). */
/* NOTE(review): v14/v15/v16 are not read by any instruction in this part of the file; the bf16 conversion below uses v_cvt_pk_bf16_f32 instead. Confirm whether the routine reached through s[8:9] needs them before treating them as dead. */
v_mov_b32 v14, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v15, 0x7fff0000                          // fp32 Nan
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16

/* v[88:95] (bias) and v[96:103] (scaleAlphaVec) are consumed right after this wait, so every pending LDS read has to have landed. */
s_waitcnt lgkmcnt(0)                               // lgkmcnt(0) = 4 - 2 (bias) - 2 (scaleAlphaVec) (interleaved)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[96:97], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[98:99], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], v[100:101], v[vgprValuC+28:vgprValuC+28+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], v[102:103], v[vgprValuC+30:vgprValuC+30+1] // *= ScaleAlphaVecVMulPK(96)(6)
/* The bias sum is written to v[4:11] rather than in place; v[4:11] is what gets copied back after the call below. */
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias
/* Jumps to the address held in s[8:9] and saves the return address in s[66:67]. NOTE(review): presumably the activation routine working on v[4:11]; the callee is not visible here - confirm. */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): the literal registers v[24:31] here (and v[24:27] in the store) only line up with v[vgprValuC+24..] if vgprValuC == 0 - confirm. */
v_mov_b64 v[24:25], v[4:5]
v_mov_b64 v[26:27], v[6:7]
v_mov_b64 v[28:29], v[8:9]
v_mov_b64 v[30:31], v[10:11]
/* Packs eight fp32 values into four dwords in place. Each destination was already read by this or an earlier convert (assuming vgprValuC == 0), so the order must be kept. */
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor
/* Advances the D buffer base by StrideD1J elements (shifted left by 1 for 2-byte outputs), carrying into the next SRD dword, before the store. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Batch #1 row on ValuC+32..ValuC+39: scale by v[96:103], add the bias in v[88:95], call through s[8:9], pack to bf16 and store 16 bytes per lane. */
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], v[98:99], v[vgprValuC+34:vgprValuC+34+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[100:101], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[102:103], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(96)(6)
/* The bias sum is written to v[4:11], not in place. */
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias
/* Jumps to the address in s[8:9]; the return address is saved in s[66:67]. NOTE(review): presumably the activation routine on v[4:11] - callee not visible here, confirm. */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[32:39] matches v[vgprValuC+32..] only if vgprValuC == 0 - confirm. */
v_mov_b64 v[32:33], v[4:5]
v_mov_b64 v[34:35], v[6:7]
v_mov_b64 v[36:37], v[8:9]
v_mov_b64 v[38:39], v[10:11]
/* In-place packing: keep this order, since each destination has already been read by this or an earlier convert (assuming vgprValuC == 0). */
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor
/* Advances the D buffer base by one row (StrideD1J * 2 bytes) with carry into the next SRD dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Batch #1 row on ValuC+40..ValuC+47: scale by v[96:103], add the bias in v[88:95], call through s[8:9], pack to bf16 and store 16 bytes per lane. */
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[98:99], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], v[100:101], v[vgprValuC+44:vgprValuC+44+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], v[102:103], v[vgprValuC+46:vgprValuC+46+1] // *= ScaleAlphaVecVMulPK(96)(6)
/* The bias sum is written to v[4:11], not in place. */
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias
/* Jumps to the address in s[8:9]; the return address is saved in s[66:67]. NOTE(review): presumably the activation routine on v[4:11] - callee not visible here, confirm. */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[40:47] matches v[vgprValuC+40..] only if vgprValuC == 0 - confirm. */
v_mov_b64 v[40:41], v[4:5]
v_mov_b64 v[42:43], v[6:7]
v_mov_b64 v[44:45], v[8:9]
v_mov_b64 v[46:47], v[10:11]
/* In-place packing: keep this order, since each destination has already been read by this or an earlier convert (assuming vgprValuC == 0). */
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor
/* Advances the D buffer base by one row (StrideD1J * 2 bytes) with carry into the next SRD dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Batch #1 row on ValuC+48..ValuC+55: scale by v[96:103], add the bias in v[88:95], call through s[8:9], pack to bf16 and store 16 bytes per lane. */
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], v[98:99], v[vgprValuC+50:vgprValuC+50+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], v[100:101], v[vgprValuC+52:vgprValuC+52+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], v[102:103], v[vgprValuC+54:vgprValuC+54+1] // *= ScaleAlphaVecVMulPK(96)(6)
/* The bias sum is written to v[4:11], not in place. */
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias
/* Jumps to the address in s[8:9]; the return address is saved in s[66:67]. NOTE(review): presumably the activation routine on v[4:11] - callee not visible here, confirm. */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[48:55] matches v[vgprValuC+48..] only if vgprValuC == 0 - confirm. */
v_mov_b64 v[48:49], v[4:5]
v_mov_b64 v[50:51], v[6:7]
v_mov_b64 v[52:53], v[8:9]
v_mov_b64 v[54:55], v[10:11]
/* In-place packing: keep this order, since each destination has already been read by this or an earlier convert (assuming vgprValuC == 0). */
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor
/* Advances the D buffer base by one row (StrideD1J * 2 bytes) with carry into the next SRD dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Batch #1 row on ValuC+56..ValuC+63: scale by v[96:103], add the bias in v[88:95], call through s[8:9], pack to bf16 and store 16 bytes per lane. */
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], v[98:99], v[vgprValuC+58:vgprValuC+58+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], v[100:101], v[vgprValuC+60:vgprValuC+60+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], v[102:103], v[vgprValuC+62:vgprValuC+62+1] // *= ScaleAlphaVecVMulPK(96)(6)
/* The bias sum is written to v[4:11], not in place. */
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias
/* Jumps to the address in s[8:9]; the return address is saved in s[66:67]. NOTE(review): presumably the activation routine on v[4:11] - callee not visible here, confirm. */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[56:63] matches v[vgprValuC+56..] only if vgprValuC == 0 - confirm. */
v_mov_b64 v[56:57], v[4:5]
v_mov_b64 v[58:59], v[6:7]
v_mov_b64 v[60:61], v[8:9]
v_mov_b64 v[62:63], v[10:11]
/* In-place packing: keep this order, since each destination has already been read by this or an earlier convert (assuming vgprValuC == 0). */
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor
/* Advances the D buffer base by one row (StrideD1J * 2 bytes) with carry into the next SRD dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Batch #1 row on ValuC+64..ValuC+71: scale by v[96:103], add the bias in v[88:95], call through s[8:9], pack to bf16 and store 16 bytes per lane. */
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], v[98:99], v[vgprValuC+66:vgprValuC+66+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], v[100:101], v[vgprValuC+68:vgprValuC+68+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], v[102:103], v[vgprValuC+70:vgprValuC+70+1] // *= ScaleAlphaVecVMulPK(96)(6)
/* The bias sum is written to v[4:11], not in place. */
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias
/* Jumps to the address in s[8:9]; the return address is saved in s[66:67]. NOTE(review): presumably the activation routine on v[4:11] - callee not visible here, confirm. */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[64:71] matches v[vgprValuC+64..] only if vgprValuC == 0 - confirm. */
v_mov_b64 v[64:65], v[4:5]
v_mov_b64 v[66:67], v[6:7]
v_mov_b64 v[68:69], v[8:9]
v_mov_b64 v[70:71], v[10:11]
/* In-place packing: keep this order, since each destination has already been read by this or an earlier convert (assuming vgprValuC == 0). */
v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor
/* Advances the D buffer base by one row (StrideD1J * 2 bytes) with carry into the next SRD dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Batch #1 row on ValuC+72..ValuC+79: scale by v[96:103], add the bias in v[88:95], call through s[8:9], pack to bf16 and store 16 bytes per lane. */
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], v[98:99], v[vgprValuC+74:vgprValuC+74+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], v[100:101], v[vgprValuC+76:vgprValuC+76+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], v[102:103], v[vgprValuC+78:vgprValuC+78+1] // *= ScaleAlphaVecVMulPK(96)(6)
/* The bias sum is written to v[4:11], not in place. */
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias
/* Jumps to the address in s[8:9]; the return address is saved in s[66:67]. NOTE(review): presumably the activation routine on v[4:11] - callee not visible here, confirm. */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[72:79] matches v[vgprValuC+72..] only if vgprValuC == 0 - confirm. */
v_mov_b64 v[72:73], v[4:5]
v_mov_b64 v[74:75], v[6:7]
v_mov_b64 v[76:77], v[8:9]
v_mov_b64 v[78:79], v[10:11]
/* In-place packing: keep this order, since each destination has already been read by this or an earlier convert (assuming vgprValuC == 0). */
v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor
/* Advances the D buffer base by one row (StrideD1J * 2 bytes) with carry into the next SRD dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Last Batch #1 row, on ValuC+80..ValuC+87: scale by v[96:103], add the bias in v[88:95], call through s[8:9], pack to bf16 and store 16 bytes per lane. */
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], v[98:99], v[vgprValuC+82:vgprValuC+82+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], v[100:101], v[vgprValuC+84:vgprValuC+84+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], v[102:103], v[vgprValuC+86:vgprValuC+86+1] // *= ScaleAlphaVecVMulPK(96)(6)
/* The bias sum is written to v[4:11], not in place. */
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias
/* Jumps to the address in s[8:9]; the return address is saved in s[66:67]. NOTE(review): presumably the activation routine on v[4:11] - callee not visible here, confirm. */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[80:87] matches v[vgprValuC+80..] only if vgprValuC == 0 - confirm. */
v_mov_b64 v[80:81], v[4:5]
v_mov_b64 v[82:83], v[6:7]
v_mov_b64 v[84:85], v[8:9]
v_mov_b64 v[86:87], v[10:11]
/* In-place packing: keep this order, since each destination has already been read by this or an earlier convert (assuming vgprValuC == 0). */
v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor
/* Advances the D buffer base by one row (StrideD1J * 2 bytes) with carry into the next SRD dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[80:83], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Padding after the final store of the batch; do not remove without re-checking the hazard described on the s_nop itself. */
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #2 (d1,d0,vc1,vc0) = */
/*    (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Issues four 16-byte LDS reads addressed by v19: bias into v[88:95] (offsets 0 and 16) and scaleAlpha into v[96:103] (offsets 1024 and 1040). */
/* The reads are not waited on here; the first consumers of v[88:103] are preceded by s_waitcnt lgkmcnt(0). */
/* Only the first element issues loads; elements 17..23 below have no instructions of their own and the same v[88:103] values are used for every row of the batch. */
/* (d1,vc1,d0,vc0)=(0,16,0,0) */
ds_read_b128 v[88:91], v19 offset:0                // load Bias
ds_read_b128 v[92:95], v19 offset:16               // load Bias
ds_read_b128 v[96:99], v19 offset:1024             // load scaleAlpha
ds_read_b128 v[100:103], v19 offset:1040           // load scaleAlpha
/* (d1,vc1,d0,vc0)=(0,17,0,0) */
/* (d1,vc1,d0,vc0)=(0,18,0,0) */
/* (d1,vc1,d0,vc0)=(0,19,0,0) */
/* (d1,vc1,d0,vc0)=(0,20,0,0) */
/* (d1,vc1,d0,vc0)=(0,21,0,0) */
/* (d1,vc1,d0,vc0)=(0,22,0,0) */
/* (d1,vc1,d0,vc0)=(0,23,0,0) */
/* Batch #2 staging: copies 64 accumulator registers into ValuC+24..ValuC+87. The source index is acc[2 + 4*k] for destination ValuC+24+k, k = 0..63. */
/* The vreg[...] numbers in the trailing comments (128..191) are the generator's own element indices, not physical register numbers. */
v_accvgpr_read_b32 v[vgprValuC+24], acc2           // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+25], acc6           // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+26], acc10          // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+27], acc14          // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+28], acc18          // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+29], acc22          // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+30], acc26          // copy acc to vreg[134]
v_accvgpr_read_b32 v[vgprValuC+31], acc30          // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+32], acc34          // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+33], acc38          // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+34], acc42          // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+35], acc46          // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+36], acc50          // copy acc to vreg[140]
v_accvgpr_read_b32 v[vgprValuC+37], acc54          // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+38], acc58          // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+39], acc62          // copy acc to vreg[143]
v_accvgpr_read_b32 v[vgprValuC+40], acc66          // copy acc to vreg[144]
v_accvgpr_read_b32 v[vgprValuC+41], acc70          // copy acc to vreg[145]
v_accvgpr_read_b32 v[vgprValuC+42], acc74          // copy acc to vreg[146]
v_accvgpr_read_b32 v[vgprValuC+43], acc78          // copy acc to vreg[147]
v_accvgpr_read_b32 v[vgprValuC+44], acc82          // copy acc to vreg[148]
v_accvgpr_read_b32 v[vgprValuC+45], acc86          // copy acc to vreg[149]
v_accvgpr_read_b32 v[vgprValuC+46], acc90          // copy acc to vreg[150]
v_accvgpr_read_b32 v[vgprValuC+47], acc94          // copy acc to vreg[151]
v_accvgpr_read_b32 v[vgprValuC+48], acc98          // copy acc to vreg[152]
v_accvgpr_read_b32 v[vgprValuC+49], acc102         // copy acc to vreg[153]
v_accvgpr_read_b32 v[vgprValuC+50], acc106         // copy acc to vreg[154]
v_accvgpr_read_b32 v[vgprValuC+51], acc110         // copy acc to vreg[155]
v_accvgpr_read_b32 v[vgprValuC+52], acc114         // copy acc to vreg[156]
v_accvgpr_read_b32 v[vgprValuC+53], acc118         // copy acc to vreg[157]
v_accvgpr_read_b32 v[vgprValuC+54], acc122         // copy acc to vreg[158]
v_accvgpr_read_b32 v[vgprValuC+55], acc126         // copy acc to vreg[159]
v_accvgpr_read_b32 v[vgprValuC+56], acc130         // copy acc to vreg[160]
v_accvgpr_read_b32 v[vgprValuC+57], acc134         // copy acc to vreg[161]
v_accvgpr_read_b32 v[vgprValuC+58], acc138         // copy acc to vreg[162]
v_accvgpr_read_b32 v[vgprValuC+59], acc142         // copy acc to vreg[163]
v_accvgpr_read_b32 v[vgprValuC+60], acc146         // copy acc to vreg[164]
v_accvgpr_read_b32 v[vgprValuC+61], acc150         // copy acc to vreg[165]
v_accvgpr_read_b32 v[vgprValuC+62], acc154         // copy acc to vreg[166]
v_accvgpr_read_b32 v[vgprValuC+63], acc158         // copy acc to vreg[167]
v_accvgpr_read_b32 v[vgprValuC+64], acc162         // copy acc to vreg[168]
v_accvgpr_read_b32 v[vgprValuC+65], acc166         // copy acc to vreg[169]
v_accvgpr_read_b32 v[vgprValuC+66], acc170         // copy acc to vreg[170]
v_accvgpr_read_b32 v[vgprValuC+67], acc174         // copy acc to vreg[171]
v_accvgpr_read_b32 v[vgprValuC+68], acc178         // copy acc to vreg[172]
v_accvgpr_read_b32 v[vgprValuC+69], acc182         // copy acc to vreg[173]
v_accvgpr_read_b32 v[vgprValuC+70], acc186         // copy acc to vreg[174]
v_accvgpr_read_b32 v[vgprValuC+71], acc190         // copy acc to vreg[175]
v_accvgpr_read_b32 v[vgprValuC+72], acc194         // copy acc to vreg[176]
v_accvgpr_read_b32 v[vgprValuC+73], acc198         // copy acc to vreg[177]
v_accvgpr_read_b32 v[vgprValuC+74], acc202         // copy acc to vreg[178]
v_accvgpr_read_b32 v[vgprValuC+75], acc206         // copy acc to vreg[179]
v_accvgpr_read_b32 v[vgprValuC+76], acc210         // copy acc to vreg[180]
v_accvgpr_read_b32 v[vgprValuC+77], acc214         // copy acc to vreg[181]
v_accvgpr_read_b32 v[vgprValuC+78], acc218         // copy acc to vreg[182]
v_accvgpr_read_b32 v[vgprValuC+79], acc222         // copy acc to vreg[183]
v_accvgpr_read_b32 v[vgprValuC+80], acc226         // copy acc to vreg[184]
v_accvgpr_read_b32 v[vgprValuC+81], acc230         // copy acc to vreg[185]
v_accvgpr_read_b32 v[vgprValuC+82], acc234         // copy acc to vreg[186]
v_accvgpr_read_b32 v[vgprValuC+83], acc238         // copy acc to vreg[187]
v_accvgpr_read_b32 v[vgprValuC+84], acc242         // copy acc to vreg[188]
v_accvgpr_read_b32 v[vgprValuC+85], acc246         // copy acc to vreg[189]
v_accvgpr_read_b32 v[vgprValuC+86], acc250         // copy acc to vreg[190]
v_accvgpr_read_b32 v[vgprValuC+87], acc254         // copy acc to vreg[191]

/* Batch #2 alpha scaling: multiplies the 64 staged values in ValuC+24..ValuC+87 by alpha, two floats per packed instruction (32 instructions). */
/* op_sel_hi:[0,1,1] clears the high-lane select for src0 only; under the VOP3P operand-select rules that should make both lanes use the low dword s[sgprAlpha] as the multiplier. */
/* rC *= alpha batchElements=[(0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0)] */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk)

/* apply mask, calc new C and issue writes */
/* Start of the Batch #2 write-out: loads three bf16 helper constants, waits for the LDS reads issued at the top of the batch, then finishes and stores the first row (ValuC+24..ValuC+31). */
/* NOTE(review): v14/v15/v16 are not read by any instruction in this part of the file; the bf16 conversion below uses v_cvt_pk_bf16_f32 instead. Confirm whether the routine reached through s[8:9] needs them before treating them as dead. */
v_mov_b32 v14, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v15, 0x7fff0000                          // fp32 Nan
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16

/* The four ds_read_b128 loads into v[88:103] must have landed before the multiplies and adds below read them. */
s_waitcnt lgkmcnt(0)                               // lgkmcnt(0) = 4 - 2 (bias) - 2 (scaleAlphaVec) (interleaved)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[96:97], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[98:99], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], v[100:101], v[vgprValuC+28:vgprValuC+28+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], v[102:103], v[vgprValuC+30:vgprValuC+30+1] // *= ScaleAlphaVecVMulPK(96)(6)
/* The bias sum is written to v[4:11] rather than in place; v[4:11] is what gets copied back after the call below. */
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias
/* Jumps to the address held in s[8:9] and saves the return address in s[66:67]. NOTE(review): presumably the activation routine working on v[4:11]; the callee is not visible here - confirm. */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): the literal registers v[24:31] here (and v[24:27] in the store) only line up with v[vgprValuC+24..] if vgprValuC == 0 - confirm. */
v_mov_b64 v[24:25], v[4:5]
v_mov_b64 v[26:27], v[6:7]
v_mov_b64 v[28:29], v[8:9]
v_mov_b64 v[30:31], v[10:11]
/* Packs eight fp32 values into four dwords in place. Each destination was already read by this or an earlier convert (assuming vgprValuC == 0), so the order must be kept. */
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor
/* Advances the D buffer base by StrideD1J elements (shifted left by 1 for 2-byte outputs), carrying into the next SRD dword, before the store. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Epilogue for the 8 fp32 values in v[vgprValuC+32 .. vgprValuC+39]:                                   */
/*   1) multiply by v[96:103] (scaleAlphaVec per the inline comments), 2) add v[88:95] (bias) into v[4:11], */
/*   3) call through s[8:9], 4) pack to bf16, 5) advance the D descriptor one row and store 16 bytes.   */
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], v[98:99], v[vgprValuC+34:vgprValuC+34+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[100:101], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[102:103], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias
/* Indirect call: jump to the address in s[8:9], return address saved in s[66:67]. The callee is outside this  */
/* view (presumably the activation routine -- TODO confirm); the copies below take its results from v[4:11].  */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[32:39] is the same storage as v[vgprValuC+32..39] only if vgprValuC == 0 -- confirm. */
v_mov_b64 v[32:33], v[4:5]
v_mov_b64 v[34:35], v[6:7]
v_mov_b64 v[36:37], v[8:9]
v_mov_b64 v[38:39], v[10:11]
/* Pack 8 fp32 values into 4 dwords of bf16 pairs; each destination is written only after its own sources were read. */
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor
/* The descriptor base is advanced by (StrideD1J << 1) bytes BEFORE the store, with carry into the upper dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
/* 16-byte store of v[32:35] at the per-lane offset in v17 (offen), with the nt cache hint. */
buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Epilogue for the 8 fp32 values in v[vgprValuC+40 .. vgprValuC+47]:                                   */
/*   1) multiply by v[96:103] (scaleAlphaVec per the inline comments), 2) add v[88:95] (bias) into v[4:11], */
/*   3) call through s[8:9], 4) pack to bf16, 5) advance the D descriptor one row and store 16 bytes.   */
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[98:99], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], v[100:101], v[vgprValuC+44:vgprValuC+44+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], v[102:103], v[vgprValuC+46:vgprValuC+46+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias
/* Indirect call: jump to the address in s[8:9], return address saved in s[66:67]. The callee is outside this  */
/* view (presumably the activation routine -- TODO confirm); the copies below take its results from v[4:11].  */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[40:47] is the same storage as v[vgprValuC+40..47] only if vgprValuC == 0 -- confirm. */
v_mov_b64 v[40:41], v[4:5]
v_mov_b64 v[42:43], v[6:7]
v_mov_b64 v[44:45], v[8:9]
v_mov_b64 v[46:47], v[10:11]
/* Pack 8 fp32 values into 4 dwords of bf16 pairs; each destination is written only after its own sources were read. */
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor
/* The descriptor base is advanced by (StrideD1J << 1) bytes BEFORE the store, with carry into the upper dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
/* 16-byte store of v[40:43] at the per-lane offset in v17 (offen), with the nt cache hint. */
buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Epilogue for the 8 fp32 values in v[vgprValuC+48 .. vgprValuC+55]:                                   */
/*   1) multiply by v[96:103] (scaleAlphaVec per the inline comments), 2) add v[88:95] (bias) into v[4:11], */
/*   3) call through s[8:9], 4) pack to bf16, 5) advance the D descriptor one row and store 16 bytes.   */
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], v[98:99], v[vgprValuC+50:vgprValuC+50+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], v[100:101], v[vgprValuC+52:vgprValuC+52+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], v[102:103], v[vgprValuC+54:vgprValuC+54+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias
/* Indirect call: jump to the address in s[8:9], return address saved in s[66:67]. The callee is outside this  */
/* view (presumably the activation routine -- TODO confirm); the copies below take its results from v[4:11].  */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[48:55] is the same storage as v[vgprValuC+48..55] only if vgprValuC == 0 -- confirm. */
v_mov_b64 v[48:49], v[4:5]
v_mov_b64 v[50:51], v[6:7]
v_mov_b64 v[52:53], v[8:9]
v_mov_b64 v[54:55], v[10:11]
/* Pack 8 fp32 values into 4 dwords of bf16 pairs; each destination is written only after its own sources were read. */
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor
/* The descriptor base is advanced by (StrideD1J << 1) bytes BEFORE the store, with carry into the upper dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
/* 16-byte store of v[48:51] at the per-lane offset in v17 (offen), with the nt cache hint. */
buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Epilogue for the 8 fp32 values in v[vgprValuC+56 .. vgprValuC+63]:                                   */
/*   1) multiply by v[96:103] (scaleAlphaVec per the inline comments), 2) add v[88:95] (bias) into v[4:11], */
/*   3) call through s[8:9], 4) pack to bf16, 5) advance the D descriptor one row and store 16 bytes.   */
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], v[98:99], v[vgprValuC+58:vgprValuC+58+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], v[100:101], v[vgprValuC+60:vgprValuC+60+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], v[102:103], v[vgprValuC+62:vgprValuC+62+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias
/* Indirect call: jump to the address in s[8:9], return address saved in s[66:67]. The callee is outside this  */
/* view (presumably the activation routine -- TODO confirm); the copies below take its results from v[4:11].  */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[56:63] is the same storage as v[vgprValuC+56..63] only if vgprValuC == 0 -- confirm. */
v_mov_b64 v[56:57], v[4:5]
v_mov_b64 v[58:59], v[6:7]
v_mov_b64 v[60:61], v[8:9]
v_mov_b64 v[62:63], v[10:11]
/* Pack 8 fp32 values into 4 dwords of bf16 pairs; each destination is written only after its own sources were read. */
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor
/* The descriptor base is advanced by (StrideD1J << 1) bytes BEFORE the store, with carry into the upper dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
/* 16-byte store of v[56:59] at the per-lane offset in v17 (offen), with the nt cache hint. */
buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Epilogue for the 8 fp32 values in v[vgprValuC+64 .. vgprValuC+71]:                                   */
/*   1) multiply by v[96:103] (scaleAlphaVec per the inline comments), 2) add v[88:95] (bias) into v[4:11], */
/*   3) call through s[8:9], 4) pack to bf16, 5) advance the D descriptor one row and store 16 bytes.   */
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], v[98:99], v[vgprValuC+66:vgprValuC+66+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], v[100:101], v[vgprValuC+68:vgprValuC+68+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], v[102:103], v[vgprValuC+70:vgprValuC+70+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias
/* Indirect call: jump to the address in s[8:9], return address saved in s[66:67]. The callee is outside this  */
/* view (presumably the activation routine -- TODO confirm); the copies below take its results from v[4:11].  */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[64:71] is the same storage as v[vgprValuC+64..71] only if vgprValuC == 0 -- confirm. */
v_mov_b64 v[64:65], v[4:5]
v_mov_b64 v[66:67], v[6:7]
v_mov_b64 v[68:69], v[8:9]
v_mov_b64 v[70:71], v[10:11]
/* Pack 8 fp32 values into 4 dwords of bf16 pairs; each destination is written only after its own sources were read. */
v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor
/* The descriptor base is advanced by (StrideD1J << 1) bytes BEFORE the store, with carry into the upper dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
/* 16-byte store of v[64:67] at the per-lane offset in v17 (offen), with the nt cache hint. */
buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Epilogue for the 8 fp32 values in v[vgprValuC+72 .. vgprValuC+79]:                                   */
/*   1) multiply by v[96:103] (scaleAlphaVec per the inline comments), 2) add v[88:95] (bias) into v[4:11], */
/*   3) call through s[8:9], 4) pack to bf16, 5) advance the D descriptor one row and store 16 bytes.   */
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], v[98:99], v[vgprValuC+74:vgprValuC+74+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], v[100:101], v[vgprValuC+76:vgprValuC+76+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], v[102:103], v[vgprValuC+78:vgprValuC+78+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias
/* Indirect call: jump to the address in s[8:9], return address saved in s[66:67]. The callee is outside this  */
/* view (presumably the activation routine -- TODO confirm); the copies below take its results from v[4:11].  */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[72:79] is the same storage as v[vgprValuC+72..79] only if vgprValuC == 0 -- confirm. */
v_mov_b64 v[72:73], v[4:5]
v_mov_b64 v[74:75], v[6:7]
v_mov_b64 v[76:77], v[8:9]
v_mov_b64 v[78:79], v[10:11]
/* Pack 8 fp32 values into 4 dwords of bf16 pairs; each destination is written only after its own sources were read. */
v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor
/* The descriptor base is advanced by (StrideD1J << 1) bytes BEFORE the store, with carry into the upper dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
/* 16-byte store of v[72:75] at the per-lane offset in v17 (offen), with the nt cache hint. */
buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Epilogue for the 8 fp32 values in v[vgprValuC+80 .. vgprValuC+87] (last group before the next batch header): */
/*   1) multiply by v[96:103] (scaleAlphaVec per the inline comments), 2) add v[88:95] (bias) into v[4:11],     */
/*   3) call through s[8:9], 4) pack to bf16, 5) advance the D descriptor one row and store 16 bytes.           */
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], v[98:99], v[vgprValuC+82:vgprValuC+82+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], v[100:101], v[vgprValuC+84:vgprValuC+84+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], v[102:103], v[vgprValuC+86:vgprValuC+86+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias
/* Indirect call: jump to the address in s[8:9], return address saved in s[66:67]. The callee is outside this  */
/* view (presumably the activation routine -- TODO confirm); the copies below take its results from v[4:11].  */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[80:87] is the same storage as v[vgprValuC+80..87] only if vgprValuC == 0 -- confirm. */
v_mov_b64 v[80:81], v[4:5]
v_mov_b64 v[82:83], v[6:7]
v_mov_b64 v[84:85], v[8:9]
v_mov_b64 v[86:87], v[10:11]
/* Pack 8 fp32 values into 4 dwords of bf16 pairs; each destination is written only after its own sources were read. */
v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor
/* The descriptor base is advanced by (StrideD1J << 1) bytes BEFORE the store, with carry into the upper dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
/* 16-byte store of v[80:83] at the per-lane offset in v17 (offen), with the nt cache hint. */
buffer_store_dwordx4 v[80:83], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Hazard padding after the dwordx4 store, before later code starts overwriting VGPRs (see inline note). */
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #3 (d1,d0,vc1,vc0) = */
/*    (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */
/******************************************/
/* Eight elements (vc1 = 24..31), each a vector of width 8 (vw8). Only the first element issues loads here; */
/* the same v[88:103] contents are reused by every element of the batch in the code that follows.          */

/* calc coords, apply mask, and issue loads (if necessary) */
/* (d1,vc1,d0,vc0)=(0,24,0,0) */
/* Four 16-byte LDS reads addressed by v19: 32 bytes at offset 0 -> v[88:95] (bias) and 32 bytes at offset  */
/* 1024 -> v[96:103] (scaleAlpha). They are asynchronous; an s_waitcnt lgkmcnt(0) precedes their first use. */
ds_read_b128 v[88:91], v19 offset:0                // load Bias
ds_read_b128 v[92:95], v19 offset:16               // load Bias
ds_read_b128 v[96:99], v19 offset:1024             // load scaleAlpha
ds_read_b128 v[100:103], v19 offset:1040           // load scaleAlpha
/* (d1,vc1,d0,vc0)=(0,25,0,0) */
/* (d1,vc1,d0,vc0)=(0,26,0,0) */
/* (d1,vc1,d0,vc0)=(0,27,0,0) */
/* (d1,vc1,d0,vc0)=(0,28,0,0) */
/* (d1,vc1,d0,vc0)=(0,29,0,0) */
/* (d1,vc1,d0,vc0)=(0,30,0,0) */
/* (d1,vc1,d0,vc0)=(0,31,0,0) */
/* Copy 64 accumulator registers into v[vgprValuC+24 .. vgprValuC+87]. The source index advances by 4 per copy */
/* (acc3, acc7, ..., acc255) while the destination index advances by 1.                                        */
/* NOTE(review): the vreg[192..255] numbers in the trailing comments are not the destination register indices  */
/* (those are vgprValuC+24..87); presumably a running element count emitted by the generator -- confirm.       */
v_accvgpr_read_b32 v[vgprValuC+24], acc3           // copy acc to vreg[192]
v_accvgpr_read_b32 v[vgprValuC+25], acc7           // copy acc to vreg[193]
v_accvgpr_read_b32 v[vgprValuC+26], acc11          // copy acc to vreg[194]
v_accvgpr_read_b32 v[vgprValuC+27], acc15          // copy acc to vreg[195]
v_accvgpr_read_b32 v[vgprValuC+28], acc19          // copy acc to vreg[196]
v_accvgpr_read_b32 v[vgprValuC+29], acc23          // copy acc to vreg[197]
v_accvgpr_read_b32 v[vgprValuC+30], acc27          // copy acc to vreg[198]
v_accvgpr_read_b32 v[vgprValuC+31], acc31          // copy acc to vreg[199]
v_accvgpr_read_b32 v[vgprValuC+32], acc35          // copy acc to vreg[200]
v_accvgpr_read_b32 v[vgprValuC+33], acc39          // copy acc to vreg[201]
v_accvgpr_read_b32 v[vgprValuC+34], acc43          // copy acc to vreg[202]
v_accvgpr_read_b32 v[vgprValuC+35], acc47          // copy acc to vreg[203]
v_accvgpr_read_b32 v[vgprValuC+36], acc51          // copy acc to vreg[204]
v_accvgpr_read_b32 v[vgprValuC+37], acc55          // copy acc to vreg[205]
v_accvgpr_read_b32 v[vgprValuC+38], acc59          // copy acc to vreg[206]
v_accvgpr_read_b32 v[vgprValuC+39], acc63          // copy acc to vreg[207]
v_accvgpr_read_b32 v[vgprValuC+40], acc67          // copy acc to vreg[208]
v_accvgpr_read_b32 v[vgprValuC+41], acc71          // copy acc to vreg[209]
v_accvgpr_read_b32 v[vgprValuC+42], acc75          // copy acc to vreg[210]
v_accvgpr_read_b32 v[vgprValuC+43], acc79          // copy acc to vreg[211]
v_accvgpr_read_b32 v[vgprValuC+44], acc83          // copy acc to vreg[212]
v_accvgpr_read_b32 v[vgprValuC+45], acc87          // copy acc to vreg[213]
v_accvgpr_read_b32 v[vgprValuC+46], acc91          // copy acc to vreg[214]
v_accvgpr_read_b32 v[vgprValuC+47], acc95          // copy acc to vreg[215]
v_accvgpr_read_b32 v[vgprValuC+48], acc99          // copy acc to vreg[216]
v_accvgpr_read_b32 v[vgprValuC+49], acc103         // copy acc to vreg[217]
v_accvgpr_read_b32 v[vgprValuC+50], acc107         // copy acc to vreg[218]
v_accvgpr_read_b32 v[vgprValuC+51], acc111         // copy acc to vreg[219]
v_accvgpr_read_b32 v[vgprValuC+52], acc115         // copy acc to vreg[220]
v_accvgpr_read_b32 v[vgprValuC+53], acc119         // copy acc to vreg[221]
v_accvgpr_read_b32 v[vgprValuC+54], acc123         // copy acc to vreg[222]
v_accvgpr_read_b32 v[vgprValuC+55], acc127         // copy acc to vreg[223]
v_accvgpr_read_b32 v[vgprValuC+56], acc131         // copy acc to vreg[224]
v_accvgpr_read_b32 v[vgprValuC+57], acc135         // copy acc to vreg[225]
v_accvgpr_read_b32 v[vgprValuC+58], acc139         // copy acc to vreg[226]
v_accvgpr_read_b32 v[vgprValuC+59], acc143         // copy acc to vreg[227]
v_accvgpr_read_b32 v[vgprValuC+60], acc147         // copy acc to vreg[228]
v_accvgpr_read_b32 v[vgprValuC+61], acc151         // copy acc to vreg[229]
v_accvgpr_read_b32 v[vgprValuC+62], acc155         // copy acc to vreg[230]
v_accvgpr_read_b32 v[vgprValuC+63], acc159         // copy acc to vreg[231]
v_accvgpr_read_b32 v[vgprValuC+64], acc163         // copy acc to vreg[232]
v_accvgpr_read_b32 v[vgprValuC+65], acc167         // copy acc to vreg[233]
v_accvgpr_read_b32 v[vgprValuC+66], acc171         // copy acc to vreg[234]
v_accvgpr_read_b32 v[vgprValuC+67], acc175         // copy acc to vreg[235]
v_accvgpr_read_b32 v[vgprValuC+68], acc179         // copy acc to vreg[236]
v_accvgpr_read_b32 v[vgprValuC+69], acc183         // copy acc to vreg[237]
v_accvgpr_read_b32 v[vgprValuC+70], acc187         // copy acc to vreg[238]
v_accvgpr_read_b32 v[vgprValuC+71], acc191         // copy acc to vreg[239]
v_accvgpr_read_b32 v[vgprValuC+72], acc195         // copy acc to vreg[240]
v_accvgpr_read_b32 v[vgprValuC+73], acc199         // copy acc to vreg[241]
v_accvgpr_read_b32 v[vgprValuC+74], acc203         // copy acc to vreg[242]
v_accvgpr_read_b32 v[vgprValuC+75], acc207         // copy acc to vreg[243]
v_accvgpr_read_b32 v[vgprValuC+76], acc211         // copy acc to vreg[244]
v_accvgpr_read_b32 v[vgprValuC+77], acc215         // copy acc to vreg[245]
v_accvgpr_read_b32 v[vgprValuC+78], acc219         // copy acc to vreg[246]
v_accvgpr_read_b32 v[vgprValuC+79], acc223         // copy acc to vreg[247]
v_accvgpr_read_b32 v[vgprValuC+80], acc227         // copy acc to vreg[248]
v_accvgpr_read_b32 v[vgprValuC+81], acc231         // copy acc to vreg[249]
v_accvgpr_read_b32 v[vgprValuC+82], acc235         // copy acc to vreg[250]
v_accvgpr_read_b32 v[vgprValuC+83], acc239         // copy acc to vreg[251]
v_accvgpr_read_b32 v[vgprValuC+84], acc243         // copy acc to vreg[252]
v_accvgpr_read_b32 v[vgprValuC+85], acc247         // copy acc to vreg[253]
v_accvgpr_read_b32 v[vgprValuC+86], acc251         // copy acc to vreg[254]
v_accvgpr_read_b32 v[vgprValuC+87], acc255         // copy acc to vreg[255]

/* rC *= alpha batchElements=[(0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */
/* Scale all 64 values in v[vgprValuC+24 .. vgprValuC+87] by alpha, two fp32 values per instruction (32 packed   */
/* multiplies). op_sel_hi:[0,1,1] presumably makes the upper lane reuse the low dword of the alpha operand, so  */
/* both packed values get the same scalar (confirm against the ISA guide). Each multiply is in place.           */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk)

/* apply mask, calc new C and issue writes */
/* NOTE(review): v14/v15/v16 are not read by any instruction visible in this section (the bf16 packing that */
/* follows uses v_cvt_pk_bf16_f32). They may be consumed by the routine called through s[8:9] -- confirm.   */
v_mov_b32 v14, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v15, 0x7fff0000                          // fp32 NaN
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16

/* Block until the four ds_read_b128 loads of this batch (bias -> v[88:95], scaleAlpha -> v[96:103]) have landed. */
s_waitcnt lgkmcnt(0)                               // lgkmcnt(0) = 4 - 2 (bias) - 2 (scaleAlphaVec) (interleaved)
/* Epilogue for the 8 fp32 values in v[vgprValuC+24 .. vgprValuC+31]:                                        */
/*   1) multiply by the scaleAlpha vector in v[96:103], 2) add the bias vector in v[88:95] into v[4:11],     */
/*   3) call through s[8:9], 4) pack to bf16, 5) advance the D descriptor one row and store 16 bytes.        */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[96:97], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[98:99], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], v[100:101], v[vgprValuC+28:vgprValuC+28+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], v[102:103], v[vgprValuC+30:vgprValuC+30+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias
/* Indirect call: jump to the address in s[8:9], return address saved in s[66:67]. The callee is outside this  */
/* view (presumably the activation routine -- TODO confirm); the copies below take its results from v[4:11].  */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[24:31] is the same storage as v[vgprValuC+24..31] only if vgprValuC == 0 -- confirm. */
v_mov_b64 v[24:25], v[4:5]
v_mov_b64 v[26:27], v[6:7]
v_mov_b64 v[28:29], v[8:9]
v_mov_b64 v[30:31], v[10:11]
/* Pack 8 fp32 values into 4 dwords of bf16 pairs; each destination is written only after its own sources were read. */
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor
/* The descriptor base is advanced by (StrideD1J << 1) bytes BEFORE the store, with carry into the upper dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
/* 16-byte store of v[24:27] at the per-lane offset in v17 (offen), with the nt cache hint. */
buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Epilogue for the 8 fp32 values in v[vgprValuC+32 .. vgprValuC+39]:                                        */
/*   1) multiply by the scaleAlpha vector in v[96:103], 2) add the bias vector in v[88:95] into v[4:11],     */
/*   3) call through s[8:9], 4) pack to bf16, 5) advance the D descriptor one row and store 16 bytes.        */
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], v[98:99], v[vgprValuC+34:vgprValuC+34+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[100:101], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[102:103], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias
/* Indirect call: jump to the address in s[8:9], return address saved in s[66:67]. The callee is outside this  */
/* view (presumably the activation routine -- TODO confirm); the copies below take its results from v[4:11].  */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[32:39] is the same storage as v[vgprValuC+32..39] only if vgprValuC == 0 -- confirm. */
v_mov_b64 v[32:33], v[4:5]
v_mov_b64 v[34:35], v[6:7]
v_mov_b64 v[36:37], v[8:9]
v_mov_b64 v[38:39], v[10:11]
/* Pack 8 fp32 values into 4 dwords of bf16 pairs; each destination is written only after its own sources were read. */
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor
/* The descriptor base is advanced by (StrideD1J << 1) bytes BEFORE the store, with carry into the upper dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
/* 16-byte store of v[32:35] at the per-lane offset in v17 (offen), with the nt cache hint. */
buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Epilogue for the 8 fp32 values in v[vgprValuC+40 .. vgprValuC+47]:                                        */
/*   1) multiply by the scaleAlpha vector in v[96:103], 2) add the bias vector in v[88:95] into v[4:11],     */
/*   3) call through s[8:9], 4) pack to bf16, 5) advance the D descriptor one row and store 16 bytes.        */
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[98:99], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], v[100:101], v[vgprValuC+44:vgprValuC+44+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], v[102:103], v[vgprValuC+46:vgprValuC+46+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias
/* Indirect call: jump to the address in s[8:9], return address saved in s[66:67]. The callee is outside this  */
/* view (presumably the activation routine -- TODO confirm); the copies below take its results from v[4:11].  */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[40:47] is the same storage as v[vgprValuC+40..47] only if vgprValuC == 0 -- confirm. */
v_mov_b64 v[40:41], v[4:5]
v_mov_b64 v[42:43], v[6:7]
v_mov_b64 v[44:45], v[8:9]
v_mov_b64 v[46:47], v[10:11]
/* Pack 8 fp32 values into 4 dwords of bf16 pairs; each destination is written only after its own sources were read. */
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor
/* The descriptor base is advanced by (StrideD1J << 1) bytes BEFORE the store, with carry into the upper dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
/* 16-byte store of v[40:43] at the per-lane offset in v17 (offen), with the nt cache hint. */
buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Epilogue for the 8 fp32 values in v[vgprValuC+48 .. vgprValuC+55]:                                        */
/*   1) multiply by the scaleAlpha vector in v[96:103], 2) add the bias vector in v[88:95] into v[4:11],     */
/*   3) call through s[8:9], 4) pack to bf16, 5) advance the D descriptor one row and store 16 bytes.        */
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], v[98:99], v[vgprValuC+50:vgprValuC+50+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], v[100:101], v[vgprValuC+52:vgprValuC+52+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], v[102:103], v[vgprValuC+54:vgprValuC+54+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias
/* Indirect call: jump to the address in s[8:9], return address saved in s[66:67]. The callee is outside this  */
/* view (presumably the activation routine -- TODO confirm); the copies below take its results from v[4:11].  */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[48:55] is the same storage as v[vgprValuC+48..55] only if vgprValuC == 0 -- confirm. */
v_mov_b64 v[48:49], v[4:5]
v_mov_b64 v[50:51], v[6:7]
v_mov_b64 v[52:53], v[8:9]
v_mov_b64 v[54:55], v[10:11]
/* Pack 8 fp32 values into 4 dwords of bf16 pairs; each destination is written only after its own sources were read. */
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor
/* The descriptor base is advanced by (StrideD1J << 1) bytes BEFORE the store, with carry into the upper dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
/* 16-byte store of v[48:51] at the per-lane offset in v17 (offen), with the nt cache hint. */
buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Epilogue for the 8 fp32 values in v[vgprValuC+56 .. vgprValuC+63]:                                        */
/*   1) multiply by the scaleAlpha vector in v[96:103], 2) add the bias vector in v[88:95] into v[4:11],     */
/*   3) call through s[8:9], 4) pack to bf16, 5) advance the D descriptor one row and store 16 bytes.        */
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], v[98:99], v[vgprValuC+58:vgprValuC+58+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], v[100:101], v[vgprValuC+60:vgprValuC+60+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], v[102:103], v[vgprValuC+62:vgprValuC+62+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias
/* Indirect call: jump to the address in s[8:9], return address saved in s[66:67]. The callee is outside this  */
/* view (presumably the activation routine -- TODO confirm); the copies below take its results from v[4:11].  */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[56:63] is the same storage as v[vgprValuC+56..63] only if vgprValuC == 0 -- confirm. */
v_mov_b64 v[56:57], v[4:5]
v_mov_b64 v[58:59], v[6:7]
v_mov_b64 v[60:61], v[8:9]
v_mov_b64 v[62:63], v[10:11]
/* Pack 8 fp32 values into 4 dwords of bf16 pairs; each destination is written only after its own sources were read. */
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor
/* The descriptor base is advanced by (StrideD1J << 1) bytes BEFORE the store, with carry into the upper dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
/* 16-byte store of v[56:59] at the per-lane offset in v17 (offen), with the nt cache hint. */
buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Epilogue for the 8 fp32 values in v[vgprValuC+64 .. vgprValuC+71]:                                        */
/*   1) multiply by the scaleAlpha vector in v[96:103], 2) add the bias vector in v[88:95] into v[4:11],     */
/*   3) call through s[8:9], 4) pack to bf16, 5) advance the D descriptor one row and store 16 bytes.        */
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], v[98:99], v[vgprValuC+66:vgprValuC+66+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], v[100:101], v[vgprValuC+68:vgprValuC+68+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], v[102:103], v[vgprValuC+70:vgprValuC+70+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias
/* Indirect call: jump to the address in s[8:9], return address saved in s[66:67]. The callee is outside this  */
/* view (presumably the activation routine -- TODO confirm); the copies below take its results from v[4:11].  */
s_swappc_b64 s[66:67], s[8:9]
/* NOTE(review): literal v[64:71] is the same storage as v[vgprValuC+64..71] only if vgprValuC == 0 -- confirm. */
v_mov_b64 v[64:65], v[4:5]
v_mov_b64 v[66:67], v[6:7]
v_mov_b64 v[68:69], v[8:9]
v_mov_b64 v[70:71], v[10:11]
/* Pack 8 fp32 values into 4 dwords of bf16 pairs; each destination is written only after its own sources were read. */
v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor
/* The descriptor base is advanced by (StrideD1J << 1) bytes BEFORE the store, with carry into the upper dword. */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
/* 16-byte store of v[64:67] at the per-lane offset in v17 (offen), with the nt cache hint. */
buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], v[98:99], v[vgprValuC+74:vgprValuC+74+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], v[100:101], v[vgprValuC+76:vgprValuC+76+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], v[102:103], v[vgprValuC+78:vgprValuC+78+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[72:73], v[4:5]
v_mov_b64 v[74:75], v[6:7]
v_mov_b64 v[76:77], v[8:9]
v_mov_b64 v[78:79], v[10:11]
v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], v[98:99], v[vgprValuC+82:vgprValuC+82+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], v[100:101], v[vgprValuC+84:vgprValuC+84+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], v[102:103], v[vgprValuC+86:vgprValuC+86+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[80:81], v[4:5]
v_mov_b64 v[82:83], v[6:7]
v_mov_b64 v[84:85], v[8:9]
v_mov_b64 v[86:87], v[10:11]
v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[80:83], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
s_branch label_GW_End                              // jump to end
label_GW_B0_E1_N:
/* Activation dispatch for the write path that follows. */
/* sgprActivationType is compared against 3, 5, 10 and 12; a match branches to the */
/* stub that loads s[8:9] with the address of the matching label_Activation_*_VW8 */
/* routine. Any other value falls through to the None stub. */
/* Every stub rejoins at label_ActivationSetPCAddrEnd_4. s[8:9] is the register pair */
/* the s_swappc_b64 instructions in the store sequence below jump through. */
/* Overwrites: s[8:9], s65, SCC. */
s_cmpk_eq_u32 s[sgprActivationType], 3             // activationType == 3
s_cbranch_scc1 label_To_Activation_Gelu_VW8_beta_0_edge_1 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 5             // activationType == 5
s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_0_edge_1 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 10            // activationType == 10
s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_0_edge_1 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 12            // activationType == 12
s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_0_edge_1 // Branch if true
/* Default (no compare matched): point s[8:9] at label_Activation_None_VW8. */
/* All five stubs use the same 64-bit add: s[8:9] = PC of next instr + s65. */
/* NOTE(review): the literal 4 presumably compensates for where the label fixup sits */
/* inside the s_add_i32 encoding - confirm against the assembler's fixup semantics. */
label_To_Activation_None_VW8_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_None_VW8, 4        // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_4
/* activationType == 3: point s[8:9] at label_Activation_Gelu_VW8. */
label_To_Activation_Gelu_VW8_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_Gelu_VW8, 4        // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_4
/* activationType == 5: point s[8:9] at label_Activation_Relu_VW8. */
label_To_Activation_Relu_VW8_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_Relu_VW8, 4        // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_4
/* activationType == 10: point s[8:9] at label_Activation_Silu_VW8. */
label_To_Activation_Silu_VW8_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_Silu_VW8, 4        // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_4
/* activationType == 12: point s[8:9] at label_Activation_Clamp_VW8. */
label_To_Activation_Clamp_VW8_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_Clamp_VW8, 4       // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
/* NOTE(review): the branch below targets the label on the very next line, so it has */
/* no effect on control flow; it mirrors the other stubs. */
s_branch label_ActivationSetPCAddrEnd_4
label_ActivationSetPCAddrEnd_4:

/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=8 */
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8); (0,0,7,0:vw8) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Edge batch #0 address setup: for 8 consecutive rows (vc1 = 0..7) build a per-lane */
/* store offset, replaced by BufferOOB when the lane's (coord0, coord1) is out of range. */
/* Inputs read here: v0 = coord0, v1 = coord1, v2/v3 = row pointers (C / D per the */
/* existing comments), sgprSizeI/J, sgprStrideC1J/D1J, sgprWorkGroup0. */
/* Outputs: store offsets in v17, v19, v21, v23, v105, v107, v109, v111 (rows 0..7); */
/* Bias in v[88:95] and ScaleAlphaVec in v[96:103] (LDS reads still in flight on exit). */
/* Side effects: v1 is advanced by 7, v2 by 7*StrideC1J, v3 by 7*StrideD1J; */
/* s[78:79], s[82:83] and vcc are overwritten. */
/* NOTE(review): only v18 (row 0) is used as an LDS address in the visible code; the */
/* per-row copies v20, v22, v104, v106, v108, v110, v112 hold the same value (v0 and */
/* s78 do not change between rows) and are not read in this view - confirm whether */
/* later code needs them. Likewise v2 is advanced but not read in this view. */
v_mov_b32 v12, BufferOOB
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
/* s78 is reused as scratch here; the coord0 mask in s[78:79] was already consumed by the s_and_b64 above */
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v18, v0, s78
v_lshlrev_b32 v18, 0x2, v18                        // Bias address scaled by BPE
s_waitcnt lgkmcnt(0)                               // Wait for LDS write
s_barrier                                          // LDS write barrier
/* LDS layout as used here: 8 Bias floats at v18+0..31, 8 ScaleAlphaVec floats at v18+1024..1055. */
/* 1024 = 256 * 4 bytes, presumably MT0 * sizeof(float) - confirm against the LDS writer. */
ds_read_b128 v[88:91], v18 offset:0                // load Bias
ds_read_b128 v[92:95], v18 offset:16               // load Bias
ds_read_b128 v[96:99], v18 offset:1024             // load scaleAlpha
ds_read_b128 v[100:103], v18 offset:1040           // load scaleAlpha
v_add_lshl_u32 v17, v3, v0, 0x1                    // scaleToBpe: (D row ptr + coord0) * 2 bytes -> D store offset
/* keep v17 where the in-bounds mask is set, otherwise substitute BufferOOB (v12) */
v_cndmask_b32 v17, v12, v17, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v20, v0, s78
v_lshlrev_b32 v20, 0x2, v20                        // Bias address scaled by BPE
v_add_lshl_u32 v19, v3, v0, 0x1                    // scaleToBpe: (D row ptr + coord0) * 2 bytes -> D store offset
v_cndmask_b32 v19, v12, v19, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v22, v0, s78
v_lshlrev_b32 v22, 0x2, v22                        // Bias address scaled by BPE
v_add_lshl_u32 v21, v3, v0, 0x1                    // scaleToBpe: (D row ptr + coord0) * 2 bytes -> D store offset
v_cndmask_b32 v21, v12, v21, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v104, v0, s78
v_lshlrev_b32 v104, 0x2, v104                      // Bias address scaled by BPE
v_add_lshl_u32 v23, v3, v0, 0x1                    // scaleToBpe: (D row ptr + coord0) * 2 bytes -> D store offset
v_cndmask_b32 v23, v12, v23, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v106, v0, s78
v_lshlrev_b32 v106, 0x2, v106                      // Bias address scaled by BPE
v_add_lshl_u32 v105, v3, v0, 0x1                   // scaleToBpe: (D row ptr + coord0) * 2 bytes -> D store offset
v_cndmask_b32 v105, v12, v105, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v108, v0, s78
v_lshlrev_b32 v108, 0x2, v108                      // Bias address scaled by BPE
v_add_lshl_u32 v107, v3, v0, 0x1                   // scaleToBpe: (D row ptr + coord0) * 2 bytes -> D store offset
v_cndmask_b32 v107, v12, v107, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v110, v0, s78
v_lshlrev_b32 v110, 0x2, v110                      // Bias address scaled by BPE
v_add_lshl_u32 v109, v3, v0, 0x1                   // scaleToBpe: (D row ptr + coord0) * 2 bytes -> D store offset
v_cndmask_b32 v109, v12, v109, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v112, v0, s78
v_lshlrev_b32 v112, 0x2, v112                      // Bias address scaled by BPE
v_add_lshl_u32 v111, v3, v0, 0x1                   // scaleToBpe: (D row ptr + coord0) * 2 bytes -> D store offset
v_cndmask_b32 v111, v12, v111, s[82:83]            // LDD clip if OOB. offset
/* Copy 64 accumulator values into ordinary VGPRs for the epilogue math. */
/* Source registers are acc0, acc4, ..., acc252 (every 4th AccVGPR); destinations are */
/* the consecutive registers v[vgprValuC+24] .. v[vgprValuC+87]. Each group of 8 */
/* destinations is consumed as one output row by the store sequence that follows. */
v_accvgpr_read_b32 v[vgprValuC+24], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+25], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+26], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+27], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+28], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+29], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+30], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+31], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+32], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+33], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+34], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+35], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+36], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+37], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+38], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+39], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+40], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+41], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+42], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+43], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+44], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+45], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+46], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+47], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+48], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+49], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+50], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+51], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+52], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+53], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+54], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+55], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+56], acc128         // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+57], acc132         // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+58], acc136         // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+59], acc140         // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+60], acc144         // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+61], acc148         // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+62], acc152         // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+63], acc156         // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+64], acc160         // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+65], acc164         // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+66], acc168         // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+67], acc172         // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+68], acc176         // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+69], acc180         // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+70], acc184         // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+71], acc188         // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+72], acc192         // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+73], acc196         // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+74], acc200         // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+75], acc204         // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+76], acc208         // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+77], acc212         // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+78], acc216         // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+79], acc220         // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+80], acc224         // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+81], acc228         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+82], acc232         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+83], acc236         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+84], acc240         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+85], acc244         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+86], acc248         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+87], acc252         // copy acc to vreg[63]

/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0)] */
/* Multiply all 64 values in v[vgprValuC+24 .. vgprValuC+87] by alpha, two floats per */
/* instruction, in place. */
/* op_sel_hi:[0,1,1] clears the high-half select for src0, so the upper lane of each */
/* packed multiply should also read s[sgprAlpha] rather than s[sgprAlpha+1], i.e. one */
/* scalar alpha is applied to both elements - confirm against the ISA op_sel_hi rules. */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk)
/* The Bias / ScaleAlphaVec ds_read_b128 results in v[88:103] are first consumed after this wait. */
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec

/* apply mask, calc new C and issue writes */
/* Rows 0..5 of edge batch #0. Each row repeats the same six steps on 8 f32 values: */
/*   1. multiply in place by ScaleAlphaVec v[96:103]; */
/*   2. add Bias v[88:95], writing the sums to the staging registers v[4:11]; */
/*   3. s_swappc_b64 to the routine whose address is in s[8:9], with the return */
/*      address saved in s[66:67] (the routine body is outside this view; it */
/*      presumably transforms v[4:11] in place and returns via s[66:67]); */
/*   4. copy v[4:11] back over the row's 8 registers; */
/*   5. v_cvt_pk_bf16_f32 packs each f32 pair into one dword, giving 4 dwords; */
/*   6. buffer_store_dwordx4 writes those 4 dwords at the row's per-lane offset */
/*      (v17, v19, v21, v23, v105, v107 for rows 0..5). */
/* The same v[88:103] Bias / ScaleAlphaVec values are reused for every row. */
/* NOTE(review): the copy-back uses absolute register numbers (v[24:25], ...) while */
/* the surrounding math uses v[vgprValuC+24], ...; these only line up if vgprValuC */
/* is 0 - confirm against the symbol definition. */
/* NOTE(review): v14, v15 and v16 below are not read anywhere in the visible code; */
/* the bf16 conversion here is done by v_cvt_pk_bf16_f32. Confirm whether the */
/* activation routines or later code use them. */
v_mov_b32 v14, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v15, 0x7fff0000                          // fp32 Nan
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16
/* ---- row 0: values v[vgprValuC+24..31], store offset v17 ---- */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[96:97], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[98:99], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], v[100:101], v[vgprValuC+28:vgprValuC+28+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], v[102:103], v[vgprValuC+30:vgprValuC+30+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call activation routine; return address -> s[66:67]
v_mov_b64 v[24:25], v[4:5]
v_mov_b64 v[26:27], v[6:7]
v_mov_b64 v[28:29], v[8:9]
v_mov_b64 v[30:31], v[10:11]
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* ---- row 1: values v[vgprValuC+32..39], store offset v19 ---- */
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], v[98:99], v[vgprValuC+34:vgprValuC+34+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[100:101], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[102:103], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call activation routine; return address -> s[66:67]
v_mov_b64 v[32:33], v[4:5]
v_mov_b64 v[34:35], v[6:7]
v_mov_b64 v[36:37], v[8:9]
v_mov_b64 v[38:39], v[10:11]
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[32:35], v19, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* ---- row 2: values v[vgprValuC+40..47], store offset v21 ---- */
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[98:99], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], v[100:101], v[vgprValuC+44:vgprValuC+44+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], v[102:103], v[vgprValuC+46:vgprValuC+46+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call activation routine; return address -> s[66:67]
v_mov_b64 v[40:41], v[4:5]
v_mov_b64 v[42:43], v[6:7]
v_mov_b64 v[44:45], v[8:9]
v_mov_b64 v[46:47], v[10:11]
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[40:43], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* ---- row 3: values v[vgprValuC+48..55], store offset v23 ---- */
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], v[98:99], v[vgprValuC+50:vgprValuC+50+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], v[100:101], v[vgprValuC+52:vgprValuC+52+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], v[102:103], v[vgprValuC+54:vgprValuC+54+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call activation routine; return address -> s[66:67]
v_mov_b64 v[48:49], v[4:5]
v_mov_b64 v[50:51], v[6:7]
v_mov_b64 v[52:53], v[8:9]
v_mov_b64 v[54:55], v[10:11]
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[48:51], v23, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* ---- row 4: values v[vgprValuC+56..63], store offset v105 ---- */
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], v[98:99], v[vgprValuC+58:vgprValuC+58+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], v[100:101], v[vgprValuC+60:vgprValuC+60+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], v[102:103], v[vgprValuC+62:vgprValuC+62+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call activation routine; return address -> s[66:67]
v_mov_b64 v[56:57], v[4:5]
v_mov_b64 v[58:59], v[6:7]
v_mov_b64 v[60:61], v[8:9]
v_mov_b64 v[62:63], v[10:11]
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[56:59], v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* ---- row 5: values v[vgprValuC+64..71], store offset v107 ---- */
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], v[98:99], v[vgprValuC+66:vgprValuC+66+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], v[100:101], v[vgprValuC+68:vgprValuC+68+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], v[102:103], v[vgprValuC+70:vgprValuC+70+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call activation routine; return address -> s[66:67]
v_mov_b64 v[64:65], v[4:5]
v_mov_b64 v[66:67], v[6:7]
v_mov_b64 v[68:69], v[8:9]
v_mov_b64 v[70:71], v[10:11]
v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[64:67], v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], v[98:99], v[vgprValuC+74:vgprValuC+74+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], v[100:101], v[vgprValuC+76:vgprValuC+76+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], v[102:103], v[vgprValuC+78:vgprValuC+78+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[72:73], v[4:5]
v_mov_b64 v[74:75], v[6:7]
v_mov_b64 v[76:77], v[8:9]
v_mov_b64 v[78:79], v[10:11]
v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[72:75], v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], v[98:99], v[vgprValuC+82:vgprValuC+82+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], v[100:101], v[vgprValuC+84:vgprValuC+84+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], v[102:103], v[vgprValuC+86:vgprValuC+86+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[80:81], v[4:5]
v_mov_b64 v[82:83], v[6:7]
v_mov_b64 v[84:85], v[8:9]
v_mov_b64 v[86:87], v[10:11]
v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[80:83], v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */
/*    (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8); (0,0,14,0:vw8); (0,0,15,0:vw8) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/*
 * Row-by-row address setup for rows vc1 = 8..15 of this batch. Each row:
 *   - advances coord1 (v1) by one and the C / D row pointers (v2 / v3) by one stride,
 *   - builds the in-bounds mask s[82:83] = (v0 < SizeI) && (v1 < SizeJ),
 *   - computes the D byte offset (v3 + v0) << 1 and replaces it with v12 (BufferOOB)
 *     in every lane whose mask bit is clear.
 * Resulting per-row store offsets: v17, v19, v21, v23, v105, v107, v109, v111.
 * Scratch: vcc (carry-out of the coord1 add) and s[78:79]; s78 is reused for
 * wgp0*256 only after the mask has been ANDed into s[82:83].
 * Only the first row issues the LDS reads (Bias -> v[88:95], scaleAlpha -> v[96:103]).
 * The later rows recompute the same LDS address into v20, v22, v104, v106, v108,
 * v110, v112, but nothing in this batch reads those registers.
 * NOTE(review): that looks like dead work - confirm against the generator before removing.
 */
/* v12 = offset value substituted for out-of-bounds lanes by the v_cndmask_b32 below */
v_mov_b32 v12, BufferOOB
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1 (next row); carry-out lands in vcc and is not used here

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (overwrites the low half of the coord0 mask, already consumed)
v_sub_u32 v18, v0, s78                             // coord0 relative to the start of this workgroup's tile
v_lshlrev_b32 v18, 0x2, v18                        // Bias address scaled by BPE (x4 bytes)
ds_read_b128 v[88:91], v18 offset:0                // load Bias
ds_read_b128 v[92:95], v18 offset:16               // load Bias
ds_read_b128 v[96:99], v18 offset:1024             // load scaleAlpha
ds_read_b128 v[100:103], v18 offset:1040           // load scaleAlpha
v_add_lshl_u32 v17, v3, v0, 0x1                    // (coutRowPtrD + coord0) << 1: byte offset used for the D store of this row
v_cndmask_b32 v17, v12, v17, s[82:83]              // keep the offset where in-bounds, else BufferOOB (v12)
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1 (next row); carry-out lands in vcc and is not used here

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v20, v0, s78                             // same value as v18 above; v20 is not read in this batch
v_lshlrev_b32 v20, 0x2, v20                        // Bias address scaled by BPE
v_add_lshl_u32 v19, v3, v0, 0x1                    // (coutRowPtrD + coord0) << 1: byte offset used for the D store of this row
v_cndmask_b32 v19, v12, v19, s[82:83]              // keep the offset where in-bounds, else BufferOOB (v12)
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1 (next row); carry-out lands in vcc and is not used here

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v22, v0, s78                             // same value as v18 above; v22 is not read in this batch
v_lshlrev_b32 v22, 0x2, v22                        // Bias address scaled by BPE
v_add_lshl_u32 v21, v3, v0, 0x1                    // (coutRowPtrD + coord0) << 1: byte offset used for the D store of this row
v_cndmask_b32 v21, v12, v21, s[82:83]              // keep the offset where in-bounds, else BufferOOB (v12)
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1 (next row); carry-out lands in vcc and is not used here

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v104, v0, s78                            // same value as v18 above; v104 is not read in this batch
v_lshlrev_b32 v104, 0x2, v104                      // Bias address scaled by BPE
v_add_lshl_u32 v23, v3, v0, 0x1                    // (coutRowPtrD + coord0) << 1: byte offset used for the D store of this row
v_cndmask_b32 v23, v12, v23, s[82:83]              // keep the offset where in-bounds, else BufferOOB (v12)
/* (d1,vc1,d0,vc0)=(0,12,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1 (next row); carry-out lands in vcc and is not used here

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v106, v0, s78                            // same value as v18 above; v106 is not read in this batch
v_lshlrev_b32 v106, 0x2, v106                      // Bias address scaled by BPE
v_add_lshl_u32 v105, v3, v0, 0x1                   // (coutRowPtrD + coord0) << 1: byte offset used for the D store of this row
v_cndmask_b32 v105, v12, v105, s[82:83]            // keep the offset where in-bounds, else BufferOOB (v12)
/* (d1,vc1,d0,vc0)=(0,13,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1 (next row); carry-out lands in vcc and is not used here

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v108, v0, s78                            // same value as v18 above; v108 is not read in this batch
v_lshlrev_b32 v108, 0x2, v108                      // Bias address scaled by BPE
v_add_lshl_u32 v107, v3, v0, 0x1                   // (coutRowPtrD + coord0) << 1: byte offset used for the D store of this row
v_cndmask_b32 v107, v12, v107, s[82:83]            // keep the offset where in-bounds, else BufferOOB (v12)
/* (d1,vc1,d0,vc0)=(0,14,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1 (next row); carry-out lands in vcc and is not used here

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v110, v0, s78                            // same value as v18 above; v110 is not read in this batch
v_lshlrev_b32 v110, 0x2, v110                      // Bias address scaled by BPE
v_add_lshl_u32 v109, v3, v0, 0x1                   // (coutRowPtrD + coord0) << 1: byte offset used for the D store of this row
v_cndmask_b32 v109, v12, v109, s[82:83]            // keep the offset where in-bounds, else BufferOOB (v12)
/* (d1,vc1,d0,vc0)=(0,15,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1 (next row); carry-out lands in vcc and is not used here

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v112, v0, s78                            // same value as v18 above; v112 is not read in this batch
v_lshlrev_b32 v112, 0x2, v112                      // Bias address scaled by BPE
v_add_lshl_u32 v111, v3, v0, 0x1                   // (coutRowPtrD + coord0) << 1: byte offset used for the D store of this row
v_cndmask_b32 v111, v12, v111, s[82:83]            // keep the offset where in-bounds, else BufferOOB (v12)
/*
 * Copy the 64 accumulator values of this batch into regular VGPRs:
 * acc1, acc5, ..., acc253 (every 4th accumulator, starting at 1) go to
 * v[vgprValuC+24] .. v[vgprValuC+87], eight consecutive VGPRs per output row.
 * The vreg[N] number in the per-line comments (64..127) is not the vgprValuC
 * offset - presumably it is the running element index across batches; TODO confirm.
 */
v_accvgpr_read_b32 v[vgprValuC+24], acc1           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+25], acc5           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+26], acc9           // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+27], acc13          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+28], acc17          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+29], acc21          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+30], acc25          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+31], acc29          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+32], acc33          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+33], acc37          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+34], acc41          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+35], acc45          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+36], acc49          // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+37], acc53          // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+38], acc57          // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+39], acc61          // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+40], acc65          // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+41], acc69          // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+42], acc73          // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+43], acc77          // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+44], acc81          // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+45], acc85          // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+46], acc89          // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+47], acc93          // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+48], acc97          // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+49], acc101         // copy acc to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+50], acc105         // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+51], acc109         // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+52], acc113         // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+53], acc117         // copy acc to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+54], acc121         // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+55], acc125         // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+56], acc129         // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+57], acc133         // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+58], acc137         // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+59], acc141         // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+60], acc145         // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+61], acc149         // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+62], acc153         // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+63], acc157         // copy acc to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+64], acc161         // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+65], acc165         // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+66], acc169         // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+67], acc173         // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+68], acc177         // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+69], acc181         // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+70], acc185         // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+71], acc189         // copy acc to vreg[111]
v_accvgpr_read_b32 v[vgprValuC+72], acc193         // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+73], acc197         // copy acc to vreg[113]
v_accvgpr_read_b32 v[vgprValuC+74], acc201         // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+75], acc205         // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+76], acc209         // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+77], acc213         // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+78], acc217         // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+79], acc221         // copy acc to vreg[119]
v_accvgpr_read_b32 v[vgprValuC+80], acc225         // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+81], acc229         // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+82], acc233         // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+83], acc237         // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+84], acc241         // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+85], acc245         // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+86], acc249         // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+87], acc253         // copy acc to vreg[127]

/* rC *= alpha batchElements=[(0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0)] */
/*
 * Multiply all 64 values of the batch (v[vgprValuC+24 .. vgprValuC+87]) by Alpha,
 * two fp32 values per packed instruction. op_sel_hi:[0,1,1] clears the high-lane
 * select for source 0, so the upper lane also takes the low dword of the SGPR
 * pair: the single scalar in s[sgprAlpha] is applied to both lanes.
 * The trailing s_waitcnt then blocks until the outstanding LDS reads have returned.
 */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk)
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec (v[88:95], v[96:103]) before they are consumed below

/* apply mask, calc new C and issue writes */
/*
 * Epilogue for the 8 rows of this batch. Each row holds 8 fp32 values (four packed pairs):
 *   1. multiply by the ScaleAlphaVec values held in v[96:103],
 *   2. add the Bias values held in v[88:95], writing the sums to v[4:11],
 *   3. s_swappc_b64: jump to the address in s[8:9], saving the return PC in s[66:67].
 *      The callee is not visible here - presumably the activation routine, working
 *      in place on v[4:11]; TODO confirm its register contract and clobbers,
 *   4. copy v[4:11] back and convert adjacent fp32 pairs to packed bf16 (4 dwords),
 *   5. store those 16 bytes to D at the row's precomputed offset VGPR, with the nt bit set.
 * NOTE(review): the v_mov_b64 destinations, v_cvt destinations and store data use
 * literal register numbers while the sources use v[vgprValuC+N]; they refer to the
 * same registers only if vgprValuC is 0 - confirm.
 * NOTE(review): v14, v15 and v16 are not read anywhere in the visible code of this
 * batch (the conversion uses v_cvt_pk_bf16_f32); they may be leftovers or inputs to
 * the routine at s[8:9] - confirm.
 */
v_mov_b32 v14, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v15, 0x7fff0000                          // fp32 Nan
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16
/* row vc1=8: values v[vgprValuC+24..31], stored at offset v17 */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[96:97], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[98:99], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], v[100:101], v[vgprValuC+28:vgprValuC+28+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], v[102:103], v[vgprValuC+30:vgprValuC+30+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias
/* indirect call: PC <- s[8:9], return address saved in s[66:67] */
s_swappc_b64 s[66:67], s[8:9]
/* copy the (possibly modified) v[4:11] back into this row's value registers */
v_mov_b64 v[24:25], v[4:5]
v_mov_b64 v[26:27], v[6:7]
v_mov_b64 v[28:29], v[8:9]
v_mov_b64 v[30:31], v[10:11]
/* pack in ascending order: each destination is written only after its own fp32 value has been consumed */
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=9: values v[vgprValuC+32..39], stored at offset v19 */
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], v[98:99], v[vgprValuC+34:vgprValuC+34+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[100:101], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[102:103], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[32:33], v[4:5]
v_mov_b64 v[34:35], v[6:7]
v_mov_b64 v[36:37], v[8:9]
v_mov_b64 v[38:39], v[10:11]
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[32:35], v19, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=10: values v[vgprValuC+40..47], stored at offset v21 */
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[98:99], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], v[100:101], v[vgprValuC+44:vgprValuC+44+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], v[102:103], v[vgprValuC+46:vgprValuC+46+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[40:41], v[4:5]
v_mov_b64 v[42:43], v[6:7]
v_mov_b64 v[44:45], v[8:9]
v_mov_b64 v[46:47], v[10:11]
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[40:43], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=11: values v[vgprValuC+48..55], stored at offset v23 */
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], v[98:99], v[vgprValuC+50:vgprValuC+50+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], v[100:101], v[vgprValuC+52:vgprValuC+52+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], v[102:103], v[vgprValuC+54:vgprValuC+54+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[48:49], v[4:5]
v_mov_b64 v[50:51], v[6:7]
v_mov_b64 v[52:53], v[8:9]
v_mov_b64 v[54:55], v[10:11]
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[48:51], v23, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=12: values v[vgprValuC+56..63], stored at offset v105 */
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], v[98:99], v[vgprValuC+58:vgprValuC+58+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], v[100:101], v[vgprValuC+60:vgprValuC+60+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], v[102:103], v[vgprValuC+62:vgprValuC+62+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[56:57], v[4:5]
v_mov_b64 v[58:59], v[6:7]
v_mov_b64 v[60:61], v[8:9]
v_mov_b64 v[62:63], v[10:11]
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[56:59], v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=13: values v[vgprValuC+64..71], stored at offset v107 */
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], v[98:99], v[vgprValuC+66:vgprValuC+66+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], v[100:101], v[vgprValuC+68:vgprValuC+68+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], v[102:103], v[vgprValuC+70:vgprValuC+70+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[64:65], v[4:5]
v_mov_b64 v[66:67], v[6:7]
v_mov_b64 v[68:69], v[8:9]
v_mov_b64 v[70:71], v[10:11]
v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[64:67], v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=14: values v[vgprValuC+72..79], stored at offset v109 */
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], v[98:99], v[vgprValuC+74:vgprValuC+74+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], v[100:101], v[vgprValuC+76:vgprValuC+76+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], v[102:103], v[vgprValuC+78:vgprValuC+78+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[72:73], v[4:5]
v_mov_b64 v[74:75], v[6:7]
v_mov_b64 v[76:77], v[8:9]
v_mov_b64 v[78:79], v[10:11]
v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[72:75], v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=15: values v[vgprValuC+80..87], stored at offset v111 */
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], v[98:99], v[vgprValuC+82:vgprValuC+82+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], v[100:101], v[vgprValuC+84:vgprValuC+84+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], v[102:103], v[vgprValuC+86:vgprValuC+86+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[80:81], v[4:5]
v_mov_b64 v[82:83], v[6:7]
v_mov_b64 v[84:85], v[8:9]
v_mov_b64 v[86:87], v[10:11]
v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[80:83], v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #2 (d1,d0,vc1,vc0) = */
/*    (0,0,16,0:vw8); (0,0,17,0:vw8); (0,0,18,0:vw8); (0,0,19,0:vw8); (0,0,20,0:vw8); (0,0,21,0:vw8); (0,0,22,0:vw8); (0,0,23,0:vw8) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* For each of the 8 elements of this batch (vc1 = 16..23) the sequence below: */
/*   1. advances coord1 (v1) by one row and steps the row pointers v2 / v3 by StrideC1J / StrideD1J; */
/*   2. builds the in-bounds mask s[82:83] = (v0 < SizeI) && (v1 < SizeJ); */
/*   3. forms the byte offset (v3 + v0) << 1 and replaces it with BufferOOB (v12) in lanes where the mask is clear. */
/* Resulting per-row offset VGPRs, in row order: v17, v19, v21, v23, v105, v107, v109, v111 */
/* (these are the offsets consumed by the buffer_store_dwordx4 to SrdD later in this batch). */
/* Only the first element issues LDS reads: Bias -> v[88:95], scaleAlpha -> v[96:103], both addressed by v18. */
/* s_mul_i32 writes s78, i.e. the low half of the s[78:79] compare mask; the mask has already been */
/* consumed by the preceding s_and_b64, and s[78:79] is rewritten by the next element's v_cmp. */
/* NOTE(review): v2 and the per-element "Bias address" registers v20, v22, v104, v106, v108, v110, v112 */
/* are written here but not read by any instruction visible in this batch - confirm whether they are needed. */
v_mov_b32 v12, BufferOOB
/* (d1,vc1,d0,vc0)=(0,16,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v18, v0, s78                             // v18 = coord0 - wgp0*MT0
v_lshlrev_b32 v18, 0x2, v18                        // Bias address scaled by BPE
ds_read_b128 v[88:91], v18 offset:0                // load Bias
ds_read_b128 v[92:95], v18 offset:16               // load Bias
ds_read_b128 v[96:99], v18 offset:1024             // load scaleAlpha
ds_read_b128 v[100:103], v18 offset:1040           // load scaleAlpha
v_add_lshl_u32 v17, v3, v0, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v17, v12, v17, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,17,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v20, v0, s78                             // v20 = coord0 - wgp0*MT0
v_lshlrev_b32 v20, 0x2, v20                        // Bias address scaled by BPE
v_add_lshl_u32 v19, v3, v0, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v19, v12, v19, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,18,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v22, v0, s78                             // v22 = coord0 - wgp0*MT0
v_lshlrev_b32 v22, 0x2, v22                        // Bias address scaled by BPE
v_add_lshl_u32 v21, v3, v0, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v21, v12, v21, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,19,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v104, v0, s78                            // v104 = coord0 - wgp0*MT0
v_lshlrev_b32 v104, 0x2, v104                      // Bias address scaled by BPE
v_add_lshl_u32 v23, v3, v0, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v23, v12, v23, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,20,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v106, v0, s78                            // v106 = coord0 - wgp0*MT0
v_lshlrev_b32 v106, 0x2, v106                      // Bias address scaled by BPE
v_add_lshl_u32 v105, v3, v0, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v105, v12, v105, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,21,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v108, v0, s78                            // v108 = coord0 - wgp0*MT0
v_lshlrev_b32 v108, 0x2, v108                      // Bias address scaled by BPE
v_add_lshl_u32 v107, v3, v0, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v107, v12, v107, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,22,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v110, v0, s78                            // v110 = coord0 - wgp0*MT0
v_lshlrev_b32 v110, 0x2, v110                      // Bias address scaled by BPE
v_add_lshl_u32 v109, v3, v0, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v109, v12, v109, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,23,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v112, v0, s78                            // v112 = coord0 - wgp0*MT0
v_lshlrev_b32 v112, 0x2, v112                      // Bias address scaled by BPE
v_add_lshl_u32 v111, v3, v0, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v111, v12, v111, s[82:83]            // LDD clip if OOB. offset
/* Copy this batch's 64 accumulator values into arch VGPRs: */
/* acc2, acc6, ..., acc254 (every 4th accumulator, starting at acc2) -> v[vgprValuC+24 .. vgprValuC+87]. */
/* Each group of 8 consecutive destinations (+24..+31, +32..+39, ...) is one output row of the batch. */
/* NOTE(review): the vreg[128..191] indices in the trailing comments are generator-emitted and do not */
/* literally match the vgprValuC+24..+87 destination operands - confirm what they index. */
v_accvgpr_read_b32 v[vgprValuC+24], acc2           // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+25], acc6           // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+26], acc10          // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+27], acc14          // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+28], acc18          // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+29], acc22          // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+30], acc26          // copy acc to vreg[134]
v_accvgpr_read_b32 v[vgprValuC+31], acc30          // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+32], acc34          // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+33], acc38          // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+34], acc42          // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+35], acc46          // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+36], acc50          // copy acc to vreg[140]
v_accvgpr_read_b32 v[vgprValuC+37], acc54          // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+38], acc58          // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+39], acc62          // copy acc to vreg[143]
v_accvgpr_read_b32 v[vgprValuC+40], acc66          // copy acc to vreg[144]
v_accvgpr_read_b32 v[vgprValuC+41], acc70          // copy acc to vreg[145]
v_accvgpr_read_b32 v[vgprValuC+42], acc74          // copy acc to vreg[146]
v_accvgpr_read_b32 v[vgprValuC+43], acc78          // copy acc to vreg[147]
v_accvgpr_read_b32 v[vgprValuC+44], acc82          // copy acc to vreg[148]
v_accvgpr_read_b32 v[vgprValuC+45], acc86          // copy acc to vreg[149]
v_accvgpr_read_b32 v[vgprValuC+46], acc90          // copy acc to vreg[150]
v_accvgpr_read_b32 v[vgprValuC+47], acc94          // copy acc to vreg[151]
v_accvgpr_read_b32 v[vgprValuC+48], acc98          // copy acc to vreg[152]
v_accvgpr_read_b32 v[vgprValuC+49], acc102         // copy acc to vreg[153]
v_accvgpr_read_b32 v[vgprValuC+50], acc106         // copy acc to vreg[154]
v_accvgpr_read_b32 v[vgprValuC+51], acc110         // copy acc to vreg[155]
v_accvgpr_read_b32 v[vgprValuC+52], acc114         // copy acc to vreg[156]
v_accvgpr_read_b32 v[vgprValuC+53], acc118         // copy acc to vreg[157]
v_accvgpr_read_b32 v[vgprValuC+54], acc122         // copy acc to vreg[158]
v_accvgpr_read_b32 v[vgprValuC+55], acc126         // copy acc to vreg[159]
v_accvgpr_read_b32 v[vgprValuC+56], acc130         // copy acc to vreg[160]
v_accvgpr_read_b32 v[vgprValuC+57], acc134         // copy acc to vreg[161]
v_accvgpr_read_b32 v[vgprValuC+58], acc138         // copy acc to vreg[162]
v_accvgpr_read_b32 v[vgprValuC+59], acc142         // copy acc to vreg[163]
v_accvgpr_read_b32 v[vgprValuC+60], acc146         // copy acc to vreg[164]
v_accvgpr_read_b32 v[vgprValuC+61], acc150         // copy acc to vreg[165]
v_accvgpr_read_b32 v[vgprValuC+62], acc154         // copy acc to vreg[166]
v_accvgpr_read_b32 v[vgprValuC+63], acc158         // copy acc to vreg[167]
v_accvgpr_read_b32 v[vgprValuC+64], acc162         // copy acc to vreg[168]
v_accvgpr_read_b32 v[vgprValuC+65], acc166         // copy acc to vreg[169]
v_accvgpr_read_b32 v[vgprValuC+66], acc170         // copy acc to vreg[170]
v_accvgpr_read_b32 v[vgprValuC+67], acc174         // copy acc to vreg[171]
v_accvgpr_read_b32 v[vgprValuC+68], acc178         // copy acc to vreg[172]
v_accvgpr_read_b32 v[vgprValuC+69], acc182         // copy acc to vreg[173]
v_accvgpr_read_b32 v[vgprValuC+70], acc186         // copy acc to vreg[174]
v_accvgpr_read_b32 v[vgprValuC+71], acc190         // copy acc to vreg[175]
v_accvgpr_read_b32 v[vgprValuC+72], acc194         // copy acc to vreg[176]
v_accvgpr_read_b32 v[vgprValuC+73], acc198         // copy acc to vreg[177]
v_accvgpr_read_b32 v[vgprValuC+74], acc202         // copy acc to vreg[178]
v_accvgpr_read_b32 v[vgprValuC+75], acc206         // copy acc to vreg[179]
v_accvgpr_read_b32 v[vgprValuC+76], acc210         // copy acc to vreg[180]
v_accvgpr_read_b32 v[vgprValuC+77], acc214         // copy acc to vreg[181]
v_accvgpr_read_b32 v[vgprValuC+78], acc218         // copy acc to vreg[182]
v_accvgpr_read_b32 v[vgprValuC+79], acc222         // copy acc to vreg[183]
v_accvgpr_read_b32 v[vgprValuC+80], acc226         // copy acc to vreg[184]
v_accvgpr_read_b32 v[vgprValuC+81], acc230         // copy acc to vreg[185]
v_accvgpr_read_b32 v[vgprValuC+82], acc234         // copy acc to vreg[186]
v_accvgpr_read_b32 v[vgprValuC+83], acc238         // copy acc to vreg[187]
v_accvgpr_read_b32 v[vgprValuC+84], acc242         // copy acc to vreg[188]
v_accvgpr_read_b32 v[vgprValuC+85], acc246         // copy acc to vreg[189]
v_accvgpr_read_b32 v[vgprValuC+86], acc250         // copy acc to vreg[190]
v_accvgpr_read_b32 v[vgprValuC+87], acc254         // copy acc to vreg[191]

/* rC *= alpha batchElements=[(0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0)] */
/* Multiplies all 64 values of this batch, v[vgprValuC+24 .. vgprValuC+87], by alpha, two floats per instruction. */
/* op_sel_hi:[0,1,1] clears the high-lane select for src0 only; per the packed-op op_sel_hi semantics this makes */
/* both lanes use the low dword of the SGPR pair, i.e. s[sgprAlpha] - NOTE(review): confirm against the ISA guide. */
/* The trailing s_waitcnt lgkmcnt(0) blocks until the Bias / scaleAlpha ds_read_b128 results have landed. */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk)
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec

/* apply mask, calc new C and issue writes */
/* The same 8-value pipeline is repeated for each of the 8 rows of this batch (base N = 24, 32, ..., 80): */
/*   1. v[vgprValuC+N .. +N+7] *= scaleAlphaVec held in v[96:103]; */
/*   2. v[4:11] = bias v[88:95] + those 8 values; */
/*   3. s_swappc_b64 jumps to the address in s[8:9] and stores the return address in s[66:67] */
/*      (presumably the activation routine operating on v[4:11] - TODO confirm, callee not visible here); */
/*   4. v[4:11] is copied back to v[N .. N+7], packed pairwise to bf16 into v[N .. N+3], */
/*      and written with one buffer_store_dwordx4 to SrdD at the row's precomputed offset VGPR. */
/* NOTE(review): the v_mov_b64 / v_cvt destinations / store sources use raw register numbers (v[24:25], v24, ...) */
/* while the v_cvt sources use v[vgprValuC+N]; these only line up if vgprValuC is 0 - confirm against its .set. */
/* NOTE(review): v14, v15, v16 are loaded below but not read by any instruction visible in this batch */
/* (the bf16 conversion uses v_cvt_pk_bf16_f32) - confirm whether the s[8:9] callee relies on them. */
v_mov_b32 v14, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v15, 0x7fff0000                          // fp32 Nan
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16
/* row vc1=16: values +24..+31, store offset v17 */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[96:97], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[98:99], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], v[100:101], v[vgprValuC+28:vgprValuC+28+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], v[102:103], v[vgprValuC+30:vgprValuC+30+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call routine at s[8:9]; return address saved in s[66:67]
v_mov_b64 v[24:25], v[4:5]                         // copy v[4:11] back into the row's registers
v_mov_b64 v[26:27], v[6:7]
v_mov_b64 v[28:29], v[8:9]
v_mov_b64 v[30:31], v[10:11]
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=17: values +32..+39, store offset v19 */
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], v[98:99], v[vgprValuC+34:vgprValuC+34+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[100:101], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[102:103], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[32:33], v[4:5]
v_mov_b64 v[34:35], v[6:7]
v_mov_b64 v[36:37], v[8:9]
v_mov_b64 v[38:39], v[10:11]
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[32:35], v19, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=18: values +40..+47, store offset v21 */
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[98:99], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], v[100:101], v[vgprValuC+44:vgprValuC+44+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], v[102:103], v[vgprValuC+46:vgprValuC+46+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[40:41], v[4:5]
v_mov_b64 v[42:43], v[6:7]
v_mov_b64 v[44:45], v[8:9]
v_mov_b64 v[46:47], v[10:11]
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[40:43], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=19: values +48..+55, store offset v23 */
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], v[98:99], v[vgprValuC+50:vgprValuC+50+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], v[100:101], v[vgprValuC+52:vgprValuC+52+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], v[102:103], v[vgprValuC+54:vgprValuC+54+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[48:49], v[4:5]
v_mov_b64 v[50:51], v[6:7]
v_mov_b64 v[52:53], v[8:9]
v_mov_b64 v[54:55], v[10:11]
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[48:51], v23, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=20: values +56..+63, store offset v105 */
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], v[98:99], v[vgprValuC+58:vgprValuC+58+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], v[100:101], v[vgprValuC+60:vgprValuC+60+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], v[102:103], v[vgprValuC+62:vgprValuC+62+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[56:57], v[4:5]
v_mov_b64 v[58:59], v[6:7]
v_mov_b64 v[60:61], v[8:9]
v_mov_b64 v[62:63], v[10:11]
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[56:59], v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=21: values +64..+71, store offset v107 */
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], v[98:99], v[vgprValuC+66:vgprValuC+66+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], v[100:101], v[vgprValuC+68:vgprValuC+68+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], v[102:103], v[vgprValuC+70:vgprValuC+70+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[64:65], v[4:5]
v_mov_b64 v[66:67], v[6:7]
v_mov_b64 v[68:69], v[8:9]
v_mov_b64 v[70:71], v[10:11]
v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[64:67], v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=22: values +72..+79, store offset v109 */
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], v[98:99], v[vgprValuC+74:vgprValuC+74+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], v[100:101], v[vgprValuC+76:vgprValuC+76+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], v[102:103], v[vgprValuC+78:vgprValuC+78+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[72:73], v[4:5]
v_mov_b64 v[74:75], v[6:7]
v_mov_b64 v[76:77], v[8:9]
v_mov_b64 v[78:79], v[10:11]
v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[72:75], v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=23: values +80..+87, store offset v111 */
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], v[98:99], v[vgprValuC+82:vgprValuC+82+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], v[100:101], v[vgprValuC+84:vgprValuC+84+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], v[102:103], v[vgprValuC+86:vgprValuC+86+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[80:81], v[4:5]
v_mov_b64 v[82:83], v[6:7]
v_mov_b64 v[84:85], v[8:9]
v_mov_b64 v[86:87], v[10:11]
v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[80:83], v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #3 (d1,d0,vc1,vc0) = */
/*    (0,0,24,0:vw8); (0,0,25,0:vw8); (0,0,26,0:vw8); (0,0,27,0:vw8); (0,0,28,0:vw8); (0,0,29,0:vw8); (0,0,30,0:vw8); (0,0,31,0:vw8) */
/******************************************/
/* Phase 1 of the batch: per-row address and mask setup.                               */
/* For each of the eight rows (vc1 = 24..31) the code below                            */
/*   - advances coord1 (v1) by one and the row pointers v2 / v3 by one row stride,     */
/*   - builds the in-range mask (v0 < SizeI) && (v1 < SizeJ) in s[82:83],              */
/*   - forms the byte offset (v3 + v0) << 1 and replaces it with BufferOOB (v12)       */
/*     when the mask is false.                                                         */
/* Resulting offsets: v17, v19, v21, v23, v105, v107, v109, v111 (rows 24..31).        */
/* Only the first row issues LDS reads (bias -> v[88:95], scaleAlpha -> v[96:103]).    */
/* NOTE(review): rows 25..31 still compute an LDS address (v20, v22, v104, v106, v108, */
/* v110, v112) that no instruction in this batch reads; looks like redundant generated */
/* work - confirm nothing after label_GW_End relies on it before removing.             */

/* calc coords, apply mask, and issue loads (if necessary) */
v_mov_b32 v12, BufferOOB                           // offset substituted by v_cndmask below when an element is out of range
/* (d1,vc1,d0,vc0)=(0,24,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 (vcc carry-out is not read in this batch)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s78 reused as scratch now that the coord0 mask is folded into s[82:83])
v_sub_u32 v18, v0, s78                             // v18 = coord0 - wgp0*MT0
v_lshlrev_b32 v18, 0x2, v18                        // Bias address scaled by BPE
ds_read_b128 v[88:91], v18 offset:0                // load Bias
ds_read_b128 v[92:95], v18 offset:16               // load Bias
ds_read_b128 v[96:99], v18 offset:1024             // load scaleAlpha
ds_read_b128 v[100:103], v18 offset:1040           // load scaleAlpha
v_add_lshl_u32 v17, v3, v0, 0x1                    // v17 = (coutRowPtrD + coord0) << 1: byte offset later used by the D store
v_cndmask_b32 v17, v12, v17, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,25,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v20, v0, s78
v_lshlrev_b32 v20, 0x2, v20                        // Bias address scaled by BPE (v20 is not read in this batch)
v_add_lshl_u32 v19, v3, v0, 0x1                    // v19 = (coutRowPtrD + coord0) << 1: byte offset later used by the D store
v_cndmask_b32 v19, v12, v19, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,26,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v22, v0, s78
v_lshlrev_b32 v22, 0x2, v22                        // Bias address scaled by BPE (v22 is not read in this batch)
v_add_lshl_u32 v21, v3, v0, 0x1                    // v21 = (coutRowPtrD + coord0) << 1: byte offset later used by the D store
v_cndmask_b32 v21, v12, v21, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,27,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v104, v0, s78
v_lshlrev_b32 v104, 0x2, v104                      // Bias address scaled by BPE (v104 is not read in this batch)
v_add_lshl_u32 v23, v3, v0, 0x1                    // v23 = (coutRowPtrD + coord0) << 1: byte offset later used by the D store
v_cndmask_b32 v23, v12, v23, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,28,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v106, v0, s78
v_lshlrev_b32 v106, 0x2, v106                      // Bias address scaled by BPE (v106 is not read in this batch)
v_add_lshl_u32 v105, v3, v0, 0x1                   // v105 = (coutRowPtrD + coord0) << 1: byte offset later used by the D store
v_cndmask_b32 v105, v12, v105, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v108, v0, s78
v_lshlrev_b32 v108, 0x2, v108                      // Bias address scaled by BPE (v108 is not read in this batch)
v_add_lshl_u32 v107, v3, v0, 0x1                   // v107 = (coutRowPtrD + coord0) << 1: byte offset later used by the D store
v_cndmask_b32 v107, v12, v107, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,30,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v110, v0, s78
v_lshlrev_b32 v110, 0x2, v110                      // Bias address scaled by BPE (v110 is not read in this batch)
v_add_lshl_u32 v109, v3, v0, 0x1                   // v109 = (coutRowPtrD + coord0) << 1: byte offset later used by the D store
v_cndmask_b32 v109, v12, v109, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,31,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v112, v0, s78
v_lshlrev_b32 v112, 0x2, v112                      // Bias address scaled by BPE (v112 is not read in this batch)
v_add_lshl_u32 v111, v3, v0, 0x1                   // v111 = (coutRowPtrD + coord0) << 1: byte offset later used by the D store
v_cndmask_b32 v111, v12, v111, s[82:83]            // LDD clip if OOB. offset
/* Phase 2 of the batch: copy 64 accumulator values into ordinary VGPRs.               */
/* Sources are acc3, acc7, ..., acc255 (every fourth AccVGPR); destinations are the    */
/* consecutive registers v[vgprValuC+24] .. v[vgprValuC+87], i.e. 8 values per row for */
/* the eight rows of this batch.                                                       */
/* NOTE(review): the bracketed number in each trailing comment (192..255) does not     */
/* match the destination operand; presumably it is the generator's flattened element   */
/* index rather than a register number - TODO confirm.                                 */
v_accvgpr_read_b32 v[vgprValuC+24], acc3           // copy acc to vreg[192]
v_accvgpr_read_b32 v[vgprValuC+25], acc7           // copy acc to vreg[193]
v_accvgpr_read_b32 v[vgprValuC+26], acc11          // copy acc to vreg[194]
v_accvgpr_read_b32 v[vgprValuC+27], acc15          // copy acc to vreg[195]
v_accvgpr_read_b32 v[vgprValuC+28], acc19          // copy acc to vreg[196]
v_accvgpr_read_b32 v[vgprValuC+29], acc23          // copy acc to vreg[197]
v_accvgpr_read_b32 v[vgprValuC+30], acc27          // copy acc to vreg[198]
v_accvgpr_read_b32 v[vgprValuC+31], acc31          // copy acc to vreg[199]
v_accvgpr_read_b32 v[vgprValuC+32], acc35          // copy acc to vreg[200]
v_accvgpr_read_b32 v[vgprValuC+33], acc39          // copy acc to vreg[201]
v_accvgpr_read_b32 v[vgprValuC+34], acc43          // copy acc to vreg[202]
v_accvgpr_read_b32 v[vgprValuC+35], acc47          // copy acc to vreg[203]
v_accvgpr_read_b32 v[vgprValuC+36], acc51          // copy acc to vreg[204]
v_accvgpr_read_b32 v[vgprValuC+37], acc55          // copy acc to vreg[205]
v_accvgpr_read_b32 v[vgprValuC+38], acc59          // copy acc to vreg[206]
v_accvgpr_read_b32 v[vgprValuC+39], acc63          // copy acc to vreg[207]
v_accvgpr_read_b32 v[vgprValuC+40], acc67          // copy acc to vreg[208]
v_accvgpr_read_b32 v[vgprValuC+41], acc71          // copy acc to vreg[209]
v_accvgpr_read_b32 v[vgprValuC+42], acc75          // copy acc to vreg[210]
v_accvgpr_read_b32 v[vgprValuC+43], acc79          // copy acc to vreg[211]
v_accvgpr_read_b32 v[vgprValuC+44], acc83          // copy acc to vreg[212]
v_accvgpr_read_b32 v[vgprValuC+45], acc87          // copy acc to vreg[213]
v_accvgpr_read_b32 v[vgprValuC+46], acc91          // copy acc to vreg[214]
v_accvgpr_read_b32 v[vgprValuC+47], acc95          // copy acc to vreg[215]
v_accvgpr_read_b32 v[vgprValuC+48], acc99          // copy acc to vreg[216]
v_accvgpr_read_b32 v[vgprValuC+49], acc103         // copy acc to vreg[217]
v_accvgpr_read_b32 v[vgprValuC+50], acc107         // copy acc to vreg[218]
v_accvgpr_read_b32 v[vgprValuC+51], acc111         // copy acc to vreg[219]
v_accvgpr_read_b32 v[vgprValuC+52], acc115         // copy acc to vreg[220]
v_accvgpr_read_b32 v[vgprValuC+53], acc119         // copy acc to vreg[221]
v_accvgpr_read_b32 v[vgprValuC+54], acc123         // copy acc to vreg[222]
v_accvgpr_read_b32 v[vgprValuC+55], acc127         // copy acc to vreg[223]
v_accvgpr_read_b32 v[vgprValuC+56], acc131         // copy acc to vreg[224]
v_accvgpr_read_b32 v[vgprValuC+57], acc135         // copy acc to vreg[225]
v_accvgpr_read_b32 v[vgprValuC+58], acc139         // copy acc to vreg[226]
v_accvgpr_read_b32 v[vgprValuC+59], acc143         // copy acc to vreg[227]
v_accvgpr_read_b32 v[vgprValuC+60], acc147         // copy acc to vreg[228]
v_accvgpr_read_b32 v[vgprValuC+61], acc151         // copy acc to vreg[229]
v_accvgpr_read_b32 v[vgprValuC+62], acc155         // copy acc to vreg[230]
v_accvgpr_read_b32 v[vgprValuC+63], acc159         // copy acc to vreg[231]
v_accvgpr_read_b32 v[vgprValuC+64], acc163         // copy acc to vreg[232]
v_accvgpr_read_b32 v[vgprValuC+65], acc167         // copy acc to vreg[233]
v_accvgpr_read_b32 v[vgprValuC+66], acc171         // copy acc to vreg[234]
v_accvgpr_read_b32 v[vgprValuC+67], acc175         // copy acc to vreg[235]
v_accvgpr_read_b32 v[vgprValuC+68], acc179         // copy acc to vreg[236]
v_accvgpr_read_b32 v[vgprValuC+69], acc183         // copy acc to vreg[237]
v_accvgpr_read_b32 v[vgprValuC+70], acc187         // copy acc to vreg[238]
v_accvgpr_read_b32 v[vgprValuC+71], acc191         // copy acc to vreg[239]
v_accvgpr_read_b32 v[vgprValuC+72], acc195         // copy acc to vreg[240]
v_accvgpr_read_b32 v[vgprValuC+73], acc199         // copy acc to vreg[241]
v_accvgpr_read_b32 v[vgprValuC+74], acc203         // copy acc to vreg[242]
v_accvgpr_read_b32 v[vgprValuC+75], acc207         // copy acc to vreg[243]
v_accvgpr_read_b32 v[vgprValuC+76], acc211         // copy acc to vreg[244]
v_accvgpr_read_b32 v[vgprValuC+77], acc215         // copy acc to vreg[245]
v_accvgpr_read_b32 v[vgprValuC+78], acc219         // copy acc to vreg[246]
v_accvgpr_read_b32 v[vgprValuC+79], acc223         // copy acc to vreg[247]
v_accvgpr_read_b32 v[vgprValuC+80], acc227         // copy acc to vreg[248]
v_accvgpr_read_b32 v[vgprValuC+81], acc231         // copy acc to vreg[249]
v_accvgpr_read_b32 v[vgprValuC+82], acc235         // copy acc to vreg[250]
v_accvgpr_read_b32 v[vgprValuC+83], acc239         // copy acc to vreg[251]
v_accvgpr_read_b32 v[vgprValuC+84], acc243         // copy acc to vreg[252]
v_accvgpr_read_b32 v[vgprValuC+85], acc247         // copy acc to vreg[253]
v_accvgpr_read_b32 v[vgprValuC+86], acc251         // copy acc to vreg[254]
v_accvgpr_read_b32 v[vgprValuC+87], acc255         // copy acc to vreg[255]

/* rC *= alpha batchElements=[(0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */
/* Phase 3 of the batch: scale all 64 values v[vgprValuC+24 .. vgprValuC+87] by alpha,  */
/* two floats per packed multiply (32 instructions).                                    */
/* op_sel_hi:[0,1,1] clears the high-lane select for src0 only; per the VOP3P operand   */
/* select rules this should make both lanes use the low dword s[sgprAlpha], so the      */
/* register s[sgprAlpha+1] is named in the operand but presumably not consumed - verify */
/* against the ISA guide.                                                               */
/* The trailing s_waitcnt makes the earlier ds_read results (bias / scaleAlpha) safe to */
/* consume in the code that follows.                                                    */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] op_sel_hi:[0,1,1] // *= alpha (pk)
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec

/* apply mask, calc new C and issue writes */
/* Phase 4 of the batch: finish and store the eight rows (vc1 = 24..31).                */
/* Each row repeats the same 18-instruction pattern on its own 8-float slice:           */
/*   1. multiply the slice by the scaleAlpha vector in v[96:103],                       */
/*   2. add the bias vector v[88:95], writing the sums to the staging registers v[4:11],*/
/*   3. s_swappc_b64 to the address held in s[8:9] (return PC saved in s[66:67]);       */
/*      s[8:9] is loaded outside this block - presumably with the selected activation   */
/*      routine, which is expected to transform v[4:11] in place - TODO confirm,        */
/*   4. copy v[4:11] back into the row's slice,                                         */
/*   5. convert pairs of f32 to packed bf16, leaving 4 dwords in the slice's first four */
/*      registers,                                                                      */
/*   6. buffer_store_dwordx4 of those 4 dwords at the row's precomputed offset VGPR.    */
/* The same bias / scaleAlpha registers are reused for every row.                       */
/* NOTE(review): step 4 writes hard-coded registers (v[24:31], v[32:39], ...) while     */
/* steps 1, 2 and 5 address the slice as v[vgprValuC+N]; these line up only if          */
/* vgprValuC is 0 - confirm against the symbol definition.                              */
/* NOTE(review): v14, v15 and v16 below are not read by any instruction in this batch   */
/* (conversion uses v_cvt_pk_bf16_f32); whether the routine reached through s[8:9]      */
/* reads them cannot be determined from here.                                           */
v_mov_b32 v14, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v15, 0x7fff0000                          // fp32 Nan
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16
/* row vc1=24: slice v[vgprValuC+24..31], store offset v17 */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[96:97], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[98:99], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], v[100:101], v[vgprValuC+28:vgprValuC+28+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], v[102:103], v[vgprValuC+30:vgprValuC+30+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call through s[8:9]; return address kept in s[66:67]
v_mov_b64 v[24:25], v[4:5]
v_mov_b64 v[26:27], v[6:7]
v_mov_b64 v[28:29], v[8:9]
v_mov_b64 v[30:31], v[10:11]
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=25: slice v[vgprValuC+32..39], store offset v19 */
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[96:97], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], v[98:99], v[vgprValuC+34:vgprValuC+34+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[100:101], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[102:103], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[32:33], v[4:5]
v_mov_b64 v[34:35], v[6:7]
v_mov_b64 v[36:37], v[8:9]
v_mov_b64 v[38:39], v[10:11]
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[32:35], v19, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=26: slice v[vgprValuC+40..47], store offset v21 */
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[96:97], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[98:99], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], v[100:101], v[vgprValuC+44:vgprValuC+44+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], v[102:103], v[vgprValuC+46:vgprValuC+46+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[40:41], v[4:5]
v_mov_b64 v[42:43], v[6:7]
v_mov_b64 v[44:45], v[8:9]
v_mov_b64 v[46:47], v[10:11]
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[40:43], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=27: slice v[vgprValuC+48..55], store offset v23 */
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[96:97], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], v[98:99], v[vgprValuC+50:vgprValuC+50+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], v[100:101], v[vgprValuC+52:vgprValuC+52+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], v[102:103], v[vgprValuC+54:vgprValuC+54+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[48:49], v[4:5]
v_mov_b64 v[50:51], v[6:7]
v_mov_b64 v[52:53], v[8:9]
v_mov_b64 v[54:55], v[10:11]
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[48:51], v23, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=28: slice v[vgprValuC+56..63], store offset v105 */
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[96:97], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], v[98:99], v[vgprValuC+58:vgprValuC+58+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], v[100:101], v[vgprValuC+60:vgprValuC+60+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], v[102:103], v[vgprValuC+62:vgprValuC+62+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[56:57], v[4:5]
v_mov_b64 v[58:59], v[6:7]
v_mov_b64 v[60:61], v[8:9]
v_mov_b64 v[62:63], v[10:11]
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[56:59], v105, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=29: slice v[vgprValuC+64..71], store offset v107 */
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[96:97], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], v[98:99], v[vgprValuC+66:vgprValuC+66+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], v[100:101], v[vgprValuC+68:vgprValuC+68+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], v[102:103], v[vgprValuC+70:vgprValuC+70+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[64:65], v[4:5]
v_mov_b64 v[66:67], v[6:7]
v_mov_b64 v[68:69], v[8:9]
v_mov_b64 v[70:71], v[10:11]
v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[64:67], v107, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=30: slice v[vgprValuC+72..79], store offset v109 */
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[96:97], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], v[98:99], v[vgprValuC+74:vgprValuC+74+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], v[100:101], v[vgprValuC+76:vgprValuC+76+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], v[102:103], v[vgprValuC+78:vgprValuC+78+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[72:73], v[4:5]
v_mov_b64 v[74:75], v[6:7]
v_mov_b64 v[76:77], v[8:9]
v_mov_b64 v[78:79], v[10:11]
v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[72:75], v109, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* row vc1=31: slice v[vgprValuC+80..87], store offset v111 */
v_pk_mul_f32 v[vgprValuC+80:vgprValuC+80+1], v[96:97], v[vgprValuC+80:vgprValuC+80+1] // *= ScaleAlphaVecVMulPK(96)(0)
v_pk_mul_f32 v[vgprValuC+82:vgprValuC+82+1], v[98:99], v[vgprValuC+82:vgprValuC+82+1] // *= ScaleAlphaVecVMulPK(96)(2)
v_pk_mul_f32 v[vgprValuC+84:vgprValuC+84+1], v[100:101], v[vgprValuC+84:vgprValuC+84+1] // *= ScaleAlphaVecVMulPK(96)(4)
v_pk_mul_f32 v[vgprValuC+86:vgprValuC+86+1], v[102:103], v[vgprValuC+86:vgprValuC+86+1] // *= ScaleAlphaVecVMulPK(96)(6)
v_pk_add_f32 v[4:5], v[88:89], v[vgprValuC+80:vgprValuC+80+1] // C += bias
v_pk_add_f32 v[6:7], v[90:91], v[vgprValuC+82:vgprValuC+82+1] // C += bias
v_pk_add_f32 v[8:9], v[92:93], v[vgprValuC+84:vgprValuC+84+1] // C += bias
v_pk_add_f32 v[10:11], v[94:95], v[vgprValuC+86:vgprValuC+86+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b64 v[80:81], v[4:5]
v_mov_b64 v[82:83], v[6:7]
v_mov_b64 v[84:85], v[8:9]
v_mov_b64 v[86:87], v[10:11]
v_cvt_pk_bf16_f32 v80, v[vgprValuC+80], v[vgprValuC+81] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v81, v[vgprValuC+82], v[vgprValuC+83] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v82, v[vgprValuC+84], v[vgprValuC+85] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v83, v[vgprValuC+86], v[vgprValuC+87] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[80:83], v111, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
s_branch label_GW_End                              // jump to end
/* Entry of the global-write path named B0_E1_M (per the target labels: beta_0, edge_1, */
/* VW1 activation routines).                                                            */
/* Selects an activation routine from s[sgprActivationType] and leaves its address in   */
/* s[8:9] for the s_swappc_b64 calls issued by the write batches:                       */
/*     3 -> Gelu, 5 -> Relu, 10 -> Silu, 12 -> Clamp, anything else -> None.            */
/* Every arm uses the same 4-instruction sequence: read the PC, form a 32-bit offset    */
/* from the target label in s65, and add it to the 64-bit PC in s[8:9].                 */
/* NOTE(review): the high word is fixed up with "s_addc_u32 s9, s9, 0", i.e. s65 is     */
/* treated as an unsigned offset; that is only correct if each label_Activation_*_VW1   */
/* lies after the s_getpc_b64 that references it - the targets are outside this view,   */
/* confirm.                                                                             */
/* Clobbers s65 and SCC; writes s[8:9].                                                 */
label_GW_B0_E1_M:
s_cmpk_eq_u32 s[sgprActivationType], 3             // activationType == 3
s_cbranch_scc1 label_To_Activation_Gelu_VW1_beta_0_edge_1 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 5             // activationType == 5
s_cbranch_scc1 label_To_Activation_Relu_VW1_beta_0_edge_1 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 10            // activationType == 10
s_cbranch_scc1 label_To_Activation_Silu_VW1_beta_0_edge_1 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 12            // activationType == 12
s_cbranch_scc1 label_To_Activation_Clamp_VW1_beta_0_edge_1 // Branch if true
/* default arm: reached by fall-through when none of the compares matched */
label_To_Activation_None_VW1_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_None_VW1, 4        // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_3
label_To_Activation_Gelu_VW1_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_Gelu_VW1, 4        // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_3
label_To_Activation_Relu_VW1_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_Relu_VW1, 4        // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_3
label_To_Activation_Silu_VW1_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_Silu_VW1, 4        // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_3
label_To_Activation_Clamp_VW1_beta_0_edge_1:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_Clamp_VW1, 4       // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_3            // target is the very next label, so this branch is equivalent to falling through
label_ActivationSetPCAddrEnd_3:

/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=45 */
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw1); (0,0,0,1:vw1); (0,0,0,2:vw1); (0,0,0,3:vw1); (0,0,0,4:vw1); (0,0,0,5:vw1); (0,0,0,6:vw1); (0,0,0,7:vw1); (0,0,1,0:vw1); (0,0,1,1:vw1); (0,0,1,2:vw1); (0,0,1,3:vw1); (0,0,1,4:vw1); (0,0,1,5:vw1); (0,0,1,6:vw1); (0,0,1,7:vw1); (0,0,2,0:vw1); (0,0,2,1:vw1); (0,0,2,2:vw1); (0,0,2,3:vw1); (0,0,2,4:vw1); (0,0,2,5:vw1); (0,0,2,6:vw1); (0,0,2,7:vw1); (0,0,3,0:vw1); (0,0,3,1:vw1); (0,0,3,2:vw1); (0,0,3,3:vw1); (0,0,3,4:vw1); (0,0,3,5:vw1); (0,0,3,6:vw1); (0,0,3,7:vw1); (0,0,4,0:vw1); (0,0,4,1:vw1); (0,0,4,2:vw1); (0,0,4,3:vw1); (0,0,4,4:vw1); (0,0,4,5:vw1); (0,0,4,6:vw1); (0,0,4,7:vw1); (0,0,5,0:vw1); (0,0,5,1:vw1); (0,0,5,2:vw1); (0,0,5,3:vw1); (0,0,5,4:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Row vc1=0 of edge batch #0: eight consecutive coord0 positions            */
/* (v0+0 .. v0+7) at the current coord1 (v1). For each element k:            */
/*   - build the per-lane in-bounds mask (coord0 < SizeI && coord1 < SizeJ)  */
/*   - compute the tile-local LDS byte offset (coord0 - WorkGroup0*256) * 4  */
/*     and read Bias (offset 0) and scaleAlpha (offset 1024) from LDS        */
/*   - compute the output byte offset (v3 + coord0) << 1 and replace it      */
/*     with BufferOOB (held in v12) on out-of-bounds lanes                   */
/* Destination VGPRs per element: Bias, scaleAlpha, D offset, LDS offset =   */
/*   v62..v65, v66..v69, ..., v90..v93.                                      */
/* Scratch: s[78:79] and s[82:83] (masks; s78 is then reused for the tile    */
/* start), v4 (coord0 + k), vcc (carry-out of the coord add, unused here).   */
v_mov_b32 v12, BufferOOB                           // v12 = offset value substituted on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // s[78:79] = lanes with coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // s[82:83] = lanes with coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // s[82:83] = in-bounds lane mask (in0 && in1)
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = WorkGroup0 * 256 (wgp0 * MT0); the in0 mask is already consumed
v_sub_u32 v65, v0, s78                             // v65 = coord0 - tile start (tile-local index)
v_lshlrev_b32 v65, 0x2, v65                        // v65 *= 4: LDS byte offset for Bias / scaleAlpha
s_waitcnt lgkmcnt(0)                               // drain this wave's outstanding LGKM-counted ops (LDS writes per the generator comment)
s_barrier                                          // workgroup barrier before reading LDS; the writers are outside this section
ds_read_b32 v62, v65 offset:0                      // v62 = Bias
ds_read_b32 v63, v65 offset:1024                   // v63 = scaleAlpha (1024 bytes past the Bias entry)
v_add_lshl_u32 v64, v3, v0, 0x1                    // v64 = (v3 + coord0) << 1; v3 is the D row pointer per the ROWINC comments in this file
v_cndmask_b32 v64, v12, v64, s[82:83]              // keep v64 on in-bounds lanes, BufferOOB elsewhere
/* (d1,vc1,d0,vc0)=(0,0,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start (recomputed: s78 was clobbered by the in0 mask)
v_sub_u32 v69, v4, s78                             // tile-local index
v_lshlrev_b32 v69, 0x2, v69                        // LDS byte offset (*4)
ds_read_b32 v66, v69 offset:0                      // v66 = Bias
ds_read_b32 v67, v69 offset:1024                   // v67 = scaleAlpha
v_add_lshl_u32 v68, v3, v4, 0x1                    // v68 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v68, v12, v68, s[82:83]              // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,0,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v73, v4, s78                             // tile-local index
v_lshlrev_b32 v73, 0x2, v73                        // LDS byte offset (*4)
ds_read_b32 v70, v73 offset:0                      // v70 = Bias
ds_read_b32 v71, v73 offset:1024                   // v71 = scaleAlpha
v_add_lshl_u32 v72, v3, v4, 0x1                    // v72 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v72, v12, v72, s[82:83]              // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,0,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v77, v4, s78                             // tile-local index
v_lshlrev_b32 v77, 0x2, v77                        // LDS byte offset (*4)
ds_read_b32 v74, v77 offset:0                      // v74 = Bias
ds_read_b32 v75, v77 offset:1024                   // v75 = scaleAlpha
v_add_lshl_u32 v76, v3, v4, 0x1                    // v76 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v76, v12, v76, s[82:83]              // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,0,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // v4 = coord0 + 4
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v81, v4, s78                             // tile-local index
v_lshlrev_b32 v81, 0x2, v81                        // LDS byte offset (*4)
ds_read_b32 v78, v81 offset:0                      // v78 = Bias
ds_read_b32 v79, v81 offset:1024                   // v79 = scaleAlpha
v_add_lshl_u32 v80, v3, v4, 0x1                    // v80 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v80, v12, v80, s[82:83]              // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,0,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // v4 = coord0 + 5
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v85, v4, s78                             // tile-local index
v_lshlrev_b32 v85, 0x2, v85                        // LDS byte offset (*4)
ds_read_b32 v82, v85 offset:0                      // v82 = Bias
ds_read_b32 v83, v85 offset:1024                   // v83 = scaleAlpha
v_add_lshl_u32 v84, v3, v4, 0x1                    // v84 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v84, v12, v84, s[82:83]              // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,0,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // v4 = coord0 + 6
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v89, v4, s78                             // tile-local index
v_lshlrev_b32 v89, 0x2, v89                        // LDS byte offset (*4)
ds_read_b32 v86, v89 offset:0                      // v86 = Bias
ds_read_b32 v87, v89 offset:1024                   // v87 = scaleAlpha
v_add_lshl_u32 v88, v3, v4, 0x1                    // v88 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v88, v12, v88, s[82:83]              // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,0,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // v4 = coord0 + 7
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v93, v4, s78                             // tile-local index
v_lshlrev_b32 v93, 0x2, v93                        // LDS byte offset (*4)
ds_read_b32 v90, v93 offset:0                      // v90 = Bias
ds_read_b32 v91, v93 offset:1024                   // v91 = scaleAlpha
v_add_lshl_u32 v92, v3, v4, 0x1                    // v92 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v92, v12, v92, s[82:83]              // BufferOOB on out-of-bounds lanes
/* Row vc1=1 of edge batch #0. Advances coord1 (v1) by one and both row      */
/* pointers (v2 += StrideC1J, v3 += StrideD1J), then for coord0 = v0+0..v0+7 */
/* computes the in-bounds mask, the tile-local LDS byte offset and the       */
/* masked output byte offset. No ds_read is issued in this row; the LDS      */
/* offsets are only computed. Destination VGPRs per element                  */
/* (output offset, LDS offset): (v94,v95), (v96,v97), ..., (v108,v109).      */
/* Scratch: s[78:79], s[82:83], v4, vcc; v12 must already hold BufferOOB.    */
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1 (next row); carry-out lands in vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // s[78:79] = lanes with coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // s[82:83] = lanes with coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // s[82:83] = in-bounds lane mask (in0 && in1)
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = WorkGroup0 * 256 (wgp0 * MT0); the in0 mask is already consumed
v_sub_u32 v95, v0, s78                             // v95 = coord0 - tile start (tile-local index)
v_lshlrev_b32 v95, 0x2, v95                        // v95 *= 4: LDS byte offset (Bias address scaled by BPE)
v_add_lshl_u32 v94, v3, v0, 0x1                    // v94 = (v3 + coord0) << 1: output byte offset from the D row pointer
v_cndmask_b32 v94, v12, v94, s[82:83]              // keep v94 on in-bounds lanes, v12 (BufferOOB) elsewhere
/* (d1,vc1,d0,vc0)=(0,1,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start (recomputed: s78 was clobbered by the in0 mask)
v_sub_u32 v97, v4, s78                             // tile-local index
v_lshlrev_b32 v97, 0x2, v97                        // LDS byte offset (*4)
v_add_lshl_u32 v96, v3, v4, 0x1                    // v96 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v96, v12, v96, s[82:83]              // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,1,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v99, v4, s78                             // tile-local index
v_lshlrev_b32 v99, 0x2, v99                        // LDS byte offset (*4)
v_add_lshl_u32 v98, v3, v4, 0x1                    // v98 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v98, v12, v98, s[82:83]              // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,1,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v101, v4, s78                            // tile-local index
v_lshlrev_b32 v101, 0x2, v101                      // LDS byte offset (*4)
v_add_lshl_u32 v100, v3, v4, 0x1                   // v100 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v100, v12, v100, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,1,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // v4 = coord0 + 4
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v103, v4, s78                            // tile-local index
v_lshlrev_b32 v103, 0x2, v103                      // LDS byte offset (*4)
v_add_lshl_u32 v102, v3, v4, 0x1                   // v102 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v102, v12, v102, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,1,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // v4 = coord0 + 5
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v105, v4, s78                            // tile-local index
v_lshlrev_b32 v105, 0x2, v105                      // LDS byte offset (*4)
v_add_lshl_u32 v104, v3, v4, 0x1                   // v104 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v104, v12, v104, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,1,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // v4 = coord0 + 6
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v107, v4, s78                            // tile-local index
v_lshlrev_b32 v107, 0x2, v107                      // LDS byte offset (*4)
v_add_lshl_u32 v106, v3, v4, 0x1                   // v106 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v106, v12, v106, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,1,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // v4 = coord0 + 7
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v109, v4, s78                            // tile-local index
v_lshlrev_b32 v109, 0x2, v109                      // LDS byte offset (*4)
v_add_lshl_u32 v108, v3, v4, 0x1                   // v108 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v108, v12, v108, s[82:83]            // BufferOOB on out-of-bounds lanes
/* Row vc1=2 of edge batch #0. Advances coord1 (v1) by one and both row      */
/* pointers (v2 += StrideC1J, v3 += StrideD1J), then for coord0 = v0+0..v0+7 */
/* computes the in-bounds mask, the tile-local LDS byte offset and the       */
/* masked output byte offset. No ds_read is issued in this row; the LDS      */
/* offsets are only computed. Destination VGPRs per element                  */
/* (output offset, LDS offset): (v110,v111), (v112,v113), ..., (v124,v125).  */
/* Scratch: s[78:79], s[82:83], v4, vcc; v12 must already hold BufferOOB.    */
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1 (next row); carry-out lands in vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // s[78:79] = lanes with coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // s[82:83] = lanes with coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // s[82:83] = in-bounds lane mask (in0 && in1)
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = WorkGroup0 * 256 (wgp0 * MT0); the in0 mask is already consumed
v_sub_u32 v111, v0, s78                            // v111 = coord0 - tile start (tile-local index)
v_lshlrev_b32 v111, 0x2, v111                      // v111 *= 4: LDS byte offset (Bias address scaled by BPE)
v_add_lshl_u32 v110, v3, v0, 0x1                   // v110 = (v3 + coord0) << 1: output byte offset from the D row pointer
v_cndmask_b32 v110, v12, v110, s[82:83]            // keep v110 on in-bounds lanes, v12 (BufferOOB) elsewhere
/* (d1,vc1,d0,vc0)=(0,2,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start (recomputed: s78 was clobbered by the in0 mask)
v_sub_u32 v113, v4, s78                            // tile-local index
v_lshlrev_b32 v113, 0x2, v113                      // LDS byte offset (*4)
v_add_lshl_u32 v112, v3, v4, 0x1                   // v112 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v112, v12, v112, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,2,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v115, v4, s78                            // tile-local index
v_lshlrev_b32 v115, 0x2, v115                      // LDS byte offset (*4)
v_add_lshl_u32 v114, v3, v4, 0x1                   // v114 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v114, v12, v114, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,2,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v117, v4, s78                            // tile-local index
v_lshlrev_b32 v117, 0x2, v117                      // LDS byte offset (*4)
v_add_lshl_u32 v116, v3, v4, 0x1                   // v116 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v116, v12, v116, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,2,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // v4 = coord0 + 4
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v119, v4, s78                            // tile-local index
v_lshlrev_b32 v119, 0x2, v119                      // LDS byte offset (*4)
v_add_lshl_u32 v118, v3, v4, 0x1                   // v118 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v118, v12, v118, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,2,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // v4 = coord0 + 5
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v121, v4, s78                            // tile-local index
v_lshlrev_b32 v121, 0x2, v121                      // LDS byte offset (*4)
v_add_lshl_u32 v120, v3, v4, 0x1                   // v120 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v120, v12, v120, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,2,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // v4 = coord0 + 6
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v123, v4, s78                            // tile-local index
v_lshlrev_b32 v123, 0x2, v123                      // LDS byte offset (*4)
v_add_lshl_u32 v122, v3, v4, 0x1                   // v122 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v122, v12, v122, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,2,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // v4 = coord0 + 7
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v125, v4, s78                            // tile-local index
v_lshlrev_b32 v125, 0x2, v125                      // LDS byte offset (*4)
v_add_lshl_u32 v124, v3, v4, 0x1                   // v124 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v124, v12, v124, s[82:83]            // BufferOOB on out-of-bounds lanes
/* Row vc1=3 of edge batch #0. Advances coord1 (v1) by one and both row      */
/* pointers (v2 += StrideC1J, v3 += StrideD1J), then for coord0 = v0+0..v0+7 */
/* computes the in-bounds mask, the tile-local LDS byte offset and the       */
/* masked output byte offset. No ds_read is issued in this row; the LDS      */
/* offsets are only computed. Destination VGPRs per element                  */
/* (output offset, LDS offset): (v126,v127), (v128,v129), (v130,v131),       */
/* then (v135,v136) ... (v143,v144); v132-v134 are not written here.         */
/* Scratch: s[78:79], s[82:83], v4, vcc; v12 must already hold BufferOOB.    */
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1 (next row); carry-out lands in vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // s[78:79] = lanes with coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // s[82:83] = lanes with coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // s[82:83] = in-bounds lane mask (in0 && in1)
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = WorkGroup0 * 256 (wgp0 * MT0); the in0 mask is already consumed
v_sub_u32 v127, v0, s78                            // v127 = coord0 - tile start (tile-local index)
v_lshlrev_b32 v127, 0x2, v127                      // v127 *= 4: LDS byte offset (Bias address scaled by BPE)
v_add_lshl_u32 v126, v3, v0, 0x1                   // v126 = (v3 + coord0) << 1: output byte offset from the D row pointer
v_cndmask_b32 v126, v12, v126, s[82:83]            // keep v126 on in-bounds lanes, v12 (BufferOOB) elsewhere
/* (d1,vc1,d0,vc0)=(0,3,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start (recomputed: s78 was clobbered by the in0 mask)
v_sub_u32 v129, v4, s78                            // tile-local index
v_lshlrev_b32 v129, 0x2, v129                      // LDS byte offset (*4)
v_add_lshl_u32 v128, v3, v4, 0x1                   // v128 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v128, v12, v128, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,3,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v131, v4, s78                            // tile-local index
v_lshlrev_b32 v131, 0x2, v131                      // LDS byte offset (*4)
v_add_lshl_u32 v130, v3, v4, 0x1                   // v130 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v130, v12, v130, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,3,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v136, v4, s78                            // tile-local index (destination numbering resumes at v135/v136)
v_lshlrev_b32 v136, 0x2, v136                      // LDS byte offset (*4)
v_add_lshl_u32 v135, v3, v4, 0x1                   // v135 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v135, v12, v135, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,3,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // v4 = coord0 + 4
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v138, v4, s78                            // tile-local index
v_lshlrev_b32 v138, 0x2, v138                      // LDS byte offset (*4)
v_add_lshl_u32 v137, v3, v4, 0x1                   // v137 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v137, v12, v137, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,3,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // v4 = coord0 + 5
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v140, v4, s78                            // tile-local index
v_lshlrev_b32 v140, 0x2, v140                      // LDS byte offset (*4)
v_add_lshl_u32 v139, v3, v4, 0x1                   // v139 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v139, v12, v139, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,3,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // v4 = coord0 + 6
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v142, v4, s78                            // tile-local index
v_lshlrev_b32 v142, 0x2, v142                      // LDS byte offset (*4)
v_add_lshl_u32 v141, v3, v4, 0x1                   // v141 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v141, v12, v141, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,3,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // v4 = coord0 + 7
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v144, v4, s78                            // tile-local index
v_lshlrev_b32 v144, 0x2, v144                      // LDS byte offset (*4)
v_add_lshl_u32 v143, v3, v4, 0x1                   // v143 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v143, v12, v143, s[82:83]            // BufferOOB on out-of-bounds lanes
/* Row vc1=4 of edge batch #0. Advances coord1 (v1) by one and both row      */
/* pointers (v2 += StrideC1J, v3 += StrideD1J), then for coord0 = v0+0..v0+7 */
/* computes the in-bounds mask, the tile-local LDS byte offset and the       */
/* masked output byte offset. No ds_read is issued in this row; the LDS      */
/* offsets are only computed. Destination VGPRs per element                  */
/* (output offset, LDS offset): (v145,v146), (v147,v148), ..., (v159,v160).  */
/* Scratch: s[78:79], s[82:83], v4, vcc; v12 must already hold BufferOOB.    */
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1 (next row); carry-out lands in vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // s[78:79] = lanes with coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // s[82:83] = lanes with coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // s[82:83] = in-bounds lane mask (in0 && in1)
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = WorkGroup0 * 256 (wgp0 * MT0); the in0 mask is already consumed
v_sub_u32 v146, v0, s78                            // v146 = coord0 - tile start (tile-local index)
v_lshlrev_b32 v146, 0x2, v146                      // v146 *= 4: LDS byte offset (Bias address scaled by BPE)
v_add_lshl_u32 v145, v3, v0, 0x1                   // v145 = (v3 + coord0) << 1: output byte offset from the D row pointer
v_cndmask_b32 v145, v12, v145, s[82:83]            // keep v145 on in-bounds lanes, v12 (BufferOOB) elsewhere
/* (d1,vc1,d0,vc0)=(0,4,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start (recomputed: s78 was clobbered by the in0 mask)
v_sub_u32 v148, v4, s78                            // tile-local index
v_lshlrev_b32 v148, 0x2, v148                      // LDS byte offset (*4)
v_add_lshl_u32 v147, v3, v4, 0x1                   // v147 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v147, v12, v147, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,4,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v150, v4, s78                            // tile-local index
v_lshlrev_b32 v150, 0x2, v150                      // LDS byte offset (*4)
v_add_lshl_u32 v149, v3, v4, 0x1                   // v149 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v149, v12, v149, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,4,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v152, v4, s78                            // tile-local index
v_lshlrev_b32 v152, 0x2, v152                      // LDS byte offset (*4)
v_add_lshl_u32 v151, v3, v4, 0x1                   // v151 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v151, v12, v151, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,4,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // v4 = coord0 + 4
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v154, v4, s78                            // tile-local index
v_lshlrev_b32 v154, 0x2, v154                      // LDS byte offset (*4)
v_add_lshl_u32 v153, v3, v4, 0x1                   // v153 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v153, v12, v153, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,4,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // v4 = coord0 + 5
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v156, v4, s78                            // tile-local index
v_lshlrev_b32 v156, 0x2, v156                      // LDS byte offset (*4)
v_add_lshl_u32 v155, v3, v4, 0x1                   // v155 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v155, v12, v155, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,4,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // v4 = coord0 + 6
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v158, v4, s78                            // tile-local index
v_lshlrev_b32 v158, 0x2, v158                      // LDS byte offset (*4)
v_add_lshl_u32 v157, v3, v4, 0x1                   // v157 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v157, v12, v157, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,4,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // v4 = coord0 + 7
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v160, v4, s78                            // tile-local index
v_lshlrev_b32 v160, 0x2, v160                      // LDS byte offset (*4)
v_add_lshl_u32 v159, v3, v4, 0x1                   // v159 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v159, v12, v159, s[82:83]            // BufferOOB on out-of-bounds lanes
/* Row vc1=5 of edge batch #0 - the final, partial row: only coord0 =        */
/* v0+0..v0+4 are handled here. Advances coord1 (v1) by one and both row     */
/* pointers (v2 += StrideC1J, v3 += StrideD1J), then per element computes    */
/* the in-bounds mask, the tile-local LDS byte offset and the masked output  */
/* byte offset. No ds_read is issued in this row; the LDS offsets are only   */
/* computed. Destination VGPRs per element (output offset, LDS offset):      */
/* (v161,v162), (v163,v164), (v165,v166), (v167,v168), (v169,v170).          */
/* Scratch: s[78:79], s[82:83], v4, vcc; v12 must already hold BufferOOB.    */
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1 (next row); carry-out lands in vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // s[78:79] = lanes with coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // s[82:83] = lanes with coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // s[82:83] = in-bounds lane mask (in0 && in1)
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = WorkGroup0 * 256 (wgp0 * MT0); the in0 mask is already consumed
v_sub_u32 v162, v0, s78                            // v162 = coord0 - tile start (tile-local index)
v_lshlrev_b32 v162, 0x2, v162                      // v162 *= 4: LDS byte offset (Bias address scaled by BPE)
v_add_lshl_u32 v161, v3, v0, 0x1                   // v161 = (v3 + coord0) << 1: output byte offset from the D row pointer
v_cndmask_b32 v161, v12, v161, s[82:83]            // keep v161 on in-bounds lanes, v12 (BufferOOB) elsewhere
/* (d1,vc1,d0,vc0)=(0,5,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start (recomputed: s78 was clobbered by the in0 mask)
v_sub_u32 v164, v4, s78                            // tile-local index
v_lshlrev_b32 v164, 0x2, v164                      // LDS byte offset (*4)
v_add_lshl_u32 v163, v3, v4, 0x1                   // v163 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v163, v12, v163, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,5,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v166, v4, s78                            // tile-local index
v_lshlrev_b32 v166, 0x2, v166                      // LDS byte offset (*4)
v_add_lshl_u32 v165, v3, v4, 0x1                   // v165 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v165, v12, v165, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,5,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v168, v4, s78                            // tile-local index
v_lshlrev_b32 v168, 0x2, v168                      // LDS byte offset (*4)
v_add_lshl_u32 v167, v3, v4, 0x1                   // v167 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v167, v12, v167, s[82:83]            // BufferOOB on out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,5,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // v4 = coord0 + 4
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // in0: coord0 < SizeI
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // in1: coord1 < SizeJ
s_and_b64 s[82:83], s[78:79], s[82:83]             // in-bounds mask = in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // s78 = tile start
v_sub_u32 v170, v4, s78                            // tile-local index
v_lshlrev_b32 v170, 0x2, v170                      // LDS byte offset (*4)
v_add_lshl_u32 v169, v3, v4, 0x1                   // v169 = (v3 + coord0) << 1: output byte offset
v_cndmask_b32 v169, v12, v169, s[82:83]            // BufferOOB on out-of-bounds lanes
/* Copy the 45 accumulator values of edge batch #0 out of the AccVGPR file into      */
/* v[vgprValuC+17] .. v[vgprValuC+61]. The source index advances by 4 per element    */
/* (acc0, acc4, ..., acc176) while the destination index advances by 1.              */
v_accvgpr_read_b32 v[vgprValuC+17], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+18], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+19], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+20], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+21], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+22], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+23], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+24], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+25], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+26], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+27], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+28], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+29], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+30], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+31], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+32], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+33], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+34], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+35], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+36], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+37], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+38], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+39], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+40], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+41], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+42], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+43], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+44], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+45], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+46], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+47], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+48], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+49], acc128         // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+50], acc132         // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+51], acc136         // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+52], acc140         // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+53], acc144         // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+54], acc148         // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+55], acc152         // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+56], acc156         // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+57], acc160         // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+58], acc164         // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+59], acc168         // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+60], acc172         // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+61], acc176         // copy acc to vreg[44]

/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 0, 3), (0, 0, 0, 4), (0, 0, 0, 5), (0, 0, 0, 6), (0, 0, 0, 7), (0, 0, 1, 0), (0, 0, 1, 1), (0, 0, 1, 2), (0, 0, 1, 3), (0, 0, 1, 4), (0, 0, 1, 5), (0, 0, 1, 6), (0, 0, 1, 7), (0, 0, 2, 0), (0, 0, 2, 1), (0, 0, 2, 2), (0, 0, 2, 3), (0, 0, 2, 4), (0, 0, 2, 5), (0, 0, 2, 6), (0, 0, 2, 7), (0, 0, 3, 0), (0, 0, 3, 1), (0, 0, 3, 2), (0, 0, 3, 3), (0, 0, 3, 4), (0, 0, 3, 5), (0, 0, 3, 6), (0, 0, 3, 7), (0, 0, 4, 0), (0, 0, 4, 1), (0, 0, 4, 2), (0, 0, 4, 3), (0, 0, 4, 4), (0, 0, 4, 5), (0, 0, 4, 6), (0, 0, 4, 7), (0, 0, 5, 0), (0, 0, 5, 1), (0, 0, 5, 2), (0, 0, 5, 3), (0, 0, 5, 4)] */
/* Multiply v[vgprValuC+17 .. +61] by the scalar s[sgprAlpha].                        */
/* The first (odd-indexed) register is scaled with a plain v_mul_f32; the remaining   */
/* 44 are scaled two at a time with v_pk_mul_f32. op_sel_hi:[0,1,1] makes the upper   */
/* lane of the packed op read the LOW dword of source 0, so both lanes are multiplied */
/* by s[sgprAlpha] and s[sgprAlpha+1] is only named to form the 64-bit operand.       */
v_mul_f32 v[vgprValuC+17], s[sgprAlpha], v[vgprValuC+17] // *= alpha
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec (all outstanding LGKM ops, incl. the earlier ds_read_b32s)

/* apply mask, calc new C and issue writes */
/* Edge batch #0 stores, elements 0..7 (v[vgprValuC+17..24]; row vc1=0 per the batchElements order). */
/* Per element N with scale register vS, bias register vB and offset register vA:                    */
/*   v[vgprValuC+N] *= vS ; v4 = vB + v[vgprValuC+N]                                                 */
/*   s_swappc_b64 jumps to the address in s[8:9] and saves the return PC in s[66:67]                 */
/*     (presumably the activation routine, taking and returning its value in v4 - confirm)           */
/*   vN = v4 ; vN = bf16 pack of v[vgprValuC+N] ; 16-bit store of vN to D at offset vA               */
/* NOTE(review): mixing literal vN with v[vgprValuC+N] only works if vgprValuC == 0 - confirm.       */
/* NOTE(review): v14/v15/v16 below are not read by any instruction visible in this store sequence,   */
/* which converts with v_cvt_pk_bf16_f32; whether the routine at s[8:9] reads them is not visible.   */
v_mov_b32 v14, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v15, 0x7fff0000                          // fp32 Nan
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16
v_mul_f32 v[vgprValuC+17], v63, v[vgprValuC+17]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+17]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v17, v4                                  // v17 = v4 after the call
v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1
buffer_store_short v17, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+18], v67, v[vgprValuC+18]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+18]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v18, v4                                  // v18 = v4 after the call
v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1
buffer_store_short v18, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+19], v71, v[vgprValuC+19]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+19]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v19, v4                                  // v19 = v4 after the call
v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1
buffer_store_short v19, v72, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+20], v75, v[vgprValuC+20]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+20]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v20, v4                                  // v20 = v4 after the call
v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1
buffer_store_short v20, v76, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+21], v79, v[vgprValuC+21]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+21]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v21, v4                                  // v21 = v4 after the call
v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1
buffer_store_short v21, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+22], v83, v[vgprValuC+22]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+22]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v22, v4                                  // v22 = v4 after the call
v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1
buffer_store_short v22, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+23]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v23, v4                                  // v23 = v4 after the call
v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1
buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+24], v91, v[vgprValuC+24]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+24]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v24, v4                                  // v24 = v4 after the call
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1
buffer_store_short v24, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Edge batch #0 stores, elements 8..15 (v[vgprValuC+25..32]; row vc1=1 per the batchElements order). */
/* Same scale/bias register pairs (v63/v62 .. v91/v90) as the previous eight elements; only the       */
/* D offset registers (v94..v108) differ.                                                             */
/* Per element: scale, add bias into v4, s_swappc_b64 to the address in s[8:9] (return PC saved in    */
/* s[66:67]; presumably the activation routine operating on v4 - confirm), copy v4 back, convert to   */
/* bf16 and issue a 16-bit store to D.                                                                */
/* NOTE(review): literal vN is assumed to alias v[vgprValuC+N], i.e. vgprValuC == 0 - confirm.        */
v_mul_f32 v[vgprValuC+25], v63, v[vgprValuC+25]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+25]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v25, v4                                  // v25 = v4 after the call
v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1
buffer_store_short v25, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+26], v67, v[vgprValuC+26]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+26]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v26, v4                                  // v26 = v4 after the call
v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1
buffer_store_short v26, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+27], v71, v[vgprValuC+27]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+27]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v27, v4                                  // v27 = v4 after the call
v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1
buffer_store_short v27, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+28], v75, v[vgprValuC+28]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+28]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v28, v4                                  // v28 = v4 after the call
v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1
buffer_store_short v28, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+29], v79, v[vgprValuC+29]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+29]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v29, v4                                  // v29 = v4 after the call
v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1
buffer_store_short v29, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+30], v83, v[vgprValuC+30]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+30]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v30, v4                                  // v30 = v4 after the call
v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1
buffer_store_short v30, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+31]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v31, v4                                  // v31 = v4 after the call
v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1
buffer_store_short v31, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+32], v91, v[vgprValuC+32]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+32]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v32, v4                                  // v32 = v4 after the call
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1
buffer_store_short v32, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Edge batch #0 stores, elements 16..23 (v[vgprValuC+33..40]; row vc1=2 per the batchElements order). */
/* Scale/bias register pairs v63/v62 .. v91/v90 are reused; D offsets come from v110..v124.            */
/* Per element: scale, add bias into v4, s_swappc_b64 to the address in s[8:9] (return PC saved in     */
/* s[66:67]; presumably the activation routine operating on v4 - confirm), copy v4 back, convert to    */
/* bf16 and issue a 16-bit store to D.                                                                 */
/* NOTE(review): literal vN is assumed to alias v[vgprValuC+N], i.e. vgprValuC == 0 - confirm.         */
v_mul_f32 v[vgprValuC+33], v63, v[vgprValuC+33]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+33]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v33, v4                                  // v33 = v4 after the call
v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1
buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+34], v67, v[vgprValuC+34]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+34]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v34, v4                                  // v34 = v4 after the call
v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1
buffer_store_short v34, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+35], v71, v[vgprValuC+35]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+35]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v35, v4                                  // v35 = v4 after the call
v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1
buffer_store_short v35, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+36], v75, v[vgprValuC+36]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+36]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v36, v4                                  // v36 = v4 after the call
v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1
buffer_store_short v36, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+37], v79, v[vgprValuC+37]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+37]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v37, v4                                  // v37 = v4 after the call
v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1
buffer_store_short v37, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+38], v83, v[vgprValuC+38]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+38]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v38, v4                                  // v38 = v4 after the call
v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1
buffer_store_short v38, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+39]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v39, v4                                  // v39 = v4 after the call
v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1
buffer_store_short v39, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+40], v91, v[vgprValuC+40]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+40]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v40, v4                                  // v40 = v4 after the call
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1
buffer_store_short v40, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Edge batch #0 stores, elements 24..31 (v[vgprValuC+41..48]; row vc1=3 per the batchElements order). */
/* Scale/bias register pairs v63/v62 .. v91/v90 are reused. D offsets come from v126, v128, v130 and   */
/* then v135..v143 (the offset register numbering skips from v130 to v135 here).                       */
/* Per element: scale, add bias into v4, s_swappc_b64 to the address in s[8:9] (return PC saved in     */
/* s[66:67]; presumably the activation routine operating on v4 - confirm), copy v4 back, convert to    */
/* bf16 and issue a 16-bit store to D.                                                                 */
/* NOTE(review): literal vN is assumed to alias v[vgprValuC+N], i.e. vgprValuC == 0 - confirm.         */
v_mul_f32 v[vgprValuC+41], v63, v[vgprValuC+41]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+41]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v41, v4                                  // v41 = v4 after the call
v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1
buffer_store_short v41, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+42], v67, v[vgprValuC+42]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+42]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v42, v4                                  // v42 = v4 after the call
v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1
buffer_store_short v42, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+43], v71, v[vgprValuC+43]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+43]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v43, v4                                  // v43 = v4 after the call
v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1
buffer_store_short v43, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+44], v75, v[vgprValuC+44]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+44]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v44, v4                                  // v44 = v4 after the call
v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1
buffer_store_short v44, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+45], v79, v[vgprValuC+45]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+45]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v45, v4                                  // v45 = v4 after the call
v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1
buffer_store_short v45, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+46], v83, v[vgprValuC+46]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+46]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v46, v4                                  // v46 = v4 after the call
v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1
buffer_store_short v46, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+47]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v47, v4                                  // v47 = v4 after the call
v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1
buffer_store_short v47, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+48], v91, v[vgprValuC+48]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+48]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v48, v4                                  // v48 = v4 after the call
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1
buffer_store_short v48, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Edge batch #0 stores, elements 32..39 (v[vgprValuC+49..56]; row vc1=4 per the batchElements order). */
/* Scale/bias register pairs v63/v62 .. v91/v90 are reused; D offsets come from v145..v159.            */
/* Per element: scale, add bias into v4, s_swappc_b64 to the address in s[8:9] (return PC saved in     */
/* s[66:67]; presumably the activation routine operating on v4 - confirm), copy v4 back, convert to    */
/* bf16 and issue a 16-bit store to D.                                                                 */
/* NOTE(review): literal vN is assumed to alias v[vgprValuC+N], i.e. vgprValuC == 0 - confirm.         */
v_mul_f32 v[vgprValuC+49], v63, v[vgprValuC+49]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+49]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v49, v4                                  // v49 = v4 after the call
v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1
buffer_store_short v49, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+50], v67, v[vgprValuC+50]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+50]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v50, v4                                  // v50 = v4 after the call
v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1
buffer_store_short v50, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+51], v71, v[vgprValuC+51]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+51]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v51, v4                                  // v51 = v4 after the call
v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1
buffer_store_short v51, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+52], v75, v[vgprValuC+52]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+52]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v52, v4                                  // v52 = v4 after the call
v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1
buffer_store_short v52, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+53], v79, v[vgprValuC+53]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+53]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v53, v4                                  // v53 = v4 after the call
v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1
buffer_store_short v53, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+54], v83, v[vgprValuC+54]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+54]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v54, v4                                  // v54 = v4 after the call
v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1
buffer_store_short v54, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+55], v87, v[vgprValuC+55]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+55]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v55, v4                                  // v55 = v4 after the call
v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1
buffer_store_short v55, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+56], v91, v[vgprValuC+56]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+56]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v56, v4                                  // v56 = v4 after the call
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1
buffer_store_short v56, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Edge batch #0 stores, last five elements 40..44 (v[vgprValuC+57..61]; row vc1=5, vc0=0..4 per the */
/* batchElements order). Scale/bias pairs v63/v62 .. v79/v78 are used; D offsets come from v161,     */
/* v163, v165, v167 and v169.                                                                        */
/* Per element: scale, add bias into v4, s_swappc_b64 to the address in s[8:9] (return PC saved in   */
/* s[66:67]; presumably the activation routine operating on v4 - confirm), copy v4 back, convert to  */
/* bf16 and issue a 16-bit store to D.                                                               */
/* NOTE(review): literal vN is assumed to alias v[vgprValuC+N], i.e. vgprValuC == 0 - confirm.       */
v_mul_f32 v[vgprValuC+57], v63, v[vgprValuC+57]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+57]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v57, v4                                  // v57 = v4 after the call
v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1
buffer_store_short v57, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+58], v67, v[vgprValuC+58]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+58]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v58, v4                                  // v58 = v4 after the call
v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1
buffer_store_short v58, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+59], v71, v[vgprValuC+59]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+59]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v59, v4                                  // v59 = v4 after the call
v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1
buffer_store_short v59, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+60], v75, v[vgprValuC+60]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+60]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v60, v4                                  // v60 = v4 after the call
v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1
buffer_store_short v60, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+61], v79, v[vgprValuC+61]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+61]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call s[8:9]; return PC saved in s[66:67]
v_mov_b32 v61, v4                                  // v61 = v4 after the call
v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1
buffer_store_short v61, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
s_nop 0                                            // 1 wait state before a following instruction may overwrite VGPRs still held by the preceding store (generator text says dwordx4; the stores here are buffer_store_short)
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */
/*    (0,0,5,5:vw1); (0,0,5,6:vw1); (0,0,5,7:vw1); (0,0,6,0:vw1); (0,0,6,1:vw1); (0,0,6,2:vw1); (0,0,6,3:vw1); (0,0,6,4:vw1); (0,0,6,5:vw1); (0,0,6,6:vw1); (0,0,6,7:vw1); (0,0,7,0:vw1); (0,0,7,1:vw1); (0,0,7,2:vw1); (0,0,7,3:vw1); (0,0,7,4:vw1); (0,0,7,5:vw1); (0,0,7,6:vw1); (0,0,7,7:vw1); (0,0,8,0:vw1); (0,0,8,1:vw1); (0,0,8,2:vw1); (0,0,8,3:vw1); (0,0,8,4:vw1); (0,0,8,5:vw1); (0,0,8,6:vw1); (0,0,8,7:vw1); (0,0,9,0:vw1); (0,0,9,1:vw1); (0,0,9,2:vw1); (0,0,9,3:vw1); (0,0,9,4:vw1); (0,0,9,5:vw1); (0,0,9,6:vw1); (0,0,9,7:vw1); (0,0,10,0:vw1); (0,0,10,1:vw1); (0,0,10,2:vw1); (0,0,10,3:vw1); (0,0,10,4:vw1); (0,0,10,5:vw1); (0,0,10,6:vw1); (0,0,10,7:vw1); (0,0,11,0:vw1); (0,0,11,1:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Edge batch #1 address setup for its first three elements (vc1=5, vc0=5..7).         */
/* Per element:                                                                        */
/*   v4       = v0 + vc0                          (coord0 of this element)             */
/*   s[82:83] = (coord0 < SizeI) && (v1 < SizeJ)  (per-lane in-bounds mask)            */
/*   LDS addr = (coord0 - WorkGroup0*256) << 2    (4-byte entries, tile-relative)      */
/*   two ds_read_b32 from that address: offset 0 = Bias, offset 1024 = scaleAlpha      */
/*     (1024 = 256 * 4 bytes; presumably the scaleAlpha table directly follows the     */
/*      256-entry Bias table in LDS - confirm against the LDS write code)              */
/*   D offset = (v3 + coord0) << 1, replaced by BufferOOB (v12) for out-of-bounds lanes */
/* The ds_read results are not waited on in this block.                                */
v_mov_b32 v12, BufferOOB                           // offset substituted for out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,5,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (recomputed: s78 was just used as half of the compare mask)
v_sub_u32 v65, v4, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v65, 0x2, v65                        // Bias address scaled by BPE
ds_read_b32 v62, v65 offset:0                      // load Bias
ds_read_b32 v63, v65 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v64, v3, v4, 0x1                    // (v3 + coord0) << 1: byte offset into D (v3 is the D row pointer)
v_cndmask_b32 v64, v12, v64, s[82:83]              // LDD clip if OOB: keep offset where in-bounds, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,5,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v69, v4, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v69, 0x2, v69                        // Bias address scaled by BPE
ds_read_b32 v66, v69 offset:0                      // load Bias
ds_read_b32 v67, v69 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v68, v3, v4, 0x1                    // (v3 + coord0) << 1: byte offset into D
v_cndmask_b32 v68, v12, v68, s[82:83]              // LDD clip if OOB: keep offset where in-bounds, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,5,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v73, v4, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v73, 0x2, v73                        // Bias address scaled by BPE
ds_read_b32 v70, v73 offset:0                      // load Bias
ds_read_b32 v71, v73 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v72, v3, v4, 0x1                    // (v3 + coord0) << 1: byte offset into D
v_cndmask_b32 v72, v12, v72, s[82:83]              // LDD clip if OOB: keep offset where in-bounds, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
/* First element of row vc1=6: advance the row state by one row, then set up the element.   */
/* coord1 (v1) is incremented and the C and D row pointers (v2, v3) are advanced by their   */
/* row strides. Because vc0 == 0, coord0 is v0 itself and no temporary v4 is formed.        */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (recomputed: s78 was just used as half of the compare mask)
v_sub_u32 v77, v0, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v77, 0x2, v77                        // Bias address scaled by BPE
ds_read_b32 v74, v77 offset:0                      // load Bias
ds_read_b32 v75, v77 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v76, v3, v0, 0x1                    // (v3 + coord0) << 1: byte offset into D (v3 is the D row pointer)
v_cndmask_b32 v76, v12, v76, s[82:83]              // LDD clip if OOB: keep offset where in-bounds, else v12
/* Edge batch #1 address setup for elements (vc1=6, vc0=1..4).                         */
/* Per element:                                                                        */
/*   v4       = v0 + vc0                          (coord0 of this element)             */
/*   s[82:83] = (coord0 < SizeI) && (v1 < SizeJ)  (per-lane in-bounds mask)            */
/*   LDS addr = (coord0 - WorkGroup0*256) << 2    (4-byte entries, tile-relative)      */
/*   two ds_read_b32 from that address: offset 0 = Bias, offset 1024 = scaleAlpha      */
/*   D offset = (v3 + coord0) << 1, replaced by v12 for out-of-bounds lanes            */
/* The ds_read results are not waited on in this block.                                */
/* (d1,vc1,d0,vc0)=(0,6,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (recomputed: s78 was just used as half of the compare mask)
v_sub_u32 v81, v4, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v81, 0x2, v81                        // Bias address scaled by BPE
ds_read_b32 v78, v81 offset:0                      // load Bias
ds_read_b32 v79, v81 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v80, v3, v4, 0x1                    // (v3 + coord0) << 1: byte offset into D
v_cndmask_b32 v80, v12, v80, s[82:83]              // LDD clip if OOB: keep offset where in-bounds, else v12
/* (d1,vc1,d0,vc0)=(0,6,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v85, v4, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v85, 0x2, v85                        // Bias address scaled by BPE
ds_read_b32 v82, v85 offset:0                      // load Bias
ds_read_b32 v83, v85 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v84, v3, v4, 0x1                    // (v3 + coord0) << 1: byte offset into D
v_cndmask_b32 v84, v12, v84, s[82:83]              // LDD clip if OOB: keep offset where in-bounds, else v12
/* (d1,vc1,d0,vc0)=(0,6,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v89, v4, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v89, 0x2, v89                        // Bias address scaled by BPE
ds_read_b32 v86, v89 offset:0                      // load Bias
ds_read_b32 v87, v89 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v88, v3, v4, 0x1                    // (v3 + coord0) << 1: byte offset into D
v_cndmask_b32 v88, v12, v88, s[82:83]              // LDD clip if OOB: keep offset where in-bounds, else v12
/* (d1,vc1,d0,vc0)=(0,6,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v93, v4, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v93, 0x2, v93                        // Bias address scaled by BPE
ds_read_b32 v90, v93 offset:0                      // load Bias
ds_read_b32 v91, v93 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v92, v3, v4, 0x1                    // (v3 + coord0) << 1: byte offset into D
v_cndmask_b32 v92, v12, v92, s[82:83]              // LDD clip if OOB: keep offset where in-bounds, else v12
/* (d1,vc1,d0,vc0)=(0,6,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v95, v4, s78
v_lshlrev_b32 v95, 0x2, v95                        // Bias address scaled by BPE
v_add_lshl_u32 v94, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v94, v12, v94, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v97, v4, s78
v_lshlrev_b32 v97, 0x2, v97                        // Bias address scaled by BPE
v_add_lshl_u32 v96, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v96, v12, v96, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v99, v4, s78
v_lshlrev_b32 v99, 0x2, v99                        // Bias address scaled by BPE
v_add_lshl_u32 v98, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v98, v12, v98, s[82:83]              // LDD clip if OOB. offset
/*
 * Address setup for row vc1=7, elements vc0=0..7.
 * Row advance: coord1 (v1) += 1, and the C / D row pointers (v2 / v3) move by
 * StrideC1J / StrideD1J.
 * Per element:
 *   coord0    = v0 (vc0=0) or v4 = v0 + vc0
 *   s[82:83]  = (coord0 < SizeI) && (coord1 < SizeJ)   per-lane in-bounds mask
 *   bias addr = (coord0 - 256*WorkGroup0) << 2
 *   D offset  = (v3 + coord0) << 1, replaced by v12 in lanes where the mask is clear
 * No ds_read is issued in this row.
 * NOTE(review): presumably Bias/scaleAlpha for each coord0 were already read by earlier
 *               elements of this batch -- confirm; the bias-address VGPRs may be unused.
 * NOTE(review): v12 is written outside this chunk; presumably an out-of-range offset -- confirm.
 */
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v101, v0, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v101, 0x2, v101                      // Bias address scaled by BPE
v_add_lshl_u32 v100, v3, v0, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v100, v12, v100, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,7,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v103, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v103, 0x2, v103                      // Bias address scaled by BPE
v_add_lshl_u32 v102, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v102, v12, v102, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,7,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v105, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v105, 0x2, v105                      // Bias address scaled by BPE
v_add_lshl_u32 v104, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v104, v12, v104, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,7,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v107, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v107, 0x2, v107                      // Bias address scaled by BPE
v_add_lshl_u32 v106, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v106, v12, v106, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,7,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v109, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v109, 0x2, v109                      // Bias address scaled by BPE
v_add_lshl_u32 v108, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v108, v12, v108, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,7,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v111, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v111, 0x2, v111                      // Bias address scaled by BPE
v_add_lshl_u32 v110, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v110, v12, v110, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,7,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v113, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v113, 0x2, v113                      // Bias address scaled by BPE
v_add_lshl_u32 v112, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v112, v12, v112, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,7,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v115, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v115, 0x2, v115                      // Bias address scaled by BPE
v_add_lshl_u32 v114, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v114, v12, v114, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/*
 * Address setup for row vc1=8, elements vc0=0..7.
 * Row advance: coord1 (v1) += 1, and the C / D row pointers (v2 / v3) move by
 * StrideC1J / StrideD1J.
 * Per element:
 *   coord0    = v0 (vc0=0) or v4 = v0 + vc0
 *   s[82:83]  = (coord0 < SizeI) && (coord1 < SizeJ)   per-lane in-bounds mask
 *   bias addr = (coord0 - 256*WorkGroup0) << 2
 *   D offset  = (v3 + coord0) << 1, replaced by v12 in lanes where the mask is clear
 * No ds_read is issued in this row.
 * NOTE(review): presumably Bias/scaleAlpha for each coord0 were already read by earlier
 *               elements of this batch -- confirm; the bias-address VGPRs may be unused.
 * NOTE(review): v12 is written outside this chunk; presumably an out-of-range offset -- confirm.
 */
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v117, v0, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v117, 0x2, v117                      // Bias address scaled by BPE
v_add_lshl_u32 v116, v3, v0, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v116, v12, v116, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,8,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v119, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v119, 0x2, v119                      // Bias address scaled by BPE
v_add_lshl_u32 v118, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v118, v12, v118, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,8,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v121, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v121, 0x2, v121                      // Bias address scaled by BPE
v_add_lshl_u32 v120, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v120, v12, v120, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,8,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v123, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v123, 0x2, v123                      // Bias address scaled by BPE
v_add_lshl_u32 v122, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v122, v12, v122, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,8,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v125, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v125, 0x2, v125                      // Bias address scaled by BPE
v_add_lshl_u32 v124, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v124, v12, v124, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,8,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v127, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v127, 0x2, v127                      // Bias address scaled by BPE
v_add_lshl_u32 v126, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v126, v12, v126, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,8,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v129, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v129, 0x2, v129                      // Bias address scaled by BPE
v_add_lshl_u32 v128, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v128, v12, v128, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,8,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v131, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v131, 0x2, v131                      // Bias address scaled by BPE
v_add_lshl_u32 v130, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v130, v12, v130, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/*
 * Address setup for row vc1=9, elements vc0=0..7.
 * Row advance: coord1 (v1) += 1, and the C / D row pointers (v2 / v3) move by
 * StrideC1J / StrideD1J.
 * Per element:
 *   coord0    = v0 (vc0=0) or v4 = v0 + vc0
 *   s[82:83]  = (coord0 < SizeI) && (coord1 < SizeJ)   per-lane in-bounds mask
 *   bias addr = (coord0 - 256*WorkGroup0) << 2
 *   D offset  = (v3 + coord0) << 1, replaced by v12 in lanes where the mask is clear
 * Destination VGPRs here start at v135 (the previous row ended at v131).
 * No ds_read is issued in this row.
 * NOTE(review): presumably Bias/scaleAlpha for each coord0 were already read by earlier
 *               elements of this batch -- confirm; the bias-address VGPRs may be unused.
 * NOTE(review): v12 is written outside this chunk; presumably an out-of-range offset -- confirm.
 */
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v136, v0, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v136, 0x2, v136                      // Bias address scaled by BPE
v_add_lshl_u32 v135, v3, v0, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v135, v12, v135, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,9,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v138, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v138, 0x2, v138                      // Bias address scaled by BPE
v_add_lshl_u32 v137, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v137, v12, v137, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,9,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v140, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v140, 0x2, v140                      // Bias address scaled by BPE
v_add_lshl_u32 v139, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v139, v12, v139, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,9,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v142, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v142, 0x2, v142                      // Bias address scaled by BPE
v_add_lshl_u32 v141, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v141, v12, v141, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,9,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v144, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v144, 0x2, v144                      // Bias address scaled by BPE
v_add_lshl_u32 v143, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v143, v12, v143, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,9,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v146, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v146, 0x2, v146                      // Bias address scaled by BPE
v_add_lshl_u32 v145, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v145, v12, v145, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,9,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v148, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v148, 0x2, v148                      // Bias address scaled by BPE
v_add_lshl_u32 v147, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v147, v12, v147, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,9,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v150, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v150, 0x2, v150                      // Bias address scaled by BPE
v_add_lshl_u32 v149, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v149, v12, v149, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/*
 * Address setup for row vc1=10, elements vc0=0..7.
 * Row advance: coord1 (v1) += 1, and the C / D row pointers (v2 / v3) move by
 * StrideC1J / StrideD1J.
 * Per element:
 *   coord0    = v0 (vc0=0) or v4 = v0 + vc0
 *   s[82:83]  = (coord0 < SizeI) && (coord1 < SizeJ)   per-lane in-bounds mask
 *   bias addr = (coord0 - 256*WorkGroup0) << 2
 *   D offset  = (v3 + coord0) << 1, replaced by v12 in lanes where the mask is clear
 * No ds_read is issued in this row.
 * NOTE(review): presumably Bias/scaleAlpha for each coord0 were already read by earlier
 *               elements of this batch -- confirm; the bias-address VGPRs may be unused.
 * NOTE(review): v12 is written outside this chunk; presumably an out-of-range offset -- confirm.
 */
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v152, v0, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v152, 0x2, v152                      // Bias address scaled by BPE
v_add_lshl_u32 v151, v3, v0, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v151, v12, v151, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,10,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v154, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v154, 0x2, v154                      // Bias address scaled by BPE
v_add_lshl_u32 v153, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v153, v12, v153, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,10,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v156, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v156, 0x2, v156                      // Bias address scaled by BPE
v_add_lshl_u32 v155, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v155, v12, v155, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,10,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v158, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v158, 0x2, v158                      // Bias address scaled by BPE
v_add_lshl_u32 v157, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v157, v12, v157, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,10,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v160, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v160, 0x2, v160                      // Bias address scaled by BPE
v_add_lshl_u32 v159, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v159, v12, v159, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,10,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v162, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v162, 0x2, v162                      // Bias address scaled by BPE
v_add_lshl_u32 v161, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v161, v12, v161, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,10,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v164, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v164, 0x2, v164                      // Bias address scaled by BPE
v_add_lshl_u32 v163, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v163, v12, v163, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,10,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v166, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v166, 0x2, v166                      // Bias address scaled by BPE
v_add_lshl_u32 v165, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v165, v12, v165, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/*
 * Address setup for row vc1=11, elements vc0=0 and vc0=1 (the last two elements
 * before the accumulator copies begin).
 * Row advance: coord1 (v1) += 1, and the C / D row pointers (v2 / v3) move by
 * StrideC1J / StrideD1J.
 * Per element:
 *   coord0    = v0 (vc0=0) or v4 = v0 + 1
 *   s[82:83]  = (coord0 < SizeI) && (coord1 < SizeJ)   per-lane in-bounds mask
 *   bias addr = (coord0 - 256*WorkGroup0) << 2
 *   D offset  = (v3 + coord0) << 1, replaced by v12 in lanes where the mask is clear
 * No ds_read is issued here.
 * NOTE(review): presumably Bias/scaleAlpha for each coord0 were already read by earlier
 *               elements of this batch -- confirm; the bias-address VGPRs may be unused.
 * NOTE(review): v12 is written outside this chunk; presumably an out-of-range offset -- confirm.
 */
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v168, v0, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v168, 0x2, v168                      // Bias address scaled by BPE
v_add_lshl_u32 v167, v3, v0, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v167, v12, v167, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/* (d1,vc1,d0,vc0)=(0,11,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v170, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v170, 0x2, v170                      // Bias address scaled by BPE
v_add_lshl_u32 v169, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) << 1 = byte offset into D
v_cndmask_b32 v169, v12, v169, s[82:83]            // keep offset if in-bounds, else take v12 (LDD clip if OOB)
/*
 * Copy 45 accumulator registers into v[vgprValuC+17 .. vgprValuC+61].
 * Sources step by 4: acc180, acc184, ..., acc252 (19 registers), then
 * acc1, acc5, ..., acc101 (26 registers).
 * NOTE(review): the bracketed number in each trailing comment (45..89) looks like the
 *               generator's running element index, not a register number -- confirm.
 */
v_accvgpr_read_b32 v[vgprValuC+17], acc180         // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+18], acc184         // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+19], acc188         // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+20], acc192         // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+21], acc196         // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+22], acc200         // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+23], acc204         // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+24], acc208         // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+25], acc212         // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+26], acc216         // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+27], acc220         // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+28], acc224         // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+29], acc228         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+30], acc232         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+31], acc236         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+32], acc240         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+33], acc244         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+34], acc248         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+35], acc252         // copy acc to vreg[63]
v_accvgpr_read_b32 v[vgprValuC+36], acc1           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+37], acc5           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+38], acc9           // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+39], acc13          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+40], acc17          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+41], acc21          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+42], acc25          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+43], acc29          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+44], acc33          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+45], acc37          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+46], acc41          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+47], acc45          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+48], acc49          // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+49], acc53          // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+50], acc57          // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+51], acc61          // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+52], acc65          // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+53], acc69          // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+54], acc73          // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+55], acc77          // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+56], acc81          // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+57], acc85          // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+58], acc89          // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+59], acc93          // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+60], acc97          // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+61], acc101         // copy acc to vreg[89]

/*
 * Multiply v[vgprValuC+17 .. vgprValuC+61] by s[sgprAlpha] in place (45 values).
 * The first register (+17) uses a scalar-operand v_mul_f32; the remaining 44 are
 * handled as 22 register pairs (+18/+19 .. +60/+61) with v_pk_mul_f32.
 * NOTE(review): op_sel_hi:[0,1,1] should make the upper lane of src0 read the low
 *               dword, i.e. s[sgprAlpha] scales both halves and s[sgprAlpha+1] is
 *               not used as a multiplier -- confirm against the ISA guide.
 * The trailing s_waitcnt lgkmcnt(0) blocks until outstanding LDS reads
 * (Bias / scaleAlpha) have returned before the values are consumed.
 */
/* rC *= alpha batchElements=[(0, 0, 5, 5), (0, 0, 5, 6), (0, 0, 5, 7), (0, 0, 6, 0), (0, 0, 6, 1), (0, 0, 6, 2), (0, 0, 6, 3), (0, 0, 6, 4), (0, 0, 6, 5), (0, 0, 6, 6), (0, 0, 6, 7), (0, 0, 7, 0), (0, 0, 7, 1), (0, 0, 7, 2), (0, 0, 7, 3), (0, 0, 7, 4), (0, 0, 7, 5), (0, 0, 7, 6), (0, 0, 7, 7), (0, 0, 8, 0), (0, 0, 8, 1), (0, 0, 8, 2), (0, 0, 8, 3), (0, 0, 8, 4), (0, 0, 8, 5), (0, 0, 8, 6), (0, 0, 8, 7), (0, 0, 9, 0), (0, 0, 9, 1), (0, 0, 9, 2), (0, 0, 9, 3), (0, 0, 9, 4), (0, 0, 9, 5), (0, 0, 9, 6), (0, 0, 9, 7), (0, 0, 10, 0), (0, 0, 10, 1), (0, 0, 10, 2), (0, 0, 10, 3), (0, 0, 10, 4), (0, 0, 10, 5), (0, 0, 10, 6), (0, 0, 10, 7), (0, 0, 11, 0), (0, 0, 11, 1)] */
v_mul_f32 v[vgprValuC+17], s[sgprAlpha], v[vgprValuC+17] // *= alpha
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec

/* apply mask, calc new C and issue writes */
/* Per-element epilogue, repeated once for every accumulator vgprValuC+N of the batch:
 *   1. ValuC[N] *= ScaleAlphaVec value   (v63 / v67 / ... / v91)
 *   2. v4 = bias value + ValuC[N]        (v62 / v66 / ... / v90)
 *   3. s_swappc_b64 jumps to the address held in s[8:9] and saves the return PC in
 *      s[66:67]; the code after the call reads its result back from v4.
 *      NOTE(review): the called routine is outside this excerpt (presumably the
 *      activation function) — confirm which registers it reads and clobbers.
 *   4. vN = v4, then v_cvt_pk_bf16_f32 converts the value to bf16
 *   5. buffer_store_short writes the low 16 bits to D at the per-element offset VGPR
 * NOTE(review): plain vN and v[vgprValuC+N] are used for the same value, which only
 * lines up if vgprValuC evaluates to 0 — its definition is not visible here.
 * NOTE(review): v14..v16 below are not read by any instruction visible in this batch
 * (the conversion uses v_cvt_pk_bf16_f32); confirm whether the routine at s[8:9] needs
 * them before treating them as dead. */
v_mov_b32 v14, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v15, 0x7fff0000                          // fp32 Nan
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16
/* Elements vgprValuC+17 .. +24: bias/scale pairs v62/v63 .. v90/v91, D offsets v64, v68, ..., v92 */
v_mul_f32 v[vgprValuC+17], v63, v[vgprValuC+17]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+17]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v17, v4
v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1
buffer_store_short v17, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+18], v67, v[vgprValuC+18]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+18]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v18, v4
v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1
buffer_store_short v18, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+19], v71, v[vgprValuC+19]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+19]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v19, v4
v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1
buffer_store_short v19, v72, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+20], v75, v[vgprValuC+20]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+20]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v20, v4
v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1
buffer_store_short v20, v76, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+21], v79, v[vgprValuC+21]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+21]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v21, v4
v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1
buffer_store_short v21, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+22], v83, v[vgprValuC+22]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+22]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v22, v4
v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1
buffer_store_short v22, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+23]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v23, v4
v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1
buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+24], v91, v[vgprValuC+24]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+24]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v24, v4
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1
buffer_store_short v24, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Elements vgprValuC+25 .. +32. Each one: multiply by its ScaleAlphaVec register, add its
 * bias register into v4, call the routine at s[8:9] (return PC in s[66:67]), copy v4
 * back, convert to bf16 and store 16 bits to D.
 * The eight bias/scale pairs start over at v62/v63 here; D offsets are v94, v96, ..., v108. */
v_mul_f32 v[vgprValuC+25], v63, v[vgprValuC+25]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+25]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v25, v4
v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1
buffer_store_short v25, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+26], v67, v[vgprValuC+26]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+26]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v26, v4
v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1
buffer_store_short v26, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+27], v71, v[vgprValuC+27]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+27]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v27, v4
v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1
buffer_store_short v27, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+28], v75, v[vgprValuC+28]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+28]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v28, v4
v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1
buffer_store_short v28, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+29], v79, v[vgprValuC+29]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+29]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v29, v4
v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1
buffer_store_short v29, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+30], v83, v[vgprValuC+30]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+30]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v30, v4
v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1
buffer_store_short v30, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+31]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v31, v4
v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1
buffer_store_short v31, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+32], v91, v[vgprValuC+32]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+32]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v32, v4
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1
buffer_store_short v32, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Elements vgprValuC+33 .. +40. Each one: multiply by its ScaleAlphaVec register, add its
 * bias register into v4, call the routine at s[8:9] (return PC in s[66:67]), copy v4
 * back, convert to bf16 and store 16 bits to D.
 * The eight bias/scale pairs start over at v62/v63 here; D offsets are v110, v112, ..., v124. */
v_mul_f32 v[vgprValuC+33], v63, v[vgprValuC+33]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+33]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v33, v4
v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1
buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+34], v67, v[vgprValuC+34]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+34]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v34, v4
v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1
buffer_store_short v34, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+35], v71, v[vgprValuC+35]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+35]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v35, v4
v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1
buffer_store_short v35, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+36], v75, v[vgprValuC+36]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+36]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v36, v4
v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1
buffer_store_short v36, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+37], v79, v[vgprValuC+37]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+37]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v37, v4
v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1
buffer_store_short v37, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+38], v83, v[vgprValuC+38]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+38]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v38, v4
v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1
buffer_store_short v38, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+39]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v39, v4
v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1
buffer_store_short v39, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+40], v91, v[vgprValuC+40]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+40]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v40, v4
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1
buffer_store_short v40, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Elements vgprValuC+41 .. +48. Each one: multiply by its ScaleAlphaVec register, add its
 * bias register into v4, call the routine at s[8:9] (return PC in s[66:67]), copy v4
 * back, convert to bf16 and store 16 bits to D.
 * The eight bias/scale pairs start over at v62/v63 here. D offsets are v126, v128, v130,
 * then the numbering jumps to v135, v137, ..., v143. */
v_mul_f32 v[vgprValuC+41], v63, v[vgprValuC+41]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+41]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v41, v4
v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1
buffer_store_short v41, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+42], v67, v[vgprValuC+42]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+42]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v42, v4
v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1
buffer_store_short v42, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+43], v71, v[vgprValuC+43]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+43]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v43, v4
v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1
buffer_store_short v43, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+44], v75, v[vgprValuC+44]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+44]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v44, v4
v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1
buffer_store_short v44, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+45], v79, v[vgprValuC+45]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+45]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v45, v4
v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1
buffer_store_short v45, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+46], v83, v[vgprValuC+46]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+46]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v46, v4
v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1
buffer_store_short v46, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+47]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v47, v4
v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1
buffer_store_short v47, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+48], v91, v[vgprValuC+48]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+48]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v48, v4
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1
buffer_store_short v48, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Elements vgprValuC+49 .. +56. Each one: multiply by its ScaleAlphaVec register, add its
 * bias register into v4, call the routine at s[8:9] (return PC in s[66:67]), copy v4
 * back, convert to bf16 and store 16 bits to D.
 * The eight bias/scale pairs start over at v62/v63 here; D offsets are v145, v147, ..., v159. */
v_mul_f32 v[vgprValuC+49], v63, v[vgprValuC+49]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+49]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v49, v4
v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1
buffer_store_short v49, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+50], v67, v[vgprValuC+50]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+50]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v50, v4
v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1
buffer_store_short v50, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+51], v71, v[vgprValuC+51]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+51]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v51, v4
v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1
buffer_store_short v51, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+52], v75, v[vgprValuC+52]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+52]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v52, v4
v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1
buffer_store_short v52, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+53], v79, v[vgprValuC+53]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+53]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v53, v4
v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1
buffer_store_short v53, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+54], v83, v[vgprValuC+54]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+54]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v54, v4
v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1
buffer_store_short v54, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+55], v87, v[vgprValuC+55]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+55]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v55, v4
v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1
buffer_store_short v55, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+56], v91, v[vgprValuC+56]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+56]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v56, v4
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1
buffer_store_short v56, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Last five elements of the batch, vgprValuC+57 .. +61. Each one: multiply by its
 * ScaleAlphaVec register, add its bias register into v4, call the routine at s[8:9]
 * (return PC in s[66:67]), copy v4 back, convert to bf16 and store 16 bits to D.
 * Only the first five bias/scale pairs (v62/v63 .. v78/v79) are used; D offsets are
 * v161, v163, ..., v169. */
v_mul_f32 v[vgprValuC+57], v63, v[vgprValuC+57]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+57]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v57, v4
v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1
buffer_store_short v57, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+58], v67, v[vgprValuC+58]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+58]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v58, v4
v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1
buffer_store_short v58, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+59], v71, v[vgprValuC+59]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+59]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v59, v4
v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1
buffer_store_short v59, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+60], v75, v[vgprValuC+60]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+60]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v60, v4
v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1
buffer_store_short v60, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+61], v79, v[vgprValuC+61]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+61]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v61, v4
v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1
buffer_store_short v61, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* NOTE(review): the wait-state comment below talks about a dwordx4 store, but every store
 * in this batch is buffer_store_short — confirm whether this s_nop is still required. */
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #2 (d1,d0,vc1,vc0) = */
/*    (0,0,11,2:vw1); (0,0,11,3:vw1); (0,0,11,4:vw1); (0,0,11,5:vw1); (0,0,11,6:vw1); (0,0,11,7:vw1); (0,0,12,0:vw1); (0,0,12,1:vw1); (0,0,12,2:vw1); (0,0,12,3:vw1); (0,0,12,4:vw1); (0,0,12,5:vw1); (0,0,12,6:vw1); (0,0,12,7:vw1); (0,0,13,0:vw1); (0,0,13,1:vw1); (0,0,13,2:vw1); (0,0,13,3:vw1); (0,0,13,4:vw1); (0,0,13,5:vw1); (0,0,13,6:vw1); (0,0,13,7:vw1); (0,0,14,0:vw1); (0,0,14,1:vw1); (0,0,14,2:vw1); (0,0,14,3:vw1); (0,0,14,4:vw1); (0,0,14,5:vw1); (0,0,14,6:vw1); (0,0,14,7:vw1); (0,0,15,0:vw1); (0,0,15,1:vw1); (0,0,15,2:vw1); (0,0,15,3:vw1); (0,0,15,4:vw1); (0,0,15,5:vw1); (0,0,15,6:vw1); (0,0,15,7:vw1); (0,0,16,0:vw1); (0,0,16,1:vw1); (0,0,16,2:vw1); (0,0,16,3:vw1); (0,0,16,4:vw1); (0,0,16,5:vw1); (0,0,16,6:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Row vc1=11, columns vc0=2..7. Every element repeats the same recipe:
 *   v4       = v0 + vc0                       coord0 of the element
 *   s[82:83] = (v4 < SizeI) && (v1 < SizeJ)   per-lane in-bounds mask
 *   s78      = 256 * WorkGroup0               redone each time because the v_cmp into
 *                                             s[78:79] overwrites s78
 *   LDS addr = (v4 - s78) << 2                tile-local column * 4 bytes
 *   ds_read_b32 fetches the bias value at offset 0 and the scaleAlpha value 1024 bytes
 *   above it; both reads are issued regardless of the in-bounds mask
 *   D offset = (v3 + v4) << 1, replaced by v12 (BufferOOB) in lanes where the mask is 0
 * NOTE(review): the inherited "into Cin addr" comments sit on instructions that use v3,
 * which the row-increment code in this file labels coutRowPtrD — these look like D
 * (output) offsets; confirm and reword in the generator if so. */
v_mov_b32 v12, BufferOOB
/* (d1,vc1,d0,vc0)=(0,11,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v65, v4, s78
v_lshlrev_b32 v65, 0x2, v65                        // Bias address scaled by BPE
ds_read_b32 v62, v65 offset:0                      // load Bias
ds_read_b32 v63, v65 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v64, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v64, v12, v64, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,11,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v69, v4, s78
v_lshlrev_b32 v69, 0x2, v69                        // Bias address scaled by BPE
ds_read_b32 v66, v69 offset:0                      // load Bias
ds_read_b32 v67, v69 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v68, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v68, v12, v68, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,11,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v73, v4, s78
v_lshlrev_b32 v73, 0x2, v73                        // Bias address scaled by BPE
ds_read_b32 v70, v73 offset:0                      // load Bias
ds_read_b32 v71, v73 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v72, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v72, v12, v72, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,11,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v77, v4, s78
v_lshlrev_b32 v77, 0x2, v77                        // Bias address scaled by BPE
ds_read_b32 v74, v77 offset:0                      // load Bias
ds_read_b32 v75, v77 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v76, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v76, v12, v76, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,11,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v81, v4, s78
v_lshlrev_b32 v81, 0x2, v81                        // Bias address scaled by BPE
ds_read_b32 v78, v81 offset:0                      // load Bias
ds_read_b32 v79, v81 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v80, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v80, v12, v80, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,11,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v85, v4, s78
v_lshlrev_b32 v85, 0x2, v85                        // Bias address scaled by BPE
ds_read_b32 v82, v85 offset:0                      // load Bias
ds_read_b32 v83, v85 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v84, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v84, v12, v84, s[82:83]              // LDD clip if OOB. offset
/* Row vc1=12, columns vc0=0..7. The row step advances coord1 (v1) by one and moves both
 * row pointers (v2 by StrideC1J, v3 by StrideD1J). For each element the in-bounds mask
 * s[82:83] = (coord0 < SizeI) && (v1 < SizeJ) selects between the computed D offset
 * ((v3 + coord0) << 1) and v12 in the final v_cndmask_b32.
 * Columns 0 and 1 still issue ds_read_b32 for bias/scaleAlpha (into v86/v87 and v90/v91).
 * From column 2 onward only the LDS address and the D offset are computed; no ds_read is
 * issued.
 * NOTE(review): presumably those elements reuse the bias/scaleAlpha values already loaded
 * for the same column in the previous row — the consuming store code is outside this
 * excerpt. The LDS address registers v95, v97, ..., v105 have no reader in the visible
 * code; confirm before treating them as dead. */
/* (d1,vc1,d0,vc0)=(0,12,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v89, v0, s78
v_lshlrev_b32 v89, 0x2, v89                        // Bias address scaled by BPE
ds_read_b32 v86, v89 offset:0                      // load Bias
ds_read_b32 v87, v89 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v88, v3, v0, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v88, v12, v88, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,12,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v93, v4, s78
v_lshlrev_b32 v93, 0x2, v93                        // Bias address scaled by BPE
ds_read_b32 v90, v93 offset:0                      // load Bias
ds_read_b32 v91, v93 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v92, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v92, v12, v92, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,12,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v95, v4, s78
v_lshlrev_b32 v95, 0x2, v95                        // Bias address scaled by BPE
v_add_lshl_u32 v94, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v94, v12, v94, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,12,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v97, v4, s78
v_lshlrev_b32 v97, 0x2, v97                        // Bias address scaled by BPE
v_add_lshl_u32 v96, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v96, v12, v96, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,12,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v99, v4, s78
v_lshlrev_b32 v99, 0x2, v99                        // Bias address scaled by BPE
v_add_lshl_u32 v98, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v98, v12, v98, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,12,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v101, v4, s78
v_lshlrev_b32 v101, 0x2, v101                      // Bias address scaled by BPE
v_add_lshl_u32 v100, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v100, v12, v100, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,12,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v103, v4, s78
v_lshlrev_b32 v103, 0x2, v103                      // Bias address scaled by BPE
v_add_lshl_u32 v102, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v102, v12, v102, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,12,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v105, v4, s78
v_lshlrev_b32 v105, 0x2, v105                      // Bias address scaled by BPE
v_add_lshl_u32 v104, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v104, v12, v104, s[82:83]            // LDD clip if OOB. offset
/* Row vc1=13, columns vc0=0..4. The row step advances coord1 (v1) by one and moves both
 * row pointers (v2 by StrideC1J, v3 by StrideD1J). Each element then computes
 *   s[82:83] = (coord0 < SizeI) && (v1 < SizeJ)
 *   LDS addr = (coord0 - 256 * WorkGroup0) << 2      (v107, v109, ..., v115)
 *   D offset = (v3 + coord0) << 1, or v12 in lanes where the mask is 0
 * No ds_read is issued for any element in this run.
 * NOTE(review): the LDS address registers written here have no reader in the visible
 * code; whether later code consumes them cannot be determined from this excerpt. */
/* (d1,vc1,d0,vc0)=(0,13,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v107, v0, s78
v_lshlrev_b32 v107, 0x2, v107                      // Bias address scaled by BPE
v_add_lshl_u32 v106, v3, v0, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v106, v12, v106, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,13,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v109, v4, s78
v_lshlrev_b32 v109, 0x2, v109                      // Bias address scaled by BPE
v_add_lshl_u32 v108, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v108, v12, v108, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,13,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v111, v4, s78
v_lshlrev_b32 v111, 0x2, v111                      // Bias address scaled by BPE
v_add_lshl_u32 v110, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v110, v12, v110, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,13,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v113, v4, s78
v_lshlrev_b32 v113, 0x2, v113                      // Bias address scaled by BPE
v_add_lshl_u32 v112, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v112, v12, v112, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,13,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v115, v4, s78
v_lshlrev_b32 v115, 0x2, v115                      // Bias address scaled by BPE
v_add_lshl_u32 v114, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v114, v12, v114, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,13,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v117, v4, s78
v_lshlrev_b32 v117, 0x2, v117                      // Bias address scaled by BPE
v_add_lshl_u32 v116, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v116, v12, v116, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,13,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v119, v4, s78
v_lshlrev_b32 v119, 0x2, v119                      // Bias address scaled by BPE
v_add_lshl_u32 v118, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v118, v12, v118, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,13,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v121, v4, s78
v_lshlrev_b32 v121, 0x2, v121                      // Bias address scaled by BPE
v_add_lshl_u32 v120, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v120, v12, v120, s[82:83]            // LDD clip if OOB. offset
/*
 * Store-address setup for output row vc1=14 (eight elements, vc0=0..7).
 * The row step adds 1 to coord1 (v1) and advances the C and D row pointers
 * (v2, v3) by their row strides; v2 is not read again inside this block.
 * Per element:
 *   s[82:83] = (coord0 < SizeI) && (coord1 < SizeJ)     in-bounds lane mask
 *   bias reg = (coord0 - 256*WorkGroup0) << 2
 *   addr reg = (v3 + coord0) << 1, replaced by v12 in lanes where the mask is clear
 * coord0 is v0 for vc0=0 and v4 = v0 + vc0 otherwise (v4 is scratch).
 * s78 is recomputed for every element because the v_cmp into s[78:79]
 * overwrites it.
 * Register numbering jumps from v130/v131 (vc0=4) to v135/v136 (vc0=5);
 * v132..v134 are not written in this block.
 * NOTE(review): v12 is set up outside this view; presumably an out-of-range
 * offset so the access is dropped - confirm.
 */
/* (d1,vc1,d0,vc0)=(0,14,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v123, v0, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v123, 0x2, v123                      // Bias address scaled by BPE
v_add_lshl_u32 v122, v3, v0, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v122, v12, v122, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v125, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v125, 0x2, v125                      // Bias address scaled by BPE
v_add_lshl_u32 v124, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v124, v12, v124, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v127, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v127, 0x2, v127                      // Bias address scaled by BPE
v_add_lshl_u32 v126, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v126, v12, v126, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v129, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v129, 0x2, v129                      // Bias address scaled by BPE
v_add_lshl_u32 v128, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v128, v12, v128, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v131, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v131, 0x2, v131                      // Bias address scaled by BPE
v_add_lshl_u32 v130, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v130, v12, v130, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v136, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v136, 0x2, v136                      // Bias address scaled by BPE
v_add_lshl_u32 v135, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v135, v12, v135, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v138, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v138, 0x2, v138                      // Bias address scaled by BPE
v_add_lshl_u32 v137, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v137, v12, v137, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v140, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v140, 0x2, v140                      // Bias address scaled by BPE
v_add_lshl_u32 v139, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v139, v12, v139, s[82:83]            // LDD clip if OOB. offset
/*
 * Store-address setup for output row vc1=15 (eight elements, vc0=0..7).
 * The row step adds 1 to coord1 (v1) and advances the C and D row pointers
 * (v2, v3) by their row strides; v2 is not read again inside this block.
 * Per element:
 *   s[82:83] = (coord0 < SizeI) && (coord1 < SizeJ)     in-bounds lane mask
 *   bias reg = (coord0 - 256*WorkGroup0) << 2
 *   addr reg = (v3 + coord0) << 1, replaced by v12 in lanes where the mask is clear
 * coord0 is v0 for vc0=0 and v4 = v0 + vc0 otherwise (v4 is scratch).
 * s78 is recomputed for every element because the v_cmp into s[78:79]
 * overwrites it.
 * NOTE(review): v12 is set up outside this view; presumably an out-of-range
 * offset so the access is dropped - confirm.
 */
/* (d1,vc1,d0,vc0)=(0,15,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v142, v0, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v142, 0x2, v142                      // Bias address scaled by BPE
v_add_lshl_u32 v141, v3, v0, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v141, v12, v141, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v144, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v144, 0x2, v144                      // Bias address scaled by BPE
v_add_lshl_u32 v143, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v143, v12, v143, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v146, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v146, 0x2, v146                      // Bias address scaled by BPE
v_add_lshl_u32 v145, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v145, v12, v145, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v148, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v148, 0x2, v148                      // Bias address scaled by BPE
v_add_lshl_u32 v147, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v147, v12, v147, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v150, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v150, 0x2, v150                      // Bias address scaled by BPE
v_add_lshl_u32 v149, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v149, v12, v149, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v152, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v152, 0x2, v152                      // Bias address scaled by BPE
v_add_lshl_u32 v151, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v151, v12, v151, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v154, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v154, 0x2, v154                      // Bias address scaled by BPE
v_add_lshl_u32 v153, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v153, v12, v153, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v156, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v156, 0x2, v156                      // Bias address scaled by BPE
v_add_lshl_u32 v155, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v155, v12, v155, s[82:83]            // LDD clip if OOB. offset
/*
 * Store-address setup for output row vc1=16. Only vc0=0..6 are handled
 * here (seven elements); the accumulator copy follows directly.
 * The row step adds 1 to coord1 (v1) and advances the C and D row pointers
 * (v2, v3) by their row strides; v2 is not read again inside this block.
 * Per element:
 *   s[82:83] = (coord0 < SizeI) && (coord1 < SizeJ)     in-bounds lane mask
 *   bias reg = (coord0 - 256*WorkGroup0) << 2
 *   addr reg = (v3 + coord0) << 1, replaced by v12 in lanes where the mask is clear
 * coord0 is v0 for vc0=0 and v4 = v0 + vc0 otherwise (v4 is scratch).
 * s78 is recomputed for every element because the v_cmp into s[78:79]
 * overwrites it.
 * NOTE(review): v12 is set up outside this view; presumably an out-of-range
 * offset so the access is dropped - confirm.
 */
/* (d1,vc1,d0,vc0)=(0,16,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v158, v0, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v158, 0x2, v158                      // Bias address scaled by BPE
v_add_lshl_u32 v157, v3, v0, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v157, v12, v157, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,16,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v160, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v160, 0x2, v160                      // Bias address scaled by BPE
v_add_lshl_u32 v159, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v159, v12, v159, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,16,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v162, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v162, 0x2, v162                      // Bias address scaled by BPE
v_add_lshl_u32 v161, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v161, v12, v161, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,16,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v164, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v164, 0x2, v164                      // Bias address scaled by BPE
v_add_lshl_u32 v163, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v163, v12, v163, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,16,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v166, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v166, 0x2, v166                      // Bias address scaled by BPE
v_add_lshl_u32 v165, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v165, v12, v165, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,16,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v168, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v168, 0x2, v168                      // Bias address scaled by BPE
v_add_lshl_u32 v167, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v167, v12, v167, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,16,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v170, v4, s78                            // coord0 - wgp0*MT0
v_lshlrev_b32 v170, 0x2, v170                      // Bias address scaled by BPE
v_add_lshl_u32 v169, v3, v4, 0x1                   // scaleToBpe: (coutRowPtrD + coord0) * bpe -> D offset
v_cndmask_b32 v169, v12, v169, s[82:83]            // LDD clip if OOB. offset
/*
 * Copy 45 accumulator values into v[vgprValuC+17 .. vgprValuC+61], one per
 * batch element. The source acc index advances by 4 per element:
 * acc105, acc109, ..., acc253, then continues at acc2, acc6, ..., acc26.
 * The vreg[N] number in each trailing comment is the generator's own element
 * counter, not the destination VGPR index.
 */
v_accvgpr_read_b32 v[vgprValuC+17], acc105         // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+18], acc109         // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+19], acc113         // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+20], acc117         // copy acc to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+21], acc121         // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+22], acc125         // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+23], acc129         // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+24], acc133         // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+25], acc137         // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+26], acc141         // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+27], acc145         // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+28], acc149         // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+29], acc153         // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+30], acc157         // copy acc to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+31], acc161         // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+32], acc165         // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+33], acc169         // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+34], acc173         // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+35], acc177         // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+36], acc181         // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+37], acc185         // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+38], acc189         // copy acc to vreg[111]
v_accvgpr_read_b32 v[vgprValuC+39], acc193         // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+40], acc197         // copy acc to vreg[113]
v_accvgpr_read_b32 v[vgprValuC+41], acc201         // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+42], acc205         // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+43], acc209         // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+44], acc213         // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+45], acc217         // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+46], acc221         // copy acc to vreg[119]
v_accvgpr_read_b32 v[vgprValuC+47], acc225         // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+48], acc229         // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+49], acc233         // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+50], acc237         // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+51], acc241         // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+52], acc245         // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+53], acc249         // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+54], acc253         // copy acc to vreg[127]
v_accvgpr_read_b32 v[vgprValuC+55], acc2           // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+56], acc6           // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+57], acc10          // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+58], acc14          // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+59], acc18          // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+60], acc22          // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+61], acc26          // copy acc to vreg[134]

/*
 * Multiply the 45 copied values v[vgprValuC+17 .. +61] by alpha.
 * Element 17 is scaled alone with v_mul_f32 because the range starts on an
 * odd index; elements 18..61 are scaled as 22 register pairs with
 * v_pk_mul_f32. op_sel_hi:[0,1,1] clears the high-select bit for source 0
 * only, so both halves of the packed multiply take the low dword of the
 * SGPR pair, i.e. s[sgprAlpha].
 * The closing s_waitcnt lgkmcnt(0) blocks until outstanding LGKM-counted
 * operations have completed before the values loaded earlier are consumed.
 */
/* rC *= alpha batchElements=[(0, 0, 11, 2), (0, 0, 11, 3), (0, 0, 11, 4), (0, 0, 11, 5), (0, 0, 11, 6), (0, 0, 11, 7), (0, 0, 12, 0), (0, 0, 12, 1), (0, 0, 12, 2), (0, 0, 12, 3), (0, 0, 12, 4), (0, 0, 12, 5), (0, 0, 12, 6), (0, 0, 12, 7), (0, 0, 13, 0), (0, 0, 13, 1), (0, 0, 13, 2), (0, 0, 13, 3), (0, 0, 13, 4), (0, 0, 13, 5), (0, 0, 13, 6), (0, 0, 13, 7), (0, 0, 14, 0), (0, 0, 14, 1), (0, 0, 14, 2), (0, 0, 14, 3), (0, 0, 14, 4), (0, 0, 14, 5), (0, 0, 14, 6), (0, 0, 14, 7), (0, 0, 15, 0), (0, 0, 15, 1), (0, 0, 15, 2), (0, 0, 15, 3), (0, 0, 15, 4), (0, 0, 15, 5), (0, 0, 15, 6), (0, 0, 15, 7), (0, 0, 16, 0), (0, 0, 16, 1), (0, 0, 16, 2), (0, 0, 16, 3), (0, 0, 16, 4), (0, 0, 16, 5), (0, 0, 16, 6)] */
v_mul_f32 v[vgprValuC+17], s[sgprAlpha], v[vgprValuC+17] // *= alpha
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec

/*
 * Per-element epilogue for elements 17..32. Each element runs the same six
 * instructions:
 *   1. v[vgprValuC+N] *= per-column scale register
 *   2. v4 = per-column bias register + v[vgprValuC+N]
 *   3. s_swappc_b64: jump to the address in s[8:9], return PC saved in s[66:67]
 *   4. copy v4 into vN
 *   5. convert to bf16 (both packed halves get the same value)
 *   6. buffer_store_short of the low 16 bits to D at the element's offset
 * Scale/bias register pairs repeat every 8 elements, i.e. once per output
 * column: (v63,v62) (v67,v66) (v71,v70) (v75,v74) (v79,v78) (v83,v82)
 * (v87,v86) (v91,v90). The offset registers do not repeat (v64, v68, ...,
 * v92, then v94, v96, ..., v108).
 * NOTE(review): the routine at s[8:9] is not visible here - presumably the
 * activation function, taking and returning its value in v4; confirm.
 * NOTE(review): steps 4-6 name the register as literal vN while steps 1, 2
 * and the convert source use v[vgprValuC+N]; these are the same register
 * only if vgprValuC == 0 - confirm against the symbol definition.
 * NOTE(review): v14, v15 and v16 are loaded below but not read by any
 * instruction in this block; possibly consumed by the called routine or
 * left over from a non-hardware bf16 conversion path - confirm.
 */
/* apply mask, calc new C and issue writes */
v_mov_b32 v14, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v15, 0x7fff0000                          // fp32 Nan
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16
v_mul_f32 v[vgprValuC+17], v63, v[vgprValuC+17]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+17]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call routine at s[8:9]; return PC saved in s[66:67]
v_mov_b32 v17, v4                                  // copy v4 (as left by the call) into the store register
v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1
buffer_store_short v17, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+18], v67, v[vgprValuC+18]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+18]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call routine at s[8:9]; return PC saved in s[66:67]
v_mov_b32 v18, v4                                  // copy v4 (as left by the call) into the store register
v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1
buffer_store_short v18, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+19], v71, v[vgprValuC+19]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+19]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call routine at s[8:9]; return PC saved in s[66:67]
v_mov_b32 v19, v4                                  // copy v4 (as left by the call) into the store register
v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1
buffer_store_short v19, v72, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+20], v75, v[vgprValuC+20]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+20]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call routine at s[8:9]; return PC saved in s[66:67]
v_mov_b32 v20, v4                                  // copy v4 (as left by the call) into the store register
v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1
buffer_store_short v20, v76, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+21], v79, v[vgprValuC+21]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+21]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call routine at s[8:9]; return PC saved in s[66:67]
v_mov_b32 v21, v4                                  // copy v4 (as left by the call) into the store register
v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1
buffer_store_short v21, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+22], v83, v[vgprValuC+22]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+22]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call routine at s[8:9]; return PC saved in s[66:67]
v_mov_b32 v22, v4                                  // copy v4 (as left by the call) into the store register
v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1
buffer_store_short v22, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+23]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call routine at s[8:9]; return PC saved in s[66:67]
v_mov_b32 v23, v4                                  // copy v4 (as left by the call) into the store register
v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1
buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+24], v91, v[vgprValuC+24]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+24]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call routine at s[8:9]; return PC saved in s[66:67]
v_mov_b32 v24, v4                                  // copy v4 (as left by the call) into the store register
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1
buffer_store_short v24, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+25], v63, v[vgprValuC+25]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+25]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call routine at s[8:9]; return PC saved in s[66:67]
v_mov_b32 v25, v4                                  // copy v4 (as left by the call) into the store register
v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1
buffer_store_short v25, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+26], v67, v[vgprValuC+26]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+26]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call routine at s[8:9]; return PC saved in s[66:67]
v_mov_b32 v26, v4                                  // copy v4 (as left by the call) into the store register
v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1
buffer_store_short v26, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+27], v71, v[vgprValuC+27]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+27]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call routine at s[8:9]; return PC saved in s[66:67]
v_mov_b32 v27, v4                                  // copy v4 (as left by the call) into the store register
v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1
buffer_store_short v27, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+28], v75, v[vgprValuC+28]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+28]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call routine at s[8:9]; return PC saved in s[66:67]
v_mov_b32 v28, v4                                  // copy v4 (as left by the call) into the store register
v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1
buffer_store_short v28, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+29], v79, v[vgprValuC+29]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+29]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call routine at s[8:9]; return PC saved in s[66:67]
v_mov_b32 v29, v4                                  // copy v4 (as left by the call) into the store register
v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1
buffer_store_short v29, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+30], v83, v[vgprValuC+30]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+30]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call routine at s[8:9]; return PC saved in s[66:67]
v_mov_b32 v30, v4                                  // copy v4 (as left by the call) into the store register
v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1
buffer_store_short v30, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+31]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call routine at s[8:9]; return PC saved in s[66:67]
v_mov_b32 v31, v4                                  // copy v4 (as left by the call) into the store register
v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1
buffer_store_short v31, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+32], v91, v[vgprValuC+32]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+32]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call routine at s[8:9]; return PC saved in s[66:67]
v_mov_b32 v32, v4                                  // copy v4 (as left by the call) into the store register
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1
buffer_store_short v32, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Per-element epilogue for accumulators vgprValuC+33 .. vgprValuC+40.
 * Each element: scale -> add bias into v4 -> call through s[8:9] -> copy v4 back ->
 * pack to bf16 -> 16-bit store to D. Scale/bias pairs cycle (v63,v62) .. (v91,v90);
 * the store offsets are v110, v112, ... v124.
 * NOTE(review): the routine reached via s[8:9] is outside this view; it appears to
 * take and return its value in v4 - confirm.
 * NOTE(review): bare vN and v[vgprValuC+N] coincide only if vgprValuC == 0 - confirm. */
v_mul_f32 v[vgprValuC+33], v63, v[vgprValuC+33]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+33]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v33, v4                                  // v33 = v4 (value after the call)
v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1
buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+34], v67, v[vgprValuC+34]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+34]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v34, v4                                  // v34 = v4 (value after the call)
v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1
buffer_store_short v34, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+35], v71, v[vgprValuC+35]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+35]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v35, v4                                  // v35 = v4 (value after the call)
v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1
buffer_store_short v35, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+36], v75, v[vgprValuC+36]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+36]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v36, v4                                  // v36 = v4 (value after the call)
v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1
buffer_store_short v36, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+37], v79, v[vgprValuC+37]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+37]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v37, v4                                  // v37 = v4 (value after the call)
v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1
buffer_store_short v37, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+38], v83, v[vgprValuC+38]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+38]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v38, v4                                  // v38 = v4 (value after the call)
v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1
buffer_store_short v38, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+39]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v39, v4                                  // v39 = v4 (value after the call)
v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1
buffer_store_short v39, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+40], v91, v[vgprValuC+40]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+40]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v40, v4                                  // v40 = v4 (value after the call)
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1
buffer_store_short v40, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Per-element epilogue for accumulators vgprValuC+41 .. vgprValuC+48.
 * Each element: scale -> add bias into v4 -> call through s[8:9] -> copy v4 back ->
 * pack to bf16 -> 16-bit store to D. Scale/bias pairs cycle (v63,v62) .. (v91,v90).
 * The store offsets are v126, v128, v130 and then jump to v135, v137, ... v143.
 * NOTE(review): the routine reached via s[8:9] is outside this view; it appears to
 * take and return its value in v4 - confirm.
 * NOTE(review): bare vN and v[vgprValuC+N] coincide only if vgprValuC == 0 - confirm. */
v_mul_f32 v[vgprValuC+41], v63, v[vgprValuC+41]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+41]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v41, v4                                  // v41 = v4 (value after the call)
v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1
buffer_store_short v41, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+42], v67, v[vgprValuC+42]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+42]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v42, v4                                  // v42 = v4 (value after the call)
v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1
buffer_store_short v42, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+43], v71, v[vgprValuC+43]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+43]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v43, v4                                  // v43 = v4 (value after the call)
v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1
buffer_store_short v43, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+44], v75, v[vgprValuC+44]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+44]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v44, v4                                  // v44 = v4 (value after the call)
v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1
buffer_store_short v44, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+45], v79, v[vgprValuC+45]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+45]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v45, v4                                  // v45 = v4 (value after the call)
v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1
buffer_store_short v45, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+46], v83, v[vgprValuC+46]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+46]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v46, v4                                  // v46 = v4 (value after the call)
v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1
buffer_store_short v46, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+47]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v47, v4                                  // v47 = v4 (value after the call)
v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1
buffer_store_short v47, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+48], v91, v[vgprValuC+48]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+48]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v48, v4                                  // v48 = v4 (value after the call)
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1
buffer_store_short v48, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Per-element epilogue for accumulators vgprValuC+49 .. vgprValuC+56.
 * Each element: scale -> add bias into v4 -> call through s[8:9] -> copy v4 back ->
 * pack to bf16 -> 16-bit store to D. Scale/bias pairs cycle (v63,v62) .. (v91,v90);
 * the store offsets are v145, v147, ... v159.
 * NOTE(review): the routine reached via s[8:9] is outside this view; it appears to
 * take and return its value in v4 - confirm.
 * NOTE(review): bare vN and v[vgprValuC+N] coincide only if vgprValuC == 0 - confirm. */
v_mul_f32 v[vgprValuC+49], v63, v[vgprValuC+49]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+49]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v49, v4                                  // v49 = v4 (value after the call)
v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1
buffer_store_short v49, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+50], v67, v[vgprValuC+50]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+50]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v50, v4                                  // v50 = v4 (value after the call)
v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1
buffer_store_short v50, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+51], v71, v[vgprValuC+51]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+51]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v51, v4                                  // v51 = v4 (value after the call)
v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1
buffer_store_short v51, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+52], v75, v[vgprValuC+52]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+52]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v52, v4                                  // v52 = v4 (value after the call)
v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1
buffer_store_short v52, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+53], v79, v[vgprValuC+53]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+53]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v53, v4                                  // v53 = v4 (value after the call)
v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1
buffer_store_short v53, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+54], v83, v[vgprValuC+54]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+54]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v54, v4                                  // v54 = v4 (value after the call)
v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1
buffer_store_short v54, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+55], v87, v[vgprValuC+55]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+55]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v55, v4                                  // v55 = v4 (value after the call)
v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1
buffer_store_short v55, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+56], v91, v[vgprValuC+56]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+56]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v56, v4                                  // v56 = v4 (value after the call)
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1
buffer_store_short v56, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Per-element epilogue for accumulators vgprValuC+57 .. vgprValuC+61, followed by a
 * one-cycle s_nop before later code starts overwriting VGPRs.
 * Each element: scale -> add bias into v4 -> call through s[8:9] -> copy v4 back ->
 * pack to bf16 -> 16-bit store to D. The store offsets are v161, v163, ... v169.
 * NOTE(review): the routine reached via s[8:9] is outside this view; it appears to
 * take and return its value in v4 - confirm.
 * NOTE(review): bare vN and v[vgprValuC+N] coincide only if vgprValuC == 0 - confirm. */
v_mul_f32 v[vgprValuC+57], v63, v[vgprValuC+57]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+57]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v57, v4                                  // v57 = v4 (value after the call)
v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1
buffer_store_short v57, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+58], v67, v[vgprValuC+58]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+58]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v58, v4                                  // v58 = v4 (value after the call)
v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1
buffer_store_short v58, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+59], v71, v[vgprValuC+59]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+59]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v59, v4                                  // v59 = v4 (value after the call)
v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1
buffer_store_short v59, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+60], v75, v[vgprValuC+60]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+60]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v60, v4                                  // v60 = v4 (value after the call)
v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1
buffer_store_short v60, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+61], v79, v[vgprValuC+61]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+61]                 // C += bias (sum goes to v4)
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v61, v4                                  // v61 = v4 (value after the call)
v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1
buffer_store_short v61, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
s_nop 0                                            // 1 wait state before following instructions overwrite VGPRs used by the stores above (generator text says "dwordx4 store"; the stores here are buffer_store_short)
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #3 (d1,d0,vc1,vc0) = */
/*    (0,0,16,7:vw1); (0,0,17,0:vw1); (0,0,17,1:vw1); (0,0,17,2:vw1); (0,0,17,3:vw1); (0,0,17,4:vw1); (0,0,17,5:vw1); (0,0,17,6:vw1); (0,0,17,7:vw1); (0,0,18,0:vw1); (0,0,18,1:vw1); (0,0,18,2:vw1); (0,0,18,3:vw1); (0,0,18,4:vw1); (0,0,18,5:vw1); (0,0,18,6:vw1); (0,0,18,7:vw1); (0,0,19,0:vw1); (0,0,19,1:vw1); (0,0,19,2:vw1); (0,0,19,3:vw1); (0,0,19,4:vw1); (0,0,19,5:vw1); (0,0,19,6:vw1); (0,0,19,7:vw1); (0,0,20,0:vw1); (0,0,20,1:vw1); (0,0,20,2:vw1); (0,0,20,3:vw1); (0,0,20,4:vw1); (0,0,20,5:vw1); (0,0,20,6:vw1); (0,0,20,7:vw1); (0,0,21,0:vw1); (0,0,21,1:vw1); (0,0,21,2:vw1); (0,0,21,3:vw1); (0,0,21,4:vw1); (0,0,21,5:vw1); (0,0,21,6:vw1); (0,0,21,7:vw1); (0,0,22,0:vw1); (0,0,22,1:vw1); (0,0,22,2:vw1); (0,0,22,3:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Batch #3 setup: load the out-of-bounds offset into v12, then build the address and
 * bounds mask for element (d1,vc1,d0,vc0)=(0,16,0,7).
 *   v4  = coord0 for this element (v0 + vc0)
 *   v65 = LDS byte address of the element's bias entry; scaleAlpha sits 1024 bytes later
 *   v62 / v63 = bias / scaleAlpha values read from LDS
 *   v64 = (v3 + coord0) << 1, replaced by v12 in lanes that fail the bounds test */
v_mov_b32 v12, BufferOOB                           // v12 = offset substituted for out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,16,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (reuses s78 now that the coord0 mask has been consumed)
v_sub_u32 v65, v4, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v65, 0x2, v65                        // Bias address scaled by BPE
ds_read_b32 v62, v65 offset:0                      // load Bias
ds_read_b32 v63, v65 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v64, v3, v4, 0x1                    // scaleToBpe: (v3 + coord0) << 1, row pointer plus coord0 scaled to 2-byte elements
v_cndmask_b32 v64, v12, v64, s[82:83]              // LDD clip if OOB. offset
/* Batch #3 address setup for row vc1=17, elements vc0=0..7.
 * The row starts by stepping coord1 (v1) by one and advancing both row pointers
 * (v2 by StrideC1J, v3 by StrideD1J). Each element then:
 *   - forms coord0 (v0 itself for vc0=0, otherwise v4 = v0 + vc0),
 *   - builds the in-bounds lane mask in s[82:83] (coord0 < SizeI && coord1 < SizeJ),
 *   - computes the LDS byte address of its bias entry, (coord0 - wgp0*MT0) << 2,
 *   - for vc0=0..6 reads bias (offset 0) and scaleAlpha (offset 1024) from LDS,
 *   - computes the offset (v3 + coord0) << 1 and replaces it with v12 in lanes
 *     that fail the bounds test.
 * s78 is recomputed per element because the coord0 compare writes s[78:79].
 * NOTE(review): for vc0=7 the LDS address (v95) is computed but no ds_read follows
 * in the visible code; presumably previously loaded bias/scale registers are reused
 * - confirm. */
/* (d1,vc1,d0,vc0)=(0,17,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v69, v0, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v69, 0x2, v69                        // Bias address scaled by BPE
ds_read_b32 v66, v69 offset:0                      // load Bias
ds_read_b32 v67, v69 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v68, v3, v0, 0x1                    // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v68, v12, v68, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,17,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v73, v4, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v73, 0x2, v73                        // Bias address scaled by BPE
ds_read_b32 v70, v73 offset:0                      // load Bias
ds_read_b32 v71, v73 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v72, v3, v4, 0x1                    // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v72, v12, v72, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,17,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v77, v4, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v77, 0x2, v77                        // Bias address scaled by BPE
ds_read_b32 v74, v77 offset:0                      // load Bias
ds_read_b32 v75, v77 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v76, v3, v4, 0x1                    // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v76, v12, v76, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,17,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v81, v4, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v81, 0x2, v81                        // Bias address scaled by BPE
ds_read_b32 v78, v81 offset:0                      // load Bias
ds_read_b32 v79, v81 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v80, v3, v4, 0x1                    // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v80, v12, v80, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,17,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v85, v4, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v85, 0x2, v85                        // Bias address scaled by BPE
ds_read_b32 v82, v85 offset:0                      // load Bias
ds_read_b32 v83, v85 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v84, v3, v4, 0x1                    // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v84, v12, v84, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,17,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v89, v4, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v89, 0x2, v89                        // Bias address scaled by BPE
ds_read_b32 v86, v89 offset:0                      // load Bias
ds_read_b32 v87, v89 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v88, v3, v4, 0x1                    // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v88, v12, v88, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,17,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v93, v4, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v93, 0x2, v93                        // Bias address scaled by BPE
ds_read_b32 v90, v93 offset:0                      // load Bias
ds_read_b32 v91, v93 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v92, v3, v4, 0x1                    // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v92, v12, v92, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,17,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v95, v4, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v95, 0x2, v95                        // Bias address scaled by BPE (no LDS read follows for this element)
v_add_lshl_u32 v94, v3, v4, 0x1                    // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v94, v12, v94, s[82:83]              // LDD clip if OOB. offset
/* Batch #3 address setup for row vc1=18, elements vc0=0..7.
 * The row starts by stepping coord1 (v1) by one and advancing both row pointers
 * (v2 by StrideC1J, v3 by StrideD1J). Each element then:
 *   - forms coord0 (v0 itself for vc0=0, otherwise v4 = v0 + vc0),
 *   - builds the in-bounds lane mask in s[82:83] (coord0 < SizeI && coord1 < SizeJ),
 *   - computes the LDS byte address of its bias entry, (coord0 - wgp0*MT0) << 2,
 *   - computes the offset (v3 + coord0) << 1 and replaces it with v12 in lanes
 *     that fail the bounds test.
 * s78 is recomputed per element because the coord0 compare writes s[78:79].
 * NOTE(review): the LDS addresses (v97, v99, ... v111) are computed but no ds_read
 * is issued in this row; presumably previously loaded bias/scale registers are
 * reused - confirm. */
/* (d1,vc1,d0,vc0)=(0,18,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v97, v0, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v97, 0x2, v97                        // Bias address scaled by BPE
v_add_lshl_u32 v96, v3, v0, 0x1                    // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v96, v12, v96, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,18,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v99, v4, s78                             // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v99, 0x2, v99                        // Bias address scaled by BPE
v_add_lshl_u32 v98, v3, v4, 0x1                    // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v98, v12, v98, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,18,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v101, v4, s78                            // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v101, 0x2, v101                      // Bias address scaled by BPE
v_add_lshl_u32 v100, v3, v4, 0x1                   // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v100, v12, v100, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,18,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v103, v4, s78                            // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v103, 0x2, v103                      // Bias address scaled by BPE
v_add_lshl_u32 v102, v3, v4, 0x1                   // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v102, v12, v102, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,18,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v105, v4, s78                            // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v105, 0x2, v105                      // Bias address scaled by BPE
v_add_lshl_u32 v104, v3, v4, 0x1                   // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v104, v12, v104, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,18,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v107, v4, s78                            // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v107, 0x2, v107                      // Bias address scaled by BPE
v_add_lshl_u32 v106, v3, v4, 0x1                   // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v106, v12, v106, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,18,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v109, v4, s78                            // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v109, 0x2, v109                      // Bias address scaled by BPE
v_add_lshl_u32 v108, v3, v4, 0x1                   // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v108, v12, v108, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,18,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v111, v4, s78                            // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v111, 0x2, v111                      // Bias address scaled by BPE
v_add_lshl_u32 v110, v3, v4, 0x1                   // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v110, v12, v110, s[82:83]            // LDD clip if OOB. offset
/* Batch #3 address setup for row vc1=19, elements vc0=0..7.
 * The row starts by stepping coord1 (v1) by one and advancing both row pointers
 * (v2 by StrideC1J, v3 by StrideD1J). Each element then:
 *   - forms coord0 (v0 itself for vc0=0, otherwise v4 = v0 + vc0),
 *   - builds the in-bounds lane mask in s[82:83] (coord0 < SizeI && coord1 < SizeJ),
 *   - computes the LDS byte address of its bias entry, (coord0 - wgp0*MT0) << 2,
 *   - computes the offset (v3 + coord0) << 1 and replaces it with v12 in lanes
 *     that fail the bounds test.
 * s78 is recomputed per element because the coord0 compare writes s[78:79].
 * NOTE(review): the LDS addresses (v113, v115, ... v127) are computed but no ds_read
 * is issued in this row; presumably previously loaded bias/scale registers are
 * reused - confirm. */
/* (d1,vc1,d0,vc0)=(0,19,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v113, v0, s78                            // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v113, 0x2, v113                      // Bias address scaled by BPE
v_add_lshl_u32 v112, v3, v0, 0x1                   // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v112, v12, v112, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,19,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v115, v4, s78                            // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v115, 0x2, v115                      // Bias address scaled by BPE
v_add_lshl_u32 v114, v3, v4, 0x1                   // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v114, v12, v114, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,19,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v117, v4, s78                            // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v117, 0x2, v117                      // Bias address scaled by BPE
v_add_lshl_u32 v116, v3, v4, 0x1                   // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v116, v12, v116, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,19,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v119, v4, s78                            // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v119, 0x2, v119                      // Bias address scaled by BPE
v_add_lshl_u32 v118, v3, v4, 0x1                   // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v118, v12, v118, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,19,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v121, v4, s78                            // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v121, 0x2, v121                      // Bias address scaled by BPE
v_add_lshl_u32 v120, v3, v4, 0x1                   // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v120, v12, v120, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,19,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v123, v4, s78                            // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v123, 0x2, v123                      // Bias address scaled by BPE
v_add_lshl_u32 v122, v3, v4, 0x1                   // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v122, v12, v122, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,19,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v125, v4, s78                            // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v125, 0x2, v125                      // Bias address scaled by BPE
v_add_lshl_u32 v124, v3, v4, 0x1                   // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v124, v12, v124, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,19,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v127, v4, s78                            // coord0 relative to wgp0 * MT0
v_lshlrev_b32 v127, 0x2, v127                      // Bias address scaled by BPE
v_add_lshl_u32 v126, v3, v4, 0x1                   // scaleToBpe: (v3 + coord0) << 1, scaled to 2-byte elements
v_cndmask_b32 v126, v12, v126, s[82:83]            // LDD clip if OOB. offset
/*
 * Advance to output row vc1=20 and set up store addresses for columns vc0=0..7.
 * Row advance: v1 (row coordinate) += 1, v2 += StrideC1J, v3 += StrideD1J.
 * Then, per element:
 *   column    = v0 for vc0=0, otherwise v4 = v0 + vc0
 *   s[82:83]  = (column < SizeI) && (v1 < SizeJ)
 *   bias VGPR = (column - 256*WorkGroup0) << 2
 *   D VGPR    = in-range ? (v3 + column) << 1 : v12
 * Results land in v128/v129, v130/v131, then v135/v136 .. v145/v146
 * (the numbering skips v132-v134).
 * s78 is recomputed for every element because the v_cmp that writes s[78:79]
 * overwrites it.
 * NOTE(review): v12 is presumably an out-of-bounds offset used to suppress the
 * store for masked lanes -- confirm where v12 is initialised.
 */
/* (d1,vc1,d0,vc0)=(0,20,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v129, v0, s78
v_lshlrev_b32 v129, 0x2, v129                      // Bias address scaled by BPE
v_add_lshl_u32 v128, v3, v0, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v128, v12, v128, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,20,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v131, v4, s78
v_lshlrev_b32 v131, 0x2, v131                      // Bias address scaled by BPE
v_add_lshl_u32 v130, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v130, v12, v130, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,20,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v136, v4, s78
v_lshlrev_b32 v136, 0x2, v136                      // Bias address scaled by BPE
v_add_lshl_u32 v135, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v135, v12, v135, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,20,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v138, v4, s78
v_lshlrev_b32 v138, 0x2, v138                      // Bias address scaled by BPE
v_add_lshl_u32 v137, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v137, v12, v137, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,20,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v140, v4, s78
v_lshlrev_b32 v140, 0x2, v140                      // Bias address scaled by BPE
v_add_lshl_u32 v139, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v139, v12, v139, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,20,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v142, v4, s78
v_lshlrev_b32 v142, 0x2, v142                      // Bias address scaled by BPE
v_add_lshl_u32 v141, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v141, v12, v141, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,20,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v144, v4, s78
v_lshlrev_b32 v144, 0x2, v144                      // Bias address scaled by BPE
v_add_lshl_u32 v143, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v143, v12, v143, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,20,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v146, v4, s78
v_lshlrev_b32 v146, 0x2, v146                      // Bias address scaled by BPE
v_add_lshl_u32 v145, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v145, v12, v145, s[82:83]            // LDD clip if OOB. offset
/*
 * Advance to output row vc1=21 and set up store addresses for columns vc0=0..7.
 * Row advance: v1 (row coordinate) += 1, v2 += StrideC1J, v3 += StrideD1J.
 * Then, per element:
 *   column    = v0 for vc0=0, otherwise v4 = v0 + vc0
 *   s[82:83]  = (column < SizeI) && (v1 < SizeJ)
 *   bias VGPR = (column - 256*WorkGroup0) << 2          (v148, v150, .. v162)
 *   D VGPR    = in-range ? (v3 + column) << 1 : v12     (v147, v149, .. v161)
 * s78 is recomputed for every element because the v_cmp that writes s[78:79]
 * overwrites it.
 * NOTE(review): v12 is presumably an out-of-bounds offset used to suppress the
 * store for masked lanes -- confirm where v12 is initialised.
 */
/* (d1,vc1,d0,vc0)=(0,21,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v148, v0, s78
v_lshlrev_b32 v148, 0x2, v148                      // Bias address scaled by BPE
v_add_lshl_u32 v147, v3, v0, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v147, v12, v147, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,21,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v150, v4, s78
v_lshlrev_b32 v150, 0x2, v150                      // Bias address scaled by BPE
v_add_lshl_u32 v149, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v149, v12, v149, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,21,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v152, v4, s78
v_lshlrev_b32 v152, 0x2, v152                      // Bias address scaled by BPE
v_add_lshl_u32 v151, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v151, v12, v151, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,21,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v154, v4, s78
v_lshlrev_b32 v154, 0x2, v154                      // Bias address scaled by BPE
v_add_lshl_u32 v153, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v153, v12, v153, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,21,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v156, v4, s78
v_lshlrev_b32 v156, 0x2, v156                      // Bias address scaled by BPE
v_add_lshl_u32 v155, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v155, v12, v155, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,21,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v158, v4, s78
v_lshlrev_b32 v158, 0x2, v158                      // Bias address scaled by BPE
v_add_lshl_u32 v157, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v157, v12, v157, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,21,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v160, v4, s78
v_lshlrev_b32 v160, 0x2, v160                      // Bias address scaled by BPE
v_add_lshl_u32 v159, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v159, v12, v159, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,21,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v162, v4, s78
v_lshlrev_b32 v162, 0x2, v162                      // Bias address scaled by BPE
v_add_lshl_u32 v161, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v161, v12, v161, s[82:83]            // LDD clip if OOB. offset
/*
 * Advance to output row vc1=22 and set up store addresses for columns vc0=0..3.
 * These are the last four elements of this batch; the accumulator copies follow
 * immediately afterwards.
 * Row advance: v1 (row coordinate) += 1, v2 += StrideC1J, v3 += StrideD1J.
 * Then, per element:
 *   column    = v0 for vc0=0, otherwise v4 = v0 + vc0
 *   s[82:83]  = (column < SizeI) && (v1 < SizeJ)
 *   bias VGPR = (column - 256*WorkGroup0) << 2          (v164, v166, v168, v170)
 *   D VGPR    = in-range ? (v3 + column) << 1 : v12     (v163, v165, v167, v169)
 * s78 is recomputed for every element because the v_cmp that writes s[78:79]
 * overwrites it.
 * NOTE(review): v12 is presumably an out-of-bounds offset used to suppress the
 * store for masked lanes -- confirm where v12 is initialised.
 */
/* (d1,vc1,d0,vc0)=(0,22,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v164, v0, s78
v_lshlrev_b32 v164, 0x2, v164                      // Bias address scaled by BPE
v_add_lshl_u32 v163, v3, v0, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v163, v12, v163, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,22,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v166, v4, s78
v_lshlrev_b32 v166, 0x2, v166                      // Bias address scaled by BPE
v_add_lshl_u32 v165, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v165, v12, v165, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,22,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v168, v4, s78
v_lshlrev_b32 v168, 0x2, v168                      // Bias address scaled by BPE
v_add_lshl_u32 v167, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v167, v12, v167, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,22,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v170, v4, s78
v_lshlrev_b32 v170, 0x2, v170                      // Bias address scaled by BPE
v_add_lshl_u32 v169, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v169, v12, v169, s[82:83]            // LDD clip if OOB. offset
/*
 * Copy 45 accumulator values into VGPRs v[vgprValuC+17 .. vgprValuC+61].
 * Sources are acc30, acc34, .. acc206: every fourth accumulator register.
 * NOTE(review): the bracketed numbers in the per-line comments (135..179) do not
 * equal the destination offsets (17..61); presumably they are a generator-side
 * element index rather than a register number -- confirm.
 */
v_accvgpr_read_b32 v[vgprValuC+17], acc30          // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+18], acc34          // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+19], acc38          // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+20], acc42          // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+21], acc46          // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+22], acc50          // copy acc to vreg[140]
v_accvgpr_read_b32 v[vgprValuC+23], acc54          // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+24], acc58          // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+25], acc62          // copy acc to vreg[143]
v_accvgpr_read_b32 v[vgprValuC+26], acc66          // copy acc to vreg[144]
v_accvgpr_read_b32 v[vgprValuC+27], acc70          // copy acc to vreg[145]
v_accvgpr_read_b32 v[vgprValuC+28], acc74          // copy acc to vreg[146]
v_accvgpr_read_b32 v[vgprValuC+29], acc78          // copy acc to vreg[147]
v_accvgpr_read_b32 v[vgprValuC+30], acc82          // copy acc to vreg[148]
v_accvgpr_read_b32 v[vgprValuC+31], acc86          // copy acc to vreg[149]
v_accvgpr_read_b32 v[vgprValuC+32], acc90          // copy acc to vreg[150]
v_accvgpr_read_b32 v[vgprValuC+33], acc94          // copy acc to vreg[151]
v_accvgpr_read_b32 v[vgprValuC+34], acc98          // copy acc to vreg[152]
v_accvgpr_read_b32 v[vgprValuC+35], acc102         // copy acc to vreg[153]
v_accvgpr_read_b32 v[vgprValuC+36], acc106         // copy acc to vreg[154]
v_accvgpr_read_b32 v[vgprValuC+37], acc110         // copy acc to vreg[155]
v_accvgpr_read_b32 v[vgprValuC+38], acc114         // copy acc to vreg[156]
v_accvgpr_read_b32 v[vgprValuC+39], acc118         // copy acc to vreg[157]
v_accvgpr_read_b32 v[vgprValuC+40], acc122         // copy acc to vreg[158]
v_accvgpr_read_b32 v[vgprValuC+41], acc126         // copy acc to vreg[159]
v_accvgpr_read_b32 v[vgprValuC+42], acc130         // copy acc to vreg[160]
v_accvgpr_read_b32 v[vgprValuC+43], acc134         // copy acc to vreg[161]
v_accvgpr_read_b32 v[vgprValuC+44], acc138         // copy acc to vreg[162]
v_accvgpr_read_b32 v[vgprValuC+45], acc142         // copy acc to vreg[163]
v_accvgpr_read_b32 v[vgprValuC+46], acc146         // copy acc to vreg[164]
v_accvgpr_read_b32 v[vgprValuC+47], acc150         // copy acc to vreg[165]
v_accvgpr_read_b32 v[vgprValuC+48], acc154         // copy acc to vreg[166]
v_accvgpr_read_b32 v[vgprValuC+49], acc158         // copy acc to vreg[167]
v_accvgpr_read_b32 v[vgprValuC+50], acc162         // copy acc to vreg[168]
v_accvgpr_read_b32 v[vgprValuC+51], acc166         // copy acc to vreg[169]
v_accvgpr_read_b32 v[vgprValuC+52], acc170         // copy acc to vreg[170]
v_accvgpr_read_b32 v[vgprValuC+53], acc174         // copy acc to vreg[171]
v_accvgpr_read_b32 v[vgprValuC+54], acc178         // copy acc to vreg[172]
v_accvgpr_read_b32 v[vgprValuC+55], acc182         // copy acc to vreg[173]
v_accvgpr_read_b32 v[vgprValuC+56], acc186         // copy acc to vreg[174]
v_accvgpr_read_b32 v[vgprValuC+57], acc190         // copy acc to vreg[175]
v_accvgpr_read_b32 v[vgprValuC+58], acc194         // copy acc to vreg[176]
v_accvgpr_read_b32 v[vgprValuC+59], acc198         // copy acc to vreg[177]
v_accvgpr_read_b32 v[vgprValuC+60], acc202         // copy acc to vreg[178]
v_accvgpr_read_b32 v[vgprValuC+61], acc206         // copy acc to vreg[179]

/*
 * Multiply the 45 values in v[vgprValuC+17 .. vgprValuC+61] by s[sgprAlpha].
 * The first value (offset 17) is scaled with a scalar v_mul_f32; the remaining 44
 * are scaled two at a time with v_pk_mul_f32 on the register pairs starting at
 * offsets 18, 20, .. 60.
 * op_sel_hi:[0,1,1] makes the upper lane of src0 read the low dword of the SGPR
 * pair, so both packed lanes are multiplied by s[sgprAlpha]; s[sgprAlpha+1] is
 * named in the operand but not used as a multiplier.
 * The trailing s_waitcnt blocks until all outstanding LGKM-counted operations
 * have completed, before the scale and bias VGPRs are consumed by the code that
 * follows.
 */
/* rC *= alpha batchElements=[(0, 0, 16, 7), (0, 0, 17, 0), (0, 0, 17, 1), (0, 0, 17, 2), (0, 0, 17, 3), (0, 0, 17, 4), (0, 0, 17, 5), (0, 0, 17, 6), (0, 0, 17, 7), (0, 0, 18, 0), (0, 0, 18, 1), (0, 0, 18, 2), (0, 0, 18, 3), (0, 0, 18, 4), (0, 0, 18, 5), (0, 0, 18, 6), (0, 0, 18, 7), (0, 0, 19, 0), (0, 0, 19, 1), (0, 0, 19, 2), (0, 0, 19, 3), (0, 0, 19, 4), (0, 0, 19, 5), (0, 0, 19, 6), (0, 0, 19, 7), (0, 0, 20, 0), (0, 0, 20, 1), (0, 0, 20, 2), (0, 0, 20, 3), (0, 0, 20, 4), (0, 0, 20, 5), (0, 0, 20, 6), (0, 0, 20, 7), (0, 0, 21, 0), (0, 0, 21, 1), (0, 0, 21, 2), (0, 0, 21, 3), (0, 0, 21, 4), (0, 0, 21, 5), (0, 0, 21, 6), (0, 0, 21, 7), (0, 0, 22, 0), (0, 0, 22, 1), (0, 0, 22, 2), (0, 0, 22, 3)] */
v_mul_f32 v[vgprValuC+17], s[sgprAlpha], v[vgprValuC+17] // *= alpha
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec

/*
 * Finish and store the values at v[vgprValuC+17 .. vgprValuC+24].
 * v14/v15/v16 are loaded with bf16 packing/rounding constants first; none of the
 * instructions visible in this group reads them, because the conversion below
 * uses v_cvt_pk_bf16_f32.
 * NOTE(review): whether the routine reached via s_swappc reads v14-v16 cannot be
 * determined from here.
 * Each value then goes through the same six steps:
 *   1. value *= per-column scale VGPR (v63, v67, v71, .. v91)
 *   2. v4 = value + per-column bias VGPR (v62, v66, v70, .. v90)
 *   3. s_swappc_b64: save the return PC in s[66:67] and jump to the address in
 *      s[8:9]. NOTE(review): presumably an activation routine that takes and
 *      returns its operand in v4 -- confirm.
 *   4. copy v4 back into the value register
 *   5. convert to bf16 (both packed halves hold the same value)
 *   6. buffer_store_short: write 16 bits to SrdD at the per-element offset VGPR
 *      (v64, v68, .. v92) with the nt modifier
 * NOTE(review): step 4 names the register literally (v17, v18, ..) while steps 1,
 * 5 and 6 use v[vgprValuC+N]; these refer to the same register only if
 * vgprValuC == 0 -- confirm.
 */
/* apply mask, calc new C and issue writes */
v_mov_b32 v14, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v15, 0x7fff0000                          // fp32 Nan
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16
v_mul_f32 v[vgprValuC+17], v63, v[vgprValuC+17]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+17]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v17, v4
v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1
buffer_store_short v17, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+18], v67, v[vgprValuC+18]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+18]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v18, v4
v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1
buffer_store_short v18, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+19], v71, v[vgprValuC+19]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+19]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v19, v4
v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1
buffer_store_short v19, v72, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+20], v75, v[vgprValuC+20]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+20]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v20, v4
v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1
buffer_store_short v20, v76, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+21], v79, v[vgprValuC+21]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+21]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v21, v4
v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1
buffer_store_short v21, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+22], v83, v[vgprValuC+22]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+22]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v22, v4
v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1
buffer_store_short v22, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+23]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v23, v4
v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1
buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+24], v91, v[vgprValuC+24]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+24]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v24, v4
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1
buffer_store_short v24, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/*
 * Finish and store the values at v[vgprValuC+25 .. vgprValuC+32].
 * Each value goes through the same six steps:
 *   1. value *= scale VGPR;  2. v4 = value + bias VGPR;
 *   3. s_swappc_b64 to the address in s[8:9], return PC saved in s[66:67]
 *      (NOTE(review): presumably an activation routine operating on v4 -- confirm);
 *   4. copy v4 back;  5. convert to bf16;  6. 16-bit buffer store to SrdD.
 * The scale/bias pairs repeat with period 8 (v63/v62, v67/v66, .. v91/v90), while
 * the store-offset VGPR is unique per element (v94, v96, .. v108).
 * NOTE(review): the period-8 reuse is consistent with scale and bias depending
 * only on the column index vc0 -- confirm against the code that loads them.
 */
v_mul_f32 v[vgprValuC+25], v63, v[vgprValuC+25]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+25]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v25, v4
v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1
buffer_store_short v25, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+26], v67, v[vgprValuC+26]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+26]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v26, v4
v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1
buffer_store_short v26, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+27], v71, v[vgprValuC+27]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+27]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v27, v4
v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1
buffer_store_short v27, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+28], v75, v[vgprValuC+28]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+28]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v28, v4
v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1
buffer_store_short v28, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+29], v79, v[vgprValuC+29]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+29]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v29, v4
v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1
buffer_store_short v29, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+30], v83, v[vgprValuC+30]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+30]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v30, v4
v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1
buffer_store_short v30, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+31]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v31, v4
v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1
buffer_store_short v31, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+32], v91, v[vgprValuC+32]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+32]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v32, v4
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1
buffer_store_short v32, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/*
 * Finish and store the values at v[vgprValuC+33 .. vgprValuC+40].
 * Each value goes through the same six steps:
 *   1. value *= scale VGPR;  2. v4 = value + bias VGPR;
 *   3. s_swappc_b64 to the address in s[8:9], return PC saved in s[66:67]
 *      (NOTE(review): presumably an activation routine operating on v4 -- confirm);
 *   4. copy v4 back;  5. convert to bf16;  6. 16-bit buffer store to SrdD.
 * The scale/bias pairs repeat with period 8 (v63/v62, v67/v66, .. v91/v90), while
 * the store-offset VGPR is unique per element (v110, v112, .. v124).
 */
v_mul_f32 v[vgprValuC+33], v63, v[vgprValuC+33]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+33]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v33, v4
v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1
buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+34], v67, v[vgprValuC+34]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+34]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v34, v4
v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1
buffer_store_short v34, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+35], v71, v[vgprValuC+35]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+35]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v35, v4
v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1
buffer_store_short v35, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+36], v75, v[vgprValuC+36]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+36]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v36, v4
v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1
buffer_store_short v36, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+37], v79, v[vgprValuC+37]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+37]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v37, v4
v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1
buffer_store_short v37, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+38], v83, v[vgprValuC+38]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+38]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v38, v4
v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1
buffer_store_short v38, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+39]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v39, v4
v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1
buffer_store_short v39, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+40], v91, v[vgprValuC+40]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+40]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v40, v4
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1
buffer_store_short v40, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Store epilogue for ValuC 41..48. Per element: scale by the scaleAlphaVec register, add the */
/* bias register into v4, call the routine at s[8:9] (return PC in s[66:67]), copy v4 back, */
/* convert to bf16 and store 16 bits to D. The (bias, scale) register pairs used here are */
/* v62/v63, v66/v67, v70/v71, v74/v75, v78/v79, v82/v83, v86/v87, v90/v91. */
v_mul_f32 v[vgprValuC+41], v63, v[vgprValuC+41]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+41]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v41, v4
v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1
buffer_store_short v41, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+42], v67, v[vgprValuC+42]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+42]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v42, v4
v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1
buffer_store_short v42, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+43], v71, v[vgprValuC+43]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+43]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v43, v4
v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1
buffer_store_short v43, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+44], v75, v[vgprValuC+44]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+44]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v44, v4
v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1
buffer_store_short v44, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+45], v79, v[vgprValuC+45]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+45]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v45, v4
v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1
buffer_store_short v45, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+46], v83, v[vgprValuC+46]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+46]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v46, v4
v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1
buffer_store_short v46, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+47]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v47, v4
v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1
buffer_store_short v47, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+48], v91, v[vgprValuC+48]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+48]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v48, v4
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1
buffer_store_short v48, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Store epilogue for ValuC 49..56. Per element: scale by the scaleAlphaVec register, add the */
/* bias register into v4, call the routine at s[8:9] (return PC in s[66:67]), copy v4 back, */
/* convert to bf16 and store 16 bits to D. The same eight (bias, scale) register pairs */
/* v62/v63 .. v90/v91 are used again; only the D offset VGPR (v145..v159) changes. */
v_mul_f32 v[vgprValuC+49], v63, v[vgprValuC+49]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+49]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v49, v4
v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1
buffer_store_short v49, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+50], v67, v[vgprValuC+50]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+50]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v50, v4
v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1
buffer_store_short v50, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+51], v71, v[vgprValuC+51]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+51]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v51, v4
v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1
buffer_store_short v51, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+52], v75, v[vgprValuC+52]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+52]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v52, v4
v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1
buffer_store_short v52, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+53], v79, v[vgprValuC+53]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+53]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v53, v4
v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1
buffer_store_short v53, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+54], v83, v[vgprValuC+54]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+54]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v54, v4
v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1
buffer_store_short v54, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+55], v87, v[vgprValuC+55]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+55]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v55, v4
v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1
buffer_store_short v55, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+56], v91, v[vgprValuC+56]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+56]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v56, v4
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1
buffer_store_short v56, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* Store epilogue for ValuC 57..61, the last elements of this batch. Per element: scale by the */
/* scaleAlphaVec register, add the bias register into v4, call the routine at s[8:9] */
/* (return PC in s[66:67]), copy v4 back, convert to bf16 and store 16 bits to D. */
v_mul_f32 v[vgprValuC+57], v63, v[vgprValuC+57]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+57]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v57, v4
v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1
buffer_store_short v57, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+58], v67, v[vgprValuC+58]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+58]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v58, v4
v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1
buffer_store_short v58, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+59], v71, v[vgprValuC+59]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+59]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v59, v4
v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1
buffer_store_short v59, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+60], v75, v[vgprValuC+60]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+60]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v60, v4
v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1
buffer_store_short v60, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+61], v79, v[vgprValuC+61]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+61]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v61, v4
v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1
buffer_store_short v61, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* NOTE(review): the stores above are buffer_store_short, not dwordx4; the wording on the */
/* s_nop below looks like generic generator text -- confirm whether the wait state is needed. */
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #4 (d1,d0,vc1,vc0) = */
/*    (0,0,22,4:vw1); (0,0,22,5:vw1); (0,0,22,6:vw1); (0,0,22,7:vw1); (0,0,23,0:vw1); (0,0,23,1:vw1); (0,0,23,2:vw1); (0,0,23,3:vw1); (0,0,23,4:vw1); (0,0,23,5:vw1); (0,0,23,6:vw1); (0,0,23,7:vw1); (0,0,24,0:vw1); (0,0,24,1:vw1); (0,0,24,2:vw1); (0,0,24,3:vw1); (0,0,24,4:vw1); (0,0,24,5:vw1); (0,0,24,6:vw1); (0,0,24,7:vw1); (0,0,25,0:vw1); (0,0,25,1:vw1); (0,0,25,2:vw1); (0,0,25,3:vw1); (0,0,25,4:vw1); (0,0,25,5:vw1); (0,0,25,6:vw1); (0,0,25,7:vw1); (0,0,26,0:vw1); (0,0,26,1:vw1); (0,0,26,2:vw1); (0,0,26,3:vw1); (0,0,26,4:vw1); (0,0,26,5:vw1); (0,0,26,6:vw1); (0,0,26,7:vw1); (0,0,27,0:vw1); (0,0,27,1:vw1); (0,0,27,2:vw1); (0,0,27,3:vw1); (0,0,27,4:vw1); (0,0,27,5:vw1); (0,0,27,6:vw1); (0,0,27,7:vw1); (0,0,28,0:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Row vc1=22, columns vc0=4..7. For each element: */
/*   - v4 = coord0 (v0) + vc0 */
/*   - s[82:83] = (v4 < SizeI) && (v1 < SizeJ), the per-lane in-bounds mask */
/*   - s78 = 256 * WorkGroup0; this overwrites the low half of s[78:79], which s_and_b64 has already consumed */
/*   - LDS byte address = (v4 - s78) << 2; bias is read at offset 0 and scaleAlpha at offset 1024 */
/*   - D byte offset = (v3 + v4) << 1, replaced by v12 (BufferOOB) for lanes outside the mask */
/* NOTE(review): the generator comment on v_add_lshl_u32 says "Cin addr", but the base is v3, */
/* which the row-advance code in this file labels as the D row pointer (coutRowPtrD). */
v_mov_b32 v12, BufferOOB                           // offset substituted for out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,22,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v65, v4, s78                             // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v65, 0x2, v65                        // Bias address scaled by BPE
ds_read_b32 v62, v65 offset:0                      // load Bias
ds_read_b32 v63, v65 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v64, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v64, v12, v64, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,22,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v69, v4, s78                             // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v69, 0x2, v69                        // Bias address scaled by BPE
ds_read_b32 v66, v69 offset:0                      // load Bias
ds_read_b32 v67, v69 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v68, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v68, v12, v68, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,22,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v73, v4, s78                             // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v73, 0x2, v73                        // Bias address scaled by BPE
ds_read_b32 v70, v73 offset:0                      // load Bias
ds_read_b32 v71, v73 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v72, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v72, v12, v72, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,22,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v77, v4, s78                             // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v77, 0x2, v77                        // Bias address scaled by BPE
ds_read_b32 v74, v77 offset:0                      // load Bias
ds_read_b32 v75, v77 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v76, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v76, v12, v76, s[82:83]              // LDD clip if OOB. offset
/* Row vc1=23, columns vc0=0..3. The row starts by advancing coord1 (v1) by one and moving */
/* both row pointers (v2 for C, v3 for D) forward by their row strides. Each element then */
/* builds the in-bounds mask in s[82:83], reads bias (offset 0) and scaleAlpha (offset 1024) */
/* from LDS at (coord0 - 256*WorkGroup0) << 2, and forms the D byte offset (v3 + coord0) << 1, */
/* which v_cndmask_b32 replaces with v12 for lanes outside the mask. */
/* (d1,vc1,d0,vc0)=(0,23,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v81, v0, s78                             // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v81, 0x2, v81                        // Bias address scaled by BPE
ds_read_b32 v78, v81 offset:0                      // load Bias
ds_read_b32 v79, v81 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v80, v3, v0, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v80, v12, v80, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,23,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v85, v4, s78                             // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v85, 0x2, v85                        // Bias address scaled by BPE
ds_read_b32 v82, v85 offset:0                      // load Bias
ds_read_b32 v83, v85 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v84, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v84, v12, v84, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,23,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v89, v4, s78                             // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v89, 0x2, v89                        // Bias address scaled by BPE
ds_read_b32 v86, v89 offset:0                      // load Bias
ds_read_b32 v87, v89 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v88, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v88, v12, v88, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,23,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v93, v4, s78                             // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v93, 0x2, v93                        // Bias address scaled by BPE
ds_read_b32 v90, v93 offset:0                      // load Bias
ds_read_b32 v91, v93 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v92, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v92, v12, v92, s[82:83]              // LDD clip if OOB. offset
/* Row vc1=23, columns vc0=4..7. Each element builds the in-bounds mask in s[82:83] and the */
/* D byte offset (v3 + coord0) << 1, replaced by v12 for lanes outside the mask. */
/* NOTE(review): the LDS address VGPRs (v95, v97, v99, v101) are still computed, but no ds_read */
/* in this part of the file consumes them. Presumably the bias/scaleAlpha values already loaded */
/* for the same vc0 on an earlier row are reused, which would make these two instructions per */
/* element dead -- confirm against the store code for this batch. */
/* (d1,vc1,d0,vc0)=(0,23,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v95, v4, s78                             // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v95, 0x2, v95                        // Bias address scaled by BPE
v_add_lshl_u32 v94, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v94, v12, v94, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,23,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v97, v4, s78                             // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v97, 0x2, v97                        // Bias address scaled by BPE
v_add_lshl_u32 v96, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v96, v12, v96, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,23,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v99, v4, s78                             // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v99, 0x2, v99                        // Bias address scaled by BPE
v_add_lshl_u32 v98, v3, v4, 0x1                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v98, v12, v98, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,23,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v101, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v101, 0x2, v101                      // Bias address scaled by BPE
v_add_lshl_u32 v100, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v100, v12, v100, s[82:83]            // LDD clip if OOB. offset
/* Row vc1=24, columns vc0=0..7. The row starts by advancing coord1 (v1) by one and moving */
/* the C and D row pointers (v2, v3) forward by their row strides. Each element then builds the */
/* in-bounds mask in s[82:83] and the D byte offset (v3 + coord0) << 1 in an even-numbered VGPR */
/* (v102..v116), replaced by v12 for lanes outside the mask. */
/* NOTE(review): the odd-numbered VGPRs (v103..v117) receive an LDS address that no ds_read in */
/* this row consumes -- presumably dead; confirm against the store code for this batch. */
/* (d1,vc1,d0,vc0)=(0,24,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v103, v0, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v103, 0x2, v103                      // Bias address scaled by BPE
v_add_lshl_u32 v102, v3, v0, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v102, v12, v102, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,24,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v105, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v105, 0x2, v105                      // Bias address scaled by BPE
v_add_lshl_u32 v104, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v104, v12, v104, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,24,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v107, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v107, 0x2, v107                      // Bias address scaled by BPE
v_add_lshl_u32 v106, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v106, v12, v106, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,24,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v109, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v109, 0x2, v109                      // Bias address scaled by BPE
v_add_lshl_u32 v108, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v108, v12, v108, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,24,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v111, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v111, 0x2, v111                      // Bias address scaled by BPE
v_add_lshl_u32 v110, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v110, v12, v110, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,24,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v113, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v113, 0x2, v113                      // Bias address scaled by BPE
v_add_lshl_u32 v112, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v112, v12, v112, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,24,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v115, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v115, 0x2, v115                      // Bias address scaled by BPE
v_add_lshl_u32 v114, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v114, v12, v114, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,24,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v117, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v117, 0x2, v117                      // Bias address scaled by BPE
v_add_lshl_u32 v116, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v116, v12, v116, s[82:83]            // LDD clip if OOB. offset
/* Row vc1=25, columns vc0=0..7. The row starts by advancing coord1 (v1) by one and moving */
/* the C and D row pointers (v2, v3) forward by their row strides. Each element then builds the */
/* in-bounds mask in s[82:83] and the D byte offset (v3 + coord0) << 1, replaced by v12 for */
/* lanes outside the mask. D offsets land in v118..v130 (even) and v135; note that the register */
/* numbering jumps from v130/v131 to v135/v136 for the last element. */
/* NOTE(review): the LDS address VGPRs (v119..v131 odd, v136) are not consumed by any ds_read in */
/* this row -- presumably dead; confirm against the store code for this batch. */
/* (d1,vc1,d0,vc0)=(0,25,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v119, v0, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v119, 0x2, v119                      // Bias address scaled by BPE
v_add_lshl_u32 v118, v3, v0, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v118, v12, v118, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,25,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v121, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v121, 0x2, v121                      // Bias address scaled by BPE
v_add_lshl_u32 v120, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v120, v12, v120, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,25,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v123, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v123, 0x2, v123                      // Bias address scaled by BPE
v_add_lshl_u32 v122, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v122, v12, v122, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,25,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v125, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v125, 0x2, v125                      // Bias address scaled by BPE
v_add_lshl_u32 v124, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v124, v12, v124, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,25,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v127, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v127, 0x2, v127                      // Bias address scaled by BPE
v_add_lshl_u32 v126, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v126, v12, v126, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,25,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v129, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v129, 0x2, v129                      // Bias address scaled by BPE
v_add_lshl_u32 v128, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v128, v12, v128, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,25,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v131, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v131, 0x2, v131                      // Bias address scaled by BPE
v_add_lshl_u32 v130, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v130, v12, v130, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,25,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v136, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v136, 0x2, v136                      // Bias address scaled by BPE
v_add_lshl_u32 v135, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v135, v12, v135, s[82:83]            // LDD clip if OOB. offset
/* Row vc1=26, columns vc0=0..6. The row starts by advancing coord1 (v1) by one and moving */
/* the C and D row pointers (v2, v3) forward by their row strides. Each element then builds the */
/* in-bounds mask in s[82:83] and the D byte offset (v3 + coord0) << 1 in an odd-numbered VGPR */
/* (v137..v149), replaced by v12 for lanes outside the mask. */
/* NOTE(review): the even-numbered VGPRs (v138..v150) receive an LDS address that no ds_read in */
/* this row consumes -- presumably dead; confirm against the store code for this batch. */
/* (d1,vc1,d0,vc0)=(0,26,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v138, v0, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v138, 0x2, v138                      // Bias address scaled by BPE
v_add_lshl_u32 v137, v3, v0, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v137, v12, v137, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,26,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v140, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v140, 0x2, v140                      // Bias address scaled by BPE
v_add_lshl_u32 v139, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v139, v12, v139, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,26,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v142, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v142, 0x2, v142                      // Bias address scaled by BPE
v_add_lshl_u32 v141, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v141, v12, v141, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,26,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v144, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v144, 0x2, v144                      // Bias address scaled by BPE
v_add_lshl_u32 v143, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v143, v12, v143, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,26,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v146, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v146, 0x2, v146                      // Bias address scaled by BPE
v_add_lshl_u32 v145, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v145, v12, v145, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,26,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v148, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v148, 0x2, v148                      // Bias address scaled by BPE
v_add_lshl_u32 v147, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v147, v12, v147, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,26,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v150, v4, s78                            // coord0 relative to the tile start (wgp0*MT0)
v_lshlrev_b32 v150, 0x2, v150                      // Bias address scaled by BPE
v_add_lshl_u32 v149, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v149, v12, v149, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,26,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v152, v4, s78
v_lshlrev_b32 v152, 0x2, v152                      // Bias address scaled by BPE
v_add_lshl_u32 v151, v3, v4, 0x1                   // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v151, v12, v151, s[82:83]            // LDD clip if OOB. offset
/*
 * Edge-batch address setup for output row vc1=27, elements vc0=0..7.
 * Row change: coord1 (v1) is incremented and both row pointers are advanced by one
 * row stride (v2 by StrideC1J, v3 by StrideD1J).
 * Then, per element (vc0=0 uses v0 directly, the others go through v4 = v0 + vc0):
 *   s[82:83]  = (coord0 < SizeI) && (v1 < SizeJ)    per-lane in-bounds mask
 *   s78       = 256 * WorkGroup0                    scratch; written only after s[78:79]
 *                                                   has been folded into s[82:83]
 *   v(even)   = (coord0 - s78) << 2                 tile-local coord0 times 4 (bias offset)
 *   v(odd)    = (v3 + coord0) << 1                  byte offset of the element in D
 *   v(odd)    = in-bounds ? v(odd) : v12            out-of-bounds lanes take v12 instead
 * The v(odd) registers (v153..v167) are the offsets consumed by the
 * buffer_store_short instructions later in this batch.
 * v12 presumably holds BufferOOB, as it does at the top of the next batch; the load
 * for this batch lies outside this excerpt -- confirm.
 * NOTE(review): the bias offsets written here (v154, v156, ..., v168) are not read by
 * any instruction in the visible remainder of this batch. Confirm whether they are dead.
 */
/* (d1,vc1,d0,vc0)=(0,27,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v154, v0, s78                            // coord0 - wgp0*MT0 (tile-local coord0)
v_lshlrev_b32 v154, 0x2, v154                      // Bias address scaled by BPE
v_add_lshl_u32 v153, v3, v0, 0x1                   // (coutRowPtrD + coord0) << 1: D byte offset
v_cndmask_b32 v153, v12, v153, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,27,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v156, v4, s78                            // coord0 - wgp0*MT0 (tile-local coord0)
v_lshlrev_b32 v156, 0x2, v156                      // Bias address scaled by BPE
v_add_lshl_u32 v155, v3, v4, 0x1                   // (coutRowPtrD + coord0) << 1: D byte offset
v_cndmask_b32 v155, v12, v155, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,27,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v158, v4, s78                            // coord0 - wgp0*MT0 (tile-local coord0)
v_lshlrev_b32 v158, 0x2, v158                      // Bias address scaled by BPE
v_add_lshl_u32 v157, v3, v4, 0x1                   // (coutRowPtrD + coord0) << 1: D byte offset
v_cndmask_b32 v157, v12, v157, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,27,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v160, v4, s78                            // coord0 - wgp0*MT0 (tile-local coord0)
v_lshlrev_b32 v160, 0x2, v160                      // Bias address scaled by BPE
v_add_lshl_u32 v159, v3, v4, 0x1                   // (coutRowPtrD + coord0) << 1: D byte offset
v_cndmask_b32 v159, v12, v159, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,27,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v162, v4, s78                            // coord0 - wgp0*MT0 (tile-local coord0)
v_lshlrev_b32 v162, 0x2, v162                      // Bias address scaled by BPE
v_add_lshl_u32 v161, v3, v4, 0x1                   // (coutRowPtrD + coord0) << 1: D byte offset
v_cndmask_b32 v161, v12, v161, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,27,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v164, v4, s78                            // coord0 - wgp0*MT0 (tile-local coord0)
v_lshlrev_b32 v164, 0x2, v164                      // Bias address scaled by BPE
v_add_lshl_u32 v163, v3, v4, 0x1                   // (coutRowPtrD + coord0) << 1: D byte offset
v_cndmask_b32 v163, v12, v163, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,27,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v166, v4, s78                            // coord0 - wgp0*MT0 (tile-local coord0)
v_lshlrev_b32 v166, 0x2, v166                      // Bias address scaled by BPE
v_add_lshl_u32 v165, v3, v4, 0x1                   // (coutRowPtrD + coord0) << 1: D byte offset
v_cndmask_b32 v165, v12, v165, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,27,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v168, v4, s78                            // coord0 - wgp0*MT0 (tile-local coord0)
v_lshlrev_b32 v168, 0x2, v168                      // Bias address scaled by BPE
v_add_lshl_u32 v167, v3, v4, 0x1                   // (coutRowPtrD + coord0) << 1: D byte offset
v_cndmask_b32 v167, v12, v167, s[82:83]            // LDD clip if OOB. offset
/*
 * Edge-batch address setup for the last element of this batch, (vc1=28, vc0=0).
 * coord1 (v1) is incremented and both row pointers are advanced by one row stride.
 * The element then gets its in-bounds mask in s[82:83], a tile-local bias offset in
 * v170 and its D byte offset in v169; lanes that fail the bounds test get v12
 * instead of the offset. vc0=0, so coord0 is v0 itself and no v4 temporary is needed.
 * v169 is the offset used by the final buffer_store_short of this batch.
 * NOTE(review): v170 is not read by any instruction in the visible remainder of this
 * batch. Confirm whether it is dead.
 */
/* (d1,vc1,d0,vc0)=(0,28,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0 (s[78:79] already consumed above)
v_sub_u32 v170, v0, s78                            // coord0 - wgp0*MT0 (tile-local coord0)
v_lshlrev_b32 v170, 0x2, v170                      // Bias address scaled by BPE
v_add_lshl_u32 v169, v3, v0, 0x1                   // (coutRowPtrD + coord0) << 1: D byte offset
v_cndmask_b32 v169, v12, v169, s[82:83]            // LDD clip if OOB. offset
/*
 * Copy the 45 accumulator values of this batch from AccVGPRs into
 * v[vgprValuC+17 .. vgprValuC+61], in batch-element order.
 * The bracketed number in each trailing comment (180..224) is not a VGPR number.
 * It matches the linear output-element index e = vc1*8 + vc0 of the batch list:
 * (22,4) -> 180, ..., (28,0) -> 224.
 * Every source register in this block satisfies acc = 4*(e mod 64) + e/64, which is
 * why the sequence steps by 4 and wraps from acc254 to acc3.
 */
v_accvgpr_read_b32 v[vgprValuC+17], acc210         // copy acc to vreg[180]
v_accvgpr_read_b32 v[vgprValuC+18], acc214         // copy acc to vreg[181]
v_accvgpr_read_b32 v[vgprValuC+19], acc218         // copy acc to vreg[182]
v_accvgpr_read_b32 v[vgprValuC+20], acc222         // copy acc to vreg[183]
v_accvgpr_read_b32 v[vgprValuC+21], acc226         // copy acc to vreg[184]
v_accvgpr_read_b32 v[vgprValuC+22], acc230         // copy acc to vreg[185]
v_accvgpr_read_b32 v[vgprValuC+23], acc234         // copy acc to vreg[186]
v_accvgpr_read_b32 v[vgprValuC+24], acc238         // copy acc to vreg[187]
v_accvgpr_read_b32 v[vgprValuC+25], acc242         // copy acc to vreg[188]
v_accvgpr_read_b32 v[vgprValuC+26], acc246         // copy acc to vreg[189]
v_accvgpr_read_b32 v[vgprValuC+27], acc250         // copy acc to vreg[190]
v_accvgpr_read_b32 v[vgprValuC+28], acc254         // copy acc to vreg[191]
v_accvgpr_read_b32 v[vgprValuC+29], acc3           // copy acc to vreg[192]
v_accvgpr_read_b32 v[vgprValuC+30], acc7           // copy acc to vreg[193]
v_accvgpr_read_b32 v[vgprValuC+31], acc11          // copy acc to vreg[194]
v_accvgpr_read_b32 v[vgprValuC+32], acc15          // copy acc to vreg[195]
v_accvgpr_read_b32 v[vgprValuC+33], acc19          // copy acc to vreg[196]
v_accvgpr_read_b32 v[vgprValuC+34], acc23          // copy acc to vreg[197]
v_accvgpr_read_b32 v[vgprValuC+35], acc27          // copy acc to vreg[198]
v_accvgpr_read_b32 v[vgprValuC+36], acc31          // copy acc to vreg[199]
v_accvgpr_read_b32 v[vgprValuC+37], acc35          // copy acc to vreg[200]
v_accvgpr_read_b32 v[vgprValuC+38], acc39          // copy acc to vreg[201]
v_accvgpr_read_b32 v[vgprValuC+39], acc43          // copy acc to vreg[202]
v_accvgpr_read_b32 v[vgprValuC+40], acc47          // copy acc to vreg[203]
v_accvgpr_read_b32 v[vgprValuC+41], acc51          // copy acc to vreg[204]
v_accvgpr_read_b32 v[vgprValuC+42], acc55          // copy acc to vreg[205]
v_accvgpr_read_b32 v[vgprValuC+43], acc59          // copy acc to vreg[206]
v_accvgpr_read_b32 v[vgprValuC+44], acc63          // copy acc to vreg[207]
v_accvgpr_read_b32 v[vgprValuC+45], acc67          // copy acc to vreg[208]
v_accvgpr_read_b32 v[vgprValuC+46], acc71          // copy acc to vreg[209]
v_accvgpr_read_b32 v[vgprValuC+47], acc75          // copy acc to vreg[210]
v_accvgpr_read_b32 v[vgprValuC+48], acc79          // copy acc to vreg[211]
v_accvgpr_read_b32 v[vgprValuC+49], acc83          // copy acc to vreg[212]
v_accvgpr_read_b32 v[vgprValuC+50], acc87          // copy acc to vreg[213]
v_accvgpr_read_b32 v[vgprValuC+51], acc91          // copy acc to vreg[214]
v_accvgpr_read_b32 v[vgprValuC+52], acc95          // copy acc to vreg[215]
v_accvgpr_read_b32 v[vgprValuC+53], acc99          // copy acc to vreg[216]
v_accvgpr_read_b32 v[vgprValuC+54], acc103         // copy acc to vreg[217]
v_accvgpr_read_b32 v[vgprValuC+55], acc107         // copy acc to vreg[218]
v_accvgpr_read_b32 v[vgprValuC+56], acc111         // copy acc to vreg[219]
v_accvgpr_read_b32 v[vgprValuC+57], acc115         // copy acc to vreg[220]
v_accvgpr_read_b32 v[vgprValuC+58], acc119         // copy acc to vreg[221]
v_accvgpr_read_b32 v[vgprValuC+59], acc123         // copy acc to vreg[222]
v_accvgpr_read_b32 v[vgprValuC+60], acc127         // copy acc to vreg[223]
v_accvgpr_read_b32 v[vgprValuC+61], acc131         // copy acc to vreg[224]

/*
 * Scale all 45 values of this batch (v[vgprValuC+17 .. vgprValuC+61]) by alpha.
 * The first value is scaled with a scalar v_mul_f32; the remaining 44 are scaled two
 * at a time with v_pk_mul_f32 on the pairs (+18,+19) .. (+60,+61).
 * op_sel_hi:[0,1,1] makes the high half of each packed multiply take the LOW dword of
 * src0, so s[sgprAlpha] multiplies both values and s[sgprAlpha+1] does not contribute.
 * NOTE(review): the leading scalar multiply is presumably there because the batch
 * starts at an odd register offset (+17) and the packed form wants aligned register
 * pairs -- confirm against the generator.
 */
/* rC *= alpha batchElements=[(0, 0, 22, 4), (0, 0, 22, 5), (0, 0, 22, 6), (0, 0, 22, 7), (0, 0, 23, 0), (0, 0, 23, 1), (0, 0, 23, 2), (0, 0, 23, 3), (0, 0, 23, 4), (0, 0, 23, 5), (0, 0, 23, 6), (0, 0, 23, 7), (0, 0, 24, 0), (0, 0, 24, 1), (0, 0, 24, 2), (0, 0, 24, 3), (0, 0, 24, 4), (0, 0, 24, 5), (0, 0, 24, 6), (0, 0, 24, 7), (0, 0, 25, 0), (0, 0, 25, 1), (0, 0, 25, 2), (0, 0, 25, 3), (0, 0, 25, 4), (0, 0, 25, 5), (0, 0, 25, 6), (0, 0, 25, 7), (0, 0, 26, 0), (0, 0, 26, 1), (0, 0, 26, 2), (0, 0, 26, 3), (0, 0, 26, 4), (0, 0, 26, 5), (0, 0, 26, 6), (0, 0, 26, 7), (0, 0, 27, 0), (0, 0, 27, 1), (0, 0, 27, 2), (0, 0, 27, 3), (0, 0, 27, 4), (0, 0, 27, 5), (0, 0, 27, 6), (0, 0, 27, 7), (0, 0, 28, 0)] */
v_mul_f32 v[vgprValuC+17], s[sgprAlpha], v[vgprValuC+17] // *= alpha
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
/*
 * Per-element epilogue and store for the 45 elements of this batch.
 * After the lgkmcnt(0) wait, every element runs the same six instructions:
 *   C   *= scale                 scale register cycles v63, v67, ..., v91
 *   v4   = bias + C              bias register cycles  v62, v66, ..., v90
 *   s_swappc_b64                 jump to the address in s[8:9]; return PC -> s[66:67]
 *   vN   = v4                    copy v4 back into the value register
 *   vN   = bf16(vN)              v_cvt_pk_bf16_f32 with both sources equal
 *   buffer_store_short           16-bit store to D at the element's clipped offset
 * The eight (bias, scale) register pairs repeat every 8 elements, i.e. one pair per
 * vc0 value. They are loaded outside this excerpt.
 * NOTE(review): the routine behind s[8:9] is not visible here; it is presumably the
 * activation function, taking and returning its value in v4 -- confirm.
 * NOTE(review): the copy-back writes literal vN while the neighbouring instructions
 * use v[vgprValuC+N]; the two agree only if vgprValuC == 0 -- confirm.
 * NOTE(review): v14/v15/v16 are initialised below but not read by any visible
 * instruction in this batch (the conversion uses v_cvt_pk_bf16_f32). They may be
 * consumed by the called routine or be left over from a software bf16 path -- confirm.
 */
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec

/* apply mask, calc new C and issue writes */
v_mov_b32 v14, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v15, 0x7fff0000                          // fp32 Nan
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16
v_mul_f32 v[vgprValuC+17], v63, v[vgprValuC+17]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+17]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v17, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1
buffer_store_short v17, v64, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+18], v67, v[vgprValuC+18]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+18]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v18, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1
buffer_store_short v18, v68, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+19], v71, v[vgprValuC+19]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+19]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v19, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1
buffer_store_short v19, v72, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+20], v75, v[vgprValuC+20]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+20]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v20, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1
buffer_store_short v20, v76, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+21], v79, v[vgprValuC+21]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+21]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v21, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1
buffer_store_short v21, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+22], v83, v[vgprValuC+22]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+22]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v22, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1
buffer_store_short v22, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+23], v87, v[vgprValuC+23]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+23]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v23, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1
buffer_store_short v23, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+24], v91, v[vgprValuC+24]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+24]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v24, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1
buffer_store_short v24, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+25], v63, v[vgprValuC+25]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+25]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v25, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1
buffer_store_short v25, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+26], v67, v[vgprValuC+26]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+26]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v26, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1
buffer_store_short v26, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+27], v71, v[vgprValuC+27]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+27]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v27, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1
buffer_store_short v27, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+28], v75, v[vgprValuC+28]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+28]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v28, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1
buffer_store_short v28, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+29], v79, v[vgprValuC+29]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+29]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v29, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1
buffer_store_short v29, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+30], v83, v[vgprValuC+30]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+30]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v30, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1
buffer_store_short v30, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+31], v87, v[vgprValuC+31]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+31]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v31, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1
buffer_store_short v31, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+32], v91, v[vgprValuC+32]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+32]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v32, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1
buffer_store_short v32, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+33], v63, v[vgprValuC+33]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+33]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v33, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1
buffer_store_short v33, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+34], v67, v[vgprValuC+34]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+34]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v34, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1
buffer_store_short v34, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+35], v71, v[vgprValuC+35]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+35]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v35, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1
buffer_store_short v35, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+36], v75, v[vgprValuC+36]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+36]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v36, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1
buffer_store_short v36, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+37], v79, v[vgprValuC+37]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+37]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v37, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1
buffer_store_short v37, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+38], v83, v[vgprValuC+38]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+38]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v38, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1
buffer_store_short v38, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+39], v87, v[vgprValuC+39]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+39]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v39, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1
buffer_store_short v39, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+40], v91, v[vgprValuC+40]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+40]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v40, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1
buffer_store_short v40, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+41], v63, v[vgprValuC+41]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+41]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v41, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1
buffer_store_short v41, v126, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+42], v67, v[vgprValuC+42]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+42]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v42, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1
buffer_store_short v42, v128, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+43], v71, v[vgprValuC+43]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+43]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v43, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1
buffer_store_short v43, v130, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+44], v75, v[vgprValuC+44]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+44]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v44, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1
buffer_store_short v44, v135, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+45], v79, v[vgprValuC+45]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+45]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v45, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1
buffer_store_short v45, v137, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+46], v83, v[vgprValuC+46]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+46]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v46, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1
buffer_store_short v46, v139, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+47], v87, v[vgprValuC+47]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+47]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v47, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1
buffer_store_short v47, v141, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+48], v91, v[vgprValuC+48]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+48]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v48, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+48] // convert C to bf16 in gwvw==1
buffer_store_short v48, v143, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+49], v63, v[vgprValuC+49]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+49]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v49, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v49, v[vgprValuC+49], v[vgprValuC+49] // convert C to bf16 in gwvw==1
buffer_store_short v49, v145, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+50], v67, v[vgprValuC+50]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+50]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v50, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v50, v[vgprValuC+50], v[vgprValuC+50] // convert C to bf16 in gwvw==1
buffer_store_short v50, v147, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+51], v71, v[vgprValuC+51]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+51]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v51, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v51, v[vgprValuC+51], v[vgprValuC+51] // convert C to bf16 in gwvw==1
buffer_store_short v51, v149, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+52], v75, v[vgprValuC+52]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+52]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v52, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v52, v[vgprValuC+52], v[vgprValuC+52] // convert C to bf16 in gwvw==1
buffer_store_short v52, v151, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+53], v79, v[vgprValuC+53]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+53]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v53, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v53, v[vgprValuC+53], v[vgprValuC+53] // convert C to bf16 in gwvw==1
buffer_store_short v53, v153, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+54], v83, v[vgprValuC+54]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v82, v[vgprValuC+54]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v54, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v54, v[vgprValuC+54], v[vgprValuC+54] // convert C to bf16 in gwvw==1
buffer_store_short v54, v155, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+55], v87, v[vgprValuC+55]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v86, v[vgprValuC+55]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v55, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v55, v[vgprValuC+55], v[vgprValuC+55] // convert C to bf16 in gwvw==1
buffer_store_short v55, v157, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+56], v91, v[vgprValuC+56]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v90, v[vgprValuC+56]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v56, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+56] // convert C to bf16 in gwvw==1
buffer_store_short v56, v159, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+57], v63, v[vgprValuC+57]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v62, v[vgprValuC+57]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v57, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v57, v[vgprValuC+57], v[vgprValuC+57] // convert C to bf16 in gwvw==1
buffer_store_short v57, v161, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+58], v67, v[vgprValuC+58]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v66, v[vgprValuC+58]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v58, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v58, v[vgprValuC+58], v[vgprValuC+58] // convert C to bf16 in gwvw==1
buffer_store_short v58, v163, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+59], v71, v[vgprValuC+59]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v70, v[vgprValuC+59]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v59, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v59, v[vgprValuC+59], v[vgprValuC+59] // convert C to bf16 in gwvw==1
buffer_store_short v59, v165, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+60], v75, v[vgprValuC+60]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v74, v[vgprValuC+60]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v60, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v60, v[vgprValuC+60], v[vgprValuC+60] // convert C to bf16 in gwvw==1
buffer_store_short v60, v167, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+61], v79, v[vgprValuC+61]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v78, v[vgprValuC+61]                 // v4 = C + bias
s_swappc_b64 s[66:67], s[8:9]                      // call via s[8:9]; return PC saved in s[66:67]
v_mov_b32 v61, v4                                  // C = v4 after the call
v_cvt_pk_bf16_f32 v61, v[vgprValuC+61], v[vgprValuC+61] // convert C to bf16 in gwvw==1
buffer_store_short v61, v169, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
/* NOTE(review): the generated text below mentions a dwordx4 store, but every store in this batch is buffer_store_short. */
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #5 (d1,d0,vc1,vc0) = */
/*    (0,0,28,1:vw1); (0,0,28,2:vw1); (0,0,28,3:vw1); (0,0,28,4:vw1); (0,0,28,5:vw1); (0,0,28,6:vw1); (0,0,28,7:vw1); (0,0,29,0:vw1); (0,0,29,1:vw1); (0,0,29,2:vw1); (0,0,29,3:vw1); (0,0,29,4:vw1); (0,0,29,5:vw1); (0,0,29,6:vw1); (0,0,29,7:vw1); (0,0,30,0:vw1); (0,0,30,1:vw1); (0,0,30,2:vw1); (0,0,30,3:vw1); (0,0,30,4:vw1); (0,0,30,5:vw1); (0,0,30,6:vw1); (0,0,30,7:vw1); (0,0,31,0:vw1); (0,0,31,1:vw1); (0,0,31,2:vw1); (0,0,31,3:vw1); (0,0,31,4:vw1); (0,0,31,5:vw1); (0,0,31,6:vw1); (0,0,31,7:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Address setup for row vc1=28, columns vc0=1..7. Per element:
 *   v4       = coord0 = v0 + vc0
 *   s[82:83] = lane mask: (coord0 < SizeI) && (coord1 < SizeJ), coord1 held in v1
 *   LDS reg  = (coord0 - WorkGroup0*256) << 2, byte offset into LDS
 *   two ds_read_b32: Bias at offset 0, scaleAlpha at offset 1024
 *   addr reg = (v3 + coord0) << 1, swapped for v12 (BufferOOB) on lanes where the mask is 0
 * s78 is recomputed for every element because the first v_cmp overwrites s[78:79].
 */
v_mov_b32 v12, BufferOOB                           // offset substituted for masked-off lanes by the v_cndmask below
/* (d1,vc1,d0,vc0)=(0,28,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v51, v4, s78                             // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v51, 0x2, v51                        // Bias address scaled by BPE
ds_read_b32 v48, v51 offset:0                      // load Bias
ds_read_b32 v49, v51 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v50, v3, v4, 0x1                    // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v50, v12, v50, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,28,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v55, v4, s78                             // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v55, 0x2, v55                        // Bias address scaled by BPE
ds_read_b32 v52, v55 offset:0                      // load Bias
ds_read_b32 v53, v55 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v54, v3, v4, 0x1                    // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v54, v12, v54, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,28,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v59, v4, s78                             // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v59, 0x2, v59                        // Bias address scaled by BPE
ds_read_b32 v56, v59 offset:0                      // load Bias
ds_read_b32 v57, v59 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v58, v3, v4, 0x1                    // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v58, v12, v58, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,28,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v63, v4, s78                             // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v63, 0x2, v63                        // Bias address scaled by BPE
ds_read_b32 v60, v63 offset:0                      // load Bias
ds_read_b32 v61, v63 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v62, v3, v4, 0x1                    // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v62, v12, v62, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,28,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v67, v4, s78                             // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v67, 0x2, v67                        // Bias address scaled by BPE
ds_read_b32 v64, v67 offset:0                      // load Bias
ds_read_b32 v65, v67 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v66, v3, v4, 0x1                    // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v66, v12, v66, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,28,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v71, v4, s78                             // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v71, 0x2, v71                        // Bias address scaled by BPE
ds_read_b32 v68, v71 offset:0                      // load Bias
ds_read_b32 v69, v71 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v70, v3, v4, 0x1                    // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v70, v12, v70, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,28,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v75, v4, s78                             // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v75, 0x2, v75                        // Bias address scaled by BPE
ds_read_b32 v72, v75 offset:0                      // load Bias
ds_read_b32 v73, v75 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v74, v3, v4, 0x1                    // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v74, v12, v74, s[82:83]              // LDD clip if OOB. offset
/* Address setup for row vc1=29, columns vc0=0..7.
 * The row starts by advancing coord1 (v1) by one and stepping both row pointers (v2 for C, v3 for D) by their strides.
 * Only vc0=0 issues Bias/scaleAlpha LDS reads (into v76/v77).
 * For vc0=1..7 the LDS offset is still computed, but no ds_read consumes it in the visible code;
 * the apply stage reuses the registers loaded for the same coord0 in row 28 (e.g. v48/v49 for vc0=1).
 * Each element still gets its own masked D byte offset (v78, v80, ..., v92).
 */
/* (d1,vc1,d0,vc0)=(0,29,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v79, v0, s78                             // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v79, 0x2, v79                        // Bias address scaled by BPE
ds_read_b32 v76, v79 offset:0                      // load Bias
ds_read_b32 v77, v79 offset:1024                   // load scaleAlpha
v_add_lshl_u32 v78, v3, v0, 0x1                    // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v78, v12, v78, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v81, v4, s78                             // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v81, 0x2, v81                        // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v80, v3, v4, 0x1                    // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v80, v12, v80, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v83, v4, s78                             // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v83, 0x2, v83                        // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v82, v3, v4, 0x1                    // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v82, v12, v82, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v85, v4, s78                             // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v85, 0x2, v85                        // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v84, v3, v4, 0x1                    // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v84, v12, v84, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v87, v4, s78                             // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v87, 0x2, v87                        // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v86, v3, v4, 0x1                    // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v86, v12, v86, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v89, v4, s78                             // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v89, 0x2, v89                        // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v88, v3, v4, 0x1                    // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v88, v12, v88, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v91, v4, s78                             // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v91, 0x2, v91                        // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v90, v3, v4, 0x1                    // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v90, v12, v90, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v93, v4, s78                             // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v93, 0x2, v93                        // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v92, v3, v4, 0x1                    // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v92, v12, v92, s[82:83]              // LDD clip if OOB. offset
/* Address setup for row vc1=30, columns vc0=0..7.
 * coord1 (v1) and the C/D row pointers (v2/v3) advance by one row first.
 * No LDS reads are issued in this row; the LDS offsets (v95, v97, ..., v109) are computed but not consumed in the visible code.
 * NOTE(review): presumably the apply stage reuses the Bias/scaleAlpha registers already loaded for the same coord0 - confirm in the code past this view.
 * Each element produces a masked D byte offset in v94, v96, ..., v108.
 */
/* (d1,vc1,d0,vc0)=(0,30,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v95, v0, s78                             // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v95, 0x2, v95                        // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v94, v3, v0, 0x1                    // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v94, v12, v94, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,30,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v97, v4, s78                             // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v97, 0x2, v97                        // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v96, v3, v4, 0x1                    // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v96, v12, v96, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,30,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v99, v4, s78                             // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v99, 0x2, v99                        // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v98, v3, v4, 0x1                    // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v98, v12, v98, s[82:83]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,30,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v101, v4, s78                            // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v101, 0x2, v101                      // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v100, v3, v4, 0x1                   // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v100, v12, v100, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,30,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v103, v4, s78                            // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v103, 0x2, v103                      // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v102, v3, v4, 0x1                   // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v102, v12, v102, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,30,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v105, v4, s78                            // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v105, 0x2, v105                      // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v104, v3, v4, 0x1                   // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v104, v12, v104, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,30,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v107, v4, s78                            // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v107, 0x2, v107                      // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v106, v3, v4, 0x1                   // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v106, v12, v106, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,30,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v109, v4, s78                            // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v109, 0x2, v109                      // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v108, v3, v4, 0x1                   // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v108, v12, v108, s[82:83]            // LDD clip if OOB. offset
/* Address setup for row vc1=31, columns vc0=0..7 (last row of this batch).
 * coord1 (v1) and the C/D row pointers (v2/v3) advance by one row first.
 * No LDS reads are issued in this row; the LDS offsets (v111, v113, ..., v125) are computed but not consumed in the visible code.
 * NOTE(review): presumably the apply stage reuses the Bias/scaleAlpha registers already loaded for the same coord0 - confirm in the code past this view.
 * Each element produces a masked D byte offset in v110, v112, ..., v124.
 */
/* (d1,vc1,d0,vc0)=(0,31,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[78:79], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v111, v0, s78                            // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v111, 0x2, v111                      // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v110, v3, v0, 0x1                   // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v110, v12, v110, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,31,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v113, v4, s78                            // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v113, 0x2, v113                      // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v112, v3, v4, 0x1                   // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v112, v12, v112, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,31,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v115, v4, s78                            // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v115, 0x2, v115                      // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v114, v3, v4, 0x1                   // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v114, v12, v114, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,31,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v117, v4, s78                            // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v117, 0x2, v117                      // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v116, v3, v4, 0x1                   // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v116, v12, v116, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,31,0,4) */
v_add_co_u32 v4, vcc, v0, 4                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v119, v4, s78                            // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v119, 0x2, v119                      // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v118, v3, v4, 0x1                   // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v118, v12, v118, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,31,0,5) */
v_add_co_u32 v4, vcc, v0, 5                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v121, v4, s78                            // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v121, 0x2, v121                      // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v120, v3, v4, 0x1                   // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v120, v12, v120, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,31,0,6) */
v_add_co_u32 v4, vcc, v0, 6                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v123, v4, s78                            // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v123, 0x2, v123                      // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v122, v3, v4, 0x1                   // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v122, v12, v122, s[82:83]            // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,31,0,7) */
v_add_co_u32 v4, vcc, v0, 7                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[78:79], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[82:83], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[82:83], s[78:79], s[82:83]             // in0 && in1
s_mul_i32 s78, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v125, v4, s78                            // coord0 relative to this workgroup's tile start
v_lshlrev_b32 v125, 0x2, v125                      // Bias byte offset (x4); no ds_read uses it in the visible code
v_add_lshl_u32 v124, v3, v4, 0x1                   // scaleToBpe: (rowPtrD v3 + coord0) << 1 = D byte offset
v_cndmask_b32 v124, v12, v124, s[82:83]            // LDD clip if OOB. offset
/* Copy this batch's 31 accumulator results into ordinary VGPRs.
 * Source: acc135, acc139, ..., acc255 (every 4th accumulator register).
 * Destination: v[vgprValuC+17] .. v[vgprValuC+47], one register per batch element, in batch order.
 * The number in each trailing "vreg[...]" comment runs 225..255 and is not the destination register index.
 */
v_accvgpr_read_b32 v[vgprValuC+17], acc135         // copy acc to vreg[225]
v_accvgpr_read_b32 v[vgprValuC+18], acc139         // copy acc to vreg[226]
v_accvgpr_read_b32 v[vgprValuC+19], acc143         // copy acc to vreg[227]
v_accvgpr_read_b32 v[vgprValuC+20], acc147         // copy acc to vreg[228]
v_accvgpr_read_b32 v[vgprValuC+21], acc151         // copy acc to vreg[229]
v_accvgpr_read_b32 v[vgprValuC+22], acc155         // copy acc to vreg[230]
v_accvgpr_read_b32 v[vgprValuC+23], acc159         // copy acc to vreg[231]
v_accvgpr_read_b32 v[vgprValuC+24], acc163         // copy acc to vreg[232]
v_accvgpr_read_b32 v[vgprValuC+25], acc167         // copy acc to vreg[233]
v_accvgpr_read_b32 v[vgprValuC+26], acc171         // copy acc to vreg[234]
v_accvgpr_read_b32 v[vgprValuC+27], acc175         // copy acc to vreg[235]
v_accvgpr_read_b32 v[vgprValuC+28], acc179         // copy acc to vreg[236]
v_accvgpr_read_b32 v[vgprValuC+29], acc183         // copy acc to vreg[237]
v_accvgpr_read_b32 v[vgprValuC+30], acc187         // copy acc to vreg[238]
v_accvgpr_read_b32 v[vgprValuC+31], acc191         // copy acc to vreg[239]
v_accvgpr_read_b32 v[vgprValuC+32], acc195         // copy acc to vreg[240]
v_accvgpr_read_b32 v[vgprValuC+33], acc199         // copy acc to vreg[241]
v_accvgpr_read_b32 v[vgprValuC+34], acc203         // copy acc to vreg[242]
v_accvgpr_read_b32 v[vgprValuC+35], acc207         // copy acc to vreg[243]
v_accvgpr_read_b32 v[vgprValuC+36], acc211         // copy acc to vreg[244]
v_accvgpr_read_b32 v[vgprValuC+37], acc215         // copy acc to vreg[245]
v_accvgpr_read_b32 v[vgprValuC+38], acc219         // copy acc to vreg[246]
v_accvgpr_read_b32 v[vgprValuC+39], acc223         // copy acc to vreg[247]
v_accvgpr_read_b32 v[vgprValuC+40], acc227         // copy acc to vreg[248]
v_accvgpr_read_b32 v[vgprValuC+41], acc231         // copy acc to vreg[249]
v_accvgpr_read_b32 v[vgprValuC+42], acc235         // copy acc to vreg[250]
v_accvgpr_read_b32 v[vgprValuC+43], acc239         // copy acc to vreg[251]
v_accvgpr_read_b32 v[vgprValuC+44], acc243         // copy acc to vreg[252]
v_accvgpr_read_b32 v[vgprValuC+45], acc247         // copy acc to vreg[253]
v_accvgpr_read_b32 v[vgprValuC+46], acc251         // copy acc to vreg[254]
v_accvgpr_read_b32 v[vgprValuC+47], acc255         // copy acc to vreg[255]

/* rC *= alpha batchElements=[(0, 0, 28, 1), (0, 0, 28, 2), (0, 0, 28, 3), (0, 0, 28, 4), (0, 0, 28, 5), (0, 0, 28, 6), (0, 0, 28, 7), (0, 0, 29, 0), (0, 0, 29, 1), (0, 0, 29, 2), (0, 0, 29, 3), (0, 0, 29, 4), (0, 0, 29, 5), (0, 0, 29, 6), (0, 0, 29, 7), (0, 0, 30, 0), (0, 0, 30, 1), (0, 0, 30, 2), (0, 0, 30, 3), (0, 0, 30, 4), (0, 0, 30, 5), (0, 0, 30, 6), (0, 0, 30, 7), (0, 0, 31, 0), (0, 0, 31, 1), (0, 0, 31, 2), (0, 0, 31, 3), (0, 0, 31, 4), (0, 0, 31, 5), (0, 0, 31, 6), (0, 0, 31, 7)] */
/* Scale all 31 batch values v[vgprValuC+17..47] by alpha.
 * The batch has an odd element count, so element 17 uses a scalar v_mul_f32
 * and the remaining 30 are handled as 15 register pairs with v_pk_mul_f32.
 * NOTE(review): op_sel_hi:[0,1,1] is expected to make both halves of the packed multiply
 * take the low dword of the scalar pair, i.e. s[sgprAlpha] - confirm against the ISA guide.
 * The trailing s_waitcnt waits for the outstanding Bias / scaleAlpha LDS reads before the values are consumed.
 */
v_mul_f32 v[vgprValuC+17], s[sgprAlpha], v[vgprValuC+17] // *= alpha
v_pk_mul_f32 v[vgprValuC+18:vgprValuC+18+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+20:vgprValuC+20+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+22:vgprValuC+22+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
s_waitcnt lgkmcnt(0)                               // wait for Bias LDS, ScaleAlphaVec

/* apply mask, calc new C and issue writes */
/* Per-element epilogue for batch elements 17..27. Each element runs:
 *   C *= scaleAlpha value; v4 = C + bias; s_swappc_b64 through s[8:9] (return PC in s[66:67]);
 *   copy v4 back; convert fp32 -> bf16; 16-bit buffer store to D at the element's masked byte offset.
 * NOTE(review): the call target is presumably the activation routine, taking and returning its value in v4 - confirm against the callee.
 * NOTE(review): "v_mov_b32 vNN, v4" followed by a convert that reads v[vgprValuC+NN] only lines up if vgprValuC == 0 - TODO confirm.
 * Elements 17..23 (row 28, vc0=1..7) and 24 (row 29, vc0=0) use their own Bias/scaleAlpha registers.
 * Elements 25..27 (row 29, vc0=1..3) reuse v48/v49, v52/v53, v56/v57 loaded for the same coord0 in row 28.
 * v14/v15/v16 are set up here but are not read by any instruction in the visible part of this batch,
 * which converts with v_cvt_pk_bf16_f32 instead; the called routine is not visible, so whether it reads them is unknown.
 */
v_mov_b32 v14, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v15, 0x7fff0000                          // fp32 Nan
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16
v_mul_f32 v[vgprValuC+17], v49, v[vgprValuC+17]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v48, v[vgprValuC+17]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call through s[8:9]; return address kept in s[66:67]
v_mov_b32 v17, v4                                  // take the value left in v4 after the call
v_cvt_pk_bf16_f32 v17, v[vgprValuC+17], v[vgprValuC+17] // convert C to bf16 in gwvw==1
buffer_store_short v17, v50, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+18], v53, v[vgprValuC+18]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v52, v[vgprValuC+18]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call through s[8:9]; return address kept in s[66:67]
v_mov_b32 v18, v4                                  // take the value left in v4 after the call
v_cvt_pk_bf16_f32 v18, v[vgprValuC+18], v[vgprValuC+18] // convert C to bf16 in gwvw==1
buffer_store_short v18, v54, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+19], v57, v[vgprValuC+19]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+19]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call through s[8:9]; return address kept in s[66:67]
v_mov_b32 v19, v4                                  // take the value left in v4 after the call
v_cvt_pk_bf16_f32 v19, v[vgprValuC+19], v[vgprValuC+19] // convert C to bf16 in gwvw==1
buffer_store_short v19, v58, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+20], v61, v[vgprValuC+20]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+20]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call through s[8:9]; return address kept in s[66:67]
v_mov_b32 v20, v4                                  // take the value left in v4 after the call
v_cvt_pk_bf16_f32 v20, v[vgprValuC+20], v[vgprValuC+20] // convert C to bf16 in gwvw==1
buffer_store_short v20, v62, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+21], v65, v[vgprValuC+21]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+21]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call through s[8:9]; return address kept in s[66:67]
v_mov_b32 v21, v4                                  // take the value left in v4 after the call
v_cvt_pk_bf16_f32 v21, v[vgprValuC+21], v[vgprValuC+21] // convert C to bf16 in gwvw==1
buffer_store_short v21, v66, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+22], v69, v[vgprValuC+22]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+22]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call through s[8:9]; return address kept in s[66:67]
v_mov_b32 v22, v4                                  // take the value left in v4 after the call
v_cvt_pk_bf16_f32 v22, v[vgprValuC+22], v[vgprValuC+22] // convert C to bf16 in gwvw==1
buffer_store_short v22, v70, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+23], v73, v[vgprValuC+23]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+23]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call through s[8:9]; return address kept in s[66:67]
v_mov_b32 v23, v4                                  // take the value left in v4 after the call
v_cvt_pk_bf16_f32 v23, v[vgprValuC+23], v[vgprValuC+23] // convert C to bf16 in gwvw==1
buffer_store_short v23, v74, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+24], v77, v[vgprValuC+24]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+24]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call through s[8:9]; return address kept in s[66:67]
v_mov_b32 v24, v4                                  // take the value left in v4 after the call
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+24] // convert C to bf16 in gwvw==1
buffer_store_short v24, v78, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+25], v49, v[vgprValuC+25]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v48, v[vgprValuC+25]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call through s[8:9]; return address kept in s[66:67]
v_mov_b32 v25, v4                                  // take the value left in v4 after the call
v_cvt_pk_bf16_f32 v25, v[vgprValuC+25], v[vgprValuC+25] // convert C to bf16 in gwvw==1
buffer_store_short v25, v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+26], v53, v[vgprValuC+26]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v52, v[vgprValuC+26]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call through s[8:9]; return address kept in s[66:67]
v_mov_b32 v26, v4                                  // take the value left in v4 after the call
v_cvt_pk_bf16_f32 v26, v[vgprValuC+26], v[vgprValuC+26] // convert C to bf16 in gwvw==1
buffer_store_short v26, v82, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+27], v57, v[vgprValuC+27]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+27]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // call through s[8:9]; return address kept in s[66:67]
v_mov_b32 v27, v4                                  // take the value left in v4 after the call
v_cvt_pk_bf16_f32 v27, v[vgprValuC+27], v[vgprValuC+27] // convert C to bf16 in gwvw==1
buffer_store_short v27, v84, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+28], v61, v[vgprValuC+28]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+28]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v28, v4
v_cvt_pk_bf16_f32 v28, v[vgprValuC+28], v[vgprValuC+28] // convert C to bf16 in gwvw==1
buffer_store_short v28, v86, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+29], v65, v[vgprValuC+29]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+29]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v29, v4
v_cvt_pk_bf16_f32 v29, v[vgprValuC+29], v[vgprValuC+29] // convert C to bf16 in gwvw==1
buffer_store_short v29, v88, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+30], v69, v[vgprValuC+30]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+30]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v30, v4
v_cvt_pk_bf16_f32 v30, v[vgprValuC+30], v[vgprValuC+30] // convert C to bf16 in gwvw==1
buffer_store_short v30, v90, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+31], v73, v[vgprValuC+31]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+31]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v31, v4
v_cvt_pk_bf16_f32 v31, v[vgprValuC+31], v[vgprValuC+31] // convert C to bf16 in gwvw==1
buffer_store_short v31, v92, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+32], v77, v[vgprValuC+32]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+32]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v32, v4
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+32] // convert C to bf16 in gwvw==1
buffer_store_short v32, v94, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+33], v49, v[vgprValuC+33]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v48, v[vgprValuC+33]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v33, v4
v_cvt_pk_bf16_f32 v33, v[vgprValuC+33], v[vgprValuC+33] // convert C to bf16 in gwvw==1
buffer_store_short v33, v96, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+34], v53, v[vgprValuC+34]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v52, v[vgprValuC+34]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v34, v4
v_cvt_pk_bf16_f32 v34, v[vgprValuC+34], v[vgprValuC+34] // convert C to bf16 in gwvw==1
buffer_store_short v34, v98, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+35], v57, v[vgprValuC+35]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+35]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v35, v4
v_cvt_pk_bf16_f32 v35, v[vgprValuC+35], v[vgprValuC+35] // convert C to bf16 in gwvw==1
buffer_store_short v35, v100, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+36], v61, v[vgprValuC+36]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+36]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v36, v4
v_cvt_pk_bf16_f32 v36, v[vgprValuC+36], v[vgprValuC+36] // convert C to bf16 in gwvw==1
buffer_store_short v36, v102, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+37], v65, v[vgprValuC+37]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+37]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v37, v4
v_cvt_pk_bf16_f32 v37, v[vgprValuC+37], v[vgprValuC+37] // convert C to bf16 in gwvw==1
buffer_store_short v37, v104, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+38], v69, v[vgprValuC+38]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+38]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v38, v4
v_cvt_pk_bf16_f32 v38, v[vgprValuC+38], v[vgprValuC+38] // convert C to bf16 in gwvw==1
buffer_store_short v38, v106, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+39], v73, v[vgprValuC+39]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+39]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v39, v4
v_cvt_pk_bf16_f32 v39, v[vgprValuC+39], v[vgprValuC+39] // convert C to bf16 in gwvw==1
buffer_store_short v39, v108, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+40], v77, v[vgprValuC+40]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v76, v[vgprValuC+40]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v40, v4
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+40] // convert C to bf16 in gwvw==1
buffer_store_short v40, v110, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+41], v49, v[vgprValuC+41]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v48, v[vgprValuC+41]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v41, v4
v_cvt_pk_bf16_f32 v41, v[vgprValuC+41], v[vgprValuC+41] // convert C to bf16 in gwvw==1
buffer_store_short v41, v112, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+42], v53, v[vgprValuC+42]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v52, v[vgprValuC+42]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v42, v4
v_cvt_pk_bf16_f32 v42, v[vgprValuC+42], v[vgprValuC+42] // convert C to bf16 in gwvw==1
buffer_store_short v42, v114, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+43], v57, v[vgprValuC+43]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v56, v[vgprValuC+43]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v43, v4
v_cvt_pk_bf16_f32 v43, v[vgprValuC+43], v[vgprValuC+43] // convert C to bf16 in gwvw==1
buffer_store_short v43, v116, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+44], v61, v[vgprValuC+44]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v60, v[vgprValuC+44]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v44, v4
v_cvt_pk_bf16_f32 v44, v[vgprValuC+44], v[vgprValuC+44] // convert C to bf16 in gwvw==1
buffer_store_short v44, v118, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+45], v65, v[vgprValuC+45]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v64, v[vgprValuC+45]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v45, v4
v_cvt_pk_bf16_f32 v45, v[vgprValuC+45], v[vgprValuC+45] // convert C to bf16 in gwvw==1
buffer_store_short v45, v120, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+46], v69, v[vgprValuC+46]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v68, v[vgprValuC+46]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v46, v4
v_cvt_pk_bf16_f32 v46, v[vgprValuC+46], v[vgprValuC+46] // convert C to bf16 in gwvw==1
buffer_store_short v46, v122, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
v_mul_f32 v[vgprValuC+47], v73, v[vgprValuC+47]    // *= ScaleAlphaVecVMul
v_add_f32 v4, v72, v[vgprValuC+47]                 // C += bias
s_swappc_b64 s[66:67], s[8:9]
v_mov_b32 v47, v4
v_cvt_pk_bf16_f32 v47, v[vgprValuC+47], v[vgprValuC+47] // convert C to bf16 in gwvw==1
buffer_store_short v47, v124, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
s_branch label_GW_End                              // jump to end
/* Beta global-write entry: decide whether this workgroup needs the edge path.
 * For each of the two tile dimensions (I, then J):
 *   1. take Size % 256 (the mask 255 works because 256 is a power of two),
 *   2. keep that remainder only if this workgroup index is >= NumWorkGroups-1,
 *      otherwise force it to 0,
 *   3. branch to the edge variant if the kept remainder is non-zero.
 * If neither branch is taken, execution falls through to label_GW_B1_E0.
 * Scratch: s78 (remainder), s79 (NumWorkGroups - 1); SCC is overwritten. */
label_GW_Beta:
/* --- dimension 0 (SizeI / WorkGroup0) --- */
s_and_b32 s78, 255, s[sgprSizeI]                   // s78 = s[sgprSizeI] % 256
s_add_u32 s79, -0x1, s[sgprNumWorkGroups0]         // s79 = NumWorkGroups0 - 1
s_cmp_ge_u32 s[sgprWorkGroup0], s79                // SCC = (wg0 >= nwg0-1)
s_cselect_b32 s78, s78, 0                          // rMT0 = SCC ? remainder : 0
s_cmpk_gt_u32 s78, 0                               // SCC = (rMT0 > 0)
s_cbranch_scc1 label_GW_B1_E1_M                    // partial tile along dim 0 -> edge path
/* --- dimension 1 (SizeJ / WorkGroup1); s78/s79 are reused --- */
s_and_b32 s78, 255, s[sgprSizeJ]                   // s78 = s[sgprSizeJ] % 256
s_add_u32 s79, -0x1, s[sgprNumWorkGroups1]         // s79 = NumWorkGroups1 - 1
s_cmp_ge_u32 s[sgprWorkGroup1], s79                // SCC = (wg1 >= nwg1-1)
s_cselect_b32 s78, s78, 0                          // rMT1 = SCC ? remainder : 0
s_cmpk_gt_u32 s78, 0                               // SCC = (rMT1 > 0)
s_cbranch_scc1 label_GW_B1_E1_N                    // partial tile along dim 1 -> edge path
/* Beta, non-edge path: choose the activation routine and leave its address
 * in s[8:9]. The per-row code that follows calls it with
 * "s_swappc_b64 s[66:67], s[8:9]".
 *
 * Dispatch on s[sgprActivationType]: 3 -> Gelu, 5 -> Relu, 10 -> Silu,
 * 12 -> Clamp; any other value falls through to the None routine.
 *
 * Every stub has the same shape:
 *   s_getpc_b64  s[8:9]          ; PC of the instruction after s_getpc
 *   s_add_i32    s65, <label>, 4 ; 32-bit offset to the routine
 *   s_add_u32 / s_addc_u32       ; 64-bit add of that offset into s[8:9]
 * NOTE(review): the "+4" presumably compensates for the label literal being
 * resolved relative to its own position inside the s_add_i32 encoding --
 * confirm against the assembler's relocation behaviour.
 * NOTE(review): the high half is updated with carry only (s_addc_u32 ..., 0),
 * which is correct only for a non-negative offset, i.e. when the
 * label_Activation_*_VW8 routines are placed after this code -- confirm.
 * Clobbers: s8, s9, s65, SCC. */
label_GW_B1_E0:
s_cmpk_eq_u32 s[sgprActivationType], 3             // activationType == 3
s_cbranch_scc1 label_To_Activation_Gelu_VW8_beta_1_edge_0 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 5             // activationType == 5
s_cbranch_scc1 label_To_Activation_Relu_VW8_beta_1_edge_0 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 10            // activationType == 10
s_cbranch_scc1 label_To_Activation_Silu_VW8_beta_1_edge_0 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 12            // activationType == 12
s_cbranch_scc1 label_To_Activation_Clamp_VW8_beta_1_edge_0 // Branch if true
/* default: no comparison matched, fall through to the None stub */
label_To_Activation_None_VW8_beta_1_edge_0:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_None_VW8, 4        // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_2
label_To_Activation_Gelu_VW8_beta_1_edge_0:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_Gelu_VW8, 4        // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_2
label_To_Activation_Relu_VW8_beta_1_edge_0:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_Relu_VW8, 4        // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_2
label_To_Activation_Silu_VW8_beta_1_edge_0:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_Silu_VW8, 4        // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_2
label_To_Activation_Clamp_VW8_beta_1_edge_0:
s_getpc_b64 s[8:9]                                 // addr of next instr
s_add_i32 s65, label_Activation_Clamp_VW8, 4       // target branch offset
s_add_u32 s8, s8, s65                              // add target branch offset
s_addc_u32 s9, s9, 0                               // add high and carry
s_branch label_ActivationSetPCAddrEnd_2            // target is the label on the very next line
label_ActivationSetPCAddrEnd_2:                    // all stubs rejoin here with s[8:9] = activation routine address

/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=7 */
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Beta Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw8); (0,0,1,0:vw8); (0,0,2,0:vw8); (0,0,3,0:vw8); (0,0,4,0:vw8); (0,0,5,0:vw8); (0,0,6,0:vw8) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
v_add_lshl_u32 v18, v2, v0, 0x1                    // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=0, coord0Vgpr=0
buffer_load_dwordx4 v[20:23], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
s_mul_i32 s68, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v19, v0, s68
v_lshlrev_b32 v19, 0x2, v19                        // Bias address scaled by BPE
s_waitcnt lgkmcnt(0)                               // Wait for LDS write
s_barrier                                          // LDS write barrier
ds_read_b128 v[80:83], v19 offset:0                // load Bias
ds_read_b128 v[84:87], v19 offset:16               // load Bias
ds_read_b128 v[88:91], v19 offset:1024             // load scaleAlpha
ds_read_b128 v[92:95], v19 offset:1040             // load scaleAlpha
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
s_lshl_b32 s68, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[96:99], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
s_lshl_b32 s68, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[100:103], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
s_lshl_b32 s68, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[104:107], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
s_lshl_b32 s68, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[108:111], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
s_lshl_b32 s68, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[112:115], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
s_lshl_b32 s68, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[116:119], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C
v_add_lshl_u32 v17, v3, v0, 0x1                    // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=0, coord0Vgpr=0
v_accvgpr_read_b32 v[vgprValuC+24], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+25], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+26], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+27], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+28], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+29], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+30], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+31], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+32], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+33], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+34], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+35], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+36], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+37], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+38], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+39], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+40], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+41], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+42], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+43], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+44], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+45], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+46], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+47], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+48], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+49], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+50], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+51], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+52], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+53], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+54], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+55], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+56], acc128         // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+57], acc132         // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+58], acc136         // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+59], acc140         // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+60], acc144         // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+61], acc148         // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+62], acc152         // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+63], acc156         // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+64], acc160         // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+65], acc164         // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+66], acc168         // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+67], acc172         // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+68], acc176         // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+69], acc180         // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+70], acc184         // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+71], acc188         // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+72], acc192         // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+73], acc196         // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+74], acc200         // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+75], acc204         // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+76], acc208         // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+77], acc212         // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+78], acc216         // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+79], acc220         // copy acc to vreg[55]

/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0)] */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk)

/* apply mask, calc new C and issue writes */
v_mov_b32 v14, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v15, 0x7fff0000                          // fp32 Nan
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16

/* Row 0 of this batch: 8 output values held in v[vgprValuC+24 .. +31].
 *   result = activation( ValuC * scaleAlphaVec + beta * C + bias ), stored as bf16.
 * Inputs loaded earlier in this batch: C row in v[20:23] (8 packed bf16),
 * bias in v[80:87], scaleAlpha vector in v[88:95].
 * NOTE(review): the copy-back below names v[24:31] directly while the
 * conversions name v[vgprValuC+24..31]; these agree only if vgprValuC == 0
 * -- confirm against the symbol definition. */
s_waitcnt lgkmcnt(0), vmcnt(6)                     // vmcnt(6) = 7 - 1 (beta) lgkmcnt(0) = 4 - 2 (bias) - 2 (scaleAlphaVec) (interleaved)
/* scale the accumulated values by the per-column scaleAlpha vector */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(88)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(88)(2)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // *= ScaleAlphaVecVMulPK(88)(4)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // *= ScaleAlphaVecVMulPK(88)(6)
/* unpack each bf16 half of the loaded C dwords to f32 (v4 is scratch) and accumulate beta*C */
v_cvt_f32_bf16 v4, v20 src0_sel:WORD_0             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+24], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v20 src0_sel:WORD_1             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+25], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v21 src0_sel:WORD_0             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+26], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v21 src0_sel:WORD_1             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+27], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v22 src0_sel:WORD_0             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+28], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v22 src0_sel:WORD_1             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+29], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v23 src0_sel:WORD_0             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+30], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v23 src0_sel:WORD_1             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+31], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
/* add bias, writing the 8 sums into v[4:11] */
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias
v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias
v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // jump to address in s[8:9], return address saved in s[66:67]; presumably the activation routine works in place on v[4:11] (routine not visible here)
/* copy the 8 values from v[4:11] back into this row's registers */
v_mov_b64 v[24:25], v[4:5]
v_mov_b64 v[26:27], v[6:7]
v_mov_b64 v[28:29], v[8:9]
v_mov_b64 v[30:31], v[10:11]
/* pack pairs of f32 into bf16x2, compacting 8 values into v[24:27] */
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor
buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D

/* Row 1 of this batch: 8 output values held in v[vgprValuC+32 .. +39].
 *   result = activation( ValuC * scaleAlphaVec + beta * C + bias ), stored as bf16.
 * Inputs loaded earlier in this batch: C row in v[96:99] (8 packed bf16),
 * bias in v[80:87], scaleAlpha vector in v[88:95].
 * NOTE(review): the instruction waits on vmcnt(6) while the trailing comment's
 * formula yields 5. The formula counts only the 7 C loads; one D store has
 * been issued since. Presumably that store also occupies a vmcnt slot
 * (5 + 1 = 6) -- confirm against the ISA description of vmcnt.
 * NOTE(review): the copy-back below names v[32:39] directly while the
 * conversions name v[vgprValuC+32..39]; these agree only if vgprValuC == 0
 * -- confirm against the symbol definition. */
s_waitcnt vmcnt(6)                                 // vmcnt(5) = 7 - 2 (beta) (interleaved)
/* scale the accumulated values by the per-column scaleAlpha vector */
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(88)(0)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // *= ScaleAlphaVecVMulPK(88)(2)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(88)(4)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(88)(6)
/* unpack each bf16 half of the loaded C dwords to f32 (v4 is scratch) and accumulate beta*C */
v_cvt_f32_bf16 v4, v96 src0_sel:WORD_0             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+32], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v96 src0_sel:WORD_1             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+33], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v97 src0_sel:WORD_0             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+34], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v97 src0_sel:WORD_1             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+35], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v98 src0_sel:WORD_0             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+36], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v98 src0_sel:WORD_1             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+37], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v99 src0_sel:WORD_0             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+38], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v99 src0_sel:WORD_1             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+39], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
/* add bias, writing the 8 sums into v[4:11] */
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias
v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias
v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // jump to address in s[8:9], return address saved in s[66:67]; presumably the activation routine works in place on v[4:11] (routine not visible here)
/* copy the 8 values from v[4:11] back into this row's registers */
v_mov_b64 v[32:33], v[4:5]
v_mov_b64 v[34:35], v[6:7]
v_mov_b64 v[36:37], v[8:9]
v_mov_b64 v[38:39], v[10:11]
/* pack pairs of f32 into bf16x2, compacting 8 values into v[32:35] */
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor
/* advance the D descriptor base by StrideD1J << 1 bytes (64-bit add with carry) so the same offset VGPR v17 addresses the next row */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D

/* Row 2 of this batch: 8 output values held in v[vgprValuC+40 .. +47].
 *   result = activation( ValuC * scaleAlphaVec + beta * C + bias ), stored as bf16.
 * Inputs loaded earlier in this batch: C row in v[100:103] (8 packed bf16),
 * bias in v[80:87], scaleAlpha vector in v[88:95].
 * NOTE(review): the instruction waits on vmcnt(6) while the trailing comment's
 * formula yields 4. The formula counts only the 7 C loads; two D stores have
 * been issued since. Presumably those stores also occupy vmcnt slots
 * (4 + 2 = 6) -- confirm against the ISA description of vmcnt.
 * NOTE(review): the copy-back below names v[40:47] directly while the
 * conversions name v[vgprValuC+40..47]; these agree only if vgprValuC == 0
 * -- confirm against the symbol definition. */
s_waitcnt vmcnt(6)                                 // vmcnt(4) = 7 - 3 (beta) (interleaved)
/* scale the accumulated values by the per-column scaleAlpha vector */
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(88)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAlphaVecVMulPK(88)(2)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // *= ScaleAlphaVecVMulPK(88)(4)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // *= ScaleAlphaVecVMulPK(88)(6)
/* unpack each bf16 half of the loaded C dwords to f32 (v4 is scratch) and accumulate beta*C */
v_cvt_f32_bf16 v4, v100 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+40], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v100 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+41], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v101 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+42], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v101 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+43], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v102 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+44], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v102 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+45], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v103 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+46], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v103 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+47], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
/* add bias, writing the 8 sums into v[4:11] */
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias
v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias
v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias
s_swappc_b64 s[66:67], s[8:9]                      // jump to address in s[8:9], return address saved in s[66:67]; presumably the activation routine works in place on v[4:11] (routine not visible here)
/* copy the 8 values from v[4:11] back into this row's registers */
v_mov_b64 v[40:41], v[4:5]
v_mov_b64 v[42:43], v[6:7]
v_mov_b64 v[44:45], v[8:9]
v_mov_b64 v[46:47], v[10:11]
/* pack pairs of f32 into bf16x2, compacting 8 values into v[40:43] */
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor
/* advance the D descriptor base by StrideD1J << 1 bytes (64-bit add with carry) so the same offset VGPR v17 addresses the next row */
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D

/* Beta-path epilogue for one output row: v[vgprValuC+48 .. +55], combined with the C data in v[104:107]. */
/* Steps: wait for C -> multiply by the per-column scale vector v[88:95] -> add beta*C -> add bias v[80:87] */
/* into scratch v[4:11] -> call through s[8:9] -> copy v[4:11] back -> pack to bf16 -> advance SrdD by one row -> store. */
/* NOTE(review): literal v48.. destinations match the vgprValuC+48.. sources only if vgprValuC is 0 - TODO confirm. */
s_waitcnt vmcnt(6)                                 // need C load 4 of 7: 3 younger loads + 3 interleaved stores may stay in flight => 6 (non-interleaved count would be 7 - 4 = 3); assumes stores count toward vmcnt and retire in issue order
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(88)(0)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // *= ScaleAlphaVecVMulPK(88)(2)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // *= ScaleAlphaVecVMulPK(88)(4)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // *= ScaleAlphaVecVMulPK(88)(6)
v_cvt_f32_bf16 v4, v104 src0_sel:WORD_0            // cvt bf16 to f32 (low half of C dword 0); v4 is a temporary here
v_fmac_f32 v[vgprValuC+48], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v104 src0_sel:WORD_1            // cvt bf16 to f32 (high half of C dword 0)
v_fmac_f32 v[vgprValuC+49], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v105 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+50], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v105 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+51], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v106 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+52], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v106 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+53], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v107 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+54], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v107 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+55], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias, result placed in call scratch v[4:5]
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias, result placed in call scratch v[6:7]
v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias, result placed in call scratch v[8:9]
v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias, result placed in call scratch v[10:11]
s_swappc_b64 s[66:67], s[8:9]                      // call the routine whose address is in s[8:9]; return address saved in s[66:67] (presumably the activation function working in place on v[4:11] - TODO confirm)
v_mov_b64 v[48:49], v[4:5]                         // copy call results back from scratch
v_mov_b64 v[50:51], v[6:7]                         // copy call results back from scratch
v_mov_b64 v[52:53], v[8:9]                         // copy call results back from scratch
v_mov_b64 v[54:55], v[10:11]                       // copy call results back from scratch
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE (stride << 1, i.e. 2 bytes per element)
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper), propagates the carry
buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D (8 packed bf16 values, per-lane offset in v17)

/* Beta-path epilogue for one output row: v[vgprValuC+56 .. +63], combined with the C data in v[108:111]. */
/* Steps: wait for C -> multiply by the per-column scale vector v[88:95] -> add beta*C -> add bias v[80:87] */
/* into scratch v[4:11] -> call through s[8:9] -> copy v[4:11] back -> pack to bf16 -> advance SrdD by one row -> store. */
/* NOTE(review): literal v56.. destinations match the vgprValuC+56.. sources only if vgprValuC is 0 - TODO confirm. */
s_waitcnt vmcnt(6)                                 // need C load 5 of 7: 2 younger loads + 4 interleaved stores may stay in flight => 6 (non-interleaved count would be 7 - 5 = 2); assumes stores count toward vmcnt and retire in issue order
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(88)(0)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // *= ScaleAlphaVecVMulPK(88)(2)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // *= ScaleAlphaVecVMulPK(88)(4)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // *= ScaleAlphaVecVMulPK(88)(6)
v_cvt_f32_bf16 v4, v108 src0_sel:WORD_0            // cvt bf16 to f32 (low half of C dword 0); v4 is a temporary here
v_fmac_f32 v[vgprValuC+56], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v108 src0_sel:WORD_1            // cvt bf16 to f32 (high half of C dword 0)
v_fmac_f32 v[vgprValuC+57], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v109 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+58], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v109 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+59], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v110 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+60], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v110 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+61], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v111 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+62], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v111 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+63], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias, result placed in call scratch v[4:5]
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias, result placed in call scratch v[6:7]
v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias, result placed in call scratch v[8:9]
v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias, result placed in call scratch v[10:11]
s_swappc_b64 s[66:67], s[8:9]                      // call the routine whose address is in s[8:9]; return address saved in s[66:67] (presumably the activation function working in place on v[4:11] - TODO confirm)
v_mov_b64 v[56:57], v[4:5]                         // copy call results back from scratch
v_mov_b64 v[58:59], v[6:7]                         // copy call results back from scratch
v_mov_b64 v[60:61], v[8:9]                         // copy call results back from scratch
v_mov_b64 v[62:63], v[10:11]                       // copy call results back from scratch
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE (stride << 1, i.e. 2 bytes per element)
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper), propagates the carry
buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D (8 packed bf16 values, per-lane offset in v17)

/* Beta-path epilogue for one output row: v[vgprValuC+64 .. +71], combined with the C data in v[112:115]. */
/* Steps: wait for C -> multiply by the per-column scale vector v[88:95] -> add beta*C -> add bias v[80:87] */
/* into scratch v[4:11] -> call through s[8:9] -> copy v[4:11] back -> pack to bf16 -> advance SrdD by one row -> store. */
/* NOTE(review): literal v64.. destinations match the vgprValuC+64.. sources only if vgprValuC is 0 - TODO confirm. */
s_waitcnt vmcnt(6)                                 // need C load 6 of 7: 1 younger load + 5 interleaved stores may stay in flight => 6 (non-interleaved count would be 7 - 6 = 1); assumes stores count toward vmcnt and retire in issue order
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(88)(0)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // *= ScaleAlphaVecVMulPK(88)(2)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // *= ScaleAlphaVecVMulPK(88)(4)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // *= ScaleAlphaVecVMulPK(88)(6)
v_cvt_f32_bf16 v4, v112 src0_sel:WORD_0            // cvt bf16 to f32 (low half of C dword 0); v4 is a temporary here
v_fmac_f32 v[vgprValuC+64], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v112 src0_sel:WORD_1            // cvt bf16 to f32 (high half of C dword 0)
v_fmac_f32 v[vgprValuC+65], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v113 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+66], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v113 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+67], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v114 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+68], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v114 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+69], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v115 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+70], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v115 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+71], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+64:vgprValuC+64+1] // C += bias, result placed in call scratch v[4:5]
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+66:vgprValuC+66+1] // C += bias, result placed in call scratch v[6:7]
v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+68:vgprValuC+68+1] // C += bias, result placed in call scratch v[8:9]
v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+70:vgprValuC+70+1] // C += bias, result placed in call scratch v[10:11]
s_swappc_b64 s[66:67], s[8:9]                      // call the routine whose address is in s[8:9]; return address saved in s[66:67] (presumably the activation function working in place on v[4:11] - TODO confirm)
v_mov_b64 v[64:65], v[4:5]                         // copy call results back from scratch
v_mov_b64 v[66:67], v[6:7]                         // copy call results back from scratch
v_mov_b64 v[68:69], v[8:9]                         // copy call results back from scratch
v_mov_b64 v[70:71], v[10:11]                       // copy call results back from scratch
v_cvt_pk_bf16_f32 v64, v[vgprValuC+64], v[vgprValuC+65] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v65, v[vgprValuC+66], v[vgprValuC+67] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v66, v[vgprValuC+68], v[vgprValuC+69] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v67, v[vgprValuC+70], v[vgprValuC+71] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE (stride << 1, i.e. 2 bytes per element)
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper), propagates the carry
buffer_store_dwordx4 v[64:67], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D (8 packed bf16 values, per-lane offset in v17)

/* Beta-path epilogue for the last row of this batch: v[vgprValuC+72 .. +79], combined with the C data in v[116:119]. */
/* Steps: wait for C -> multiply by the per-column scale vector v[88:95] -> add beta*C -> add bias v[80:87] */
/* into scratch v[4:11] -> call through s[8:9] -> copy v[4:11] back -> pack to bf16 -> advance SrdD by one row -> store. */
/* NOTE(review): literal v72.. destinations match the vgprValuC+72.. sources only if vgprValuC is 0 - TODO confirm. */
s_waitcnt vmcnt(6)                                 // need C load 7 of 7: no younger loads, but 6 interleaved stores may stay in flight => 6 (non-interleaved count would be 7 - 7 = 0); assumes stores count toward vmcnt and retire in issue order
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], v[88:89], v[vgprValuC+72:vgprValuC+72+1] // *= ScaleAlphaVecVMulPK(88)(0)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], v[90:91], v[vgprValuC+74:vgprValuC+74+1] // *= ScaleAlphaVecVMulPK(88)(2)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], v[92:93], v[vgprValuC+76:vgprValuC+76+1] // *= ScaleAlphaVecVMulPK(88)(4)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], v[94:95], v[vgprValuC+78:vgprValuC+78+1] // *= ScaleAlphaVecVMulPK(88)(6)
v_cvt_f32_bf16 v4, v116 src0_sel:WORD_0            // cvt bf16 to f32 (low half of C dword 0); v4 is a temporary here
v_fmac_f32 v[vgprValuC+72], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v116 src0_sel:WORD_1            // cvt bf16 to f32 (high half of C dword 0)
v_fmac_f32 v[vgprValuC+73], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v117 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+74], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v117 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+75], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v118 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+76], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v118 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+77], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v119 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+78], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v119 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+79], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+72:vgprValuC+72+1] // C += bias, result placed in call scratch v[4:5]
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+74:vgprValuC+74+1] // C += bias, result placed in call scratch v[6:7]
v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+76:vgprValuC+76+1] // C += bias, result placed in call scratch v[8:9]
v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+78:vgprValuC+78+1] // C += bias, result placed in call scratch v[10:11]
s_swappc_b64 s[66:67], s[8:9]                      // call the routine whose address is in s[8:9]; return address saved in s[66:67] (presumably the activation function working in place on v[4:11] - TODO confirm)
v_mov_b64 v[72:73], v[4:5]                         // copy call results back from scratch
v_mov_b64 v[74:75], v[6:7]                         // copy call results back from scratch
v_mov_b64 v[76:77], v[8:9]                         // copy call results back from scratch
v_mov_b64 v[78:79], v[10:11]                       // copy call results back from scratch
v_cvt_pk_bf16_f32 v72, v[vgprValuC+72], v[vgprValuC+73] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v73, v[vgprValuC+74], v[vgprValuC+75] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v74, v[vgprValuC+76], v[vgprValuC+77] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v75, v[vgprValuC+78], v[vgprValuC+79] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE (stride << 1, i.e. 2 bytes per element)
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper), propagates the carry
buffer_store_dwordx4 v[72:75], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D (8 packed bf16 values, per-lane offset in v17)
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Beta Batch #1 (d1,d0,vc1,vc0) = */
/*    (0,0,7,0:vw8); (0,0,8,0:vw8); (0,0,9,0:vw8); (0,0,10,0:vw8); (0,0,11,0:vw8); (0,0,12,0:vw8); (0,0,13,0:vw8) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Issues all memory reads for this batch up front: seven dwordx4 loads of C (one per row, vc1 = 7..13), */
/* each preceded by advancing the C resource descriptor base by one row (StrideC1J << 1 bytes), plus four */
/* LDS reads for bias (v[80:87]) and scaleAlpha (v[88:95]). The per-row code that follows waits on these */
/* in issue order. All C loads use the same per-lane offset v18; only the SRD base changes between rows. */
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
s_lshl_b32 s68, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE (stride << 1, i.e. 2 bytes per element)
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper), propagates the carry
buffer_load_dwordx4 v[20:23], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C (load 1 of 7)
ds_read_b128 v[80:83], v19 offset:0                // load Bias (first 4 dwords, LDS address in v19)
ds_read_b128 v[84:87], v19 offset:16               // load Bias (next 4 dwords)
ds_read_b128 v[88:91], v19 offset:1024             // load scaleAlpha (first 4 dwords, 1024 bytes past the bias data)
ds_read_b128 v[92:95], v19 offset:1040             // load scaleAlpha (next 4 dwords)
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
s_lshl_b32 s68, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[96:99], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C (load 2 of 7)
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
s_lshl_b32 s68, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[100:103], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C (load 3 of 7)
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
s_lshl_b32 s68, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[104:107], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C (load 4 of 7)
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
s_lshl_b32 s68, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[108:111], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C (load 5 of 7)
/* (d1,vc1,d0,vc0)=(0,12,0,0) */
s_lshl_b32 s68, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[112:115], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C (load 6 of 7)
/* (d1,vc1,d0,vc0)=(0,13,0,0) */
s_lshl_b32 s68, s[sgprStrideC1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_load_dwordx4 v[116:119], v18, s[sgprSrdC:sgprSrdC+3], 0 offen offset:0 // load C (load 7 of 7)
/* Copy this batch's 56 accumulator values (7 rows x 8 columns) into v[vgprValuC+24 .. +79]. */
/* The source accumulator index advances by 4 per destination register: acc224..acc252, then it */
/* continues at acc1, acc5, ... acc189. These copies are independent of the C loads and LDS reads */
/* issued just before, so they execute while those reads are still in flight. */
/* NOTE(review): the vreg[N] numbers in the trailing comments (56..111) do not equal the vgprValuC offsets */
/* (24..79); presumably they are running element indices across batches - TODO confirm. */
v_accvgpr_read_b32 v[vgprValuC+24], acc224         // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+25], acc228         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+26], acc232         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+27], acc236         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+28], acc240         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+29], acc244         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+30], acc248         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+31], acc252         // copy acc to vreg[63]
v_accvgpr_read_b32 v[vgprValuC+32], acc1           // copy acc to vreg[64] (index restarts at acc1)
v_accvgpr_read_b32 v[vgprValuC+33], acc5           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+34], acc9           // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+35], acc13          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+36], acc17          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+37], acc21          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+38], acc25          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+39], acc29          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+40], acc33          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+41], acc37          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+42], acc41          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+43], acc45          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+44], acc49          // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+45], acc53          // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+46], acc57          // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+47], acc61          // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+48], acc65          // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+49], acc69          // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+50], acc73          // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+51], acc77          // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+52], acc81          // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+53], acc85          // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+54], acc89          // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+55], acc93          // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+56], acc97          // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+57], acc101         // copy acc to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+58], acc105         // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+59], acc109         // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+60], acc113         // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+61], acc117         // copy acc to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+62], acc121         // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+63], acc125         // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+64], acc129         // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+65], acc133         // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+66], acc137         // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+67], acc141         // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+68], acc145         // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+69], acc149         // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+70], acc153         // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+71], acc157         // copy acc to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+72], acc161         // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+73], acc165         // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+74], acc169         // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+75], acc173         // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+76], acc177         // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+77], acc181         // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+78], acc185         // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+79], acc189         // copy acc to vreg[111]

/* rC *= alpha batchElements=[(0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0)] */
/* Scales all 56 values in v[vgprValuC+24 .. +79] by alpha, two floats per packed multiply (28 instructions). */
/* op_sel_hi:[0,1,1] makes the upper lane of src0 read the low dword of the SGPR pair, so s[sgprAlpha] is */
/* applied to both elements and s[sgprAlpha+1] is not used as a multiplier (per the VOP3P op_sel rules). */
/* This needs only the accumulator copies above, so it runs before any wait on the C loads or LDS reads. */
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+72:vgprValuC+72+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+74:vgprValuC+74+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+76:vgprValuC+76+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] op_sel_hi:[0,1,1] // *= alpha (pk)
v_pk_mul_f32 v[vgprValuC+78:vgprValuC+78+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] op_sel_hi:[0,1,1] // *= alpha (pk)

/* apply mask, calc new C and issue writes */
/* Load three bf16 helper constants into v14..v16. */
/* NOTE(review): none of v14/v15/v16 is read by the instructions visible in this part of the batch, where */
/* packing is done by v_cvt_pk_bf16_f32. They may be leftovers of a software bf16 rounding path or inputs */
/* to the routine called through s[8:9] - TODO confirm before treating them as dead. */
v_mov_b32 v14, 0xffff0000                          // mask selecting the upper 16 bits of a dword (for packing two bfloat16 elements into 32 bits)
v_mov_b32 v15, 0x7fff0000                          // NaN bit pattern (fp32 NaN whose upper half is a bf16 NaN)
v_mov_b32 v16, 0x7fff                              // rounding bias for bfloat16

/* Beta-path epilogue for the first row of this batch: v[vgprValuC+24 .. +31], combined with the C data in v[20:23]. */
/* Steps: wait for C, bias and scaleAlpha -> multiply by the per-column scale vector v[88:95] -> add beta*C -> */
/* add bias v[80:87] into scratch v[4:11] -> call through s[8:9] -> copy v[4:11] back -> pack to bf16 -> */
/* advance SrdD by one row -> store. */
/* NOTE(review): literal v24.. destinations match the vgprValuC+24.. sources only if vgprValuC is 0 - TODO confirm. */
s_waitcnt lgkmcnt(0), vmcnt(6)                     // vmcnt(6) = 7 - 1 (beta): C load 1 of 7 done, 6 younger loads may stay in flight; lgkmcnt(0) = 4 - 2 (bias) - 2 (scaleAlphaVec): all four ds_reads done (interleaved)
v_pk_mul_f32 v[vgprValuC+24:vgprValuC+24+1], v[88:89], v[vgprValuC+24:vgprValuC+24+1] // *= ScaleAlphaVecVMulPK(88)(0)
v_pk_mul_f32 v[vgprValuC+26:vgprValuC+26+1], v[90:91], v[vgprValuC+26:vgprValuC+26+1] // *= ScaleAlphaVecVMulPK(88)(2)
v_pk_mul_f32 v[vgprValuC+28:vgprValuC+28+1], v[92:93], v[vgprValuC+28:vgprValuC+28+1] // *= ScaleAlphaVecVMulPK(88)(4)
v_pk_mul_f32 v[vgprValuC+30:vgprValuC+30+1], v[94:95], v[vgprValuC+30:vgprValuC+30+1] // *= ScaleAlphaVecVMulPK(88)(6)
v_cvt_f32_bf16 v4, v20 src0_sel:WORD_0             // cvt bf16 to f32 (low half of C dword 0); v4 is a temporary here
v_fmac_f32 v[vgprValuC+24], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v20 src0_sel:WORD_1             // cvt bf16 to f32 (high half of C dword 0)
v_fmac_f32 v[vgprValuC+25], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v21 src0_sel:WORD_0             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+26], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v21 src0_sel:WORD_1             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+27], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v22 src0_sel:WORD_0             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+28], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v22 src0_sel:WORD_1             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+29], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v23 src0_sel:WORD_0             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+30], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v23 src0_sel:WORD_1             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+31], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+24:vgprValuC+24+1] // C += bias, result placed in call scratch v[4:5]
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+26:vgprValuC+26+1] // C += bias, result placed in call scratch v[6:7]
v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+28:vgprValuC+28+1] // C += bias, result placed in call scratch v[8:9]
v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+30:vgprValuC+30+1] // C += bias, result placed in call scratch v[10:11]
s_swappc_b64 s[66:67], s[8:9]                      // call the routine whose address is in s[8:9]; return address saved in s[66:67] (presumably the activation function working in place on v[4:11] - TODO confirm)
v_mov_b64 v[24:25], v[4:5]                         // copy call results back from scratch
v_mov_b64 v[26:27], v[6:7]                         // copy call results back from scratch
v_mov_b64 v[28:29], v[8:9]                         // copy call results back from scratch
v_mov_b64 v[30:31], v[10:11]                       // copy call results back from scratch
v_cvt_pk_bf16_f32 v24, v[vgprValuC+24], v[vgprValuC+25] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v25, v[vgprValuC+26], v[vgprValuC+27] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v26, v[vgprValuC+28], v[vgprValuC+29] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v27, v[vgprValuC+30], v[vgprValuC+31] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE (stride << 1, i.e. 2 bytes per element)
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper), propagates the carry
buffer_store_dwordx4 v[24:27], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D (8 packed bf16 values, per-lane offset in v17)

/* Beta-path epilogue for one output row: v[vgprValuC+32 .. +39], combined with the C data in v[96:99]. */
/* Steps: wait for C -> multiply by the per-column scale vector v[88:95] -> add beta*C -> add bias v[80:87] */
/* into scratch v[4:11] -> call through s[8:9] -> copy v[4:11] back -> pack to bf16 -> advance SrdD by one row -> store. */
/* NOTE(review): literal v32.. destinations match the vgprValuC+32.. sources only if vgprValuC is 0 - TODO confirm. */
s_waitcnt vmcnt(6)                                 // need C load 2 of 7: 5 younger loads + 1 interleaved store may stay in flight => 6 (non-interleaved count would be 7 - 2 = 5); assumes stores count toward vmcnt and retire in issue order
v_pk_mul_f32 v[vgprValuC+32:vgprValuC+32+1], v[88:89], v[vgprValuC+32:vgprValuC+32+1] // *= ScaleAlphaVecVMulPK(88)(0)
v_pk_mul_f32 v[vgprValuC+34:vgprValuC+34+1], v[90:91], v[vgprValuC+34:vgprValuC+34+1] // *= ScaleAlphaVecVMulPK(88)(2)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[92:93], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(88)(4)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[94:95], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(88)(6)
v_cvt_f32_bf16 v4, v96 src0_sel:WORD_0             // cvt bf16 to f32 (low half of C dword 0); v4 is a temporary here
v_fmac_f32 v[vgprValuC+32], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v96 src0_sel:WORD_1             // cvt bf16 to f32 (high half of C dword 0)
v_fmac_f32 v[vgprValuC+33], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v97 src0_sel:WORD_0             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+34], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v97 src0_sel:WORD_1             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+35], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v98 src0_sel:WORD_0             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+36], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v98 src0_sel:WORD_1             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+37], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v99 src0_sel:WORD_0             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+38], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v99 src0_sel:WORD_1             // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+39], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+32:vgprValuC+32+1] // C += bias, result placed in call scratch v[4:5]
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+34:vgprValuC+34+1] // C += bias, result placed in call scratch v[6:7]
v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+36:vgprValuC+36+1] // C += bias, result placed in call scratch v[8:9]
v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+38:vgprValuC+38+1] // C += bias, result placed in call scratch v[10:11]
s_swappc_b64 s[66:67], s[8:9]                      // call the routine whose address is in s[8:9]; return address saved in s[66:67] (presumably the activation function working in place on v[4:11] - TODO confirm)
v_mov_b64 v[32:33], v[4:5]                         // copy call results back from scratch
v_mov_b64 v[34:35], v[6:7]                         // copy call results back from scratch
v_mov_b64 v[36:37], v[8:9]                         // copy call results back from scratch
v_mov_b64 v[38:39], v[10:11]                       // copy call results back from scratch
v_cvt_pk_bf16_f32 v32, v[vgprValuC+32], v[vgprValuC+33] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v33, v[vgprValuC+34], v[vgprValuC+35] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v34, v[vgprValuC+36], v[vgprValuC+37] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v35, v[vgprValuC+38], v[vgprValuC+39] // convert C to bf16 and Pack with neighbor
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE (stride << 1, i.e. 2 bytes per element)
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper), propagates the carry
buffer_store_dwordx4 v[32:35], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D (8 packed bf16 values, per-lane offset in v17)

/* ---------------------------------------------------------------------- */
/* Epilogue step for accumulator registers vgprValuC+40 .. vgprValuC+47.   */
/* Sequence: wait for outstanding vector-memory ops, scale the eight f32   */
/* values by v[88:95], add beta * C (C held as packed bf16 in v100..v103), */
/* add v[80:87] (bias) into v[4:11], call the routine at s[8:9], copy      */
/* v[4:11] back, pack to bf16, advance the D descriptor base by one row    */
/* and store 4 dwords.                                                     */
/* ---------------------------------------------------------------------- */
// Block until at most 6 vector-memory operations remain outstanding.
// NOTE(review): the encoded count is 6 while the generator comment below quotes
// vmcnt(4); presumably the quoted figure counts loads only and the encoded one
// also covers the interleaved stores -- confirm against the generator.
s_waitcnt vmcnt(6)                                 // vmcnt(4) = 7 - 3 (beta) (interleaved)
// Packed multiply, two f32 lanes per instruction: ValuC[40..47] *= v[88..95].
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[88:89], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(88)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[90:91], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAlphaVecVMulPK(88)(2)
v_pk_mul_f32 v[vgprValuC+44:vgprValuC+44+1], v[92:93], v[vgprValuC+44:vgprValuC+44+1] // *= ScaleAlphaVecVMulPK(88)(4)
v_pk_mul_f32 v[vgprValuC+46:vgprValuC+46+1], v[94:95], v[vgprValuC+46:vgprValuC+46+1] // *= ScaleAlphaVecVMulPK(88)(6)
// Each of v100..v103 carries two bf16 values (WORD_0 = low half, WORD_1 = high
// half). Every half is widened to f32 in scratch v4 and then accumulated as
// ValuC[n] += v4 * Beta. v4 is reused for every element, so each cvt/fmac pair
// must stay adjacent and in this order.
v_cvt_f32_bf16 v4, v100 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+40], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v100 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+41], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v101 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+42], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v101 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+43], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v102 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+44], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v102 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+45], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v103 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+46], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v103 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+47], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
// Add v[80:87] to the eight results and place the sums in v[4:11], the
// registers that are live across the call below.
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+40:vgprValuC+40+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+42:vgprValuC+42+1] // C += bias
v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+44:vgprValuC+44+1] // C += bias
v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+46:vgprValuC+46+1] // C += bias
// Jump to the address held in s[8:9]; the return PC is written to s[66:67].
// Presumably this is the activation routine, operating in place on v[4:11] --
// the target is not visible in this chunk, confirm.
s_swappc_b64 s[66:67], s[8:9]
// Copy the eight values in v[4:11] back into v[40:47].
// NOTE(review): these destinations are absolute register numbers whereas the
// conversions below read v[vgprValuC+40..47]; the two agree only if vgprValuC
// is 0 -- presumably so, confirm against the symbol definition.
v_mov_b64 v[40:41], v[4:5]
v_mov_b64 v[42:43], v[6:7]
v_mov_b64 v[44:45], v[8:9]
v_mov_b64 v[46:47], v[10:11]
// Pack pairs of f32 into one dword of two bf16 values. Each destination is
// only overwritten at or after the point where its old f32 content is read,
// so the ascending order of these four instructions matters.
v_cvt_pk_bf16_f32 v40, v[vgprValuC+40], v[vgprValuC+41] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v41, v[vgprValuC+42], v[vgprValuC+43] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v42, v[vgprValuC+44], v[vgprValuC+45] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v43, v[vgprValuC+46], v[vgprValuC+47] // convert C to bf16 and Pack with neighbor
// Advance the 64-bit base in SrdD[0:1] by StrideD1J << 1 bytes; the s_addc
// propagates the carry from the low-word add into the high word.
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
// Write the four packed dwords v[40:43] through descriptor SrdD, using v17 as
// the per-lane offset (offen), with the nt modifier set.
buffer_store_dwordx4 v[40:43], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D

/* ---------------------------------------------------------------------- */
/* Epilogue step for accumulator registers vgprValuC+48 .. vgprValuC+55.   */
/* Sequence: wait for outstanding vector-memory ops, scale the eight f32   */
/* values by v[88:95], add beta * C (C held as packed bf16 in v104..v107), */
/* add v[80:87] (bias) into v[4:11], call the routine at s[8:9], copy      */
/* v[4:11] back, pack to bf16, advance the D descriptor base by one row    */
/* and store 4 dwords.                                                     */
/* ---------------------------------------------------------------------- */
// Block until at most 6 vector-memory operations remain outstanding.
// NOTE(review): the encoded count is 6 while the generator comment below quotes
// vmcnt(3); presumably the quoted figure counts loads only and the encoded one
// also covers the interleaved stores -- confirm against the generator.
s_waitcnt vmcnt(6)                                 // vmcnt(3) = 7 - 4 (beta) (interleaved)
// Packed multiply, two f32 lanes per instruction: ValuC[48..55] *= v[88..95].
v_pk_mul_f32 v[vgprValuC+48:vgprValuC+48+1], v[88:89], v[vgprValuC+48:vgprValuC+48+1] // *= ScaleAlphaVecVMulPK(88)(0)
v_pk_mul_f32 v[vgprValuC+50:vgprValuC+50+1], v[90:91], v[vgprValuC+50:vgprValuC+50+1] // *= ScaleAlphaVecVMulPK(88)(2)
v_pk_mul_f32 v[vgprValuC+52:vgprValuC+52+1], v[92:93], v[vgprValuC+52:vgprValuC+52+1] // *= ScaleAlphaVecVMulPK(88)(4)
v_pk_mul_f32 v[vgprValuC+54:vgprValuC+54+1], v[94:95], v[vgprValuC+54:vgprValuC+54+1] // *= ScaleAlphaVecVMulPK(88)(6)
// Each of v104..v107 carries two bf16 values (WORD_0 = low half, WORD_1 = high
// half). Every half is widened to f32 in scratch v4 and then accumulated as
// ValuC[n] += v4 * Beta. v4 is reused for every element, so each cvt/fmac pair
// must stay adjacent and in this order.
v_cvt_f32_bf16 v4, v104 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+48], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v104 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+49], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v105 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+50], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v105 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+51], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v106 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+52], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v106 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+53], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v107 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+54], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v107 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+55], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
// Add v[80:87] to the eight results and place the sums in v[4:11], the
// registers that are live across the call below.
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+48:vgprValuC+48+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+50:vgprValuC+50+1] // C += bias
v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+52:vgprValuC+52+1] // C += bias
v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+54:vgprValuC+54+1] // C += bias
// Jump to the address held in s[8:9]; the return PC is written to s[66:67].
// Presumably this is the activation routine, operating in place on v[4:11] --
// the target is not visible in this chunk, confirm.
s_swappc_b64 s[66:67], s[8:9]
// Copy the eight values in v[4:11] back into v[48:55].
// NOTE(review): these destinations are absolute register numbers whereas the
// conversions below read v[vgprValuC+48..55]; the two agree only if vgprValuC
// is 0 -- presumably so, confirm against the symbol definition.
v_mov_b64 v[48:49], v[4:5]
v_mov_b64 v[50:51], v[6:7]
v_mov_b64 v[52:53], v[8:9]
v_mov_b64 v[54:55], v[10:11]
// Pack pairs of f32 into one dword of two bf16 values. Each destination is
// only overwritten at or after the point where its old f32 content is read,
// so the ascending order of these four instructions matters.
v_cvt_pk_bf16_f32 v48, v[vgprValuC+48], v[vgprValuC+49] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v49, v[vgprValuC+50], v[vgprValuC+51] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v50, v[vgprValuC+52], v[vgprValuC+53] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v51, v[vgprValuC+54], v[vgprValuC+55] // convert C to bf16 and Pack with neighbor
// Advance the 64-bit base in SrdD[0:1] by StrideD1J << 1 bytes; the s_addc
// propagates the carry from the low-word add into the high word.
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
// Write the four packed dwords v[48:51] through descriptor SrdD, using v17 as
// the per-lane offset (offen), with the nt modifier set.
buffer_store_dwordx4 v[48:51], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D

/* ---------------------------------------------------------------------- */
/* Epilogue step for accumulator registers vgprValuC+56 .. vgprValuC+63.   */
/* Sequence: wait for outstanding vector-memory ops, scale the eight f32   */
/* values by v[88:95], add beta * C (C held as packed bf16 in v108..v111), */
/* add v[80:87] (bias) into v[4:11], call the routine at s[8:9], copy      */
/* v[4:11] back, pack to bf16, advance the D descriptor base by one row    */
/* and store 4 dwords.                                                     */
/* ---------------------------------------------------------------------- */
// Block until at most 6 vector-memory operations remain outstanding.
// NOTE(review): the encoded count is 6 while the generator comment below quotes
// vmcnt(2); presumably the quoted figure counts loads only and the encoded one
// also covers the interleaved stores -- confirm against the generator.
s_waitcnt vmcnt(6)                                 // vmcnt(2) = 7 - 5 (beta) (interleaved)
// Packed multiply, two f32 lanes per instruction: ValuC[56..63] *= v[88..95].
v_pk_mul_f32 v[vgprValuC+56:vgprValuC+56+1], v[88:89], v[vgprValuC+56:vgprValuC+56+1] // *= ScaleAlphaVecVMulPK(88)(0)
v_pk_mul_f32 v[vgprValuC+58:vgprValuC+58+1], v[90:91], v[vgprValuC+58:vgprValuC+58+1] // *= ScaleAlphaVecVMulPK(88)(2)
v_pk_mul_f32 v[vgprValuC+60:vgprValuC+60+1], v[92:93], v[vgprValuC+60:vgprValuC+60+1] // *= ScaleAlphaVecVMulPK(88)(4)
v_pk_mul_f32 v[vgprValuC+62:vgprValuC+62+1], v[94:95], v[vgprValuC+62:vgprValuC+62+1] // *= ScaleAlphaVecVMulPK(88)(6)
// Each of v108..v111 carries two bf16 values (WORD_0 = low half, WORD_1 = high
// half). Every half is widened to f32 in scratch v4 and then accumulated as
// ValuC[n] += v4 * Beta. v4 is reused for every element, so each cvt/fmac pair
// must stay adjacent and in this order.
v_cvt_f32_bf16 v4, v108 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+56], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v108 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+57], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v109 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+58], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v109 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+59], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v110 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+60], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v110 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+61], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v111 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+62], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v111 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+63], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
// Add v[80:87] to the eight results and place the sums in v[4:11], the
// registers that are live across the call below.
v_pk_add_f32 v[4:5], v[80:81], v[vgprValuC+56:vgprValuC+56+1] // C += bias
v_pk_add_f32 v[6:7], v[82:83], v[vgprValuC+58:vgprValuC+58+1] // C += bias
v_pk_add_f32 v[8:9], v[84:85], v[vgprValuC+60:vgprValuC+60+1] // C += bias
v_pk_add_f32 v[10:11], v[86:87], v[vgprValuC+62:vgprValuC+62+1] // C += bias
// Jump to the address held in s[8:9]; the return PC is written to s[66:67].
// Presumably this is the activation routine, operating in place on v[4:11] --
// the target is not visible in this chunk, confirm.
s_swappc_b64 s[66:67], s[8:9]
// Copy the eight values in v[4:11] back into v[56:63].
// NOTE(review): these destinations are absolute register numbers whereas the
// conversions below read v[vgprValuC+56..63]; the two agree only if vgprValuC
// is 0 -- presumably so, confirm against the symbol definition.
v_mov_b64 v[56:57], v[4:5]
v_mov_b64 v[58:59], v[6:7]
v_mov_b64 v[60:61], v[8:9]
v_mov_b64 v[62:63], v[10:11]
// Pack pairs of f32 into one dword of two bf16 values. Each destination is
// only overwritten at or after the point where its old f32 content is read,
// so the ascending order of these four instructions matters.
v_cvt_pk_bf16_f32 v56, v[vgprValuC+56], v[vgprValuC+57] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v57, v[vgprValuC+58], v[vgprValuC+59] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v58, v[vgprValuC+60], v[vgprValuC+61] // convert C to bf16 and Pack with neighbor
v_cvt_pk_bf16_f32 v59, v[vgprValuC+62], v[vgprValuC+63] // convert C to bf16 and Pack with neighbor
// Advance the 64-bit base in SrdD[0:1] by StrideD1J << 1 bytes; the s_addc
// propagates the carry from the low-word add into the high word.
s_lshl_b32 s68, s[sgprStrideD1J], 1                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
// Write the four packed dwords v[56:59] through descriptor SrdD, using v17 as
// the per-lane offset (offen), with the nt modifier set.
buffer_store_dwordx4 v[56:59], v17, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 nt // store D

s_waitcnt vmcnt(6)                                 // vmcnt(1) = 7 - 6 (beta) (interleaved)
v_pk_mul_f32 v[vgprValuC+64:vgprValuC+64+1], v[88:89], v[vgprValuC+64:vgprValuC+64+1] // *= ScaleAlphaVecVMulPK(88)(0)
v_pk_mul_f32 v[vgprValuC+66:vgprValuC+66+1], v[90:91], v[vgprValuC+66:vgprValuC+66+1] // *= ScaleAlphaVecVMulPK(88)(2)
v_pk_mul_f32 v[vgprValuC+68:vgprValuC+68+1], v[92:93], v[vgprValuC+68:vgprValuC+68+1] // *= ScaleAlphaVecVMulPK(88)(4)
v_pk_mul_f32 v[vgprValuC+70:vgprValuC+70+1], v[94:95], v[vgprValuC+70:vgprValuC+70+1] // *= ScaleAlphaVecVMulPK(88)(6)
v_cvt_f32_bf16 v4, v112 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+64], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v112 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+65], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v113 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+66], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v113 src0_sel:WORD_1            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+67], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v114 src0_sel:WORD_0            // cvt bf16 to f32
v_fmac_f32 v[vgprValuC+68], v4, s[sgprBeta]        // finalSum = sum*alpha + C*beta
v_cvt_f32_bf16 v4, v114 src0_sel:WORD_1            // cvt bf16 to f32
v