; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck %s
; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s

; Natural mapping
define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_i32
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 4)
  ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32)
  ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
  ; CHECK: $sgpr0 = COPY [[INT]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0
  %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret i32 %val
}

define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_v2i32
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 8, align 4)
  ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>)
  ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
  ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
  ; CHECK: $sgpr0 = COPY [[INT]](s32)
  ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
  ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
  ; CHECK: $sgpr1 = COPY [[INT1]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
  %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <2 x i32> %val
}

define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_v3i32
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 12, align 4)
  ; CHECK: [[EXTRACT:%[0-9]+]]:sgpr(<3 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), 0
  ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s32>)
  ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
  ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
  ; CHECK: $sgpr0 = COPY [[INT]](s32)
  ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
  ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
  ; CHECK: $sgpr1 = COPY [[INT1]](s32)
  ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
  ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
  ; CHECK: $sgpr2 = COPY [[INT2]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2
  %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <3 x i32> %val
}

define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_v8i32
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 32, align 4)
  ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>)
  ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
  ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
  ; CHECK: $sgpr0 = COPY [[INT]](s32)
  ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
  ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
  ; CHECK: $sgpr1 = COPY [[INT1]](s32)
  ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
  ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
  ; CHECK: $sgpr2 = COPY [[INT2]](s32)
  ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
  ; CHECK: [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32)
  ; CHECK: $sgpr3 = COPY [[INT3]](s32)
  ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
  ; CHECK: [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
  ; CHECK: $sgpr4 = COPY [[INT4]](s32)
  ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
  ; CHECK: [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32)
  ; CHECK: $sgpr5 = COPY [[INT5]](s32)
  ; CHECK: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
  ; CHECK: [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32)
  ; CHECK: $sgpr6 = COPY [[INT6]](s32)
  ; CHECK: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32)
  ; CHECK: [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
  ; CHECK: $sgpr7 = COPY [[INT7]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7
  %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <8 x i32> %val
}

define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_v16i32
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 64, align 4)
  ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>)
  ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
  ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
  ; CHECK: $sgpr0 = COPY [[INT]](s32)
  ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
  ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
  ; CHECK: $sgpr1 = COPY [[INT1]](s32)
  ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
  ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
  ; CHECK: $sgpr2 = COPY [[INT2]](s32)
  ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
  ; CHECK: [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32)
  ; CHECK: $sgpr3 = COPY [[INT3]](s32)
  ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
  ; CHECK: [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
  ; CHECK: $sgpr4 = COPY [[INT4]](s32)
  ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
  ; CHECK: [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32)
  ; CHECK: $sgpr5 = COPY [[INT5]](s32)
  ; CHECK: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
  ; CHECK: [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32)
  ; CHECK: $sgpr6 = COPY [[INT6]](s32)
  ; CHECK: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32)
  ; CHECK: [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
  ; CHECK: $sgpr7 = COPY [[INT7]](s32)
  ; CHECK: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[UV8]](s32)
  ; CHECK: [[INT8:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY13]](s32)
  ; CHECK: $sgpr8 = COPY [[INT8]](s32)
  ; CHECK: [[COPY14:%[0-9]+]]:vgpr(s32) = COPY [[UV9]](s32)
  ; CHECK: [[INT9:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY14]](s32)
  ; CHECK: $sgpr9 = COPY [[INT9]](s32)
  ; CHECK: [[COPY15:%[0-9]+]]:vgpr(s32) = COPY [[UV10]](s32)
  ; CHECK: [[INT10:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY15]](s32)
  ; CHECK: $sgpr10 = COPY [[INT10]](s32)
  ; CHECK: [[COPY16:%[0-9]+]]:vgpr(s32) = COPY [[UV11]](s32)
  ; CHECK: [[INT11:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY16]](s32)
  ; CHECK: $sgpr11 = COPY [[INT11]](s32)
  ; CHECK: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[UV12]](s32)
  ; CHECK: [[INT12:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY17]](s32)
  ; CHECK: $sgpr12 = COPY [[INT12]](s32)
  ; CHECK: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[UV13]](s32)
  ; CHECK: [[INT13:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY18]](s32)
  ; CHECK: $sgpr13 = COPY [[INT13]](s32)
  ; CHECK: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[UV14]](s32)
  ; CHECK: [[INT14:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY19]](s32)
  ; CHECK: $sgpr14 = COPY [[INT14]](s32)
  ; CHECK: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[UV15]](s32)
  ; CHECK: [[INT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY20]](s32)
  ; CHECK: $sgpr15 = COPY [[INT15]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15
  %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <16 x i32> %val
}

; Check cases that need to be converted to MUBUF due to the offset being a VGPR.
define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_offset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 4)
  ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret float %val
}

define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_v2f32_vgpr_offset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 8, align 4)
  ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<2 x s32>)
  ; CHECK: $vgpr0 = COPY [[UV]](s32)
  ; CHECK: $vgpr1 = COPY [[UV1]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
  %val = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <2 x float> %val
}

define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_v3f32_vgpr_offset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[EXTRACT:%[0-9]+]]:vgpr(<3 x s32>) = G_EXTRACT [[AMDGPU_BUFFER_LOAD]](<4 x s32>), 0
  ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s32>)
  ; CHECK: $vgpr0 = COPY [[UV]](s32)
  ; CHECK: $vgpr1 = COPY [[UV1]](s32)
  ; CHECK: $vgpr2 = COPY [[UV2]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
  %val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <3 x float> %val
}

define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_v4f32_vgpr_offset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>)
  ; CHECK: $vgpr0 = COPY [[UV]](s32)
  ; CHECK: $vgpr1 = COPY [[UV1]](s32)
  ; CHECK: $vgpr2 = COPY [[UV2]](s32)
  ; CHECK: $vgpr3 = COPY [[UV3]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
  %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <4 x float> %val
}

define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
  ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
  ; CHECK: $vgpr0 = COPY [[UV]](s32)
  ; CHECK: $vgpr1 = COPY [[UV1]](s32)
  ; CHECK: $vgpr2 = COPY [[UV2]](s32)
  ; CHECK: $vgpr3 = COPY [[UV3]](s32)
  ; CHECK: $vgpr4 = COPY [[UV4]](s32)
  ; CHECK: $vgpr5 = COPY [[UV5]](s32)
  ; CHECK: $vgpr6 = COPY [[UV6]](s32)
  ; CHECK: $vgpr7 = COPY [[UV7]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
  %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <8 x float> %val
}

define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_v16f32_vgpr_offset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
  ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
  ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
  ; CHECK: $vgpr0 = COPY [[UV]](s32)
  ; CHECK: $vgpr1 = COPY [[UV1]](s32)
  ; CHECK: $vgpr2 = COPY [[UV2]](s32)
  ; CHECK: $vgpr3 = COPY [[UV3]](s32)
  ; CHECK: $vgpr4 = COPY [[UV4]](s32)
  ; CHECK: $vgpr5 = COPY [[UV5]](s32)
  ; CHECK: $vgpr6 = COPY [[UV6]](s32)
  ; CHECK: $vgpr7 = COPY [[UV7]](s32)
  ; CHECK: $vgpr8 = COPY [[UV8]](s32)
  ; CHECK: $vgpr9 = COPY [[UV9]](s32)
  ; CHECK: $vgpr10 = COPY [[UV10]](s32)
  ; CHECK: $vgpr11 = COPY [[UV11]](s32)
  ; CHECK: $vgpr12 = COPY [[UV12]](s32)
  ; CHECK: $vgpr13 = COPY [[UV13]](s32)
  ; CHECK: $vgpr14 = COPY [[UV14]](s32)
  ; CHECK: $vgpr15 = COPY [[UV15]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
  %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <16 x float> %val
}

define amdgpu_ps void @s_buffer_load_i96_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_i96_vgpr_offset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[TRUNC:%[0-9]+]]:vgpr(s96) = G_TRUNC [[AMDGPU_BUFFER_LOAD]](s128)
  ; CHECK: G_STORE [[TRUNC]](s96), [[DEF]](p1) :: (store 12 into `i96 addrspace(1)* undef`, align 8, addrspace 1)
  ; CHECK: S_ENDPGM 0
  %val = call i96 @llvm.amdgcn.s.buffer.load.i96(<4 x i32> %rsrc, i32 %soffset, i32 0)
  store i96 %val, i96 addrspace(1)* undef
  ret void
}

; Test split of a wide scalar
define amdgpu_ps void @s_buffer_load_i256_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_i256_vgpr_offset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128)
  ; CHECK: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s256)
  ; CHECK: G_STORE [[UV]](s128), [[DEF]](p1) :: (store 16 into `i256 addrspace(1)* undef`, align 8, addrspace 1)
  ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
  ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
  ; CHECK: G_STORE [[UV1]](s128), [[PTR_ADD]](p1) :: (store 16 into `i256 addrspace(1)* undef` + 16, align 8, addrspace 1)
  ; CHECK: S_ENDPGM 0
  %val = call i256 @llvm.amdgcn.s.buffer.load.i256(<4 x i32> %rsrc, i32 %soffset, i32 0)
  store i256 %val, i256 addrspace(1)* undef
  ret void
}

; Test split of a wide scalar
define amdgpu_ps void @s_buffer_load_i512_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_i512_vgpr_offset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
  ; CHECK: [[MV:%[0-9]+]]:vgpr(s512) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128), [[AMDGPU_BUFFER_LOAD2]](s128), [[AMDGPU_BUFFER_LOAD3]](s128)
  ; CHECK: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128), [[UV2:%[0-9]+]]:vgpr(s128), [[UV3:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s512)
  ; CHECK: G_STORE [[UV]](s128), [[DEF]](p1) :: (store 16 into `i512 addrspace(1)* undef`, align 8, addrspace 1)
  ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
  ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
  ; CHECK: G_STORE [[UV1]](s128), [[PTR_ADD]](p1) :: (store 16 into `i512 addrspace(1)* undef` + 16, align 8, addrspace 1)
  ; CHECK: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
  ; CHECK: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
  ; CHECK: G_STORE [[UV2]](s128), [[PTR_ADD1]](p1) :: (store 16 into `i512 addrspace(1)* undef` + 32, align 8, addrspace 1)
  ; CHECK: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
  ; CHECK: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
  ; CHECK: G_STORE [[UV3]](s128), [[PTR_ADD2]](p1) :: (store 16 into `i512 addrspace(1)* undef` + 48, align 8, addrspace 1)
  ; CHECK: S_ENDPGM 0
  %val = call i512 @llvm.amdgcn.s.buffer.load.i512(<4 x i32> %rsrc, i32 %soffset, i32 0)
  store i512 %val, i512 addrspace(1)* undef
  ret void
}

; Test split of a vector with 16-bit elements
define amdgpu_ps void @s_buffer_load_v16i16_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_v16i16_vgpr_offset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>)
  ; CHECK: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>)
  ; CHECK: G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store 16 into `<16 x i16> addrspace(1)* undef`, align 32, addrspace 1)
  ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
  ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
  ; CHECK: G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store 16 into `<16 x i16> addrspace(1)* undef` + 16, align 32, addrspace 1)
  ; CHECK: S_ENDPGM 0
  %val = call <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32> %rsrc, i32 %soffset, i32 0)
  store <16 x i16> %val, <16 x i16> addrspace(1)* undef
  ret void
}

; Test split of a vector with 16-bit elements
define amdgpu_ps void @s_buffer_load_v32i16_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_v32i16_vgpr_offset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
  ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<32 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>), [[AMDGPU_BUFFER_LOAD2]](<8 x s16>), [[AMDGPU_BUFFER_LOAD3]](<8 x s16>)
  ; CHECK: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>), [[UV2:%[0-9]+]]:vgpr(<8 x s16>), [[UV3:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<32 x s16>)
  ; CHECK: G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store 16 into `<32 x i16> addrspace(1)* undef`, align 64, addrspace 1)
  ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
  ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
  ; CHECK: G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store 16 into `<32 x i16> addrspace(1)* undef` + 16, align 64, addrspace 1)
  ; CHECK: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
  ; CHECK: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
  ; CHECK: G_STORE [[UV2]](<8 x s16>), [[PTR_ADD1]](p1) :: (store 16 into `<32 x i16> addrspace(1)* undef` + 32, align 64, addrspace 1)
  ; CHECK: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
  ; CHECK: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
  ; CHECK: G_STORE [[UV3]](<8 x s16>), [[PTR_ADD2]](p1) :: (store 16 into `<32 x i16> addrspace(1)* undef` + 48, align 64, addrspace 1)
  ; CHECK: S_ENDPGM 0
  %val = call <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32> %rsrc, i32 %soffset, i32 0)
  store <32 x i16> %val, <32 x i16> addrspace(1)* undef
  ret void
}

; Test split of a vector with 64-bit elements
define amdgpu_ps void @s_buffer_load_v4i64_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_v4i64_vgpr_offset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>)
  ; CHECK: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>)
  ; CHECK: G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store 16 into `<4 x i64> addrspace(1)* undef`, align 32, addrspace 1)
  ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
  ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
  ; CHECK: G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store 16 into `<4 x i64> addrspace(1)* undef` + 16, align 32, addrspace 1)
  ; CHECK: S_ENDPGM 0
  %val = call <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32> %rsrc, i32 %soffset, i32 0)
  store <4 x i64> %val, <4 x i64> addrspace(1)* undef
  ret void
}

; Test split of a vector with 64-bit elements
define amdgpu_ps void @s_buffer_load_v8i64_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_v8i64_vgpr_offset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
  ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>), [[AMDGPU_BUFFER_LOAD2]](<2 x s64>), [[AMDGPU_BUFFER_LOAD3]](<2 x s64>)
  ; CHECK: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>), [[UV2:%[0-9]+]]:vgpr(<2 x s64>), [[UV3:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>)
  ; CHECK: G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store 16 into `<8 x i64> addrspace(1)* undef`, align 64, addrspace 1)
  ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
  ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
  ; CHECK: G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store 16 into `<8 x i64> addrspace(1)* undef` + 16, align 64, addrspace 1)
  ; CHECK: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
  ; CHECK: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
  ; CHECK: G_STORE [[UV2]](<2 x s64>), [[PTR_ADD1]](p1) :: (store 16 into `<8 x i64> addrspace(1)* undef` + 32, align 64, addrspace 1)
  ; CHECK: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
  ; CHECK: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
  ; CHECK: G_STORE [[UV3]](<2 x s64>), [[PTR_ADD2]](p1) :: (store 16 into `<8 x i64> addrspace(1)* undef` + 48, align 64, addrspace 1)
  ; CHECK: S_ENDPGM 0
  %val = call <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32> %rsrc, i32 %soffset, i32 0)
  store <8 x i64> %val, <8 x i64> addrspace(1)* undef
  ret void
}

; Test split of a vector with 64-bit pointer elements
define amdgpu_ps void @s_buffer_load_v4p1_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_v4p1_vgpr_offset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>)
  ; CHECK: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x p1>)
  ; CHECK: G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store 16 into `<4 x i8 addrspace(1)*> addrspace(1)* undef`, align 32, addrspace 1)
  ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
  ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
  ; CHECK: G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store 16 into `<4 x i8 addrspace(1)*> addrspace(1)* undef` + 16, align 32, addrspace 1)
  ; CHECK: S_ENDPGM 0
  %val = call <4 x i8 addrspace(1)*> @llvm.amdgcn.s.buffer.load.v4p1i8(<4 x i32> %rsrc, i32 %soffset, i32 0)
  store <4 x i8 addrspace(1)*> %val, <4 x i8 addrspace(1)*> addrspace(1)* undef
  ret void
}

; Test split of a vector with 64-bit pointer elements
define amdgpu_ps void @s_buffer_load_v8p1_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_v8p1_vgpr_offset
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
  ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>), [[AMDGPU_BUFFER_LOAD2]](<2 x p1>), [[AMDGPU_BUFFER_LOAD3]](<2 x p1>)
  ; CHECK: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>), [[UV2:%[0-9]+]]:vgpr(<2 x p1>), [[UV3:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x p1>)
  ; CHECK: G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store 16 into `<8 x i8 addrspace(1)*> addrspace(1)* undef`, align 64, addrspace 1)
  ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
  ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
  ; CHECK: G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store 16 into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 16, align 64, addrspace 1)
  ; CHECK: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
  ; CHECK: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
  ; CHECK: G_STORE [[UV2]](<2 x p1>), [[PTR_ADD1]](p1) :: (store 16 into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 32, align 64, addrspace 1)
  ; CHECK: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
  ; CHECK: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
  ; CHECK: G_STORE [[UV3]](<2 x p1>), [[PTR_ADD2]](p1) :: (store 16 into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 48, align 64, addrspace 1)
  ; CHECK: S_ENDPGM 0
  %val = call <8 x i8 addrspace(1)*> @llvm.amdgcn.s.buffer.load.v8p1i8(<4 x i32> %rsrc, i32 %soffset, i32 0)
  store <8 x i8 addrspace(1)*> %val, <8 x i8 addrspace(1)*> addrspace(1)* undef
  ret void
}

define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg %rsrc, i32 %soffset.base) {
  ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092
  ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
  ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
  ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load 4)
  ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
  %soffset = add i32 %soffset.base, 4092
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret float %val
}

define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg %rsrc, i32 %soffset.base) {
  ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
  ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
  ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
  ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load 4)
  ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
  %soffset = add i32 %soffset.base, 4095
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret float %val
}

define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg %rsrc, i32 %soffset.base) {
  ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
  ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
  ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 4)
  ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
  %soffset = add i32 %soffset.base, 4096
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret float %val
}

; Make sure the base offset is added to each split load.
define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32> inreg %rsrc, i32 %soffset.base) {
  ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4064
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
  ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
  ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
  ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
  ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
  ; CHECK: $vgpr0 = COPY [[UV]](s32)
  ; CHECK: $vgpr1 = COPY [[UV1]](s32)
  ; CHECK: $vgpr2 = COPY [[UV2]](s32)
  ; CHECK: $vgpr3 = COPY [[UV3]](s32)
  ; CHECK: $vgpr4 = COPY [[UV4]](s32)
  ; CHECK: $vgpr5 = COPY [[UV5]](s32)
  ; CHECK: $vgpr6 = COPY [[UV6]](s32)
  ; CHECK: $vgpr7 = COPY [[UV7]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
  %soffset = add i32 %soffset.base, 4064
  %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <8 x float> %val
}

; Make sure the maximum offset isn't exceeded when splitting this.
define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32> inreg %rsrc, i32 %soffset.base) {
  ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4068
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068
  ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
  ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
  ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
  ; CHECK: $vgpr0 = COPY [[UV]](s32)
  ; CHECK: $vgpr1 = COPY [[UV1]](s32)
  ; CHECK: $vgpr2 = COPY [[UV2]](s32)
  ; CHECK: $vgpr3 = COPY [[UV3]](s32)
  ; CHECK: $vgpr4 = COPY [[UV4]](s32)
  ; CHECK: $vgpr5 = COPY [[UV5]](s32)
  ; CHECK: $vgpr6 = COPY [[UV6]](s32)
  ; CHECK: $vgpr7 = COPY [[UV7]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
  %soffset = add i32 %soffset.base, 4068
  %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <8 x float> %val
}

define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i32> inreg %rsrc, i32 %soffset.base) {
  ; CHECK-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4032
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4032
  ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
  ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
  ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4048, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
  ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
  ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
  ; CHECK: $vgpr0 = COPY [[UV]](s32)
  ; CHECK: $vgpr1 = COPY [[UV1]](s32)
  ; CHECK: $vgpr2 = COPY [[UV2]](s32)
  ; CHECK: $vgpr3 = COPY [[UV3]](s32)
  ; CHECK: $vgpr4 = COPY [[UV4]](s32)
  ; CHECK: $vgpr5 = COPY [[UV5]](s32)
  ; CHECK: $vgpr6 = COPY [[UV6]](s32)
  ; CHECK: $vgpr7 = COPY [[UV7]](s32)
  ; CHECK: $vgpr8 = COPY [[UV8]](s32)
  ; CHECK: $vgpr9 = COPY [[UV9]](s32)
  ; CHECK: $vgpr10 = COPY [[UV10]](s32)
  ; CHECK: $vgpr11 = COPY [[UV11]](s32)
  ; CHECK: $vgpr12 = COPY [[UV12]](s32)
  ; CHECK: $vgpr13 = COPY [[UV13]](s32)
  ; CHECK: $vgpr14 = COPY [[UV14]](s32)
  ; CHECK: $vgpr15 = COPY [[UV15]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
  %soffset = add i32 %soffset.base, 4032
  %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <16 x float> %val
}

define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i32> inreg %rsrc, i32 %soffset.base) {
  ; CHECK-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4036
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4036
  ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
  ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
  ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
  ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
  ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
  ; CHECK: $vgpr0 = COPY [[UV]](s32)
  ; CHECK: $vgpr1 = COPY [[UV1]](s32)
  ; CHECK: $vgpr2 = COPY [[UV2]](s32)
  ; CHECK: $vgpr3 = COPY [[UV3]](s32)
  ; CHECK: $vgpr4 = COPY [[UV4]](s32)
  ; CHECK: $vgpr5 = COPY [[UV5]](s32)
  ; CHECK: $vgpr6 = COPY [[UV6]](s32)
  ; CHECK: $vgpr7 = COPY [[UV7]](s32)
  ; CHECK: $vgpr8 = COPY [[UV8]](s32)
  ; CHECK: $vgpr9 = COPY [[UV9]](s32)
  ; CHECK: $vgpr10 = COPY [[UV10]](s32)
  ; CHECK: $vgpr11 = COPY [[UV11]](s32)
  ; CHECK: $vgpr12 = COPY [[UV12]](s32)
  ; CHECK: $vgpr13 = COPY [[UV13]](s32)
  ; CHECK: $vgpr14 = COPY [[UV14]](s32)
  ; CHECK: $vgpr15 = COPY [[UV15]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
  %soffset = add i32 %soffset.base, 4036
  %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <16 x float> %val
}

; Waterfall loop due to resource being VGPR
define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %soffset) {
  ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
  ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
  ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
  ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
  ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32)
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
  ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
  ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
  ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
  ; CHECK: bb.2:
  ; CHECK: successors: %bb.3, %bb.2
  ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.2
  ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %8(s32), %bb.2
  ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
  ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
  ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
  ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
  ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32
[[UV1]].sub0(s64), implicit $exec 861 ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec 862 ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 863 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec 864 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc 865 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 866 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 4) 867 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec 868 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc 869 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec 870 ; CHECK: bb.3: 871 ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] 872 ; CHECK: bb.4: 873 ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) 874 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 875 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) 876 ret float %val 877} 878 879; Use the offset inside the waterfall loop 880define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %rsrc, i32 inreg %soffset.base) { 881 ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4092 882 ; CHECK: bb.1 (%ir-block.0): 883 ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 884 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 885 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 886 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 887 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 888 ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 889 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) 890 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092 891 ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] 892 ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 893 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 894 ; CHECK: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF 895 ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF 896 ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) 897 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec 898 ; CHECK: bb.2: 899 ; CHECK: successors: %bb.3, %bb.2 900 ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %18, %bb.2 901 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %10(s32), %bb.2 902 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec 903 ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec 904 ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) 905 ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec 906 ; CHECK: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec 907 ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec 908 ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 909 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec 910 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc 911 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 912 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load 4) 913 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec 914 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc 915 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec 916 ; CHECK: bb.3: 917 ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] 918 ; CHECK: bb.4: 919 ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) 920 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 921 %soffset = add i32 %soffset.base, 4092 922 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) 923 ret float %val 924} 925 926; Scalar offset exceeds MUBUF limit, keep add out of the loop 927define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %rsrc, i32 inreg %soffset.base) { 928 ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4096 929 ; CHECK: bb.1 (%ir-block.0): 930 ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 931 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 932 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 933 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 934 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 935 ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 936 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) 937 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 938 ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] 939 ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) 940 ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 941 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 942 ; CHECK: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF 943 ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF 944 ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) 945 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec 946 ; CHECK: bb.2: 947 ; CHECK: successors: %bb.3, %bb.2 948 ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %19, %bb.2 949 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %10(s32), %bb.2 950 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec 951 ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec 952 ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), 
[[V_READFIRSTLANE_B32_1]](s32) 953 ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec 954 ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec 955 ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec 956 ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 957 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec 958 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc 959 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 960 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load 4) 961 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec 962 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc 963 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec 964 ; CHECK: bb.3: 965 ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] 966 ; CHECK: bb.4: 967 ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) 968 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 969 %soffset = add i32 %soffset.base, 4096 970 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) 971 ret float %val 972} 973 974; Waterfall loop, but constant offset 975define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) { 976 ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4095 977 ; CHECK: bb.1 (%ir-block.0): 978 ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 979 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 980 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 981 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 982 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 983 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) 984 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095 985 ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 986 ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 987 ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 988 ; CHECK: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF 989 ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF 990 ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) 991 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec 992 ; CHECK: bb.2: 993 ; CHECK: successors: %bb.3, %bb.2 994 ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.2 995 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %7(s32), %bb.2 996 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec 997 ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec 998 ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) 999 
; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec 1000 ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec 1001 ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec 1002 ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 1003 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec 1004 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc 1005 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 1006 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load 4 + 4095, align 1) 1007 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec 1008 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc 1009 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec 1010 ; CHECK: bb.3: 1011 ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] 1012 ; CHECK: bb.4: 1013 ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) 1014 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 1015 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 0) 1016 ret float %val 1017} 1018 1019; Waterfall loop, but constant offset 1020define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) { 1021 ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4096 1022 ; CHECK: bb.1 (%ir-block.0): 1023 ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 1024 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 1025 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 1026 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 1027 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 1028 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) 1029 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 1030 ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) 1031 ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 1032 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 1033 ; CHECK: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF 1034 ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF 1035 ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) 1036 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec 1037 ; CHECK: bb.2: 1038 ; CHECK: successors: %bb.3, %bb.2 1039 ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.2 1040 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %7(s32), %bb.2 1041 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec 1042 ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec 1043 ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) 1044 ; CHECK: 
[[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec 1045 ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec 1046 ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec 1047 ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 1048 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec 1049 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc 1050 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 1051 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load 4) 1052 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec 1053 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc 1054 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec 1055 ; CHECK: bb.3: 1056 ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] 1057 ; CHECK: bb.4: 1058 ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) 1059 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 1060 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 0) 1061 ret float %val 1062} 1063 1064; Need a waterfall loop, but the offset is scalar. 1065; Make sure the base offset is added to each split load. 
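; The checks that follow show the waterfall loop used for a divergent descriptor:
; bb.2 readfirstlanes the two 64-bit halves of %rsrc, compares them back against
; the VGPR values, performs the load with the now-uniform descriptor, then uses
; S_AND_SAVEEXEC_B64 / S_XOR_B64_term / S_CBRANCH_EXECNZ to retire the lanes just
; handled until exec is empty. The uniform %soffset.base stays in the soffset
; operand, and the constant 4064 is folded into the immediate of each split load
; (4064 and 4080), both of which still fit in the MUBUF immediate offset field.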
1066define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %rsrc, i32 inreg %soffset.base) { 1067 ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4064 1068 ; CHECK: bb.1 (%ir-block.0): 1069 ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 1070 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 1071 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 1072 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 1073 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 1074 ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 1075 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) 1076 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 1077 ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] 1078 ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 1079 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 1080 ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF 1081 ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF 1082 ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF 1083 ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) 1084 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec 1085 ; CHECK: bb.2: 1086 ; CHECK: successors: %bb.3, %bb.2 1087 ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 1088 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 1089 ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 1090 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec 1091 ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec 1092 ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) 1093 ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec 1094 ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec 1095 ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec 1096 ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 1097 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec 1098 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc 1099 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 1100 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load 16, align 4) 1101 ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load 16, align 4) 1102 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec 1103 ; CHECK: $exec 
= S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc 1104 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec 1105 ; CHECK: bb.3: 1106 ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] 1107 ; CHECK: bb.4: 1108 ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) 1109 ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) 1110 ; CHECK: $vgpr0 = COPY [[UV2]](s32) 1111 ; CHECK: $vgpr1 = COPY [[UV3]](s32) 1112 ; CHECK: $vgpr2 = COPY [[UV4]](s32) 1113 ; CHECK: $vgpr3 = COPY [[UV5]](s32) 1114 ; CHECK: $vgpr4 = COPY [[UV6]](s32) 1115 ; CHECK: $vgpr5 = COPY [[UV7]](s32) 1116 ; CHECK: $vgpr6 = COPY [[UV8]](s32) 1117 ; CHECK: $vgpr7 = COPY [[UV9]](s32) 1118 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 1119 %soffset = add i32 %soffset.base, 4064 1120 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) 1121 ret <8 x float> %val 1122} 1123 1124; Need a waterfall loop, but the offset is scalar. 1125; Make sure the maximum offset isn't exceeded when splitting this 1126define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %rsrc, i32 inreg %soffset.base) { 1127 ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4068 1128 ; CHECK: bb.1 (%ir-block.0): 1129 ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 1130 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 1131 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 1132 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 1133 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 1134 ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 1135 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) 1136 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068 1137 ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] 1138 ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) 1139 ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 1140 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 1141 ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF 1142 ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF 1143 ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF 1144 ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) 1145 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec 1146 ; CHECK: bb.2: 1147 ; CHECK: successors: %bb.3, %bb.2 1148 ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2 1149 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 1150 ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2 1151 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec 1152 ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec 1153 ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) 1154 ; CHECK:
[[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec 1155 ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec 1156 ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec 1157 ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 1158 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec 1159 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc 1160 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 1161 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) 1162 ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) 1163 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec 1164 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc 1165 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec 1166 ; CHECK: bb.3: 1167 ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] 1168 ; CHECK: bb.4: 1169 ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) 1170 ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) 1171 ; CHECK: $vgpr0 = COPY [[UV2]](s32) 1172 ; CHECK: $vgpr1 = COPY [[UV3]](s32) 1173 ; CHECK: $vgpr2 = COPY [[UV4]](s32) 1174 ; CHECK: $vgpr3 = COPY [[UV5]](s32) 1175 ; CHECK: $vgpr4 = COPY [[UV6]](s32) 1176 ; CHECK: $vgpr5 = COPY [[UV7]](s32) 1177 ; CHECK: $vgpr6 = COPY [[UV8]](s32) 1178 ; CHECK: $vgpr7 = COPY [[UV9]](s32) 1179 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 1180 %soffset = add i32 %soffset.base, 4068 1181 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) 1182 ret <8 x float> %val 1183} 1184 1185define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %rsrc, i32 inreg %soffset.base) { 1186 ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4096 1187 ; CHECK: bb.1 (%ir-block.0): 1188 ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 1189 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 1190 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 1191 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 1192 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 1193 ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 1194 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) 1195 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) 
= G_CONSTANT i32 4096 1196 ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] 1197 ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) 1198 ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 1199 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 1200 ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF 1201 ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF 1202 ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF 1203 ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) 1204 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec 1205 ; CHECK: bb.2: 1206 ; CHECK: successors: %bb.3, %bb.2 1207 ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2 1208 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 1209 ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2 1210 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec 1211 ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec 1212 ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) 1213 ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec 1214 ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec 1215 ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec 1216 ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 1217 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec 1218 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc 1219 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 1220 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) 1221 ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) 1222 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec 1223 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc 1224 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec 1225 ; CHECK: bb.3: 1226 ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] 1227 ; CHECK: bb.4: 1228 ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) 1229 ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) 1230 ; CHECK: $vgpr0 = COPY 
[[UV2]](s32) 1231 ; CHECK: $vgpr1 = COPY [[UV3]](s32) 1232 ; CHECK: $vgpr2 = COPY [[UV4]](s32) 1233 ; CHECK: $vgpr3 = COPY [[UV5]](s32) 1234 ; CHECK: $vgpr4 = COPY [[UV6]](s32) 1235 ; CHECK: $vgpr5 = COPY [[UV7]](s32) 1236 ; CHECK: $vgpr6 = COPY [[UV8]](s32) 1237 ; CHECK: $vgpr7 = COPY [[UV9]](s32) 1238 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 1239 %soffset = add i32 %soffset.base, 4096 1240 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) 1241 ret <8 x float> %val 1242} 1243 1244define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000(<4 x i32> %rsrc, i32 %offset.base) { 1245 ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 1246 ; CHECK: bb.1 (%ir-block.0): 1247 ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 1248 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 1249 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 1250 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 1251 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 1252 ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 1253 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) 1254 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5000 1255 ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) 1256 ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] 1257 ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 1258 ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF 1259 ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF 1260 ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF 1261 ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) 1262 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec 1263 ; CHECK: bb.2: 1264 ; CHECK: successors: %bb.3, %bb.2 1265 ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 1266 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 1267 ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 1268 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec 1269 ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec 1270 ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) 1271 ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec 1272 ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec 1273 ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec 1274 ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 1275 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec 1276 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc 1277 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), 
[[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 1278 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) 1279 ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) 1280 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec 1281 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc 1282 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec 1283 ; CHECK: bb.3: 1284 ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] 1285 ; CHECK: bb.4: 1286 ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) 1287 ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) 1288 ; CHECK: $vgpr0 = COPY [[UV2]](s32) 1289 ; CHECK: $vgpr1 = COPY [[UV3]](s32) 1290 ; CHECK: $vgpr2 = COPY [[UV4]](s32) 1291 ; CHECK: $vgpr3 = COPY [[UV5]](s32) 1292 ; CHECK: $vgpr4 = COPY [[UV6]](s32) 1293 ; CHECK: $vgpr5 = COPY [[UV7]](s32) 1294 ; CHECK: $vgpr6 = COPY [[UV8]](s32) 1295 ; CHECK: $vgpr7 = COPY [[UV9]](s32) 1296 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 1297 %soffset = add i32 %offset.base, 5000 1298 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) 1299 ret <8 x float> %val 1300} 1301 1302define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076(<4 x i32> %rsrc, i32 %offset.base) { 1303 ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 1304 ; CHECK: bb.1 (%ir-block.0): 1305 ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 1306 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 1307 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 1308 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 1309 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 1310 ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 1311 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) 1312 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4076 1313 ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) 1314 ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] 1315 ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 1316 ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF 1317 ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF 1318 ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF 1319 ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) 1320 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec 1321 ; CHECK: bb.2: 1322 ; CHECK: successors: %bb.3, %bb.2 1323 ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 1324 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, 
%21(<4 x s32>), %bb.2 1325 ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 1326 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec 1327 ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec 1328 ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) 1329 ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec 1330 ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec 1331 ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec 1332 ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 1333 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec 1334 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc 1335 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 1336 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) 1337 ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) 1338 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec 1339 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc 1340 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec 1341 ; CHECK: bb.3: 1342 ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] 1343 ; CHECK: bb.4: 1344 ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) 1345 ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) 1346 ; CHECK: $vgpr0 = COPY [[UV2]](s32) 1347 ; CHECK: $vgpr1 = COPY [[UV3]](s32) 1348 ; CHECK: $vgpr2 = COPY [[UV4]](s32) 1349 ; CHECK: $vgpr3 = COPY [[UV5]](s32) 1350 ; CHECK: $vgpr4 = COPY [[UV6]](s32) 1351 ; CHECK: $vgpr5 = COPY [[UV7]](s32) 1352 ; CHECK: $vgpr6 = COPY [[UV8]](s32) 1353 ; CHECK: $vgpr7 = COPY [[UV9]](s32) 1354 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 1355 %soffset = add i32 %offset.base, 4076 1356 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) 1357 ret <8 x float> %val 1358} 1359 1360define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080(<4 x i32> %rsrc, i32 %offset.base) { 1361 ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 1362 ; CHECK: bb.1 
(%ir-block.0): 1363 ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 1364 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 1365 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 1366 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 1367 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 1368 ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 1369 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) 1370 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4080 1371 ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) 1372 ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] 1373 ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 1374 ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF 1375 ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF 1376 ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF 1377 ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) 1378 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec 1379 ; CHECK: bb.2: 1380 ; CHECK: successors: %bb.3, %bb.2 1381 ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 1382 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 1383 ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 1384 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec 1385 ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec 1386 ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) 1387 ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec 1388 ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec 1389 ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec 1390 ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 1391 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec 1392 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc 1393 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 1394 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) 1395 ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) 1396 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec 1397 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc 1398 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec 1399 ; CHECK: bb.3: 1400 ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] 1401 ; 
CHECK: bb.4: 1402 ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) 1403 ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) 1404 ; CHECK: $vgpr0 = COPY [[UV2]](s32) 1405 ; CHECK: $vgpr1 = COPY [[UV3]](s32) 1406 ; CHECK: $vgpr2 = COPY [[UV4]](s32) 1407 ; CHECK: $vgpr3 = COPY [[UV5]](s32) 1408 ; CHECK: $vgpr4 = COPY [[UV6]](s32) 1409 ; CHECK: $vgpr5 = COPY [[UV7]](s32) 1410 ; CHECK: $vgpr6 = COPY [[UV8]](s32) 1411 ; CHECK: $vgpr7 = COPY [[UV9]](s32) 1412 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 1413 %soffset = add i32 %offset.base, 4080 1414 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) 1415 ret <8 x float> %val 1416} 1417 1418define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064(<4 x i32> %rsrc, i32 %offset.base) { 1419 ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064 1420 ; CHECK: bb.1 (%ir-block.0): 1421 ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 1422 ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 1423 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 1424 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 1425 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 1426 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) 1427 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 1428 ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 1429 ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 1430 ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 1431 ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF 1432 ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF 1433 ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF 1434 ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) 1435 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec 1436 ; CHECK: bb.2: 1437 ; CHECK: successors: %bb.3, %bb.2 1438 ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 1439 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 1440 ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 1441 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec 1442 ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec 1443 ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) 1444 ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec 1445 ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec 1446 ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec 1447 ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), 
[[V_READFIRSTLANE_B32_3]](s32) 1448 ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec 1449 ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc 1450 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) 1451 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load 16 + 4064, align 4) 1452 ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load 16 + 4064, align 4) 1453 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec 1454 ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc 1455 ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec 1456 ; CHECK: bb.3: 1457 ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] 1458 ; CHECK: bb.4: 1459 ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) 1460 ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) 1461 ; CHECK: $vgpr0 = COPY [[UV2]](s32) 1462 ; CHECK: $vgpr1 = COPY [[UV3]](s32) 1463 ; CHECK: $vgpr2 = COPY [[UV4]](s32) 1464 ; CHECK: $vgpr3 = COPY [[UV5]](s32) 1465 ; CHECK: $vgpr4 = COPY [[UV6]](s32) 1466 ; CHECK: $vgpr5 = COPY [[UV7]](s32) 1467 ; CHECK: $vgpr6 = COPY [[UV8]](s32) 1468 ; CHECK: $vgpr7 = COPY [[UV9]](s32) 1469 ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 1470 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 4064, i32 0) 1471 ret <8 x float> %val 1472} 1473 1474define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { 1475 ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr 1476 ; CHECK: bb.1 (%ir-block.0): 1477 ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 1478 ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 1479 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 1480 ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 1481 ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 1482 ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 1483 ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 1484 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) 1485 ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) 1486 ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] 1487 ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 1488 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load 4) 1489 ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) 1490 ; 
define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
  ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
  ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
  ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load 4)
  ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
  %offset = add i32 %offset.v, %offset.s
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret float %val
}

define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
  ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
  ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
  ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load 4)
  ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
  %offset = add i32 %offset.s, %offset.v
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret float %val
}

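; The next pair adds a constant 1024 on top of the VGPR+SGPR sum. Here the
; checks show the constant folded into the instruction's immediate offset
; operand (1024), with the VGPR add used as voffset and an soffset of 0.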
define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
  ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
  ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
  ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
  ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
  ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load 4)
  ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
  %offset.base = add i32 %offset.v, %offset.s
  %offset = add i32 %offset.base, 1024
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret float %val
}

define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
  ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
  ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
  ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
  ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
  ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load 4)
  ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
  %offset.base = add i32 %offset.s, %offset.v
  %offset = add i32 %offset.base, 1024
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret float %val
}

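; In the final two tests the constant is added before the divergent/uniform
; sum, so it is not folded into the immediate offset field; it ends up inside
; the SGPR soffset operand (first test) or the VGPR voffset operand (second
; test) instead. The TODO below refers to reassociating the adds into the
; shape used by the *_imm tests above, which does fold the constant, e.g.
; (sketch only, not checked output):
;   %offset.base = add i32 %offset.s, %offset.v
;   %offset = add i32 %offset.base, 1024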
; TODO: Ideally this would be reassociated to fold.
define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
  ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
  ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY5]], [[C]]
  ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
  ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[ADD]], 0, 0, 0 :: (dereferenceable invariant load 4)
  ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
  %offset.base = add i32 %offset.s, 1024
  %offset = add i32 %offset.base, %offset.v
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret float %val
}

define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
  ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
  ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
  ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
  ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
  ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load 4)
  ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
  %offset.base = add i32 %offset.v, 1024
  %offset = add i32 %offset.base, %offset.s
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret float %val
}

declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg)
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg)
declare <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32>, i32, i32 immarg)
declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32 immarg)
declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32 immarg)
declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32 immarg)

declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg)
declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32 immarg)
declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32>, i32, i32 immarg)
declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32 immarg)
declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32>, i32, i32 immarg)
declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32>, i32, i32 immarg)

declare i96 @llvm.amdgcn.s.buffer.load.i96(<4 x i32>, i32, i32 immarg)
declare i256 @llvm.amdgcn.s.buffer.load.i256(<4 x i32>, i32, i32 immarg)
declare i512 @llvm.amdgcn.s.buffer.load.i512(<4 x i32>, i32, i32 immarg)

declare <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32>, i32, i32 immarg)
declare <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32>, i32, i32 immarg)

declare <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32>, i32, i32 immarg)
declare <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32>, i32, i32 immarg)

declare <4 x i8 addrspace(1)*> @llvm.amdgcn.s.buffer.load.v4p1i8(<4 x i32>, i32, i32 immarg)
declare <8 x i8 addrspace(1)*> @llvm.amdgcn.s.buffer.load.v8p1i8(<4 x i32>, i32, i32 immarg)