1; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck %s
3; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s
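; Both RUN lines share the same CHECK prefix: -regbankselect-fast and -regbankselect-greedy are expected to choose the same bank assignments for these loads.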
4
; Natural mapping: the resource and offset are both SGPRs, so the load keeps an SGPR mapping.
6define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
7  ; CHECK-LABEL: name: s_buffer_load_i32
8  ; CHECK: bb.1 (%ir-block.0):
9  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
10  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
11  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
12  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
13  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
14  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
15  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
16  ; CHECK:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 4)
17  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32)
18  ; CHECK:   [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
19  ; CHECK:   $sgpr0 = COPY [[INT]](s32)
20  ; CHECK:   SI_RETURN_TO_EPILOG implicit $sgpr0
21  %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
22  ret i32 %val
23}
24
25define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
26  ; CHECK-LABEL: name: s_buffer_load_v2i32
27  ; CHECK: bb.1 (%ir-block.0):
28  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
29  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
30  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
31  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
32  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
33  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
34  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
35  ; CHECK:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 8, align 4)
36  ; CHECK:   [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>)
37  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
38  ; CHECK:   [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
39  ; CHECK:   $sgpr0 = COPY [[INT]](s32)
40  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
41  ; CHECK:   [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
42  ; CHECK:   $sgpr1 = COPY [[INT1]](s32)
43  ; CHECK:   SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
44  %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
45  ret <2 x i32> %val
46}
47
48define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
49  ; CHECK-LABEL: name: s_buffer_load_v3i32
50  ; CHECK: bb.1 (%ir-block.0):
51  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
52  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
53  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
54  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
55  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
56  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
57  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
58  ; CHECK:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 12, align 4)
59  ; CHECK:   [[EXTRACT:%[0-9]+]]:sgpr(<3 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), 0
60  ; CHECK:   [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s32>)
61  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
62  ; CHECK:   [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
63  ; CHECK:   $sgpr0 = COPY [[INT]](s32)
64  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
65  ; CHECK:   [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
66  ; CHECK:   $sgpr1 = COPY [[INT1]](s32)
67  ; CHECK:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
68  ; CHECK:   [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
69  ; CHECK:   $sgpr2 = COPY [[INT2]](s32)
70  ; CHECK:   SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2
71  %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
72  ret <3 x i32> %val
73}
74
75define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
76  ; CHECK-LABEL: name: s_buffer_load_v8i32
77  ; CHECK: bb.1 (%ir-block.0):
78  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
79  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
80  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
81  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
82  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
83  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
84  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
85  ; CHECK:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 32, align 4)
86  ; CHECK:   [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>)
87  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
88  ; CHECK:   [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
89  ; CHECK:   $sgpr0 = COPY [[INT]](s32)
90  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
91  ; CHECK:   [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
92  ; CHECK:   $sgpr1 = COPY [[INT1]](s32)
93  ; CHECK:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
94  ; CHECK:   [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
95  ; CHECK:   $sgpr2 = COPY [[INT2]](s32)
96  ; CHECK:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
97  ; CHECK:   [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32)
98  ; CHECK:   $sgpr3 = COPY [[INT3]](s32)
99  ; CHECK:   [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
100  ; CHECK:   [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
101  ; CHECK:   $sgpr4 = COPY [[INT4]](s32)
102  ; CHECK:   [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
103  ; CHECK:   [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32)
104  ; CHECK:   $sgpr5 = COPY [[INT5]](s32)
105  ; CHECK:   [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
106  ; CHECK:   [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32)
107  ; CHECK:   $sgpr6 = COPY [[INT6]](s32)
108  ; CHECK:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32)
109  ; CHECK:   [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
110  ; CHECK:   $sgpr7 = COPY [[INT7]](s32)
111  ; CHECK:   SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7
112  %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
113  ret <8 x i32> %val
114}
115
116define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
117  ; CHECK-LABEL: name: s_buffer_load_v16i32
118  ; CHECK: bb.1 (%ir-block.0):
119  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
120  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
121  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
122  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
123  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
124  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
125  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
126  ; CHECK:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 64, align 4)
127  ; CHECK:   [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>)
128  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
129  ; CHECK:   [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
130  ; CHECK:   $sgpr0 = COPY [[INT]](s32)
131  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
132  ; CHECK:   [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
133  ; CHECK:   $sgpr1 = COPY [[INT1]](s32)
134  ; CHECK:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
135  ; CHECK:   [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
136  ; CHECK:   $sgpr2 = COPY [[INT2]](s32)
137  ; CHECK:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
138  ; CHECK:   [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32)
139  ; CHECK:   $sgpr3 = COPY [[INT3]](s32)
140  ; CHECK:   [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
141  ; CHECK:   [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
142  ; CHECK:   $sgpr4 = COPY [[INT4]](s32)
143  ; CHECK:   [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
144  ; CHECK:   [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32)
145  ; CHECK:   $sgpr5 = COPY [[INT5]](s32)
146  ; CHECK:   [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
147  ; CHECK:   [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32)
148  ; CHECK:   $sgpr6 = COPY [[INT6]](s32)
149  ; CHECK:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32)
150  ; CHECK:   [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
151  ; CHECK:   $sgpr7 = COPY [[INT7]](s32)
152  ; CHECK:   [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[UV8]](s32)
153  ; CHECK:   [[INT8:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY13]](s32)
154  ; CHECK:   $sgpr8 = COPY [[INT8]](s32)
155  ; CHECK:   [[COPY14:%[0-9]+]]:vgpr(s32) = COPY [[UV9]](s32)
156  ; CHECK:   [[INT9:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY14]](s32)
157  ; CHECK:   $sgpr9 = COPY [[INT9]](s32)
158  ; CHECK:   [[COPY15:%[0-9]+]]:vgpr(s32) = COPY [[UV10]](s32)
159  ; CHECK:   [[INT10:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY15]](s32)
160  ; CHECK:   $sgpr10 = COPY [[INT10]](s32)
161  ; CHECK:   [[COPY16:%[0-9]+]]:vgpr(s32) = COPY [[UV11]](s32)
162  ; CHECK:   [[INT11:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY16]](s32)
163  ; CHECK:   $sgpr11 = COPY [[INT11]](s32)
164  ; CHECK:   [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[UV12]](s32)
165  ; CHECK:   [[INT12:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY17]](s32)
166  ; CHECK:   $sgpr12 = COPY [[INT12]](s32)
167  ; CHECK:   [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[UV13]](s32)
168  ; CHECK:   [[INT13:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY18]](s32)
169  ; CHECK:   $sgpr13 = COPY [[INT13]](s32)
170  ; CHECK:   [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[UV14]](s32)
171  ; CHECK:   [[INT14:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY19]](s32)
172  ; CHECK:   $sgpr14 = COPY [[INT14]](s32)
173  ; CHECK:   [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[UV15]](s32)
174  ; CHECK:   [[INT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY20]](s32)
175  ; CHECK:   $sgpr15 = COPY [[INT15]](s32)
176  ; CHECK:   SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15
177  %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
178  ret <16 x i32> %val
179}
180
181; Check cases that need to be converted to MUBUF due to the offset being a VGPR.
182define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
183  ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_offset
184  ; CHECK: bb.1 (%ir-block.0):
185  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
186  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
187  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
188  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
189  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
190  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
191  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
192  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
193  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
194  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 4)
195  ; CHECK:   $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
196  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
197  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
198  ret float %val
199}
200
201define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
202  ; CHECK-LABEL: name: s_buffer_load_v2f32_vgpr_offset
203  ; CHECK: bb.1 (%ir-block.0):
204  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
205  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
206  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
207  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
208  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
209  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
210  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
211  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
212  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
213  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 8, align 4)
214  ; CHECK:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<2 x s32>)
215  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
216  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
217  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
218  %val = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
219  ret <2 x float> %val
220}
221
222define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
223  ; CHECK-LABEL: name: s_buffer_load_v3f32_vgpr_offset
224  ; CHECK: bb.1 (%ir-block.0):
225  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
226  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
227  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
228  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
229  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
230  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
231  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
232  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
233  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
234  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
235  ; CHECK:   [[EXTRACT:%[0-9]+]]:vgpr(<3 x s32>) = G_EXTRACT [[AMDGPU_BUFFER_LOAD]](<4 x s32>), 0
236  ; CHECK:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s32>)
237  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
238  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
239  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
240  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
241  %val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
242  ret <3 x float> %val
243}
244
245define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
246  ; CHECK-LABEL: name: s_buffer_load_v4f32_vgpr_offset
247  ; CHECK: bb.1 (%ir-block.0):
248  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
249  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
250  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
251  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
252  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
253  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
254  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
255  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
256  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
257  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
258  ; CHECK:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>)
259  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
260  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
261  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
262  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
263  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
264  %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
265  ret <4 x float> %val
266}
267
268define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
269  ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset
270  ; CHECK: bb.1 (%ir-block.0):
271  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
272  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
273  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
274  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
275  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
276  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
277  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
278  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
279  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
280  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
281  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
282  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
283  ; CHECK:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
284  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
285  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
286  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
287  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
288  ; CHECK:   $vgpr4 = COPY [[UV4]](s32)
289  ; CHECK:   $vgpr5 = COPY [[UV5]](s32)
290  ; CHECK:   $vgpr6 = COPY [[UV6]](s32)
291  ; CHECK:   $vgpr7 = COPY [[UV7]](s32)
292  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
293  %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
294  ret <8 x float> %val
295}
296
297define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
298  ; CHECK-LABEL: name: s_buffer_load_v16f32_vgpr_offset
299  ; CHECK: bb.1 (%ir-block.0):
300  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
301  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
302  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
303  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
304  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
305  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
306  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
307  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
308  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
309  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
310  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
311  ; CHECK:   [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
312  ; CHECK:   [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
313  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
314  ; CHECK:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
315  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
316  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
317  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
318  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
319  ; CHECK:   $vgpr4 = COPY [[UV4]](s32)
320  ; CHECK:   $vgpr5 = COPY [[UV5]](s32)
321  ; CHECK:   $vgpr6 = COPY [[UV6]](s32)
322  ; CHECK:   $vgpr7 = COPY [[UV7]](s32)
323  ; CHECK:   $vgpr8 = COPY [[UV8]](s32)
324  ; CHECK:   $vgpr9 = COPY [[UV9]](s32)
325  ; CHECK:   $vgpr10 = COPY [[UV10]](s32)
326  ; CHECK:   $vgpr11 = COPY [[UV11]](s32)
327  ; CHECK:   $vgpr12 = COPY [[UV12]](s32)
328  ; CHECK:   $vgpr13 = COPY [[UV13]](s32)
329  ; CHECK:   $vgpr14 = COPY [[UV14]](s32)
330  ; CHECK:   $vgpr15 = COPY [[UV15]](s32)
331  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
332  %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
333  ret <16 x float> %val
334}
335
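; An i96 result does not need a split; it should be loaded as s128 and truncated back to s96 before the store.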
336define amdgpu_ps void @s_buffer_load_i96_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
337  ; CHECK-LABEL: name: s_buffer_load_i96_vgpr_offset
338  ; CHECK: bb.1 (%ir-block.0):
339  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
340  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
341  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
342  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
343  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
344  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
345  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
346  ; CHECK:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
347  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
348  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
349  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
350  ; CHECK:   [[TRUNC:%[0-9]+]]:vgpr(s96) = G_TRUNC [[AMDGPU_BUFFER_LOAD]](s128)
351  ; CHECK:   G_STORE [[TRUNC]](s96), [[DEF]](p1) :: (store 12 into `i96 addrspace(1)* undef`, align 8, addrspace 1)
352  ; CHECK:   S_ENDPGM 0
353  %val = call i96 @llvm.amdgcn.s.buffer.load.i96(<4 x i32> %rsrc, i32 %soffset, i32 0)
354  store i96 %val, i96 addrspace(1)* undef
355  ret void
356}
357
; Test split of a wide scalar (i256)
359define amdgpu_ps void @s_buffer_load_i256_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
360  ; CHECK-LABEL: name: s_buffer_load_i256_vgpr_offset
361  ; CHECK: bb.1 (%ir-block.0):
362  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
363  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
364  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
365  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
366  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
367  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
368  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
369  ; CHECK:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
370  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
371  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
372  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
373  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
374  ; CHECK:   [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128)
375  ; CHECK:   [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s256)
376  ; CHECK:   G_STORE [[UV]](s128), [[DEF]](p1) :: (store 16 into `i256 addrspace(1)* undef`, align 8, addrspace 1)
377  ; CHECK:   [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
378  ; CHECK:   [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
379  ; CHECK:   G_STORE [[UV1]](s128), [[PTR_ADD]](p1) :: (store 16 into `i256 addrspace(1)* undef` + 16, align 8, addrspace 1)
380  ; CHECK:   S_ENDPGM 0
381  %val = call i256 @llvm.amdgcn.s.buffer.load.i256(<4 x i32> %rsrc, i32 %soffset, i32 0)
382  store i256 %val, i256 addrspace(1)* undef
383  ret void
384}
385
; Test split of a wide scalar (i512)
387define amdgpu_ps void @s_buffer_load_i512_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
388  ; CHECK-LABEL: name: s_buffer_load_i512_vgpr_offset
389  ; CHECK: bb.1 (%ir-block.0):
390  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
391  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
392  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
393  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
394  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
395  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
396  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
397  ; CHECK:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
398  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
399  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
400  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
401  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
402  ; CHECK:   [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
403  ; CHECK:   [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
404  ; CHECK:   [[MV:%[0-9]+]]:vgpr(s512) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128), [[AMDGPU_BUFFER_LOAD2]](s128), [[AMDGPU_BUFFER_LOAD3]](s128)
405  ; CHECK:   [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128), [[UV2:%[0-9]+]]:vgpr(s128), [[UV3:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s512)
406  ; CHECK:   G_STORE [[UV]](s128), [[DEF]](p1) :: (store 16 into `i512 addrspace(1)* undef`, align 8, addrspace 1)
407  ; CHECK:   [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
408  ; CHECK:   [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
409  ; CHECK:   G_STORE [[UV1]](s128), [[PTR_ADD]](p1) :: (store 16 into `i512 addrspace(1)* undef` + 16, align 8, addrspace 1)
410  ; CHECK:   [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
411  ; CHECK:   [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
412  ; CHECK:   G_STORE [[UV2]](s128), [[PTR_ADD1]](p1) :: (store 16 into `i512 addrspace(1)* undef` + 32, align 8, addrspace 1)
413  ; CHECK:   [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
414  ; CHECK:   [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
415  ; CHECK:   G_STORE [[UV3]](s128), [[PTR_ADD2]](p1) :: (store 16 into `i512 addrspace(1)* undef` + 48, align 8, addrspace 1)
416  ; CHECK:   S_ENDPGM 0
417  %val = call i512 @llvm.amdgcn.s.buffer.load.i512(<4 x i32> %rsrc, i32 %soffset, i32 0)
418  store i512 %val, i512 addrspace(1)* undef
419  ret void
420}
421
; Test split of a vector with 16-bit elements (v16i16)
423define amdgpu_ps void @s_buffer_load_v16i16_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
424  ; CHECK-LABEL: name: s_buffer_load_v16i16_vgpr_offset
425  ; CHECK: bb.1 (%ir-block.0):
426  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
427  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
428  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
429  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
430  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
431  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
432  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
433  ; CHECK:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
434  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
435  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
436  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
437  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
438  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>)
439  ; CHECK:   [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>)
440  ; CHECK:   G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store 16 into `<16 x i16> addrspace(1)* undef`, align 32, addrspace 1)
441  ; CHECK:   [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
442  ; CHECK:   [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
443  ; CHECK:   G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store 16 into `<16 x i16> addrspace(1)* undef` + 16, align 32, addrspace 1)
444  ; CHECK:   S_ENDPGM 0
445  %val = call <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32> %rsrc, i32 %soffset, i32 0)
446  store <16 x i16> %val, <16 x i16> addrspace(1)* undef
447  ret void
448}
449
; Test split of a vector with 16-bit elements (v32i16)
451define amdgpu_ps void @s_buffer_load_v32i16_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
452  ; CHECK-LABEL: name: s_buffer_load_v32i16_vgpr_offset
453  ; CHECK: bb.1 (%ir-block.0):
454  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
455  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
456  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
457  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
458  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
459  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
460  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
461  ; CHECK:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
462  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
463  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
464  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
465  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
466  ; CHECK:   [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
467  ; CHECK:   [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
468  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<32 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>), [[AMDGPU_BUFFER_LOAD2]](<8 x s16>), [[AMDGPU_BUFFER_LOAD3]](<8 x s16>)
469  ; CHECK:   [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>), [[UV2:%[0-9]+]]:vgpr(<8 x s16>), [[UV3:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<32 x s16>)
470  ; CHECK:   G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store 16 into `<32 x i16> addrspace(1)* undef`, align 64, addrspace 1)
471  ; CHECK:   [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
472  ; CHECK:   [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
473  ; CHECK:   G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store 16 into `<32 x i16> addrspace(1)* undef` + 16, align 64, addrspace 1)
474  ; CHECK:   [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
475  ; CHECK:   [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
476  ; CHECK:   G_STORE [[UV2]](<8 x s16>), [[PTR_ADD1]](p1) :: (store 16 into `<32 x i16> addrspace(1)* undef` + 32, align 64, addrspace 1)
477  ; CHECK:   [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
478  ; CHECK:   [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
479  ; CHECK:   G_STORE [[UV3]](<8 x s16>), [[PTR_ADD2]](p1) :: (store 16 into `<32 x i16> addrspace(1)* undef` + 48, align 64, addrspace 1)
480  ; CHECK:   S_ENDPGM 0
481  %val = call <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32> %rsrc, i32 %soffset, i32 0)
482  store <32 x i16> %val, <32 x i16> addrspace(1)* undef
483  ret void
484}
485
; Test split of a vector with 64-bit elements (v4i64)
487define amdgpu_ps void @s_buffer_load_v4i64_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
488  ; CHECK-LABEL: name: s_buffer_load_v4i64_vgpr_offset
489  ; CHECK: bb.1 (%ir-block.0):
490  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
491  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
492  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
493  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
494  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
495  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
496  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
497  ; CHECK:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
498  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
499  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
500  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
501  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
502  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>)
503  ; CHECK:   [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>)
504  ; CHECK:   G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store 16 into `<4 x i64> addrspace(1)* undef`, align 32, addrspace 1)
505  ; CHECK:   [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
506  ; CHECK:   [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
507  ; CHECK:   G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store 16 into `<4 x i64> addrspace(1)* undef` + 16, align 32, addrspace 1)
508  ; CHECK:   S_ENDPGM 0
509  %val = call <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32> %rsrc, i32 %soffset, i32 0)
510  store <4 x i64> %val, <4 x i64> addrspace(1)* undef
511  ret void
512}
513
; Test split of a vector with 64-bit elements (v8i64)
515define amdgpu_ps void @s_buffer_load_v8i64_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
516  ; CHECK-LABEL: name: s_buffer_load_v8i64_vgpr_offset
517  ; CHECK: bb.1 (%ir-block.0):
518  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
519  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
520  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
521  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
522  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
523  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
524  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
525  ; CHECK:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
526  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
527  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
528  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
529  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
530  ; CHECK:   [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
531  ; CHECK:   [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
532  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>), [[AMDGPU_BUFFER_LOAD2]](<2 x s64>), [[AMDGPU_BUFFER_LOAD3]](<2 x s64>)
533  ; CHECK:   [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>), [[UV2:%[0-9]+]]:vgpr(<2 x s64>), [[UV3:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>)
534  ; CHECK:   G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store 16 into `<8 x i64> addrspace(1)* undef`, align 64, addrspace 1)
535  ; CHECK:   [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
536  ; CHECK:   [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
537  ; CHECK:   G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store 16 into `<8 x i64> addrspace(1)* undef` + 16, align 64, addrspace 1)
538  ; CHECK:   [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
539  ; CHECK:   [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
540  ; CHECK:   G_STORE [[UV2]](<2 x s64>), [[PTR_ADD1]](p1) :: (store 16 into `<8 x i64> addrspace(1)* undef` + 32, align 64, addrspace 1)
541  ; CHECK:   [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
542  ; CHECK:   [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
543  ; CHECK:   G_STORE [[UV3]](<2 x s64>), [[PTR_ADD2]](p1) :: (store 16 into `<8 x i64> addrspace(1)* undef` + 48, align 64, addrspace 1)
544  ; CHECK:   S_ENDPGM 0
545  %val = call <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32> %rsrc, i32 %soffset, i32 0)
546  store <8 x i64> %val, <8 x i64> addrspace(1)* undef
547  ret void
548}
549
; Test split of a vector with 64-bit pointer elements (v4p1)
551define amdgpu_ps void @s_buffer_load_v4p1_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
552  ; CHECK-LABEL: name: s_buffer_load_v4p1_vgpr_offset
553  ; CHECK: bb.1 (%ir-block.0):
554  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
555  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
556  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
557  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
558  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
559  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
560  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
561  ; CHECK:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
562  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
563  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
564  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
565  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
566  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>)
567  ; CHECK:   [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x p1>)
568  ; CHECK:   G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store 16 into `<4 x i8 addrspace(1)*> addrspace(1)* undef`, align 32, addrspace 1)
569  ; CHECK:   [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
570  ; CHECK:   [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
571  ; CHECK:   G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store 16 into `<4 x i8 addrspace(1)*> addrspace(1)* undef` + 16, align 32, addrspace 1)
572  ; CHECK:   S_ENDPGM 0
573  %val = call <4 x i8 addrspace(1)*> @llvm.amdgcn.s.buffer.load.v4p1i8(<4 x i32> %rsrc, i32 %soffset, i32 0)
574  store <4 x i8 addrspace(1)*> %val, <4 x i8 addrspace(1)*> addrspace(1)* undef
575  ret void
576}
577
; Test split of a vector with 64-bit pointer elements (v8p1)
579define amdgpu_ps void @s_buffer_load_v8p1_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
580  ; CHECK-LABEL: name: s_buffer_load_v8p1_vgpr_offset
581  ; CHECK: bb.1 (%ir-block.0):
582  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
583  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
584  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
585  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
586  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
587  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
588  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
589  ; CHECK:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
590  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
591  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
592  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
593  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
594  ; CHECK:   [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
595  ; CHECK:   [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
596  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>), [[AMDGPU_BUFFER_LOAD2]](<2 x p1>), [[AMDGPU_BUFFER_LOAD3]](<2 x p1>)
597  ; CHECK:   [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>), [[UV2:%[0-9]+]]:vgpr(<2 x p1>), [[UV3:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x p1>)
598  ; CHECK:   G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store 16 into `<8 x i8 addrspace(1)*> addrspace(1)* undef`, align 64, addrspace 1)
599  ; CHECK:   [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
600  ; CHECK:   [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
601  ; CHECK:   G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store 16 into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 16, align 64, addrspace 1)
602  ; CHECK:   [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
603  ; CHECK:   [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
604  ; CHECK:   G_STORE [[UV2]](<2 x p1>), [[PTR_ADD1]](p1) :: (store 16 into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 32, align 64, addrspace 1)
605  ; CHECK:   [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
606  ; CHECK:   [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
607  ; CHECK:   G_STORE [[UV3]](<2 x p1>), [[PTR_ADD2]](p1) :: (store 16 into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 48, align 64, addrspace 1)
608  ; CHECK:   S_ENDPGM 0
609  %val = call <8 x i8 addrspace(1)*> @llvm.amdgcn.s.buffer.load.v8p1i8(<4 x i32> %rsrc, i32 %soffset, i32 0)
610  store <8 x i8 addrspace(1)*> %val, <8 x i8 addrspace(1)*> addrspace(1)* undef
611  ret void
612}
613
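; Constant adds to the VGPR offset: immediates up to 4095 should fold into the MUBUF instruction offset; 4096 does not fit and should remain in the soffset operand.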
614define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg %rsrc, i32 %soffset.base) {
615  ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092
616  ; CHECK: bb.1 (%ir-block.0):
617  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
618  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
619  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
620  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
621  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
622  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
623  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
624  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092
625  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
626  ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
627  ; CHECK:   [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
628  ; CHECK:   [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
629  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load 4)
630  ; CHECK:   $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
631  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
632  %soffset = add i32 %soffset.base, 4092
633  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
634  ret float %val
635}
636
637define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg %rsrc, i32 %soffset.base) {
638  ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095
639  ; CHECK: bb.1 (%ir-block.0):
640  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
641  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
642  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
643  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
644  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
645  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
646  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
647  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
648  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
649  ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
650  ; CHECK:   [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
651  ; CHECK:   [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
652  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load 4)
653  ; CHECK:   $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
654  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
655  %soffset = add i32 %soffset.base, 4095
656  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
657  ret float %val
658}
659
660define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg %rsrc, i32 %soffset.base) {
661  ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096
662  ; CHECK: bb.1 (%ir-block.0):
663  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
664  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
665  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
666  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
667  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
668  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
669  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
670  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
671  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
672  ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
673  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
674  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 4)
675  ; CHECK:   $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
676  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
677  %soffset = add i32 %soffset.base, 4096
678  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
679  ret float %val
680}
681
682; Make sure the base offset is added to each split load.
683define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32> inreg %rsrc, i32 %soffset.base) {
684  ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4064
685  ; CHECK: bb.1 (%ir-block.0):
686  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
687  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
688  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
689  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
690  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
691  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
692  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
693  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
694  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
695  ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
696  ; CHECK:   [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
697  ; CHECK:   [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
698  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load 16, align 4)
699  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load 16, align 4)
700  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
701  ; CHECK:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
702  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
703  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
704  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
705  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
706  ; CHECK:   $vgpr4 = COPY [[UV4]](s32)
707  ; CHECK:   $vgpr5 = COPY [[UV5]](s32)
708  ; CHECK:   $vgpr6 = COPY [[UV6]](s32)
709  ; CHECK:   $vgpr7 = COPY [[UV7]](s32)
710  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
711  %soffset = add i32 %soffset.base, 4064
712  %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
713  ret <8 x float> %val
714}
715
; Make sure the maximum offset isn't exceeded when splitting this.
717define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32> inreg %rsrc, i32 %soffset.base) {
718  ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4068
719  ; CHECK: bb.1 (%ir-block.0):
720  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
721  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
722  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
723  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
724  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
725  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
726  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
727  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068
728  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
729  ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
730  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
731  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
732  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
733  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
734  ; CHECK:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
735  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
736  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
737  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
738  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
739  ; CHECK:   $vgpr4 = COPY [[UV4]](s32)
740  ; CHECK:   $vgpr5 = COPY [[UV5]](s32)
741  ; CHECK:   $vgpr6 = COPY [[UV6]](s32)
742  ; CHECK:   $vgpr7 = COPY [[UV7]](s32)
743  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
744  %soffset = add i32 %soffset.base, 4068
745  %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
746  ret <8 x float> %val
747}
748
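; The 4032 base still fits, so it is folded into the immediate offset of each of the
; four split 16-byte loads (4032, 4048, 4064 and 4080).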
749define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i32> inreg %rsrc, i32 %soffset.base) {
750  ; CHECK-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4032
751  ; CHECK: bb.1 (%ir-block.0):
752  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
753  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
754  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
755  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
756  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
757  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
758  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
759  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4032
760  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
761  ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
762  ; CHECK:   [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
763  ; CHECK:   [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
764  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load 16, align 4)
765  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4048, 0, 0 :: (dereferenceable invariant load 16, align 4)
766  ; CHECK:   [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
767  ; CHECK:   [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
768  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
769  ; CHECK:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
770  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
771  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
772  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
773  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
774  ; CHECK:   $vgpr4 = COPY [[UV4]](s32)
775  ; CHECK:   $vgpr5 = COPY [[UV5]](s32)
776  ; CHECK:   $vgpr6 = COPY [[UV6]](s32)
777  ; CHECK:   $vgpr7 = COPY [[UV7]](s32)
778  ; CHECK:   $vgpr8 = COPY [[UV8]](s32)
779  ; CHECK:   $vgpr9 = COPY [[UV9]](s32)
780  ; CHECK:   $vgpr10 = COPY [[UV10]](s32)
781  ; CHECK:   $vgpr11 = COPY [[UV11]](s32)
782  ; CHECK:   $vgpr12 = COPY [[UV12]](s32)
783  ; CHECK:   $vgpr13 = COPY [[UV13]](s32)
784  ; CHECK:   $vgpr14 = COPY [[UV14]](s32)
785  ; CHECK:   $vgpr15 = COPY [[UV15]](s32)
786  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
787  %soffset = add i32 %soffset.base, 4032
788  %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
789  ret <16 x float> %val
790}
791
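; With a base of 4036 the constant is no longer folded (compare the 4032 case above); it is
; kept as a separate offset operand and the four pieces use immediates 0, 16, 32 and 48.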
792define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i32> inreg %rsrc, i32 %soffset.base) {
793  ; CHECK-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4036
794  ; CHECK: bb.1 (%ir-block.0):
795  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
796  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
797  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
798  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
799  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
800  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
801  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
802  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4036
803  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
804  ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
805  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
806  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
807  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
808  ; CHECK:   [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16 + 16, align 4)
809  ; CHECK:   [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16 + 48, align 4)
810  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
811  ; CHECK:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
812  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
813  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
814  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
815  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
816  ; CHECK:   $vgpr4 = COPY [[UV4]](s32)
817  ; CHECK:   $vgpr5 = COPY [[UV5]](s32)
818  ; CHECK:   $vgpr6 = COPY [[UV6]](s32)
819  ; CHECK:   $vgpr7 = COPY [[UV7]](s32)
820  ; CHECK:   $vgpr8 = COPY [[UV8]](s32)
821  ; CHECK:   $vgpr9 = COPY [[UV9]](s32)
822  ; CHECK:   $vgpr10 = COPY [[UV10]](s32)
823  ; CHECK:   $vgpr11 = COPY [[UV11]](s32)
824  ; CHECK:   $vgpr12 = COPY [[UV12]](s32)
825  ; CHECK:   $vgpr13 = COPY [[UV13]](s32)
826  ; CHECK:   $vgpr14 = COPY [[UV14]](s32)
827  ; CHECK:   $vgpr15 = COPY [[UV15]](s32)
828  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
829  %soffset = add i32 %soffset.base, 4036
830  %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
831  ret <16 x float> %val
832}
833
834; Waterfall loop due to resource being VGPR
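; The loop in bb.2 reads each 64-bit half of the descriptor back with V_READFIRSTLANE_B32,
; compares it against the VGPR value, and uses S_AND_SAVEEXEC_B64 to run the load for the
; matching lanes, repeating until every lane has been handled.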
835define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %soffset) {
836  ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc
837  ; CHECK: bb.1 (%ir-block.0):
838  ; CHECK:   liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
839  ; CHECK:   [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
840  ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
841  ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
842  ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
843  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
844  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
845  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32)
846  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
847  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
848  ; CHECK:   [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
849  ; CHECK:   [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
850  ; CHECK:   [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
851  ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
852  ; CHECK: bb.2:
853  ; CHECK:   successors: %bb.3, %bb.2
854  ; CHECK:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.2
855  ; CHECK:   [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %8(s32), %bb.2
856  ; CHECK:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
857  ; CHECK:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
858  ; CHECK:   [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
859  ; CHECK:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
860  ; CHECK:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec
861  ; CHECK:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec
862  ; CHECK:   [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
863  ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
864  ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
865  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
866  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 4)
867  ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
868  ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
869  ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
870  ; CHECK: bb.3:
871  ; CHECK:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
872  ; CHECK: bb.4:
873  ; CHECK:   $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
874  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
875  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
876  ret float %val
877}
878
879; Use the offset inside the waterfall loop
880define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %rsrc, i32 inreg %soffset.base) {
881  ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4092
882  ; CHECK: bb.1 (%ir-block.0):
883  ; CHECK:   liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
884  ; CHECK:   [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
885  ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
886  ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
887  ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
888  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
889  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
890  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092
891  ; CHECK:   [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
892  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
893  ; CHECK:   [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
894  ; CHECK:   [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
895  ; CHECK:   [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
896  ; CHECK:   [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
897  ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
898  ; CHECK: bb.2:
899  ; CHECK:   successors: %bb.3, %bb.2
900  ; CHECK:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %18, %bb.2
901  ; CHECK:   [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %10(s32), %bb.2
902  ; CHECK:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
903  ; CHECK:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
904  ; CHECK:   [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
905  ; CHECK:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
906  ; CHECK:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec
907  ; CHECK:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec
908  ; CHECK:   [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
909  ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
910  ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
911  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
912  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load 4)
913  ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
914  ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
915  ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
916  ; CHECK: bb.3:
917  ; CHECK:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
918  ; CHECK: bb.4:
919  ; CHECK:   $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
920  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
921  %soffset = add i32 %soffset.base, 4092
922  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
923  ret float %val
924}
925
926; Scalar offset exceeds MUBUF limit, keep add out of the loop
927define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %rsrc, i32 inreg %soffset.base) {
928  ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4096
929  ; CHECK: bb.1 (%ir-block.0):
930  ; CHECK:   liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
931  ; CHECK:   [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
932  ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
933  ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
934  ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
935  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
936  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
937  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
938  ; CHECK:   [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
939  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
940  ; CHECK:   [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
941  ; CHECK:   [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
942  ; CHECK:   [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
943  ; CHECK:   [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
944  ; CHECK:   [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
945  ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
946  ; CHECK: bb.2:
947  ; CHECK:   successors: %bb.3, %bb.2
948  ; CHECK:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %19, %bb.2
949  ; CHECK:   [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %10(s32), %bb.2
950  ; CHECK:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
951  ; CHECK:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
952  ; CHECK:   [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
953  ; CHECK:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
954  ; CHECK:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec
955  ; CHECK:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec
956  ; CHECK:   [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
957  ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
958  ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
959  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
960  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load 4)
961  ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
962  ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
963  ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
964  ; CHECK: bb.3:
965  ; CHECK:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
966  ; CHECK: bb.4:
967  ; CHECK:   $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
968  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
969  %soffset = add i32 %soffset.base, 4096
970  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
971  ret float %val
972}
973
974; Waterfall loop, but constant offset
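; 4095 still fits the MUBUF immediate, so it is folded directly into the load.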
975define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) {
976  ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4095
977  ; CHECK: bb.1 (%ir-block.0):
978  ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
979  ; CHECK:   [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
980  ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
981  ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
982  ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
983  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
984  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
985  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
986  ; CHECK:   [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
987  ; CHECK:   [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
988  ; CHECK:   [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
989  ; CHECK:   [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
990  ; CHECK:   [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
991  ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
992  ; CHECK: bb.2:
993  ; CHECK:   successors: %bb.3, %bb.2
994  ; CHECK:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.2
995  ; CHECK:   [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %7(s32), %bb.2
996  ; CHECK:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
997  ; CHECK:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
998  ; CHECK:   [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
999  ; CHECK:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
1000  ; CHECK:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec
1001  ; CHECK:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec
1002  ; CHECK:   [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1003  ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
1004  ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
1005  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1006  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load 4 + 4095, align 1)
1007  ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
1008  ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1009  ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
1010  ; CHECK: bb.3:
1011  ; CHECK:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
1012  ; CHECK: bb.4:
1013  ; CHECK:   $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1014  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
1015  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 0)
1016  ret float %val
1017}
1018
1019; Waterfall loop, but constant offset
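; 4096 exceeds the MUBUF immediate limit, so the constant is materialized in a VGPR instead.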
1020define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) {
1021  ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4096
1022  ; CHECK: bb.1 (%ir-block.0):
1023  ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
1024  ; CHECK:   [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1025  ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1026  ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1027  ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1028  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1029  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
1030  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1031  ; CHECK:   [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1032  ; CHECK:   [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1033  ; CHECK:   [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
1034  ; CHECK:   [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
1035  ; CHECK:   [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1036  ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
1037  ; CHECK: bb.2:
1038  ; CHECK:   successors: %bb.3, %bb.2
1039  ; CHECK:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.2
1040  ; CHECK:   [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %7(s32), %bb.2
1041  ; CHECK:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
1042  ; CHECK:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
1043  ; CHECK:   [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
1044  ; CHECK:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
1045  ; CHECK:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec
1046  ; CHECK:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec
1047  ; CHECK:   [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1048  ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
1049  ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
1050  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1051  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load 4)
1052  ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
1053  ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1054  ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
1055  ; CHECK: bb.3:
1056  ; CHECK:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
1057  ; CHECK: bb.4:
1058  ; CHECK:   $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1059  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
1060  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 0)
1061  ret float %val
1062}
1063
1064; Need a waterfall loop, but the offset is scalar.
1065; Make sure the base offset is added to each split load.
1066define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %rsrc, i32 inreg %soffset.base) {
1067  ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4064
1068  ; CHECK: bb.1 (%ir-block.0):
1069  ; CHECK:   liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
1070  ; CHECK:   [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1071  ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1072  ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1073  ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1074  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1075  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1076  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
1077  ; CHECK:   [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
1078  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1079  ; CHECK:   [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1080  ; CHECK:   [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
1081  ; CHECK:   [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
1082  ; CHECK:   [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
1083  ; CHECK:   [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1084  ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
1085  ; CHECK: bb.2:
1086  ; CHECK:   successors: %bb.3, %bb.2
1087  ; CHECK:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
1088  ; CHECK:   [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
1089  ; CHECK:   [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
1090  ; CHECK:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
1091  ; CHECK:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
1092  ; CHECK:   [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
1093  ; CHECK:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
1094  ; CHECK:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec
1095  ; CHECK:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec
1096  ; CHECK:   [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1097  ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
1098  ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
1099  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1100  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load 16, align 4)
1101  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load 16, align 4)
1102  ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
1103  ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1104  ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
1105  ; CHECK: bb.3:
1106  ; CHECK:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
1107  ; CHECK: bb.4:
1108  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
1109  ; CHECK:   [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
1110  ; CHECK:   $vgpr0 = COPY [[UV2]](s32)
1111  ; CHECK:   $vgpr1 = COPY [[UV3]](s32)
1112  ; CHECK:   $vgpr2 = COPY [[UV4]](s32)
1113  ; CHECK:   $vgpr3 = COPY [[UV5]](s32)
1114  ; CHECK:   $vgpr4 = COPY [[UV6]](s32)
1115  ; CHECK:   $vgpr5 = COPY [[UV7]](s32)
1116  ; CHECK:   $vgpr6 = COPY [[UV8]](s32)
1117  ; CHECK:   $vgpr7 = COPY [[UV9]](s32)
1118  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
1119  %soffset = add i32 %soffset.base, 4064
1120  %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1121  ret <8 x float> %val
1122}
1123
1124; Need a waterfall loop, but the offset is scalar.
1125; Make sure the maximum offset isn't exceeded when splitting this load
1126define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %rsrc, i32 inreg %soffset.base) {
1127  ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4068
1128  ; CHECK: bb.1 (%ir-block.0):
1129  ; CHECK:   liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
1130  ; CHECK:   [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1131  ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1132  ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1133  ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1134  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1135  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1136  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068
1137  ; CHECK:   [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
1138  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
1139  ; CHECK:   [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1140  ; CHECK:   [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1141  ; CHECK:   [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
1142  ; CHECK:   [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
1143  ; CHECK:   [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
1144  ; CHECK:   [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1145  ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
1146  ; CHECK: bb.2:
1147  ; CHECK:   successors: %bb.3, %bb.2
1148  ; CHECK:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2
1149  ; CHECK:   [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
1150  ; CHECK:   [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2
1151  ; CHECK:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
1152  ; CHECK:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
1153  ; CHECK:   [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
1154  ; CHECK:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
1155  ; CHECK:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec
1156  ; CHECK:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec
1157  ; CHECK:   [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1158  ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
1159  ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
1160  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1161  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
1162  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
1163  ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
1164  ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1165  ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
1166  ; CHECK: bb.3:
1167  ; CHECK:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
1168  ; CHECK: bb.4:
1169  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
1170  ; CHECK:   [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
1171  ; CHECK:   $vgpr0 = COPY [[UV2]](s32)
1172  ; CHECK:   $vgpr1 = COPY [[UV3]](s32)
1173  ; CHECK:   $vgpr2 = COPY [[UV4]](s32)
1174  ; CHECK:   $vgpr3 = COPY [[UV5]](s32)
1175  ; CHECK:   $vgpr4 = COPY [[UV6]](s32)
1176  ; CHECK:   $vgpr5 = COPY [[UV7]](s32)
1177  ; CHECK:   $vgpr6 = COPY [[UV8]](s32)
1178  ; CHECK:   $vgpr7 = COPY [[UV9]](s32)
1179  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
1180  %soffset = add i32 %soffset.base, 4068
1181  %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1182  ret <8 x float> %val
1183}
1184
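; Waterfall loop for the VGPR descriptor; the scalar add of 4096 stays outside the loop,
; its result is copied to a VGPR, and the two halves use immediates 0 and 16.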
1185define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %rsrc, i32 inreg %soffset.base) {
1186  ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4096
1187  ; CHECK: bb.1 (%ir-block.0):
1188  ; CHECK:   liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
1189  ; CHECK:   [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1190  ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1191  ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1192  ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1193  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1194  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1195  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
1196  ; CHECK:   [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
1197  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
1198  ; CHECK:   [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1199  ; CHECK:   [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1200  ; CHECK:   [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
1201  ; CHECK:   [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
1202  ; CHECK:   [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
1203  ; CHECK:   [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1204  ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
1205  ; CHECK: bb.2:
1206  ; CHECK:   successors: %bb.3, %bb.2
1207  ; CHECK:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2
1208  ; CHECK:   [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
1209  ; CHECK:   [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2
1210  ; CHECK:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
1211  ; CHECK:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
1212  ; CHECK:   [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
1213  ; CHECK:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
1214  ; CHECK:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec
1215  ; CHECK:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec
1216  ; CHECK:   [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1217  ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
1218  ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
1219  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1220  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
1221  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
1222  ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
1223  ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1224  ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
1225  ; CHECK: bb.3:
1226  ; CHECK:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
1227  ; CHECK: bb.4:
1228  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
1229  ; CHECK:   [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
1230  ; CHECK:   $vgpr0 = COPY [[UV2]](s32)
1231  ; CHECK:   $vgpr1 = COPY [[UV3]](s32)
1232  ; CHECK:   $vgpr2 = COPY [[UV4]](s32)
1233  ; CHECK:   $vgpr3 = COPY [[UV5]](s32)
1234  ; CHECK:   $vgpr4 = COPY [[UV6]](s32)
1235  ; CHECK:   $vgpr5 = COPY [[UV7]](s32)
1236  ; CHECK:   $vgpr6 = COPY [[UV8]](s32)
1237  ; CHECK:   $vgpr7 = COPY [[UV9]](s32)
1238  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
1239  %soffset = add i32 %soffset.base, 4096
1240  %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1241  ret <8 x float> %val
1242}
1243
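; Both the descriptor and the offset are in VGPRs; the 5000 constant is kept as a separate
; offset operand and the two halves use immediates 0 and 16.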
1244define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000(<4 x i32> %rsrc, i32 %offset.base) {
1245  ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
1246  ; CHECK: bb.1 (%ir-block.0):
1247  ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
1248  ; CHECK:   [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1249  ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1250  ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1251  ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1252  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
1253  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1254  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5000
1255  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1256  ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1257  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1258  ; CHECK:   [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
1259  ; CHECK:   [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
1260  ; CHECK:   [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
1261  ; CHECK:   [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1262  ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
1263  ; CHECK: bb.2:
1264  ; CHECK:   successors: %bb.3, %bb.2
1265  ; CHECK:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
1266  ; CHECK:   [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
1267  ; CHECK:   [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
1268  ; CHECK:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
1269  ; CHECK:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
1270  ; CHECK:   [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
1271  ; CHECK:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
1272  ; CHECK:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec
1273  ; CHECK:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec
1274  ; CHECK:   [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1275  ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
1276  ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
1277  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1278  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
1279  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
1280  ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
1281  ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1282  ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
1283  ; CHECK: bb.3:
1284  ; CHECK:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
1285  ; CHECK: bb.4:
1286  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
1287  ; CHECK:   [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
1288  ; CHECK:   $vgpr0 = COPY [[UV2]](s32)
1289  ; CHECK:   $vgpr1 = COPY [[UV3]](s32)
1290  ; CHECK:   $vgpr2 = COPY [[UV4]](s32)
1291  ; CHECK:   $vgpr3 = COPY [[UV5]](s32)
1292  ; CHECK:   $vgpr4 = COPY [[UV6]](s32)
1293  ; CHECK:   $vgpr5 = COPY [[UV7]](s32)
1294  ; CHECK:   $vgpr6 = COPY [[UV8]](s32)
1295  ; CHECK:   $vgpr7 = COPY [[UV9]](s32)
1296  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
1297  %soffset = add i32 %offset.base, 5000
1298  %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1299  ret <8 x float> %val
1300}
1301
1302define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076(<4 x i32> %rsrc, i32 %offset.base) {
1303  ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
1304  ; CHECK: bb.1 (%ir-block.0):
1305  ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
1306  ; CHECK:   [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1307  ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1308  ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1309  ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1310  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
1311  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1312  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4076
1313  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1314  ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1315  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1316  ; CHECK:   [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
1317  ; CHECK:   [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
1318  ; CHECK:   [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
1319  ; CHECK:   [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1320  ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
1321  ; CHECK: bb.2:
1322  ; CHECK:   successors: %bb.3, %bb.2
1323  ; CHECK:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
1324  ; CHECK:   [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
1325  ; CHECK:   [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
1326  ; CHECK:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
1327  ; CHECK:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
1328  ; CHECK:   [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
1329  ; CHECK:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
1330  ; CHECK:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec
1331  ; CHECK:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec
1332  ; CHECK:   [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1333  ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
1334  ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
1335  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1336  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
1337  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
1338  ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
1339  ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1340  ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
1341  ; CHECK: bb.3:
1342  ; CHECK:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
1343  ; CHECK: bb.4:
1344  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
1345  ; CHECK:   [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
1346  ; CHECK:   $vgpr0 = COPY [[UV2]](s32)
1347  ; CHECK:   $vgpr1 = COPY [[UV3]](s32)
1348  ; CHECK:   $vgpr2 = COPY [[UV4]](s32)
1349  ; CHECK:   $vgpr3 = COPY [[UV5]](s32)
1350  ; CHECK:   $vgpr4 = COPY [[UV6]](s32)
1351  ; CHECK:   $vgpr5 = COPY [[UV7]](s32)
1352  ; CHECK:   $vgpr6 = COPY [[UV8]](s32)
1353  ; CHECK:   $vgpr7 = COPY [[UV9]](s32)
1354  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
1355  %soffset = add i32 %offset.base, 4076
1356  %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1357  ret <8 x float> %val
1358}
1359
1360define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080(<4 x i32> %rsrc, i32 %offset.base) {
1361  ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
1362  ; CHECK: bb.1 (%ir-block.0):
1363  ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
1364  ; CHECK:   [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1365  ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1366  ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1367  ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1368  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
1369  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4080
  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
  ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK:   [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
  ; CHECK:   [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
  ; CHECK:   [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
  ; CHECK:   [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
  ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
  ; CHECK: bb.2:
  ; CHECK:   successors: %bb.3, %bb.2
  ; CHECK:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
  ; CHECK:   [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
  ; CHECK:   [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
  ; CHECK:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
  ; CHECK:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
  ; CHECK:   [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
  ; CHECK:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
  ; CHECK:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec
  ; CHECK:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec
  ; CHECK:   [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
  ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
  ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4)
  ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
  ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
  ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
  ; CHECK: bb.3:
  ; CHECK:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
  ; CHECK: bb.4:
  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
  ; CHECK:   [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
  ; CHECK:   $vgpr0 = COPY [[UV2]](s32)
  ; CHECK:   $vgpr1 = COPY [[UV3]](s32)
  ; CHECK:   $vgpr2 = COPY [[UV4]](s32)
  ; CHECK:   $vgpr3 = COPY [[UV5]](s32)
  ; CHECK:   $vgpr4 = COPY [[UV6]](s32)
  ; CHECK:   $vgpr5 = COPY [[UV7]](s32)
  ; CHECK:   $vgpr6 = COPY [[UV8]](s32)
  ; CHECK:   $vgpr7 = COPY [[UV9]](s32)
  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
  %soffset = add i32 %offset.base, 4080
  %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <8 x float> %val
}

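; The all-VGPR rsrc requires a waterfall loop over readfirstlanes of the descriptor; the constant 4064 offset still folds into the immediate offset of both halves of the split load.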
define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064(<4 x i32> %rsrc, i32 %offset.base) {
  ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
  ; CHECK:   [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
  ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
  ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK:   [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK:   [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK:   [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
  ; CHECK:   [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
  ; CHECK:   [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
  ; CHECK:   [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
  ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
  ; CHECK: bb.2:
  ; CHECK:   successors: %bb.3, %bb.2
  ; CHECK:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
  ; CHECK:   [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
  ; CHECK:   [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
  ; CHECK:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
  ; CHECK:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
  ; CHECK:   [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
  ; CHECK:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
  ; CHECK:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec
  ; CHECK:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec
  ; CHECK:   [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
  ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
  ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load 16 + 4064, align 4)
  ; CHECK:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load 16 + 4064, align 4)
  ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
  ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
  ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
  ; CHECK: bb.3:
  ; CHECK:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
  ; CHECK: bb.4:
  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
  ; CHECK:   [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
  ; CHECK:   $vgpr0 = COPY [[UV2]](s32)
  ; CHECK:   $vgpr1 = COPY [[UV3]](s32)
  ; CHECK:   $vgpr2 = COPY [[UV4]](s32)
  ; CHECK:   $vgpr3 = COPY [[UV5]](s32)
  ; CHECK:   $vgpr4 = COPY [[UV6]](s32)
  ; CHECK:   $vgpr5 = COPY [[UV7]](s32)
  ; CHECK:   $vgpr6 = COPY [[UV8]](s32)
  ; CHECK:   $vgpr7 = COPY [[UV9]](s32)
  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
  %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 4064, i32 0)
  ret <8 x float> %val
}

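; An offset formed by adding a VGPR and an SGPR value folds into the voffset and soffset operands of the buffer load.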
define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
  ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
  ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
  ; CHECK:   [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load 4)
  ; CHECK:   $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
  %offset = add i32 %offset.v, %offset.s
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret float %val
}

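; Same as above with the add operands in the opposite order.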
define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
  ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
  ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
  ; CHECK:   [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load 4)
  ; CHECK:   $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
  %offset = add i32 %offset.s, %offset.v
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret float %val
}

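; The trailing 1024 immediate folds into the instruction offset field; the VGPR + SGPR sum remains in the voffset operand.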
define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
  ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
  ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
  ; CHECK:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
  ; CHECK:   [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
  ; CHECK:   [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK:   [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load 4)
  ; CHECK:   $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
  %offset.base = add i32 %offset.v, %offset.s
  %offset = add i32 %offset.base, 1024
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret float %val
}

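; Same as above with the base add operands in the opposite order.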
define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
  ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
  ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
  ; CHECK:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
  ; CHECK:   [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
  ; CHECK:   [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK:   [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load 4)
  ; CHECK:   $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
  %offset.base = add i32 %offset.s, %offset.v
  %offset = add i32 %offset.base, 1024
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret float %val
}

; TODO: Ideally the adds would be reassociated so the 1024 constant could fold into the immediate offset.
define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
  ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
  ; CHECK:   [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY5]], [[C]]
  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
  ; CHECK:   [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[ADD]], 0, 0, 0 :: (dereferenceable invariant load 4)
  ; CHECK:   $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
  %offset.base = add i32 %offset.s, 1024
  %offset = add i32 %offset.base, %offset.v
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret float %val
}

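; Here the 1024 is added to the VGPR operand first, so it stays in the voffset add instead of folding into the immediate offset.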
define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
  ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr
  ; CHECK: bb.1 (%ir-block.0):
  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
  ; CHECK:   [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
  ; CHECK:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
  ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
  ; CHECK:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
  ; CHECK:   [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
  ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load 4)
  ; CHECK:   $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
  ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
  %offset.base = add i32 %offset.v, 1024
  %offset = add i32 %offset.base, %offset.s
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret float %val
}

declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg)
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg)
declare <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32>, i32, i32 immarg)
declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32 immarg)
declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32 immarg)
declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32 immarg)

declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg)
declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32 immarg)
declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32>, i32, i32 immarg)
declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32 immarg)
declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32>, i32, i32 immarg)
declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32>, i32, i32 immarg)

declare i96 @llvm.amdgcn.s.buffer.load.i96(<4 x i32>, i32, i32 immarg)
declare i256 @llvm.amdgcn.s.buffer.load.i256(<4 x i32>, i32, i32 immarg)
declare i512 @llvm.amdgcn.s.buffer.load.i512(<4 x i32>, i32, i32 immarg)

declare <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32>, i32, i32 immarg)
declare <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32>, i32, i32 immarg)

declare <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32>, i32, i32 immarg)
declare <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32>, i32, i32 immarg)

declare <4 x i8 addrspace(1)*> @llvm.amdgcn.s.buffer.load.v4p1i8(<4 x i32>, i32, i32 immarg)
declare <8 x i8 addrspace(1)*> @llvm.amdgcn.s.buffer.load.v8p1i8(<4 x i32>, i32, i32 immarg)