1; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
2; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3
4declare i64 @_Z13get_global_idj(i32)
5
; Eight i64 loads from one common base, spaced 256 elements (2048 bytes) apart.
; GFX8 flat loads have no immediate offset field, so every address lives in
; VGPRs; on GFX9 the constant byte offsets are expected to be folded into the
; signed immediate of global_load (with the base re-anchored where an offset
; would otherwise not fit in the immediate field).
define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)*  %buffer) {
; GCN-LABEL: clmem_read_simplified:
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
entry:
  ; %conv = low 8 bits of the global id (per-lane element index, 0..255).
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  ; Block base: (id << 7) masked to a 32-bit range, used as a byte offset.
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  ; Loads at element offsets 0, 256, 512, ..., 1792 from %addr1
  ; (byte offsets 0, 2048, ..., 14336), accumulated into %add.7.
  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add.1 = add i64 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %load3, %add.1
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %load5, %add.3
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  ; Store the sum so none of the loads are dead.
  store i64 %add.7, i64 addrspace(1)* %saddr, align 8
  ret void
}
63
; Loop-shaped variant: an outer while loop (127 iterations) around an inner
; for loop whose body issues eleven i64 loads per iteration, at element
; offsets 0, 256, 512, ..., 2560 from a common per-iteration base
; (byte offsets 0 .. 20480, i.e. spanning more than one 4 KiB immediate range).
; GFX9 is expected to fold the constant parts into global_load immediates,
; splitting the loads across a small number of re-anchored base addresses.
define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)*  %buffer) {
; GCN-LABEL: clmem_read:
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
entry:
  ; Per-lane base: low 8 bits of the id index into a block selected by
  ; (id << 17) masked to a 32-bit range.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 17
  %idx.ext11 = and i64 %a0, 4261412864
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %a1 = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
  %add.ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %a1, i64 %conv
  br label %for.cond.preheader

while.cond.loopexit:                              ; preds = %for.body
  ; Outer counter counts down from 127; exit when it reaches zero.
  %dec = add nsw i32 %dec31, -1
  %tobool = icmp eq i32 %dec31, 0
  br i1 %tobool, label %while.end, label %for.cond.preheader

for.cond.preheader:                               ; preds = %entry, %while.cond.loopexit
  %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ]
  %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ]
  br label %for.body

for.body:                                         ; preds = %for.body, %for.cond.preheader
  ; %block.029 advances by 8192 elements per iteration; each constant offset
  ; below is OR'd in (the low 13 bits of %block.029 are known zero), so they
  ; behave exactly like adds of 256, 512, ..., 2560 elements.
  %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ]
  %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ]
  %conv3 = zext i32 %block.029 to i64
  %add.ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3
  %load1 = load i64, i64 addrspace(1)* %add.ptr8, align 8
  %add = add i64 %load1, %sum.128

  %add9 = or i32 %block.029, 256
  %conv3.1 = zext i32 %add9 to i64
  %add.ptr8.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.1
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.1, align 8
  %add.1 = add i64 %load2, %add

  %add9.1 = or i32 %block.029, 512
  %conv3.2 = zext i32 %add9.1 to i64
  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.2
  %l3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %l3, %add.1

  %add9.2 = or i32 %block.029, 768
  %conv3.3 = zext i32 %add9.2 to i64
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.3
  %l4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %l4, %add.2

  %add9.3 = or i32 %block.029, 1024
  %conv3.4 = zext i32 %add9.3 to i64
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.4
  %l5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %l5, %add.3

  %add9.4 = or i32 %block.029, 1280
  %conv3.5 = zext i32 %add9.4 to i64
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.5
  %l6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %l6, %add.4

  %add9.5 = or i32 %block.029, 1536
  %conv3.6 = zext i32 %add9.5 to i64
  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.6
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5

  %add9.6 = or i32 %block.029, 1792
  %conv3.7 = zext i32 %add9.6 to i64
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.7
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  %add9.7 = or i32 %block.029, 2048
  %conv3.8 = zext i32 %add9.7 to i64
  %add.ptr8.8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.8
  %load9 = load i64, i64 addrspace(1)* %add.ptr8.8, align 8
  %add.8 = add i64 %load9, %add.7

  %add9.8 = or i32 %block.029, 2304
  %conv3.9 = zext i32 %add9.8 to i64
  %add.ptr8.9 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.9
  %load10 = load i64, i64 addrspace(1)* %add.ptr8.9, align 8
  %add.9 = add i64 %load10, %add.8

  %add9.9 = or i32 %block.029, 2560
  %conv3.10 = zext i32 %add9.9 to i64
  %add.ptr8.10 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.10
  %load11 = load i64, i64 addrspace(1)* %add.ptr8.10, align 8
  %add.10 = add i64 %load11, %add.9

  ; Inner loop runs while the block index stays below 4194304 elements.
  %add9.31 = add nuw nsw i32 %block.029, 8192
  %cmp.31 = icmp ult i32 %add9.31, 4194304
  br i1 %cmp.31, label %for.body, label %while.cond.loopexit

while.end:                                        ; preds = %while.cond.loopexit
  store i64 %add.10, i64 addrspace(1)* %a1, align 8
  ret void
}
185
; Uses 32-bit (dword) loads, so constant offsets are scaled by 4 bytes.
; i32 (dword) variant: ten loads at element offsets 0, 256, ..., 2304 from a
; common base, i.e. byte offsets 0, 1024, ..., 9216 — crossing the 4 KiB
; boundary, so GFX9 must re-anchor the base partway through the sequence while
; folding the remaining deltas into global_load immediates.
define amdgpu_kernel void @Address32(i8 addrspace(1)*  %buffer) {
; GCN-LABEL: Address32:
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
;
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-3072
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
entry:
   ; Same addressing scheme as clmem_read_simplified, with i32 elements.
   %call = tail call i64 @_Z13get_global_idj(i32 0)
   %conv = and i64 %call, 255
   %id = shl i64 %call, 7
   %idx.ext11 = and i64 %id, 4294934528
   %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
   %addr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

   %add.ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %addr, i64 %conv
   %load1 = load i32, i32 addrspace(1)* %add.ptr6, align 4

   %add.ptr8.1 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 256
   %load2 = load i32, i32 addrspace(1)* %add.ptr8.1, align 4
   %add.1 = add i32 %load2, %load1

   %add.ptr8.2 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 512
   %load3 = load i32, i32 addrspace(1)* %add.ptr8.2, align 4
   %add.2 = add i32 %load3, %add.1

   %add.ptr8.3 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 768
   %load4 = load i32, i32 addrspace(1)* %add.ptr8.3, align 4
   %add.3 = add i32 %load4, %add.2

   %add.ptr8.4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1024
   %load5 = load i32, i32 addrspace(1)* %add.ptr8.4, align 4
   %add.4 = add i32 %load5, %add.3

   %add.ptr8.5 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1280
   %load6 = load i32, i32 addrspace(1)* %add.ptr8.5, align 4
   %add.5 = add i32 %load6, %add.4

   %add.ptr8.6 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1536
   %load7 = load i32, i32 addrspace(1)* %add.ptr8.6, align 4
   %add.6 = add i32 %load7, %add.5

   %add.ptr8.7 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1792
   %load8 = load i32, i32 addrspace(1)* %add.ptr8.7, align 4
   %add.7 = add i32 %load8, %add.6

   %add.ptr8.8 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2048
   %load9 = load i32, i32 addrspace(1)* %add.ptr8.8, align 4
   %add.8 = add i32 %load9, %add.7

   %add.ptr8.9 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2304
   %load10 = load i32, i32 addrspace(1)* %add.ptr8.9, align 4
   %add.9 = add i32 %load10, %add.8

   ; Keep all loads live.
   store i32 %add.9, i32 addrspace(1)* %addr, align 4
   ret void
}
260
; Offsets near 2^32 bytes: the three constant GEPs below are 536870400,
; 536870656 and 536870912 i64 elements, i.e. byte offsets 2^32-4096, 2^32-2048
; and exactly 2^32. GFX9 is expected to anchor one base at +2^32 and express
; the others as -4096 / -2048 immediates relative to it.
define amdgpu_kernel void @Offset64(i8 addrspace(1)*  %buffer) {
; GCN-LABEL: Offset64:
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  ; +2^32 - 4096 bytes
  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870400
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8

  %add1 = add i64 %load2, %load1

  ; +2^32 - 2048 bytes
  %addr3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870656
  %load3 = load i64, i64 addrspace(1)* %addr3, align 8

  %add2 = add i64 %load3, %add1

  ; +2^32 bytes exactly
  %addr4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870912
  %load4 = load i64, i64 addrspace(1)* %addr4, align 8
  %add4 = add i64 %load4, %add2

  store i64 %add4, i64 addrspace(1)* %saddr, align 8
  ret void
}
300
301; TODO: Support load4 as anchor instruction.
; 32-bit-element twin of Offset64: the same huge element offsets scaled by 4
; bytes (2^31-2048, 2^31-1024, 2^31). The GFX9 checks show only one load gets
; a folded immediate (offset:-1024); the TODO above notes that load4 is not
; yet usable as the anchor for the group.
define amdgpu_kernel void @p32Offset64(i8 addrspace(1)*  %buffer) {
; GCN-LABEL: p32Offset64:
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
;
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  %addr1 = getelementptr inbounds i32, i32 addrspace(1)* %saddr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870400
  %load2 = load i32, i32 addrspace(1)* %addr2, align 8

  %add1 = add i32 %load2, %load1

  %addr3 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870656
  %load3 = load i32, i32 addrspace(1)* %addr3, align 8

  %add2 = add i32 %load3, %add1

  %addr4 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870912
  %load4 = load i32, i32 addrspace(1)* %addr4, align 8
  %add4 = add i32 %load4, %add2

  store i32 %add4, i32 addrspace(1)* %saddr, align 8
  ret void
}
341
; Two distinct base pointers: three loads off %buffer1 (element offsets
; 512/768/1024) and three off %buffer2 (1280/1536/1792). Offset folding must
; anchor each group on its own base and never mix offsets across bases.
define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1,
; GCN-LABEL: DiffBase:
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
                                    i8 addrspace(1)* %buffer2) {
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer1, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  ; Second base derived from %buffer2 with the same block offset.
  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %buffer2, i64 %idx.ext11
  %saddr2 = bitcast i8 addrspace(1)* %add.ptr2 to i64 addrspace(1)*

  ; Group 1: loads off %saddr at byte offsets 4096, 6144, 8192.
  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 512
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 768
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add1 = add i64 %load2, %load1
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 1024
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add2 = add i64 %load3, %add1

  ; Group 2: loads off %saddr2 at byte offsets 10240, 12288, 14336.
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1280
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1536
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add3 = add i64 %load5, %load4

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1792
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add4 = add i64 %load6, %add3

  %add5 = add i64 %add2, %add4

  store i64 %add5, i64 addrspace(1)* %saddr, align 8
  ret void
}
394
; Same eight loads as clmem_read_simplified but issued in decreasing address
; order (1792 down to 256, after the base load): offset folding should still
; recognize the group even when the IR order runs against the address order.
define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
; GCN-LABEL: ReverseOrder:
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  ; Highest offset first (+14336 bytes), then stepping down by 2048 bytes.
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add7 = add i64 %load8, %load1

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add6 = add i64 %load7, %add7

  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add5 = add i64 %load6, %add6

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add4 = add i64 %load5, %add5

  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add3 = add i64 %load4, %add4

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add2 = add i64 %load3, %add3

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add1 = add i64 %load2, %add2

  store i64 %add1, i64 addrspace(1)* %saddr, align 8
  ret void
}
456
; Negative constant offsets: -536870656 and -536870912 i64 elements are byte
; offsets of -(2^32 - 2048) and -(2^32) from the wave base. The two addresses
; differ by 2048 bytes, so GFX9 should anchor one base and fold the delta as
; an offset:2048 immediate on the other load.
define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buffer) {
; GCN-LABEL: negativeoffset:
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0) #2
  %conv = and i64 %call, 255
  %0 = shl i64 %call, 7
  %idx.ext11 = and i64 %0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %buffer_head = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  ; Per-lane pointer inside the block.
  %buffer_wave = getelementptr inbounds i64, i64 addrspace(1)* %buffer_head, i64 %conv

  ; -(2^32) + 2048 bytes
  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870656
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  ; -(2^32) bytes
  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870912
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8


  %add = add i64 %load2, %load1

  store i64 %add, i64 addrspace(1)* %buffer_head, align 8
  ret void
}
486