; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX90A %s

; The file is compiled for four subtargets (gfx803, gfx900, gfx1010, gfx90a).
; All runs share the GCN prefix; the gfx900 and gfx90a runs also share the
; GFX9 prefix and additionally get a run-specific prefix (GFX900 / GFX90A).

; External declaration; the mangled name demangles to
; get_global_id(unsigned int). Each kernel in this file calls it with 0 and
; derives its load addresses from the result.
declare i64 @_Z13get_global_idj(i32)
7
; Straight-line kernel: eight i64 loads from %addr1 at constant element
; offsets 0, 256, ..., 1792 (a 2048-byte stride), summed and stored to %saddr.
; The gfx803 run is expected to emit eight flat_load_dwordx2; the gfx9 and
; gfx1010 runs are expected to emit global_load_dwordx2 where part of the
; constant offset appears as an immediate "offset" operand.
define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)*  %buffer) {
; GCN-LABEL: clmem_read_simplified:
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
;
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}

entry:
  ; Base address: %buffer + ((id << 7) & 0xFFFF8000), then indexed by the low
  ; eight bits of the id as an i64 element index.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add.1 = add i64 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %load3, %add.1
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %load5, %add.3
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  store i64 %add.7, i64 addrspace(1)* %saddr, align 8
  ret void
}
75
; Looped variant of the load pattern. The inner loop (%for.body) steps
; %block.029 from 0 by 8192 while it is below 4194304; each iteration performs
; eleven i64 loads at element index (%block.029 | k*256) for k = 0..10 and
; accumulates them. The outer loop counts %dec31 down from 127 and exits once
; it reaches 0; the final sum is stored to %a1.
; Here the gfx900 and gfx90a runs are checked with their own prefixes
; (GFX900 / GFX90A) because the expected offset sequences differ.
define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)*  %buffer) {
; GCN-LABEL: clmem_read:
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX900:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX900:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX900:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX900:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX900:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX900:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX900:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX900:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX900:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX900:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX900:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
;
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
;
; GFX90A:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX90A:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX90A:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX90A:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX90A:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX90A:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX90A:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX90A:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX90A:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX90A:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX90A:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}

entry:
  ; Base address: %buffer + ((id << 17) & 0xFE000000), then indexed by the low
  ; eight bits of the id as an i64 element index.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 17
  %idx.ext11 = and i64 %a0, 4261412864
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %a1 = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
  %add.ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %a1, i64 %conv
  br label %for.cond.preheader

while.cond.loopexit:                              ; preds = %for.body
  %dec = add nsw i32 %dec31, -1
  %tobool = icmp eq i32 %dec31, 0
  br i1 %tobool, label %while.end, label %for.cond.preheader

for.cond.preheader:                               ; preds = %entry, %while.cond.loopexit
  %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ]
  %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ]
  br label %for.body

for.body:                                         ; preds = %for.body, %for.cond.preheader
  %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ]
  %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ]
  %conv3 = zext i32 %block.029 to i64
  %add.ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3
  %load1 = load i64, i64 addrspace(1)* %add.ptr8, align 8
  %add = add i64 %load1, %sum.128

  ; %block.029 only ever holds multiples of 8192, so each "or" below with a
  ; constant smaller than 8192 produces the same value as an add would.
  %add9 = or i32 %block.029, 256
  %conv3.1 = zext i32 %add9 to i64
  %add.ptr8.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.1
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.1, align 8
  %add.1 = add i64 %load2, %add

  %add9.1 = or i32 %block.029, 512
  %conv3.2 = zext i32 %add9.1 to i64
  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.2
  %l3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %l3, %add.1

  %add9.2 = or i32 %block.029, 768
  %conv3.3 = zext i32 %add9.2 to i64
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.3
  %l4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %l4, %add.2

  %add9.3 = or i32 %block.029, 1024
  %conv3.4 = zext i32 %add9.3 to i64
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.4
  %l5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %l5, %add.3

  %add9.4 = or i32 %block.029, 1280
  %conv3.5 = zext i32 %add9.4 to i64
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.5
  %l6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %l6, %add.4

  %add9.5 = or i32 %block.029, 1536
  %conv3.6 = zext i32 %add9.5 to i64
  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.6
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5

  %add9.6 = or i32 %block.029, 1792
  %conv3.7 = zext i32 %add9.6 to i64
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.7
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  %add9.7 = or i32 %block.029, 2048
  %conv3.8 = zext i32 %add9.7 to i64
  %add.ptr8.8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.8
  %load9 = load i64, i64 addrspace(1)* %add.ptr8.8, align 8
  %add.8 = add i64 %load9, %add.7

  %add9.8 = or i32 %block.029, 2304
  %conv3.9 = zext i32 %add9.8 to i64
  %add.ptr8.9 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.9
  %load10 = load i64, i64 addrspace(1)* %add.ptr8.9, align 8
  %add.9 = add i64 %load10, %add.8

  %add9.9 = or i32 %block.029, 2560
  %conv3.10 = zext i32 %add9.9 to i64
  %add.ptr8.10 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.10
  %load11 = load i64, i64 addrspace(1)* %add.ptr8.10, align 8
  %add.10 = add i64 %load11, %add.9

  %add9.31 = add nuw nsw i32 %block.029, 8192
  %cmp.31 = icmp ult i32 %add9.31, 4194304
  br i1 %cmp.31, label %for.body, label %while.cond.loopexit

while.end:                                        ; preds = %while.cond.loopexit
  store i64 %add.10, i64 addrspace(1)* %a1, align 8
  ret void
}
222
; Using a 32-bit address.
; NOTE(review): the original comment does not say what "32bit address" refers
; to; it presumably means either the 32-bit (i32) accesses or the byte index
; being masked to 32 bits - confirm.
; Ten i32 loads from %add.ptr6 at constant element offsets 0, 256, ..., 2304
; (a 1024-byte stride), summed and stored to %addr.
define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) {
; GCN-LABEL: Address32:
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
;
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
;
; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
entry:
   ; Base address: %buffer + ((id << 7) & 0xFFFF8000), then indexed by the low
   ; eight bits of the id as an i32 element index.
   %call = tail call i64 @_Z13get_global_idj(i32 0)
   %conv = and i64 %call, 255
   %id = shl i64 %call, 7
   %idx.ext11 = and i64 %id, 4294934528
   %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
   %addr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

   %add.ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %addr, i64 %conv
   %load1 = load i32, i32 addrspace(1)* %add.ptr6, align 4

   %add.ptr8.1 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 256
   %load2 = load i32, i32 addrspace(1)* %add.ptr8.1, align 4
   %add.1 = add i32 %load2, %load1

   %add.ptr8.2 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 512
   %load3 = load i32, i32 addrspace(1)* %add.ptr8.2, align 4
   %add.2 = add i32 %load3, %add.1

   %add.ptr8.3 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 768
   %load4 = load i32, i32 addrspace(1)* %add.ptr8.3, align 4
   %add.3 = add i32 %load4, %add.2

   %add.ptr8.4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1024
   %load5 = load i32, i32 addrspace(1)* %add.ptr8.4, align 4
   %add.4 = add i32 %load5, %add.3

   %add.ptr8.5 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1280
   %load6 = load i32, i32 addrspace(1)* %add.ptr8.5, align 4
   %add.5 = add i32 %load6, %add.4

   %add.ptr8.6 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1536
   %load7 = load i32, i32 addrspace(1)* %add.ptr8.6, align 4
   %add.6 = add i32 %load7, %add.5

   %add.ptr8.7 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1792
   %load8 = load i32, i32 addrspace(1)* %add.ptr8.7, align 4
   %add.7 = add i32 %load8, %add.6

   %add.ptr8.8 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2048
   %load9 = load i32, i32 addrspace(1)* %add.ptr8.8, align 4
   %add.8 = add i32 %load9, %add.7

   %add.ptr8.9 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2304
   %load10 = load i32, i32 addrspace(1)* %add.ptr8.9, align 4
   %add.9 = add i32 %load10, %add.8

   store i32 %add.9, i32 addrspace(1)* %addr, align 4
   ret void
}
308
; Four i64 loads from %addr1 at element offsets 0, 536870400, 536870656 and
; 536870912, i.e. byte offsets 0, 2^32-4096, 2^32-2048 and 2^32. The largest
; constant offset does not fit in 32 bits. The sum of the four loads is stored
; to %saddr.
define amdgpu_kernel void @Offset64(i8 addrspace(1)*  %buffer) {
; GCN-LABEL: Offset64:
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
;
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
entry:
  ; Base address: %buffer + ((id << 7) & 0xFFFF8000), then indexed by the low
  ; eight bits of the id as an i64 element index.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870400
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8

  %add1 = add i64 %load2, %load1

  %addr3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870656
  %load3 = load i64, i64 addrspace(1)* %addr3, align 8

  %add2 = add i64 %load3, %add1

  %addr4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870912
  %load4 = load i64, i64 addrspace(1)* %addr4, align 8
  %add4 = add i64 %load4, %add2

  store i64 %add4, i64 addrspace(1)* %saddr, align 8
  ret void
}
353
; TODO: Support load4 as anchor instruction.
; i32 counterpart of the large-offset case: four i32 loads from %addr1 at
; element offsets 0, 536870400, 536870656 and 536870912, i.e. byte offsets 0,
; 2^31-2048, 2^31-1024 and 2^31. The sum is stored to %saddr.
; NOTE(review): the i32 accesses are marked "align 8", but %addr1 is
; %saddr + 4 * (id & 255), which the address arithmetic alone does not make
; 8-byte aligned - confirm this is intended before relying on it.
define amdgpu_kernel void @p32Offset64(i8 addrspace(1)*  %buffer) {
; GCN-LABEL: p32Offset64:
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
;
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
;
; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
entry:
  ; Base address: %buffer + ((id << 7) & 0xFFFF8000), then indexed by the low
  ; eight bits of the id as an i32 element index.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  %addr1 = getelementptr inbounds i32, i32 addrspace(1)* %saddr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870400
  %load2 = load i32, i32 addrspace(1)* %addr2, align 8

  %add1 = add i32 %load2, %load1

  %addr3 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870656
  %load3 = load i32, i32 addrspace(1)* %addr3, align 8

  %add2 = add i32 %load3, %add1

  %addr4 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870912
  %load4 = load i32, i32 addrspace(1)* %addr4, align 8
  %add4 = add i32 %load4, %add2

  store i32 %add4, i32 addrspace(1)* %saddr, align 8
  ret void
}
399
; Loads from two different base pointers: three i64 loads from %saddr
; (derived from %buffer1) at element offsets 512, 768 and 1024, and three from
; %saddr2 (derived from %buffer2) at element offsets 1280, 1536 and 1792. Both
; bases use the same byte index %idx.ext11. The combined sum is stored to
; %saddr.
define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1, i8 addrspace(1)* %buffer2) {
; GCN-LABEL: DiffBase:
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
;
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
entry:
  ; Shared byte index: (id << 7) & 0xFFFF8000, applied to both buffers.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer1, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %buffer2, i64 %idx.ext11
  %saddr2 = bitcast i8 addrspace(1)* %add.ptr2 to i64 addrspace(1)*

  ; Loads based on %buffer1.
  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 512
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 768
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add1 = add i64 %load2, %load1
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 1024
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add2 = add i64 %load3, %add1

  ; Loads based on %buffer2.
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1280
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1536
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add3 = add i64 %load5, %load4

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1792
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add4 = add i64 %load6, %add3

  %add5 = add i64 %add2, %add4

  store i64 %add5, i64 addrspace(1)* %saddr, align 8
  ret void
}
459
; Eight i64 loads from %addr1 at element offsets 0, 256, ..., 1792. After the
; first load at offset 0, the remaining loads appear in the IR in descending
; offset order (1792 down to 256). The sum is stored to %saddr.
define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
; GCN-LABEL: ReverseOrder:
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
;
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
entry:
  ; Base address: %buffer + ((id << 7) & 0xFFFF8000), then indexed by the low
  ; eight bits of the id as an i64 element index.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add7 = add i64 %load8, %load1

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add6 = add i64 %load7, %add7

  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add5 = add i64 %load6, %add6

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add4 = add i64 %load5, %add5

  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add3 = add i64 %load4, %add4

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add2 = add i64 %load3, %add3

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add1 = add i64 %load2, %add2

  store i64 %add1, i64 addrspace(1)* %saddr, align 8
  ret void
}
530
; Two i64 loads from %buffer_wave at negative element offsets -536870656 and
; -536870912, i.e. byte offsets -(2^32-2048) and -2^32. Their sum is stored to
; %buffer_head.
; NOTE(review): the call below references attribute group #2, which is not
; defined in the visible part of this file - confirm it is defined elsewhere
; or drop the reference.
define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buffer) {
; GCN-LABEL: negativeoffset:
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
;
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
entry:
  ; Base address: %buffer + ((id << 7) & 0xFFFF8000), then indexed by the low
  ; eight bits of the id as an i64 element index.
  %call = tail call i64 @_Z13get_global_idj(i32 0) #2
  %conv = and i64 %call, 255
  %0 = shl i64 %call, 7
  %idx.ext11 = and i64 %0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %buffer_head = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %buffer_wave = getelementptr inbounds i64, i64 addrspace(1)* %buffer_head, i64 %conv

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870656
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870912
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8


  %add = add i64 %load2, %load1

  store i64 %add, i64 addrspace(1)* %buffer_head, align 8
  ret void
}
563