// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
// expected-no-diagnostics
#ifndef HEADER
#define HEADER

// Check for the data transfer medium in shared memory used to transfer the reduction list to the first warp.
// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = weak addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]

// Check that the execution mode of all 3 target regions is set to SPMD mode.
// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 0
// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0
// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0

template<typename tx>
tx ftemplate(int n) {
  int a;
  short b;
  tx c;
  float d;
  double e;

  #pragma omp target parallel reduction(+: e)
  {
    e += 5;
  }

  #pragma omp target parallel reduction(^: c) reduction(*: d)
  {
    c ^= 2;
    d *= 33;
  }

  #pragma omp target parallel reduction(|: a) reduction(max: b)
  {
    a |= 1;
    b = 99 > b ? 99 : b;
  }

  return a+b+c+d+e;
}

int bar(int n){
  int a = 0;

  a += ftemplate<char>(n);

  return a;
}

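// The CHECK blocks below trace the SPMD-mode lowering of each target region
// above. Conceptually, every thread works on a private copy of each reduction
// variable, initialized to the identity of its operation, and the runtime then
// folds the partial results back into the original variable. A rough serial
// sketch of that semantics for the first region (plain C++, not the generated
// code; NUM_THREADS is a placeholder thread count):
//
//   constexpr int NUM_THREADS = 128;   // placeholder, not part of the test
//   double reduce_plus_e(double e) {
//     double partial[NUM_THREADS];
//     for (int t = 0; t < NUM_THREADS; ++t) {
//       partial[t] = 0.0;   // identity of '+'
//       partial[t] += 5;    // the region body
//     }
//     for (int t = 0; t < NUM_THREADS; ++t)
//       e += partial[t];    // combine step performed by the runtime on the GPU
//     return e;
//   }
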
// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}(
//
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 true)
//
//
// define internal void [[PFN]](
// CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
// CHECK: [[EV:%.+]] = load double, double* [[E]], align
// CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
// CHECK: store double [[ADD]], double* [[E]], align
// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8*
// CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align
// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
// CHECK: br i1 [[CMP]], label

// CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
// CHECK: [[EV:%.+]] = load double, double* [[E]], align
// CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
// CHECK: store double [[ADD]], double* [[E_IN]], align
// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
// CHECK: br label
//
// CHECK: ret

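// As a rough C-level sketch, the block above corresponds to the following
// epilogue in the outlined parallel function (placeholder names such as loc,
// gtid and original_e are not taken from the IR; the runtime signature follows
// the call checked above):
//
//   void parallel_region_1(double *e) {
//     *e += 5;                                   // region body on the private copy
//     void *red_list[1] = { e };                 // one-element reduction list
//     int ret = __kmpc_nvptx_parallel_reduce_nowait_v2(
//         loc, gtid, /*num_vars=*/1, /*size=*/sizeof(red_list),
//         (void *)red_list, shuffle_and_reduce, inter_warp_copy);
//     if (ret == 1) {                            // this thread publishes the result
//       *original_e += *e;
//       __kmpc_nvptx_end_reduce_nowait(gtid);
//     }
//   }
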
//
// Reduction function
// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
// CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
// CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double*
//
// CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
// CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double*
//
// CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]],
// CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]],
// CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]]
// CHECK: store double [[RES]], double* [[VAR_LHS]],
// CHECK: ret void

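// A plausible C equivalent of the reduction function checked above, which
// folds the second reduction list into the first (variable names are
// illustrative):
//
//   void reduction_func_1(void *lhs_arg, void *rhs_arg) {
//     void **lhs_list = (void **)lhs_arg;
//     void **rhs_list = (void **)rhs_arg;
//     double *lhs = (double *)lhs_list[0];
//     double *rhs = (double *)rhs_list[0];
//     *lhs = *lhs + *rhs;   // '+' combiner for the single list element
//   }
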
//
// Shuffle and reduce function
// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
// CHECK: [[REMOTE_ELT:%.+]] = alloca double
//
// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
//
// CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64*
// CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64*
// CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align
// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
// CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
//
// CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align
// CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
// CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align
//
// Condition to reduce
// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
//
// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
//
// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
//
// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
//
// CHECK: [[DO_REDUCE]]
// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
//
// CHECK: [[REDUCE_ELSE]]
// CHECK: br label {{%?}}[[REDUCE_CONT]]
//
// CHECK: [[REDUCE_CONT]]
// Now check if we should just copy over the remote reduction list
// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// CHECK: [[DO_COPY]]
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
// CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
// CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// CHECK: [[COPY_CONT]]
// CHECK: void

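// The shuffle-and-reduce function checked above does roughly the following
// (helper names such as shuffle_double are invented; the algorithm-version
// comments are a rough reading of the flags):
//
//   void shuffle_and_reduce(void *red_list_arg, short lane_id,
//                           short lane_offset, short alg_version) {
//     void **red_list = (void **)red_list_arg;
//     // Pull the partial value of the lane 'lane_offset' away via a 64-bit shuffle.
//     double remote_e = shuffle_double(*(double *)red_list[0], lane_offset);
//     void *remote_list[1] = { &remote_e };
//     // Reduce on a full-warp round (0), on surviving lanes of a contraction
//     // round (1), or on even lanes of a partial-warp round (2).
//     if (alg_version == 0 ||
//         (alg_version == 1 && lane_id < lane_offset) ||
//         (alg_version == 2 && lane_id % 2 == 0 && lane_offset > 0))
//       reduction_func_1(red_list, remote_list);
//     // In a contraction round the upper lanes adopt the remote value instead.
//     if (alg_version == 1 && lane_id >= lane_offset)
//       *(double *)red_list[0] = remote_e;
//   }
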
//
// Inter warp copy function
// CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
// CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
// CHECK: br label
// CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
// CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
// CHECK: br i1 [[DONE_COPY]], label
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
//
// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
// CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
// CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
// CHECK: br label
// CHECK: ret

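// The inter-warp copy moves the 8-byte double through the 32-slot i32 array in
// shared memory, one 32-bit chunk per loop iteration; a sketch (thread_id,
// barrier and transfer_storage stand in for the corresponding IR constructs):
//
//   void inter_warp_copy(void *red_list_arg, int num_active_warps) {
//     void **red_list = (void **)red_list_arg;
//     int lane_id = thread_id() & 31, warp_id = thread_id() >> 5;
//     int *chunks = (int *)red_list[0];           // the double viewed as 2 x i32
//     for (int cnt = 0; cnt < 2; ++cnt) {
//       barrier();
//       if (lane_id == 0)                         // each warp master publishes a chunk
//         transfer_storage[warp_id] = chunks[cnt];
//       barrier();
//       if (thread_id() < num_active_warps)       // warp 0 gathers one chunk per warp
//         chunks[cnt] = transfer_storage[thread_id()];
//     }
//   }
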
// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}(
//
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 true)
//
//
// define internal void [[PFN1]](
// CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align
// CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align
// CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32
// CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2
// CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
// CHECK: store i8 [[TRUNC]], i8* [[C]], align
// CHECK: [[DV:%.+]] = load float, float* [[D]], align
// CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}}
// CHECK: store float [[MUL]], float* [[D]], align
// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: store i8* [[C]], i8** [[PTR1]], align
// CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8*
// CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align
// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
// CHECK: br i1 [[CMP]], label
// CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
// CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
// CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
// CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32
// CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]]
// CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
// CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align
// CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align
// CHECK: [[DV:%.+]] = load float, float* [[D]], align
// CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
// CHECK: store float [[MUL]], float* [[D_IN]], align
// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
// CHECK: br label
//
// CHECK: ret

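// The second region reduces two variables at once, so the reduction list now
// has two entries, and the 'char' element is sign-extended to i32 around the
// xor and truncated back. A sketch of the epilogue under the same assumptions
// (and placeholder names) as before:
//
//   void parallel_region_2(char *c, float *d) {
//     *c ^= 2;
//     *d *= 33;
//     void *red_list[2] = { c, d };
//     int ret = __kmpc_nvptx_parallel_reduce_nowait_v2(
//         loc, gtid, /*num_vars=*/2, /*size=*/sizeof(red_list),
//         (void *)red_list, shuffle_and_reduce_2, inter_warp_copy_2);
//     if (ret == 1) {
//       *original_c ^= *c;
//       *original_d *= *d;
//       __kmpc_nvptx_end_reduce_nowait(gtid);
//     }
//   }
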
//
// Reduction function
// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
//
// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
//
// CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
// CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float*
//
// CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
// CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float*
//
// CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]],
// CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32
// CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]],
// CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32
// CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
// CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8
// CHECK: store i8 [[RES]], i8* [[VAR1_LHS]],
//
// CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]],
// CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]],
// CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
// CHECK: store float [[RES]], float* [[VAR2_LHS]],
// CHECK: ret void

//
// Shuffle and reduce function
// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
// CHECK: [[REMOTE_ELT1:%.+]] = alloca i8
// CHECK: [[REMOTE_ELT2:%.+]] = alloca float
//
// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
//
// CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32
// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
// CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
// CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8
//
// CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align
// CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
//
// CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
// CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align
// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
// CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
//
// CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align
// CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
// CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
//
// Condition to reduce
// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
//
// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
//
// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
//
// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
//
// CHECK: [[DO_REDUCE]]
// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
//
// CHECK: [[REDUCE_ELSE]]
// CHECK: br label {{%?}}[[REDUCE_CONT]]
//
// CHECK: [[REDUCE_CONT]]
// Now check if we should just copy over the remote reduction list
// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// CHECK: [[DO_COPY]]
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
// CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
//
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
// CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
// CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// CHECK: [[COPY_CONT]]
// CHECK: void

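// The shuffle step for this region has to handle elements narrower than 64
// bits: the i8 value is sign-extended to i32, moved with __kmpc_shuffle_int32
// and truncated back, while the float is shuffled as its i32 bit pattern. A
// sketch of just that part (invented helper names, memcpy used for the
// reinterpretation):
//
//   char shuffle_char(char local_c, short lane_offset, short warp_size) {
//     return (char)__kmpc_shuffle_int32((int)local_c, lane_offset, warp_size);
//   }
//   float shuffle_float(float local_d, short lane_offset, short warp_size) {
//     int bits;
//     memcpy(&bits, &local_d, sizeof(bits));       // bit pattern, not a conversion
//     bits = __kmpc_shuffle_int32(bits, lane_offset, warp_size);
//     float remote_d;
//     memcpy(&remote_d, &bits, sizeof(remote_d));
//     return remote_d;
//   }
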
//
// Inter warp copy function
// CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
//
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
// CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
//
// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
// CHECK: ret

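// Unlike the first region, this inter-warp copy runs one full
// copy/barrier/read round per reduction-list element instead of per 32-bit
// chunk: the i8 element travels through the same i32 shared-memory slot via an
// address-space cast to an i8 pointer, and the float element uses the slot as
// a plain i32. One round for the i8 element, sketched with the same assumed
// helpers as before:
//
//   barrier();
//   if (lane_id == 0)
//     *(volatile char *)&transfer_storage[warp_id] = *(char *)red_list[0];
//   barrier();
//   if (thread_id() < num_active_warps)
//     *(char *)red_list[0] = *(volatile char *)&transfer_storage[thread_id()];
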
// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
//
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 true)
//
//
// define internal void [[PFN2]](
// CHECK: store i32 0, i32* [[A:%.+]], align
// CHECK: store i16 -32768, i16* [[B:%.+]], align
// CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align
// CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1
// CHECK: store i32 [[OR]], i32* [[A]], align
// CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
// CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
// CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]]
// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
//
// CHECK: [[DO_MAX]]
// CHECK: br label {{%?}}[[MAX_CONT:.+]]
//
// CHECK: [[MAX_ELSE]]
// CHECK: [[BV:%.+]] = load i16, i16* [[B]], align
// CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32
// CHECK: br label {{%?}}[[MAX_CONT]]
//
// CHECK: [[MAX_CONT]]
// CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ]
// CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16
// CHECK: store i16 [[TRUNC]], i16* [[B]], align
// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8*
// CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
// CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
// CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
// CHECK: br i1 [[CMP]], label

// CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
// CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
// CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
// CHECK: store i32 [[OR]], i32* [[A_IN]], align
// CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
// CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
// CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
// CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
// CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
//
// CHECK: [[DO_MAX]]
// CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
// CHECK: br label {{%?}}[[MAX_CONT:.+]]
//
// CHECK: [[MAX_ELSE]]
// CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align
// CHECK: br label {{%?}}[[MAX_CONT]]
//
// CHECK: [[MAX_CONT]]
// CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
// CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
// CHECK: br label
//
// CHECK: ret

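// There is no single IR instruction for 'max' here, so both the region body
// and the final combine lower to a compare, a branch and a phi picking the
// larger value. In C terms, the combine checked above is roughly:
//
//   short combine_max(short lhs, short rhs) {
//     return (int)lhs > (int)rhs ? lhs : rhs;   // operands widened to int first
//   }
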
//
// Reduction function
// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
// CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
//
// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
// CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
//
// CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
// CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
//
// CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
// CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
//
// CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
// CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
// CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
// CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
//
// CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
// CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
// CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
// CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
//
// CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
//
// CHECK: [[DO_MAX]]
// CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
// CHECK: br label {{%?}}[[MAX_CONT:.+]]
//
// CHECK: [[MAX_ELSE]]
// CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
// CHECK: br label {{%?}}[[MAX_CONT]]
//
// CHECK: [[MAX_CONT]]
// CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
// CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
// CHECK: ret void

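// A C sketch of this region's reduction function (list element order follows
// the reduction clauses: 'a' first, then 'b'; names are illustrative):
//
//   void reduction_func_3(void *lhs_arg, void *rhs_arg) {
//     void **lhs_list = (void **)lhs_arg, **rhs_list = (void **)rhs_arg;
//     int   *a_lhs = (int *)lhs_list[0];
//     int   *a_rhs = (int *)rhs_list[0];
//     short *b_lhs = (short *)lhs_list[1];
//     short *b_rhs = (short *)rhs_list[1];
//     *a_lhs |= *a_rhs;                                        // '|' combiner
//     *b_lhs = (int)*b_lhs > (int)*b_rhs ? *b_lhs : *b_rhs;    // 'max' combiner
//   }
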
//
// Shuffle and reduce function
// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
// CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
// CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
//
// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
//
// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
// CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
//
// CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
// CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
// CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
//
// CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
// CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
// CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
//
// CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
// CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
// CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
//
// Condition to reduce
// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
//
// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
//
// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
//
// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
//
// CHECK: [[DO_REDUCE]]
// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
//
// CHECK: [[REDUCE_ELSE]]
// CHECK: br label {{%?}}[[REDUCE_CONT]]
//
// CHECK: [[REDUCE_CONT]]
// Now check if we should just copy over the remote reduction list
// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// CHECK: [[DO_COPY]]
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
// CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
//
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
// CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// CHECK: [[COPY_CONT]]
// CHECK: void

//
// Inter warp copy function
// CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
//
// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
//
// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
// CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
// CHECK: ret

#endif