1 // Test target codegen - host bc file has to be created first.
2 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
3 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ
4 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix  PAR
5 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
6 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
7 // RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
8 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
9 // RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
10 // expected-no-diagnostics
11 #ifndef HEADER
12 #define HEADER
13 
14 // CHECK-DAG: [[TEAM1_REDUCE_TY:%.+]] = type { [{{1024|2048}} x double] }
15 // CHECK-DAG: [[TEAM2_REDUCE_TY:%.+]] = type { [{{1024|2048}} x i8], [{{1024|2048}} x float] }
16 // CHECK-DAG: [[TEAM3_REDUCE_TY:%.+]] = type { [{{1024|2048}} x i32], [{{1024|2048}} x i16] }
17 // CHECK-DAG: [[TEAMS_REDUCE_UNION_TY:%.+]] = type { [[TEAM1_REDUCE_TY]] }
18 // SEQ-DAG: [[MAP_TY:%.+]] = type { [128 x i8] }
19 
20 // SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef
21 // SEQ-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1
22 // SEQ-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1
23 // SEQ-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} {{16|8}}
24 // SEQ-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} 16
25 
26 // Check for the data transfer medium in shared memory to transfer the reduction list to the first warp.
27 // CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = weak addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]
28 
29 // Check that the execution mode of 2 target regions is set to Non-SPMD and the 3rd is in SPMD.
30 // CHECK-DAG: {{@__omp_offloading_.+l44}}_exec_mode = weak constant i8 1
31 // CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 1
32 // CHECK-DAG: {{@__omp_offloading_.+l57}}_exec_mode = weak constant i8 0
33 
34 // CHECK-DAG: [[TEAMS_RED_BUFFER:@.+]] = internal global [[TEAMS_REDUCE_UNION_TY]] zeroinitializer
35 
36 template<typename tx>
ftemplate(int n)37 tx ftemplate(int n) {
38   int a;
39   short b;
40   tx c;
41   float d;
42   double e;
43 
44   #pragma omp target
45   #pragma omp teams reduction(+: e)
46   {
47     e += 5;
48   }
49 
50   #pragma omp target
51   #pragma omp teams reduction(^: c) reduction(*: d)
52   {
53     c ^= 2;
54     d *= 33;
55   }
56 
57   #pragma omp target
58   #pragma omp teams reduction(|: a) reduction(max: b)
59   #pragma omp parallel reduction(|: a) reduction(max: b)
60   {
61     a |= 1;
62     b = 99 > b ? 99 : b;
63   }
64 
65   return a+b+c+d+e;
66 }
67 
bar(int n)68 int bar(int n){
69   int a = 0;
70 
71   a += ftemplate<char>(n);
72 
73   return a;
74 }
75 
76   // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l44}}_worker()
77 
78   // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l44]](
79   //
80   // CHECK: {{call|invoke}} void [[T1]]_worker()
81   //
82   // CHECK: call void @__kmpc_kernel_init(
83   // CHECK: call void @__kmpc_kernel_deinit(
84   //
85   // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
86   // CHECK: [[EV:%.+]] = load double, double* [[E]], align
87   // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
88   // CHECK: store double [[ADD]], double* [[E]], align
89   // CHECK: [[GEP1:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
90   // CHECK: [[BC:%.+]] = bitcast double* [[E]] to i8*
91   // CHECK: store i8* [[BC]], i8** [[GEP1]],
92   // CHECK: [[BC_RED_LIST:%.+]] = bitcast [1 x i8*]* [[RED_LIST]] to i8*
93   // CHECK: [[BUF:%.+]] = load i8*, i8** @
94   // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* [[BUF]], i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]])
95   // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
96   // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
97   //
98   // CHECK: [[IFLABEL]]
99   // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
100   // CHECK: [[EV:%.+]] = load double, double* [[E]], align
101   // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
102   // CHECK: store double [[ADD]], double* [[E_IN]], align
103   // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[GTID]])
104   // CHECK: br label %[[EXIT]]
105   //
106   // CHECK: [[EXIT]]
107 
108   //
109   // Reduction function
110   // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
111   // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
112   // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
113   // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double*
114   //
115   // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
116   // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
117   // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double*
118   //
119   // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]],
120   // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]],
121   // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]]
122   // CHECK: store double [[RES]], double* [[VAR_LHS]],
123   // CHECK: ret void
124 
125   //
126   // Shuffle and reduce function
127   // CHECK: define internal void [[SHUFFLE_AND_REDUCE]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
128   // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [1 x i8*], align
129   // CHECK: [[REMOTE_ELT:%.+]] = alloca double
130   //
131   // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
132   // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
133   // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
134   //
135   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
136   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
137   // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
138   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
139   //
140   // CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64*
141   // CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64*
142   // CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align
143   // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
144   // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
145   // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
146   //
147   // CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align
148   // CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
149   // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align
150   //
151   // Condition to reduce
152   // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
153   //
154   // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
155   // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
156   // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
157   //
158   // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
159   // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
160   // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
161   // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
162   // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
163   // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
164   //
165   // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
166   // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
167   // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
168   //
169   // CHECK: [[DO_REDUCE]]
170   // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [1 x i8*]* [[RED_LIST]] to i8*
171   // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [1 x i8*]* [[REMOTE_RED_LIST]] to i8*
172   // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
173   // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
174   //
175   // CHECK: [[REDUCE_ELSE]]
176   // CHECK: br label {{%?}}[[REDUCE_CONT]]
177   //
178   // CHECK: [[REDUCE_CONT]]
179   // Now check if we should just copy over the remote reduction list
180   // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
181   // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
182   // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
183   // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
184   //
185   // CHECK: [[DO_COPY]]
186   // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
187   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
188   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
189   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
190   // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
191   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
192   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
193   // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
194   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
195   //
196   // CHECK: [[COPY_ELSE]]
197   // CHECK: br label {{%?}}[[COPY_CONT]]
198   //
199   // CHECK: [[COPY_CONT]]
200   // CHECK: void
201 
202   //
203   // Inter warp copy function
204   // CHECK: define internal void [[INTER_WARP_COPY]](i8* %0, i32 %1)
205   // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
206   // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
207   // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [1 x i8*]*
208   // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
209   // CHECK: br label
210   // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
211   // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
212   // CHECK: br i1 [[DONE_COPY]], label
213   // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
214   // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
215   // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
216   //
217   // [[DO_COPY]]
218   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
219   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
220   // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
221   // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
222   //
223   // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
224   // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
225   // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
226   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
227   //
228   // CHECK: [[COPY_ELSE]]
229   // CHECK: br label {{%?}}[[COPY_CONT]]
230   //
231   // Barrier after copy to shared memory storage medium.
232   // CHECK: [[COPY_CONT]]
233   // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
234   // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
235   //
236   // Read into warp 0.
237   // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
238   // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
239   //
240   // CHECK: [[DO_READ]]
241   // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
242   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
243   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
244   // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
245   // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
246   // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
247   // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
248   // CHECK: br label {{%?}}[[READ_CONT:.+]]
249   //
250   // CHECK: [[READ_ELSE]]
251   // CHECK: br label {{%?}}[[READ_CONT]]
252   //
253   // CHECK: [[READ_CONT]]
254   // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
255   // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
256   // CHECK: br label
257   // CHECK: ret
258 
259   // CHECK: define internal void [[RED_LIST_TO_GLOBAL_COPY]](i8* %0, i32 %1, i8* %2)
260   // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
261   // CHECK: [[IDX_PTR:%.+]] = alloca i32,
262   // CHECK: [[RL_PTR:%.+]] = alloca i8*,
263   // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
264   // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
265   // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
266   // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
267   // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [1 x i8*]*
268   // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
269   // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]*
270   // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
271   // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
272   // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
273   // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to double*
274   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
275   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
276   // CHECK: [[LOC_RED1:%.+]] = load double, double* [[RL_RED1]],
277   // CHECK: store double [[LOC_RED1]], double* [[GLOBAL_RED1_IDX_PTR]],
278   // CHECK: ret void
279 
280   // CHECK: define internal void [[RED_LIST_TO_GLOBAL_RED]](i8* %0, i32 %1, i8* %2)
281   // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
282   // CHECK: [[IDX_PTR:%.+]] = alloca i32,
283   // CHECK: [[RL_PTR:%.+]] = alloca i8*,
284   // CHECK: [[LOCAL_RL:%.+]] = alloca [1 x i8*],
285   // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
286   // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
287   // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
288   // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
289   // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]*
290   // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
291   // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
292   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
293   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
294   // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast double* [[GLOBAL_RED1_IDX_PTR]] to i8*
295   // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
296   // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [1 x i8*]* [[LOCAL_RL]] to i8*
297   // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
298   // CHECK: call void [[REDUCTION_FUNC]](i8* [[LOCAL_RL_BC]], i8* [[RL_BC]])
299   // CHECK: ret void
300 
301   // CHECK: define internal void [[GLOBAL_TO_RED_LIST_COPY]](i8* %0, i32 %1, i8* %2)
302   // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
303   // CHECK: [[IDX_PTR:%.+]] = alloca i32,
304   // CHECK: [[RL_PTR:%.+]] = alloca i8*,
305   // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
306   // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
307   // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
308   // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
309   // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [1 x i8*]*
310   // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
311   // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]*
312   // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
313   // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
314   // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
315   // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to double*
316   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
317   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
318   // CHECK: [[GLOBAL_RED1:%.+]] = load double, double* [[GLOBAL_RED1_IDX_PTR]],
319   // CHECK: store double [[GLOBAL_RED1]], double* [[RL_RED1]],
320   // CHECK: ret void
321 
322   // CHECK: define internal void [[GLOBAL_TO_RED_LIST_RED]](i8* %0, i32 %1, i8* %2)
323   // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
324   // CHECK: [[IDX_PTR:%.+]] = alloca i32,
325   // CHECK: [[RL_PTR:%.+]] = alloca i8*,
326   // CHECK: [[LOCAL_RL:%.+]] = alloca [1 x i8*],
327   // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
328   // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
329   // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
330   // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
331   // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]*
332   // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
333   // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
334   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
335   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
336   // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast double* [[GLOBAL_RED1_IDX_PTR]] to i8*
337   // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
338   // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [1 x i8*]* [[LOCAL_RL]] to i8*
339   // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
340   // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]])
341   // CHECK: ret void
342 
343   // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l50}}_worker()
344 
345   // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l50]](
346   //
347   // CHECK: {{call|invoke}} void [[T2]]_worker()
348 
349   //
350   // CHECK: call void @__kmpc_kernel_init(
351   // CHECK: call void @__kmpc_kernel_deinit(
352   //
353   // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align
354   // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align
355   // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32
356   // CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2
357   // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
358   // CHECK: store i8 [[TRUNC]], i8* [[C]], align
359   // CHECK: [[DV:%.+]] = load float, float* [[D]], align
360   // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}}
361   // CHECK: store float [[MUL]], float* [[D]], align
362   // CHECK: [[GEP1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
363   // CHECK: store i8* [[C]], i8** [[GEP1]],
364   // CHECK: [[GEP2:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
365   // CHECK: [[BC:%.+]] = bitcast float* [[D]] to i8*
366   // CHECK: store i8* [[BC]], i8** [[GEP2]],
367   // CHECK: [[BC_RED_LIST:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8*
368   // CHECK: [[BUF:%.+]] = load i8*, i8** @
369   // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* [[BUF]], i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]])
370   // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
371   // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
372   //
373   // CHECK: [[IFLABEL]]
374   // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
375   // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
376   // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
377   // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32
378   // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]]
379   // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
380   // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align
381   // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align
382   // CHECK: [[DV:%.+]] = load float, float* [[D]], align
383   // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
384   // CHECK: store float [[MUL]], float* [[D_IN]], align
385   // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[GTID]])
386   // CHECK: br label %[[EXIT]]
387   //
388   // CHECK: [[EXIT]]
389 
390   //
391   // Reduction function
392   // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
393   // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
394   // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
395   //
396   // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
397   // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
398   //
399   // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
400   // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
401   // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float*
402   //
403   // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
404   // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
405   // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float*
406   //
407   // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]],
408   // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32
409   // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]],
410   // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32
411   // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
412   // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8
413   // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]],
414   //
415   // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]],
416   // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]],
417   // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
418   // CHECK: store float [[RES]], float* [[VAR2_LHS]],
419   // CHECK: ret void
420 
421   //
422   // Shuffle and reduce function
423   // CHECK: define internal void [[SHUFFLE_AND_REDUCE]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
424   // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [2 x i8*], align
425   // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8
426   // CHECK: [[REMOTE_ELT2:%.+]] = alloca float
427   //
428   // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
429   // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
430   // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
431   //
432   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
433   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
434   // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
435   // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
436   //
437   // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32
438   // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
439   // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
440   // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
441   // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8
442   //
443   // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align
444   // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
445   //
446   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
447   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
448   // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
449   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
450   //
451   // CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
452   // CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
453   // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align
454   // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
455   // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
456   // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
457   //
458   // CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align
459   // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
460   // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
461   //
462   // Condition to reduce
463   // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
464   //
465   // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
466   // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
467   // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
468   //
469   // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
470   // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
471   // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
472   // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
473   // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
474   // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
475   //
476   // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
477   // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
478   // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
479   //
480   // CHECK: [[DO_REDUCE]]
481   // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8*
482   // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [2 x i8*]* [[REMOTE_RED_LIST]] to i8*
483   // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
484   // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
485   //
486   // CHECK: [[REDUCE_ELSE]]
487   // CHECK: br label {{%?}}[[REDUCE_CONT]]
488   //
489   // CHECK: [[REDUCE_CONT]]
490   // Now check if we should just copy over the remote reduction list
491   // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
492   // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
493   // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
494   // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
495   //
496   // CHECK: [[DO_COPY]]
497   // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
498   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
499   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
500   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
501   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
502   // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
503   //
504   // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
505   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
506   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
507   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
508   // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
509   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
510   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
511   // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
512   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
513   //
514   // CHECK: [[COPY_ELSE]]
515   // CHECK: br label {{%?}}[[COPY_CONT]]
516   //
517   // CHECK: [[COPY_CONT]]
518   // CHECK: void
519 
520   //
521   // Inter warp copy function
522   // CHECK: define internal void [[INTER_WARP_COPY]](i8* %0, i32 %1)
523   // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
524   // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
525   // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [2 x i8*]*
526   // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
527   // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
528   // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
529   //
530   // [[DO_COPY]]
531   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
532   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
533   //
534   // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
535   // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
536   // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
537   // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
538   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
539   //
540   // CHECK: [[COPY_ELSE]]
541   // CHECK: br label {{%?}}[[COPY_CONT]]
542   //
543   // Barrier after copy to shared memory storage medium.
544   // CHECK: [[COPY_CONT]]
545   // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
546   // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
547   //
548   // Read into warp 0.
549   // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
550   // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
551   //
552   // CHECK: [[DO_READ]]
553   // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
554   // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
555   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
556   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
557   // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
558   // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
559   // CHECK: br label {{%?}}[[READ_CONT:.+]]
560   //
561   // CHECK: [[READ_ELSE]]
562   // CHECK: br label {{%?}}[[READ_CONT]]
563   //
564   // CHECK: [[READ_CONT]]
565   // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
566   // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
567   // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
568   //
569   // [[DO_COPY]]
570   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
571   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
572   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
573   //
574   // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
575   // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
576   // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
577   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
578   //
579   // CHECK: [[COPY_ELSE]]
580   // CHECK: br label {{%?}}[[COPY_CONT]]
581   //
582   // Barrier after copy to shared memory storage medium.
583   // CHECK: [[COPY_CONT]]
584   // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
585   // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
586   //
587   // Read into warp 0.
588   // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
589   // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
590   //
591   // CHECK: [[DO_READ]]
592   // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
593   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
594   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
595   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
596   // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
597   // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
598   // CHECK: br label {{%?}}[[READ_CONT:.+]]
599   //
600   // CHECK: [[READ_ELSE]]
601   // CHECK: br label {{%?}}[[READ_CONT]]
602   //
603   // CHECK: [[READ_CONT]]
604   // CHECK: ret
605 
606   // CHECK: define internal void [[RED_LIST_TO_GLOBAL_COPY]](i8* %0, i32 %1, i8* %2)
607   // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
608   // CHECK: [[IDX_PTR:%.+]] = alloca i32,
609   // CHECK: [[RL_PTR:%.+]] = alloca i8*,
610   // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
611   // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
612   // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
613   // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
614   // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]*
615   // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
616   // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]*
617   // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
618   // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
619   // CHECK: [[RL_RED1:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
620   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
621   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
622   // CHECK: [[LOC_RED1:%.+]] = load i8, i8* [[RL_RED1]],
623   // CHECK: store i8 [[LOC_RED1]], i8* [[GLOBAL_RED1_IDX_PTR]],
624   // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
625   // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
626   // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to float*
627   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
628   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
629   // CHECK: [[LOC_RED1:%.+]] = load float, float* [[RL_RED1]],
630   // CHECK: store float [[LOC_RED1]], float* [[GLOBAL_RED1_IDX_PTR]],
631   // CHECK: ret void
632 
633   // CHECK: define internal void [[RED_LIST_TO_GLOBAL_RED]](i8* %0, i32 %1, i8* %2)
634   // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
635   // CHECK: [[IDX_PTR:%.+]] = alloca i32,
636   // CHECK: [[RL_PTR:%.+]] = alloca i8*,
637   // CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*],
638   // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
639   // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
640   // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
641   // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
642   // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]*
643   // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
644   // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
645   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
646   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
647   // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR]], i8** [[LOCAL_RL_RED1_PTR]]
648   // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
649   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
650   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
651   // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast float* [[GLOBAL_RED1_IDX_PTR]] to i8*
652   // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
653   // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8*
654   // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
655   // CHECK: call void [[REDUCTION_FUNC]](i8* [[LOCAL_RL_BC]], i8* [[RL_BC]])
656   // CHECK: ret void
657 
658   // CHECK: define internal void [[GLOBAL_TO_RED_LIST_COPY]](i8* %0, i32 %1, i8* %2)
659   // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
660   // CHECK: [[IDX_PTR:%.+]] = alloca i32,
661   // CHECK: [[RL_PTR:%.+]] = alloca i8*,
662   // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
663   // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
664   // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
665   // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
666   // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]*
667   // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
668   // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]*
669   // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
670   // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
671   // CHECK: [[RL_RED1:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
672   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
673   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
674   // CHECK: [[GLOBAL_RED1:%.+]] = load i8, i8* [[GLOBAL_RED1_IDX_PTR]],
675   // CHECK: store i8 [[GLOBAL_RED1]], i8* [[RL_RED1]],
676   // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
677   // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
678   // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to float*
679   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
680   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
681   // CHECK: [[GLOBAL_RED1:%.+]] = load float, float* [[GLOBAL_RED1_IDX_PTR]],
682   // CHECK: store float [[GLOBAL_RED1]], float* [[RL_RED1]],
683   // CHECK: ret void
684 
685   // CHECK: define internal void [[GLOBAL_TO_RED_LIST_RED]](i8* %0, i32 %1, i8* %2)
686   // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
687   // CHECK: [[IDX_PTR:%.+]] = alloca i32,
688   // CHECK: [[RL_PTR:%.+]] = alloca i8*,
689   // CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*],
690   // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
691   // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
692   // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
693   // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
694   // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]*
695   // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
696   // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
697   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
698   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
699   // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR]], i8** [[LOCAL_RL_RED1_PTR]]
700   // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
701   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
702   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
703   // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast float* [[GLOBAL_RED1_IDX_PTR]] to i8*
704   // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
705   // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8*
706   // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
707   // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]])
708   // CHECK: ret void
709 
710   // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l57}}(
711   //
712   // CHECK: call void @__kmpc_spmd_kernel_init(
713   // CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
714   // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
715 
716   // CHECK-NOT: call void @{{__kmpc_get_team_static_memory|__kmpc_data_sharing_push_stack}}
717   // CHECK: store i32 0,
718   // CHECK: store i32 0,
719   // CHECK: store i32 0, i32* [[A_ADDR:%.+]], align
720   // CHECK: store i16 -32768, i16* [[B_ADDR:%.+]], align
721   // CHECK: call void [[OUTLINED:@.+]](i32* {{.+}}, i32* {{.+}}, i32* [[A_ADDR]], i16* [[B_ADDR]])
722   // CHECK: [[GEP1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
723   // CHECK: [[BC:%.+]] = bitcast i32* [[A_ADDR]] to i8*
724   // CHECK: store i8* [[BC]], i8** [[GEP1]],
725   // CHECK: [[GEP2:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
726   // CHECK: [[BC:%.+]] = bitcast i16* [[B_ADDR]] to i8*
727   // CHECK: store i8* [[BC]], i8** [[GEP2]],
728   // CHECK: [[BC_RED_LIST:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8*
729   // CHECK: [[BUF:%.+]] = load i8*, i8** @
730   // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* [[BUF]], i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]])
731   // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
732   // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
733   //
734   // CHECK: [[IFLABEL]]
735   // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
736   // CHECK: [[AV:%.+]] = load i32, i32* [[A_ADDR]], align
737   // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
738   // CHECK: store i32 [[OR]], i32* [[A_IN]], align
739   // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
740   // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
741   // CHECK: [[BV16:%.+]] = load i16, i16* [[B_ADDR]], align
742   // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
743   // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
744   // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
745   //
746   // CHECK: [[DO_MAX]]
747   // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
748   // CHECK: br label {{%?}}[[MAX_CONT:.+]]
749   //
750   // CHECK: [[MAX_ELSE]]
751   // CHECK: [[MAX2:%.+]] = load i16, i16* [[B_ADDR]], align
752   // CHECK: br label {{%?}}[[MAX_CONT]]
753   //
754   // CHECK: [[MAX_CONT]]
755   // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
756   // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
757   // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[GTID]])
758   // CHECK: br label %[[EXIT]]
759   //
760   // CHECK: [[EXIT]]
761 
762   // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{.+}}, i16* nonnull align {{[0-9]+}} dereferenceable{{.+}})
763   //
764   // CHECK: store i32 0, i32* [[A:%.+]], align
765   // CHECK: store i16 -32768, i16* [[B:%.+]], align
766   // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align
767   // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1
768   // CHECK: store i32 [[OR]], i32* [[A]], align
769   // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
770   // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
771   // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]]
772   // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
773   //
774   // CHECK: [[DO_MAX]]
775   // CHECK: br label {{%?}}[[MAX_CONT:.+]]
776   //
777   // CHECK: [[MAX_ELSE]]
778   // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align
779   // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32
780   // CHECK: br label {{%?}}[[MAX_CONT]]
781   //
782   // CHECK: [[MAX_CONT]]
783   // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ]
784   // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16
785   // CHECK: store i16 [[TRUNC]], i16* [[B]], align
786   // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{.+}} 0, i[[SZ:.+]] 0
787   // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8*
788   // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
789   // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1
790   // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
791   // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
792   // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
793   // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* [[LOC]], i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[PAR_SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[PAR_WARP_COPY_FN:@.+]])
794   // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
795   // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
796   //
797   // CHECK: [[IFLABEL]]
798   // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
799   // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
800   // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
801   // CHECK: store i32 [[OR]], i32* [[A_IN]], align
802   // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
803   // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
804   // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
805   // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
806   // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
807   // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
808   //
809   // CHECK: [[DO_MAX]]
810   // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
811   // CHECK: br label {{%?}}[[MAX_CONT:.+]]
812   //
813   // CHECK: [[MAX_ELSE]]
814   // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align
815   // CHECK: br label {{%?}}[[MAX_CONT]]
816   //
817   // CHECK: [[MAX_CONT]]
818   // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
819   // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
820   // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
821   // CHECK: br label %[[EXIT]]
822   //
823   // CHECK: [[EXIT]]
824   // CHECK: ret void
825 
826   //
827   // Reduction function
828   // CHECK: define internal void [[PAR_REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
829   // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
830   // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
831   // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
832   //
833   // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
834   // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
835   // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
836   //
837   // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1
838   // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
839   // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
840   //
841   // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1
842   // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
843   // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
844   //
845   // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
846   // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
847   // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
848   // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
849   //
850   // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
851   // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
852   // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
853   // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
854   //
855   // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
856   // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
857   //
858   // CHECK: [[DO_MAX]]
859   // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
860   // CHECK: br label {{%?}}[[MAX_CONT:.+]]
861   //
862   // CHECK: [[MAX_ELSE]]
863   // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
864   // CHECK: br label {{%?}}[[MAX_CONT]]
865   //
866   // CHECK: [[MAX_CONT]]
867   // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
868   // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
869   // CHECK: ret void
870   //
871   // Shuffle and reduce function
872   // CHECK: define internal void [[PAR_SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
873   // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
874   // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
875   // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
876   //
877   // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
878   // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
879   // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
880   //
881   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
882   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
883   // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
884   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
885   // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
886   //
887   // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
888   // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
889   // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
890   //
891   // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
892   // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
893   // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
894   //
895   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
896   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
897   // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
898   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
899   // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
900   //
901   // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
902   // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
903   // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
904   // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
905   // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
906   //
907   // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
908   // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
909   // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
910   //
911   // Condition to reduce
912   // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
913   //
914   // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
915   // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
916   // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
917   //
918   // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
919   // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
920   // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
921   // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
922   // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
923   // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
924   //
925   // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
926   // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
927   // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
928   //
929   // CHECK: [[DO_REDUCE]]
930   // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
931   // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
932   // CHECK: call void [[PAR_REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
933   // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
934   //
935   // CHECK: [[REDUCE_ELSE]]
936   // CHECK: br label {{%?}}[[REDUCE_CONT]]
937   //
938   // CHECK: [[REDUCE_CONT]]
939   // Now check if we should just copy over the remote reduction list
940   // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
941   // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
942   // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
943   // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
944   //
945   // CHECK: [[DO_COPY]]
946   // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
947   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
948   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
949   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
950   // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
951   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
952   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
953   // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
954   //
955   // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
956   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
957   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
958   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
959   // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
960   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
961   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
962   // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
963   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
964   //
965   // CHECK: [[COPY_ELSE]]
966   // CHECK: br label {{%?}}[[COPY_CONT]]
967   //
968   // CHECK: [[COPY_CONT]]
969   // CHECK: void
970 
971   //
972   // Inter warp copy function
973   // CHECK: define internal void [[PAR_WARP_COPY_FN]](i8* %0, i32 %1)
974   // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
975   // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
976   // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
977   // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
978   // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
979   //
980   // [[DO_COPY]]
981   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
982   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
983   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
984   //
985   // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
986   // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
987   // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
988   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
989   //
990   // CHECK: [[COPY_ELSE]]
991   // CHECK: br label {{%?}}[[COPY_CONT]]
992   //
993   // Barrier after copy to shared memory storage medium.
994   // CHECK: [[COPY_CONT]]
995   // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
996   // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
997   //
998   // Read into warp 0.
999   // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
1000   // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
1001   //
1002   // CHECK: [[DO_READ]]
1003   // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
1004   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
1005   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1006   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
1007   // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
1008   // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
1009   // CHECK: br label {{%?}}[[READ_CONT:.+]]
1010   //
1011   // CHECK: [[READ_ELSE]]
1012   // CHECK: br label {{%?}}[[READ_CONT]]
1013   //
1014   // CHECK: [[READ_CONT]]
1015   // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
1016   // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
1017   // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
1018   //
1019   // [[DO_COPY]]
1020   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
1021   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1022   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
1023   //
1024   // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
1025   // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
1026   // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
1027   // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
1028   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
1029   //
1030   // CHECK: [[COPY_ELSE]]
1031   // CHECK: br label {{%?}}[[COPY_CONT]]
1032   //
1033   // Barrier after copy to shared memory storage medium.
1034   // CHECK: [[COPY_CONT]]
1035   // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
1036   // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
1037   //
1038   // Read into warp 0.
1039   // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
1040   // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
1041   //
1042   // CHECK: [[DO_READ]]
1043   // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
1044   // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
1045   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
1046   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1047   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
1048   // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
1049   // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
1050   // CHECK: br label {{%?}}[[READ_CONT:.+]]
1051   //
1052   // CHECK: [[READ_ELSE]]
1053   // CHECK: br label {{%?}}[[READ_CONT]]
1054   //
1055   // CHECK: [[READ_CONT]]
1056   // CHECK: ret
1057 
1058   //
1059   // Reduction function
1060   // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
1061   // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
1062   // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
1063   // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
1064   //
1065   // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
1066   // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
1067   // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
1068   //
1069   // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1
1070   // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
1071   // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
1072   //
1073   // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1
1074   // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
1075   // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
1076   //
1077   // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
1078   // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
1079   // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
1080   // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
1081   //
1082   // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
1083   // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
1084   // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
1085   // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
1086   //
1087   // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
1088   // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
1089   //
1090   // CHECK: [[DO_MAX]]
1091   // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
1092   // CHECK: br label {{%?}}[[MAX_CONT:.+]]
1093   //
1094   // CHECK: [[MAX_ELSE]]
1095   // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
1096   // CHECK: br label {{%?}}[[MAX_CONT]]
1097   //
1098   // CHECK: [[MAX_CONT]]
1099   // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
1100   // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
1101   // CHECK: ret void
1102 
1103   //
1104   // Shuffle and reduce function
1105   // CHECK: define internal void [[SHUFFLE_AND_REDUCE]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
1106   // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [2 x i8*], align
1107   // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
1108   // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
1109   //
1110   // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
1111   // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
1112   // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
1113   //
1114   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
1115   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1116   // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
1117   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
1118   // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
1119   //
1120   // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1121   // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
1122   // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
1123   //
1124   // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
1125   // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
1126   // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
1127   //
1128   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
1129   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1130   // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
1131   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
1132   // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
1133   //
1134   // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
1135   // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1136   // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
1137   // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
1138   // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
1139   //
1140   // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
1141   // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
1142   // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
1143   //
1144   // Condition to reduce
1145   // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
1146   //
1147   // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
1148   // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
1149   // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
1150   //
1151   // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
1152   // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
1153   // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
1154   // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
1155   // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
1156   // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
1157   //
1158   // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
1159   // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
1160   // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
1161   //
1162   // CHECK: [[DO_REDUCE]]
1163   // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8*
1164   // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [2 x i8*]* [[REMOTE_RED_LIST]] to i8*
1165   // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
1166   // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
1167   //
1168   // CHECK: [[REDUCE_ELSE]]
1169   // CHECK: br label {{%?}}[[REDUCE_CONT]]
1170   //
1171   // CHECK: [[REDUCE_CONT]]
1172   // Now check if we should just copy over the remote reduction list
1173   // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
1174   // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
1175   // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
1176   // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
1177   //
1178   // CHECK: [[DO_COPY]]
1179   // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
1180   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
1181   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
1182   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1183   // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
1184   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
1185   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
1186   // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
1187   //
1188   // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
1189   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
1190   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
1191   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1192   // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
1193   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
1194   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
1195   // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
1196   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
1197   //
1198   // CHECK: [[COPY_ELSE]]
1199   // CHECK: br label {{%?}}[[COPY_CONT]]
1200   //
1201   // CHECK: [[COPY_CONT]]
1202   // CHECK: void
1203 
1204   //
1205   // Inter warp copy function
1206   // CHECK: define internal void [[INTER_WARP_COPY]](i8* %0, i32 %1)
1207   // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
1208   // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
1209   // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
1210   // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
1211   // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
1212   // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
1213   //
1214   // [[DO_COPY]]
1215   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
1216   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1217   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
1218   //
1219   // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
1220   // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
1221   // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
1222   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
1223   //
1224   // CHECK: [[COPY_ELSE]]
1225   // CHECK: br label {{%?}}[[COPY_CONT]]
1226   //
1227   // Barrier after copy to shared memory storage medium.
1228   // CHECK: [[COPY_CONT]]
1229   // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
1230   // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
1231   //
1232   // Read into warp 0.
1233   // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
1234   // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
1235   //
1236   // CHECK: [[DO_READ]]
1237   // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
1238   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
1239   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1240   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
1241   // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
1242   // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
1243   // CHECK: br label {{%?}}[[READ_CONT:.+]]
1244   //
1245   // CHECK: [[READ_ELSE]]
1246   // CHECK: br label {{%?}}[[READ_CONT]]
1247   //
1248   // CHECK: [[READ_CONT]]
1249   // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
1250   // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
1251   // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
1252   //
1253   // [[DO_COPY]]
1254   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
1255   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1256   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
1257   //
1258   // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
1259   // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
1260   // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
1261   // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
1262   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
1263   //
1264   // CHECK: [[COPY_ELSE]]
1265   // CHECK: br label {{%?}}[[COPY_CONT]]
1266   //
1267   // Barrier after copy to shared memory storage medium.
1268   // CHECK: [[COPY_CONT]]
1269   // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
1270   // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
1271   //
1272   // Read into warp 0.
1273   // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
1274   // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
1275   //
1276   // CHECK: [[DO_READ]]
1277   // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
1278   // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
1279   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
1280   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
1281   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
1282   // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
1283   // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
1284   // CHECK: br label {{%?}}[[READ_CONT:.+]]
1285   //
1286   // CHECK: [[READ_ELSE]]
1287   // CHECK: br label {{%?}}[[READ_CONT]]
1288   //
1289   // CHECK: [[READ_CONT]]
1290   // CHECK: ret
1291 
1292   // CHECK: define internal void [[RED_LIST_TO_GLOBAL_COPY]](i8* %0, i32 %1, i8* %2)
1293   // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
1294   // CHECK: [[IDX_PTR:%.+]] = alloca i32,
1295   // CHECK: [[RL_PTR:%.+]] = alloca i8*,
1296   // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
1297   // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
1298   // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
1299   // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
1300   // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]*
1301   // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
1302   // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]*
1303   // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
1304   // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
1305   // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
1306   // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i32*
1307   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
1308   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
1309   // CHECK: [[LOC_RED1:%.+]] = load i32, i32* [[RL_RED1]],
1310   // CHECK: store i32 [[LOC_RED1]], i32* [[GLOBAL_RED1_IDX_PTR]],
1311   // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
1312   // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
1313   // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i16*
1314   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
1315   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
1316   // CHECK: [[LOC_RED1:%.+]] = load i16, i16* [[RL_RED1]],
1317   // CHECK: store i16 [[LOC_RED1]], i16* [[GLOBAL_RED1_IDX_PTR]],
1318   // CHECK: ret void
1319 
1320   // CHECK: define internal void [[RED_LIST_TO_GLOBAL_RED]](i8* %0, i32 %1, i8* %2)
1321   // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
1322   // CHECK: [[IDX_PTR:%.+]] = alloca i32,
1323   // CHECK: [[RL_PTR:%.+]] = alloca i8*,
1324   // CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*],
1325   // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
1326   // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
1327   // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
1328   // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
1329   // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]*
1330   // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
1331   // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
1332   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
1333   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
1334   // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i32* [[GLOBAL_RED1_IDX_PTR]] to i8*
1335   // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
1336   // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
1337   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
1338   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
1339   // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i16* [[GLOBAL_RED1_IDX_PTR]] to i8*
1340   // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
1341   // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8*
1342   // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
1343   // CHECK: call void [[REDUCTION_FUNC]](i8* [[LOCAL_RL_BC]], i8* [[RL_BC]])
1344   // CHECK: ret void
1345 
1346   // CHECK: define internal void [[GLOBAL_TO_RED_LIST_COPY]](i8* %0, i32 %1, i8* %2)
1347   // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
1348   // CHECK: [[IDX_PTR:%.+]] = alloca i32,
1349   // CHECK: [[RL_PTR:%.+]] = alloca i8*,
1350   // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
1351   // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
1352   // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
1353   // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
1354   // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]*
1355   // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
1356   // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]*
1357   // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
1358   // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
1359   // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
1360   // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i32*
1361   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
1362   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
1363   // CHECK: [[GLOBAL_RED1:%.+]] = load i32, i32* [[GLOBAL_RED1_IDX_PTR]],
1364   // CHECK: store i32 [[GLOBAL_RED1]], i32* [[RL_RED1]],
1365   // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
1366   // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
1367   // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i16*
1368   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
1369   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
1370   // CHECK: [[GLOBAL_RED1:%.+]] = load i16, i16* [[GLOBAL_RED1_IDX_PTR]],
1371   // CHECK: store i16 [[GLOBAL_RED1]], i16* [[RL_RED1]],
1372   // CHECK: ret void
1373 
1374   // CHECK: define internal void [[GLOBAL_TO_RED_LIST_RED]](i8* %0, i32 %1, i8* %2)
1375   // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
1376   // CHECK: [[IDX_PTR:%.+]] = alloca i32,
1377   // CHECK: [[RL_PTR:%.+]] = alloca i8*,
1378   // CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*],
1379   // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
1380   // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
1381   // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
1382   // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
1383   // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]*
1384   // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
1385   // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
1386   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
1387   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
1388   // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i32* [[GLOBAL_RED1_IDX_PTR]] to i8*
1389   // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
1390   // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
1391   // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
1392   // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
1393   // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i16* [[GLOBAL_RED1_IDX_PTR]] to i8*
1394   // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
1395   // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8*
1396   // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
1397   // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]])
1398   // CHECK: ret void
1399 
1400 #endif
1401