1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
2 // Test target codegen - host bc file has to be created first.
3 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
4 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK1
5 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
6 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK2
7 // expected-no-diagnostics
8 #ifndef HEADER
9 #define HEADER
10 
11 #ifdef CK1
12 
template <typename T>
int tmain(T argc) { // CK1: teams region with no clauses, inside a template.
#pragma omp target
#pragma omp teams
  argc = 0;
  return 0;
}
20 
21 
main(int argc,char ** argv)22 int main (int argc, char **argv) {
23 #pragma omp target
24 #pragma omp teams
25   {
26   argc = 0;
27   }
28   return tmain(argv);
29 }
30 
31 
32 // only nvptx side: do not outline teams region and do not call fork_teams
33 
34 
35 // target region in template
36 
37 
38 
39 #endif // CK1
40 
41 // Test target codegen - host bc file has to be created first.
42 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
43 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK3
44 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
45 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK4
46 // expected-no-diagnostics
47 #ifdef CK2
48 
template <typename T>
int tmain(T argc) { // CK2: teams with num_teams/thread_limit, in a template.
  int a = 10;
  int b = 5;
#pragma omp target
#pragma omp teams num_teams(a) thread_limit(b)
  {
  argc = 0;
  }
  return 0;
}
60 
main(int argc,char ** argv)61 int main (int argc, char **argv) {
62   int a = 20;
63   int b = 5;
64 #pragma omp target
65 #pragma omp teams num_teams(a) thread_limit(b)
66   {
67   argc = 0;
68   }
69   return tmain(argv);
70 }
71 
72 
73 
74 
75 
76 
77 #endif // CK2
78 #endif
79 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker
80 // CHECK5-SAME: () #[[ATTR0:[0-9]+]] {
81 // CHECK5-NEXT:  entry:
82 // CHECK5-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
83 // CHECK5-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
84 // CHECK5-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
85 // CHECK5-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
86 // CHECK5-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
87 // CHECK5:       .await.work:
88 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
89 // CHECK5-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
90 // CHECK5-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
91 // CHECK5-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
92 // CHECK5-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
93 // CHECK5-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
94 // CHECK5-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
95 // CHECK5:       .select.workers:
96 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
97 // CHECK5-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
98 // CHECK5-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
99 // CHECK5:       .execute.parallel:
100 // CHECK5-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
101 // CHECK5-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
102 // CHECK5-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
103 // CHECK5-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
104 // CHECK5:       .terminate.parallel:
105 // CHECK5-NEXT:    call void @__kmpc_kernel_end_parallel()
106 // CHECK5-NEXT:    br label [[DOTBARRIER_PARALLEL]]
107 // CHECK5:       .barrier.parallel:
108 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
109 // CHECK5-NEXT:    br label [[DOTAWAIT_WORK]]
110 // CHECK5:       .exit:
111 // CHECK5-NEXT:    ret void
112 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68
113 // CHECK5-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i64 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] {
114 // CHECK5-NEXT:  entry:
115 // CHECK5-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
116 // CHECK5-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
117 // CHECK5-NEXT:    [[ARGC_ADDR:%.*]] = alloca i64, align 8
118 // CHECK5-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
119 // CHECK5-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
120 // CHECK5-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
121 // CHECK5-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
122 // CHECK5-NEXT:    store i64 [[B]], i64* [[B_ADDR]], align 8
123 // CHECK5-NEXT:    store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8
124 // CHECK5-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
125 // CHECK5-NEXT:    [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32*
126 // CHECK5-NEXT:    [[CONV2:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32*
127 // CHECK5-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
128 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
129 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
130 // CHECK5-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
131 // CHECK5-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
132 // CHECK5-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
133 // CHECK5:       .worker:
134 // CHECK5-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker() #[[ATTR3:[0-9]+]]
135 // CHECK5-NEXT:    br label [[DOTEXIT:%.*]]
136 // CHECK5:       .mastercheck:
137 // CHECK5-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
138 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
139 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
140 // CHECK5-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
141 // CHECK5-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
142 // CHECK5-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
143 // CHECK5-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
144 // CHECK5-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
145 // CHECK5-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
146 // CHECK5:       .master:
147 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
148 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
149 // CHECK5-NEXT:    [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
150 // CHECK5-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
151 // CHECK5-NEXT:    call void @__kmpc_data_sharing_init_stack()
152 // CHECK5-NEXT:    [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
153 // CHECK5-NEXT:    [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8
154 // CHECK5-NEXT:    call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
155 // CHECK5-NEXT:    [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8
156 // CHECK5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0
157 // CHECK5-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty*
158 // CHECK5-NEXT:    [[TMP10:%.*]] = load i32, i32* [[CONV2]], align 8
159 // CHECK5-NEXT:    [[ARGC9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0
160 // CHECK5-NEXT:    store i32 [[TMP10]], i32* [[ARGC9]], align 4
161 // CHECK5-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
162 // CHECK5-NEXT:    store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4
163 // CHECK5-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC9]]) #[[ATTR3]]
164 // CHECK5-NEXT:    [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
165 // CHECK5-NEXT:    call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]])
166 // CHECK5-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
167 // CHECK5:       .termination.notifier:
168 // CHECK5-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
169 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
170 // CHECK5-NEXT:    br label [[DOTEXIT]]
171 // CHECK5:       .exit:
172 // CHECK5-NEXT:    ret void
173 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__
174 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
175 // CHECK5-NEXT:  entry:
176 // CHECK5-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
177 // CHECK5-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
178 // CHECK5-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32*, align 8
179 // CHECK5-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
180 // CHECK5-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
181 // CHECK5-NEXT:    store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8
182 // CHECK5-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8
183 // CHECK5-NEXT:    store i32 0, i32* [[TMP0]], align 4
184 // CHECK5-NEXT:    ret void
185 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker
186 // CHECK5-SAME: () #[[ATTR0]] {
187 // CHECK5-NEXT:  entry:
188 // CHECK5-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
189 // CHECK5-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
190 // CHECK5-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
191 // CHECK5-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
192 // CHECK5-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
193 // CHECK5:       .await.work:
194 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
195 // CHECK5-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
196 // CHECK5-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
197 // CHECK5-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
198 // CHECK5-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
199 // CHECK5-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
200 // CHECK5-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
201 // CHECK5:       .select.workers:
202 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
203 // CHECK5-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
204 // CHECK5-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
205 // CHECK5:       .execute.parallel:
206 // CHECK5-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
207 // CHECK5-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
208 // CHECK5-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
209 // CHECK5-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
210 // CHECK5:       .terminate.parallel:
211 // CHECK5-NEXT:    call void @__kmpc_kernel_end_parallel()
212 // CHECK5-NEXT:    br label [[DOTBARRIER_PARALLEL]]
213 // CHECK5:       .barrier.parallel:
214 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
215 // CHECK5-NEXT:    br label [[DOTAWAIT_WORK]]
216 // CHECK5:       .exit:
217 // CHECK5-NEXT:    ret void
218 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57
219 // CHECK5-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] {
220 // CHECK5-NEXT:  entry:
221 // CHECK5-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
222 // CHECK5-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
223 // CHECK5-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8**, align 8
224 // CHECK5-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
225 // CHECK5-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
226 // CHECK5-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
227 // CHECK5-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
228 // CHECK5-NEXT:    store i64 [[B]], i64* [[B_ADDR]], align 8
229 // CHECK5-NEXT:    store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8
230 // CHECK5-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
231 // CHECK5-NEXT:    [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32*
232 // CHECK5-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
233 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
234 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
235 // CHECK5-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
236 // CHECK5-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
237 // CHECK5-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
238 // CHECK5:       .worker:
239 // CHECK5-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker() #[[ATTR3]]
240 // CHECK5-NEXT:    br label [[DOTEXIT:%.*]]
241 // CHECK5:       .mastercheck:
242 // CHECK5-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
243 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
244 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
245 // CHECK5-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
246 // CHECK5-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
247 // CHECK5-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
248 // CHECK5-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
249 // CHECK5-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
250 // CHECK5-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
251 // CHECK5:       .master:
252 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
253 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
254 // CHECK5-NEXT:    [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
255 // CHECK5-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
256 // CHECK5-NEXT:    call void @__kmpc_data_sharing_init_stack()
257 // CHECK5-NEXT:    [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
258 // CHECK5-NEXT:    [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size2", align 8
259 // CHECK5-NEXT:    call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
260 // CHECK5-NEXT:    [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8
261 // CHECK5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0
262 // CHECK5-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.0*
263 // CHECK5-NEXT:    [[TMP10:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8
264 // CHECK5-NEXT:    [[ARGC8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP9]], i32 0, i32 0
265 // CHECK5-NEXT:    store i8** [[TMP10]], i8*** [[ARGC8]], align 8
266 // CHECK5-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
267 // CHECK5-NEXT:    store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4
268 // CHECK5-NEXT:    call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC8]]) #[[ATTR3]]
269 // CHECK5-NEXT:    [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
270 // CHECK5-NEXT:    call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]])
271 // CHECK5-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
272 // CHECK5:       .termination.notifier:
273 // CHECK5-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
274 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
275 // CHECK5-NEXT:    br label [[DOTEXIT]]
276 // CHECK5:       .exit:
277 // CHECK5-NEXT:    ret void
278 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3
279 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 dereferenceable(8) [[ARGC:%.*]]) #[[ATTR1]] {
280 // CHECK5-NEXT:  entry:
281 // CHECK5-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
282 // CHECK5-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
283 // CHECK5-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8***, align 8
284 // CHECK5-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
285 // CHECK5-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
286 // CHECK5-NEXT:    store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 8
287 // CHECK5-NEXT:    [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8
288 // CHECK5-NEXT:    store i8** null, i8*** [[TMP0]], align 8
289 // CHECK5-NEXT:    ret void
290 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker
291 // CHECK6-SAME: () #[[ATTR0:[0-9]+]] {
292 // CHECK6-NEXT:  entry:
293 // CHECK6-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
294 // CHECK6-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
295 // CHECK6-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
296 // CHECK6-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
297 // CHECK6-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
298 // CHECK6:       .await.work:
299 // CHECK6-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
300 // CHECK6-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
301 // CHECK6-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
302 // CHECK6-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
303 // CHECK6-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
304 // CHECK6-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
305 // CHECK6-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
306 // CHECK6:       .select.workers:
307 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
308 // CHECK6-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
309 // CHECK6-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
310 // CHECK6:       .execute.parallel:
311 // CHECK6-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
312 // CHECK6-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
313 // CHECK6-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
314 // CHECK6-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
315 // CHECK6:       .terminate.parallel:
316 // CHECK6-NEXT:    call void @__kmpc_kernel_end_parallel()
317 // CHECK6-NEXT:    br label [[DOTBARRIER_PARALLEL]]
318 // CHECK6:       .barrier.parallel:
319 // CHECK6-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
320 // CHECK6-NEXT:    br label [[DOTAWAIT_WORK]]
321 // CHECK6:       .exit:
322 // CHECK6-NEXT:    ret void
323 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68
324 // CHECK6-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i64 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] {
325 // CHECK6-NEXT:  entry:
326 // CHECK6-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
327 // CHECK6-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
328 // CHECK6-NEXT:    [[ARGC_ADDR:%.*]] = alloca i64, align 8
329 // CHECK6-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
330 // CHECK6-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
331 // CHECK6-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
332 // CHECK6-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
333 // CHECK6-NEXT:    store i64 [[B]], i64* [[B_ADDR]], align 8
334 // CHECK6-NEXT:    store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8
335 // CHECK6-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
336 // CHECK6-NEXT:    [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32*
337 // CHECK6-NEXT:    [[CONV2:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32*
338 // CHECK6-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
339 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
340 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
341 // CHECK6-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
342 // CHECK6-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
343 // CHECK6-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
344 // CHECK6:       .worker:
345 // CHECK6-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker() #[[ATTR3:[0-9]+]]
346 // CHECK6-NEXT:    br label [[DOTEXIT:%.*]]
347 // CHECK6:       .mastercheck:
348 // CHECK6-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
349 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
350 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
351 // CHECK6-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
352 // CHECK6-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
353 // CHECK6-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
354 // CHECK6-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
355 // CHECK6-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
356 // CHECK6-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
357 // CHECK6:       .master:
358 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
359 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
360 // CHECK6-NEXT:    [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
361 // CHECK6-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
362 // CHECK6-NEXT:    call void @__kmpc_data_sharing_init_stack()
363 // CHECK6-NEXT:    [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1)
364 // CHECK6-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
365 // CHECK6-NEXT:    [[TMP7:%.*]] = load i32, i32* [[CONV2]], align 8
366 // CHECK6-NEXT:    [[ARGC9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
367 // CHECK6-NEXT:    store i32 [[TMP7]], i32* [[ARGC9]], align 4
368 // CHECK6-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
369 // CHECK6-NEXT:    store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4
370 // CHECK6-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC9]]) #[[ATTR3]]
371 // CHECK6-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
372 // CHECK6-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
373 // CHECK6:       .termination.notifier:
374 // CHECK6-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
375 // CHECK6-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
376 // CHECK6-NEXT:    br label [[DOTEXIT]]
377 // CHECK6:       .exit:
378 // CHECK6-NEXT:    ret void
379 // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__
380 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
381 // CHECK6-NEXT:  entry:
382 // CHECK6-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
383 // CHECK6-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
384 // CHECK6-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32*, align 8
385 // CHECK6-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
386 // CHECK6-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
387 // CHECK6-NEXT:    store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8
388 // CHECK6-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8
389 // CHECK6-NEXT:    store i32 0, i32* [[TMP0]], align 4
390 // CHECK6-NEXT:    ret void
391 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker
392 // CHECK6-SAME: () #[[ATTR0]] {
393 // CHECK6-NEXT:  entry:
394 // CHECK6-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
395 // CHECK6-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
396 // CHECK6-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
397 // CHECK6-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
398 // CHECK6-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
399 // CHECK6:       .await.work:
400 // CHECK6-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
401 // CHECK6-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
402 // CHECK6-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
403 // CHECK6-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
404 // CHECK6-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
405 // CHECK6-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
406 // CHECK6-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
407 // CHECK6:       .select.workers:
408 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
409 // CHECK6-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
410 // CHECK6-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
411 // CHECK6:       .execute.parallel:
412 // CHECK6-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
413 // CHECK6-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
414 // CHECK6-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
415 // CHECK6-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
416 // CHECK6:       .terminate.parallel:
417 // CHECK6-NEXT:    call void @__kmpc_kernel_end_parallel()
418 // CHECK6-NEXT:    br label [[DOTBARRIER_PARALLEL]]
419 // CHECK6:       .barrier.parallel:
420 // CHECK6-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
421 // CHECK6-NEXT:    br label [[DOTAWAIT_WORK]]
422 // CHECK6:       .exit:
423 // CHECK6-NEXT:    ret void
424 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57
425 // CHECK6-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] {
426 // CHECK6-NEXT:  entry:
427 // CHECK6-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
428 // CHECK6-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
429 // CHECK6-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8**, align 8
430 // CHECK6-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
431 // CHECK6-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
432 // CHECK6-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
433 // CHECK6-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
434 // CHECK6-NEXT:    store i64 [[B]], i64* [[B_ADDR]], align 8
435 // CHECK6-NEXT:    store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8
436 // CHECK6-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
437 // CHECK6-NEXT:    [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32*
438 // CHECK6-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
439 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
440 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
441 // CHECK6-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
442 // CHECK6-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
443 // CHECK6-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
444 // CHECK6:       .worker:
445 // CHECK6-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker() #[[ATTR3]]
446 // CHECK6-NEXT:    br label [[DOTEXIT:%.*]]
447 // CHECK6:       .mastercheck:
448 // CHECK6-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
449 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
450 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
451 // CHECK6-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
452 // CHECK6-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
453 // CHECK6-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
454 // CHECK6-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
455 // CHECK6-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
456 // CHECK6-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
457 // CHECK6:       .master:
458 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
459 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
460 // CHECK6-NEXT:    [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
461 // CHECK6-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
462 // CHECK6-NEXT:    call void @__kmpc_data_sharing_init_stack()
463 // CHECK6-NEXT:    [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1)
464 // CHECK6-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
465 // CHECK6-NEXT:    [[TMP7:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8
466 // CHECK6-NEXT:    [[ARGC8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
467 // CHECK6-NEXT:    store i8** [[TMP7]], i8*** [[ARGC8]], align 8
468 // CHECK6-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
469 // CHECK6-NEXT:    store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4
470 // CHECK6-NEXT:    call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC8]]) #[[ATTR3]]
471 // CHECK6-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
472 // CHECK6-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
473 // CHECK6:       .termination.notifier:
474 // CHECK6-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
475 // CHECK6-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
476 // CHECK6-NEXT:    br label [[DOTEXIT]]
477 // CHECK6:       .exit:
478 // CHECK6-NEXT:    ret void
479 // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1
480 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 dereferenceable(8) [[ARGC:%.*]]) #[[ATTR1]] {
481 // CHECK6-NEXT:  entry:
482 // CHECK6-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
483 // CHECK6-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
484 // CHECK6-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8***, align 8
485 // CHECK6-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
486 // CHECK6-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
487 // CHECK6-NEXT:    store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 8
488 // CHECK6-NEXT:    [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8
489 // CHECK6-NEXT:    store i8** null, i8*** [[TMP0]], align 8
490 // CHECK6-NEXT:    ret void
491 // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker
492 // CHECK7-SAME: () #[[ATTR0:[0-9]+]] {
493 // CHECK7-NEXT:  entry:
494 // CHECK7-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
495 // CHECK7-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
496 // CHECK7-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
497 // CHECK7-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
498 // CHECK7-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
499 // CHECK7:       .await.work:
500 // CHECK7-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
501 // CHECK7-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
502 // CHECK7-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
503 // CHECK7-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
504 // CHECK7-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
505 // CHECK7-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
506 // CHECK7-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
507 // CHECK7:       .select.workers:
508 // CHECK7-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
509 // CHECK7-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
510 // CHECK7-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
511 // CHECK7:       .execute.parallel:
512 // CHECK7-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
513 // CHECK7-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
514 // CHECK7-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
515 // CHECK7-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
516 // CHECK7:       .terminate.parallel:
517 // CHECK7-NEXT:    call void @__kmpc_kernel_end_parallel()
518 // CHECK7-NEXT:    br label [[DOTBARRIER_PARALLEL]]
519 // CHECK7:       .barrier.parallel:
520 // CHECK7-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
521 // CHECK7-NEXT:    br label [[DOTAWAIT_WORK]]
522 // CHECK7:       .exit:
523 // CHECK7-NEXT:    ret void
524 // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68
525 // CHECK7-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] {
526 // CHECK7-NEXT:  entry:
527 // CHECK7-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
528 // CHECK7-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
529 // CHECK7-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
530 // CHECK7-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
531 // CHECK7-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
532 // CHECK7-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
533 // CHECK7-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
534 // CHECK7-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
535 // CHECK7-NEXT:    store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
536 // CHECK7-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
537 // CHECK7-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
538 // CHECK7-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
539 // CHECK7-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
540 // CHECK7-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
541 // CHECK7-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
542 // CHECK7:       .worker:
543 // CHECK7-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker() #[[ATTR3:[0-9]+]]
544 // CHECK7-NEXT:    br label [[DOTEXIT:%.*]]
545 // CHECK7:       .mastercheck:
546 // CHECK7-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
547 // CHECK7-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
548 // CHECK7-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
549 // CHECK7-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
550 // CHECK7-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
551 // CHECK7-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
552 // CHECK7-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
553 // CHECK7-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
554 // CHECK7-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
555 // CHECK7:       .master:
556 // CHECK7-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
557 // CHECK7-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
558 // CHECK7-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
559 // CHECK7-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
560 // CHECK7-NEXT:    call void @__kmpc_data_sharing_init_stack()
561 // CHECK7-NEXT:    [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
562 // CHECK7-NEXT:    [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
563 // CHECK7-NEXT:    call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
564 // CHECK7-NEXT:    [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
565 // CHECK7-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0
566 // CHECK7-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty*
567 // CHECK7-NEXT:    [[TMP10:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
568 // CHECK7-NEXT:    [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0
569 // CHECK7-NEXT:    store i32 [[TMP10]], i32* [[ARGC7]], align 4
570 // CHECK7-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
571 // CHECK7-NEXT:    store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4
572 // CHECK7-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]]
573 // CHECK7-NEXT:    [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
574 // CHECK7-NEXT:    call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]])
575 // CHECK7-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
576 // CHECK7:       .termination.notifier:
577 // CHECK7-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
578 // CHECK7-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
579 // CHECK7-NEXT:    br label [[DOTEXIT]]
580 // CHECK7:       .exit:
581 // CHECK7-NEXT:    ret void
582 // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__
583 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
584 // CHECK7-NEXT:  entry:
585 // CHECK7-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
586 // CHECK7-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
587 // CHECK7-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32*, align 4
588 // CHECK7-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
589 // CHECK7-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
590 // CHECK7-NEXT:    store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4
591 // CHECK7-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4
592 // CHECK7-NEXT:    store i32 0, i32* [[TMP0]], align 4
593 // CHECK7-NEXT:    ret void
594 // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker
595 // CHECK7-SAME: () #[[ATTR0]] {
596 // CHECK7-NEXT:  entry:
597 // CHECK7-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
598 // CHECK7-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
599 // CHECK7-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
600 // CHECK7-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
601 // CHECK7-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
602 // CHECK7:       .await.work:
603 // CHECK7-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
604 // CHECK7-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
605 // CHECK7-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
606 // CHECK7-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
607 // CHECK7-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
608 // CHECK7-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
609 // CHECK7-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
610 // CHECK7:       .select.workers:
611 // CHECK7-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
612 // CHECK7-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
613 // CHECK7-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
614 // CHECK7:       .execute.parallel:
615 // CHECK7-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
616 // CHECK7-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
617 // CHECK7-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
618 // CHECK7-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
619 // CHECK7:       .terminate.parallel:
620 // CHECK7-NEXT:    call void @__kmpc_kernel_end_parallel()
621 // CHECK7-NEXT:    br label [[DOTBARRIER_PARALLEL]]
622 // CHECK7:       .barrier.parallel:
623 // CHECK7-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
624 // CHECK7-NEXT:    br label [[DOTAWAIT_WORK]]
625 // CHECK7:       .exit:
626 // CHECK7-NEXT:    ret void
627 // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57
628 // CHECK7-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] {
629 // CHECK7-NEXT:  entry:
630 // CHECK7-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
631 // CHECK7-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
632 // CHECK7-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8**, align 4
633 // CHECK7-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
634 // CHECK7-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
635 // CHECK7-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
636 // CHECK7-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
637 // CHECK7-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
638 // CHECK7-NEXT:    store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4
639 // CHECK7-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
640 // CHECK7-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
641 // CHECK7-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
642 // CHECK7-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
643 // CHECK7-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
644 // CHECK7-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
645 // CHECK7:       .worker:
646 // CHECK7-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker() #[[ATTR3]]
647 // CHECK7-NEXT:    br label [[DOTEXIT:%.*]]
648 // CHECK7:       .mastercheck:
649 // CHECK7-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
650 // CHECK7-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
651 // CHECK7-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
652 // CHECK7-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
653 // CHECK7-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
654 // CHECK7-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
655 // CHECK7-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
656 // CHECK7-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
657 // CHECK7-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
658 // CHECK7:       .master:
659 // CHECK7-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
660 // CHECK7-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
661 // CHECK7-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
662 // CHECK7-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
663 // CHECK7-NEXT:    call void @__kmpc_data_sharing_init_stack()
664 // CHECK7-NEXT:    [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
665 // CHECK7-NEXT:    [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size2", align 4
666 // CHECK7-NEXT:    call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
667 // CHECK7-NEXT:    [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
668 // CHECK7-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0
669 // CHECK7-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.0*
670 // CHECK7-NEXT:    [[TMP10:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4
671 // CHECK7-NEXT:    [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP9]], i32 0, i32 0
672 // CHECK7-NEXT:    store i8** [[TMP10]], i8*** [[ARGC7]], align 4
673 // CHECK7-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
674 // CHECK7-NEXT:    store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4
675 // CHECK7-NEXT:    call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]]
676 // CHECK7-NEXT:    [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
677 // CHECK7-NEXT:    call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]])
678 // CHECK7-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
679 // CHECK7:       .termination.notifier:
680 // CHECK7-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
681 // CHECK7-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
682 // CHECK7-NEXT:    br label [[DOTEXIT]]
683 // CHECK7:       .exit:
684 // CHECK7-NEXT:    ret void
685 // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__3
686 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
687 // CHECK7-NEXT:  entry:
688 // CHECK7-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
689 // CHECK7-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
690 // CHECK7-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8***, align 4
691 // CHECK7-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
692 // CHECK7-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
693 // CHECK7-NEXT:    store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4
694 // CHECK7-NEXT:    [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4
695 // CHECK7-NEXT:    store i8** null, i8*** [[TMP0]], align 4
696 // CHECK7-NEXT:    ret void
697 // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker
698 // CHECK8-SAME: () #[[ATTR0:[0-9]+]] {
699 // CHECK8-NEXT:  entry:
700 // CHECK8-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
701 // CHECK8-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
702 // CHECK8-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
703 // CHECK8-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
704 // CHECK8-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
705 // CHECK8:       .await.work:
706 // CHECK8-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
707 // CHECK8-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
708 // CHECK8-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
709 // CHECK8-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
710 // CHECK8-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
711 // CHECK8-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
712 // CHECK8-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
713 // CHECK8:       .select.workers:
714 // CHECK8-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
715 // CHECK8-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
716 // CHECK8-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
717 // CHECK8:       .execute.parallel:
718 // CHECK8-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
719 // CHECK8-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
720 // CHECK8-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
721 // CHECK8-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
722 // CHECK8:       .terminate.parallel:
723 // CHECK8-NEXT:    call void @__kmpc_kernel_end_parallel()
724 // CHECK8-NEXT:    br label [[DOTBARRIER_PARALLEL]]
725 // CHECK8:       .barrier.parallel:
726 // CHECK8-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
727 // CHECK8-NEXT:    br label [[DOTAWAIT_WORK]]
728 // CHECK8:       .exit:
729 // CHECK8-NEXT:    ret void
730 // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68
731 // CHECK8-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] {
732 // CHECK8-NEXT:  entry:
733 // CHECK8-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
734 // CHECK8-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
735 // CHECK8-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
736 // CHECK8-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
737 // CHECK8-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
738 // CHECK8-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
739 // CHECK8-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
740 // CHECK8-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
741 // CHECK8-NEXT:    store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
742 // CHECK8-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
743 // CHECK8-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
744 // CHECK8-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
745 // CHECK8-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
746 // CHECK8-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
747 // CHECK8-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
748 // CHECK8:       .worker:
749 // CHECK8-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker() #[[ATTR3:[0-9]+]]
750 // CHECK8-NEXT:    br label [[DOTEXIT:%.*]]
751 // CHECK8:       .mastercheck:
752 // CHECK8-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
753 // CHECK8-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
754 // CHECK8-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
755 // CHECK8-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
756 // CHECK8-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
757 // CHECK8-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
758 // CHECK8-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
759 // CHECK8-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
760 // CHECK8-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
761 // CHECK8:       .master:
762 // CHECK8-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
763 // CHECK8-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
764 // CHECK8-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
765 // CHECK8-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
766 // CHECK8-NEXT:    call void @__kmpc_data_sharing_init_stack()
767 // CHECK8-NEXT:    [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
768 // CHECK8-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
769 // CHECK8-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
770 // CHECK8-NEXT:    [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
771 // CHECK8-NEXT:    store i32 [[TMP7]], i32* [[ARGC7]], align 4
772 // CHECK8-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
773 // CHECK8-NEXT:    store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4
774 // CHECK8-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]]
775 // CHECK8-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
776 // CHECK8-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
777 // CHECK8:       .termination.notifier:
778 // CHECK8-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
779 // CHECK8-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
780 // CHECK8-NEXT:    br label [[DOTEXIT]]
781 // CHECK8:       .exit:
782 // CHECK8-NEXT:    ret void
783 // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__
784 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
785 // CHECK8-NEXT:  entry:
786 // CHECK8-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
787 // CHECK8-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
788 // CHECK8-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32*, align 4
789 // CHECK8-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
790 // CHECK8-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
791 // CHECK8-NEXT:    store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4
792 // CHECK8-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4
793 // CHECK8-NEXT:    store i32 0, i32* [[TMP0]], align 4
794 // CHECK8-NEXT:    ret void
795 // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker
796 // CHECK8-SAME: () #[[ATTR0]] {
797 // CHECK8-NEXT:  entry:
798 // CHECK8-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
799 // CHECK8-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
800 // CHECK8-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
801 // CHECK8-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
802 // CHECK8-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
803 // CHECK8:       .await.work:
804 // CHECK8-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
805 // CHECK8-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
806 // CHECK8-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
807 // CHECK8-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
808 // CHECK8-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
809 // CHECK8-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
810 // CHECK8-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
811 // CHECK8:       .select.workers:
812 // CHECK8-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
813 // CHECK8-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
814 // CHECK8-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
815 // CHECK8:       .execute.parallel:
816 // CHECK8-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
817 // CHECK8-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
818 // CHECK8-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
819 // CHECK8-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
820 // CHECK8:       .terminate.parallel:
821 // CHECK8-NEXT:    call void @__kmpc_kernel_end_parallel()
822 // CHECK8-NEXT:    br label [[DOTBARRIER_PARALLEL]]
823 // CHECK8:       .barrier.parallel:
824 // CHECK8-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
825 // CHECK8-NEXT:    br label [[DOTAWAIT_WORK]]
826 // CHECK8:       .exit:
827 // CHECK8-NEXT:    ret void
828 // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57
829 // CHECK8-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] {
830 // CHECK8-NEXT:  entry:
831 // CHECK8-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
832 // CHECK8-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
833 // CHECK8-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8**, align 4
834 // CHECK8-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
835 // CHECK8-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
836 // CHECK8-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
837 // CHECK8-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
838 // CHECK8-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
839 // CHECK8-NEXT:    store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4
840 // CHECK8-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
841 // CHECK8-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
842 // CHECK8-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
843 // CHECK8-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
844 // CHECK8-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
845 // CHECK8-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
846 // CHECK8:       .worker:
847 // CHECK8-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker() #[[ATTR3]]
848 // CHECK8-NEXT:    br label [[DOTEXIT:%.*]]
849 // CHECK8:       .mastercheck:
850 // CHECK8-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
851 // CHECK8-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
852 // CHECK8-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
853 // CHECK8-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
854 // CHECK8-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
855 // CHECK8-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
856 // CHECK8-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
857 // CHECK8-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
858 // CHECK8-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
859 // CHECK8:       .master:
860 // CHECK8-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
861 // CHECK8-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
862 // CHECK8-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
863 // CHECK8-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
864 // CHECK8-NEXT:    call void @__kmpc_data_sharing_init_stack()
865 // CHECK8-NEXT:    [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
866 // CHECK8-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
867 // CHECK8-NEXT:    [[TMP7:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4
868 // CHECK8-NEXT:    [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
869 // CHECK8-NEXT:    store i8** [[TMP7]], i8*** [[ARGC7]], align 4
870 // CHECK8-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
871 // CHECK8-NEXT:    store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4
872 // CHECK8-NEXT:    call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]]
873 // CHECK8-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
874 // CHECK8-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
875 // CHECK8:       .termination.notifier:
876 // CHECK8-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
877 // CHECK8-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
878 // CHECK8-NEXT:    br label [[DOTEXIT]]
879 // CHECK8:       .exit:
880 // CHECK8-NEXT:    ret void
881 // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__1
882 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
883 // CHECK8-NEXT:  entry:
884 // CHECK8-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
885 // CHECK8-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
886 // CHECK8-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8***, align 4
887 // CHECK8-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
888 // CHECK8-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
889 // CHECK8-NEXT:    store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4
890 // CHECK8-NEXT:    [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4
891 // CHECK8-NEXT:    store i8** null, i8*** [[TMP0]], align 4
892 // CHECK8-NEXT:    ret void
893 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23
894 // CHECK1-SAME: (i64 [[ARGC:%.*]]) #[[ATTR0:[0-9]+]] {
895 // CHECK1-NEXT:  entry:
896 // CHECK1-NEXT:    [[ARGC_ADDR:%.*]] = alloca i64, align 8
897 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
898 // CHECK1-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
899 // CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
900 // CHECK1-NEXT:    store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8
901 // CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32*
902 // CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true)
903 // CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
904 // CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
905 // CHECK1:       user_code.entry:
906 // CHECK1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
907 // CHECK1-NEXT:    [[ARGC1:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
908 // CHECK1-NEXT:    [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC1]] to i32*
909 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[ARGC_ON_STACK]], align 4
910 // CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
911 // CHECK1-NEXT:    store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4
912 // CHECK1-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC_ON_STACK]]) #[[ATTR1:[0-9]+]]
913 // CHECK1-NEXT:    call void @__kmpc_free_shared(i8* [[ARGC1]], i64 4)
914 // CHECK1-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
915 // CHECK1-NEXT:    ret void
916 // CHECK1:       worker.exit:
917 // CHECK1-NEXT:    ret void
918 //
919 //
920 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
921 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR0]] {
922 // CHECK1-NEXT:  entry:
923 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
924 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
925 // CHECK1-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32*, align 8
926 // CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
927 // CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
928 // CHECK1-NEXT:    store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8
929 // CHECK1-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8
930 // CHECK1-NEXT:    store i32 0, i32* [[TMP0]], align 4
931 // CHECK1-NEXT:    ret void
932 //
933 //
934 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l15
935 // CHECK1-SAME: (i8** [[ARGC:%.*]]) #[[ATTR0]] {
936 // CHECK1-NEXT:  entry:
937 // CHECK1-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8**, align 8
938 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
939 // CHECK1-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
940 // CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
941 // CHECK1-NEXT:    store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8
942 // CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true)
943 // CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
944 // CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
945 // CHECK1:       user_code.entry:
946 // CHECK1-NEXT:    [[TMP1:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8
947 // CHECK1-NEXT:    [[ARGC1:%.*]] = call i8* @__kmpc_alloc_shared(i64 8)
948 // CHECK1-NEXT:    [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC1]] to i8***
949 // CHECK1-NEXT:    store i8** [[TMP1]], i8*** [[ARGC_ON_STACK]], align 8
950 // CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
951 // CHECK1-NEXT:    store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4
952 // CHECK1-NEXT:    call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC_ON_STACK]]) #[[ATTR1]]
953 // CHECK1-NEXT:    call void @__kmpc_free_shared(i8* [[ARGC1]], i64 8)
954 // CHECK1-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
955 // CHECK1-NEXT:    ret void
956 // CHECK1:       worker.exit:
957 // CHECK1-NEXT:    ret void
958 //
959 //
960 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
961 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 dereferenceable(8) [[ARGC:%.*]]) #[[ATTR0]] {
962 // CHECK1-NEXT:  entry:
963 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
964 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
965 // CHECK1-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8***, align 8
966 // CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
967 // CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
968 // CHECK1-NEXT:    store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 8
969 // CHECK1-NEXT:    [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8
970 // CHECK1-NEXT:    store i8** null, i8*** [[TMP0]], align 8
971 // CHECK1-NEXT:    ret void
972 //
973 //
974 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23
975 // CHECK2-SAME: (i32 [[ARGC:%.*]]) #[[ATTR0:[0-9]+]] {
976 // CHECK2-NEXT:  entry:
977 // CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
978 // CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
979 // CHECK2-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
980 // CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
981 // CHECK2-NEXT:    store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
982 // CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true)
983 // CHECK2-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
984 // CHECK2-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
985 // CHECK2:       user_code.entry:
986 // CHECK2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
987 // CHECK2-NEXT:    [[ARGC1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
988 // CHECK2-NEXT:    [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC1]] to i32*
989 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[ARGC_ON_STACK]], align 4
990 // CHECK2-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
991 // CHECK2-NEXT:    store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4
992 // CHECK2-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC_ON_STACK]]) #[[ATTR1:[0-9]+]]
993 // CHECK2-NEXT:    call void @__kmpc_free_shared(i8* [[ARGC1]], i32 4)
994 // CHECK2-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
995 // CHECK2-NEXT:    ret void
996 // CHECK2:       worker.exit:
997 // CHECK2-NEXT:    ret void
998 //
999 //
1000 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
1001 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR0]] {
1002 // CHECK2-NEXT:  entry:
1003 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1004 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1005 // CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32*, align 4
1006 // CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1007 // CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1008 // CHECK2-NEXT:    store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4
1009 // CHECK2-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4
1010 // CHECK2-NEXT:    store i32 0, i32* [[TMP0]], align 4
1011 // CHECK2-NEXT:    ret void
1012 //
1013 //
1014 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l15
1015 // CHECK2-SAME: (i8** [[ARGC:%.*]]) #[[ATTR0]] {
1016 // CHECK2-NEXT:  entry:
1017 // CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8**, align 4
1018 // CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1019 // CHECK2-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1020 // CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1021 // CHECK2-NEXT:    store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4
1022 // CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true)
1023 // CHECK2-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
1024 // CHECK2-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
1025 // CHECK2:       user_code.entry:
1026 // CHECK2-NEXT:    [[TMP1:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4
1027 // CHECK2-NEXT:    [[ARGC1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
1028 // CHECK2-NEXT:    [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC1]] to i8***
1029 // CHECK2-NEXT:    store i8** [[TMP1]], i8*** [[ARGC_ON_STACK]], align 4
1030 // CHECK2-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1031 // CHECK2-NEXT:    store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4
1032 // CHECK2-NEXT:    call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC_ON_STACK]]) #[[ATTR1]]
1033 // CHECK2-NEXT:    call void @__kmpc_free_shared(i8* [[ARGC1]], i32 4)
1034 // CHECK2-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
1035 // CHECK2-NEXT:    ret void
1036 // CHECK2:       worker.exit:
1037 // CHECK2-NEXT:    ret void
1038 //
1039 //
1040 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
1041 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR0]] {
1042 // CHECK2-NEXT:  entry:
1043 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1044 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1045 // CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8***, align 4
1046 // CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1047 // CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1048 // CHECK2-NEXT:    store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4
1049 // CHECK2-NEXT:    [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4
1050 // CHECK2-NEXT:    store i8** null, i8*** [[TMP0]], align 4
1051 // CHECK2-NEXT:    ret void
1052 //
1053 //
1054 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64
1055 // CHECK3-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i64 [[ARGC:%.*]]) #[[ATTR0:[0-9]+]] {
1056 // CHECK3-NEXT:  entry:
1057 // CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
1058 // CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
1059 // CHECK3-NEXT:    [[ARGC_ADDR:%.*]] = alloca i64, align 8
1060 // CHECK3-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1061 // CHECK3-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1062 // CHECK3-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1063 // CHECK3-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
1064 // CHECK3-NEXT:    store i64 [[B]], i64* [[B_ADDR]], align 8
1065 // CHECK3-NEXT:    store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8
1066 // CHECK3-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
1067 // CHECK3-NEXT:    [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32*
1068 // CHECK3-NEXT:    [[CONV2:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32*
1069 // CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true)
1070 // CHECK3-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
1071 // CHECK3-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
1072 // CHECK3:       user_code.entry:
1073 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, i32* [[CONV2]], align 8
1074 // CHECK3-NEXT:    [[ARGC3:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
1075 // CHECK3-NEXT:    [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC3]] to i32*
1076 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[ARGC_ON_STACK]], align 4
1077 // CHECK3-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1078 // CHECK3-NEXT:    store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4
1079 // CHECK3-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC_ON_STACK]]) #[[ATTR1:[0-9]+]]
1080 // CHECK3-NEXT:    call void @__kmpc_free_shared(i8* [[ARGC3]], i64 4)
1081 // CHECK3-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
1082 // CHECK3-NEXT:    ret void
1083 // CHECK3:       worker.exit:
1084 // CHECK3-NEXT:    ret void
1085 //
1086 //
1087 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__
1088 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR0]] {
1089 // CHECK3-NEXT:  entry:
1090 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
1091 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
1092 // CHECK3-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32*, align 8
1093 // CHECK3-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
1094 // CHECK3-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
1095 // CHECK3-NEXT:    store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8
1096 // CHECK3-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8
1097 // CHECK3-NEXT:    store i32 0, i32* [[TMP0]], align 4
1098 // CHECK3-NEXT:    ret void
1099 //
1100 //
1101 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l53
1102 // CHECK3-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR0]] {
1103 // CHECK3-NEXT:  entry:
1104 // CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
1105 // CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
1106 // CHECK3-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8**, align 8
1107 // CHECK3-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1108 // CHECK3-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1109 // CHECK3-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1110 // CHECK3-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
1111 // CHECK3-NEXT:    store i64 [[B]], i64* [[B_ADDR]], align 8
1112 // CHECK3-NEXT:    store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8
1113 // CHECK3-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
1114 // CHECK3-NEXT:    [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32*
1115 // CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true)
1116 // CHECK3-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
1117 // CHECK3-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
1118 // CHECK3:       user_code.entry:
1119 // CHECK3-NEXT:    [[TMP1:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8
1120 // CHECK3-NEXT:    [[ARGC2:%.*]] = call i8* @__kmpc_alloc_shared(i64 8)
1121 // CHECK3-NEXT:    [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC2]] to i8***
1122 // CHECK3-NEXT:    store i8** [[TMP1]], i8*** [[ARGC_ON_STACK]], align 8
1123 // CHECK3-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1124 // CHECK3-NEXT:    store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4
1125 // CHECK3-NEXT:    call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC_ON_STACK]]) #[[ATTR1]]
1126 // CHECK3-NEXT:    call void @__kmpc_free_shared(i8* [[ARGC2]], i64 8)
1127 // CHECK3-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
1128 // CHECK3-NEXT:    ret void
1129 // CHECK3:       worker.exit:
1130 // CHECK3-NEXT:    ret void
1131 //
1132 //
1133 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1
1134 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 dereferenceable(8) [[ARGC:%.*]]) #[[ATTR0]] {
1135 // CHECK3-NEXT:  entry:
1136 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
1137 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
1138 // CHECK3-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8***, align 8
1139 // CHECK3-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
1140 // CHECK3-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
1141 // CHECK3-NEXT:    store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 8
1142 // CHECK3-NEXT:    [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8
1143 // CHECK3-NEXT:    store i8** null, i8*** [[TMP0]], align 8
1144 // CHECK3-NEXT:    ret void
1145 //
1146 //
1147 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64
1148 // CHECK4-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[ARGC:%.*]]) #[[ATTR0:[0-9]+]] {
1149 // CHECK4-NEXT:  entry:
1150 // CHECK4-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
1151 // CHECK4-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
1152 // CHECK4-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
1153 // CHECK4-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1154 // CHECK4-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1155 // CHECK4-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1156 // CHECK4-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
1157 // CHECK4-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
1158 // CHECK4-NEXT:    store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
1159 // CHECK4-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true)
1160 // CHECK4-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
1161 // CHECK4-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
1162 // CHECK4:       user_code.entry:
1163 // CHECK4-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
1164 // CHECK4-NEXT:    [[ARGC1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
1165 // CHECK4-NEXT:    [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC1]] to i32*
1166 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[ARGC_ON_STACK]], align 4
1167 // CHECK4-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1168 // CHECK4-NEXT:    store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4
1169 // CHECK4-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC_ON_STACK]]) #[[ATTR1:[0-9]+]]
1170 // CHECK4-NEXT:    call void @__kmpc_free_shared(i8* [[ARGC1]], i32 4)
1171 // CHECK4-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
1172 // CHECK4-NEXT:    ret void
1173 // CHECK4:       worker.exit:
1174 // CHECK4-NEXT:    ret void
1175 //
1176 //
1177 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__
1178 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR0]] {
1179 // CHECK4-NEXT:  entry:
1180 // CHECK4-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1181 // CHECK4-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1182 // CHECK4-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32*, align 4
1183 // CHECK4-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1184 // CHECK4-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1185 // CHECK4-NEXT:    store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4
1186 // CHECK4-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4
1187 // CHECK4-NEXT:    store i32 0, i32* [[TMP0]], align 4
1188 // CHECK4-NEXT:    ret void
1189 //
1190 //
1191 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l53
1192 // CHECK4-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR0]] {
1193 // CHECK4-NEXT:  entry:
1194 // CHECK4-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
1195 // CHECK4-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
1196 // CHECK4-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8**, align 4
1197 // CHECK4-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1198 // CHECK4-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1199 // CHECK4-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1200 // CHECK4-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
1201 // CHECK4-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
1202 // CHECK4-NEXT:    store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4
1203 // CHECK4-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true)
1204 // CHECK4-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
1205 // CHECK4-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
1206 // CHECK4:       user_code.entry:
1207 // CHECK4-NEXT:    [[TMP1:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4
1208 // CHECK4-NEXT:    [[ARGC1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
1209 // CHECK4-NEXT:    [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC1]] to i8***
1210 // CHECK4-NEXT:    store i8** [[TMP1]], i8*** [[ARGC_ON_STACK]], align 4
1211 // CHECK4-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1212 // CHECK4-NEXT:    store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4
1213 // CHECK4-NEXT:    call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC_ON_STACK]]) #[[ATTR1]]
1214 // CHECK4-NEXT:    call void @__kmpc_free_shared(i8* [[ARGC1]], i32 4)
1215 // CHECK4-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
1216 // CHECK4-NEXT:    ret void
1217 // CHECK4:       worker.exit:
1218 // CHECK4-NEXT:    ret void
1219 //
1220 //
1221 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1
1222 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR0]] {
1223 // CHECK4-NEXT:  entry:
1224 // CHECK4-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1225 // CHECK4-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1226 // CHECK4-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8***, align 4
1227 // CHECK4-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1228 // CHECK4-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1229 // CHECK4-NEXT:    store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4
1230 // CHECK4-NEXT:    [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4
1231 // CHECK4-NEXT:    store i8** null, i8*** [[TMP0]], align 4
1232 // CHECK4-NEXT:    ret void
1233 //
1234