1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 *
5 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
6 */
7
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16
/* Pack kernel for blkhindx(contig(double)).  Each thread copies exactly one
 * double from the strided source layout described by md into the contiguous
 * destination buffer.  Launched on a 1-D grid sized by the host wrapper so
 * that gridDim.x * blockDim.x >= count * md->num_elements. */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Grid-tail guard: excess threads exit immediately. */
    if (gid >= count * elems)
        return;

    /* Peel one index per nesting level, outermost first. */
    uintptr_t rem = gid;
    uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.count;

    uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.blocklength;

    uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.contig.count;

    uintptr_t i3 = rem;

    /* Walk the source pointer level by level. */
    const char *from = src + i0 * md->extent;
    from += md->u.blkhindx.array_of_displs[i1];
    from += i2 * md->u.blkhindx.child->extent;
    from += i3 * md->u.blkhindx.child->u.contig.child->extent;

    *((double *) (void *) (dst + gid * sizeof(double))) =
        *((const double *) (const void *) from);
}
47
/* Host wrapper: launches the blkhindx(contig(double)) pack kernel on the
 * per-device stream using the caller-supplied launch configuration. */
void yaksuri_cudai_pack_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_contig_double,
                                        grid, block, kargs, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
55
/* Unpack kernel for blkhindx(contig(double)).  The mirror of the pack
 * kernel: each thread copies one double from the contiguous source buffer
 * into its position in the strided destination layout described by md. */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Grid-tail guard: excess threads exit immediately. */
    if (gid >= count * elems)
        return;

    /* Peel one index per nesting level, outermost first. */
    uintptr_t rem = gid;
    uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.count;

    uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.blocklength;

    uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.contig.count;

    uintptr_t i3 = rem;

    /* Walk the destination pointer level by level. */
    char *to = dst + i0 * md->extent;
    to += md->u.blkhindx.array_of_displs[i1];
    to += i2 * md->u.blkhindx.child->extent;
    to += i3 * md->u.blkhindx.child->u.contig.child->extent;

    *((double *) (void *) to) =
        *((const double *) (const void *) (src + gid * sizeof(double)));
}
86
/* Host wrapper: launches the blkhindx(contig(double)) unpack kernel on the
 * per-device stream using the caller-supplied launch configuration. */
void yaksuri_cudai_unpack_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_contig_double,
                                        grid, block, kargs, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
94
/* Pack kernel for hvector(blkhindx(contig(double))).  Each thread copies
 * exactly one double from the three-level strided source layout described
 * by md into the contiguous destination buffer. */
__global__ void yaksuri_cudai_kernel_pack_hvector_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Grid-tail guard: excess threads exit immediately. */
    if (gid >= count * elems)
        return;

    /* Peel one index per nesting level, outermost first. */
    uintptr_t rem = gid;
    uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.count;

    uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.blocklength;

    uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.child->u.blkhindx.count;

    uintptr_t i3 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.child->u.blkhindx.blocklength;

    uintptr_t i4 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.child->u.blkhindx.child->u.contig.count;

    uintptr_t i5 = rem;

    /* Walk the source pointer level by level. */
    const char *from = src + i0 * md->extent;
    from += i1 * md->u.hvector.stride;
    from += i2 * md->u.hvector.child->extent;
    from += md->u.hvector.child->u.blkhindx.array_of_displs[i3];
    from += i4 * md->u.hvector.child->u.blkhindx.child->extent;
    from += i5 * md->u.hvector.child->u.blkhindx.child->u.contig.child->extent;

    *((double *) (void *) (dst + gid * sizeof(double))) =
        *((const double *) (const void *) from);
}
134
/* Host wrapper: launches the hvector(blkhindx(contig(double))) pack kernel
 * on the per-device stream using the caller-supplied launch configuration. */
void yaksuri_cudai_pack_hvector_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_blkhindx_contig_double,
                                        grid, block, kargs, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
142
/* Unpack kernel for hvector(blkhindx(contig(double))).  The mirror of the
 * pack kernel: each thread copies one double from the contiguous source
 * buffer into the three-level strided destination layout described by md. */
__global__ void yaksuri_cudai_kernel_unpack_hvector_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Grid-tail guard: excess threads exit immediately. */
    if (gid >= count * elems)
        return;

    /* Peel one index per nesting level, outermost first. */
    uintptr_t rem = gid;
    uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.count;

    uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.blocklength;

    uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.child->u.blkhindx.count;

    uintptr_t i3 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.child->u.blkhindx.blocklength;

    uintptr_t i4 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.child->u.blkhindx.child->u.contig.count;

    uintptr_t i5 = rem;

    /* Walk the destination pointer level by level. */
    char *to = dst + i0 * md->extent;
    to += i1 * md->u.hvector.stride;
    to += i2 * md->u.hvector.child->extent;
    to += md->u.hvector.child->u.blkhindx.array_of_displs[i3];
    to += i4 * md->u.hvector.child->u.blkhindx.child->extent;
    to += i5 * md->u.hvector.child->u.blkhindx.child->u.contig.child->extent;

    *((double *) (void *) to) =
        *((const double *) (const void *) (src + gid * sizeof(double)));
}
182
/* Host wrapper: launches the hvector(blkhindx(contig(double))) unpack
 * kernel on the per-device stream. */
void yaksuri_cudai_unpack_hvector_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_blkhindx_contig_double,
                                        grid, block, kargs, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
190
/* Pack kernel for blkhindx(blkhindx(contig(double))).  Each thread copies
 * exactly one double from the doubly-indexed source layout described by md
 * into the contiguous destination buffer. */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Grid-tail guard: excess threads exit immediately. */
    if (gid >= count * elems)
        return;

    /* Peel one index per nesting level, outermost first. */
    uintptr_t rem = gid;
    uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.count;

    uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.blocklength;

    uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.blkhindx.count;

    uintptr_t i3 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.blkhindx.blocklength;

    uintptr_t i4 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.blkhindx.child->u.contig.count;

    uintptr_t i5 = rem;

    /* Walk the source pointer level by level. */
    const char *from = src + i0 * md->extent;
    from += md->u.blkhindx.array_of_displs[i1];
    from += i2 * md->u.blkhindx.child->extent;
    from += md->u.blkhindx.child->u.blkhindx.array_of_displs[i3];
    from += i4 * md->u.blkhindx.child->u.blkhindx.child->extent;
    from += i5 * md->u.blkhindx.child->u.blkhindx.child->u.contig.child->extent;

    *((double *) (void *) (dst + gid * sizeof(double))) =
        *((const double *) (const void *) from);
}
230
/* Host wrapper: launches the blkhindx(blkhindx(contig(double))) pack kernel
 * on the per-device stream. */
void yaksuri_cudai_pack_blkhindx_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_blkhindx_contig_double,
                                        grid, block, kargs, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
238
/* Unpack kernel for blkhindx(blkhindx(contig(double))).  The mirror of the
 * pack kernel: each thread copies one double from the contiguous source
 * buffer into the doubly-indexed destination layout described by md. */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Grid-tail guard: excess threads exit immediately. */
    if (gid >= count * elems)
        return;

    /* Peel one index per nesting level, outermost first. */
    uintptr_t rem = gid;
    uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.count;

    uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.blocklength;

    uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.blkhindx.count;

    uintptr_t i3 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.blkhindx.blocklength;

    uintptr_t i4 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.blkhindx.child->u.contig.count;

    uintptr_t i5 = rem;

    /* Walk the destination pointer level by level. */
    char *to = dst + i0 * md->extent;
    to += md->u.blkhindx.array_of_displs[i1];
    to += i2 * md->u.blkhindx.child->extent;
    to += md->u.blkhindx.child->u.blkhindx.array_of_displs[i3];
    to += i4 * md->u.blkhindx.child->u.blkhindx.child->extent;
    to += i5 * md->u.blkhindx.child->u.blkhindx.child->u.contig.child->extent;

    *((double *) (void *) to) =
        *((const double *) (const void *) (src + gid * sizeof(double)));
}
278
/* Host wrapper: launches the blkhindx(blkhindx(contig(double))) unpack
 * kernel on the per-device stream. */
void yaksuri_cudai_unpack_blkhindx_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_blkhindx_contig_double,
                                        grid, block, kargs, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
286
/* Pack kernel for hindexed(blkhindx(contig(double))).  Each thread copies
 * exactly one double from the strided source layout described by md into
 * the contiguous destination buffer.  Launched on a 1-D grid sized by the
 * host wrapper so that gridDim.x * blockDim.x >= count * md->num_elements. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Grid-tail guard: excess threads exit immediately. */
    if (idx >= (count * inner_elements))
        return;

    /* Outermost index: which copy of the whole type. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* hindexed blocks have non-uniform lengths, so the block index is found
     * by a linear scan instead of a division.  x1 is initialized defensively:
     * the idx guard above guarantees the loop breaks for consistent metadata,
     * but without an initializer an inconsistency would make x1 indeterminate
     * and the array_of_displs[x1] read below out of bounds. */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
            md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* Remaining inner indices are uniform and peel off by division. */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t stride3 = md->u.hindexed.child->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.hindexed.child->u.blkhindx.child->extent;
    /* Gather one element from the strided source into the packed buffer. */
    *((double *) (void *) (dbuf + idx * sizeof(double))) = *((const double *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3));
}
337
/* Host wrapper: launches the hindexed(blkhindx(contig(double))) pack kernel
 * on the per-device stream. */
void yaksuri_cudai_pack_hindexed_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_blkhindx_contig_double,
                                        grid, block, kargs, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
345
/* Unpack kernel for hindexed(blkhindx(contig(double))).  The mirror of the
 * pack kernel: each thread copies one double from the contiguous source
 * buffer into the strided destination layout described by md. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Grid-tail guard: excess threads exit immediately. */
    if (idx >= (count * inner_elements))
        return;

    /* Outermost index: which copy of the whole type. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* hindexed blocks have non-uniform lengths, so the block index is found
     * by a linear scan instead of a division.  x1 is initialized defensively:
     * the idx guard above guarantees the loop breaks for consistent metadata,
     * but without an initializer an inconsistency would make x1 indeterminate
     * and the array_of_displs[x1] read below out of bounds. */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
            md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* Remaining inner indices are uniform and peel off by division. */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t stride3 = md->u.hindexed.child->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.hindexed.child->u.blkhindx.child->extent;
    /* Scatter one element from the packed buffer into the strided destination. */
    *((double *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3)) = *((const double *) (const void *) (sbuf + idx * sizeof(double)));
}
396
/* Host wrapper: launches the hindexed(blkhindx(contig(double))) unpack
 * kernel on the per-device stream. */
void yaksuri_cudai_unpack_hindexed_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_blkhindx_contig_double,
                                        grid, block, kargs, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
404
/* Pack kernel for contig(blkhindx(contig(double))).  Each thread copies
 * exactly one double from the strided source layout described by md into
 * the contiguous destination buffer. */
__global__ void yaksuri_cudai_kernel_pack_contig_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Grid-tail guard: excess threads exit immediately. */
    if (gid >= count * elems)
        return;

    /* Peel one index per nesting level, outermost first. */
    uintptr_t rem = gid;
    uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.contig.count;

    uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.contig.child->u.blkhindx.count;

    uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.contig.child->u.blkhindx.blocklength;

    uintptr_t i3 = rem / elems;
    rem %= elems;
    elems /= md->u.contig.child->u.blkhindx.child->u.contig.count;

    uintptr_t i4 = rem;

    /* Walk the source pointer level by level. */
    const char *from = src + i0 * md->extent;
    from += i1 * md->u.contig.child->extent;
    from += md->u.contig.child->u.blkhindx.array_of_displs[i2];
    from += i3 * md->u.contig.child->u.blkhindx.child->extent;
    from += i4 * md->u.contig.child->u.blkhindx.child->u.contig.child->extent;

    *((double *) (void *) (dst + gid * sizeof(double))) =
        *((const double *) (const void *) from);
}
440
/* Host wrapper: launches the contig(blkhindx(contig(double))) pack kernel
 * on the per-device stream. */
void yaksuri_cudai_pack_contig_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_blkhindx_contig_double,
                                        grid, block, kargs, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
448
/* Unpack kernel for contig(blkhindx(contig(double))).  The mirror of the
 * pack kernel: each thread copies one double from the contiguous source
 * buffer into the strided destination layout described by md. */
__global__ void yaksuri_cudai_kernel_unpack_contig_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Grid-tail guard: excess threads exit immediately. */
    if (gid >= count * elems)
        return;

    /* Peel one index per nesting level, outermost first. */
    uintptr_t rem = gid;
    uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.contig.count;

    uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.contig.child->u.blkhindx.count;

    uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.contig.child->u.blkhindx.blocklength;

    uintptr_t i3 = rem / elems;
    rem %= elems;
    elems /= md->u.contig.child->u.blkhindx.child->u.contig.count;

    uintptr_t i4 = rem;

    /* Walk the destination pointer level by level. */
    char *to = dst + i0 * md->extent;
    to += i1 * md->u.contig.child->extent;
    to += md->u.contig.child->u.blkhindx.array_of_displs[i2];
    to += i3 * md->u.contig.child->u.blkhindx.child->extent;
    to += i4 * md->u.contig.child->u.blkhindx.child->u.contig.child->extent;

    *((double *) (void *) to) =
        *((const double *) (const void *) (src + gid * sizeof(double)));
}
484
/* Host wrapper: launches the contig(blkhindx(contig(double))) unpack kernel
 * on the per-device stream. */
void yaksuri_cudai_unpack_contig_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_blkhindx_contig_double,
                                        grid, block, kargs, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
492
/* Pack kernel for resized(blkhindx(contig(double))).  Each thread copies
 * exactly one double from the strided source layout described by md into
 * the contiguous destination buffer.  The resized level contributes only
 * the outer extent; indexing starts at the inner blkhindx level. */
__global__ void yaksuri_cudai_kernel_pack_resized_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Grid-tail guard: excess threads exit immediately. */
    if (gid >= count * elems)
        return;

    /* Peel one index per nesting level, outermost first. */
    uintptr_t rem = gid;
    uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.resized.child->u.blkhindx.count;

    uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.resized.child->u.blkhindx.blocklength;

    uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.resized.child->u.blkhindx.child->u.contig.count;

    uintptr_t i3 = rem;

    /* Walk the source pointer level by level. */
    const char *from = src + i0 * md->extent;
    from += md->u.resized.child->u.blkhindx.array_of_displs[i1];
    from += i2 * md->u.resized.child->u.blkhindx.child->extent;
    from += i3 * md->u.resized.child->u.blkhindx.child->u.contig.child->extent;

    *((double *) (void *) (dst + gid * sizeof(double))) =
        *((const double *) (const void *) from);
}
523
/* Host wrapper: launches the resized(blkhindx(contig(double))) pack kernel
 * on the per-device stream. */
void yaksuri_cudai_pack_resized_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_blkhindx_contig_double,
                                        grid, block, kargs, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
531
/* Unpack kernel for resized(blkhindx(contig(double))).  The mirror of the
 * pack kernel: each thread copies one double from the contiguous source
 * buffer into the strided destination layout described by md. */
__global__ void yaksuri_cudai_kernel_unpack_resized_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Grid-tail guard: excess threads exit immediately. */
    if (gid >= count * elems)
        return;

    /* Peel one index per nesting level, outermost first. */
    uintptr_t rem = gid;
    uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.resized.child->u.blkhindx.count;

    uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.resized.child->u.blkhindx.blocklength;

    uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.resized.child->u.blkhindx.child->u.contig.count;

    uintptr_t i3 = rem;

    /* Walk the destination pointer level by level. */
    char *to = dst + i0 * md->extent;
    to += md->u.resized.child->u.blkhindx.array_of_displs[i1];
    to += i2 * md->u.resized.child->u.blkhindx.child->extent;
    to += i3 * md->u.resized.child->u.blkhindx.child->u.contig.child->extent;

    *((double *) (void *) to) =
        *((const double *) (const void *) (src + gid * sizeof(double)));
}
562
/* Host wrapper: launches the resized(blkhindx(contig(double))) unpack
 * kernel on the per-device stream. */
void yaksuri_cudai_unpack_resized_blkhindx_contig_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_blkhindx_contig_double,
                                        grid, block, kargs, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
570
571