/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 *
 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
 */

#include <string.h>
#include <stdint.h>
#include <wchar.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "yaksuri_cudai_base.h"
#include "yaksuri_cudai_pup.h"

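/* Pack kernel for contig[blkhindx[float]]: each thread copies one float.  The
 * flat thread index idx is decomposed into per-level coordinates -- x0 selects
 * the outer type replication, x1 the contiguous child, x2 the blkhindx block,
 * and x3 the float within that block -- which together give the source byte
 * offset; the destination is written contiguously at idx * sizeof(float). */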
__global__ void yaksuri_cudai_kernel_pack_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * sizeof(float)));
}

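/* Host-side launcher: marshals the kernel arguments and launches the pack
 * kernel with cudaLaunchKernel on the per-device stream kept in
 * yaksuri_cudai_global. */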
void yaksuri_cudai_pack_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_blkhindx_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

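/* Unpack kernel: the inverse of the pack kernel above -- each thread reads one
 * float from the contiguous source buffer and scatters it to the offset
 * computed from the datatype tree. */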
__global__ void yaksuri_cudai_kernel_unpack_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_blkhindx_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

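/* hvector[contig[blkhindx[float]]] variants: two additional coordinates are
 * needed -- x1 selects the hvector block (offset x1 * md->u.hvector.stride)
 * and x2 the element within that block (offset x2 * child extent). */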
__global__ void yaksuri_cudai_kernel_pack_hvector_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.child->u.blkhindx.count;

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x5 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t stride2 = md->u.hvector.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(float)));
}

void yaksuri_cudai_pack_hvector_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_contig_blkhindx_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hvector_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.child->u.blkhindx.count;

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x5 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t stride2 = md->u.hvector.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_hvector_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_contig_blkhindx_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

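/* blkhindx[contig[blkhindx[float]]] variants: identical in structure to the
 * hvector case, except that the outer block offset comes from
 * md->u.blkhindx.array_of_displs[x1] instead of a fixed stride. */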
__global__ void yaksuri_cudai_kernel_pack_blkhindx_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.child->u.blkhindx.count;

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(float)));
}

void yaksuri_cudai_pack_blkhindx_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_contig_blkhindx_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_blkhindx_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.child->u.blkhindx.count;

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_blkhindx_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_contig_blkhindx_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

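/* hindexed[contig[blkhindx[float]]] variants: hindexed blocks have varying
 * lengths, so the outer block index x1 is found by a linear scan over
 * array_of_blocklengths rather than by a single division. */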
__global__ void yaksuri_cudai_kernel_pack_hindexed_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.child->u.blkhindx.count;

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(float)));
}

void yaksuri_cudai_pack_hindexed_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_contig_blkhindx_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hindexed_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.child->u.blkhindx.count;

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_hindexed_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_contig_blkhindx_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

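/* contig[contig[blkhindx[float]]] variants: both outer levels are contiguous,
 * so their offsets are simple multiples of the respective child extents. */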
__global__ void yaksuri_cudai_kernel_pack_contig_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.contig.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * stride2 + array_of_displs3[x3] + x4 * sizeof(float)));
}

void yaksuri_cudai_pack_contig_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_contig_blkhindx_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_contig_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.contig.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * stride2 + array_of_displs3[x3] + x4 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_contig_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_contig_blkhindx_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

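/* resized[contig[blkhindx[float]]] variants: the resized level contributes no
 * coordinate of its own; it only affects md->extent, which scales the
 * outermost offset x0 * extent. */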
__global__ void yaksuri_cudai_kernel_pack_resized_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x3 = res;

    intptr_t stride2 = md->u.resized.child->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.resized.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride2 + array_of_displs3[x2] + x3 * sizeof(float)));
}

void yaksuri_cudai_pack_resized_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_contig_blkhindx_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_resized_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x3 = res;

    intptr_t stride2 = md->u.resized.child->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.resized.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride2 + array_of_displs3[x2] + x3 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_resized_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_contig_blkhindx_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}