/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 *
 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
 */

#include <string.h>
#include <stdint.h>
#include <wchar.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "yaksuri_cudai_base.h"
#include "yaksuri_cudai_pup.h"

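/*
 * Each "pack" kernel below copies one float per thread from the strided
 * layout described by the type-metadata tree (md) into a contiguous
 * buffer; each "unpack" kernel scatters in the opposite direction.  The
 * flat thread index idx is decomposed into per-level coordinates x0, x1,
 * ... by successive division by the element counts of the child levels.
 * The host-side wrappers launch the kernels on the per-device stream in
 * yaksuri_cudai_global.
 */
/* Pack resized(contig(float)): x0 selects the outer element (stride = type
 * extent), x1 the float within the inner contig block. */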
__global__ void yaksuri_cudai_kernel_pack_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.count;

    uintptr_t x1 = res;

    intptr_t stride2 = md->u.resized.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride2));
}

void yaksuri_cudai_pack_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

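/* Unpack resized(contig(float)): mirror of the kernel above; reads
 * contiguous floats from sbuf and scatters them into the resized layout. */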
__global__ void yaksuri_cudai_kernel_unpack_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.count;

    uintptr_t x1 = res;

    intptr_t stride2 = md->u.resized.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride2)) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

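/* Pack hvector(resized(contig(float))): x1 walks hvector blocks by
 * stride1, x2 walks elements within a block by the child extent, and x3
 * walks the innermost contiguous floats. */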
__global__ void yaksuri_cudai_kernel_pack_hvector_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.resized.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t stride1 = md->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t stride3 = md->u.hvector.child->u.resized.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + x3 * stride3));
}

void yaksuri_cudai_pack_hvector_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_resized_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

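/* Unpack hvector(resized(contig(float))): mirror of the pack kernel;
 * contiguous floats are scattered back into the hvector layout. */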
__global__ void yaksuri_cudai_kernel_unpack_hvector_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.resized.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t stride1 = md->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t stride3 = md->u.hvector.child->u.resized.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + x3 * stride3)) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_hvector_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_resized_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

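/* Pack blkhindx(resized(contig(float))): like the hvector case, but each
 * block's offset comes from array_of_displs rather than a uniform stride. */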
__global__ void yaksuri_cudai_kernel_pack_blkhindx_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.resized.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t stride3 = md->u.blkhindx.child->u.resized.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride3));
}

void yaksuri_cudai_pack_blkhindx_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_resized_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

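/* Unpack blkhindx(resized(contig(float))): mirror of the pack kernel
 * above, scattering into displacement-addressed blocks. */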
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.resized.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t stride3 = md->u.blkhindx.child->u.resized.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride3)) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_blkhindx_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_resized_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

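/* Pack hindexed(resized(contig(float))): block lengths vary, so x1 is
 * found by a linear scan over the blocks.  The scan always hits a block
 * because res is smaller than the type's total element count. */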
__global__ void yaksuri_cudai_kernel_pack_hindexed_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.resized.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t stride3 = md->u.hindexed.child->u.resized.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride3));
}

void yaksuri_cudai_pack_hindexed_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_resized_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

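/* Unpack hindexed(resized(contig(float))): mirror of the pack kernel,
 * using the same linear scan to locate the target block. */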
__global__ void yaksuri_cudai_kernel_unpack_hindexed_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.resized.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t stride3 = md->u.hindexed.child->u.resized.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride3)) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_hindexed_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_resized_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

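/* Pack contig(resized(contig(float))): x1 steps over the outer contig
 * elements by the child extent (stride1), x2 over the innermost floats
 * (stride3). */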
__global__ void yaksuri_cudai_kernel_pack_contig_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.resized.child->u.contig.count;

    uintptr_t x2 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride3 = md->u.contig.child->u.resized.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * stride3));
}

void yaksuri_cudai_pack_contig_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_resized_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

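/* Unpack contig(resized(contig(float))): mirror of the pack kernel above. */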
__global__ void yaksuri_cudai_kernel_unpack_contig_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.resized.child->u.contig.count;

    uintptr_t x2 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride3 = md->u.contig.child->u.resized.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * stride3)) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_contig_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_resized_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

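/* Pack resized(resized(contig(float))): nesting resized types only changes
 * the extent, so the address arithmetic reduces to the outer extent plus
 * the innermost contig stride. */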
__global__ void yaksuri_cudai_kernel_pack_resized_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.resized.child->u.contig.count;

    uintptr_t x1 = res;

    intptr_t stride3 = md->u.resized.child->u.resized.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride3));
}

void yaksuri_cudai_pack_resized_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_resized_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

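/* Unpack resized(resized(contig(float))): mirror of the pack kernel above. */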
__global__ void yaksuri_cudai_kernel_unpack_resized_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.resized.child->u.contig.count;

    uintptr_t x1 = res;

    intptr_t stride3 = md->u.resized.child->u.resized.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride3)) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_resized_resized_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_resized_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}